From a3b4e10a5b611869f2cd0c4a785093d4d15dbcf8 Mon Sep 17 00:00:00 2001 From: zhangjinpeng1987 Date: Tue, 12 Dec 2017 09:33:51 +0800 Subject: [PATCH] Squashed 'librocksdb_sys/rocksdb/' changes from 185a70d..80134e9 80134e9 Preserve overlapping file endpoint invariant (#11) 23ba5bb fix delete range bug (#10) bfcf674 cherry-pick optimize ingest sst (#5) 7ecb1c0 cherry-pick table filter (#7) 149f822 Merge pull request #8 from zhangjinpeng1987/zhangjinpeng/5.8 b8655d9 cmake: pass "-msse4.2" to when building crc32c.cc if HAVE_SSE42 fe6e58b 5.8.fb aead404 fix HISTORY.md typo a0cdc3c Bump version to 5.8.7 7513f63 Fix IOError on WAL write doesn't propagate to write group follower 9e47084 Bump version to 5.8.6 36074ba Enable cacheline_aligned_alloc() to allocate from jemalloc if enabled. aa00523 Add -DPORTABLE=1 to MSVC CI build cf2b982 Bump version to 5.8.5 e8c9350 Blob DB: not using PinnableSlice move assignment 4907d24 Bump version to 5.8.4 5d928c7 Blob DB: Fix race condition between flush and write 725bb9d Blob DB: Fix release build b7367fe Bump version to 5.8.3 13b2a9b Blob DB: use compression in file header instead of global options 5dc70a1 Fix PinnableSlice move assignment 9019e91 dynamically change current memtable size 7f1815c Bump version to 5.8.2 2584a18 Blob DB: Fix BlobDBTest::SnapshotAndGarbageCollection asan failure 17f67b5 PinnableSlice move assignment 6fb56c5 Blob DB: Add compaction filter to remove expired blob index entries f90ced9 Blob DB: fix snapshot handling 632f36d Blob DB: option to enable garbage collection 11bacd5 Blob DB: Fix flaky BlobDBTest::GCExpiredKeyWhileOverwriting test f98efcb Blob DB: Evict oldest blob file when close to blob db size limit c1e99ed Blob DB: cleanup unused options ffc3c62 Blob DB: Initialize all fields in Blob Header, Footer and Record structs 9e82540 Blob DB: update blob file format d66bb21 Blob DB: Inline small values in base DB 05d5c57 Return write error on reaching blob dir size limit 2b8893b Blob DB: Store blob index 
as kTypeBlobIndex in base db 419b93c Blob DB: not writing sequence number as blob record footer 8afb003 fix lite build dded348 Blob DB: Move BlobFile definition to a separate file 3747361 add GetLiveFiles and GetLiveFilesMetaData for BlobDB 8cff6e9 Enable WAL for blob index c293472 Add ValueType::kTypeBlobIndex eae53de Make it explicit blob db doesn't support CF 65aec19 Fix memory leak on blob db open 30b38c9 TableProperty::oldest_key_time defaults to 0 2879f4b Bump version to 5.8.1 88595c8 Add DB::Properties::kEstimateOldestKeyTime 266ac24 Bumping version to 5.8 64185c2 update HISTORY.md for DeleteRange bug fix e83d6a0 Not using aligned_alloc with gcc4 + asan 0980dc6 Fix wrong smallest key of delete range tombstones b767972 avoid use-after-move error c417442 CMake: Fix formatting c21ea8f CMake: Add support for CMake packages 5444345 add Erlang to the list of language bindings 2972a70 Minor updates to FlushWAL blog fbfa3e7 WriteAtPrepare: Efficient read from snapshot list b01f426 Blog post for FlushWAL 503db68 make blob file close synchronous 3c840d1 Allow DB reopen with reduced options.num_levels 92bfd6c Fix DropColumnFamily data race 7fdf735 Pinnableslice examples and blog post 7fbb9ec support disabling checksum in block-based table 19cc66d fix clang bug in block-based table reader 7eba54e test compaction input-level split range tombstone assumption cd26af3 Add unit test for WritePrepared skeleton a124798 Improved transactions support in C API c10b391 LANGUAGE-BINDINGS.md: add another rust binding 9017743 Remove leftover references to phutil_module_cache 234f33a allow nullptr Slice only as sentinel ccf7f83 Use PinnableSlice in Transactions 1dfcdb1 Extend pin_l0 to filter partitions 39ef900 stop calling memcmp with nullptrs 78cb6b6 Provide byte[] version of SstFileWriter.merge to reduce GC Stall 867fe92 Scale histogram bucket size by constant factor f004307 CMake improvements 09ac620 Circumvent ASAN false positive 5b68b11 Blob db create a snapshot before every 
read 4624ae5 GC the oldest file when out of space 8ace1f7 add counter for deletion dropping optimization 0d8e992 Revert the mistake in version update 5358a80 add VerifyChecksum to HISTORY.md ed0a4c9 perf_context measure user bytes read 1efc600 Preload l0 index partitions bddd5d3 Added mechanism to track deadlock chain c1384a7 fix db_stress uint64_t to int32 cast 29877ec Fix blob db crash during calculating write amp 8f2598a Enable Cassandra merge operator to be called with a single merge operand 9a44b4c Allow merge operator to be called even with a single operand ac8fb77 fix some misspellings 2359317 minor improvements to db_stress af012c0 fix deleterange with memtable prefix bloom 1c8dbe2 update scores after picking universal compaction eb64253 Update WritePrepared with the pseudo code 132306f Remove PartialMerge implementation from Cassandra merge operator 71598cd Fix false removal of tombstone issue in FIFO and kCompactionStyleNone 3204a4f Fix missing stdlib include required for abort() 7aa96db db_stress rolling active window dfa6c23 Update RocksDBCommonHelper to use escapeshellarg e367774 Overload new[] to properly align LRUCacheShard ad42d2f Remove residual arcanist_util directory 279296f properly set C[XX]FLAGS during CMake configure-time checks c5f0c6c compile with correct flags to determine SSE4.2 support 185ade4 cmake: support more compression type 5449c09 rocksdb: make buildable on aarch64 a144a97 Fix for CMakeLists.txt on Windows for RocksJava acf935e fix deletion dropping in intra-L0 8254e9b make sst_dump compression size command consistent 74f18c1 db_bench support for non-uniform column family ops 5de98f2 approximate histogram stats to save cpu 3f58884 Fix c_test ASAN failure e5a1b72 Fix blob DB transaction usage while GC 6f051e0 fix corruption_test valgrind ac098a4 expose set_skip_stats_update_on_db_open to C bindings 666a005 Support prefetch last 512KB with direct I/O in block based file reader ad77ee0 Revert "Makefile: correct faligned-new test" 
b87ee6f Use more keys per lock in daily TSAN crash test 25df242 Add column families related functions (C API) 64f8484 block_cache_tier: fix gcc-7 warnings 0cecf81 Write batch for `TransactionDB` in C API 6a9de43 Windows.h macro call fix 23c7d13 fix comment 1fbad84 Makefile: correct faligned-new test 7848f0b add VerifyChecksum() to db.h 47ed3bf fix WinEnv assertions d97a72d Try to repair db with wal_dir option, avoid leak some WAL files 36375de gcc-7/i386: markup intentional fallthroughs bdc056f Refactor PessimisticTransaction a9a4e89 Fix valgrind complaint about initialization 4ca11b4 Update USERS.md c9804e0 Refactor TransactionDBImpl 20dc5e7 Optimize range-delete aggregator call in merge helper. 0d4a2b7 Avoid blob db call Sync() while writing 627c9f1 Don't add -ljemalloc when DISABLE_JEMALLOC is set dce6d5a db_bench background work thread pool size arguments 4f81ab3 Makefile: fix for GCC 7+ and clang 4+ 92afe83 Update all blob db TTL and timestamps to uint64_t 5883a1a Fix /bin/bash shebangs cc01985 Introduce bottom-pri thread pool for large universal compactions 0b814ba Allow concurrent writes to blob db 2c45ada Blob DB garbage collection should keep keys with newer version 58410ae Fix the overflow bug in AwaitState c3d5c4d Refactor TransactionImpl 060ccd4 support multiple CFs with OPTIONS file 3453870 Fix statistics in RocksJava sample 1900771 Dump Blob DB options to info log 3218edc Fix universal compaction bug 6a36b3a fix db get/write stats a84cee8 Add a missing "once" in .h 21696ba Replace dynamic_cast<> e85f2c6 Prevent empty memtables from using a lot of memory ac748c5 Fix FIFO Compaction with TTL tests aaf42fe Move blob_db/ttl_extractor.h into blob_db/blob_db.h aace465 Fix license headers in Cassandra related files 50a9691 CacheActivityLogger, component to log cache activity into a file 6083bc7 Blob DB TTL extractor 710411a fix asan/valgrind for TableCache cleanup 3a3fb00 TARGETS file not setting sse explicitly fca4d6d Build fewer tests in Travis 
platform_dependent tests 8f553d3 remove unnecessary internal_comparator param in newIterator 7f6d012 "ccache -C" in Travis d12691b move TableCache::EraseHandle outside of db mutex f33f113 fix db_bench argument type e7697b8 Fix LITE unit tests 3ce20e9 Fix use of RocksDBCommonHelper in cont_integration.sh c281b44 Revert "CRC32 Power Optimization Changes" 9980de2 Fix FIFO compaction picker test 2289d38 CRC32 Power Optimization Changes 30b58cf Remove the orphan assert on !need_log_sync fe1a555 Fix flaky write_callback_test addbd27 5.6.1 release blog post 30edff3 buckification: remove explicit `-msse*` compiler flags 2b259c9 Lower num of iterations in DeadlockCycle test 277f6f2 Release note for partitioned index/filters 5e731a1 Remove unused rocksdb arcanist lib 9b11d43 Fix broken links 06f1917 add vcpkg as an windows option ea8ad4f Fix compaction div by zero logging 34112ae Added db paths to c 1d8aa29 Gcc 7 ParsedInternalKey replace memset with clear function. a4c42e8 Fix UBSAN issue of passing nullptr to memcmp 16e0388 LRUCacheShard cache line size alignment 216644c enable UBSAN macro in TARGETS e67b35c Add Iterator::Refresh() a34b2e3 Fix caching of compaction picker's next index 72502cf Revert "comment out unused parameters" 1d7048c comment out unused parameters 534c255 Cassandra compaction filter for purge expired columns and rows 63163a8 Remove make_new_version.sh 0302da4 Reduce blob db noisy logging 3e5ea29 Fix Flaky DeleteSchedulerTest::ImmediateDeleteOn25PercDBSize a22b9cc overlapping endpoint fixes in level compaction picker ffd2a2e delete ExpandInputsToCleanCut failure log 3e6e863 Remove arcanist_util directory 36651d1 Moving static AdaptationContext to outside function 6e3ee01 Update java/rocksjni.pom ecff9d5 Include write_buffer_manager in ImmutableDBOptions::Dump ae28634 Remove some left-over BSD headers 33b1de8 Remove format compatibility hack 2f37515 checkout local branch in check_format_compatible.sh ddb22ac avoid collision with master branch in check 
format 0c03a7f set the remote for git checkout 7ac184c Revert cmake -DNDEBUG for non-MSVC 0655b58 enable PinnableSlice for RowCache 00464a3 Fix column_family_test with LITE build b2dd192 tools/write_stress.cc: Correct "1204" typos. cbaab30 table/block.h: change memset f1a056e CodeMod: Prefer ADD_FAILURE() over EXPECT_TRUE(false), et cetera 4a2e489 Add back the LevelDB license file a7321fc Remove the licensing description in CONTRIBUTING.md 3c327ac Change RocksDB License git-subtree-dir: librocksdb_sys/rocksdb git-subtree-split: 80134e957b9633d18ce61c251635ebb6eb1763f5 --- .deprecated_arcconfig | 17 - .gitignore | 1 - .travis.yml | 4 +- CMakeLists.txt | 326 +- HISTORY.md | 32 +- INSTALL.md | 2 + LANGUAGE-BINDINGS.md | 5 +- Makefile | 40 +- TARGETS | 39 +- USERS.md | 3 + appveyor.yml | 2 +- arcanist_util/__phutil_library_init__.php | 3 - arcanist_util/__phutil_library_map__.php | 71 - .../config/FacebookArcanistConfiguration.php | 43 - .../FacebookOldArcanistConfiguration.php | 43 - .../cpp_linter/ArcanistCpplintLinter.php | 88 - .../BaseDirectoryScopedFormatLinter.php | 74 - .../cpp_linter/FacebookHowtoevenLinter.php | 223 - .../cpp_linter/FbcodeClangFormatLinter.php | 58 - arcanist_util/cpp_linter/FbcodeCppLinter.php | 126 - arcanist_util/cpp_linter/cpplint.py | 4767 ----------------- .../lint_engine/FacebookFbcodeLintEngine.php | 138 - .../FacebookHowtoevenLintEngine.php | 27 - .../FacebookFbcodeUnitTestEngine.php | 17 - .../FacebookOldFbcodeUnitTestEngine.php | 17 - buckifier/rocks_test_runner.sh | 2 +- buckifier/targets_cfg.py | 12 +- .../RocksDBCommonHelper.php | 68 +- build_tools/build_detect_platform | 13 +- build_tools/cont_integration.sh | 4 +- build_tools/dockerbuild.sh | 2 +- build_tools/format-diff.sh | 2 +- build_tools/make_new_version.sh | 53 - build_tools/regression_build_test.sh | 2 +- build_tools/rocksdb-lego-determinator | 63 +- cache/clock_cache.cc | 5 + cache/lru_cache.cc | 51 +- cache/lru_cache.h | 54 +- cache/lru_cache_test.cc | 11 +- 
cmake/RocksDBConfig.cmake.in | 3 + coverage/coverage_test.sh | 2 +- db/builder.cc | 17 +- db/builder.h | 5 +- db/c.cc | 242 +- db/c_test.c | 64 + db/column_family.cc | 7 + db/column_family_test.cc | 2 + db/compaction.cc | 1 + db/compaction_iteration_stats.h | 2 + db/compaction_iterator.cc | 18 +- db/compaction_iterator_test.cc | 105 + db/compaction_job.cc | 26 +- db/compaction_picker.cc | 87 +- db/compaction_picker_test.cc | 119 +- db/compaction_picker_universal.cc | 1 + db/convenience.cc | 35 +- db/corruption_test.cc | 15 +- db/db_basic_test.cc | 61 +- db/db_blob_index_test.cc | 409 ++ db/db_block_cache_test.cc | 4 +- db/db_compaction_test.cc | 94 + db/db_encryption_test.cc | 8 +- db/db_impl.cc | 278 +- db/db_impl.h | 100 +- db/db_impl_compaction_flush.cc | 143 +- db/db_impl_debug.cc | 13 +- db/db_impl_files.cc | 3 + db/db_impl_readonly.cc | 4 +- db/db_impl_write.cc | 39 +- db/db_io_failure_test.cc | 2 +- db/db_iter.cc | 172 +- db/db_iter.h | 38 +- db/db_iter_test.cc | 138 +- db/db_iterator_test.cc | 151 +- db/db_properties_test.cc | 74 + db/db_range_del_test.cc | 68 + db/db_sst_test.cc | 12 + db/db_test.cc | 117 +- db/db_test2.cc | 39 +- db/db_test_util.cc | 24 +- db/db_test_util.h | 33 + db/db_universal_compaction_test.cc | 145 +- db/db_write_test.cc | 50 +- db/dbformat.cc | 2 +- db/dbformat.h | 13 +- db/experimental.cc | 12 +- db/external_sst_file_basic_test.cc | 44 +- db/external_sst_file_ingestion_job.cc | 99 +- db/external_sst_file_ingestion_job.h | 7 - db/flush_job.cc | 5 +- db/internal_stats.cc | 38 +- db/internal_stats.h | 2 + db/memtable.cc | 71 +- db/memtable.h | 46 +- db/memtable_list.cc | 38 +- db/memtable_list.h | 21 +- db/merge_helper.cc | 37 +- db/merge_helper.h | 18 +- db/range_del_aggregator.cc | 26 +- db/range_del_aggregator.h | 10 + db/range_del_aggregator_test.cc | 34 +- db/repair.cc | 22 +- db/repair_test.cc | 34 + db/snapshot_impl.h | 27 +- db/table_cache.cc | 37 +- db/version_builder.cc | 99 +- db/version_builder.h | 1 + db/version_set.cc | 
95 +- db/version_set.h | 8 +- db/wal_manager.cc | 5 +- db/write_batch.cc | 69 +- db/write_batch_internal.h | 3 + db/write_batch_test.cc | 18 +- db/write_callback_test.cc | 54 +- db/write_thread.cc | 60 +- db/write_thread.h | 2 +- docs/_docs/faq.md | 2 +- docs/_docs/getting-started.md | 2 +- ...17-05-12-partitioned-index-filter.markdown | 2 +- ...2017-07-25-rocksdb-5-6-1-released.markdown | 22 + docs/_posts/2017-08-24-pinnableslice.markdown | 37 + docs/_posts/2017-08-25-flushwal.markdown | 26 + env/env_encryption.cc | 8 +- env/env_posix.cc | 12 +- env/env_test.cc | 14 +- env/mock_env.cc | 4 +- examples/Makefile | 4 + examples/compaction_filter_example.cc | 6 +- examples/simple_example.cc | 27 + include/rocksdb/c.h | 97 + include/rocksdb/cleanable.h | 11 +- include/rocksdb/compaction_filter.h | 3 + include/rocksdb/comparator.h | 4 + include/rocksdb/convenience.h | 8 +- include/rocksdb/db.h | 20 + include/rocksdb/env.h | 4 +- include/rocksdb/env_encryption.h | 8 +- include/rocksdb/iterator.h | 7 + include/rocksdb/listener.h | 1 + include/rocksdb/merge_operator.h | 7 + include/rocksdb/options.h | 8 + include/rocksdb/perf_context.h | 5 + include/rocksdb/slice.h | 5 + include/rocksdb/statistics.h | 7 +- include/rocksdb/table.h | 8 +- include/rocksdb/table_properties.h | 5 +- include/rocksdb/utilities/debug.h | 6 +- include/rocksdb/utilities/sim_cache.h | 14 + include/rocksdb/utilities/stackable_db.h | 13 + include/rocksdb/utilities/transaction.h | 34 + include/rocksdb/utilities/transaction_db.h | 41 + .../utilities/write_batch_with_index.h | 10 + include/rocksdb/version.h | 4 +- include/rocksdb/write_batch.h | 6 + java/CMakeLists.txt | 4 + java/Makefile | 1 + .../rocksjni/cassandra_compactionfilterjni.cc | 22 + java/rocksjni/cassandra_value_operator.cc | 6 +- java/rocksjni/sst_file_writerjni.cc | 116 +- java/rocksjni/statisticsjni.cc | 6 +- java/rocksjni/statisticsjni.h | 6 +- java/rocksjni/write_batch_test.cc | 2 +- java/samples/src/main/java/RocksDBSample.java | 13 +- 
.../rocksdb/CassandraCompactionFilter.java | 18 + .../rocksdb/CassandraValueMergeOperator.java | 4 +- .../main/java/org/rocksdb/SstFileWriter.java | 50 + .../src/main/java/org/rocksdb/StatsLevel.java | 6 +- .../java/org/rocksdb/SstFileWriterTest.java | 26 +- .../test/java/org/rocksdb/StatisticsTest.java | 6 +- memtable/inlineskiplist_test.cc | 1 + memtable/skiplist_test.cc | 1 + monitoring/file_read_sample.h | 8 +- monitoring/histogram.cc | 78 +- monitoring/histogram.h | 10 +- monitoring/histogram_test.cc | 26 +- monitoring/histogram_windowing.cc | 5 +- monitoring/perf_context.cc | 6 + options/cf_options.h | 2 +- options/db_options.cc | 2 + options/options_helper.cc | 315 +- options/options_helper.h | 118 +- options/options_parser.cc | 59 +- options/options_parser.h | 10 +- options/options_test.cc | 24 +- port/port_posix.cc | 19 + port/port_posix.h | 7 + port/win/env_win.cc | 10 +- port/win/port_win.cc | 77 - port/win/port_win.h | 36 + port/win/win_jemalloc.cc | 47 + port/win/xpress_win.cc | 59 +- src.mk | 25 +- table/adaptive_table_factory.cc | 3 +- table/block.h | 3 +- table/block_based_table_builder.cc | 26 +- table/block_based_table_builder.h | 3 +- table/block_based_table_factory.cc | 196 +- table/block_based_table_factory.h | 94 +- table/block_based_table_reader.cc | 449 +- table/block_based_table_reader.h | 43 +- table/cuckoo_table_builder_test.cc | 6 +- table/cuckoo_table_factory.h | 5 + table/cuckoo_table_reader.cc | 3 +- table/cuckoo_table_reader.h | 1 - table/filter_block.h | 5 +- table/format.cc | 193 +- table/format.h | 12 +- table/full_filter_bits_builder.h | 6 +- table/get_context.cc | 33 +- table/get_context.h | 10 +- table/iterator.cc | 13 + table/meta_blocks.cc | 47 +- table/meta_blocks.h | 8 +- table/mock_table.cc | 3 +- table/mock_table.h | 1 - table/partitioned_filter_block.cc | 145 +- table/partitioned_filter_block.h | 16 +- table/partitioned_filter_block_test.cc | 9 +- table/plain_table_factory.cc | 144 +- table/plain_table_factory.h | 32 + 
table/plain_table_reader.cc | 14 +- table/plain_table_reader.h | 3 +- table/table_builder.h | 6 +- table/table_properties.cc | 5 + table/table_reader.h | 6 +- table/table_test.cc | 2 +- thirdparty.inc | 27 +- tools/benchmark.sh | 2 +- tools/benchmark_leveldb.sh | 2 +- tools/check_format_compatible.sh | 9 +- tools/db_bench_tool.cc | 90 +- tools/db_crashtest.py | 1 + tools/db_stress.cc | 91 +- tools/dbench_monitor | 2 +- tools/generate_random_db.sh | 2 +- tools/ldb_cmd.cc | 7 +- tools/pflag | 2 +- tools/rdb/rdb | 2 +- tools/regression_test.sh | 2 +- tools/run_flash_bench.sh | 2 +- tools/run_leveldb.sh | 2 +- tools/sst_dump_test.cc | 2 +- tools/sst_dump_tool.cc | 114 +- tools/sst_dump_tool_imp.h | 6 +- tools/verify_random_db.sh | 2 +- tools/write_stress.cc | 4 +- util/arena.h | 4 + util/arena_test.cc | 6 + util/cast_util.h | 21 + util/concurrent_arena.h | 15 + util/delete_scheduler_test.cc | 3 +- util/file_reader_writer.cc | 28 + util/file_reader_writer.h | 11 + util/file_reader_writer_test.cc | 8 +- util/murmurhash.cc | 4 +- util/thread_local_test.cc | 2 +- util/threadpool_imp.cc | 6 +- utilities/backupable/backupable_db_test.cc | 2 +- utilities/blob_db/blob_compaction_filter.h | 78 + utilities/blob_db/blob_db.cc | 132 +- utilities/blob_db/blob_db.h | 136 +- utilities/blob_db/blob_db_impl.cc | 1888 ++++--- utilities/blob_db/blob_db_impl.h | 429 +- utilities/blob_db/blob_db_iterator.h | 104 + utilities/blob_db/blob_db_options_impl.cc | 69 - utilities/blob_db/blob_db_options_impl.h | 76 - utilities/blob_db/blob_db_test.cc | 916 +++- utilities/blob_db/blob_dump_tool.cc | 90 +- utilities/blob_db/blob_file.cc | 105 +- utilities/blob_db/blob_file.h | 216 + utilities/blob_db/blob_index.h | 161 + utilities/blob_db/blob_log_format.cc | 387 +- utilities/blob_db/blob_log_format.h | 309 +- utilities/blob_db/blob_log_reader.cc | 175 +- utilities/blob_db/blob_log_reader.h | 29 +- utilities/blob_db/blob_log_writer.cc | 99 +- utilities/blob_db/blob_log_writer.h | 33 +- 
.../cassandra/cassandra_compaction_filter.cc | 47 + .../cassandra/cassandra_compaction_filter.h | 39 + .../cassandra/cassandra_format_test.cc | 71 +- .../cassandra/cassandra_functional_test.cc | 251 + .../cassandra/cassandra_row_merge_test.cc | 6 +- .../cassandra/cassandra_serialize_test.cc | 4 +- .../{merge_operators => }/cassandra/format.cc | 104 +- .../{merge_operators => }/cassandra/format.h | 61 +- .../cassandra/merge_operator.cc | 27 +- .../cassandra/merge_operator.h | 12 +- .../cassandra/serialize.h | 4 +- .../cassandra/test_utils.cc | 22 +- .../cassandra/test_utils.h | 14 +- utilities/checkpoint/checkpoint_test.cc | 2 +- utilities/column_aware_encoding_util.cc | 4 +- utilities/date_tiered/date_tiered_db_impl.cc | 4 +- utilities/lua/rocks_lua_test.cc | 2 +- .../cassandra/cassandra_merge_test.cc | 134 - utilities/merge_operators/max.cc | 2 + utilities/options/options_util_test.cc | 26 +- .../persistent_cache/block_cache_tier.cc | 2 +- .../persistent_cache/persistent_cache_test.cc | 4 +- utilities/simulator_cache/sim_cache.cc | 157 + utilities/simulator_cache/sim_cache_test.cc | 71 + ...tion_impl.cc => optimistic_transaction.cc} | 45 +- ...action_impl.h => optimistic_transaction.h} | 22 +- .../optimistic_transaction_db_impl.cc | 8 +- ...ion_impl.cc => pessimistic_transaction.cc} | 161 +- ...ction_impl.h => pessimistic_transaction.h} | 85 +- .../pessimistic_transaction_db.cc | 806 +++ .../transactions/pessimistic_transaction_db.h | 316 ++ utilities/transactions/transaction_base.cc | 36 +- utilities/transactions/transaction_base.h | 9 + utilities/transactions/transaction_db_impl.cc | 466 -- utilities/transactions/transaction_db_impl.h | 127 - .../transactions/transaction_lock_mgr.cc | 127 +- utilities/transactions/transaction_lock_mgr.h | 55 +- utilities/transactions/transaction_test.cc | 338 +- utilities/transactions/write_prepared_txn.cc | 88 + utilities/transactions/write_prepared_txn.h | 76 + utilities/ttl/ttl_test.cc | 12 +- .../write_batch_with_index.cc | 
56 +- 319 files changed, 12096 insertions(+), 11533 deletions(-) delete mode 100644 .deprecated_arcconfig delete mode 100644 arcanist_util/__phutil_library_init__.php delete mode 100644 arcanist_util/__phutil_library_map__.php delete mode 100644 arcanist_util/config/FacebookArcanistConfiguration.php delete mode 100644 arcanist_util/config/FacebookOldArcanistConfiguration.php delete mode 100644 arcanist_util/cpp_linter/ArcanistCpplintLinter.php delete mode 100644 arcanist_util/cpp_linter/BaseDirectoryScopedFormatLinter.php delete mode 100644 arcanist_util/cpp_linter/FacebookHowtoevenLinter.php delete mode 100644 arcanist_util/cpp_linter/FbcodeClangFormatLinter.php delete mode 100644 arcanist_util/cpp_linter/FbcodeCppLinter.php delete mode 100755 arcanist_util/cpp_linter/cpplint.py delete mode 100644 arcanist_util/lint_engine/FacebookFbcodeLintEngine.php delete mode 100644 arcanist_util/lint_engine/FacebookHowtoevenLintEngine.php delete mode 100644 arcanist_util/unit_engine/FacebookFbcodeUnitTestEngine.php delete mode 100644 arcanist_util/unit_engine/FacebookOldFbcodeUnitTestEngine.php rename {arcanist_util/config => build_tools}/RocksDBCommonHelper.php (85%) delete mode 100755 build_tools/make_new_version.sh create mode 100644 cmake/RocksDBConfig.cmake.in create mode 100644 db/db_blob_index_test.cc create mode 100644 docs/_posts/2017-07-25-rocksdb-5-6-1-released.markdown create mode 100644 docs/_posts/2017-08-24-pinnableslice.markdown create mode 100644 docs/_posts/2017-08-25-flushwal.markdown create mode 100644 java/rocksjni/cassandra_compactionfilterjni.cc create mode 100644 java/src/main/java/org/rocksdb/CassandraCompactionFilter.java create mode 100644 port/win/win_jemalloc.cc create mode 100644 util/cast_util.h create mode 100644 utilities/blob_db/blob_compaction_filter.h create mode 100644 utilities/blob_db/blob_db_iterator.h delete mode 100644 utilities/blob_db/blob_db_options_impl.cc delete mode 100644 utilities/blob_db/blob_db_options_impl.h create mode 
100644 utilities/blob_db/blob_file.h create mode 100644 utilities/blob_db/blob_index.h create mode 100644 utilities/cassandra/cassandra_compaction_filter.cc create mode 100644 utilities/cassandra/cassandra_compaction_filter.h rename utilities/{merge_operators => }/cassandra/cassandra_format_test.cc (80%) create mode 100644 utilities/cassandra/cassandra_functional_test.cc rename utilities/{merge_operators => }/cassandra/cassandra_row_merge_test.cc (92%) rename utilities/{merge_operators => }/cassandra/cassandra_serialize_test.cc (96%) rename utilities/{merge_operators => }/cassandra/format.cc (75%) rename utilities/{merge_operators => }/cassandra/format.h (80%) rename utilities/{merge_operators => }/cassandra/merge_operator.cc (65%) rename utilities/{merge_operators => }/cassandra/merge_operator.h (68%) rename utilities/{merge_operators => }/cassandra/serialize.h (91%) rename utilities/{merge_operators => }/cassandra/test_utils.cc (77%) rename utilities/{merge_operators => }/cassandra/test_utils.h (68%) delete mode 100644 utilities/merge_operators/cassandra/cassandra_merge_test.cc rename utilities/transactions/{optimistic_transaction_impl.cc => optimistic_transaction.cc} (70%) rename utilities/transactions/{optimistic_transaction_impl.h => optimistic_transaction.h} (78%) rename utilities/transactions/{transaction_impl.cc => pessimistic_transaction.cc} (77%) rename utilities/transactions/{transaction_impl.h => pessimistic_transaction.h} (76%) create mode 100644 utilities/transactions/pessimistic_transaction_db.cc create mode 100644 utilities/transactions/pessimistic_transaction_db.h delete mode 100644 utilities/transactions/transaction_db_impl.cc delete mode 100644 utilities/transactions/transaction_db_impl.h create mode 100644 utilities/transactions/write_prepared_txn.cc create mode 100644 utilities/transactions/write_prepared_txn.h diff --git a/.deprecated_arcconfig b/.deprecated_arcconfig deleted file mode 100644 index 6cf07ffac..000000000 --- 
a/.deprecated_arcconfig +++ /dev/null @@ -1,17 +0,0 @@ -{ - "project_id" : "rocksdb", - "conduit_uri" : "https://phabricator.fb.com/api/", - "copyright_holder" : "Facebook", - "load" : [ - "arcanist_util" - ], - "lint.engine" : "FacebookFbcodeLintEngine", - "lint.engine.single.linter" : "FbcodeCppLinter", - "unit.engine" : "FacebookFbcodeUnitTestEngine", - "arcanist_configuration" : "FacebookArcanistConfiguration", - "base" : "git:HEAD^, hg:.^", - "git.default-relative-commit" : "HEAD^", - "git:arc.feature.start.default" : "origin/master", - "arc.feature.start.default" : "master", - "history.immutable" : false -} diff --git a/.gitignore b/.gitignore index 87d5b98a4..03b805983 100644 --- a/.gitignore +++ b/.gitignore @@ -39,7 +39,6 @@ coverage/COVERAGE_REPORT .gdbhistory .gdb_history package/ -.phutil_module_cache unity.a tags etags diff --git a/.travis.yml b/.travis.yml index 7e2bf115c..b76973d4e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -57,11 +57,11 @@ before_script: script: - ${CXX} --version - - if [ "${TEST_GROUP}" == 'platform_dependent' ]; then OPT=-DTRAVIS V=1 make -j4 all; OPT=-DTRAVIS V=1 ROCKSDBTESTS_END=db_block_cache_test make -j4 check_some; fi + - if [ "${TEST_GROUP}" == 'platform_dependent' ]; then ccache -C && OPT=-DTRAVIS V=1 ROCKSDBTESTS_END=db_block_cache_test make -j4 all_but_some_tests check_some; fi - if [ "${TEST_GROUP}" == '1' ]; then OPT=-DTRAVIS V=1 ROCKSDBTESTS_START=db_block_cache_test ROCKSDBTESTS_END=comparator_db_test make -j4 check_some; fi - if [ "${TEST_GROUP}" == '2' ]; then OPT=-DTRAVIS V=1 ROCKSDBTESTS_START=comparator_db_test make -j4 check_some; fi - if [ "${JOB_NAME}" == 'java_test' ]; then OPT=-DTRAVIS V=1 make clean jclean && make rocksdbjava jtest; fi - - if [ "${JOB_NAME}" == 'lite_build' ]; then OPT="-DTRAVIS -DROCKSDB_LITE" V=1 make -j4 static_lib; fi + - if [ "${JOB_NAME}" == 'lite_build' ]; then OPT="-DTRAVIS -DROCKSDB_LITE" V=1 make -j4 static_lib tools; fi - if [ "${JOB_NAME}" == 'examples' ]; then OPT=-DTRAVIS 
V=1 make -j4 static_lib; cd examples; make -j4; fi - if [ "${JOB_NAME}" == 'cmake' ]; then mkdir build && cd build && cmake .. && make -j4 rocksdb; fi - if [ "${JOB_NAME}" == 'cmake-mingw' ]; then mkdir build && cd build && cmake .. -DCMAKE_C_COMPILER=x86_64-w64-mingw32-gcc -DCMAKE_CXX_COMPILER=x86_64-w64-mingw32-g++ -DCMAKE_SYSTEM_NAME=Windows && make -j4 rocksdb; fi diff --git a/CMakeLists.txt b/CMakeLists.txt index 9d2f2b2b1..404e14a87 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -32,7 +32,7 @@ # 3. cmake .. # 4. make -j -cmake_minimum_required(VERSION 2.8.12) +cmake_minimum_required(VERSION 2.6) project(rocksdb) if(POLICY CMP0042) @@ -41,10 +41,10 @@ endif() list(APPEND CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake/modules/") +option(WITH_JEMALLOC "build with JeMalloc" OFF) if(MSVC) include(${CMAKE_CURRENT_SOURCE_DIR}/thirdparty.inc) else() - option(WITH_JEMALLOC "build with JeMalloc" OFF) if(CMAKE_SYSTEM_NAME MATCHES "FreeBSD") # FreeBSD has jemaloc as default malloc # but it does not have all the jemalloc files in include/... 
@@ -98,7 +98,17 @@ else() endif() endif() -string(TIMESTAMP GIT_DATE_TIME "%Y/%m/%d %H:%M:%S" UTC) +if(WIN32) + execute_process(COMMAND powershell -noprofile -Command "Get-Date -format MM_dd_yyyy" OUTPUT_VARIABLE DATE) + execute_process(COMMAND powershell -noprofile -Command "Get-Date -format HH:mm:ss" OUTPUT_VARIABLE TIME) + string(REGEX REPLACE "(..)_(..)_..(..).*" "\\1/\\2/\\3" DATE "${DATE}") + string(REGEX REPLACE "(..):(.....).*" " \\1:\\2" TIME "${TIME}") + set(GIT_DATE_TIME "${DATE} ${TIME}") +else() + execute_process(COMMAND date "+%Y/%m/%d %H:%M:%S" OUTPUT_VARIABLE DATETIME) + string(REGEX REPLACE "\n" "" DATETIME ${DATETIME}) + set(GIT_DATE_TIME "${DATETIME}") +endif() find_package(Git) @@ -138,6 +148,31 @@ if(WIN32 AND MSVC) endif() endif() +set(BUILD_VERSION_CC ${CMAKE_BINARY_DIR}/build_version.cc) +configure_file(util/build_version.cc.in ${BUILD_VERSION_CC} @ONLY) +add_library(build_version OBJECT ${BUILD_VERSION_CC}) +target_include_directories(build_version PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/util) +if(MSVC) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Zi /nologo /EHsc /GS /Gd /GR /GF /fp:precise /Zc:wchar_t /Zc:forScope /errorReport:queue") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /FC /d2Zi+ /W3 /wd4127 /wd4800 /wd4996 /wd4351") +else() + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -W -Wextra -Wall") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wsign-compare -Wshadow -Wno-unused-parameter -Wno-unused-variable -Woverloaded-virtual -Wnon-virtual-dtor -Wno-missing-field-initializers") + if(MINGW) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-format") + endif() + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") + if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2 -fno-omit-frame-pointer") + include(CheckCXXCompilerFlag) + CHECK_CXX_COMPILER_FLAG("-momit-leaf-frame-pointer" HAVE_OMIT_LEAF_FRAME_POINTER) + if(HAVE_OMIT_LEAF_FRAME_POINTER) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -momit-leaf-frame-pointer") + endif() 
+ endif() +endif() + option(PORTABLE "build a portable binary" OFF) option(FORCE_SSE42 "force building with SSE4.2, even when PORTABLE=ON" OFF) if(PORTABLE) @@ -184,32 +219,6 @@ if(HAVE_THREAD_LOCAL) add_definitions(-DROCKSDB_SUPPORT_THREAD_LOCAL) endif() -set(BUILD_VERSION_CC ${CMAKE_BINARY_DIR}/build_version.cc) -configure_file(util/build_version.cc.in ${BUILD_VERSION_CC} @ONLY) -add_library(build_version OBJECT ${BUILD_VERSION_CC}) -target_include_directories(build_version PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/util) -if(MSVC) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Zi /nologo /EHsc /GS /Gd /GR /GF /fp:precise /Zc:wchar_t /Zc:forScope /errorReport:queue") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /FC /d2Zi+ /W3 /wd4127 /wd4800 /wd4996 /wd4351") -else() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -W -Wextra -Wall") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wsign-compare -Wshadow -Wno-unused-parameter -Wno-unused-variable -Woverloaded-virtual -Wnon-virtual-dtor -Wno-missing-field-initializers") - if(MINGW) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-format") - endif() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") - if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug") - add_definitions(-DNDEBUG) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2 -fno-omit-frame-pointer") - include(CheckCXXCompilerFlag) - CHECK_CXX_COMPILER_FLAG("-momit-leaf-frame-pointer" HAVE_OMIT_LEAF_FRAME_POINTER) - if(HAVE_OMIT_LEAF_FRAME_POINTER) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -momit-leaf-frame-pointer") - endif() - endif() -endif() - option(FAIL_ON_WARNINGS "Treat compile warnings as errors" ON) if(FAIL_ON_WARNINGS) if(MSVC) @@ -319,6 +328,7 @@ endif() option(WITH_FALLOCATE "build with fallocate" ON) if(WITH_FALLOCATE) + set(CMAKE_REQUIRED_FLAGS ${CMAKE_C_FLAGS}) include(CheckCSourceCompiles) CHECK_C_SOURCE_COMPILES(" #include @@ -503,13 +513,15 @@ set(SOURCES utilities/backupable/backupable_db.cc utilities/blob_db/blob_db.cc utilities/blob_db/blob_db_impl.cc - 
utilities/blob_db/blob_db_options_impl.cc utilities/blob_db/blob_dump_tool.cc utilities/blob_db/blob_file.cc utilities/blob_db/blob_log_reader.cc utilities/blob_db/blob_log_writer.cc utilities/blob_db/blob_log_format.cc utilities/blob_db/ttl_extractor.cc + utilities/cassandra/cassandra_compaction_filter.cc + utilities/cassandra/format.cc + utilities/cassandra/merge_operator.cc utilities/checkpoint/checkpoint_impl.cc utilities/col_buf_decoder.cc utilities/col_buf_encoder.cc @@ -528,8 +540,6 @@ set(SOURCES utilities/memory/memory_util.cc utilities/merge_operators/max.cc utilities/merge_operators/put.cc - utilities/merge_operators/cassandra/format.cc - utilities/merge_operators/cassandra/merge_operator.cc utilities/merge_operators/string_append/stringappend.cc utilities/merge_operators/string_append/stringappend2.cc utilities/merge_operators/uint64add.cc @@ -545,13 +555,14 @@ set(SOURCES utilities/spatialdb/spatial_db.cc utilities/table_properties_collectors/compact_on_deletion_collector.cc utilities/transactions/optimistic_transaction_db_impl.cc - utilities/transactions/optimistic_transaction_impl.cc + utilities/transactions/optimistic_transaction.cc utilities/transactions/transaction_base.cc - utilities/transactions/transaction_db_impl.cc + utilities/transactions/pessimistic_transaction_db.cc utilities/transactions/transaction_db_mutex_impl.cc - utilities/transactions/transaction_impl.cc + utilities/transactions/pessimistic_transaction.cc utilities/transactions/transaction_lock_mgr.cc utilities/transactions/transaction_util.cc + utilities/transactions/write_prepared_txn.cc utilities/ttl/db_ttl_impl.cc utilities/write_batch_with_index/write_batch_with_index.cc utilities/write_batch_with_index/write_batch_with_index_internal.cc @@ -572,6 +583,12 @@ if(WIN32) port/win/win_logger.cc port/win/win_thread.cc port/win/xpress_win.cc) + +if(WITH_JEMALLOC) + list(APPEND SOURCES + port/win/win_jemalloc.cc) +endif() + else() list(APPEND SOURCES port/port_posix.cc @@ -633,7 
+650,72 @@ else() message(STATUS "JNI library is disabled") endif() -set(TESTS +# Installation and packaging +if(WIN32) + option(ROCKSDB_INSTALL_ON_WINDOWS "Enable install target on Windows" OFF) +endif() +if(NOT WIN32 OR ROCKSDB_INSTALL_ON_WINDOWS) + if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT) + if(${CMAKE_SYSTEM_NAME} STREQUAL "Linux") + # Change default installation prefix on Linux to /usr + set(CMAKE_INSTALL_PREFIX /usr CACHE PATH "Install path prefix, prepended onto install directories." FORCE) + endif() + endif() + + include(GNUInstallDirs) + include(CMakePackageConfigHelpers) + + set(package_config_destination ${CMAKE_INSTALL_LIBDIR}/cmake/rocksdb) + + configure_package_config_file( + ${CMAKE_SOURCE_DIR}/cmake/RocksDBConfig.cmake.in RocksDBConfig.cmake + INSTALL_DESTINATION ${package_config_destination} + ) + + write_basic_package_version_file( + RocksDBConfigVersion.cmake + VERSION ${ROCKSDB_VERSION} + COMPATIBILITY SameMajorVersion + ) + + install(DIRECTORY include/rocksdb COMPONENT devel DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}") + + install( + TARGETS ${ROCKSDB_STATIC_LIB} + EXPORT RocksDBTargets + COMPONENT devel + ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}" + INCLUDES DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}" + ) + + install( + TARGETS ${ROCKSDB_SHARED_LIB} + EXPORT RocksDBTargets + COMPONENT runtime + RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}" + LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}" + INCLUDES DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}" + ) + + install( + EXPORT RocksDBTargets + COMPONENT devel + DESTINATION ${package_config_destination} + NAMESPACE RocksDB:: + ) + + install( + FILES + ${CMAKE_CURRENT_BINARY_DIR}/RocksDBConfig.cmake + ${CMAKE_CURRENT_BINARY_DIR}/RocksDBConfigVersion.cmake + COMPONENT devel + DESTINATION ${package_config_destination} + ) +endif() + +option(WITH_TESTS "build with tests" ON) +if(WITH_TESTS) + set(TESTS cache/cache_test.cc cache/lru_cache_test.cc db/column_family_test.cc @@ -646,6 +728,7 @@ set(TESTS 
db/corruption_test.cc db/cuckoo_table_db_test.cc db/db_basic_test.cc + db/db_blob_index_test.cc db/db_block_cache_test.cc db/db_bloom_filter_test.cc db/db_compaction_filter_test.cc @@ -711,6 +794,7 @@ set(TESTS options/options_test.cc table/block_based_filter_block_test.cc table/block_test.cc + table/cleanable_test.cc table/cuckoo_table_builder_test.cc table/cuckoo_table_reader_test.cc table/full_filter_block_test.cc @@ -739,6 +823,10 @@ set(TESTS util/thread_local_test.cc utilities/backupable/backupable_db_test.cc utilities/blob_db/blob_db_test.cc + utilities/cassandra/cassandra_functional_test.cc + utilities/cassandra/cassandra_format_test.cc + utilities/cassandra/cassandra_row_merge_test.cc + utilities/cassandra/cassandra_serialize_test.cc utilities/checkpoint/checkpoint_test.cc utilities/column_aware_encoding_test.cc utilities/date_tiered/date_tiered_test.cc @@ -747,10 +835,6 @@ set(TESTS utilities/geodb/geodb_test.cc utilities/lua/rocks_lua_test.cc utilities/memory/memory_test.cc - utilities/merge_operators/cassandra/cassandra_merge_test.cc - utilities/merge_operators/cassandra/cassandra_format_test.cc - utilities/merge_operators/cassandra/cassandra_row_merge_test.cc - utilities/merge_operators/cassandra/cassandra_serialize_test.cc utilities/merge_operators/string_append/stringappend_test.cc utilities/object_registry_test.cc utilities/option_change_migration/option_change_migration_test.cc @@ -759,98 +843,94 @@ set(TESTS utilities/persistent_cache/persistent_cache_test.cc utilities/redis/redis_lists_test.cc utilities/spatialdb/spatial_db_test.cc + utilities/simulator_cache/sim_cache_test.cc utilities/table_properties_collectors/compact_on_deletion_collector_test.cc utilities/transactions/optimistic_transaction_test.cc utilities/transactions/transaction_test.cc utilities/ttl/ttl_test.cc utilities/write_batch_with_index/write_batch_with_index_test.cc -) -if(WITH_LIBRADOS) - list(APPEND TESTS utilities/env_librados_test.cc) -endif() - -set(BENCHMARKS - 
cache/cache_bench.cc - memtable/memtablerep_bench.cc - tools/db_bench.cc - table/table_reader_bench.cc - utilities/column_aware_encoding_exp.cc - utilities/persistent_cache/hash_table_bench.cc) -add_library(testharness OBJECT util/testharness.cc) -foreach(sourcefile ${BENCHMARKS}) - get_filename_component(exename ${sourcefile} NAME_WE) - add_executable(${exename}${ARTIFACT_SUFFIX} ${sourcefile} - $) - target_link_libraries(${exename}${ARTIFACT_SUFFIX} gtest ${LIBS}) -endforeach(sourcefile ${BENCHMARKS}) - -# For test util library that is build only in DEBUG mode -# and linked to tests. Add test only code that is not #ifdefed for Release here. -set(TESTUTIL_SOURCE - db/db_test_util.cc - monitoring/thread_status_updater_debug.cc - table/mock_table.cc - util/fault_injection_test_env.cc - utilities/merge_operators/cassandra/test_utils.cc -) -# test utilities are only build in debug -enable_testing() -add_custom_target(check COMMAND ${CMAKE_CTEST_COMMAND}) -set(TESTUTILLIB testutillib${ARTIFACT_SUFFIX}) -add_library(${TESTUTILLIB} STATIC ${TESTUTIL_SOURCE}) -if(MSVC) - set_target_properties(${TESTUTILLIB} PROPERTIES COMPILE_FLAGS "/Fd${CMAKE_CFG_INTDIR}/testutillib${ARTIFACT_SUFFIX}.pdb") -endif() -set_target_properties(${TESTUTILLIB} - PROPERTIES EXCLUDE_FROM_DEFAULT_BUILD_RELEASE 1 - EXCLUDE_FROM_DEFAULT_BUILD_MINRELEASE 1 - EXCLUDE_FROM_DEFAULT_BUILD_RELWITHDEBINFO 1 - ) - -# Tests are excluded from Release builds -set(TEST_EXES ${TESTS}) + ) + if(WITH_LIBRADOS) + list(APPEND TESTS utilities/env_librados_test.cc) + endif() -foreach(sourcefile ${TEST_EXES}) + set(BENCHMARKS + cache/cache_bench.cc + memtable/memtablerep_bench.cc + tools/db_bench.cc + table/table_reader_bench.cc + utilities/column_aware_encoding_exp.cc + utilities/persistent_cache/hash_table_bench.cc) + add_library(testharness OBJECT util/testharness.cc) + foreach(sourcefile ${BENCHMARKS}) get_filename_component(exename ${sourcefile} NAME_WE) add_executable(${exename}${ARTIFACT_SUFFIX} ${sourcefile} $) 
- set_target_properties(${exename}${ARTIFACT_SUFFIX} - PROPERTIES EXCLUDE_FROM_DEFAULT_BUILD_RELEASE 1 - EXCLUDE_FROM_DEFAULT_BUILD_MINRELEASE 1 - EXCLUDE_FROM_DEFAULT_BUILD_RELWITHDEBINFO 1 - ) - target_link_libraries(${exename}${ARTIFACT_SUFFIX} testutillib${ARTIFACT_SUFFIX} gtest ${LIBS}) - if(NOT "${exename}" MATCHES "db_sanity_test") + target_link_libraries(${exename}${ARTIFACT_SUFFIX} gtest ${LIBS}) + endforeach(sourcefile ${BENCHMARKS}) + + # For test util library that is build only in DEBUG mode + # and linked to tests. Add test only code that is not #ifdefed for Release here. + set(TESTUTIL_SOURCE + db/db_test_util.cc + monitoring/thread_status_updater_debug.cc + table/mock_table.cc + util/fault_injection_test_env.cc + utilities/cassandra/test_utils.cc + ) + # test utilities are only build in debug + enable_testing() + add_custom_target(check COMMAND ${CMAKE_CTEST_COMMAND}) + set(TESTUTILLIB testutillib${ARTIFACT_SUFFIX}) + add_library(${TESTUTILLIB} STATIC ${TESTUTIL_SOURCE}) + if(MSVC) + set_target_properties(${TESTUTILLIB} PROPERTIES COMPILE_FLAGS "/Fd${CMAKE_CFG_INTDIR}/testutillib${ARTIFACT_SUFFIX}.pdb") + endif() + set_target_properties(${TESTUTILLIB} + PROPERTIES EXCLUDE_FROM_DEFAULT_BUILD_RELEASE 1 + EXCLUDE_FROM_DEFAULT_BUILD_MINRELEASE 1 + EXCLUDE_FROM_DEFAULT_BUILD_RELWITHDEBINFO 1 + ) + + # Tests are excluded from Release builds + set(TEST_EXES ${TESTS}) + + foreach(sourcefile ${TEST_EXES}) + get_filename_component(exename ${sourcefile} NAME_WE) + add_executable(${exename}${ARTIFACT_SUFFIX} ${sourcefile} + $) + set_target_properties(${exename}${ARTIFACT_SUFFIX} + PROPERTIES EXCLUDE_FROM_DEFAULT_BUILD_RELEASE 1 + EXCLUDE_FROM_DEFAULT_BUILD_MINRELEASE 1 + EXCLUDE_FROM_DEFAULT_BUILD_RELWITHDEBINFO 1 + ) + target_link_libraries(${exename}${ARTIFACT_SUFFIX} testutillib${ARTIFACT_SUFFIX} gtest ${LIBS}) + if(NOT "${exename}" MATCHES "db_sanity_test") + add_test(NAME ${exename} COMMAND ${exename}${ARTIFACT_SUFFIX}) + add_dependencies(check 
${exename}${ARTIFACT_SUFFIX}) + endif() + endforeach(sourcefile ${TEST_EXES}) + + # C executables must link to a shared object + set(C_TESTS db/c_test.c) + set(C_TEST_EXES ${C_TESTS}) + + foreach(sourcefile ${C_TEST_EXES}) + string(REPLACE ".c" "" exename ${sourcefile}) + string(REGEX REPLACE "^((.+)/)+" "" exename ${exename}) + add_executable(${exename}${ARTIFACT_SUFFIX} ${sourcefile}) + set_target_properties(${exename}${ARTIFACT_SUFFIX} + PROPERTIES EXCLUDE_FROM_DEFAULT_BUILD_RELEASE 1 + EXCLUDE_FROM_DEFAULT_BUILD_MINRELEASE 1 + EXCLUDE_FROM_DEFAULT_BUILD_RELWITHDEBINFO 1 + ) + target_link_libraries(${exename}${ARTIFACT_SUFFIX} ${ROCKSDB_IMPORT_LIB} testutillib${ARTIFACT_SUFFIX}) add_test(NAME ${exename} COMMAND ${exename}${ARTIFACT_SUFFIX}) add_dependencies(check ${exename}${ARTIFACT_SUFFIX}) - endif() -endforeach(sourcefile ${TEST_EXES}) - -# C executables must link to a shared object -set(C_TESTS db/c_test.c) -set(C_TEST_EXES ${C_TESTS}) - -foreach(sourcefile ${C_TEST_EXES}) - string(REPLACE ".c" "" exename ${sourcefile}) - string(REGEX REPLACE "^((.+)/)+" "" exename ${exename}) - add_executable(${exename}${ARTIFACT_SUFFIX} ${sourcefile}) - set_target_properties(${exename}${ARTIFACT_SUFFIX} - PROPERTIES EXCLUDE_FROM_DEFAULT_BUILD_RELEASE 1 - EXCLUDE_FROM_DEFAULT_BUILD_MINRELEASE 1 - EXCLUDE_FROM_DEFAULT_BUILD_RELWITHDEBINFO 1 - ) - target_link_libraries(${exename}${ARTIFACT_SUFFIX} ${ROCKSDB_IMPORT_LIB} testutillib${ARTIFACT_SUFFIX}) - add_test(NAME ${exename} COMMAND ${exename}${ARTIFACT_SUFFIX}) - add_dependencies(check ${exename}${ARTIFACT_SUFFIX}) -endforeach(sourcefile ${C_TEST_EXES}) -add_subdirectory(tools) - -# Installation and packaging for Linux -if(NOT WIN32) -install(TARGETS ${ROCKSDB_STATIC_LIB} COMPONENT devel ARCHIVE DESTINATION lib64) -install(TARGETS ${ROCKSDB_SHARED_LIB} COMPONENT runtime DESTINATION lib64) -install(DIRECTORY "${PROJECT_SOURCE_DIR}/include/rocksdb/" - COMPONENT devel - DESTINATION include/rocksdb) -set(CMAKE_INSTALL_PREFIX 
/usr) + endforeach(sourcefile ${C_TEST_EXES}) +endif() + +option(WITH_TOOLS "build with tools" ON) +if(WITH_TOOLS) + add_subdirectory(tools) endif() diff --git a/HISTORY.md b/HISTORY.md index 2ddf01cb7..e528440ee 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,17 +1,35 @@ # Rocksdb Change Log -## 5.7.3 (08/29/2017) +## 5.8.7 (11/28/2017) ### Bug Fixes -* Fix transient reappearance of keys covered by range deletions when memtable prefix bloom filter is enabled. -* Fix potentially wrong file smallest key when range deletions separated by snapshot are written together. +* Fix IOError on WAL write doesn't propagate to write group follower -## 5.7.2 (08/15/2017) +## 5.8.6 (11/20/2017) ### Bug Fixes -* Fix incorrect dropping of deletions issue with FIFO compaction. -* Fix LITE build compiler error with missing abort(). +* Fixed aligned_alloc issues with Windows. + +## 5.8.1 (10/23/2017) +### New Features +* Add a new db property "rocksdb.estimate-oldest-key-time" to return oldest data timestamp. The property is available only for FIFO compaction with compaction_options_fifo.allow_compaction = false. + +## 5.8.0 (08/30/2017) +### Public API Change +* Users of `Statistics::getHistogramString()` will see fewer histogram buckets and different bucket endpoints. +* `Slice::compare` and BytewiseComparator `Compare` no longer accept `Slice`s containing nullptr. +* `Transaction::Get` and `Transaction::GetForUpdate` variants with `PinnableSlice` added. + +### New Features +* Add Iterator::Refresh(), which allows users to update the iterator state so that they can avoid some initialization costs of recreating iterators. +* Replace dynamic_cast<> (except unit test) so people can choose to build with RTTI off. With make, release mode is by default built with -fno-rtti and debug mode is built without it. Users can override it by setting USE_RTTI=0 or 1. +* Universal compactions including the bottom level can be executed in a dedicated thread pool. 
This alleviates head-of-line blocking in the compaction queue, which cause write stalling, particularly in multi-instance use cases. Users can enable this feature via `Env::SetBackgroundThreads(N, Env::Priority::BOTTOM)`, where `N > 0`. +* Allow merge operator to be called even with a single merge operand during compactions, by appropriately overriding `MergeOperator::AllowSingleOperand`. +* Add `DB::VerifyChecksum()`, which verifies the checksums in all SST files in a running DB. +* Block-based table support for disabling checksums by setting `BlockBasedTableOptions::checksum = kNoChecksum`. -## 5.7.1 (08/13/2017) ### Bug Fixes +* Fix wrong latencies in `rocksdb.db.get.micros`, `rocksdb.db.write.micros`, and `rocksdb.sst.read.micros`. * Fix incorrect dropping of deletions during intra-L0 compaction. +* Fix transient reappearance of keys covered by range deletions when memtable prefix bloom filter is enabled. +* Fix potentially wrong file smallest key when range deletions separated by snapshot are written together. ## 5.7.0 (07/13/2017) ### Public API Change diff --git a/INSTALL.md b/INSTALL.md index 820293a57..04f0eb279 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -113,6 +113,8 @@ to build a portable binary, add `PORTABLE=1` before your make commands, like thi * **Windows**: * For building with MS Visual Studio 13 you will need Update 4 installed. * Read and follow the instructions at CMakeLists.txt + * Or install via [vcpkg](https://github.com/microsoft/vcpkg) + * run `vcpkg install rocksdb` * **AIX 6.1** * Install AIX Toolbox rpms with gcc diff --git a/LANGUAGE-BINDINGS.md b/LANGUAGE-BINDINGS.md index d28035bf8..ffeed98f2 100644 --- a/LANGUAGE-BINDINGS.md +++ b/LANGUAGE-BINDINGS.md @@ -9,5 +9,8 @@ This is the list of all known third-party language bindings for RocksDB. 
If some * Haskell - https://hackage.haskell.org/package/rocksdb-haskell * PHP - https://github.com/Photonios/rocksdb-php * C# - https://github.com/warrenfalk/rocksdb-sharp -* Rust - https://github.com/spacejam/rust-rocksdb +* Rust + * https://github.com/spacejam/rust-rocksdb + * https://github.com/bh1xuw/rust-rocks * D programming language - https://github.com/b1naryth1ef/rocksdb +* Erlang - https://gitlab.com/barrel-db/erlang-rocksdb diff --git a/Makefile b/Makefile index 1b273224b..5a89f6bf7 100644 --- a/Makefile +++ b/Makefile @@ -101,7 +101,19 @@ endif ifeq ($(DEBUG_LEVEL),0) OPT += -DNDEBUG DISABLE_WARNING_AS_ERROR=1 + +ifneq ($(USE_RTTI), 1) + CXXFLAGS += -fno-rtti +else + CXXFLAGS += -DROCKSDB_USE_RTTI +endif else +ifneq ($(USE_RTTI), 0) + CXXFLAGS += -DROCKSDB_USE_RTTI +else + CXXFLAGS += -fno-rtti +endif + $(warning Warning: Compiling in debug mode. Don't use the resulting binary in production) endif @@ -220,6 +232,10 @@ ifndef DISABLE_JEMALLOC PLATFORM_CXXFLAGS += -DROCKSDB_JEMALLOC -DJEMALLOC_NO_DEMANGLE PLATFORM_CCFLAGS += -DROCKSDB_JEMALLOC -DJEMALLOC_NO_DEMANGLE endif + ifdef WITH_JEMALLOC_FLAG + PLATFORM_LDFLAGS += -ljemalloc + JAVA_LDFLAGS += -ljemalloc + endif EXEC_LDFLAGS := $(JEMALLOC_LIB) $(EXEC_LDFLAGS) PLATFORM_CXXFLAGS += $(JEMALLOC_INCLUDE) PLATFORM_CCFLAGS += $(JEMALLOC_INCLUDE) @@ -344,6 +360,7 @@ TESTS = \ db_wal_test \ db_block_cache_test \ db_test \ + db_blob_index_test \ db_bloom_filter_test \ db_iter_test \ db_log_iter_test \ @@ -405,7 +422,7 @@ TESTS = \ write_buffer_manager_test \ stringappend_test \ cassandra_format_test \ - cassandra_merge_test \ + cassandra_functional_test \ cassandra_row_merge_test \ cassandra_serialize_test \ ttl_test \ @@ -575,6 +592,8 @@ endif # PLATFORM_SHARED_EXT all: $(LIBRARY) $(BENCHMARKS) tools tools_lib test_libs $(TESTS) +all_but_some_tests: $(LIBRARY) $(BENCHMARKS) tools tools_lib test_libs $(SUBSET) + static_lib: $(LIBRARY) shared_lib: $(SHARED) @@ -778,8 +797,8 @@ ldb_tests: ldb crash_test: 
whitebox_crash_test blackbox_crash_test blackbox_crash_test: db_stress - python -u tools/db_crashtest.py --simple blackbox - python -u tools/db_crashtest.py blackbox + python -u tools/db_crashtest.py --simple blackbox $(CRASH_TEST_EXT_ARGS) + python -u tools/db_crashtest.py blackbox $(CRASH_TEST_EXT_ARGS) ifeq ($(CRASH_TEST_KILL_ODD),) CRASH_TEST_KILL_ODD=888887 @@ -787,9 +806,9 @@ endif whitebox_crash_test: db_stress python -u tools/db_crashtest.py --simple whitebox --random_kill_odd \ - $(CRASH_TEST_KILL_ODD) + $(CRASH_TEST_KILL_ODD) $(CRASH_TEST_EXT_ARGS) python -u tools/db_crashtest.py whitebox --random_kill_odd \ - $(CRASH_TEST_KILL_ODD) + $(CRASH_TEST_KILL_ODD) $(CRASH_TEST_EXT_ARGS) asan_check: $(MAKE) clean @@ -1000,16 +1019,16 @@ option_change_migration_test: utilities/option_change_migration/option_change_mi stringappend_test: utilities/merge_operators/string_append/stringappend_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -cassandra_format_test: utilities/merge_operators/cassandra/cassandra_format_test.o $(LIBOBJECTS) $(TESTHARNESS) +cassandra_format_test: utilities/cassandra/cassandra_format_test.o utilities/cassandra/test_utils.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -cassandra_merge_test: utilities/merge_operators/cassandra/cassandra_merge_test.o utilities/merge_operators/cassandra/test_utils.o $(LIBOBJECTS) $(TESTHARNESS) +cassandra_functional_test: utilities/cassandra/cassandra_functional_test.o utilities/cassandra/test_utils.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -cassandra_row_merge_test: utilities/merge_operators/cassandra/cassandra_row_merge_test.o utilities/merge_operators/cassandra/test_utils.o $(LIBOBJECTS) $(TESTHARNESS) +cassandra_row_merge_test: utilities/cassandra/cassandra_row_merge_test.o utilities/cassandra/test_utils.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -cassandra_serialize_test: utilities/merge_operators/cassandra/cassandra_serialize_test.o $(LIBOBJECTS) $(TESTHARNESS) +cassandra_serialize_test: 
utilities/cassandra/cassandra_serialize_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) redis_test: utilities/redis/redis_lists_test.o $(LIBOBJECTS) $(TESTHARNESS) @@ -1045,6 +1064,9 @@ db_test: db/db_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) db_test2: db/db_test2.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) +db_blob_index_test: db/db_blob_index_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + db_block_cache_test: db/db_block_cache_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) diff --git a/TARGETS b/TARGETS index de64bf5f7..ac85eab93 100644 --- a/TARGETS +++ b/TARGETS @@ -6,8 +6,6 @@ REPO_PATH = TARGETS_PATH[(TARGETS_PATH.find('fbcode/') + len('fbcode/')):] + "/" BUCK_BINS = "buck-out/gen/" + REPO_PATH TEST_RUNNER = REPO_PATH + "buckifier/rocks_test_runner.sh" rocksdb_compiler_flags = [ - "-msse", - "-msse4.2", "-fno-builtin-memcmp", "-DROCKSDB_PLATFORM_POSIX", "-DROCKSDB_LIB_IO_POSIX", @@ -16,8 +14,8 @@ rocksdb_compiler_flags = [ "-DROCKSDB_RANGESYNC_PRESENT", "-DROCKSDB_SCHED_GETCPU_PRESENT", "-DROCKSDB_SUPPORT_THREAD_LOCAL", - "-DHAVE_SSE42", "-DOS_LINUX", + "-DROCKSDB_UBSAN_RUN", # Flags to enable libs we include "-DSNAPPY", "-DZLIB", @@ -49,6 +47,10 @@ rocksdb_preprocessor_flags = [ "-I" + REPO_PATH, ] +rocksdb_arch_preprocessor_flags = { + "x86_64": ["-DHAVE_SSE42"], +} + cpp_library( name = "rocksdb_lib", headers = AutoHeaders.RECURSIVE_GLOB, @@ -207,12 +209,14 @@ cpp_library( "utilities/backupable/backupable_db.cc", "utilities/blob_db/blob_db.cc", "utilities/blob_db/blob_db_impl.cc", - "utilities/blob_db/blob_db_options_impl.cc", "utilities/blob_db/blob_file.cc", "utilities/blob_db/blob_log_reader.cc", "utilities/blob_db/blob_log_writer.cc", "utilities/blob_db/blob_log_format.cc", "utilities/blob_db/ttl_extractor.cc", + "utilities/cassandra/cassandra_compaction_filter.cc", + "utilities/cassandra/format.cc", + "utilities/cassandra/merge_operator.cc", "utilities/checkpoint/checkpoint_impl.cc", 
"utilities/compaction_filters/remove_emptyvalue_compactionfilter.cc", "utilities/convenience/info_log_finder.cc", @@ -227,8 +231,6 @@ cpp_library( "utilities/leveldb_options/leveldb_options.cc", "utilities/lua/rocks_lua_compaction_filter.cc", "utilities/memory/memory_util.cc", - "utilities/merge_operators/cassandra/format.cc", - "utilities/merge_operators/cassandra/merge_operator.cc", "utilities/merge_operators/max.cc", "utilities/merge_operators/put.cc", "utilities/merge_operators/string_append/stringappend.cc", @@ -246,13 +248,14 @@ cpp_library( "utilities/spatialdb/spatial_db.cc", "utilities/table_properties_collectors/compact_on_deletion_collector.cc", "utilities/transactions/optimistic_transaction_db_impl.cc", - "utilities/transactions/optimistic_transaction_impl.cc", + "utilities/transactions/optimistic_transaction.cc", "utilities/transactions/transaction_base.cc", - "utilities/transactions/transaction_db_impl.cc", + "utilities/transactions/pessimistic_transaction_db.cc", "utilities/transactions/transaction_db_mutex_impl.cc", - "utilities/transactions/transaction_impl.cc", + "utilities/transactions/pessimistic_transaction.cc", "utilities/transactions/transaction_lock_mgr.cc", "utilities/transactions/transaction_util.cc", + "utilities/transactions/write_prepared_txn.cc", "utilities/ttl/db_ttl_impl.cc", "utilities/write_batch_with_index/write_batch_with_index.cc", "utilities/write_batch_with_index/write_batch_with_index_internal.cc", @@ -263,6 +266,7 @@ cpp_library( ], deps = [], preprocessor_flags = rocksdb_preprocessor_flags, + arch_preprocessor_flags = rocksdb_arch_preprocessor_flags, compiler_flags = rocksdb_compiler_flags, external_deps = rocksdb_external_deps, ) @@ -276,13 +280,14 @@ cpp_library( "util/testharness.cc", "util/testutil.cc", "db/db_test_util.cc", - "utilities/merge_operators/cassandra/test_utils.cc", + "utilities/cassandra/test_utils.cc", "utilities/col_buf_encoder.cc", "utilities/col_buf_decoder.cc", 
"utilities/column_aware_encoding_util.cc", ], deps = [":rocksdb_lib"], preprocessor_flags = rocksdb_preprocessor_flags, + arch_preprocessor_flags = rocksdb_arch_preprocessor_flags, compiler_flags = rocksdb_compiler_flags, external_deps = rocksdb_external_deps, ) @@ -296,6 +301,7 @@ cpp_library( ], deps = [":rocksdb_lib"], preprocessor_flags = rocksdb_preprocessor_flags, + arch_preprocessor_flags = rocksdb_arch_preprocessor_flags, compiler_flags = rocksdb_compiler_flags, external_deps = rocksdb_external_deps, ) @@ -306,6 +312,7 @@ cpp_library( srcs = ["env/env_basic_test.cc"], deps = [":rocksdb_test_lib"], preprocessor_flags = rocksdb_preprocessor_flags, + arch_preprocessor_flags = rocksdb_arch_preprocessor_flags, compiler_flags = rocksdb_compiler_flags, external_deps = rocksdb_external_deps, ) @@ -326,16 +333,16 @@ ROCKS_TESTS = [['arena_test', 'util/arena_test.cc', 'serial'], ['c_test', 'db/c_test.c', 'serial'], ['cache_test', 'cache/cache_test.cc', 'serial'], ['cassandra_format_test', - 'utilities/merge_operators/cassandra/cassandra_format_test.cc', + 'utilities/cassandra/cassandra_format_test.cc', 'serial'], - ['cassandra_merge_test', - 'utilities/merge_operators/cassandra/cassandra_merge_test.cc', + ['cassandra_functional_test', + 'utilities/cassandra/cassandra_functional_test.cc', 'serial'], ['cassandra_row_merge_test', - 'utilities/merge_operators/cassandra/cassandra_row_merge_test.cc', + 'utilities/cassandra/cassandra_row_merge_test.cc', 'serial'], ['cassandra_serialize_test', - 'utilities/merge_operators/cassandra/cassandra_serialize_test.cc', + 'utilities/cassandra/cassandra_serialize_test.cc', 'serial'], ['checkpoint_test', 'utilities/checkpoint/checkpoint_test.cc', 'serial'], ['cleanable_test', 'table/cleanable_test.cc', 'serial'], @@ -360,6 +367,7 @@ ROCKS_TESTS = [['arena_test', 'util/arena_test.cc', 'serial'], ['cuckoo_table_reader_test', 'table/cuckoo_table_reader_test.cc', 'serial'], ['date_tiered_test', 'utilities/date_tiered/date_tiered_test.cc', 
'serial'], ['db_basic_test', 'db/db_basic_test.cc', 'serial'], + ['db_blob_index_test', 'db/db_blob_index_test.cc', 'serial'], ['db_block_cache_test', 'db/db_block_cache_test.cc', 'serial'], ['db_bloom_filter_test', 'db/db_bloom_filter_test.cc', 'serial'], ['db_compaction_filter_test', 'db/db_compaction_filter_test.cc', 'parallel'], @@ -501,6 +509,7 @@ for test_cfg in ROCKS_TESTS: srcs = [test_cc], deps = [":rocksdb_test_lib"], preprocessor_flags = rocksdb_preprocessor_flags, + arch_preprocessor_flags = rocksdb_arch_preprocessor_flags, compiler_flags = rocksdb_compiler_flags, external_deps = rocksdb_external_deps, ) diff --git a/USERS.md b/USERS.md index 37d33b436..7be093f95 100644 --- a/USERS.md +++ b/USERS.md @@ -80,3 +80,6 @@ quasardb uses a heavily tuned RocksDB as its persistence layer. ## 360 Pika [360](http://www.360.cn/) [Pika](https://github.com/Qihoo360/pika) is a nosql compatible with redis. With the huge amount of data stored, redis may suffer for a capacity bottleneck, and pika was born for solving it. It has widely been widely used in many company + +## LzLabs +LzLabs is using RocksDB as a storage engine in their multi-database distributed framework to store application configuration and user data. diff --git a/appveyor.yml b/appveyor.yml index f582bb195..be9b66b45 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -3,7 +3,7 @@ image: Visual Studio 2015 before_build: - md %APPVEYOR_BUILD_FOLDER%\build - cd %APPVEYOR_BUILD_FOLDER%\build -- cmake -G "Visual Studio 14 2015 Win64" -DOPTDBG=1 -DXPRESS=1 .. +- cmake -G "Visual Studio 14 2015 Win64" -DOPTDBG=1 -DXPRESS=1 -DPORTABLE=1 .. - cd .. 
build: project: build\rocksdb.sln diff --git a/arcanist_util/__phutil_library_init__.php b/arcanist_util/__phutil_library_init__.php deleted file mode 100644 index bc732cad6..000000000 --- a/arcanist_util/__phutil_library_init__.php +++ /dev/null @@ -1,3 +0,0 @@ - 2, - 'class' => - array( - 'ArcanistCpplintLinter' => 'cpp_linter/ArcanistCpplintLinter.php', - 'BaseDirectoryScopedFormatLinter' => 'cpp_linter/BaseDirectoryScopedFormatLinter.php', - 'FacebookArcanistConfiguration' => 'config/FacebookArcanistConfiguration.php', - 'FacebookFbcodeLintEngine' => 'lint_engine/FacebookFbcodeLintEngine.php', - 'FacebookFbcodeUnitTestEngine' => 'unit_engine/FacebookFbcodeUnitTestEngine.php', - 'FacebookHowtoevenLintEngine' => 'lint_engine/FacebookHowtoevenLintEngine.php', - 'FacebookHowtoevenLinter' => 'cpp_linter/FacebookHowtoevenLinter.php', - 'FbcodeClangFormatLinter' => 'cpp_linter/FbcodeClangFormatLinter.php', - 'FbcodeCppLinter' => 'cpp_linter/FbcodeCppLinter.php', - ), - 'function' => - array( - ), - 'xmap' => - array( - 'ArcanistCpplintLinter' => 'ArcanistLinter', - 'BaseDirectoryScopedFormatLinter' => 'ArcanistLinter', - 'FacebookArcanistConfiguration' => 'ArcanistConfiguration', - 'FacebookFbcodeLintEngine' => 'ArcanistLintEngine', - 'FacebookFbcodeUnitTestEngine' => 'ArcanistBaseUnitTestEngine', - 'FacebookHowtoevenLintEngine' => 'ArcanistLintEngine', - 'FacebookHowtoevenLinter' => 'ArcanistLinter', - 'FbcodeClangFormatLinter' => 'BaseDirectoryScopedFormatLinter', - 'FbcodeCppLinter' => 'ArcanistLinter', - ), - )); -} else { - phutil_register_library_map(array( - '__library_version__' => 2, - 'class' => - array( - 'ArcanistCpplintLinter' => 'cpp_linter/ArcanistCpplintLinter.php', - 'BaseDirectoryScopedFormatLinter' => 'cpp_linter/BaseDirectoryScopedFormatLinter.php', - 'FacebookArcanistConfiguration' => 'config/FacebookOldArcanistConfiguration.php', - 'FacebookFbcodeLintEngine' => 'lint_engine/FacebookFbcodeLintEngine.php', - 'FacebookFbcodeUnitTestEngine' => 
'unit_engine/FacebookOldFbcodeUnitTestEngine.php', - 'FacebookHowtoevenLintEngine' => 'lint_engine/FacebookHowtoevenLintEngine.php', - 'FacebookHowtoevenLinter' => 'cpp_linter/FacebookHowtoevenLinter.php', - 'FbcodeClangFormatLinter' => 'cpp_linter/FbcodeClangFormatLinter.php', - 'FbcodeCppLinter' => 'cpp_linter/FbcodeCppLinter.php', - ), - 'function' => - array( - ), - 'xmap' => - array( - 'ArcanistCpplintLinter' => 'ArcanistLinter', - 'BaseDirectoryScopedFormatLinter' => 'ArcanistLinter', - 'FacebookArcanistConfiguration' => 'ArcanistConfiguration', - 'FacebookFbcodeLintEngine' => 'ArcanistLintEngine', - 'FacebookFbcodeUnitTestEngine' => 'ArcanistBaseUnitTestEngine', - 'FacebookHowtoevenLintEngine' => 'ArcanistLintEngine', - 'FacebookHowtoevenLinter' => 'ArcanistLinter', - 'FbcodeClangFormatLinter' => 'BaseDirectoryScopedFormatLinter', - 'FbcodeCppLinter' => 'ArcanistLinter', - ), - )); -} diff --git a/arcanist_util/config/FacebookArcanistConfiguration.php b/arcanist_util/config/FacebookArcanistConfiguration.php deleted file mode 100644 index 3d06fc5b5..000000000 --- a/arcanist_util/config/FacebookArcanistConfiguration.php +++ /dev/null @@ -1,43 +0,0 @@ - array('help' => 'Just to make tools happy')); - } - return array(); - } - - public function didRunWorkflow($command, - ArcanistWorkflow $workflow, - $error_code) { - // Default options don't terminate on failure, but that's what we want. In - // the current case we use assertions intentionally as "terminate on failure - // invariants". - assert_options(ASSERT_BAIL, true); - - assert($workflow); - assert(strlen($command) > 0); - - if ($command == DIFF_COMMAND && !$workflow->isRawDiffSource()) { - $diffID = $workflow->getDiffId(); - - // When submitting a diff this code path gets executed multiple times in - // a row. We only care about the case when ID for the diff is provided - // because that's what we need to apply the diff and trigger the tests. 
- if (strlen($diffID) > 0) { - assert(is_numeric($diffID)); - startTestsInSandcastle(true /* $applyDiff */, $workflow, $diffID); - } - } - } -} diff --git a/arcanist_util/config/FacebookOldArcanistConfiguration.php b/arcanist_util/config/FacebookOldArcanistConfiguration.php deleted file mode 100644 index 93515cc13..000000000 --- a/arcanist_util/config/FacebookOldArcanistConfiguration.php +++ /dev/null @@ -1,43 +0,0 @@ - array('help' => 'Just to make tools happy')); - } - return array(); - } - - public function didRunWorkflow($command, - ArcanistBaseWorkflow $workflow, - $error_code) { - // Default options don't terminate on failure, but that's what we want. In - // the current case we use assertions intentionally as "terminate on failure - // invariants". - assert_options(ASSERT_BAIL, true); - - assert($workflow); - assert(strlen($command) > 0); - - if ($command == DIFF_COMMAND && !$workflow->isRawDiffSource()) { - $diffID = $workflow->getDiffId(); - - // When submitting a diff this code path gets executed multiple times in - // a row. We only care about the case when ID for the diff is provided - // because that's what we need to apply the diff and trigger the tests. - if (strlen($diffID) > 0) { - assert(is_numeric($diffID)); - startTestsInSandcastle(true /* $applyDiff */, $workflow, $diffID); - } - } - } -} diff --git a/arcanist_util/cpp_linter/ArcanistCpplintLinter.php b/arcanist_util/cpp_linter/ArcanistCpplintLinter.php deleted file mode 100644 index b9c413755..000000000 --- a/arcanist_util/cpp_linter/ArcanistCpplintLinter.php +++ /dev/null @@ -1,88 +0,0 @@ -linterDir(), $bin); - if (!$err) { - return $this->linterDir().'/'.$bin; - } - - // Look for globally installed cpplint.py - list($err) = exec_manual('which %s', $bin); - if ($err) { - throw new ArcanistUsageException( - "cpplint.py does not appear to be installed on this system. Install ". - "it (e.g., with 'wget \"http://google-styleguide.googlecode.com/". - "svn/trunk/cpplint/cpplint.py\"') ". 
- "in your .arcconfig to point to the directory where it resides. ". - "Also don't forget to chmod a+x cpplint.py!"); - } - - return $bin; - } - - public function lintPath($path) { - $bin = $this->getLintPath(); - $path = $this->rocksdbDir().'/'.$path; - - $f = new ExecFuture("%C $path", $bin); - - list($err, $stdout, $stderr) = $f->resolve(); - - if ($err === 2) { - throw new Exception("cpplint failed to run correctly:\n".$stderr); - } - - $lines = explode("\n", $stderr); - $messages = array(); - foreach ($lines as $line) { - $line = trim($line); - $matches = null; - $regex = '/^[^:]+:(\d+):\s*(.*)\s*\[(.*)\] \[(\d+)\]$/'; - if (!preg_match($regex, $line, $matches)) { - continue; - } - foreach ($matches as $key => $match) { - $matches[$key] = trim($match); - } - $message = new ArcanistLintMessage(); - $message->setPath($path); - $message->setLine($matches[1]); - $message->setCode($matches[3]); - $message->setName($matches[3]); - $message->setDescription($matches[2]); - $message->setSeverity(ArcanistLintSeverity::SEVERITY_WARNING); - $this->addLintMessage($message); - } - } - - // The path of this linter - private function linterDir() { - return dirname(__FILE__); - } - - // TODO(kaili) a quick and dirty way to figure out rocksdb's root dir. 
- private function rocksdbDir() { - return $this->linterDir()."/../.."; - } -} diff --git a/arcanist_util/cpp_linter/BaseDirectoryScopedFormatLinter.php b/arcanist_util/cpp_linter/BaseDirectoryScopedFormatLinter.php deleted file mode 100644 index 4a7b307dc..000000000 --- a/arcanist_util/cpp_linter/BaseDirectoryScopedFormatLinter.php +++ /dev/null @@ -1,74 +0,0 @@ -getPathsToLint() as $p) { - // check if $path starts with $p - if (strncmp($path, $p, strlen($p)) === 0) { - return true; - } - } - return false; - } - - // API to tell this linter which lines were changed - final public function setPathChangedLines($path, $changed) { - $this->changedLines[$path] = $changed; - } - - final public function willLintPaths(array $paths) { - $futures = array(); - foreach ($paths as $path) { - if (!$this->shouldLintPath($path)) { - continue; - } - - $changed = $this->changedLines[$path]; - if (!isset($changed)) { - // do not run linter if there are no changes - continue; - } - - $futures[$path] = $this->getFormatFuture($path, $changed); - } - - foreach (id(new FutureIterator($futures))->limit(8) as $p => $f) { - $this->rawLintOutput[$p] = $f->resolvex(); - } - } - - abstract protected function getFormatFuture($path, array $changed); - abstract protected function getLintMessage($diff); - - final public function lintPath($path) { - if (!isset($this->rawLintOutput[$path])) { - return; - } - - list($new_content) = $this->rawLintOutput[$path]; - $old_content = $this->getData($path); - - if ($new_content != $old_content) { - $diff = ArcanistDiffUtils::renderDifferences($old_content, $new_content); - $this->raiseLintAtOffset( - 0, - self::LINT_FORMATTING, - $this->getLintMessage($diff), - $old_content, - $new_content); - } - } - -} diff --git a/arcanist_util/cpp_linter/FacebookHowtoevenLinter.php b/arcanist_util/cpp_linter/FacebookHowtoevenLinter.php deleted file mode 100644 index 6edb114b6..000000000 --- a/arcanist_util/cpp_linter/FacebookHowtoevenLinter.php +++ /dev/null @@ -1,223 
+0,0 @@ -localExecx("rm -rf _build/_lint"); - - // Build compilation database. - $lintable_paths = $this->getLintablePaths($paths); - $interesting_paths = $this->getInterestingPaths($lintable_paths); - - if (!$lintable_paths) { - return; - } - - // Run lint. - try { - $this->localExecx( - "%C %C -p _build/dev/ %Ls", - $this->getBinaryPath(), - $this->getFilteredIssues(), - $lintable_paths); - } catch (CommandException $exception) { - PhutilConsole::getConsole()->writeErr($exception->getMessage()); - } - - // Load results. - $result = id( - new SQLite3( - $this->getProjectRoot().'/_build/_lint/lint.db', - SQLITE3_OPEN_READONLY)) - ->query("SELECT * FROM raised_issues"); - - while ($issue = $result->fetchArray(SQLITE3_ASSOC)) { - // Skip issues not part of the linted file. - if (in_array($issue['file'], $interesting_paths)) { - $this->addLintMessage(id(new ArcanistLintMessage()) - ->setPath($issue['file']) - ->setLine($issue['line']) - ->setChar($issue['column']) - ->setCode('Howtoeven') - ->setSeverity($this->getSeverity($issue['severity'])) - ->setName('Hte-'.$issue['name']) - ->setDescription( - sprintf( - "%s\n\n%s", - ($issue['message']) ? $issue['message'] : $issue['description'], - $issue['explanation'])) - ->setOriginalText(idx($issue, 'original', '')) - ->setReplacementText(idx($issue, 'replacement', ''))); - } - } - } - - public function lintPath($path) { - } - - /** - * Get the paths that we know how to lint. - * - * The strategy is to first look whether there's an existing compilation - * database and use that if it's exhaustive. We generate our own only if - * necessary. - */ - private function getLintablePaths($paths) { - // Replace headers with existing sources. - for ($i = 0; $i < count($paths); $i++) { - if (preg_match("/\.h$/", $paths[$i])) { - $header = preg_replace("/\.h$/", ".cpp", $paths[$i]); - if (file_exists($header)) { - $paths[$i] = $header; - } - } - } - - // Check if database exists and is exhaustive. 
- $available_paths = $this->getAvailablePaths(); - $lintable_paths = array_intersect($paths, $available_paths); - if ($paths === $lintable_paths) { - return $lintable_paths; - } - - // Generate our own database. - $targets = $this->getTargetsFor($paths); - if (!$targets) { - PhutilConsole::getConsole()->writeErr( - "No build targets found for %s\n", - implode(', ', $paths)); - return array(); - } - - $this->localExecx("./tools/build/bin/fbconfig.par -r %Ls", $targets); - $this->localExecx("./tools/build/bin/fbmake.par gen_cdb"); - - $available_paths = $this->getAvailablePaths(); - $lintable_paths = array_intersect($paths, $available_paths); - if ($paths != $lintable_paths) { - PhutilConsole::getConsole()->writeErr( - "Can't lint %s\n", - implode(', ', array_diff($paths, $available_paths))); - } - - // Return what we know how to lint. - return $lintable_paths; - } - - /** - * Get the available paths in the current compilation database. - */ - private function getAvailablePaths() { - $database_path = $this->getProjectRoot() - .'/_build/dev/compile_commands.json'; - if (!file_exists($database_path)) { - return array(); - } - - $entries = json_decode(file_get_contents($database_path), true); - $paths = array(); - foreach ($entries as $entry) { - $paths[] = $entry['file']; - } - return $paths; - } - - /** - * Search for the targets directories for the given files. - */ - private static function getTargetsFor($paths) { - $targets = array(); - foreach ($paths as $path) { - while (($path = dirname($path)) !== '.') { - if (in_array('TARGETS', scandir($path))) { - $contents = file_get_contents($path.'/TARGETS'); - if (strpos($contents, 'cpp_binary') !== false) { - $targets[] = $path; - break; - } - } - } - } - return array_unique($targets); - } - - /** - * The paths that we actually want to report on. 
- */ - private function getInterestingPaths($paths) { - $headers = array(); - foreach ($paths as $path) { - $headers[] = preg_replace("/\.cpp$/", ".h", $path); - } - return array_merge($paths, $headers); - } - - /** - * The path where the binary is located. Will return the current dewey binary - * unless the `HOWTOEVEN_BUILD` environment variable is set. - */ - private function getBinaryPath() { - $path = sprintf( - "/mnt/dewey/fbcode/.commits/%s/builds/howtoeven/client", - self::VERSION); - - $build = getenv('HOWTOEVEN_BUILD'); - if ($build) { - $path = sprintf( - "./_build/%s/tools/howtoeven/client", - $build); - if (!file_exists($path)) { - PhutilConsole::getConsole()->writeErr(">> %s does not exist\n", $path); - exit(1); - } - } - - return $path; - } - - /** - * Execute the command in the root directory. - */ - private function localExecx($command /* , ... */) { - $arguments = func_get_args(); - return newv('ExecFuture', $arguments) - ->setCWD($this->getProjectRoot()) - ->resolvex(); - } - - /** - * The root of the project. - */ - private function getProjectRoot() { - return $this->getEngine()->getWorkingCopy()->getProjectRoot(); - } - - private function getFilteredIssues() { - $issues = getenv('HOWTOEVEN_ISSUES'); - return ($issues) ? 
csprintf('-issues %s', $issues) : ''; - } - -} diff --git a/arcanist_util/cpp_linter/FbcodeClangFormatLinter.php b/arcanist_util/cpp_linter/FbcodeClangFormatLinter.php deleted file mode 100644 index a94a0bed1..000000000 --- a/arcanist_util/cpp_linter/FbcodeClangFormatLinter.php +++ /dev/null @@ -1,58 +0,0 @@ - ArcanistLintSeverity::SEVERITY_ADVICE, - ); - } - - public function getLintNameMap() { - return array( - self::LINT_FORMATTING => pht('Changes are not clang-formatted'), - ); - } - - protected function getFormatFuture($path, array $changed) { - $args = ""; - foreach ($changed as $key => $value) { - $args .= " --lines=$key:$key"; - } - - $binary = self::CLANG_FORMAT_BINARY; - if (!file_exists($binary)) { - // trust the $PATH - $binary = "clang-format"; - } - - return new ExecFuture( - "%s %s $args", - $binary, - $this->getEngine()->getFilePathOnDisk($path)); - } - - protected function getLintMessage($diff) { - $link_to_clang_format = - "[[ http://fburl.com/clang-format | clang-format ]]"; - return <<getEngine()->getFilePathOnDisk($p); - $lpath_file = file($lpath); - if (preg_match('/\.(c)$/', $lpath) || - preg_match('/-\*-.*Mode: C[; ].*-\*-/', $lpath_file[0]) || - preg_match('/vim(:.*)*:\s*(set\s+)?filetype=c\s*:/', $lpath_file[0]) - ) { - $futures[$p] = new ExecFuture("%s %s %s 2>&1", - self::FLINT, self::C_FLAG, - $this->getEngine()->getFilePathOnDisk($p)); - } else { - $futures[$p] = new ExecFuture("%s %s 2>&1", - self::FLINT, $this->getEngine()->getFilePathOnDisk($p)); - } - } - - foreach (Futures($futures)->limit(8) as $p => $f) { - $this->rawLintOutput[$p] = $f->resolvex(); - } - - return; - } - - public function getLinterName() { - return "FBCPP"; - } - - public function lintPath($path) { - $this->runCppLint($path); - } - - private function runCppLint($path) { - $msgs = $this->getCppLintOutput($path); - foreach ($msgs as $m) { - $this->raiseLintAtLine($m['line'], 0, $m['severity'], $m['msg']); - } - } - - private function adviseOnEachPattern( - $path, 
- $regex, - $message, - $lint_type = self::LINT_ADVICE, - $match_idx = 0) { - $file_data = $this->getData($path); - $matches = array(); - if (!preg_match_all($regex, $file_data, $matches, PREG_OFFSET_CAPTURE)) { - return; - } - - foreach ($matches[$match_idx] as $match) { - list($match_str, $offset) = $match; - $this->raiseLintAtOffset($offset, $lint_type, $message, $match_str); - } - } - - public function getLintSeverityMap() { - return array( - self::LINT_WARNING => ArcanistLintSeverity::SEVERITY_WARNING, - self::LINT_ADVICE => ArcanistLintSeverity::SEVERITY_ADVICE, - self::LINT_ERROR => ArcanistLintSeverity::SEVERITY_ERROR - ); - } - - public function getLintNameMap() { - return array( - self::LINT_ADVICE => "CppLint Advice", - self::LINT_WARNING => "CppLint Warning", - self::LINT_ERROR => "CppLint Error" - ); - } - - private function getCppLintOutput($path) { - if (!array_key_exists($path, $this->rawLintOutput)) { - return array(); - } - list($output) = $this->rawLintOutput[$path]; - - $msgs = array(); - $current = null; - $matches = array(); - foreach (explode("\n", $output) as $line) { - if (preg_match('/.*?:(\d+):(.*)/', $line, $matches)) { - if ($current) { - $msgs[] = $current; - } - $line = $matches[1]; - $text = $matches[2]; - if (preg_match('/.*Warning.*/', $text)) { - $sev = self::LINT_WARNING; - } else if (preg_match('/.*Advice.*/', $text)) { - $sev = self::LINT_ADVICE; - } else { - $sev = self::LINT_ERROR; - } - $current = array('line' => $line, - 'msg' => $text, - 'severity' => $sev); - } else if ($current) { - $current['msg'] .= ' ' . $line; - } - } - if ($current) { - $msgs[] = $current; - } - - return $msgs; - } -} diff --git a/arcanist_util/cpp_linter/cpplint.py b/arcanist_util/cpp_linter/cpplint.py deleted file mode 100755 index 3d0c45a6d..000000000 --- a/arcanist_util/cpp_linter/cpplint.py +++ /dev/null @@ -1,4767 +0,0 @@ -#!/usr/bin/python -# Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
-# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. An additional grant -# of patent rights can be found in the PATENTS file in the same directory. -# Copyright (c) 2011 The LevelDB Authors. All rights reserved. -# Use of this source code is governed by a BSD-style license that can be -# found in the LICENSE file. See the AUTHORS file for names of contributors. -# -# Copyright (c) 2009 Google Inc. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above -# copyright notice, this list of conditions and the following disclaimer -# in the documentation and/or other materials provided with the -# distribution. -# * Neither the name of Google Inc. nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -"""Does google-lint on c++ files. - -The goal of this script is to identify places in the code that *may* -be in non-compliance with google style. It does not attempt to fix -up these problems -- the point is to educate. It does also not -attempt to find all problems, or to ensure that everything it does -find is legitimately a problem. - -In particular, we can get very confused by /* and // inside strings! -We do a small hack, which is to ignore //'s with "'s after them on the -same line, but it is far from perfect (in either direction). -""" - -import codecs -import copy -import getopt -import math # for log -import os -import re -import sre_compile -import string -import sys -import unicodedata - - -_USAGE = """ -Syntax: cpplint.py [--verbose=#] [--output=vs7] [--filter=-x,+y,...] - [--counting=total|toplevel|detailed] [--root=subdir] - [--linelength=digits] - [file] ... - - The style guidelines this tries to follow are those in - http://google-styleguide.googlecode.com/svn/trunk/cppguide.xml - - Every problem is given a confidence score from 1-5, with 5 meaning we are - certain of the problem, and 1 meaning it could be a legitimate construct. - This will miss some errors, and is not a substitute for a code review. - - To suppress false-positive errors of a certain category, add a - 'NOLINT(category)' comment to the line. NOLINT or NOLINT(*) - suppresses errors of all categories on that line. - - The files passed in will be linted; at least one file must be provided. - Default linted extensions are .cc, .cpp, .cu, .cuh and .h. Change the - extensions with the --extensions flag. - - Flags: - - output=vs7 - By default, the output is formatted to ease emacs parsing. Visual Studio - compatible output (vs7) may also be used. Other formats are unsupported. - - verbose=# - Specify a number 0-5 to restrict errors to certain verbosity levels. - - filter=-x,+y,... 
- Specify a comma-separated list of category-filters to apply: only - error messages whose category names pass the filters will be printed. - (Category names are printed with the message and look like - "[whitespace/indent]".) Filters are evaluated left to right. - "-FOO" and "FOO" means "do not print categories that start with FOO". - "+FOO" means "do print categories that start with FOO". - - Examples: --filter=-whitespace,+whitespace/braces - --filter=whitespace,runtime/printf,+runtime/printf_format - --filter=-,+build/include_what_you_use - - To see a list of all the categories used in cpplint, pass no arg: - --filter= - - counting=total|toplevel|detailed - The total number of errors found is always printed. If - 'toplevel' is provided, then the count of errors in each of - the top-level categories like 'build' and 'whitespace' will - also be printed. If 'detailed' is provided, then a count - is provided for each category like 'build/class'. - - root=subdir - The root directory used for deriving header guard CPP variable. - By default, the header guard CPP variable is calculated as the relative - path to the directory that contains .git, .hg, or .svn. When this flag - is specified, the relative path is calculated from the specified - directory. If the specified directory does not exist, this flag is - ignored. - - Examples: - Assuing that src/.git exists, the header guard CPP variables for - src/chrome/browser/ui/browser.h are: - - No flag => CHROME_BROWSER_UI_BROWSER_H_ - --root=chrome => BROWSER_UI_BROWSER_H_ - --root=chrome/browser => UI_BROWSER_H_ - - linelength=digits - This is the allowed line length for the project. The default value is - 80 characters. - - Examples: - --linelength=120 - - extensions=extension,extension,... - The allowed file extensions that cpplint will check - - Examples: - --extensions=hpp,cpp -""" - -# We categorize each error message we print. Here are the categories. 
-# We want an explicit list so we can list them all in cpplint --filter=. -# If you add a new error message with a new category, add it to the list -# here! cpplint_unittest.py should tell you if you forget to do this. -_ERROR_CATEGORIES = [ - 'build/class', - 'build/deprecated', - 'build/endif_comment', - 'build/explicit_make_pair', - 'build/forward_decl', - 'build/header_guard', - 'build/include', - 'build/include_alpha', - 'build/include_order', - 'build/include_what_you_use', - 'build/namespaces', - 'build/printf_format', - 'build/storage_class', - 'legal/copyright', - 'readability/alt_tokens', - 'readability/braces', - 'readability/casting', - 'readability/check', - 'readability/constructors', - 'readability/fn_size', - 'readability/function', - 'readability/multiline_comment', - 'readability/multiline_string', - 'readability/namespace', - 'readability/nolint', - 'readability/nul', - 'readability/streams', - 'readability/todo', - 'readability/utf8', - 'runtime/arrays', - 'runtime/casting', - 'runtime/explicit', - 'runtime/int', - 'runtime/init', - 'runtime/invalid_increment', - 'runtime/member_string_references', - 'runtime/memset', - 'runtime/operator', - 'runtime/printf', - 'runtime/printf_format', - 'runtime/references', - 'runtime/string', - 'runtime/threadsafe_fn', - 'runtime/vlog', - 'whitespace/blank_line', - 'whitespace/braces', - 'whitespace/comma', - 'whitespace/comments', - 'whitespace/empty_conditional_body', - 'whitespace/empty_loop_body', - 'whitespace/end_of_line', - 'whitespace/ending_newline', - 'whitespace/forcolon', - 'whitespace/indent', - 'whitespace/line_length', - 'whitespace/newline', - 'whitespace/operators', - 'whitespace/parens', - 'whitespace/semicolon', - 'whitespace/tab', - 'whitespace/todo' - ] - -# The default state of the category filter. This is overrided by the --filter= -# flag. 
By default all errors are on, so only add here categories that should be -# off by default (i.e., categories that must be enabled by the --filter= flags). -# All entries here should start with a '-' or '+', as in the --filter= flag. -_DEFAULT_FILTERS = [] - -# We used to check for high-bit characters, but after much discussion we -# decided those were OK, as long as they were in UTF-8 and didn't represent -# hard-coded international strings, which belong in a separate i18n file. - - -# C++ headers -_CPP_HEADERS = frozenset([ - # Legacy - 'algobase.h', - 'algo.h', - 'alloc.h', - 'builtinbuf.h', - 'bvector.h', - 'complex.h', - 'defalloc.h', - 'deque.h', - 'editbuf.h', - 'fstream.h', - 'function.h', - 'hash_map', - 'hash_map.h', - 'hash_set', - 'hash_set.h', - 'hashtable.h', - 'heap.h', - 'indstream.h', - 'iomanip.h', - 'iostream.h', - 'istream.h', - 'iterator.h', - 'list.h', - 'map.h', - 'multimap.h', - 'multiset.h', - 'ostream.h', - 'pair.h', - 'parsestream.h', - 'pfstream.h', - 'procbuf.h', - 'pthread_alloc', - 'pthread_alloc.h', - 'rope', - 'rope.h', - 'ropeimpl.h', - 'set.h', - 'slist', - 'slist.h', - 'stack.h', - 'stdiostream.h', - 'stl_alloc.h', - 'stl_relops.h', - 'streambuf.h', - 'stream.h', - 'strfile.h', - 'strstream.h', - 'tempbuf.h', - 'tree.h', - 'type_traits.h', - 'vector.h', - # 17.6.1.2 C++ library headers - 'algorithm', - 'array', - 'atomic', - 'bitset', - 'chrono', - 'codecvt', - 'complex', - 'condition_variable', - 'deque', - 'exception', - 'forward_list', - 'fstream', - 'functional', - 'future', - 'initializer_list', - 'iomanip', - 'ios', - 'iosfwd', - 'iostream', - 'istream', - 'iterator', - 'limits', - 'list', - 'locale', - 'map', - 'memory', - 'mutex', - 'new', - 'numeric', - 'ostream', - 'queue', - 'random', - 'ratio', - 'regex', - 'set', - 'sstream', - 'stack', - 'stdexcept', - 'streambuf', - 'string', - 'strstream', - 'system_error', - 'thread', - 'tuple', - 'typeindex', - 'typeinfo', - 'type_traits', - 'unordered_map', - 'unordered_set', - 
'utility', - 'valarray', - 'vector', - # 17.6.1.2 C++ headers for C library facilities - 'cassert', - 'ccomplex', - 'cctype', - 'cerrno', - 'cfenv', - 'cfloat', - 'cinttypes', - 'ciso646', - 'climits', - 'clocale', - 'cmath', - 'csetjmp', - 'csignal', - 'cstdalign', - 'cstdarg', - 'cstdbool', - 'cstddef', - 'cstdint', - 'cstdio', - 'cstdlib', - 'cstring', - 'ctgmath', - 'ctime', - 'cuchar', - 'cwchar', - 'cwctype', - ]) - -# Assertion macros. These are defined in base/logging.h and -# testing/base/gunit.h. Note that the _M versions need to come first -# for substring matching to work. -_CHECK_MACROS = [ - 'DCHECK', 'CHECK', - 'EXPECT_TRUE_M', 'EXPECT_TRUE', - 'ASSERT_TRUE_M', 'ASSERT_TRUE', - 'EXPECT_FALSE_M', 'EXPECT_FALSE', - 'ASSERT_FALSE_M', 'ASSERT_FALSE', - ] - -# Replacement macros for CHECK/DCHECK/EXPECT_TRUE/EXPECT_FALSE -_CHECK_REPLACEMENT = dict([(m, {}) for m in _CHECK_MACROS]) - -for op, replacement in [('==', 'EQ'), ('!=', 'NE'), - ('>=', 'GE'), ('>', 'GT'), - ('<=', 'LE'), ('<', 'LT')]: - _CHECK_REPLACEMENT['DCHECK'][op] = 'DCHECK_%s' % replacement - _CHECK_REPLACEMENT['CHECK'][op] = 'CHECK_%s' % replacement - _CHECK_REPLACEMENT['EXPECT_TRUE'][op] = 'EXPECT_%s' % replacement - _CHECK_REPLACEMENT['ASSERT_TRUE'][op] = 'ASSERT_%s' % replacement - _CHECK_REPLACEMENT['EXPECT_TRUE_M'][op] = 'EXPECT_%s_M' % replacement - _CHECK_REPLACEMENT['ASSERT_TRUE_M'][op] = 'ASSERT_%s_M' % replacement - -for op, inv_replacement in [('==', 'NE'), ('!=', 'EQ'), - ('>=', 'LT'), ('>', 'LE'), - ('<=', 'GT'), ('<', 'GE')]: - _CHECK_REPLACEMENT['EXPECT_FALSE'][op] = 'EXPECT_%s' % inv_replacement - _CHECK_REPLACEMENT['ASSERT_FALSE'][op] = 'ASSERT_%s' % inv_replacement - _CHECK_REPLACEMENT['EXPECT_FALSE_M'][op] = 'EXPECT_%s_M' % inv_replacement - _CHECK_REPLACEMENT['ASSERT_FALSE_M'][op] = 'ASSERT_%s_M' % inv_replacement - -# Alternative tokens and their replacements. For full list, see section 2.5 -# Alternative tokens [lex.digraph] in the C++ standard. 
-# -# Digraphs (such as '%:') are not included here since it's a mess to -# match those on a word boundary. -_ALT_TOKEN_REPLACEMENT = { - 'and': '&&', - 'bitor': '|', - 'or': '||', - 'xor': '^', - 'compl': '~', - 'bitand': '&', - 'and_eq': '&=', - 'or_eq': '|=', - 'xor_eq': '^=', - 'not': '!', - 'not_eq': '!=' - } - -# Compile regular expression that matches all the above keywords. The "[ =()]" -# bit is meant to avoid matching these keywords outside of boolean expressions. -# -# False positives include C-style multi-line comments and multi-line strings -# but those have always been troublesome for cpplint. -_ALT_TOKEN_REPLACEMENT_PATTERN = re.compile( - r'[ =()](' + ('|'.join(_ALT_TOKEN_REPLACEMENT.keys())) + r')(?=[ (]|$)') - - -# These constants define types of headers for use with -# _IncludeState.CheckNextIncludeOrder(). -_C_SYS_HEADER = 1 -_CPP_SYS_HEADER = 2 -_LIKELY_MY_HEADER = 3 -_POSSIBLE_MY_HEADER = 4 -_OTHER_HEADER = 5 - -# These constants define the current inline assembly state -_NO_ASM = 0 # Outside of inline assembly block -_INSIDE_ASM = 1 # Inside inline assembly block -_END_ASM = 2 # Last line of inline assembly block -_BLOCK_ASM = 3 # The whole block is an inline assembly block - -# Match start of assembly blocks -_MATCH_ASM = re.compile(r'^\s*(?:asm|_asm|__asm|__asm__)' - r'(?:\s+(volatile|__volatile__))?' - r'\s*[{(]') - - -_regexp_compile_cache = {} - -# Finds occurrences of NOLINT or NOLINT(...). -_RE_SUPPRESSION = re.compile(r'\bNOLINT\b(\([^)]*\))?') - -# {str, set(int)}: a map from error categories to sets of linenumbers -# on which those errors are expected and should be suppressed. -_error_suppressions = {} - -# The root directory used for deriving header guard CPP variable. -# This is set by --root flag. -_root = None - -# The allowed line length of files. -# This is set by --linelength flag. -_line_length = 80 - -# The allowed extensions for file names -# This is set by --extensions flag. 
-_valid_extensions = set(['cc', 'h', 'cpp', 'cu', 'cuh']) - -def ParseNolintSuppressions(filename, raw_line, linenum, error): - """Updates the global list of error-suppressions. - - Parses any NOLINT comments on the current line, updating the global - error_suppressions store. Reports an error if the NOLINT comment - was malformed. - - Args: - filename: str, the name of the input file. - raw_line: str, the line of input text, with comments. - linenum: int, the number of the current line. - error: function, an error handler. - """ - # FIXME(adonovan): "NOLINT(" is misparsed as NOLINT(*). - matched = _RE_SUPPRESSION.search(raw_line) - if matched: - category = matched.group(1) - if category in (None, '(*)'): # => "suppress all" - _error_suppressions.setdefault(None, set()).add(linenum) - else: - if category.startswith('(') and category.endswith(')'): - category = category[1:-1] - if category in _ERROR_CATEGORIES: - _error_suppressions.setdefault(category, set()).add(linenum) - else: - error(filename, linenum, 'readability/nolint', 5, - 'Unknown NOLINT error category: %s' % category) - - -def ResetNolintSuppressions(): - "Resets the set of NOLINT suppressions to empty." - _error_suppressions.clear() - - -def IsErrorSuppressedByNolint(category, linenum): - """Returns true if the specified error category is suppressed on this line. - - Consults the global error_suppressions map populated by - ParseNolintSuppressions/ResetNolintSuppressions. - - Args: - category: str, the category of the error. - linenum: int, the current line number. - Returns: - bool, True iff the error should be suppressed due to a NOLINT comment. 
- """ - return (linenum in _error_suppressions.get(category, set()) or - linenum in _error_suppressions.get(None, set())) - -def Match(pattern, s): - """Matches the string with the pattern, caching the compiled regexp.""" - # The regexp compilation caching is inlined in both Match and Search for - # performance reasons; factoring it out into a separate function turns out - # to be noticeably expensive. - if pattern not in _regexp_compile_cache: - _regexp_compile_cache[pattern] = sre_compile.compile(pattern) - return _regexp_compile_cache[pattern].match(s) - - -def ReplaceAll(pattern, rep, s): - """Replaces instances of pattern in a string with a replacement. - - The compiled regex is kept in a cache shared by Match and Search. - - Args: - pattern: regex pattern - rep: replacement text - s: search string - - Returns: - string with replacements made (or original string if no replacements) - """ - if pattern not in _regexp_compile_cache: - _regexp_compile_cache[pattern] = sre_compile.compile(pattern) - return _regexp_compile_cache[pattern].sub(rep, s) - - -def Search(pattern, s): - """Searches the string for the pattern, caching the compiled regexp.""" - if pattern not in _regexp_compile_cache: - _regexp_compile_cache[pattern] = sre_compile.compile(pattern) - return _regexp_compile_cache[pattern].search(s) - - -class _IncludeState(dict): - """Tracks line numbers for includes, and the order in which includes appear. - - As a dict, an _IncludeState object serves as a mapping between include - filename and line number on which that file was included. - - Call CheckNextIncludeOrder() once for each header in the file, passing - in the type constants defined above. Calls in an illegal order will - raise an _IncludeError with an appropriate error message. - - """ - # self._section will move monotonically through this set. If it ever - # needs to move backwards, CheckNextIncludeOrder will raise an error. 
- _INITIAL_SECTION = 0 - _MY_H_SECTION = 1 - _C_SECTION = 2 - _CPP_SECTION = 3 - _OTHER_H_SECTION = 4 - - _TYPE_NAMES = { - _C_SYS_HEADER: 'C system header', - _CPP_SYS_HEADER: 'C++ system header', - _LIKELY_MY_HEADER: 'header this file implements', - _POSSIBLE_MY_HEADER: 'header this file may implement', - _OTHER_HEADER: 'other header', - } - _SECTION_NAMES = { - _INITIAL_SECTION: "... nothing. (This can't be an error.)", - _MY_H_SECTION: 'a header this file implements', - _C_SECTION: 'C system header', - _CPP_SECTION: 'C++ system header', - _OTHER_H_SECTION: 'other header', - } - - def __init__(self): - dict.__init__(self) - self.ResetSection() - - def ResetSection(self): - # The name of the current section. - self._section = self._INITIAL_SECTION - # The path of last found header. - self._last_header = '' - - def SetLastHeader(self, header_path): - self._last_header = header_path - - def CanonicalizeAlphabeticalOrder(self, header_path): - """Returns a path canonicalized for alphabetical comparison. - - - replaces "-" with "_" so they both cmp the same. - - removes '-inl' since we don't require them to be after the main header. - - lowercase everything, just in case. - - Args: - header_path: Path to be canonicalized. - - Returns: - Canonicalized path. - """ - return header_path.replace('-inl.h', '.h').replace('-', '_').lower() - - def IsInAlphabeticalOrder(self, clean_lines, linenum, header_path): - """Check if a header is in alphabetical order with the previous header. - - Args: - clean_lines: A CleansedLines instance containing the file. - linenum: The number of the line to check. - header_path: Canonicalized header to be checked. - - Returns: - Returns true if the header is in alphabetical order. - """ - # If previous section is different from current section, _last_header will - # be reset to empty string, so it's always less than current header. - # - # If previous line was a blank line, assume that the headers are - # intentionally sorted the way they are. 
- if (self._last_header > header_path and - not Match(r'^\s*$', clean_lines.elided[linenum - 1])): - return False - return True - - def CheckNextIncludeOrder(self, header_type): - """Returns a non-empty error message if the next header is out of order. - - This function also updates the internal state to be ready to check - the next include. - - Args: - header_type: One of the _XXX_HEADER constants defined above. - - Returns: - The empty string if the header is in the right order, or an - error message describing what's wrong. - - """ - error_message = ('Found %s after %s' % - (self._TYPE_NAMES[header_type], - self._SECTION_NAMES[self._section])) - - last_section = self._section - - if header_type == _C_SYS_HEADER: - if self._section <= self._C_SECTION: - self._section = self._C_SECTION - else: - self._last_header = '' - return error_message - elif header_type == _CPP_SYS_HEADER: - if self._section <= self._CPP_SECTION: - self._section = self._CPP_SECTION - else: - self._last_header = '' - return error_message - elif header_type == _LIKELY_MY_HEADER: - if self._section <= self._MY_H_SECTION: - self._section = self._MY_H_SECTION - else: - self._section = self._OTHER_H_SECTION - elif header_type == _POSSIBLE_MY_HEADER: - if self._section <= self._MY_H_SECTION: - self._section = self._MY_H_SECTION - else: - # This will always be the fallback because we're not sure - # enough that the header is associated with this file. - self._section = self._OTHER_H_SECTION - else: - assert header_type == _OTHER_HEADER - self._section = self._OTHER_H_SECTION - - if last_section != self._section: - self._last_header = '' - - return '' - - -class _CppLintState(object): - """Maintains module-wide state..""" - - def __init__(self): - self.verbose_level = 1 # global setting. - self.error_count = 0 # global count of reported errors - # filters to apply when emitting error messages - self.filters = _DEFAULT_FILTERS[:] - self.counting = 'total' # In what way are we counting errors? 
- self.errors_by_category = {} # string to int dict storing error counts - - # output format: - # "emacs" - format that emacs can parse (default) - # "vs7" - format that Microsoft Visual Studio 7 can parse - self.output_format = 'emacs' - - def SetOutputFormat(self, output_format): - """Sets the output format for errors.""" - self.output_format = output_format - - def SetVerboseLevel(self, level): - """Sets the module's verbosity, and returns the previous setting.""" - last_verbose_level = self.verbose_level - self.verbose_level = level - return last_verbose_level - - def SetCountingStyle(self, counting_style): - """Sets the module's counting options.""" - self.counting = counting_style - - def SetFilters(self, filters): - """Sets the error-message filters. - - These filters are applied when deciding whether to emit a given - error message. - - Args: - filters: A string of comma-separated filters (eg "+whitespace/indent"). - Each filter should start with + or -; else we die. - - Raises: - ValueError: The comma-separated filters did not all start with '+' or '-'. - E.g. "-,+whitespace,-whitespace/indent,whitespace/badfilter" - """ - # Default filters always have less priority than the flag ones. 
- self.filters = _DEFAULT_FILTERS[:] - for filt in filters.split(','): - clean_filt = filt.strip() - if clean_filt: - self.filters.append(clean_filt) - for filt in self.filters: - if not (filt.startswith('+') or filt.startswith('-')): - raise ValueError('Every filter in --filters must start with + or -' - ' (%s does not)' % filt) - - def ResetErrorCounts(self): - """Sets the module's error statistic back to zero.""" - self.error_count = 0 - self.errors_by_category = {} - - def IncrementErrorCount(self, category): - """Bumps the module's error statistic.""" - self.error_count += 1 - if self.counting in ('toplevel', 'detailed'): - if self.counting != 'detailed': - category = category.split('/')[0] - if category not in self.errors_by_category: - self.errors_by_category[category] = 0 - self.errors_by_category[category] += 1 - - def PrintErrorCounts(self): - """Print a summary of errors by category, and the total.""" - for category, count in self.errors_by_category.iteritems(): - sys.stderr.write('Category \'%s\' errors found: %d\n' % - (category, count)) - sys.stderr.write('Total errors found: %d\n' % self.error_count) - -_cpplint_state = _CppLintState() - - -def _OutputFormat(): - """Gets the module's output format.""" - return _cpplint_state.output_format - - -def _SetOutputFormat(output_format): - """Sets the module's output format.""" - _cpplint_state.SetOutputFormat(output_format) - - -def _VerboseLevel(): - """Returns the module's verbosity setting.""" - return _cpplint_state.verbose_level - - -def _SetVerboseLevel(level): - """Sets the module's verbosity, and returns the previous setting.""" - return _cpplint_state.SetVerboseLevel(level) - - -def _SetCountingStyle(level): - """Sets the module's counting options.""" - _cpplint_state.SetCountingStyle(level) - - -def _Filters(): - """Returns the module's list of output filters, as a list.""" - return _cpplint_state.filters - - -def _SetFilters(filters): - """Sets the module's error-message filters. 
- - These filters are applied when deciding whether to emit a given - error message. - - Args: - filters: A string of comma-separated filters (eg "whitespace/indent"). - Each filter should start with + or -; else we die. - """ - _cpplint_state.SetFilters(filters) - - -class _FunctionState(object): - """Tracks current function name and the number of lines in its body.""" - - _NORMAL_TRIGGER = 250 # for --v=0, 500 for --v=1, etc. - _TEST_TRIGGER = 400 # about 50% more than _NORMAL_TRIGGER. - - def __init__(self): - self.in_a_function = False - self.lines_in_function = 0 - self.current_function = '' - - def Begin(self, function_name): - """Start analyzing function body. - - Args: - function_name: The name of the function being tracked. - """ - self.in_a_function = True - self.lines_in_function = 0 - self.current_function = function_name - - def Count(self): - """Count line in current function body.""" - if self.in_a_function: - self.lines_in_function += 1 - - def Check(self, error, filename, linenum): - """Report if too many lines in function body. - - Args: - error: The function to call with any errors found. - filename: The name of the current file. - linenum: The number of the line to check. - """ - if Match(r'T(EST|est)', self.current_function): - base_trigger = self._TEST_TRIGGER - else: - base_trigger = self._NORMAL_TRIGGER - trigger = base_trigger * 2**_VerboseLevel() - - if self.lines_in_function > trigger: - error_level = int(math.log(self.lines_in_function / base_trigger, 2)) - # 50 => 0, 100 => 1, 200 => 2, 400 => 3, 800 => 4, 1600 => 5, ... - if error_level > 5: - error_level = 5 - error(filename, linenum, 'readability/fn_size', error_level, - 'Small and focused functions are preferred:' - ' %s has %d non-comment lines' - ' (error triggered by exceeding %d lines).' 
% ( - self.current_function, self.lines_in_function, trigger)) - - def End(self): - """Stop analyzing function body.""" - self.in_a_function = False - - -class _IncludeError(Exception): - """Indicates a problem with the include order in a file.""" - pass - - -class FileInfo: - """Provides utility functions for filenames. - - FileInfo provides easy access to the components of a file's path - relative to the project root. - """ - - def __init__(self, filename): - self._filename = filename - - def FullName(self): - """Make Windows paths like Unix.""" - return os.path.abspath(self._filename).replace('\\', '/') - - def RepositoryName(self): - """FullName after removing the local path to the repository. - - If we have a real absolute path name here we can try to do something smart: - detecting the root of the checkout and truncating /path/to/checkout from - the name so that we get header guards that don't include things like - "C:\Documents and Settings\..." or "/home/username/..." in them and thus - people on different computers who have checked the source out to different - locations won't see bogus errors. - """ - fullname = self.FullName() - - if os.path.exists(fullname): - project_dir = os.path.dirname(fullname) - - if os.path.exists(os.path.join(project_dir, ".svn")): - # If there's a .svn file in the current directory, we recursively look - # up the directory tree for the top of the SVN checkout - root_dir = project_dir - one_up_dir = os.path.dirname(root_dir) - while os.path.exists(os.path.join(one_up_dir, ".svn")): - root_dir = os.path.dirname(root_dir) - one_up_dir = os.path.dirname(one_up_dir) - - prefix = os.path.commonprefix([root_dir, project_dir]) - return fullname[len(prefix) + 1:] - - # Not SVN <= 1.6? Try to find a git, hg, or svn top level directory by - # searching up from the current path. 
- root_dir = os.path.dirname(fullname) - while (root_dir != os.path.dirname(root_dir) and - not os.path.exists(os.path.join(root_dir, ".git")) and - not os.path.exists(os.path.join(root_dir, ".hg")) and - not os.path.exists(os.path.join(root_dir, ".svn"))): - root_dir = os.path.dirname(root_dir) - - if (os.path.exists(os.path.join(root_dir, ".git")) or - os.path.exists(os.path.join(root_dir, ".hg")) or - os.path.exists(os.path.join(root_dir, ".svn"))): - prefix = os.path.commonprefix([root_dir, project_dir]) - return fullname[len(prefix) + 1:] - - # Don't know what to do; header guard warnings may be wrong... - return fullname - - def Split(self): - """Splits the file into the directory, basename, and extension. - - For 'chrome/browser/browser.cc', Split() would - return ('chrome/browser', 'browser', '.cc') - - Returns: - A tuple of (directory, basename, extension). - """ - - googlename = self.RepositoryName() - project, rest = os.path.split(googlename) - return (project,) + os.path.splitext(rest) - - def BaseName(self): - """File base name - text after the final slash, before the final period.""" - return self.Split()[1] - - def Extension(self): - """File extension - text following the final period.""" - return self.Split()[2] - - def NoExtension(self): - """File has no source file extension.""" - return '/'.join(self.Split()[0:2]) - - def IsSource(self): - """File has a source file extension.""" - return self.Extension()[1:] in ('c', 'cc', 'cpp', 'cxx') - - -def _ShouldPrintError(category, confidence, linenum): - """If confidence >= verbose, category passes filter and is not suppressed.""" - - # There are three ways we might decide not to print an error message: - # a "NOLINT(category)" comment appears in the source, - # the verbosity level isn't high enough, or the filters filter it out. 
- if IsErrorSuppressedByNolint(category, linenum): - return False - if confidence < _cpplint_state.verbose_level: - return False - - is_filtered = False - for one_filter in _Filters(): - if one_filter.startswith('-'): - if category.startswith(one_filter[1:]): - is_filtered = True - elif one_filter.startswith('+'): - if category.startswith(one_filter[1:]): - is_filtered = False - else: - assert False # should have been checked for in SetFilter. - if is_filtered: - return False - - return True - - -def Error(filename, linenum, category, confidence, message): - """Logs the fact we've found a lint error. - - We log where the error was found, and also our confidence in the error, - that is, how certain we are this is a legitimate style regression, and - not a misidentification or a use that's sometimes justified. - - False positives can be suppressed by the use of - "cpplint(category)" comments on the offending line. These are - parsed into _error_suppressions. - - Args: - filename: The name of the file containing the error. - linenum: The number of the line containing the error. - category: A string used to describe the "category" this bug - falls under: "whitespace", say, or "runtime". Categories - may have a hierarchy separated by slashes: "whitespace/indent". - confidence: A number from 1-5 representing a confidence score for - the error, with 5 meaning that we are certain of the problem, - and 1 meaning that it could be a legitimate construct. - message: The error message. 
- """ - if _ShouldPrintError(category, confidence, linenum): - _cpplint_state.IncrementErrorCount(category) - if _cpplint_state.output_format == 'vs7': - sys.stderr.write('%s(%s): %s [%s] [%d]\n' % ( - filename, linenum, message, category, confidence)) - elif _cpplint_state.output_format == 'eclipse': - sys.stderr.write('%s:%s: warning: %s [%s] [%d]\n' % ( - filename, linenum, message, category, confidence)) - else: - sys.stderr.write('%s:%s: %s [%s] [%d]\n' % ( - filename, linenum, message, category, confidence)) - - -# Matches standard C++ escape sequences per 2.13.2.3 of the C++ standard. -_RE_PATTERN_CLEANSE_LINE_ESCAPES = re.compile( - r'\\([abfnrtv?"\\\']|\d+|x[0-9a-fA-F]+)') -# Matches strings. Escape codes should already be removed by ESCAPES. -_RE_PATTERN_CLEANSE_LINE_DOUBLE_QUOTES = re.compile(r'"[^"]*"') -# Matches characters. Escape codes should already be removed by ESCAPES. -_RE_PATTERN_CLEANSE_LINE_SINGLE_QUOTES = re.compile(r"'.'") -# Matches multi-line C++ comments. -# This RE is a little bit more complicated than one might expect, because we -# have to take care of space removals tools so we can handle comments inside -# statements better. -# The current rule is: We only clear spaces from both sides when we're at the -# end of the line. Otherwise, we try to remove spaces from the right side, -# if this doesn't work we try on left side but only if there's a non-character -# on the right. -_RE_PATTERN_CLEANSE_LINE_C_COMMENTS = re.compile( - r"""(\s*/\*.*\*/\s*$| - /\*.*\*/\s+| - \s+/\*.*\*/(?=\W)| - /\*.*\*/)""", re.VERBOSE) - - -def IsCppString(line): - """Does line terminate so, that the next symbol is in string constant. - - This function does not consider single-line nor multi-line comments. - - Args: - line: is a partial line of code starting from the 0..n. - - Returns: - True, if next character appended to 'line' is inside a - string constant. 
- """ - - line = line.replace(r'\\', 'XX') # after this, \\" does not match to \" - return ((line.count('"') - line.count(r'\"') - line.count("'\"'")) & 1) == 1 - - -def CleanseRawStrings(raw_lines): - """Removes C++11 raw strings from lines. - - Before: - static const char kData[] = R"( - multi-line string - )"; - - After: - static const char kData[] = "" - (replaced by blank line) - ""; - - Args: - raw_lines: list of raw lines. - - Returns: - list of lines with C++11 raw strings replaced by empty strings. - """ - - delimiter = None - lines_without_raw_strings = [] - for line in raw_lines: - if delimiter: - # Inside a raw string, look for the end - end = line.find(delimiter) - if end >= 0: - # Found the end of the string, match leading space for this - # line and resume copying the original lines, and also insert - # a "" on the last line. - leading_space = Match(r'^(\s*)\S', line) - line = leading_space.group(1) + '""' + line[end + len(delimiter):] - delimiter = None - else: - # Haven't found the end yet, append a blank line. - line = '' - - else: - # Look for beginning of a raw string. - # See 2.14.15 [lex.string] for syntax. - matched = Match(r'^(.*)\b(?:R|u8R|uR|UR|LR)"([^\s\\()]*)\((.*)$', line) - if matched: - delimiter = ')' + matched.group(2) + '"' - - end = matched.group(3).find(delimiter) - if end >= 0: - # Raw string ended on same line - line = (matched.group(1) + '""' + - matched.group(3)[end + len(delimiter):]) - delimiter = None - else: - # Start of a multi-line raw string - line = matched.group(1) + '""' - - lines_without_raw_strings.append(line) - - # TODO(unknown): if delimiter is not None here, we might want to - # emit a warning for unterminated string. 
- return lines_without_raw_strings - - -def FindNextMultiLineCommentStart(lines, lineix): - """Find the beginning marker for a multiline comment.""" - while lineix < len(lines): - if lines[lineix].strip().startswith('/*'): - # Only return this marker if the comment goes beyond this line - if lines[lineix].strip().find('*/', 2) < 0: - return lineix - lineix += 1 - return len(lines) - - -def FindNextMultiLineCommentEnd(lines, lineix): - """We are inside a comment, find the end marker.""" - while lineix < len(lines): - if lines[lineix].strip().endswith('*/'): - return lineix - lineix += 1 - return len(lines) - - -def RemoveMultiLineCommentsFromRange(lines, begin, end): - """Clears a range of lines for multi-line comments.""" - # Having // dummy comments makes the lines non-empty, so we will not get - # unnecessary blank line warnings later in the code. - for i in range(begin, end): - lines[i] = '// dummy' - - -def RemoveMultiLineComments(filename, lines, error): - """Removes multiline (c-style) comments from lines.""" - lineix = 0 - while lineix < len(lines): - lineix_begin = FindNextMultiLineCommentStart(lines, lineix) - if lineix_begin >= len(lines): - return - lineix_end = FindNextMultiLineCommentEnd(lines, lineix_begin) - if lineix_end >= len(lines): - error(filename, lineix_begin + 1, 'readability/multiline_comment', 5, - 'Could not find end of multi-line comment') - return - RemoveMultiLineCommentsFromRange(lines, lineix_begin, lineix_end + 1) - lineix = lineix_end + 1 - - -def CleanseComments(line): - """Removes //-comments and single-line C-style /* */ comments. - - Args: - line: A line of C++ source. - - Returns: - The line with single-line comments removed. - """ - commentpos = line.find('//') - if commentpos != -1 and not IsCppString(line[:commentpos]): - line = line[:commentpos].rstrip() - # get rid of /* ... 
*/ - return _RE_PATTERN_CLEANSE_LINE_C_COMMENTS.sub('', line) - - -class CleansedLines(object): - """Holds 3 copies of all lines with different preprocessing applied to them. - - 1) elided member contains lines without strings and comments, - 2) lines member contains lines without comments, and - 3) raw_lines member contains all the lines without processing. - All these three members are of , and of the same length. - """ - - def __init__(self, lines): - self.elided = [] - self.lines = [] - self.raw_lines = lines - self.num_lines = len(lines) - self.lines_without_raw_strings = CleanseRawStrings(lines) - for linenum in range(len(self.lines_without_raw_strings)): - self.lines.append(CleanseComments( - self.lines_without_raw_strings[linenum])) - elided = self._CollapseStrings(self.lines_without_raw_strings[linenum]) - self.elided.append(CleanseComments(elided)) - - def NumLines(self): - """Returns the number of lines represented.""" - return self.num_lines - - @staticmethod - def _CollapseStrings(elided): - """Collapses strings and chars on a line to simple "" or '' blocks. - - We nix strings first so we're not fooled by text like '"http://"' - - Args: - elided: The line being processed. - - Returns: - The line with collapsed strings. - """ - if not _RE_PATTERN_INCLUDE.match(elided): - # Remove escaped characters first to make quote/single quote collapsing - # basic. Things that look like escaped characters shouldn't occur - # outside of strings and chars. - elided = _RE_PATTERN_CLEANSE_LINE_ESCAPES.sub('', elided) - elided = _RE_PATTERN_CLEANSE_LINE_SINGLE_QUOTES.sub("''", elided) - elided = _RE_PATTERN_CLEANSE_LINE_DOUBLE_QUOTES.sub('""', elided) - return elided - - -def FindEndOfExpressionInLine(line, startpos, depth, startchar, endchar): - """Find the position just after the matching endchar. - - Args: - line: a CleansedLines line. - startpos: start searching at this position. - depth: nesting level at startpos. - startchar: expression opening character. 
- endchar: expression closing character. - - Returns: - On finding matching endchar: (index just after matching endchar, 0) - Otherwise: (-1, new depth at end of this line) - """ - for i in xrange(startpos, len(line)): - if line[i] == startchar: - depth += 1 - elif line[i] == endchar: - depth -= 1 - if depth == 0: - return (i + 1, 0) - return (-1, depth) - - -def CloseExpression(clean_lines, linenum, pos): - """If input points to ( or { or [ or <, finds the position that closes it. - - If lines[linenum][pos] points to a '(' or '{' or '[' or '<', finds the - linenum/pos that correspond to the closing of the expression. - - Args: - clean_lines: A CleansedLines instance containing the file. - linenum: The number of the line to check. - pos: A position on the line. - - Returns: - A tuple (line, linenum, pos) pointer *past* the closing brace, or - (line, len(lines), -1) if we never find a close. Note we ignore - strings and comments when matching; and the line we return is the - 'cleansed' line at linenum. - """ - - line = clean_lines.elided[linenum] - startchar = line[pos] - if startchar not in '({[<': - return (line, clean_lines.NumLines(), -1) - if startchar == '(': endchar = ')' - if startchar == '[': endchar = ']' - if startchar == '{': endchar = '}' - if startchar == '<': endchar = '>' - - # Check first line - (end_pos, num_open) = FindEndOfExpressionInLine( - line, pos, 0, startchar, endchar) - if end_pos > -1: - return (line, linenum, end_pos) - - # Continue scanning forward - while linenum < clean_lines.NumLines() - 1: - linenum += 1 - line = clean_lines.elided[linenum] - (end_pos, num_open) = FindEndOfExpressionInLine( - line, 0, num_open, startchar, endchar) - if end_pos > -1: - return (line, linenum, end_pos) - - # Did not find endchar before end of file, give up - return (line, clean_lines.NumLines(), -1) - - -def FindStartOfExpressionInLine(line, endpos, depth, startchar, endchar): - """Find position at the matching startchar. 
- - This is almost the reverse of FindEndOfExpressionInLine, but note - that the input position and returned position differs by 1. - - Args: - line: a CleansedLines line. - endpos: start searching at this position. - depth: nesting level at endpos. - startchar: expression opening character. - endchar: expression closing character. - - Returns: - On finding matching startchar: (index at matching startchar, 0) - Otherwise: (-1, new depth at beginning of this line) - """ - for i in xrange(endpos, -1, -1): - if line[i] == endchar: - depth += 1 - elif line[i] == startchar: - depth -= 1 - if depth == 0: - return (i, 0) - return (-1, depth) - - -def ReverseCloseExpression(clean_lines, linenum, pos): - """If input points to ) or } or ] or >, finds the position that opens it. - - If lines[linenum][pos] points to a ')' or '}' or ']' or '>', finds the - linenum/pos that correspond to the opening of the expression. - - Args: - clean_lines: A CleansedLines instance containing the file. - linenum: The number of the line to check. - pos: A position on the line. - - Returns: - A tuple (line, linenum, pos) pointer *at* the opening brace, or - (line, 0, -1) if we never find the matching opening brace. Note - we ignore strings and comments when matching; and the line we - return is the 'cleansed' line at linenum. 
- """ - line = clean_lines.elided[linenum] - endchar = line[pos] - if endchar not in ')}]>': - return (line, 0, -1) - if endchar == ')': startchar = '(' - if endchar == ']': startchar = '[' - if endchar == '}': startchar = '{' - if endchar == '>': startchar = '<' - - # Check last line - (start_pos, num_open) = FindStartOfExpressionInLine( - line, pos, 0, startchar, endchar) - if start_pos > -1: - return (line, linenum, start_pos) - - # Continue scanning backward - while linenum > 0: - linenum -= 1 - line = clean_lines.elided[linenum] - (start_pos, num_open) = FindStartOfExpressionInLine( - line, len(line) - 1, num_open, startchar, endchar) - if start_pos > -1: - return (line, linenum, start_pos) - - # Did not find startchar before beginning of file, give up - return (line, 0, -1) - - -def CheckForCopyright(filename, lines, error): - """Logs an error if no Copyright message appears at the top of the file.""" - - # We'll say it should occur by line 10. Don't forget there's a - # dummy line at the front. - for line in xrange(1, min(len(lines), 11)): - if re.search(r'Copyright', lines[line], re.I): break - else: # means no copyright line was found - error(filename, 0, 'legal/copyright', 5, - 'No copyright message found. ' - 'You should have a line: "Copyright [year] "') - - -def GetHeaderGuardCPPVariable(filename): - """Returns the CPP variable that should be used as a header guard. - - Args: - filename: The name of a C++ header file. - - Returns: - The CPP variable that should be used as a header guard in the - named file. - - """ - - # Restores original filename in case that cpplint is invoked from Emacs's - # flymake. 
- filename = re.sub(r'_flymake\.h$', '.h', filename) - filename = re.sub(r'/\.flymake/([^/]*)$', r'/\1', filename) - - fileinfo = FileInfo(filename) - file_path_from_root = fileinfo.RepositoryName() - if _root: - file_path_from_root = re.sub('^' + _root + os.sep, '', file_path_from_root) - return re.sub(r'[-./\s]', '_', file_path_from_root).upper() + '_' - - -def CheckForHeaderGuard(filename, lines, error): - """Checks that the file contains a header guard. - - Logs an error if no #ifndef header guard is present. For other - headers, checks that the full pathname is used. - - Args: - filename: The name of the C++ header file. - lines: An array of strings, each representing a line of the file. - error: The function to call with any errors found. - """ - - cppvar = GetHeaderGuardCPPVariable(filename) - - ifndef = None - ifndef_linenum = 0 - define = None - endif = None - endif_linenum = 0 - for linenum, line in enumerate(lines): - # Already been well guarded, no need for further checking. - if line.strip() == "#pragma once": - return - linesplit = line.split() - if len(linesplit) >= 2: - # find the first occurrence of #ifndef and #define, save arg - if not ifndef and linesplit[0] == '#ifndef': - # set ifndef to the header guard presented on the #ifndef line. - ifndef = linesplit[1] - ifndef_linenum = linenum - if not define and linesplit[0] == '#define': - define = linesplit[1] - # find the last occurrence of #endif, save entire line - if line.startswith('#endif'): - endif = line - endif_linenum = linenum - - if not ifndef: - error(filename, 0, 'build/header_guard', 5, - 'No #ifndef header guard found, suggested CPP variable is: %s' % - cppvar) - return - - if not define: - error(filename, 0, 'build/header_guard', 5, - 'No #define header guard found, suggested CPP variable is: %s' % - cppvar) - return - - # The guard should be PATH_FILE_H_, but we also allow PATH_FILE_H__ - # for backward compatibility. 
- if ifndef != cppvar: - error_level = 0 - if ifndef != cppvar + '_': - error_level = 5 - - ParseNolintSuppressions(filename, lines[ifndef_linenum], ifndef_linenum, - error) - error(filename, ifndef_linenum, 'build/header_guard', error_level, - '#ifndef header guard has wrong style, please use: %s' % cppvar) - - if define != ifndef: - error(filename, 0, 'build/header_guard', 5, - '#ifndef and #define don\'t match, suggested CPP variable is: %s' % - cppvar) - return - - if endif != ('#endif // %s' % cppvar): - error_level = 0 - if endif != ('#endif // %s' % (cppvar + '_')): - error_level = 5 - - ParseNolintSuppressions(filename, lines[endif_linenum], endif_linenum, - error) - error(filename, endif_linenum, 'build/header_guard', error_level, - '#endif line should be "#endif // %s"' % cppvar) - - -def CheckForBadCharacters(filename, lines, error): - """Logs an error for each line containing bad characters. - - Two kinds of bad characters: - - 1. Unicode replacement characters: These indicate that either the file - contained invalid UTF-8 (likely) or Unicode replacement characters (which - it shouldn't). Note that it's possible for this to throw off line - numbering if the invalid UTF-8 occurred adjacent to a newline. - - 2. NUL bytes. These are problematic for some tools. - - Args: - filename: The name of the current file. - lines: An array of strings, each representing a line of the file. - error: The function to call with any errors found. - """ - for linenum, line in enumerate(lines): - if u'\ufffd' in line: - error(filename, linenum, 'readability/utf8', 5, - 'Line contains invalid UTF-8 (or Unicode replacement character).') - if '\0' in line: - error(filename, linenum, 'readability/nul', 5, 'Line contains NUL byte.') - - -def CheckForNewlineAtEOF(filename, lines, error): - """Logs an error if there is no newline char at the end of the file. - - Args: - filename: The name of the current file. - lines: An array of strings, each representing a line of the file. 
- error: The function to call with any errors found. - """ - - # The array lines() was created by adding two newlines to the - # original file (go figure), then splitting on \n. - # To verify that the file ends in \n, we just have to make sure the - # last-but-two element of lines() exists and is empty. - if len(lines) < 3 or lines[-2]: - error(filename, len(lines) - 2, 'whitespace/ending_newline', 5, - 'Could not find a newline character at the end of the file.') - - -def CheckForMultilineCommentsAndStrings(filename, clean_lines, linenum, error): - """Logs an error if we see /* ... */ or "..." that extend past one line. - - /* ... */ comments are legit inside macros, for one line. - Otherwise, we prefer // comments, so it's ok to warn about the - other. Likewise, it's ok for strings to extend across multiple - lines, as long as a line continuation character (backslash) - terminates each line. Although not currently prohibited by the C++ - style guide, it's ugly and unnecessary. We don't do well with either - in this lint program, so we warn about both. - - Args: - filename: The name of the current file. - clean_lines: A CleansedLines instance containing the file. - linenum: The number of the line to check. - error: The function to call with any errors found. - """ - line = clean_lines.elided[linenum] - - # Remove all \\ (escaped backslashes) from the line. They are OK, and the - # second (escaped) slash may trigger later \" detection erroneously. - line = line.replace('\\\\', '') - - if line.count('/*') > line.count('*/'): - error(filename, linenum, 'readability/multiline_comment', 5, - 'Complex multi-line /*...*/-style comment found. ' - 'Lint may give bogus warnings. ' - 'Consider replacing these with //-style comments, ' - 'with #if 0...#endif, ' - 'or with more clearly structured multi-line comments.') - - if (line.count('"') - line.count('\\"')) % 2: - error(filename, linenum, 'readability/multiline_string', 5, - 'Multi-line string ("...") found. 
This lint script doesn\'t ' - 'do well with such strings, and may give bogus warnings. ' - 'Use C++11 raw strings or concatenation instead.') - - -threading_list = ( - ('asctime(', 'asctime_r('), - ('ctime(', 'ctime_r('), - ('getgrgid(', 'getgrgid_r('), - ('getgrnam(', 'getgrnam_r('), - ('getlogin(', 'getlogin_r('), - ('getpwnam(', 'getpwnam_r('), - ('getpwuid(', 'getpwuid_r('), - ('gmtime(', 'gmtime_r('), - ('localtime(', 'localtime_r('), - ('rand(', 'rand_r('), - ('strtok(', 'strtok_r('), - ('ttyname(', 'ttyname_r('), - ) - - -def CheckPosixThreading(filename, clean_lines, linenum, error): - """Checks for calls to thread-unsafe functions. - - Much code has been originally written without consideration of - multi-threading. Also, engineers are relying on their old experience; - they have learned posix before threading extensions were added. These - tests guide the engineers to use thread-safe functions (when using - posix directly). - - Args: - filename: The name of the current file. - clean_lines: A CleansedLines instance containing the file. - linenum: The number of the line to check. - error: The function to call with any errors found. - """ - line = clean_lines.elided[linenum] - for single_thread_function, multithread_safe_function in threading_list: - ix = line.find(single_thread_function) - # Comparisons made explicit for clarity -- pylint: disable=g-explicit-bool-comparison - if ix >= 0 and (ix == 0 or (not line[ix - 1].isalnum() and - line[ix - 1] not in ('_', '.', '>'))): - error(filename, linenum, 'runtime/threadsafe_fn', 2, - 'Consider using ' + multithread_safe_function + - '...) instead of ' + single_thread_function + - '...) for improved thread safety.') - - -def CheckVlogArguments(filename, clean_lines, linenum, error): - """Checks that VLOG() is only used for defining a logging level. - - For example, VLOG(2) is correct. VLOG(INFO), VLOG(WARNING), VLOG(ERROR), and - VLOG(FATAL) are not. - - Args: - filename: The name of the current file. 
- clean_lines: A CleansedLines instance containing the file. - linenum: The number of the line to check. - error: The function to call with any errors found. - """ - line = clean_lines.elided[linenum] - if Search(r'\bVLOG\((INFO|ERROR|WARNING|DFATAL|FATAL)\)', line): - error(filename, linenum, 'runtime/vlog', 5, - 'VLOG() should be used with numeric verbosity level. ' - 'Use LOG() if you want symbolic severity levels.') - - -# Matches invalid increment: *count++, which moves pointer instead of -# incrementing a value. -_RE_PATTERN_INVALID_INCREMENT = re.compile( - r'^\s*\*\w+(\+\+|--);') - - -def CheckInvalidIncrement(filename, clean_lines, linenum, error): - """Checks for invalid increment *count++. - - For example following function: - void increment_counter(int* count) { - *count++; - } - is invalid, because it effectively does count++, moving pointer, and should - be replaced with ++*count, (*count)++ or *count += 1. - - Args: - filename: The name of the current file. - clean_lines: A CleansedLines instance containing the file. - linenum: The number of the line to check. - error: The function to call with any errors found. - """ - line = clean_lines.elided[linenum] - if _RE_PATTERN_INVALID_INCREMENT.match(line): - error(filename, linenum, 'runtime/invalid_increment', 5, - 'Changing pointer instead of value (or unused value of operator*).') - - -class _BlockInfo(object): - """Stores information about a generic block of code.""" - - def __init__(self, seen_open_brace): - self.seen_open_brace = seen_open_brace - self.open_parentheses = 0 - self.inline_asm = _NO_ASM - - def CheckBegin(self, filename, clean_lines, linenum, error): - """Run checks that applies to text up to the opening brace. - - This is mostly for checking the text after the class identifier - and the "{", usually where the base class is specified. For other - blocks, there isn't much to check, so we always pass. - - Args: - filename: The name of the current file. 
- clean_lines: A CleansedLines instance containing the file. - linenum: The number of the line to check. - error: The function to call with any errors found. - """ - pass - - def CheckEnd(self, filename, clean_lines, linenum, error): - """Run checks that applies to text after the closing brace. - - This is mostly used for checking end of namespace comments. - - Args: - filename: The name of the current file. - clean_lines: A CleansedLines instance containing the file. - linenum: The number of the line to check. - error: The function to call with any errors found. - """ - pass - - -class _ClassInfo(_BlockInfo): - """Stores information about a class.""" - - def __init__(self, name, class_or_struct, clean_lines, linenum): - _BlockInfo.__init__(self, False) - self.name = name - self.starting_linenum = linenum - self.is_derived = False - if class_or_struct == 'struct': - self.access = 'public' - self.is_struct = True - else: - self.access = 'private' - self.is_struct = False - - # Remember initial indentation level for this class. Using raw_lines here - # instead of elided to account for leading comments. - initial_indent = Match(r'^( *)\S', clean_lines.raw_lines[linenum]) - if initial_indent: - self.class_indent = len(initial_indent.group(1)) - else: - self.class_indent = 0 - - # Try to find the end of the class. This will be confused by things like: - # class A { - # } *x = { ... - # - # But it's still good enough for CheckSectionSpacing. - self.last_line = 0 - depth = 0 - for i in range(linenum, clean_lines.NumLines()): - line = clean_lines.elided[i] - depth += line.count('{') - line.count('}') - if not depth: - self.last_line = i - break - - def CheckBegin(self, filename, clean_lines, linenum, error): - # Look for a bare ':' - if Search('(^|[^:]):($|[^:])', clean_lines.elided[linenum]): - self.is_derived = True - - def CheckEnd(self, filename, clean_lines, linenum, error): - # Check that closing brace is aligned with beginning of the class. 
- # Only do this if the closing brace is indented by only whitespaces. - # This means we will not check single-line class definitions. - indent = Match(r'^( *)\}', clean_lines.elided[linenum]) - if indent and len(indent.group(1)) != self.class_indent: - if self.is_struct: - parent = 'struct ' + self.name - else: - parent = 'class ' + self.name - error(filename, linenum, 'whitespace/indent', 3, - 'Closing brace should be aligned with beginning of %s' % parent) - - -class _NamespaceInfo(_BlockInfo): - """Stores information about a namespace.""" - - def __init__(self, name, linenum): - _BlockInfo.__init__(self, False) - self.name = name or '' - self.starting_linenum = linenum - - def CheckEnd(self, filename, clean_lines, linenum, error): - """Check end of namespace comments.""" - line = clean_lines.raw_lines[linenum] - - # Check how many lines is enclosed in this namespace. Don't issue - # warning for missing namespace comments if there aren't enough - # lines. However, do apply checks if there is already an end of - # namespace comment and it's incorrect. - # - # TODO(unknown): We always want to check end of namespace comments - # if a namespace is large, but sometimes we also want to apply the - # check if a short namespace contained nontrivial things (something - # other than forward declarations). There is currently no logic on - # deciding what these nontrivial things are, so this check is - # triggered by namespace size only, which works most of the time. - if (linenum - self.starting_linenum < 10 - and not Match(r'};*\s*(//|/\*).*\bnamespace\b', line)): - return - - # Look for matching comment at end of namespace. - # - # Note that we accept C style "/* */" comments for terminating - # namespaces, so that code that terminate namespaces inside - # preprocessor macros can be cpplint clean. - # - # We also accept stuff like "// end of namespace ." with the - # period at the end. 
- # - # Besides these, we don't accept anything else, otherwise we might - # get false negatives when existing comment is a substring of the - # expected namespace. - if self.name: - # Named namespace - if not Match((r'};*\s*(//|/\*).*\bnamespace\s+' + re.escape(self.name) + - r'[\*/\.\\\s]*$'), - line): - error(filename, linenum, 'readability/namespace', 5, - 'Namespace should be terminated with "// namespace %s"' % - self.name) - else: - # Anonymous namespace - if not Match(r'};*\s*(//|/\*).*\bnamespace[\*/\.\\\s]*$', line): - error(filename, linenum, 'readability/namespace', 5, - 'Namespace should be terminated with "// namespace"') - - -class _PreprocessorInfo(object): - """Stores checkpoints of nesting stacks when #if/#else is seen.""" - - def __init__(self, stack_before_if): - # The entire nesting stack before #if - self.stack_before_if = stack_before_if - - # The entire nesting stack up to #else - self.stack_before_else = [] - - # Whether we have already seen #else or #elif - self.seen_else = False - - -class _NestingState(object): - """Holds states related to parsing braces.""" - - def __init__(self): - # Stack for tracking all braces. An object is pushed whenever we - # see a "{", and popped when we see a "}". Only 3 types of - # objects are possible: - # - _ClassInfo: a class or struct. - # - _NamespaceInfo: a namespace. - # - _BlockInfo: some other type of block. - self.stack = [] - - # Stack of _PreprocessorInfo objects. - self.pp_stack = [] - - def SeenOpenBrace(self): - """Check if we have seen the opening brace for the innermost block. - - Returns: - True if we have seen the opening brace, False if the innermost - block is still expecting an opening brace. - """ - return (not self.stack) or self.stack[-1].seen_open_brace - - def InNamespaceBody(self): - """Check if we are currently one level inside a namespace body. - - Returns: - True if top of the stack is a namespace block, False otherwise. 
- """ - return self.stack and isinstance(self.stack[-1], _NamespaceInfo) - - def UpdatePreprocessor(self, line): - """Update preprocessor stack. - - We need to handle preprocessors due to classes like this: - #ifdef SWIG - struct ResultDetailsPageElementExtensionPoint { - #else - struct ResultDetailsPageElementExtensionPoint : public Extension { - #endif - - We make the following assumptions (good enough for most files): - - Preprocessor condition evaluates to true from #if up to first - #else/#elif/#endif. - - - Preprocessor condition evaluates to false from #else/#elif up - to #endif. We still perform lint checks on these lines, but - these do not affect nesting stack. - - Args: - line: current line to check. - """ - if Match(r'^\s*#\s*(if|ifdef|ifndef)\b', line): - # Beginning of #if block, save the nesting stack here. The saved - # stack will allow us to restore the parsing state in the #else case. - self.pp_stack.append(_PreprocessorInfo(copy.deepcopy(self.stack))) - elif Match(r'^\s*#\s*(else|elif)\b', line): - # Beginning of #else block - if self.pp_stack: - if not self.pp_stack[-1].seen_else: - # This is the first #else or #elif block. Remember the - # whole nesting stack up to this point. This is what we - # keep after the #endif. - self.pp_stack[-1].seen_else = True - self.pp_stack[-1].stack_before_else = copy.deepcopy(self.stack) - - # Restore the stack to how it was before the #if - self.stack = copy.deepcopy(self.pp_stack[-1].stack_before_if) - else: - # TODO(unknown): unexpected #else, issue warning? - pass - elif Match(r'^\s*#\s*endif\b', line): - # End of #if or #else blocks. - if self.pp_stack: - # If we saw an #else, we will need to restore the nesting - # stack to its former state before the #else, otherwise we - # will just continue from where we left off. - if self.pp_stack[-1].seen_else: - # Here we can just use a shallow copy since we are the last - # reference to it. 
- self.stack = self.pp_stack[-1].stack_before_else - # Drop the corresponding #if - self.pp_stack.pop() - else: - # TODO(unknown): unexpected #endif, issue warning? - pass - - def Update(self, filename, clean_lines, linenum, error): - """Update nesting state with current line. - - Args: - filename: The name of the current file. - clean_lines: A CleansedLines instance containing the file. - linenum: The number of the line to check. - error: The function to call with any errors found. - """ - line = clean_lines.elided[linenum] - - # Update pp_stack first - self.UpdatePreprocessor(line) - - # Count parentheses. This is to avoid adding struct arguments to - # the nesting stack. - if self.stack: - inner_block = self.stack[-1] - depth_change = line.count('(') - line.count(')') - inner_block.open_parentheses += depth_change - - # Also check if we are starting or ending an inline assembly block. - if inner_block.inline_asm in (_NO_ASM, _END_ASM): - if (depth_change != 0 and - inner_block.open_parentheses == 1 and - _MATCH_ASM.match(line)): - # Enter assembly block - inner_block.inline_asm = _INSIDE_ASM - else: - # Not entering assembly block. If previous line was _END_ASM, - # we will now shift to _NO_ASM state. - inner_block.inline_asm = _NO_ASM - elif (inner_block.inline_asm == _INSIDE_ASM and - inner_block.open_parentheses == 0): - # Exit assembly block - inner_block.inline_asm = _END_ASM - - # Consume namespace declaration at the beginning of the line. Do - # this in a loop so that we catch same line declarations like this: - # namespace proto2 { namespace bridge { class MessageSet; } } - while True: - # Match start of namespace. The "\b\s*" below catches namespace - # declarations even if it weren't followed by a whitespace, this - # is so that we don't confuse our namespace checker. The - # missing spaces will be flagged by CheckSpacing. 
- namespace_decl_match = Match(r'^\s*namespace\b\s*([:\w]+)?(.*)$', line) - if not namespace_decl_match: - break - - new_namespace = _NamespaceInfo(namespace_decl_match.group(1), linenum) - self.stack.append(new_namespace) - - line = namespace_decl_match.group(2) - if line.find('{') != -1: - new_namespace.seen_open_brace = True - line = line[line.find('{') + 1:] - - # Look for a class declaration in whatever is left of the line - # after parsing namespaces. The regexp accounts for decorated classes - # such as in: - # class LOCKABLE API Object { - # }; - # - # Templates with class arguments may confuse the parser, for example: - # template , - # class Vector = vector > - # class HeapQueue { - # - # Because this parser has no nesting state about templates, by the - # time it saw "class Comparator", it may think that it's a new class. - # Nested templates have a similar problem: - # template < - # typename ExportedType, - # typename TupleType, - # template class ImplTemplate> - # - # To avoid these cases, we ignore classes that are followed by '=' or '>' - class_decl_match = Match( - r'\s*(template\s*<[\w\s<>,:]*>\s*)?' - r'(class|struct)\s+([A-Z_]+\s+)*(\w+(?:::\w+)*)' - r'(([^=>]|<[^<>]*>|<[^<>]*<[^<>]*>\s*>)*)$', line) - if (class_decl_match and - (not self.stack or self.stack[-1].open_parentheses == 0)): - self.stack.append(_ClassInfo( - class_decl_match.group(4), class_decl_match.group(2), - clean_lines, linenum)) - line = class_decl_match.group(5) - - # If we have not yet seen the opening brace for the innermost block, - # run checks here. - if not self.SeenOpenBrace(): - self.stack[-1].CheckBegin(filename, clean_lines, linenum, error) - - # Update access control if we are inside a class/struct - if self.stack and isinstance(self.stack[-1], _ClassInfo): - classinfo = self.stack[-1] - access_match = Match( - r'^(.*)\b(public|private|protected|signals)(\s+(?:slots\s*)?)?' 
- r':(?:[^:]|$)', - line) - if access_match: - classinfo.access = access_match.group(2) - - # Check that access keywords are indented +1 space. Skip this - # check if the keywords are not preceded by whitespaces. - indent = access_match.group(1) - if (len(indent) != classinfo.class_indent + 1 and - Match(r'^\s*$', indent)): - if classinfo.is_struct: - parent = 'struct ' + classinfo.name - else: - parent = 'class ' + classinfo.name - slots = '' - if access_match.group(3): - slots = access_match.group(3) - error(filename, linenum, 'whitespace/indent', 3, - '%s%s: should be indented +1 space inside %s' % ( - access_match.group(2), slots, parent)) - - # Consume braces or semicolons from what's left of the line - while True: - # Match first brace, semicolon, or closed parenthesis. - matched = Match(r'^[^{;)}]*([{;)}])(.*)$', line) - if not matched: - break - - token = matched.group(1) - if token == '{': - # If namespace or class hasn't seen a opening brace yet, mark - # namespace/class head as complete. Push a new block onto the - # stack otherwise. - if not self.SeenOpenBrace(): - self.stack[-1].seen_open_brace = True - else: - self.stack.append(_BlockInfo(True)) - if _MATCH_ASM.match(line): - self.stack[-1].inline_asm = _BLOCK_ASM - elif token == ';' or token == ')': - # If we haven't seen an opening brace yet, but we already saw - # a semicolon, this is probably a forward declaration. Pop - # the stack for these. - # - # Similarly, if we haven't seen an opening brace yet, but we - # already saw a closing parenthesis, then these are probably - # function arguments with extra "class" or "struct" keywords. - # Also pop these stack for these. - if not self.SeenOpenBrace(): - self.stack.pop() - else: # token == '}' - # Perform end of block checks and pop the stack. - if self.stack: - self.stack[-1].CheckEnd(filename, clean_lines, linenum, error) - self.stack.pop() - line = matched.group(2) - - def InnermostClass(self): - """Get class info on the top of the stack. 
- - Returns: - A _ClassInfo object if we are inside a class, or None otherwise. - """ - for i in range(len(self.stack), 0, -1): - classinfo = self.stack[i - 1] - if isinstance(classinfo, _ClassInfo): - return classinfo - return None - - def CheckCompletedBlocks(self, filename, error): - """Checks that all classes and namespaces have been completely parsed. - - Call this when all lines in a file have been processed. - Args: - filename: The name of the current file. - error: The function to call with any errors found. - """ - # Note: This test can result in false positives if #ifdef constructs - # get in the way of brace matching. See the testBuildClass test in - # cpplint_unittest.py for an example of this. - for obj in self.stack: - if isinstance(obj, _ClassInfo): - error(filename, obj.starting_linenum, 'build/class', 5, - 'Failed to find complete declaration of class %s' % - obj.name) - elif isinstance(obj, _NamespaceInfo): - error(filename, obj.starting_linenum, 'build/namespaces', 5, - 'Failed to find complete declaration of namespace %s' % - obj.name) - - -def CheckForNonStandardConstructs(filename, clean_lines, linenum, - nesting_state, error): - r"""Logs an error if we see certain non-ANSI constructs ignored by gcc-2. - - Complain about several constructs which gcc-2 accepts, but which are - not standard C++. Warning about these in lint is one way to ease the - transition to new compilers. - - put storage class first (e.g. "static const" instead of "const static"). - - "%lld" instead of %qd" in printf-type functions. - - "%1$d" is non-standard in printf-type functions. - - "\%" is an undefined character escape sequence. - - text after #endif is not allowed. - - invalid inner-style forward declaration. - - >? and ?= and )\?=?\s*(\w+|[+-]?\d+)(\.\d*)?', - line): - error(filename, linenum, 'build/deprecated', 3, - '>? and ))?' 
- # r'\s*const\s*' + type_name + '\s*&\s*\w+\s*;' - error(filename, linenum, 'runtime/member_string_references', 2, - 'const string& members are dangerous. It is much better to use ' - 'alternatives, such as pointers or simple constants.') - - # Everything else in this function operates on class declarations. - # Return early if the top of the nesting stack is not a class, or if - # the class head is not completed yet. - classinfo = nesting_state.InnermostClass() - if not classinfo or not classinfo.seen_open_brace: - return - - # The class may have been declared with namespace or classname qualifiers. - # The constructor and destructor will not have those qualifiers. - base_classname = classinfo.name.split('::')[-1] - - # Look for single-argument constructors that aren't marked explicit. - # Technically a valid construct, but against style. - args = Match(r'\s+(?:inline\s+)?%s\s*\(([^,()]+)\)' - % re.escape(base_classname), - line) - if (args and - args.group(1) != 'void' and - not Match(r'(const\s+)?%s(\s+const)?\s*(?:<\w+>\s*)?&' - % re.escape(base_classname), args.group(1).strip())): - error(filename, linenum, 'runtime/explicit', 5, - 'Single-argument constructors should be marked explicit.') - - -def CheckSpacingForFunctionCall(filename, line, linenum, error): - """Checks for the correctness of various spacing around function calls. - - Args: - filename: The name of the current file. - line: The text of the line to check. - linenum: The number of the line to check. - error: The function to call with any errors found. - """ - - # Since function calls often occur inside if/for/while/switch - # expressions - which have their own, more liberal conventions - we - # first see if we should be looking inside such an expression for a - # function call, to which we can apply more strict standards. 
- fncall = line # if there's no control flow construct, look at whole line - for pattern in (r'\bif\s*\((.*)\)\s*{', - r'\bfor\s*\((.*)\)\s*{', - r'\bwhile\s*\((.*)\)\s*[{;]', - r'\bswitch\s*\((.*)\)\s*{'): - match = Search(pattern, line) - if match: - fncall = match.group(1) # look inside the parens for function calls - break - - # Except in if/for/while/switch, there should never be space - # immediately inside parens (eg "f( 3, 4 )"). We make an exception - # for nested parens ( (a+b) + c ). Likewise, there should never be - # a space before a ( when it's a function argument. I assume it's a - # function argument when the char before the whitespace is legal in - # a function name (alnum + _) and we're not starting a macro. Also ignore - # pointers and references to arrays and functions coz they're too tricky: - # we use a very simple way to recognize these: - # " (something)(maybe-something)" or - # " (something)(maybe-something," or - # " (something)[something]" - # Note that we assume the contents of [] to be short enough that - # they'll never need to wrap. - if ( # Ignore control structures. - not Search(r'\b(if|for|while|switch|return|new|delete|catch|sizeof)\b', - fncall) and - # Ignore pointers/references to functions. - not Search(r' \([^)]+\)\([^)]*(\)|,$)', fncall) and - # Ignore pointers/references to arrays. 
- not Search(r' \([^)]+\)\[[^\]]+\]', fncall)): - if Search(r'\w\s*\(\s(?!\s*\\$)', fncall): # a ( used for a fn call - error(filename, linenum, 'whitespace/parens', 4, - 'Extra space after ( in function call') - elif Search(r'\(\s+(?!(\s*\\)|\()', fncall): - error(filename, linenum, 'whitespace/parens', 2, - 'Extra space after (') - if (Search(r'\w\s+\(', fncall) and - not Search(r'#\s*define|typedef', fncall) and - not Search(r'\w\s+\((\w+::)*\*\w+\)\(', fncall)): - error(filename, linenum, 'whitespace/parens', 4, - 'Extra space before ( in function call') - # If the ) is followed only by a newline or a { + newline, assume it's - # part of a control statement (if/while/etc), and don't complain - if Search(r'[^)]\s+\)\s*[^{\s]', fncall): - # If the closing parenthesis is preceded by only whitespaces, - # try to give a more descriptive error message. - if Search(r'^\s+\)', fncall): - error(filename, linenum, 'whitespace/parens', 2, - 'Closing ) should be moved to the previous line') - else: - error(filename, linenum, 'whitespace/parens', 2, - 'Extra space before )') - - -def IsBlankLine(line): - """Returns true if the given line is blank. - - We consider a line to be blank if the line is empty or consists of - only white spaces. - - Args: - line: A line of a string. - - Returns: - True, if the given line is blank. - """ - return not line or line.isspace() - - -def CheckForFunctionLengths(filename, clean_lines, linenum, - function_state, error): - """Reports for long function bodies. - - For an overview why this is done, see: - http://google-styleguide.googlecode.com/svn/trunk/cppguide.xml#Write_Short_Functions - - Uses a simplistic algorithm assuming other style guidelines - (especially spacing) are followed. - Only checks unindented functions, so class members are unchecked. - Trivial bodies are unchecked, so constructors with huge initializer lists - may be missed. 
- Blank/comment lines are not counted so as to avoid encouraging the removal - of vertical space and comments just to get through a lint check. - NOLINT *on the last line of a function* disables this check. - - Args: - filename: The name of the current file. - clean_lines: A CleansedLines instance containing the file. - linenum: The number of the line to check. - function_state: Current function name and lines in body so far. - error: The function to call with any errors found. - """ - lines = clean_lines.lines - line = lines[linenum] - raw = clean_lines.raw_lines - raw_line = raw[linenum] - joined_line = '' - - starting_func = False - regexp = r'(\w(\w|::|\*|\&|\s)*)\(' # decls * & space::name( ... - match_result = Match(regexp, line) - if match_result: - # If the name is all caps and underscores, figure it's a macro and - # ignore it, unless it's TEST or TEST_F. - function_name = match_result.group(1).split()[-1] - if function_name == 'TEST' or function_name == 'TEST_F' or ( - not Match(r'[A-Z_]+$', function_name)): - starting_func = True - - if starting_func: - body_found = False - for start_linenum in xrange(linenum, clean_lines.NumLines()): - start_line = lines[start_linenum] - joined_line += ' ' + start_line.lstrip() - if Search(r'(;|})', start_line): # Declarations and trivial functions - body_found = True - break # ... ignore - elif Search(r'{', start_line): - body_found = True - function = Search(r'((\w|:)*)\(', line).group(1) - if Match(r'TEST', function): # Handle TEST... macros - parameter_regexp = Search(r'(\(.*\))', joined_line) - if parameter_regexp: # Ignore bad syntax - function += parameter_regexp.group(1) - else: - function += '()' - function_state.Begin(function) - break - if not body_found: - # No body for the function (or evidence of a non-function) was found. 
- error(filename, linenum, 'readability/fn_size', 5, - 'Lint failed to find start of function body.') - elif Match(r'^\}\s*$', line): # function end - function_state.Check(error, filename, linenum) - function_state.End() - elif not Match(r'^\s*$', line): - function_state.Count() # Count non-blank/non-comment lines. - - -_RE_PATTERN_TODO = re.compile(r'^//(\s*)TODO(\(.+?\))?:?(\s|$)?') - - -def CheckComment(comment, filename, linenum, error): - """Checks for common mistakes in TODO comments. - - Args: - comment: The text of the comment from the line in question. - filename: The name of the current file. - linenum: The number of the line to check. - error: The function to call with any errors found. - """ - match = _RE_PATTERN_TODO.match(comment) - if match: - # One whitespace is correct; zero whitespace is handled elsewhere. - leading_whitespace = match.group(1) - if len(leading_whitespace) > 1: - error(filename, linenum, 'whitespace/todo', 2, - 'Too many spaces before TODO') - - username = match.group(2) - if not username: - error(filename, linenum, 'readability/todo', 2, - 'Missing username in TODO; it should look like ' - '"// TODO(my_username): Stuff."') - - middle_whitespace = match.group(3) - # Comparisons made explicit for correctness -- pylint: disable=g-explicit-bool-comparison - if middle_whitespace != ' ' and middle_whitespace != '': - error(filename, linenum, 'whitespace/todo', 2, - 'TODO(my_username) should be followed by a space') - -def CheckAccess(filename, clean_lines, linenum, nesting_state, error): - """Checks for improper use of DISALLOW* macros. - - Args: - filename: The name of the current file. - clean_lines: A CleansedLines instance containing the file. - linenum: The number of the line to check. - nesting_state: A _NestingState instance which maintains information about - the current stack of nested blocks being parsed. - error: The function to call with any errors found. 
- """ - line = clean_lines.elided[linenum] # get rid of comments and strings - - matched = Match((r'\s*(DISALLOW_COPY_AND_ASSIGN|' - r'DISALLOW_EVIL_CONSTRUCTORS|' - r'DISALLOW_IMPLICIT_CONSTRUCTORS)'), line) - if not matched: - return - if nesting_state.stack and isinstance(nesting_state.stack[-1], _ClassInfo): - if nesting_state.stack[-1].access != 'private': - error(filename, linenum, 'readability/constructors', 3, - '%s must be in the private: section' % matched.group(1)) - - else: - # Found DISALLOW* macro outside a class declaration, or perhaps it - # was used inside a function when it should have been part of the - # class declaration. We could issue a warning here, but it - # probably resulted in a compiler error already. - pass - - -def FindNextMatchingAngleBracket(clean_lines, linenum, init_suffix): - """Find the corresponding > to close a template. - - Args: - clean_lines: A CleansedLines instance containing the file. - linenum: Current line number. - init_suffix: Remainder of the current line after the initial <. - - Returns: - True if a matching bracket exists. - """ - line = init_suffix - nesting_stack = ['<'] - while True: - # Find the next operator that can tell us whether < is used as an - # opening bracket or as a less-than operator. We only want to - # warn on the latter case. - # - # We could also check all other operators and terminate the search - # early, e.g. if we got something like this "a(),;\[\]]*([<>(),;\[\]])(.*)$', line) - if match: - # Found an operator, update nesting stack - operator = match.group(1) - line = match.group(2) - - if nesting_stack[-1] == '<': - # Expecting closing angle bracket - if operator in ('<', '(', '['): - nesting_stack.append(operator) - elif operator == '>': - nesting_stack.pop() - if not nesting_stack: - # Found matching angle bracket - return True - elif operator == ',': - # Got a comma after a bracket, this is most likely a template - # argument. 
We have not seen a closing angle bracket yet, but - # it's probably a few lines later if we look for it, so just - # return early here. - return True - else: - # Got some other operator. - return False - - else: - # Expecting closing parenthesis or closing bracket - if operator in ('<', '(', '['): - nesting_stack.append(operator) - elif operator in (')', ']'): - # We don't bother checking for matching () or []. If we got - # something like (] or [), it would have been a syntax error. - nesting_stack.pop() - - else: - # Scan the next line - linenum += 1 - if linenum >= len(clean_lines.elided): - break - line = clean_lines.elided[linenum] - - # Exhausted all remaining lines and still no matching angle bracket. - # Most likely the input was incomplete, otherwise we should have - # seen a semicolon and returned early. - return True - - -def FindPreviousMatchingAngleBracket(clean_lines, linenum, init_prefix): - """Find the corresponding < that started a template. - - Args: - clean_lines: A CleansedLines instance containing the file. - linenum: Current line number. - init_prefix: Part of the current line before the initial >. - - Returns: - True if a matching bracket exists. - """ - line = init_prefix - nesting_stack = ['>'] - while True: - # Find the previous operator - match = Search(r'^(.*)([<>(),;\[\]])[^<>(),;\[\]]*$', line) - if match: - # Found an operator, update nesting stack - operator = match.group(2) - line = match.group(1) - - if nesting_stack[-1] == '>': - # Expecting opening angle bracket - if operator in ('>', ')', ']'): - nesting_stack.append(operator) - elif operator == '<': - nesting_stack.pop() - if not nesting_stack: - # Found matching angle bracket - return True - elif operator == ',': - # Got a comma before a bracket, this is most likely a - # template argument. The opening angle bracket is probably - # there if we look for it, so just return early here. - return True - else: - # Got some other operator. 
- return False - - else: - # Expecting opening parenthesis or opening bracket - if operator in ('>', ')', ']'): - nesting_stack.append(operator) - elif operator in ('(', '['): - nesting_stack.pop() - - else: - # Scan the previous line - linenum -= 1 - if linenum < 0: - break - line = clean_lines.elided[linenum] - - # Exhausted all earlier lines and still no matching angle bracket. - return False - - -def CheckSpacing(filename, clean_lines, linenum, nesting_state, error): - """Checks for the correctness of various spacing issues in the code. - - Things we check for: spaces around operators, spaces after - if/for/while/switch, no spaces around parens in function calls, two - spaces between code and comment, don't start a block with a blank - line, don't end a function with a blank line, don't add a blank line - after public/protected/private, don't have too many blank lines in a row. - - Args: - filename: The name of the current file. - clean_lines: A CleansedLines instance containing the file. - linenum: The number of the line to check. - nesting_state: A _NestingState instance which maintains information about - the current stack of nested blocks being parsed. - error: The function to call with any errors found. - """ - - # Don't use "elided" lines here, otherwise we can't check commented lines. - # Don't want to use "raw" either, because we don't want to check inside C++11 - # raw strings, - raw = clean_lines.lines_without_raw_strings - line = raw[linenum] - - # Before nixing comments, check if the line is blank for no good - # reason. This includes the first line after a block is opened, and - # blank lines at the end of a function (ie, right before a line like '}' - # - # Skip all the blank line checks if we are immediately inside a - # namespace body. In other words, don't issue blank line warnings - # for this block: - # namespace { - # - # } - # - # A warning about missing end of namespace comments will be issued instead. 
- if IsBlankLine(line) and not nesting_state.InNamespaceBody(): - elided = clean_lines.elided - prev_line = elided[linenum - 1] - prevbrace = prev_line.rfind('{') - # TODO(unknown): Don't complain if line before blank line, and line after, - # both start with alnums and are indented the same amount. - # This ignores whitespace at the start of a namespace block - # because those are not usually indented. - if prevbrace != -1 and prev_line[prevbrace:].find('}') == -1: - # OK, we have a blank line at the start of a code block. Before we - # complain, we check if it is an exception to the rule: The previous - # non-empty line has the parameters of a function header that are indented - # 4 spaces (because they did not fit in a 80 column line when placed on - # the same line as the function name). We also check for the case where - # the previous line is indented 6 spaces, which may happen when the - # initializers of a constructor do not fit into a 80 column line. - exception = False - if Match(r' {6}\w', prev_line): # Initializer list? - # We are looking for the opening column of initializer list, which - # should be indented 4 spaces to cause 6 space indentation afterwards. - search_position = linenum-2 - while (search_position >= 0 - and Match(r' {6}\w', elided[search_position])): - search_position -= 1 - exception = (search_position >= 0 - and elided[search_position][:5] == ' :') - else: - # Search for the function arguments or an initializer list. We use a - # simple heuristic here: If the line is indented 4 spaces; and we have a - # closing paren, without the opening paren, followed by an opening brace - # or colon (for initializer lists) we assume that it is the last line of - # a function header. If we have a colon indented 4 spaces, it is an - # initializer list. 
- exception = (Match(r' {4}\w[^\(]*\)\s*(const\s*)?(\{\s*$|:)', - prev_line) - or Match(r' {4}:', prev_line)) - - if not exception: - error(filename, linenum, 'whitespace/blank_line', 2, - 'Redundant blank line at the start of a code block ' - 'should be deleted.') - # Ignore blank lines at the end of a block in a long if-else - # chain, like this: - # if (condition1) { - # // Something followed by a blank line - # - # } else if (condition2) { - # // Something else - # } - if linenum + 1 < clean_lines.NumLines(): - next_line = raw[linenum + 1] - if (next_line - and Match(r'\s*}', next_line) - and next_line.find('} else ') == -1): - error(filename, linenum, 'whitespace/blank_line', 3, - 'Redundant blank line at the end of a code block ' - 'should be deleted.') - - matched = Match(r'\s*(public|protected|private):', prev_line) - if matched: - error(filename, linenum, 'whitespace/blank_line', 3, - 'Do not leave a blank line after "%s:"' % matched.group(1)) - - # Next, we complain if there's a comment too near the text - commentpos = line.find('//') - if commentpos != -1: - # Check if the // may be in quotes. If so, ignore it - # Comparisons made explicit for clarity -- pylint: disable=g-explicit-bool-comparison - if (line.count('"', 0, commentpos) - - line.count('\\"', 0, commentpos)) % 2 == 0: # not in quotes - # Allow one space for new scopes, two spaces otherwise: - if (not Match(r'^\s*{ //', line) and - ((commentpos >= 1 and - line[commentpos-1] not in string.whitespace) or - (commentpos >= 2 and - line[commentpos-2] not in string.whitespace))): - error(filename, linenum, 'whitespace/comments', 2, - 'At least two spaces is best between code and comments') - # There should always be a space between the // and the comment - commentend = commentpos + 2 - if commentend < len(line) and not line[commentend] == ' ': - # but some lines are exceptions -- e.g. 
if they're big - # comment delimiters like: - # //---------------------------------------------------------- - # or are an empty C++ style Doxygen comment, like: - # /// - # or C++ style Doxygen comments placed after the variable: - # ///< Header comment - # //!< Header comment - # or they begin with multiple slashes followed by a space: - # //////// Header comment - match = (Search(r'[=/-]{4,}\s*$', line[commentend:]) or - Search(r'^/$', line[commentend:]) or - Search(r'^!< ', line[commentend:]) or - Search(r'^/< ', line[commentend:]) or - Search(r'^/+ ', line[commentend:])) - if not match: - error(filename, linenum, 'whitespace/comments', 4, - 'Should have a space between // and comment') - CheckComment(line[commentpos:], filename, linenum, error) - - line = clean_lines.elided[linenum] # get rid of comments and strings - - # Don't try to do spacing checks for operator methods - line = re.sub(r'operator(==|!=|<|<<|<=|>=|>>|>)\(', 'operator\(', line) - - # We allow no-spaces around = within an if: "if ( (a=Foo()) == 0 )". - # Otherwise not. Note we only check for non-spaces on *both* sides; - # sometimes people put non-spaces on one side when aligning ='s among - # many lines (not that this is behavior that I approve of...) - if Search(r'[\w.]=[\w.]', line) and not Search(r'\b(if|while) ', line): - error(filename, linenum, 'whitespace/operators', 4, - 'Missing spaces around =') - - # It's ok not to have spaces around binary operators like + - * /, but if - # there's too little whitespace, we get concerned. It's hard to tell, - # though, so we punt on this one for now. TODO. - - # You should always have whitespace around binary operators. - # - # Check <= and >= first to avoid false positives with < and >, then - # check non-include lines for spacing around < and >. 
- match = Search(r'[^<>=!\s](==|!=|<=|>=)[^<>=!\s]', line) - if match: - error(filename, linenum, 'whitespace/operators', 3, - 'Missing spaces around %s' % match.group(1)) - # We allow no-spaces around << when used like this: 10<<20, but - # not otherwise (particularly, not when used as streams) - # Also ignore using ns::operator<<; - match = Search(r'(operator|\S)(?:L|UL|ULL|l|ul|ull)?<<(\S)', line) - if (match and - not (match.group(1).isdigit() and match.group(2).isdigit()) and - not (match.group(1) == 'operator' and match.group(2) == ';')): - error(filename, linenum, 'whitespace/operators', 3, - 'Missing spaces around <<') - elif not Match(r'#.*include', line): - # Avoid false positives on -> - reduced_line = line.replace('->', '') - - # Look for < that is not surrounded by spaces. This is only - # triggered if both sides are missing spaces, even though - # technically should flag if at least one side is missing a - # space. This is done to avoid some false positives with shifts. - match = Search(r'[^\s<]<([^\s=<].*)', reduced_line) - if (match and - not FindNextMatchingAngleBracket(clean_lines, linenum, match.group(1))): - error(filename, linenum, 'whitespace/operators', 3, - 'Missing spaces around <') - - # Look for > that is not surrounded by spaces. Similar to the - # above, we only trigger if both sides are missing spaces to avoid - # false positives with shifts. - match = Search(r'^(.*[^\s>])>[^\s=>]', reduced_line) - if (match and - not FindPreviousMatchingAngleBracket(clean_lines, linenum, - match.group(1))): - error(filename, linenum, 'whitespace/operators', 3, - 'Missing spaces around >') - - # We allow no-spaces around >> for almost anything. This is because - # C++11 allows ">>" to close nested templates, which accounts for - # most cases when ">>" is not followed by a space. 
- # - # We still warn on ">>" followed by alpha character, because that is - # likely due to ">>" being used for right shifts, e.g.: - # value >> alpha - # - # When ">>" is used to close templates, the alphanumeric letter that - # follows would be part of an identifier, and there should still be - # a space separating the template type and the identifier. - # type> alpha - match = Search(r'>>[a-zA-Z_]', line) - if match: - error(filename, linenum, 'whitespace/operators', 3, - 'Missing spaces around >>') - - # There shouldn't be space around unary operators - match = Search(r'(!\s|~\s|[\s]--[\s;]|[\s]\+\+[\s;])', line) - if match: - error(filename, linenum, 'whitespace/operators', 4, - 'Extra space for operator %s' % match.group(1)) - - # A pet peeve of mine: no spaces after an if, while, switch, or for - match = Search(r' (if\(|for\(|while\(|switch\()', line) - if match: - error(filename, linenum, 'whitespace/parens', 5, - 'Missing space before ( in %s' % match.group(1)) - - # For if/for/while/switch, the left and right parens should be - # consistent about how many spaces are inside the parens, and - # there should either be zero or one spaces inside the parens. - # We don't want: "if ( foo)" or "if ( foo )". - # Exception: "for ( ; foo; bar)" and "for (foo; bar; )" are allowed. 
- match = Search(r'\b(if|for|while|switch)\s*' - r'\(([ ]*)(.).*[^ ]+([ ]*)\)\s*{\s*$', - line) - if match: - if len(match.group(2)) != len(match.group(4)): - if not (match.group(3) == ';' and - len(match.group(2)) == 1 + len(match.group(4)) or - not match.group(2) and Search(r'\bfor\s*\(.*; \)', line)): - error(filename, linenum, 'whitespace/parens', 5, - 'Mismatching spaces inside () in %s' % match.group(1)) - if len(match.group(2)) not in [0, 1]: - error(filename, linenum, 'whitespace/parens', 5, - 'Should have zero or one spaces inside ( and ) in %s' % - match.group(1)) - - # You should always have a space after a comma (either as fn arg or operator) - # - # This does not apply when the non-space character following the - # comma is another comma, since the only time when that happens is - # for empty macro arguments. - # - # We run this check in two passes: first pass on elided lines to - # verify that lines contain missing whitespaces, second pass on raw - # lines to confirm that those missing whitespaces are not due to - # elided comments. - if Search(r',[^,\s]', line) and Search(r',[^,\s]', raw[linenum]): - error(filename, linenum, 'whitespace/comma', 3, - 'Missing space after ,') - - # You should always have a space after a semicolon - # except for few corner cases - # TODO(unknown): clarify if 'if (1) { return 1;}' is requires one more - # space after ; - if Search(r';[^\s};\\)/]', line): - error(filename, linenum, 'whitespace/semicolon', 3, - 'Missing space after ;') - - # Next we will look for issues with function calls. - CheckSpacingForFunctionCall(filename, line, linenum, error) - - # Except after an opening paren, or after another opening brace (in case of - # an initializer list, for instance), you should have spaces before your - # braces. And since you should never have braces at the beginning of a line, - # this is an easy test. - match = Match(r'^(.*[^ ({]){', line) - if match: - # Try a bit harder to check for brace initialization. 
This - # happens in one of the following forms: - # Constructor() : initializer_list_{} { ... } - # Constructor{}.MemberFunction() - # Type variable{}; - # FunctionCall(type{}, ...); - # LastArgument(..., type{}); - # LOG(INFO) << type{} << " ..."; - # map_of_type[{...}] = ...; - # - # We check for the character following the closing brace, and - # silence the warning if it's one of those listed above, i.e. - # "{.;,)<]". - # - # To account for nested initializer list, we allow any number of - # closing braces up to "{;,)<". We can't simply silence the - # warning on first sight of closing brace, because that would - # cause false negatives for things that are not initializer lists. - # Silence this: But not this: - # Outer{ if (...) { - # Inner{...} if (...){ // Missing space before { - # }; } - # - # There is a false negative with this approach if people inserted - # spurious semicolons, e.g. "if (cond){};", but we will catch the - # spurious semicolon with a separate check. - (endline, endlinenum, endpos) = CloseExpression( - clean_lines, linenum, len(match.group(1))) - trailing_text = '' - if endpos > -1: - trailing_text = endline[endpos:] - for offset in xrange(endlinenum + 1, - min(endlinenum + 3, clean_lines.NumLines() - 1)): - trailing_text += clean_lines.elided[offset] - if not Match(r'^[\s}]*[{.;,)<\]]', trailing_text): - error(filename, linenum, 'whitespace/braces', 5, - 'Missing space before {') - - # Make sure '} else {' has spaces. - if Search(r'}else', line): - error(filename, linenum, 'whitespace/braces', 5, - 'Missing space before else') - - # You shouldn't have spaces before your brackets, except maybe after - # 'delete []' or 'new char * []'. - if Search(r'\w\s+\[', line) and not Search(r'delete\s+\[', line): - error(filename, linenum, 'whitespace/braces', 5, - 'Extra space before [') - - # You shouldn't have a space before a semicolon at the end of the line. 
- # There's a special case for "for" since the style guide allows space before - # the semicolon there. - if Search(r':\s*;\s*$', line): - error(filename, linenum, 'whitespace/semicolon', 5, - 'Semicolon defining empty statement. Use {} instead.') - elif Search(r'^\s*;\s*$', line): - error(filename, linenum, 'whitespace/semicolon', 5, - 'Line contains only semicolon. If this should be an empty statement, ' - 'use {} instead.') - elif (Search(r'\s+;\s*$', line) and - not Search(r'\bfor\b', line)): - error(filename, linenum, 'whitespace/semicolon', 5, - 'Extra space before last semicolon. If this should be an empty ' - 'statement, use {} instead.') - - # In range-based for, we wanted spaces before and after the colon, but - # not around "::" tokens that might appear. - if (Search('for *\(.*[^:]:[^: ]', line) or - Search('for *\(.*[^: ]:[^:]', line)): - error(filename, linenum, 'whitespace/forcolon', 2, - 'Missing space around colon in range-based for loop') - - -def CheckSectionSpacing(filename, clean_lines, class_info, linenum, error): - """Checks for additional blank line issues related to sections. - - Currently the only thing checked here is blank line before protected/private. - - Args: - filename: The name of the current file. - clean_lines: A CleansedLines instance containing the file. - class_info: A _ClassInfo objects. - linenum: The number of the line to check. - error: The function to call with any errors found. - """ - # Skip checks if the class is small, where small means 25 lines or less. - # 25 lines seems like a good cutoff since that's the usual height of - # terminals, and any class that can't fit in one screen can't really - # be considered "small". - # - # Also skip checks if we are on the first line. This accounts for - # classes that look like - # class Foo { public: ... }; - # - # If we didn't find the end of the class, last_line would be zero, - # and the check will be skipped by the first condition. 
- if (class_info.last_line - class_info.starting_linenum <= 24 or - linenum <= class_info.starting_linenum): - return - - matched = Match(r'\s*(public|protected|private):', clean_lines.lines[linenum]) - if matched: - # Issue warning if the line before public/protected/private was - # not a blank line, but don't do this if the previous line contains - # "class" or "struct". This can happen two ways: - # - We are at the beginning of the class. - # - We are forward-declaring an inner class that is semantically - # private, but needed to be public for implementation reasons. - # Also ignores cases where the previous line ends with a backslash as can be - # common when defining classes in C macros. - prev_line = clean_lines.lines[linenum - 1] - if (not IsBlankLine(prev_line) and - not Search(r'\b(class|struct)\b', prev_line) and - not Search(r'\\$', prev_line)): - # Try a bit harder to find the beginning of the class. This is to - # account for multi-line base-specifier lists, e.g.: - # class Derived - # : public Base { - end_class_head = class_info.starting_linenum - for i in range(class_info.starting_linenum, linenum): - if Search(r'\{\s*$', clean_lines.lines[i]): - end_class_head = i - break - if end_class_head < linenum - 1: - error(filename, linenum, 'whitespace/blank_line', 3, - '"%s:" should be preceded by a blank line' % matched.group(1)) - - -def GetPreviousNonBlankLine(clean_lines, linenum): - """Return the most recent non-blank line and its line number. - - Args: - clean_lines: A CleansedLines instance containing the file contents. - linenum: The number of the line to check. - - Returns: - A tuple with two elements. The first element is the contents of the last - non-blank line before the current line, or the empty string if this is the - first non-blank line. The second is the line number of that line, or -1 - if this is the first non-blank line. 
- """ - - prevlinenum = linenum - 1 - while prevlinenum >= 0: - prevline = clean_lines.elided[prevlinenum] - if not IsBlankLine(prevline): # if not a blank line... - return (prevline, prevlinenum) - prevlinenum -= 1 - return ('', -1) - - -def CheckBraces(filename, clean_lines, linenum, error): - """Looks for misplaced braces (e.g. at the end of line). - - Args: - filename: The name of the current file. - clean_lines: A CleansedLines instance containing the file. - linenum: The number of the line to check. - error: The function to call with any errors found. - """ - - line = clean_lines.elided[linenum] # get rid of comments and strings - - if Match(r'\s*{\s*$', line): - # We allow an open brace to start a line in the case where someone is using - # braces in a block to explicitly create a new scope, which is commonly used - # to control the lifetime of stack-allocated variables. Braces are also - # used for brace initializers inside function calls. We don't detect this - # perfectly: we just don't complain if the last non-whitespace character on - # the previous non-blank line is ',', ';', ':', '(', '{', or '}', or if the - # previous line starts a preprocessor block. - prevline = GetPreviousNonBlankLine(clean_lines, linenum)[0] - if (not Search(r'[,;:}{(]\s*$', prevline) and - not Match(r'\s*#', prevline)): - error(filename, linenum, 'whitespace/braces', 4, - '{ should almost always be at the end of the previous line') - - # An else clause should be on the same line as the preceding closing brace. - if Match(r'\s*else\s*', line): - prevline = GetPreviousNonBlankLine(clean_lines, linenum)[0] - if Match(r'\s*}\s*$', prevline): - error(filename, linenum, 'whitespace/newline', 4, - 'An else should appear on the same line as the preceding }') - - # If braces come on one side of an else, they should be on both. - # However, we have to worry about "else if" that spans multiple lines! 
- if Search(r'}\s*else[^{]*$', line) or Match(r'[^}]*else\s*{', line): - if Search(r'}\s*else if([^{]*)$', line): # could be multi-line if - # find the ( after the if - pos = line.find('else if') - pos = line.find('(', pos) - if pos > 0: - (endline, _, endpos) = CloseExpression(clean_lines, linenum, pos) - if endline[endpos:].find('{') == -1: # must be brace after if - error(filename, linenum, 'readability/braces', 5, - 'If an else has a brace on one side, it should have it on both') - else: # common case: else not followed by a multi-line if - error(filename, linenum, 'readability/braces', 5, - 'If an else has a brace on one side, it should have it on both') - - # Likewise, an else should never have the else clause on the same line - if Search(r'\belse [^\s{]', line) and not Search(r'\belse if\b', line): - error(filename, linenum, 'whitespace/newline', 4, - 'Else clause should never be on same line as else (use 2 lines)') - - # In the same way, a do/while should never be on one line - if Match(r'\s*do [^\s{]', line): - error(filename, linenum, 'whitespace/newline', 4, - 'do/while clauses should not be on a single line') - - # Block bodies should not be followed by a semicolon. Due to C++11 - # brace initialization, there are more places where semicolons are - # required than not, so we use a whitelist approach to check these - # rather than a blacklist. These are the places where "};" should - # be replaced by just "}": - # 1. Some flavor of block following closing parenthesis: - # for (;;) {}; - # while (...) {}; - # switch (...) {}; - # Function(...) {}; - # if (...) {}; - # if (...) else if (...) {}; - # - # 2. else block: - # if (...) else {}; - # - # 3. const member function: - # Function(...) const {}; - # - # 4. Block following some statement: - # x = 42; - # {}; - # - # 5. Block at the beginning of a function: - # Function(...) 
{ - # {}; - # } - # - # Note that naively checking for the preceding "{" will also match - # braces inside multi-dimensional arrays, but this is fine since - # that expression will not contain semicolons. - # - # 6. Block following another block: - # while (true) {} - # {}; - # - # 7. End of namespaces: - # namespace {}; - # - # These semicolons seems far more common than other kinds of - # redundant semicolons, possibly due to people converting classes - # to namespaces. For now we do not warn for this case. - # - # Try matching case 1 first. - match = Match(r'^(.*\)\s*)\{', line) - if match: - # Matched closing parenthesis (case 1). Check the token before the - # matching opening parenthesis, and don't warn if it looks like a - # macro. This avoids these false positives: - # - macro that defines a base class - # - multi-line macro that defines a base class - # - macro that defines the whole class-head - # - # But we still issue warnings for macros that we know are safe to - # warn, specifically: - # - TEST, TEST_F, TEST_P, MATCHER, MATCHER_P - # - TYPED_TEST - # - INTERFACE_DEF - # - EXCLUSIVE_LOCKS_REQUIRED, SHARED_LOCKS_REQUIRED, LOCKS_EXCLUDED: - # - # We implement a whitelist of safe macros instead of a blacklist of - # unsafe macros, even though the latter appears less frequently in - # google code and would have been easier to implement. This is because - # the downside for getting the whitelist wrong means some extra - # semicolons, while the downside for getting the blacklist wrong - # would result in compile errors. - # - # In addition to macros, we also don't want to warn on compound - # literals. 
- closing_brace_pos = match.group(1).rfind(')') - opening_parenthesis = ReverseCloseExpression( - clean_lines, linenum, closing_brace_pos) - if opening_parenthesis[2] > -1: - line_prefix = opening_parenthesis[0][0:opening_parenthesis[2]] - macro = Search(r'\b([A-Z_]+)\s*$', line_prefix) - if ((macro and - macro.group(1) not in ( - 'TEST', 'TEST_F', 'MATCHER', 'MATCHER_P', 'TYPED_TEST', - 'EXCLUSIVE_LOCKS_REQUIRED', 'SHARED_LOCKS_REQUIRED', - 'LOCKS_EXCLUDED', 'INTERFACE_DEF')) or - Search(r'\s+=\s*$', line_prefix)): - match = None - # Whitelist lambda function definition which also requires a ";" after - # closing brace - if match: - if Match(r'^.*\[.*\]\s*(.*\)\s*)\{', line): - match = None - - else: - # Try matching cases 2-3. - match = Match(r'^(.*(?:else|\)\s*const)\s*)\{', line) - if not match: - # Try matching cases 4-6. These are always matched on separate lines. - # - # Note that we can't simply concatenate the previous line to the - # current line and do a single match, otherwise we may output - # duplicate warnings for the blank line case: - # if (cond) { - # // blank line - # } - prevline = GetPreviousNonBlankLine(clean_lines, linenum)[0] - if prevline and Search(r'[;{}]\s*$', prevline): - match = Match(r'^(\s*)\{', line) - - # Check matching closing brace - if match: - (endline, endlinenum, endpos) = CloseExpression( - clean_lines, linenum, len(match.group(1))) - if endpos > -1 and Match(r'^\s*;', endline[endpos:]): - # Current {} pair is eligible for semicolon check, and we have found - # the redundant semicolon, output warning here. - # - # Note: because we are scanning forward for opening braces, and - # outputting warnings for the matching closing brace, if there are - # nested blocks with trailing semicolons, we will get the error - # messages in reversed order. 
- error(filename, endlinenum, 'readability/braces', 4, - "You don't need a ; after a }") - - -def CheckEmptyBlockBody(filename, clean_lines, linenum, error): - """Look for empty loop/conditional body with only a single semicolon. - - Args: - filename: The name of the current file. - clean_lines: A CleansedLines instance containing the file. - linenum: The number of the line to check. - error: The function to call with any errors found. - """ - - # Search for loop keywords at the beginning of the line. Because only - # whitespaces are allowed before the keywords, this will also ignore most - # do-while-loops, since those lines should start with closing brace. - # - # We also check "if" blocks here, since an empty conditional block - # is likely an error. - line = clean_lines.elided[linenum] - matched = Match(r'\s*(for|while|if)\s*\(', line) - if matched: - # Find the end of the conditional expression - (end_line, end_linenum, end_pos) = CloseExpression( - clean_lines, linenum, line.find('(')) - - # Output warning if what follows the condition expression is a semicolon. - # No warning for all other cases, including whitespace or newline, since we - # have a separate check for semicolons preceded by whitespace. - if end_pos >= 0 and Match(r';', end_line[end_pos:]): - if matched.group(1) == 'if': - error(filename, end_linenum, 'whitespace/empty_conditional_body', 5, - 'Empty conditional bodies should use {}') - else: - error(filename, end_linenum, 'whitespace/empty_loop_body', 5, - 'Empty loop bodies should use {} or continue') - - -def CheckCheck(filename, clean_lines, linenum, error): - """Checks the use of CHECK and EXPECT macros. - - Args: - filename: The name of the current file. - clean_lines: A CleansedLines instance containing the file. - linenum: The number of the line to check. - error: The function to call with any errors found. 
- """ - - # Decide the set of replacement macros that should be suggested - lines = clean_lines.elided - check_macro = None - start_pos = -1 - for macro in _CHECK_MACROS: - i = lines[linenum].find(macro) - if i >= 0: - check_macro = macro - - # Find opening parenthesis. Do a regular expression match here - # to make sure that we are matching the expected CHECK macro, as - # opposed to some other macro that happens to contain the CHECK - # substring. - matched = Match(r'^(.*\b' + check_macro + r'\s*)\(', lines[linenum]) - if not matched: - continue - start_pos = len(matched.group(1)) - break - if not check_macro or start_pos < 0: - # Don't waste time here if line doesn't contain 'CHECK' or 'EXPECT' - return - - # Find end of the boolean expression by matching parentheses - (last_line, end_line, end_pos) = CloseExpression( - clean_lines, linenum, start_pos) - if end_pos < 0: - return - if linenum == end_line: - expression = lines[linenum][start_pos + 1:end_pos - 1] - else: - expression = lines[linenum][start_pos + 1:] - for i in xrange(linenum + 1, end_line): - expression += lines[i] - expression += last_line[0:end_pos - 1] - - # Parse expression so that we can take parentheses into account. - # This avoids false positives for inputs like "CHECK((a < 4) == b)", - # which is not replaceable by CHECK_LE. - lhs = '' - rhs = '' - operator = None - while expression: - matched = Match(r'^\s*(<<|<<=|>>|>>=|->\*|->|&&|\|\||' - r'==|!=|>=|>|<=|<|\()(.*)$', expression) - if matched: - token = matched.group(1) - if token == '(': - # Parenthesized operand - expression = matched.group(2) - (end, _) = FindEndOfExpressionInLine(expression, 0, 1, '(', ')') - if end < 0: - return # Unmatched parenthesis - lhs += '(' + expression[0:end] - expression = expression[end:] - elif token in ('&&', '||'): - # Logical and/or operators. 
This means the expression - # contains more than one term, for example: - # CHECK(42 < a && a < b); - # - # These are not replaceable with CHECK_LE, so bail out early. - return - elif token in ('<<', '<<=', '>>', '>>=', '->*', '->'): - # Non-relational operator - lhs += token - expression = matched.group(2) - else: - # Relational operator - operator = token - rhs = matched.group(2) - break - else: - # Unparenthesized operand. Instead of appending to lhs one character - # at a time, we do another regular expression match to consume several - # characters at once if possible. Trivial benchmark shows that this - # is more efficient when the operands are longer than a single - # character, which is generally the case. - matched = Match(r'^([^-=!<>()&|]+)(.*)$', expression) - if not matched: - matched = Match(r'^(\s*\S)(.*)$', expression) - if not matched: - break - lhs += matched.group(1) - expression = matched.group(2) - - # Only apply checks if we got all parts of the boolean expression - if not (lhs and operator and rhs): - return - - # Check that rhs do not contain logical operators. We already know - # that lhs is fine since the loop above parses out && and ||. - if rhs.find('&&') > -1 or rhs.find('||') > -1: - return - - # At least one of the operands must be a constant literal. This is - # to avoid suggesting replacements for unprintable things like - # CHECK(variable != iterator) - # - # The following pattern matches decimal, hex integers, strings, and - # characters (in that order). 
- lhs = lhs.strip() - rhs = rhs.strip() - match_constant = r'^([-+]?(\d+|0[xX][0-9a-fA-F]+)[lLuU]{0,3}|".*"|\'.*\')$' - if Match(match_constant, lhs) or Match(match_constant, rhs): - # Note: since we know both lhs and rhs, we can provide a more - # descriptive error message like: - # Consider using CHECK_EQ(x, 42) instead of CHECK(x == 42) - # Instead of: - # Consider using CHECK_EQ instead of CHECK(a == b) - # - # We are still keeping the less descriptive message because if lhs - # or rhs gets long, the error message might become unreadable. - error(filename, linenum, 'readability/check', 2, - 'Consider using %s instead of %s(a %s b)' % ( - _CHECK_REPLACEMENT[check_macro][operator], - check_macro, operator)) - - -def CheckAltTokens(filename, clean_lines, linenum, error): - """Check alternative keywords being used in boolean expressions. - - Args: - filename: The name of the current file. - clean_lines: A CleansedLines instance containing the file. - linenum: The number of the line to check. - error: The function to call with any errors found. - """ - line = clean_lines.elided[linenum] - - # Avoid preprocessor lines - if Match(r'^\s*#', line): - return - - # Last ditch effort to avoid multi-line comments. This will not help - # if the comment started before the current line or ended after the - # current line, but it catches most of the false positives. At least, - # it provides a way to workaround this warning for people who use - # multi-line comments in preprocessor macros. - # - # TODO(unknown): remove this once cpplint has better support for - # multi-line comments. - if line.find('/*') >= 0 or line.find('*/') >= 0: - return - - for match in _ALT_TOKEN_REPLACEMENT_PATTERN.finditer(line): - error(filename, linenum, 'readability/alt_tokens', 2, - 'Use operator %s instead of %s' % ( - _ALT_TOKEN_REPLACEMENT[match.group(1)], match.group(1))) - - -def GetLineWidth(line): - """Determines the width of the line in column positions. 
- - Args: - line: A string, which may be a Unicode string. - - Returns: - The width of the line in column positions, accounting for Unicode - combining characters and wide characters. - """ - if isinstance(line, unicode): - width = 0 - for uc in unicodedata.normalize('NFC', line): - if unicodedata.east_asian_width(uc) in ('W', 'F'): - width += 2 - elif not unicodedata.combining(uc): - width += 1 - return width - else: - return len(line) - - -def CheckStyle(filename, clean_lines, linenum, file_extension, nesting_state, - error): - """Checks rules from the 'C++ style rules' section of cppguide.html. - - Most of these rules are hard to test (naming, comment style), but we - do what we can. In particular we check for 2-space indents, line lengths, - tab usage, spaces inside code, etc. - - Args: - filename: The name of the current file. - clean_lines: A CleansedLines instance containing the file. - linenum: The number of the line to check. - file_extension: The extension (without the dot) of the filename. - nesting_state: A _NestingState instance which maintains information about - the current stack of nested blocks being parsed. - error: The function to call with any errors found. - """ - - # Don't use "elided" lines here, otherwise we can't check commented lines. - # Don't want to use "raw" either, because we don't want to check inside C++11 - # raw strings, - raw_lines = clean_lines.lines_without_raw_strings - line = raw_lines[linenum] - - if line.find('\t') != -1: - error(filename, linenum, 'whitespace/tab', 1, - 'Tab found; better to use spaces') - - # One or three blank spaces at the beginning of the line is weird; it's - # hard to reconcile that with 2-space indents. - # NOTE: here are the conditions rob pike used for his tests. 
Mine aren't - # as sophisticated, but it may be worth becoming so: RLENGTH==initial_spaces - # if(RLENGTH > 20) complain = 0; - # if(match($0, " +(error|private|public|protected):")) complain = 0; - # if(match(prev, "&& *$")) complain = 0; - # if(match(prev, "\\|\\| *$")) complain = 0; - # if(match(prev, "[\",=><] *$")) complain = 0; - # if(match($0, " <<")) complain = 0; - # if(match(prev, " +for \\(")) complain = 0; - # if(prevodd && match(prevprev, " +for \\(")) complain = 0; - initial_spaces = 0 - cleansed_line = clean_lines.elided[linenum] - while initial_spaces < len(line) and line[initial_spaces] == ' ': - initial_spaces += 1 - if line and line[-1].isspace(): - error(filename, linenum, 'whitespace/end_of_line', 4, - 'Line ends in whitespace. Consider deleting these extra spaces.') - # There are certain situations we allow one space, notably for section labels - elif ((initial_spaces == 1 or initial_spaces == 3) and - not Match(r'\s*\w+\s*:\s*$', cleansed_line)): - error(filename, linenum, 'whitespace/indent', 3, - 'Weird number of spaces at line-start. ' - 'Are you using a 2-space indent?') - - # Check if the line is a header guard. - is_header_guard = False - if file_extension == 'h': - cppvar = GetHeaderGuardCPPVariable(filename) - if (line.startswith('#ifndef %s' % cppvar) or - line.startswith('#define %s' % cppvar) or - line.startswith('#endif // %s' % cppvar)): - is_header_guard = True - # #include lines and header guards can be long, since there's no clean way to - # split them. - # - # URLs can be long too. It's possible to split these, but it makes them - # harder to cut&paste. - # - # The "$Id:...$" comment may also get very long without it being the - # developers fault. 
- if (not line.startswith('#include') and not is_header_guard and - not Match(r'^\s*//.*http(s?)://\S*$', line) and - not Match(r'^// \$Id:.*#[0-9]+ \$$', line)): - line_width = GetLineWidth(line) - extended_length = int((_line_length * 1.25)) - if line_width > extended_length: - error(filename, linenum, 'whitespace/line_length', 4, - 'Lines should very rarely be longer than %i characters' % - extended_length) - elif line_width > _line_length: - error(filename, linenum, 'whitespace/line_length', 2, - 'Lines should be <= %i characters long' % _line_length) - - if (cleansed_line.count(';') > 1 and - # for loops are allowed two ;'s (and may run over two lines). - cleansed_line.find('for') == -1 and - (GetPreviousNonBlankLine(clean_lines, linenum)[0].find('for') == -1 or - GetPreviousNonBlankLine(clean_lines, linenum)[0].find(';') != -1) and - # It's ok to have many commands in a switch case that fits in 1 line - not ((cleansed_line.find('case ') != -1 or - cleansed_line.find('default:') != -1) and - cleansed_line.find('break;') != -1)): - error(filename, linenum, 'whitespace/newline', 0, - 'More than one command on the same line') - - # Some more style checks - CheckBraces(filename, clean_lines, linenum, error) - CheckEmptyBlockBody(filename, clean_lines, linenum, error) - CheckAccess(filename, clean_lines, linenum, nesting_state, error) - CheckSpacing(filename, clean_lines, linenum, nesting_state, error) - CheckCheck(filename, clean_lines, linenum, error) - CheckAltTokens(filename, clean_lines, linenum, error) - classinfo = nesting_state.InnermostClass() - if classinfo: - CheckSectionSpacing(filename, clean_lines, classinfo, linenum, error) - - -_RE_PATTERN_INCLUDE_NEW_STYLE = re.compile(r'#include +"[^/]+\.h"') -_RE_PATTERN_INCLUDE = re.compile(r'^\s*#\s*include\s*([<"])([^>"]*)[>"].*$') -# Matches the first component of a filename delimited by -s and _s. 
That is: -# _RE_FIRST_COMPONENT.match('foo').group(0) == 'foo' -# _RE_FIRST_COMPONENT.match('foo.cc').group(0) == 'foo' -# _RE_FIRST_COMPONENT.match('foo-bar_baz.cc').group(0) == 'foo' -# _RE_FIRST_COMPONENT.match('foo_bar-baz.cc').group(0) == 'foo' -_RE_FIRST_COMPONENT = re.compile(r'^[^-_.]+') - - -def _DropCommonSuffixes(filename): - """Drops common suffixes like _test.cc or -inl.h from filename. - - For example: - >>> _DropCommonSuffixes('foo/foo-inl.h') - 'foo/foo' - >>> _DropCommonSuffixes('foo/bar/foo.cc') - 'foo/bar/foo' - >>> _DropCommonSuffixes('foo/foo_internal.h') - 'foo/foo' - >>> _DropCommonSuffixes('foo/foo_unusualinternal.h') - 'foo/foo_unusualinternal' - - Args: - filename: The input filename. - - Returns: - The filename with the common suffix removed. - """ - for suffix in ('test.cc', 'regtest.cc', 'unittest.cc', - 'inl.h', 'impl.h', 'internal.h'): - if (filename.endswith(suffix) and len(filename) > len(suffix) and - filename[-len(suffix) - 1] in ('-', '_')): - return filename[:-len(suffix) - 1] - return os.path.splitext(filename)[0] - - -def _IsTestFilename(filename): - """Determines if the given filename has a suffix that identifies it as a test. - - Args: - filename: The input filename. - - Returns: - True if 'filename' looks like a test, False otherwise. - """ - if (filename.endswith('_test.cc') or - filename.endswith('_unittest.cc') or - filename.endswith('_regtest.cc')): - return True - else: - return False - - -def _ClassifyInclude(fileinfo, include, is_system): - """Figures out what kind of header 'include' is. - - Args: - fileinfo: The current file cpplint is running over. A FileInfo instance. - include: The path to a #included file. - is_system: True if the #include used <> rather than "". - - Returns: - One of the _XXX_HEADER constants. 
- - For example: - >>> _ClassifyInclude(FileInfo('foo/foo.cc'), 'stdio.h', True) - _C_SYS_HEADER - >>> _ClassifyInclude(FileInfo('foo/foo.cc'), 'string', True) - _CPP_SYS_HEADER - >>> _ClassifyInclude(FileInfo('foo/foo.cc'), 'foo/foo.h', False) - _LIKELY_MY_HEADER - >>> _ClassifyInclude(FileInfo('foo/foo_unknown_extension.cc'), - ... 'bar/foo_other_ext.h', False) - _POSSIBLE_MY_HEADER - >>> _ClassifyInclude(FileInfo('foo/foo.cc'), 'foo/bar.h', False) - _OTHER_HEADER - """ - # This is a list of all standard c++ header files, except - # those already checked for above. - is_cpp_h = include in _CPP_HEADERS - - if is_system: - if is_cpp_h: - return _CPP_SYS_HEADER - else: - return _C_SYS_HEADER - - # If the target file and the include we're checking share a - # basename when we drop common extensions, and the include - # lives in . , then it's likely to be owned by the target file. - target_dir, target_base = ( - os.path.split(_DropCommonSuffixes(fileinfo.RepositoryName()))) - include_dir, include_base = os.path.split(_DropCommonSuffixes(include)) - if target_base == include_base and ( - include_dir == target_dir or - include_dir == os.path.normpath(target_dir + '/../public')): - return _LIKELY_MY_HEADER - - # If the target and include share some initial basename - # component, it's possible the target is implementing the - # include, so it's allowed to be first, but we'll never - # complain if it's not there. - target_first_component = _RE_FIRST_COMPONENT.match(target_base) - include_first_component = _RE_FIRST_COMPONENT.match(include_base) - if (target_first_component and include_first_component and - target_first_component.group(0) == - include_first_component.group(0)): - return _POSSIBLE_MY_HEADER - - return _OTHER_HEADER - - - -def CheckIncludeLine(filename, clean_lines, linenum, include_state, error): - """Check rules that are applicable to #include lines. - - Strings on #include lines are NOT removed from elided line, to make - certain tasks easier. 
However, to prevent false positives, checks - applicable to #include lines in CheckLanguage must be put here. - - Args: - filename: The name of the current file. - clean_lines: A CleansedLines instance containing the file. - linenum: The number of the line to check. - include_state: An _IncludeState instance in which the headers are inserted. - error: The function to call with any errors found. - """ - fileinfo = FileInfo(filename) - - line = clean_lines.lines[linenum] - - # "include" should use the new style "foo/bar.h" instead of just "bar.h" - if _RE_PATTERN_INCLUDE_NEW_STYLE.search(line): - error(filename, linenum, 'build/include', 4, - 'Include the directory when naming .h files') - - # we shouldn't include a file more than once. actually, there are a - # handful of instances where doing so is okay, but in general it's - # not. - match = _RE_PATTERN_INCLUDE.search(line) - if match: - include = match.group(2) - is_system = (match.group(1) == '<') - if include in include_state: - error(filename, linenum, 'build/include', 4, - '"%s" already included at %s:%s' % - (include, filename, include_state[include])) - else: - include_state[include] = linenum - - # We want to ensure that headers appear in the right order: - # 1) for foo.cc, foo.h (preferred location) - # 2) c system files - # 3) cpp system files - # 4) for foo.cc, foo.h (deprecated location) - # 5) other google headers - # - # We classify each include statement as one of those 5 types - # using a number of techniques. The include_state object keeps - # track of the highest type seen, and complains if we see a - # lower type after that. - error_message = include_state.CheckNextIncludeOrder( - _ClassifyInclude(fileinfo, include, is_system)) - if error_message: - error(filename, linenum, 'build/include_order', 4, - '%s. Should be: %s.h, c system, c++ system, other.' 
% - (error_message, fileinfo.BaseName())) - canonical_include = include_state.CanonicalizeAlphabeticalOrder(include) - if not include_state.IsInAlphabeticalOrder( - clean_lines, linenum, canonical_include): - error(filename, linenum, 'build/include_alpha', 4, - 'Include "%s" not in alphabetical order' % include) - include_state.SetLastHeader(canonical_include) - - # Look for any of the stream classes that are part of standard C++. - match = _RE_PATTERN_INCLUDE.match(line) - if match: - include = match.group(2) - if Match(r'(f|ind|io|i|o|parse|pf|stdio|str|)?stream$', include): - # Many unit tests use cout, so we exempt them. - if not _IsTestFilename(filename): - error(filename, linenum, 'readability/streams', 3, - 'Streams are highly discouraged.') - - -def _GetTextInside(text, start_pattern): - r"""Retrieves all the text between matching open and close parentheses. - - Given a string of lines and a regular expression string, retrieve all the text - following the expression and between opening punctuation symbols like - (, [, or {, and the matching close-punctuation symbol. This properly nested - occurrences of the punctuations, so for the text like - printf(a(), b(c())); - a call to _GetTextInside(text, r'printf\(') will return 'a(), b(c())'. - start_pattern must match string having an open punctuation symbol at the end. - - Args: - text: The lines to extract text. Its comments and strings must be elided. - It can be single line and can span multiple lines. - start_pattern: The regexp string indicating where to start extracting - the text. - Returns: - The extracted text. - None if either the opening string or ending punctuation could not be found. - """ - # TODO(sugawarayu): Audit cpplint.py to see what places could be profitably - # rewritten to use _GetTextInside (and use inferior regexp matching today). - - # Give opening punctuations to get the matching close-punctuations. 
- matching_punctuation = {'(': ')', '{': '}', '[': ']'} - closing_punctuation = set(matching_punctuation.itervalues()) - - # Find the position to start extracting text. - match = re.search(start_pattern, text, re.M) - if not match: # start_pattern not found in text. - return None - start_position = match.end(0) - - assert start_position > 0, ( - 'start_pattern must ends with an opening punctuation.') - assert text[start_position - 1] in matching_punctuation, ( - 'start_pattern must ends with an opening punctuation.') - # Stack of closing punctuations we expect to have in text after position. - punctuation_stack = [matching_punctuation[text[start_position - 1]]] - position = start_position - while punctuation_stack and position < len(text): - if text[position] == punctuation_stack[-1]: - punctuation_stack.pop() - elif text[position] in closing_punctuation: - # A closing punctuation without matching opening punctuations. - return None - elif text[position] in matching_punctuation: - punctuation_stack.append(matching_punctuation[text[position]]) - position += 1 - if punctuation_stack: - # Opening punctuations left without matching close-punctuations. - return None - # punctuations match. - return text[start_position:position - 1] - - -# Patterns for matching call-by-reference parameters. -# -# Supports nested templates up to 2 levels deep using this messy pattern: -# < (?: < (?: < [^<>]* -# > -# | [^<>] )* -# > -# | [^<>] )* -# > -_RE_PATTERN_IDENT = r'[_a-zA-Z]\w*' # =~ [[:alpha:]][[:alnum:]]* -_RE_PATTERN_TYPE = ( - r'(?:const\s+)?(?:typename\s+|class\s+|struct\s+|union\s+|enum\s+)?' - r'(?:\w|' - r'\s*<(?:<(?:<[^<>]*>|[^<>])*>|[^<>])*>|' - r'::)+') -# A call-by-reference parameter ends with '& identifier'. 
-_RE_PATTERN_REF_PARAM = re.compile( - r'(' + _RE_PATTERN_TYPE + r'(?:\s*(?:\bconst\b|[*]))*\s*' - r'&\s*' + _RE_PATTERN_IDENT + r')\s*(?:=[^,()]+)?[,)]') -# A call-by-const-reference parameter either ends with 'const& identifier' -# or looks like 'const type& identifier' when 'type' is atomic. -_RE_PATTERN_CONST_REF_PARAM = ( - r'(?:.*\s*\bconst\s*&\s*' + _RE_PATTERN_IDENT + - r'|const\s+' + _RE_PATTERN_TYPE + r'\s*&\s*' + _RE_PATTERN_IDENT + r')') - - -def CheckLanguage(filename, clean_lines, linenum, file_extension, - include_state, nesting_state, error): - """Checks rules from the 'C++ language rules' section of cppguide.html. - - Some of these rules are hard to test (function overloading, using - uint32 inappropriately), but we do the best we can. - - Args: - filename: The name of the current file. - clean_lines: A CleansedLines instance containing the file. - linenum: The number of the line to check. - file_extension: The extension (without the dot) of the filename. - include_state: An _IncludeState instance in which the headers are inserted. - nesting_state: A _NestingState instance which maintains information about - the current stack of nested blocks being parsed. - error: The function to call with any errors found. - """ - # If the line is empty or consists of entirely a comment, no need to - # check it. - line = clean_lines.elided[linenum] - if not line: - return - - match = _RE_PATTERN_INCLUDE.search(line) - if match: - CheckIncludeLine(filename, clean_lines, linenum, include_state, error) - return - - # Reset include state across preprocessor directives. This is meant - # to silence warnings for conditional includes. - if Match(r'^\s*#\s*(?:ifdef|elif|else|endif)\b', line): - include_state.ResetSection() - - # Make Windows paths like Unix. - fullname = os.path.abspath(filename).replace('\\', '/') - - # TODO(unknown): figure out if they're using default arguments in fn proto. - - # Check to see if they're using an conversion function cast. 
- # I just try to capture the most common basic types, though there are more. - # Parameterless conversion functions, such as bool(), are allowed as they are - # probably a member operator declaration or default constructor. - match = Search( - r'(\bnew\s+)?\b' # Grab 'new' operator, if it's there - r'(int|float|double|bool|char|int32|uint32|int64|uint64)' - r'(\([^)].*)', line) - if match: - matched_new = match.group(1) - matched_type = match.group(2) - matched_funcptr = match.group(3) - - # gMock methods are defined using some variant of MOCK_METHODx(name, type) - # where type may be float(), int(string), etc. Without context they are - # virtually indistinguishable from int(x) casts. Likewise, gMock's - # MockCallback takes a template parameter of the form return_type(arg_type), - # which looks much like the cast we're trying to detect. - # - # std::function<> wrapper has a similar problem. - # - # Return types for function pointers also look like casts if they - # don't have an extra space. - if (matched_new is None and # If new operator, then this isn't a cast - not (Match(r'^\s*MOCK_(CONST_)?METHOD\d+(_T)?\(', line) or - Search(r'\bMockCallback<.*>', line) or - Search(r'\bstd::function<.*>', line)) and - not (matched_funcptr and - Match(r'\((?:[^() ]+::\s*\*\s*)?[^() ]+\)\s*\(', - matched_funcptr))): - # Try a bit harder to catch gmock lines: the only place where - # something looks like an old-style cast is where we declare the - # return type of the mocked method, and the only time when we - # are missing context is if MOCK_METHOD was split across - # multiple lines. The missing MOCK_METHOD is usually one or two - # lines back, so scan back one or two lines. - # - # It's not possible for gmock macros to appear in the first 2 - # lines, since the class head + section name takes up 2 lines. 
- if (linenum < 2 or - not (Match(r'^\s*MOCK_(?:CONST_)?METHOD\d+(?:_T)?\((?:\S+,)?\s*$', - clean_lines.elided[linenum - 1]) or - Match(r'^\s*MOCK_(?:CONST_)?METHOD\d+(?:_T)?\(\s*$', - clean_lines.elided[linenum - 2]))): - error(filename, linenum, 'readability/casting', 4, - 'Using deprecated casting style. ' - 'Use static_cast<%s>(...) instead' % - matched_type) - - CheckCStyleCast(filename, linenum, line, clean_lines.raw_lines[linenum], - 'static_cast', - r'\((int|float|double|bool|char|u?int(16|32|64))\)', error) - - # This doesn't catch all cases. Consider (const char * const)"hello". - # - # (char *) "foo" should always be a const_cast (reinterpret_cast won't - # compile). - if CheckCStyleCast(filename, linenum, line, clean_lines.raw_lines[linenum], - 'const_cast', r'\((char\s?\*+\s?)\)\s*"', error): - pass - else: - # Check pointer casts for other than string constants - CheckCStyleCast(filename, linenum, line, clean_lines.raw_lines[linenum], - 'reinterpret_cast', r'\((\w+\s?\*+\s?)\)', error) - - # In addition, we look for people taking the address of a cast. This - # is dangerous -- casts can assign to temporaries, so the pointer doesn't - # point where you think. - match = Search( - r'(?:&\(([^)]+)\)[\w(])|' - r'(?:&(static|dynamic|down|reinterpret)_cast\b)', line) - if match and match.group(1) != '*': - error(filename, linenum, 'runtime/casting', 4, - ('Are you taking an address of a cast? ' - 'This is dangerous: could be a temp var. ' - 'Take the address before doing the cast, rather than after')) - - # Create an extended_line, which is the concatenation of the current and - # next lines, for more effective checking of code that may span more than one - # line. - if linenum + 1 < clean_lines.NumLines(): - extended_line = line + clean_lines.elided[linenum + 1] - else: - extended_line = line - - # Check for people declaring static/global STL strings at the top level. 
- # This is dangerous because the C++ language does not guarantee that - # globals with constructors are initialized before the first access. - match = Match( - r'((?:|static +)(?:|const +))string +([a-zA-Z0-9_:]+)\b(.*)', - line) - # Make sure it's not a function. - # Function template specialization looks like: "string foo(...". - # Class template definitions look like: "string Foo::Method(...". - # - # Also ignore things that look like operators. These are matched separately - # because operator names cross non-word boundaries. If we change the pattern - # above, we would decrease the accuracy of matching identifiers. - if (match and - not Search(r'\boperator\W', line) and - not Match(r'\s*(<.*>)?(::[a-zA-Z0-9_]+)?\s*\(([^"]|$)', match.group(3))): - error(filename, linenum, 'runtime/string', 4, - 'For a static/global string constant, use a C style string instead: ' - '"%schar %s[]".' % - (match.group(1), match.group(2))) - - if Search(r'\b([A-Za-z0-9_]*_)\(\1\)', line): - error(filename, linenum, 'runtime/init', 4, - 'You seem to be initializing a member variable with itself.') - - if file_extension == 'h': - # TODO(unknown): check that 1-arg constructors are explicit. - # How to tell it's a constructor? - # (handled in CheckForNonStandardConstructs for now) - # TODO(unknown): check that classes have DISALLOW_EVIL_CONSTRUCTORS - # (level 1 error) - pass - - # Check if people are using the verboten C basic types. The only exception - # we regularly allow is "unsigned short port" for port. - if Search(r'\bshort port\b', line): - if not Search(r'\bunsigned short port\b', line): - error(filename, linenum, 'runtime/int', 4, - 'Use "unsigned short" for ports, not "short"') - else: - match = Search(r'\b(short|long(?! +double)|long long)\b', line) - if match: - error(filename, linenum, 'runtime/int', 4, - 'Use int16/int64/etc, rather than the C type %s' % match.group(1)) - - # When snprintf is used, the second argument shouldn't be a literal. 
- match = Search(r'snprintf\s*\(([^,]*),\s*([0-9]*)\s*,', line) - if match and match.group(2) != '0': - # If 2nd arg is zero, snprintf is used to calculate size. - error(filename, linenum, 'runtime/printf', 3, - 'If you can, use sizeof(%s) instead of %s as the 2nd arg ' - 'to snprintf.' % (match.group(1), match.group(2))) - - # Check if some verboten C functions are being used. - if Search(r'\bsprintf\b', line): - error(filename, linenum, 'runtime/printf', 5, - 'Never use sprintf. Use snprintf instead.') - match = Search(r'\b(strcpy|strcat)\b', line) - if match: - error(filename, linenum, 'runtime/printf', 4, - 'Almost always, snprintf is better than %s' % match.group(1)) - - # Check if some verboten operator overloading is going on - # TODO(unknown): catch out-of-line unary operator&: - # class X {}; - # int operator&(const X& x) { return 42; } // unary operator& - # The trick is it's hard to tell apart from binary operator&: - # class Y { int operator&(const Y& x) { return 23; } }; // binary operator& - if Search(r'\boperator\s*&\s*\(\s*\)', line): - error(filename, linenum, 'runtime/operator', 4, - 'Unary operator& is dangerous. Do not use it.') - - # Check for suspicious usage of "if" like - # } if (a == b) { - if Search(r'\}\s*if\s*\(', line): - error(filename, linenum, 'readability/braces', 4, - 'Did you mean "else if"? If not, start a new line for "if".') - - # Check for potential format string bugs like printf(foo). - # We constrain the pattern not to pick things like DocidForPrintf(foo). - # Not perfect but it can catch printf(foo.c_str()) and printf(foo->c_str()) - # TODO(sugawarayu): Catch the following case. Need to change the calling - # convention of the whole function to process multiple line to handle it. 
- # printf( - # boy_this_is_a_really_long_variable_that_cannot_fit_on_the_prev_line); - printf_args = _GetTextInside(line, r'(?i)\b(string)?printf\s*\(') - if printf_args: - match = Match(r'([\w.\->()]+)$', printf_args) - if match and match.group(1) != '__VA_ARGS__': - function_name = re.search(r'\b((?:string)?printf)\s*\(', - line, re.I).group(1) - error(filename, linenum, 'runtime/printf', 4, - 'Potential format string bug. Do %s("%%s", %s) instead.' - % (function_name, match.group(1))) - - # Check for potential memset bugs like memset(buf, sizeof(buf), 0). - match = Search(r'memset\s*\(([^,]*),\s*([^,]*),\s*0\s*\)', line) - if match and not Match(r"^''|-?[0-9]+|0x[0-9A-Fa-f]$", match.group(2)): - error(filename, linenum, 'runtime/memset', 4, - 'Did you mean "memset(%s, 0, %s)"?' - % (match.group(1), match.group(2))) - - if Search(r'\busing namespace\b', line): - error(filename, linenum, 'build/namespaces', 5, - 'Do not use namespace using-directives. ' - 'Use using-declarations instead.') - - # Detect variable-length arrays. - match = Match(r'\s*(.+::)?(\w+) [a-z]\w*\[(.+)];', line) - if (match and match.group(2) != 'return' and match.group(2) != 'delete' and - match.group(3).find(']') == -1): - # Split the size using space and arithmetic operators as delimiters. - # If any of the resulting tokens are not compile time constants then - # report the error. 
- tokens = re.split(r'\s|\+|\-|\*|\/|<<|>>]', match.group(3)) - is_const = True - skip_next = False - for tok in tokens: - if skip_next: - skip_next = False - continue - - if Search(r'sizeof\(.+\)', tok): continue - if Search(r'arraysize\(\w+\)', tok): continue - - tok = tok.lstrip('(') - tok = tok.rstrip(')') - if not tok: continue - if Match(r'\d+', tok): continue - if Match(r'0[xX][0-9a-fA-F]+', tok): continue - if Match(r'k[A-Z0-9]\w*', tok): continue - if Match(r'(.+::)?k[A-Z0-9]\w*', tok): continue - if Match(r'(.+::)?[A-Z][A-Z0-9_]*', tok): continue - # A catch all for tricky sizeof cases, including 'sizeof expression', - # 'sizeof(*type)', 'sizeof(const type)', 'sizeof(struct StructName)' - # requires skipping the next token because we split on ' ' and '*'. - if tok.startswith('sizeof'): - skip_next = True - continue - is_const = False - break - if not is_const: - error(filename, linenum, 'runtime/arrays', 1, - 'Do not use variable-length arrays. Use an appropriately named ' - "('k' followed by CamelCase) compile-time constant for the size.") - - # If DISALLOW_EVIL_CONSTRUCTORS, DISALLOW_COPY_AND_ASSIGN, or - # DISALLOW_IMPLICIT_CONSTRUCTORS is present, then it should be the last thing - # in the class declaration. - match = Match( - (r'\s*' - r'(DISALLOW_(EVIL_CONSTRUCTORS|COPY_AND_ASSIGN|IMPLICIT_CONSTRUCTORS))' - r'\(.*\);$'), - line) - if match and linenum + 1 < clean_lines.NumLines(): - next_line = clean_lines.elided[linenum + 1] - # We allow some, but not all, declarations of variables to be present - # in the statement that defines the class. The [\w\*,\s]* fragment of - # the regular expression below allows users to declare instances of - # the class or pointers to instances, but not less common types such - # as function pointers or arrays. It's a tradeoff between allowing - # reasonable code and avoiding trying to parse more C++ using regexps. 
- if not Search(r'^\s*}[\w\*,\s]*;', next_line): - error(filename, linenum, 'readability/constructors', 3, - match.group(1) + ' should be the last thing in the class') - - # Check for use of unnamed namespaces in header files. Registration - # macros are typically OK, so we allow use of "namespace {" on lines - # that end with backslashes. - if (file_extension == 'h' - and Search(r'\bnamespace\s*{', line) - and line[-1] != '\\'): - error(filename, linenum, 'build/namespaces', 4, - 'Do not use unnamed namespaces in header files. See ' - 'http://google-styleguide.googlecode.com/svn/trunk/cppguide.xml#Namespaces' - ' for more information.') - -def CheckForNonConstReference(filename, clean_lines, linenum, - nesting_state, error): - """Check for non-const references. - - Separate from CheckLanguage since it scans backwards from current - line, instead of scanning forward. - - Args: - filename: The name of the current file. - clean_lines: A CleansedLines instance containing the file. - linenum: The number of the line to check. - nesting_state: A _NestingState instance which maintains information about - the current stack of nested blocks being parsed. - error: The function to call with any errors found. - """ - # Do nothing if there is no '&' on current line. - line = clean_lines.elided[linenum] - if '&' not in line: - return - - # Long type names may be broken across multiple lines, usually in one - # of these forms: - # LongType - # ::LongTypeContinued &identifier - # LongType:: - # LongTypeContinued &identifier - # LongType< - # ...>::LongTypeContinued &identifier - # - # If we detected a type split across two lines, join the previous - # line to current line so that we can match const references - # accordingly. - # - # Note that this only scans back one line, since scanning back - # arbitrary number of lines would be expensive. If you have a type - # that spans more than 2 lines, please use a typedef. 
- if linenum > 1: - previous = None - if Match(r'\s*::(?:[\w<>]|::)+\s*&\s*\S', line): - # previous_line\n + ::current_line - previous = Search(r'\b((?:const\s*)?(?:[\w<>]|::)+[\w<>])\s*$', - clean_lines.elided[linenum - 1]) - elif Match(r'\s*[a-zA-Z_]([\w<>]|::)+\s*&\s*\S', line): - # previous_line::\n + current_line - previous = Search(r'\b((?:const\s*)?(?:[\w<>]|::)+::)\s*$', - clean_lines.elided[linenum - 1]) - if previous: - line = previous.group(1) + line.lstrip() - else: - # Check for templated parameter that is split across multiple lines - endpos = line.rfind('>') - if endpos > -1: - (_, startline, startpos) = ReverseCloseExpression( - clean_lines, linenum, endpos) - if startpos > -1 and startline < linenum: - # Found the matching < on an earlier line, collect all - # pieces up to current line. - line = '' - for i in xrange(startline, linenum + 1): - line += clean_lines.elided[i].strip() - - # Check for non-const references in function parameters. A single '&' may - # found in the following places: - # inside expression: binary & for bitwise AND - # inside expression: unary & for taking the address of something - # inside declarators: reference parameter - # We will exclude the first two cases by checking that we are not inside a - # function body, including one that was just introduced by a trailing '{'. - # TODO(unknwon): Doesn't account for preprocessor directives. - # TODO(unknown): Doesn't account for 'catch(Exception& e)' [rare]. 
- check_params = False - if not nesting_state.stack: - check_params = True # top level - elif (isinstance(nesting_state.stack[-1], _ClassInfo) or - isinstance(nesting_state.stack[-1], _NamespaceInfo)): - check_params = True # within class or namespace - elif Match(r'.*{\s*$', line): - if (len(nesting_state.stack) == 1 or - isinstance(nesting_state.stack[-2], _ClassInfo) or - isinstance(nesting_state.stack[-2], _NamespaceInfo)): - check_params = True # just opened global/class/namespace block - # We allow non-const references in a few standard places, like functions - # called "swap()" or iostream operators like "<<" or ">>". Do not check - # those function parameters. - # - # We also accept & in static_assert, which looks like a function but - # it's actually a declaration expression. - whitelisted_functions = (r'(?:[sS]wap(?:<\w:+>)?|' - r'operator\s*[<>][<>]|' - r'static_assert|COMPILE_ASSERT' - r')\s*\(') - if Search(whitelisted_functions, line): - check_params = False - elif not Search(r'\S+\([^)]*$', line): - # Don't see a whitelisted function on this line. Actually we - # didn't see any function name on this line, so this is likely a - # multi-line parameter list. Try a bit harder to catch this case. - for i in xrange(2): - if (linenum > i and - Search(whitelisted_functions, clean_lines.elided[linenum - i - 1])): - check_params = False - break - - if check_params: - decls = ReplaceAll(r'{[^}]*}', ' ', line) # exclude function body - for parameter in re.findall(_RE_PATTERN_REF_PARAM, decls): - if not Match(_RE_PATTERN_CONST_REF_PARAM, parameter): - error(filename, linenum, 'runtime/references', 2, - 'Is this a non-const reference? ' - 'If so, make const or use a pointer: ' + - ReplaceAll(' *<', '<', parameter)) - - -def CheckCStyleCast(filename, linenum, line, raw_line, cast_type, pattern, - error): - """Checks for a C-style cast by looking for the pattern. - - Args: - filename: The name of the current file. - linenum: The number of the line to check. 
- line: The line of code to check. - raw_line: The raw line of code to check, with comments. - cast_type: The string for the C++ cast to recommend. This is either - reinterpret_cast, static_cast, or const_cast, depending. - pattern: The regular expression used to find C-style casts. - error: The function to call with any errors found. - - Returns: - True if an error was emitted. - False otherwise. - """ - match = Search(pattern, line) - if not match: - return False - - # Exclude lines with sizeof, since sizeof looks like a cast. - sizeof_match = Match(r'.*sizeof\s*$', line[0:match.start(1) - 1]) - if sizeof_match: - return False - - # operator++(int) and operator--(int) - if (line[0:match.start(1) - 1].endswith(' operator++') or - line[0:match.start(1) - 1].endswith(' operator--')): - return False - - # A single unnamed argument for a function tends to look like old - # style cast. If we see those, don't issue warnings for deprecated - # casts, instead issue warnings for unnamed arguments where - # appropriate. - # - # These are things that we want warnings for, since the style guide - # explicitly require all parameters to be named: - # Function(int); - # Function(int) { - # ConstMember(int) const; - # ConstMember(int) const { - # ExceptionMember(int) throw (...); - # ExceptionMember(int) throw (...) { - # PureVirtual(int) = 0; - # - # These are functions of some sort, where the compiler would be fine - # if they had named parameters, but people often omit those - # identifiers to reduce clutter: - # (FunctionPointer)(int); - # (FunctionPointer)(int) = value; - # Function((function_pointer_arg)(int)) - # ; - # <(FunctionPointerTemplateArgument)(int)>; - remainder = line[match.end(0):] - if Match(r'^\s*(?:;|const\b|throw\b|=|>|\{|\))', remainder): - # Looks like an unnamed parameter. - - # Don't warn on any kind of template arguments. 
- if Match(r'^\s*>', remainder): - return False - - # Don't warn on assignments to function pointers, but keep warnings for - # unnamed parameters to pure virtual functions. Note that this pattern - # will also pass on assignments of "0" to function pointers, but the - # preferred values for those would be "nullptr" or "NULL". - matched_zero = Match(r'^\s=\s*(\S+)\s*;', remainder) - if matched_zero and matched_zero.group(1) != '0': - return False - - # Don't warn on function pointer declarations. For this we need - # to check what came before the "(type)" string. - if Match(r'.*\)\s*$', line[0:match.start(0)]): - return False - - # Don't warn if the parameter is named with block comments, e.g.: - # Function(int /*unused_param*/); - if '/*' in raw_line: - return False - - # Passed all filters, issue warning here. - error(filename, linenum, 'readability/function', 3, - 'All parameters should be named in a function') - return True - - # At this point, all that should be left is actual casts. - error(filename, linenum, 'readability/casting', 4, - 'Using C-style cast. Use %s<%s>(...) 
instead' % - (cast_type, match.group(1))) - - return True - - -_HEADERS_CONTAINING_TEMPLATES = ( - ('', ('deque',)), - ('', ('unary_function', 'binary_function', - 'plus', 'minus', 'multiplies', 'divides', 'modulus', - 'negate', - 'equal_to', 'not_equal_to', 'greater', 'less', - 'greater_equal', 'less_equal', - 'logical_and', 'logical_or', 'logical_not', - 'unary_negate', 'not1', 'binary_negate', 'not2', - 'bind1st', 'bind2nd', - 'pointer_to_unary_function', - 'pointer_to_binary_function', - 'ptr_fun', - 'mem_fun_t', 'mem_fun', 'mem_fun1_t', 'mem_fun1_ref_t', - 'mem_fun_ref_t', - 'const_mem_fun_t', 'const_mem_fun1_t', - 'const_mem_fun_ref_t', 'const_mem_fun1_ref_t', - 'mem_fun_ref', - )), - ('', ('numeric_limits',)), - ('', ('list',)), - ('', ('map', 'multimap',)), - ('', ('allocator',)), - ('', ('queue', 'priority_queue',)), - ('', ('set', 'multiset',)), - ('', ('stack',)), - ('', ('char_traits', 'basic_string',)), - ('', ('pair',)), - ('', ('vector',)), - - # gcc extensions. - # Note: std::hash is their hash, ::hash is our hash - ('', ('hash_map', 'hash_multimap',)), - ('', ('hash_set', 'hash_multiset',)), - ('', ('slist',)), - ) - -_RE_PATTERN_STRING = re.compile(r'\bstring\b') - -_re_pattern_algorithm_header = [] -for _template in ('copy', 'max', 'min', 'min_element', 'sort', 'swap', - 'transform'): - # Match max(..., ...), max(..., ...), but not foo->max, foo.max or - # type::max(). - _re_pattern_algorithm_header.append( - (re.compile(r'[^>.]\b' + _template + r'(<.*?>)?\([^\)]'), - _template, - '')) - -_re_pattern_templates = [] -for _header, _templates in _HEADERS_CONTAINING_TEMPLATES: - for _template in _templates: - _re_pattern_templates.append( - (re.compile(r'(\<|\b)' + _template + r'\s*\<'), - _template + '<>', - _header)) - - -def FilesBelongToSameModule(filename_cc, filename_h): - """Check if these two filenames belong to the same module. 
- - The concept of a 'module' here is a as follows: - foo.h, foo-inl.h, foo.cc, foo_test.cc and foo_unittest.cc belong to the - same 'module' if they are in the same directory. - some/path/public/xyzzy and some/path/internal/xyzzy are also considered - to belong to the same module here. - - If the filename_cc contains a longer path than the filename_h, for example, - '/absolute/path/to/base/sysinfo.cc', and this file would include - 'base/sysinfo.h', this function also produces the prefix needed to open the - header. This is used by the caller of this function to more robustly open the - header file. We don't have access to the real include paths in this context, - so we need this guesswork here. - - Known bugs: tools/base/bar.cc and base/bar.h belong to the same module - according to this implementation. Because of this, this function gives - some false positives. This should be sufficiently rare in practice. - - Args: - filename_cc: is the path for the .cc file - filename_h: is the path for the header path - - Returns: - Tuple with a bool and a string: - bool: True if filename_cc and filename_h belong to the same module. - string: the additional prefix needed to open the header file. 
- """ - - if not filename_cc.endswith('.cc'): - return (False, '') - filename_cc = filename_cc[:-len('.cc')] - if filename_cc.endswith('_unittest'): - filename_cc = filename_cc[:-len('_unittest')] - elif filename_cc.endswith('_test'): - filename_cc = filename_cc[:-len('_test')] - filename_cc = filename_cc.replace('/public/', '/') - filename_cc = filename_cc.replace('/internal/', '/') - - if not filename_h.endswith('.h'): - return (False, '') - filename_h = filename_h[:-len('.h')] - if filename_h.endswith('-inl'): - filename_h = filename_h[:-len('-inl')] - filename_h = filename_h.replace('/public/', '/') - filename_h = filename_h.replace('/internal/', '/') - - files_belong_to_same_module = filename_cc.endswith(filename_h) - common_path = '' - if files_belong_to_same_module: - common_path = filename_cc[:-len(filename_h)] - return files_belong_to_same_module, common_path - - -def UpdateIncludeState(filename, include_state, io=codecs): - """Fill up the include_state with new includes found from the file. - - Args: - filename: the name of the header to read. - include_state: an _IncludeState instance in which the headers are inserted. - io: The io factory to use to read the file. Provided for testability. - - Returns: - True if a header was successfully added. False otherwise. - """ - headerfile = None - try: - headerfile = io.open(filename, 'r', 'utf8', 'replace') - except IOError: - return False - linenum = 0 - for line in headerfile: - linenum += 1 - clean_line = CleanseComments(line) - match = _RE_PATTERN_INCLUDE.search(clean_line) - if match: - include = match.group(2) - # The value formatting is cute, but not really used right now. - # What matters here is that the key is in include_state. - include_state.setdefault(include, '%s:%d' % (filename, linenum)) - return True - - -def CheckForIncludeWhatYouUse(filename, clean_lines, include_state, error, - io=codecs): - """Reports for missing stl includes. 
- - This function will output warnings to make sure you are including the headers - necessary for the stl containers and functions that you use. We only give one - reason to include a header. For example, if you use both equal_to<> and - less<> in a .h file, only one (the latter in the file) of these will be - reported as a reason to include the . - - Args: - filename: The name of the current file. - clean_lines: A CleansedLines instance containing the file. - include_state: An _IncludeState instance. - error: The function to call with any errors found. - io: The IO factory to use to read the header file. Provided for unittest - injection. - """ - required = {} # A map of header name to linenumber and the template entity. - # Example of required: { '': (1219, 'less<>') } - - for linenum in xrange(clean_lines.NumLines()): - line = clean_lines.elided[linenum] - if not line or line[0] == '#': - continue - - # String is special -- it is a non-templatized type in STL. - matched = _RE_PATTERN_STRING.search(line) - if matched: - # Don't warn about strings in non-STL namespaces: - # (We check only the first match per line; good enough.) - prefix = line[:matched.start()] - if prefix.endswith('std::') or not prefix.endswith('::'): - required[''] = (linenum, 'string') - - for pattern, template, header in _re_pattern_algorithm_header: - if pattern.search(line): - required[header] = (linenum, template) - - # The following function is just a speed up, no semantics are changed. - if not '<' in line: # Reduces the cpu time usage by skipping lines. - continue - - for pattern, template, header in _re_pattern_templates: - if pattern.search(line): - required[header] = (linenum, template) - - # The policy is that if you #include something in foo.h you don't need to - # include it again in foo.cc. Here, we will look at possible includes. - # Let's copy the include_state so it is only messed up within this function. 
- include_state = include_state.copy() - - # Did we find the header for this file (if any) and successfully load it? - header_found = False - - # Use the absolute path so that matching works properly. - abs_filename = FileInfo(filename).FullName() - - # For Emacs's flymake. - # If cpplint is invoked from Emacs's flymake, a temporary file is generated - # by flymake and that file name might end with '_flymake.cc'. In that case, - # restore original file name here so that the corresponding header file can be - # found. - # e.g. If the file name is 'foo_flymake.cc', we should search for 'foo.h' - # instead of 'foo_flymake.h' - abs_filename = re.sub(r'_flymake\.cc$', '.cc', abs_filename) - - # include_state is modified during iteration, so we iterate over a copy of - # the keys. - header_keys = include_state.keys() - for header in header_keys: - (same_module, common_path) = FilesBelongToSameModule(abs_filename, header) - fullpath = common_path + header - if same_module and UpdateIncludeState(fullpath, include_state, io): - header_found = True - - # If we can't find the header file for a .cc, assume it's because we don't - # know where to look. In that case we'll give up as we're not sure they - # didn't include it in the .h file. - # TODO(unknown): Do a better job of finding .h files so we are confident that - # not having the .h file means there isn't one. - if filename.endswith('.cc') and not header_found: - return - - # All the lines have been processed, report the errors found. 
- for required_header_unstripped in required: - template = required[required_header_unstripped][1] - if required_header_unstripped.strip('<>"') not in include_state: - error(filename, required[required_header_unstripped][0], - 'build/include_what_you_use', 4, - 'Add #include ' + required_header_unstripped + ' for ' + template) - - -_RE_PATTERN_EXPLICIT_MAKEPAIR = re.compile(r'\bmake_pair\s*<') - - -def CheckMakePairUsesDeduction(filename, clean_lines, linenum, error): - """Check that make_pair's template arguments are deduced. - - G++ 4.6 in C++0x mode fails badly if make_pair's template arguments are - specified explicitly, and such use isn't intended in any case. - - Args: - filename: The name of the current file. - clean_lines: A CleansedLines instance containing the file. - linenum: The number of the line to check. - error: The function to call with any errors found. - """ - line = clean_lines.elided[linenum] - match = _RE_PATTERN_EXPLICIT_MAKEPAIR.search(line) - if match: - error(filename, linenum, 'build/explicit_make_pair', - 4, # 4 = high confidence - 'For C++11-compatibility, omit template arguments from make_pair' - ' OR use pair directly OR if appropriate, construct a pair directly') - - -def ProcessLine(filename, file_extension, clean_lines, line, - include_state, function_state, nesting_state, error, - extra_check_functions=[]): - """Processes a single line in the file. - - Args: - filename: Filename of the file that is being processed. - file_extension: The extension (dot not included) of the file. - clean_lines: An array of strings, each representing a line of the file, - with comments stripped. - line: Number of line being processed. - include_state: An _IncludeState instance in which the headers are inserted. - function_state: A _FunctionState instance which counts function lines, etc. - nesting_state: A _NestingState instance which maintains information about - the current stack of nested blocks being parsed. 
- error: A callable to which errors are reported, which takes 4 arguments: - filename, line number, error level, and message - extra_check_functions: An array of additional check functions that will be - run on each source line. Each function takes 4 - arguments: filename, clean_lines, line, error - """ - raw_lines = clean_lines.raw_lines - ParseNolintSuppressions(filename, raw_lines[line], line, error) - nesting_state.Update(filename, clean_lines, line, error) - if nesting_state.stack and nesting_state.stack[-1].inline_asm != _NO_ASM: - return - CheckForFunctionLengths(filename, clean_lines, line, function_state, error) - CheckForMultilineCommentsAndStrings(filename, clean_lines, line, error) - CheckStyle(filename, clean_lines, line, file_extension, nesting_state, error) - CheckLanguage(filename, clean_lines, line, file_extension, include_state, - nesting_state, error) - CheckForNonConstReference(filename, clean_lines, line, nesting_state, error) - CheckForNonStandardConstructs(filename, clean_lines, line, - nesting_state, error) - CheckVlogArguments(filename, clean_lines, line, error) - CheckPosixThreading(filename, clean_lines, line, error) - CheckInvalidIncrement(filename, clean_lines, line, error) - CheckMakePairUsesDeduction(filename, clean_lines, line, error) - for check_fn in extra_check_functions: - check_fn(filename, clean_lines, line, error) - -def ProcessFileData(filename, file_extension, lines, error, - extra_check_functions=[]): - """Performs lint checks and reports any errors to the given error function. - - Args: - filename: Filename of the file that is being processed. - file_extension: The extension (dot not included) of the file. - lines: An array of strings, each representing a line of the file, with the - last element being empty if the file is terminated with a newline. 
- error: A callable to which errors are reported, which takes 4 arguments: - filename, line number, error level, and message - extra_check_functions: An array of additional check functions that will be - run on each source line. Each function takes 4 - arguments: filename, clean_lines, line, error - """ - lines = (['// marker so line numbers and indices both start at 1'] + lines + - ['// marker so line numbers end in a known way']) - - include_state = _IncludeState() - function_state = _FunctionState() - nesting_state = _NestingState() - - ResetNolintSuppressions() - - CheckForCopyright(filename, lines, error) - - if file_extension == 'h': - CheckForHeaderGuard(filename, lines, error) - - RemoveMultiLineComments(filename, lines, error) - clean_lines = CleansedLines(lines) - for line in xrange(clean_lines.NumLines()): - ProcessLine(filename, file_extension, clean_lines, line, - include_state, function_state, nesting_state, error, - extra_check_functions) - nesting_state.CheckCompletedBlocks(filename, error) - - CheckForIncludeWhatYouUse(filename, clean_lines, include_state, error) - - # We check here rather than inside ProcessLine so that we see raw - # lines rather than "cleaned" lines. - CheckForBadCharacters(filename, lines, error) - - CheckForNewlineAtEOF(filename, lines, error) - -def ProcessFile(filename, vlevel, extra_check_functions=[]): - """Does google-lint on a single file. - - Args: - filename: The name of the file to parse. - - vlevel: The level of errors to report. Every error of confidence - >= verbose_level will be reported. 0 is a good default. - - extra_check_functions: An array of additional check functions that will be - run on each source line. Each function takes 4 - arguments: filename, clean_lines, line, error - """ - - _SetVerboseLevel(vlevel) - - try: - # Support the UNIX convention of using "-" for stdin. 
Note that - # we are not opening the file with universal newline support - # (which codecs doesn't support anyway), so the resulting lines do - # contain trailing '\r' characters if we are reading a file that - # has CRLF endings. - # If after the split a trailing '\r' is present, it is removed - # below. If it is not expected to be present (i.e. os.linesep != - # '\r\n' as in Windows), a warning is issued below if this file - # is processed. - - if filename == '-': - lines = codecs.StreamReaderWriter(sys.stdin, - codecs.getreader('utf8'), - codecs.getwriter('utf8'), - 'replace').read().split('\n') - else: - lines = codecs.open(filename, 'r', 'utf8', 'replace').read().split('\n') - - carriage_return_found = False - # Remove trailing '\r'. - for linenum in range(len(lines)): - if lines[linenum].endswith('\r'): - lines[linenum] = lines[linenum].rstrip('\r') - carriage_return_found = True - - except IOError: - sys.stderr.write( - "Skipping input '%s': Can't open for reading\n" % filename) - return - - # Note, if no dot is found, this will give the entire filename as the ext. - file_extension = filename[filename.rfind('.') + 1:] - - # When reading from stdin, the extension is unknown, so no cpplint tests - # should rely on the extension. - if filename != '-' and file_extension not in _valid_extensions: - sys.stderr.write('Ignoring %s; not a valid file name ' - '(%s)\n' % (filename, ', '.join(_valid_extensions))) - else: - ProcessFileData(filename, file_extension, lines, Error, - extra_check_functions) - if carriage_return_found and os.linesep != '\r\n': - # Use 0 for linenum since outputting only one error for potentially - # several lines. - Error(filename, 0, 'whitespace/newline', 1, - 'One or more unexpected \\r (^M) found;' - 'better to use only a \\n') - - sys.stderr.write('Done processing %s\n' % filename) - - -def PrintUsage(message): - """Prints a brief usage string and exits, optionally with an error message. - - Args: - message: The optional error message. 
- """ - sys.stderr.write(_USAGE) - if message: - sys.exit('\nFATAL ERROR: ' + message) - else: - sys.exit(1) - - -def PrintCategories(): - """Prints a list of all the error-categories used by error messages. - - These are the categories used to filter messages via --filter. - """ - sys.stderr.write(''.join(' %s\n' % cat for cat in _ERROR_CATEGORIES)) - sys.exit(0) - - -def ParseArguments(args): - """Parses the command line arguments. - - This may set the output format and verbosity level as side-effects. - - Args: - args: The command line arguments: - - Returns: - The list of filenames to lint. - """ - try: - (opts, filenames) = getopt.getopt(args, '', ['help', 'output=', 'verbose=', - 'counting=', - 'filter=', - 'root=', - 'linelength=', - 'extensions=']) - except getopt.GetoptError: - PrintUsage('Invalid arguments.') - - verbosity = _VerboseLevel() - output_format = _OutputFormat() - filters = '' - counting_style = '' - - for (opt, val) in opts: - if opt == '--help': - PrintUsage(None) - elif opt == '--output': - if val not in ('emacs', 'vs7', 'eclipse'): - PrintUsage('The only allowed output formats are emacs, vs7 and eclipse.') - output_format = val - elif opt == '--verbose': - verbosity = int(val) - elif opt == '--filter': - filters = val - if not filters: - PrintCategories() - elif opt == '--counting': - if val not in ('total', 'toplevel', 'detailed'): - PrintUsage('Valid counting options are total, toplevel, and detailed') - counting_style = val - elif opt == '--root': - global _root - _root = val - elif opt == '--linelength': - global _line_length - try: - _line_length = int(val) - except ValueError: - PrintUsage('Line length must be digits.') - elif opt == '--extensions': - global _valid_extensions - try: - _valid_extensions = set(val.split(',')) - except ValueError: - PrintUsage('Extensions must be comma separated list.') - - if not filenames: - PrintUsage('No files were specified.') - - _SetOutputFormat(output_format) - _SetVerboseLevel(verbosity) - 
_SetFilters(filters) - _SetCountingStyle(counting_style) - - return filenames - - -def main(): - filenames = ParseArguments(sys.argv[1:]) - - # Change stderr to write with replacement characters so we don't die - # if we try to print something containing non-ASCII characters. - sys.stderr = codecs.StreamReaderWriter(sys.stderr, - codecs.getreader('utf8'), - codecs.getwriter('utf8'), - 'replace') - - _cpplint_state.ResetErrorCounts() - for filename in filenames: - ProcessFile(filename, _cpplint_state.verbose_level) - _cpplint_state.PrintErrorCounts() - - sys.exit(_cpplint_state.error_count > 0) - - -if __name__ == '__main__': - main() diff --git a/arcanist_util/lint_engine/FacebookFbcodeLintEngine.php b/arcanist_util/lint_engine/FacebookFbcodeLintEngine.php deleted file mode 100644 index 88b0748f7..000000000 --- a/arcanist_util/lint_engine/FacebookFbcodeLintEngine.php +++ /dev/null @@ -1,138 +0,0 @@ -getPaths(); - - // Remove all deleted files, which are not checked by the - // following linters. 
- foreach ($paths as $key => $path) { - if (!Filesystem::pathExists($this->getFilePathOnDisk($path))) { - unset($paths[$key]); - } - } - - $generated_linter = new ArcanistGeneratedLinter(); - $linters[] = $generated_linter; - - $nolint_linter = new ArcanistNoLintLinter(); - $linters[] = $nolint_linter; - - $text_linter = new ArcanistTextLinter(); - $text_linter->setCustomSeverityMap(array( - ArcanistTextLinter::LINT_LINE_WRAP - => ArcanistLintSeverity::SEVERITY_ADVICE, - )); - $linters[] = $text_linter; - - $java_text_linter = new ArcanistTextLinter(); - $java_text_linter->setMaxLineLength(100); - $java_text_linter->setCustomSeverityMap(array( - ArcanistTextLinter::LINT_LINE_WRAP - => ArcanistLintSeverity::SEVERITY_ADVICE, - )); - $linters[] = $java_text_linter; - - $python_linter = new ArcanistPEP8Linter(); - $linters[] = $python_linter; - - $cpp_linters = array(); - $cpp_linters[] = $linters[] = new ArcanistCpplintLinter(); - $cpp_linters[] = $linters[] = new FbcodeCppLinter(); - - $clang_format_linter = new FbcodeClangFormatLinter(); - $linters[] = $clang_format_linter; - - $spelling_linter = new ArcanistSpellingLinter(); - $linters[] = $spelling_linter; - - foreach ($paths as $path) { - $is_text = false; - - $text_extensions = ( - '/\.('. - 'cpp|cxx|c|cc|h|hpp|hxx|tcc|'. - 'py|rb|hs|pl|pm|tw|'. - 'php|phpt|css|js|'. - 'java|'. - 'thrift|'. - 'lua|'. - 'siv|'. - 'txt'. 
- ')$/' - ); - if (preg_match($text_extensions, $path)) { - $is_text = true; - } - if ($is_text) { - $nolint_linter->addPath($path); - - $generated_linter->addPath($path); - $generated_linter->addData($path, $this->loadData($path)); - - if (preg_match('/\.java$/', $path)) { - $java_text_linter->addPath($path); - $java_text_linter->addData($path, $this->loadData($path)); - } else { - $text_linter->addPath($path); - $text_linter->addData($path, $this->loadData($path)); - } - - $spelling_linter->addPath($path); - $spelling_linter->addData($path, $this->loadData($path)); - } - if (preg_match('/\.(cpp|c|cc|cxx|h|hh|hpp|hxx|tcc)$/', $path) - && !preg_match('/third-party/', $path)) { - foreach ($cpp_linters as &$linter) { - $linter->addPath($path); - $linter->addData($path, $this->loadData($path)); - } - - $clang_format_linter->addPath($path); - $clang_format_linter->addData($path, $this->loadData($path)); - $clang_format_linter->setPathChangedLines( - $path, $this->getPathChangedLines($path)); - } - - // Match *.py and contbuild config files - if (preg_match('/(\.(py|tw|smcprops)|^contbuild\/configs\/[^\/]*)$/', - $path)) { - $space_count = 4; - $real_path = $this->getFilePathOnDisk($path); - $dir = dirname($real_path); - do { - if (file_exists($dir.'/.python2space')) { - $space_count = 2; - break; - } - $dir = dirname($dir); - } while ($dir != '/' && $dir != '.'); - - $cur_path_linter = $python_linter; - $cur_path_linter->addPath($path); - $cur_path_linter->addData($path, $this->loadData($path)); - - if (preg_match('/\.tw$/', $path)) { - $cur_path_linter->setCustomSeverityMap(array( - 'E251' => ArcanistLintSeverity::SEVERITY_DISABLED, - )); - } - } - } - - $name_linter = new ArcanistFilenameLinter(); - $linters[] = $name_linter; - foreach ($paths as $path) { - $name_linter->addPath($path); - } - - return $linters; - } - -} diff --git a/arcanist_util/lint_engine/FacebookHowtoevenLintEngine.php b/arcanist_util/lint_engine/FacebookHowtoevenLintEngine.php deleted file mode 
100644 index 2e0148141..000000000 --- a/arcanist_util/lint_engine/FacebookHowtoevenLintEngine.php +++ /dev/null @@ -1,27 +0,0 @@ -getPaths() as $path) { - // Don't try to lint deleted files or changed directories. - if (!Filesystem::pathExists($path) || is_dir($path)) { - continue; - } - - if (preg_match('/\.(cpp|c|cc|cxx|h|hh|hpp|hxx|tcc)$/', $path)) { - $paths[] = $path; - } - } - - $howtoeven = new FacebookHowtoevenLinter(); - $howtoeven->setPaths($paths); - return array($howtoeven); - } -} diff --git a/arcanist_util/unit_engine/FacebookFbcodeUnitTestEngine.php b/arcanist_util/unit_engine/FacebookFbcodeUnitTestEngine.php deleted file mode 100644 index 62c275f6a..000000000 --- a/arcanist_util/unit_engine/FacebookFbcodeUnitTestEngine.php +++ /dev/null @@ -1,17 +0,0 @@ -setName("dummy_placeholder_entry"); - $result->setResult(ArcanistUnitTestResult::RESULT_PASS); - return array($result); - } -} diff --git a/arcanist_util/unit_engine/FacebookOldFbcodeUnitTestEngine.php b/arcanist_util/unit_engine/FacebookOldFbcodeUnitTestEngine.php deleted file mode 100644 index 985bd68fc..000000000 --- a/arcanist_util/unit_engine/FacebookOldFbcodeUnitTestEngine.php +++ /dev/null @@ -1,17 +0,0 @@ -setName("dummy_placeholder_entry"); - $result->setResult(ArcanistUnitTestResult::RESULT_PASS); - return array($result); - } -} diff --git a/buckifier/rocks_test_runner.sh b/buckifier/rocks_test_runner.sh index 2ee216934..e1f48a760 100755 --- a/buckifier/rocks_test_runner.sh +++ b/buckifier/rocks_test_runner.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # Create a tmp directory for the test to use TEST_DIR=$(mktemp -d /dev/shm/fbcode_rocksdb_XXXXXXX) TEST_TMPDIR="$TEST_DIR" $@ && rm -rf "$TEST_DIR" diff --git a/buckifier/targets_cfg.py b/buckifier/targets_cfg.py index 079b892a7..33023a589 100644 --- a/buckifier/targets_cfg.py +++ b/buckifier/targets_cfg.py @@ -6,12 +6,10 @@ import os TARGETS_PATH = os.path.dirname(__file__) -REPO_PATH = TARGETS_PATH[(TARGETS_PATH.find('fbcode/') + 
len('fbcode/')):] + "/" +REPO_PATH = "rocksdb/src/" BUCK_BINS = "buck-out/gen/" + REPO_PATH TEST_RUNNER = REPO_PATH + "buckifier/rocks_test_runner.sh" rocksdb_compiler_flags = [ - "-msse", - "-msse4.2", "-fno-builtin-memcmp", "-DROCKSDB_PLATFORM_POSIX", "-DROCKSDB_LIB_IO_POSIX", @@ -20,7 +18,6 @@ "-DROCKSDB_RANGESYNC_PRESENT", "-DROCKSDB_SCHED_GETCPU_PRESENT", "-DROCKSDB_SUPPORT_THREAD_LOCAL", - "-DHAVE_SSE42", "-DOS_LINUX", # Flags to enable libs we include "-DSNAPPY", @@ -52,6 +49,10 @@ "-I" + REPO_PATH + "include/", "-I" + REPO_PATH, ] + +rocksdb_arch_preprocessor_flags = { + "x86_64": ["-DHAVE_SSE42"], +} """ @@ -62,6 +63,7 @@ srcs = [%s], deps = [%s], preprocessor_flags = rocksdb_preprocessor_flags, + arch_preprocessor_flags = rocksdb_arch_preprocessor_flags, compiler_flags = rocksdb_compiler_flags, external_deps = rocksdb_external_deps, ) @@ -73,6 +75,7 @@ srcs = [%s], deps = [%s], preprocessor_flags = rocksdb_preprocessor_flags, + arch_preprocessor_flags = rocksdb_arch_preprocessor_flags, compiler_flags = rocksdb_compiler_flags, external_deps = rocksdb_external_deps, ) @@ -95,6 +98,7 @@ srcs = [test_cc], deps = [":rocksdb_test_lib"], preprocessor_flags = rocksdb_preprocessor_flags, + arch_preprocessor_flags = rocksdb_arch_preprocessor_flags, compiler_flags = rocksdb_compiler_flags, external_deps = rocksdb_external_deps, ) diff --git a/arcanist_util/config/RocksDBCommonHelper.php b/build_tools/RocksDBCommonHelper.php similarity index 85% rename from arcanist_util/config/RocksDBCommonHelper.php rename to build_tools/RocksDBCommonHelper.php index de40cc78c..9fe770fe9 100644 --- a/arcanist_util/config/RocksDBCommonHelper.php +++ b/build_tools/RocksDBCommonHelper.php @@ -1,8 +1,8 @@ 0); - $cmd = 'echo \'{"diff_id": ' . $diffID . ', ' - . '"name":"click here for sandcastle tests for D' . $diffID . '", ' - . '"link":"' . $url . '"}\' | ' - . 'arc call-conduit ' - . 
'differential.updateunitresults'; + $cmd_args = array( + 'diff_id' => (int)$diffID, + 'name' => sprintf( + 'click here for sandcastle tests for D%d', + (int)$diffID + ), + 'link' => $url + ); + $cmd = 'echo ' . escapeshellarg(json_encode($cmd_args)) + . ' | arc call-conduit differential.updateunitresults'; + shell_exec($cmd); } @@ -35,11 +41,15 @@ function buildUpdateTestStatusCmd($diffID, $test, $status) { assert(strlen($test) > 0); assert(strlen($status) > 0); - $cmd = 'echo \'{"diff_id": ' . $diffID . ', ' - . '"name":"' . $test . '", ' - . '"result":"' . $status . '"}\' | ' - . 'arc call-conduit ' - . 'differential.updateunitresults'; + $cmd_args = array( + 'diff_id' => (int)$diffID, + 'name' => $test, + 'result' => $status + ); + + $cmd = 'echo ' . escapeshellarg(json_encode($cmd_args)) + . ' | arc call-conduit differential.updateunitresults'; + return $cmd; } @@ -68,7 +78,7 @@ function getSteps($applyDiff, $diffID, $username, $test) { // and authenticate using that in Sandcastle. $setup = array( "name" => "Setup arcrc", - "shell" => "echo " . $arcrc_content . " | base64 --decode" + "shell" => "echo " . escapeshellarg($arcrc_content) . " | base64 --decode" . " | gzip -d > ~/.arcrc", "user" => "root" ); @@ -94,7 +104,17 @@ function getSteps($applyDiff, $diffID, $username, $test) { "user" => "root" ); + // This fixes "FATAL: ThreadSanitizer can not mmap the shadow memory" + // Source: + // https://github.com/google/sanitizers/wiki/ThreadSanitizerCppManual#FAQ + $fix_kernel_issue = array( + "name" => "Fix kernel issue with tsan", + "shell" => "echo 2 >/proc/sys/kernel/randomize_va_space", + "user" => "root" + ); + $steps[] = $fix_git_ignore; + $steps[] = $fix_kernel_issue; // This will be the command used to execute particular type of tests. $cmd = ""; @@ -104,7 +124,7 @@ function getSteps($applyDiff, $diffID, $username, $test) { $patch = array( "name" => "Patch " . $diffID, "shell" => "arc --arcrc-file ~/.arcrc " - . "patch --nocommit --diff " . $diffID, + . 
"patch --nocommit --diff " . escapeshellarg($diffID), "user" => "root" ); @@ -115,8 +135,8 @@ function getSteps($applyDiff, $diffID, $username, $test) { } // Run the actual command. - $cmd = $cmd . "J=$(nproc) ./build_tools/precommit_checker.py " . $test - . "; exit_code=$?; "; + $cmd = $cmd . "J=$(nproc) ./build_tools/precommit_checker.py " . + escapeshellarg($test) . "; exit_code=$?; "; if ($applyDiff) { $cmd = $cmd . "([[ \$exit_code -eq 0 ]] &&" @@ -149,7 +169,7 @@ function getSteps($applyDiff, $diffID, $username, $test) { "name" => "Run " . $test, "shell" => $cmd, "user" => "root", - "parser" => "python build_tools/error_filter.py " . $test, + "parser" => "python build_tools/error_filter.py " . escapeshellarg($test), ); $steps[] = $run_test; @@ -197,7 +217,7 @@ function getSandcastleConfig() { if (file_exists(PRIMARY_TOKEN_FILE)) { $cmd = 'cat ' . PRIMARY_TOKEN_FILE; } else { - $cmd = 'cat ' . $cwd_token_file; + $cmd = 'cat ' . escapeshellarg($cwd_token_file); } assert(strlen($cmd) > 0); @@ -241,7 +261,7 @@ function startTestsInSandcastle($applyDiff, $workflow, $diffID) { // List of tests we want to run in Sandcastle. $tests = array("unit", "unit_non_shm", "unit_481", "clang_unit", "tsan", "asan", "lite_test", "valgrind", "release", "release_481", - "clang_release", "punit", "clang_analyze", "code_cov", + "clang_release", "clang_analyze", "code_cov", "java_build", "no_compression", "unity", "ubsan"); $send_email_template = array( @@ -321,9 +341,11 @@ function startTestsInSandcastle($applyDiff, $workflow, $diffID) { $app = $sandcastle_config[0]; $token = $sandcastle_config[1]; - $cmd = 'curl -s -k -F app=' . $app . ' ' - . '-F token=' . $token . ' -F job=\'' . json_encode($job) - .'\' "' . $url . '"'; + $cmd = 'curl -s -k ' + . ' -F app=' . escapeshellarg($app) + . ' -F token=' . escapeshellarg($token) + . ' -F job=' . escapeshellarg(json_encode($job)) + .' ' . 
escapeshellarg($url); $output = shell_exec($cmd); assert(strlen($output) > 0); diff --git a/build_tools/build_detect_platform b/build_tools/build_detect_platform index ff95c75f0..c7ddb7cce 100755 --- a/build_tools/build_detect_platform +++ b/build_tools/build_detect_platform @@ -317,9 +317,11 @@ EOF # Test whether jemalloc is available if echo 'int main() {}' | $CXX $CFLAGS -x c++ - -o /dev/null -ljemalloc \ 2>/dev/null; then - PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -ljemalloc" - JAVA_LDFLAGS="$JAVA_LDFLAGS -ljemalloc" + # This will enable some preprocessor identifiers in the Makefile JEMALLOC=1 + # JEMALLOC can be enabled either using the flag (like here) or by + # providing direct link to the jemalloc library + WITH_JEMALLOC_FLAG=1 else # jemalloc is not available. Let's try tcmalloc if echo 'int main() {}' | $CXX $CFLAGS -x c++ - -o /dev/null \ @@ -517,7 +519,14 @@ echo "ROCKSDB_PATCH=$ROCKSDB_PATCH" >> "$OUTPUT" echo "CLANG_SCAN_BUILD=$CLANG_SCAN_BUILD" >> "$OUTPUT" echo "CLANG_ANALYZER=$CLANG_ANALYZER" >> "$OUTPUT" echo "PROFILING_FLAGS=$PROFILING_FLAGS" >> "$OUTPUT" +# This will enable some related identifiers for the preprocessor if test -n "$JEMALLOC"; then echo "JEMALLOC=1" >> "$OUTPUT" fi +# Indicates that jemalloc should be enabled using -ljemalloc flag +# The alternative is to porvide a direct link to the library via JEMALLOC_LIB +# and JEMALLOC_INCLUDE +if test -n "$WITH_JEMALLOC_FLAG"; then + echo "WITH_JEMALLOC_FLAG=$WITH_JEMALLOC_FLAG" >> "$OUTPUT" +fi echo "LUA_PATH=$LUA_PATH" >> "$OUTPUT" diff --git a/build_tools/cont_integration.sh b/build_tools/cont_integration.sh index 9d0f7766a..06f25c596 100755 --- a/build_tools/cont_integration.sh +++ b/build_tools/cont_integration.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # # Copyright (c) 2016, Facebook. All rights reserved. # @@ -67,7 +67,7 @@ function update_repo_status { # # Path to the determinator from the root of the RocksDB repo. 
-CONTRUN_DETERMINATOR=./arcanist_util/config/RocksDBCommonHelper.php +CONTRUN_DETERMINATOR=./build_tools/RocksDBCommonHelper.php # Value of the previous commit. PREV_COMMIT= diff --git a/build_tools/dockerbuild.sh b/build_tools/dockerbuild.sh index 2685380bf..02f609442 100755 --- a/build_tools/dockerbuild.sh +++ b/build_tools/dockerbuild.sh @@ -1,2 +1,2 @@ -#!/bin/bash +#!/usr/bin/env bash docker run -v $PWD:/rocks -w /rocks buildpack-deps make diff --git a/build_tools/format-diff.sh b/build_tools/format-diff.sh index 868452a92..81221ed9a 100755 --- a/build_tools/format-diff.sh +++ b/build_tools/format-diff.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # If clang_format_diff.py command is not specfied, we assume we are able to # access directly without any path. if [ -z $CLANG_FORMAT_DIFF ] diff --git a/build_tools/make_new_version.sh b/build_tools/make_new_version.sh deleted file mode 100755 index edcb36c1f..000000000 --- a/build_tools/make_new_version.sh +++ /dev/null @@ -1,53 +0,0 @@ -#!/bin/bash -# Copyright (c) 2011-present, Facebook, Inc. All rights reserved. -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. An additional grant -# of patent rights can be found in the PATENTS file in the same directory. - -set -e -if [ -z "$GIT" ] -then - GIT="git" -fi - -# Print out the colored progress info so that it can be brainlessly -# distinguished by users. -function title() { - echo -e "\033[1;32m$*\033[0m" -} - -usage="Create new RocksDB version and prepare it for the release process\n" -usage+="USAGE: ./make_new_version.sh []\n" -usage+=" version: specify a version without '.fb' suffix (e.g. 5.4).\n" -usage+=" remote: name of the remote to push the branch to (default: origin)." 
- -# -- Pre-check -if [[ $# < 1 ]]; then - echo -e $usage - exit 1 -fi - -ROCKSDB_VERSION=$1 - -REMOTE="origin" -if [[ $# > 1 ]]; then - REMOTE=$2 -fi - -GIT_BRANCH=`git rev-parse --abbrev-ref HEAD` -echo $GIT_BRANCH - -if [ $GIT_BRANCH != "master" ]; then - echo "Error: Current branch is '$GIT_BRANCH', Please switch to master branch." - exit 1 -fi - -title "Adding new tag for this release ..." -BRANCH="$ROCKSDB_VERSION.fb" -$GIT checkout -b $BRANCH - -# Setting up the proxy for remote repo access -title "Pushing new branch to remote repo ..." -git push $REMOTE --set-upstream $BRANCH - -title "Branch $BRANCH is pushed to github;" diff --git a/build_tools/regression_build_test.sh b/build_tools/regression_build_test.sh index 765898821..698063328 100755 --- a/build_tools/regression_build_test.sh +++ b/build_tools/regression_build_test.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash set -e diff --git a/build_tools/rocksdb-lego-determinator b/build_tools/rocksdb-lego-determinator index 044c3df35..6e8ae9cd7 100755 --- a/build_tools/rocksdb-lego-determinator +++ b/build_tools/rocksdb-lego-determinator @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # This script is executed by Sandcastle # to determine next steps to run @@ -343,27 +343,7 @@ LITE_BUILD_COMMANDS="[ $CLEANUP_ENV, { 'name':'Build RocksDB debug version', - 'shell':'$LITE make J=1 static_lib || $CONTRUN_NAME=lite_static_lib $TASK_CREATION_TOOL', - 'user':'root', - $PARSER - }, - ], - $REPORT - } -]" - -# -# RocksDB lite tests -# -LITE_UNIT_TEST_COMMANDS="[ - { - 'name':'Rocksdb Lite Unit Test', - 'oncall':'$ONCALL', - 'steps': [ - $CLEANUP_ENV, - { - 'name':'Build RocksDB debug version', - 'shell':'$SHM $LITE make J=1 check || $CONTRUN_NAME=lite_check $TASK_CREATION_TOOL', + 'shell':'$LITE make J=1 all check || $CONTRUN_NAME=lite $TASK_CREATION_TOOL', 'user':'root', $PARSER }, @@ -564,7 +544,7 @@ TSAN_CRASH_TEST_COMMANDS="[ { 'name':'Compile and run', 'timeout': 86400, - 'shell':'set -o pipefail && $SHM 
$DEBUG $TSAN CRASH_TEST_KILL_ODD=1887 make J=1 crash_test || $CONTRUN_NAME=tsan_crash_test $TASK_CREATION_TOOL', + 'shell':'set -o pipefail && $SHM $DEBUG $TSAN CRASH_TEST_KILL_ODD=1887 CRASH_TEST_EXT_ARGS=--log2_keys_per_lock=22 make J=1 crash_test || $CONTRUN_NAME=tsan_crash_test $TASK_CREATION_TOOL', 'user':'root', $PARSER }, @@ -583,39 +563,7 @@ run_format_compatible() rm -rf /dev/shm/rocksdb mkdir /dev/shm/rocksdb - echo ' - if [ -e "build_tools/build_detect_platform" ] - then - sed "s/tcmalloc/nothingnothingnothing/g" build_tools/build_detect_platform > $TEST_TMPDIR/temp_build_file - rm -rf build_tools/build_detect_platform - cp $TEST_TMPDIR/temp_build_file build_tools/build_detect_platform - chmod +x build_tools/build_detect_platform - fi - - if [ -e "build_detect_platform" ] - then - sed "s/tcmalloc/nothingnothingnothing/g" build_detect_platform > $TEST_TMPDIR/temp_build_file - rm -rf build_detect_platform - cp $TEST_TMPDIR/temp_build_file build_detect_platform - chmod +x build_detect_platform - fi - - make ldb -j32 - - if [ -e "build_detect_platform" ] - then - git checkout -- build_detect_platform - fi - - if [ -e "build_tools/build_detect_platform" ] - then - git checkout -- build_tools/build_detect_platform - fi - ' > temp_build_ldb.sh - - sed "s/make ldb -j32/source temp_build_ldb.sh/g" tools/check_format_compatible.sh > tools/temp_check_format_compatible.sh - chmod +x tools/temp_check_format_compatible.sh - tools/temp_check_format_compatible.sh + tools/check_format_compatible.sh } FORMAT_COMPATIBLE_COMMANDS="[ @@ -780,9 +728,6 @@ case $1 in lite) echo $LITE_BUILD_COMMANDS ;; - lite_test) - echo $LITE_UNIT_TEST_COMMANDS - ;; stress_crash) echo $STRESS_CRASH_TEST_COMMANDS ;; diff --git a/cache/clock_cache.cc b/cache/clock_cache.cc index db9d1438e..7e42714ef 100644 --- a/cache/clock_cache.cc +++ b/cache/clock_cache.cc @@ -27,6 +27,11 @@ std::shared_ptr NewClockCache(size_t capacity, int num_shard_bits, #include #include +// "tbb/concurrent_hash_map.h" 
requires RTTI if exception is enabled. +// Disable it so users can chooose to disable RTTI. +#ifndef ROCKSDB_USE_RTTI +#define TBB_USE_EXCEPTIONS 0 +#endif #include "tbb/concurrent_hash_map.h" #include "cache/sharded_cache.h" diff --git a/cache/lru_cache.cc b/cache/lru_cache.cc index b201d81a4..d29e70934 100644 --- a/cache/lru_cache.cc +++ b/cache/lru_cache.cc @@ -22,7 +22,7 @@ namespace rocksdb { -LRUHandleTable::LRUHandleTable() : length_(0), elems_(0), list_(nullptr) { +LRUHandleTable::LRUHandleTable() : list_(nullptr), length_(0), elems_(0) { Resize(); } @@ -100,7 +100,7 @@ void LRUHandleTable::Resize() { } LRUCacheShard::LRUCacheShard() - : usage_(0), lru_usage_(0), high_pri_pool_usage_(0) { + : high_pri_pool_usage_(0), usage_(0), lru_usage_(0) { // Make empty circular linked list lru_.next = &lru_; lru_.prev = &lru_; @@ -157,6 +157,16 @@ void LRUCacheShard::TEST_GetLRUList(LRUHandle** lru, LRUHandle** lru_low_pri) { *lru_low_pri = lru_low_pri_; } +size_t LRUCacheShard::TEST_GetLRUSize() { + LRUHandle* lru_handle = lru_.next; + size_t lru_size = 0; + while (lru_handle != &lru_) { + lru_size++; + lru_handle = lru_handle->next; + } + return lru_size; +} + void LRUCacheShard::LRU_Remove(LRUHandle* e) { assert(e->next != nullptr); assert(e->prev != nullptr); @@ -223,6 +233,22 @@ void LRUCacheShard::EvictFromLRU(size_t charge, } } +void* LRUCacheShard::operator new(size_t size) { + return port::cacheline_aligned_alloc(size); +} + +void* LRUCacheShard::operator new[](size_t size) { + return port::cacheline_aligned_alloc(size); +} + +void LRUCacheShard::operator delete(void *memblock) { + port::cacheline_aligned_free(memblock); +} + +void LRUCacheShard::operator delete[](void* memblock) { + port::cacheline_aligned_free(memblock); +} + void LRUCacheShard::SetCapacity(size_t capacity) { autovector last_reference_list; { @@ -438,11 +464,11 @@ std::string LRUCacheShard::GetPrintableOptions() const { LRUCache::LRUCache(size_t capacity, int num_shard_bits, bool 
strict_capacity_limit, double high_pri_pool_ratio) : ShardedCache(capacity, num_shard_bits, strict_capacity_limit) { - int num_shards = 1 << num_shard_bits; - shards_ = new LRUCacheShard[num_shards]; + num_shards_ = 1 << num_shard_bits; + shards_ = new LRUCacheShard[num_shards_]; SetCapacity(capacity); SetStrictCapacityLimit(strict_capacity_limit); - for (int i = 0; i < num_shards; i++) { + for (int i = 0; i < num_shards_; i++) { shards_[i].SetHighPriorityPoolRatio(high_pri_pool_ratio); } } @@ -469,7 +495,20 @@ uint32_t LRUCache::GetHash(Handle* handle) const { return reinterpret_cast(handle)->hash; } -void LRUCache::DisownData() { shards_ = nullptr; } +void LRUCache::DisownData() { +// Do not drop data if compile with ASAN to suppress leak warning. +#ifndef __SANITIZE_ADDRESS__ + shards_ = nullptr; +#endif // !__SANITIZE_ADDRESS__ +} + +size_t LRUCache::TEST_GetLRUSize() { + size_t lru_size_of_all_shards = 0; + for (int i = 0; i < num_shards_; i++) { + lru_size_of_all_shards += shards_[i].TEST_GetLRUSize(); + } + return lru_size_of_all_shards; +} std::shared_ptr NewLRUCache(size_t capacity, int num_shard_bits, bool strict_capacity_limit, diff --git a/cache/lru_cache.h b/cache/lru_cache.h index 4b6a9f2fe..abe78fd0c 100644 --- a/cache/lru_cache.h +++ b/cache/lru_cache.h @@ -148,13 +148,13 @@ class LRUHandleTable { // The table consists of an array of buckets where each bucket is // a linked list of cache entries that hash into the bucket. + LRUHandle** list_; uint32_t length_; uint32_t elems_; - LRUHandle** list_; }; // A single shard of sharded cache. 
-class LRUCacheShard : public CacheShard { +class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard : public CacheShard { public: LRUCacheShard(); virtual ~LRUCacheShard(); @@ -198,6 +198,19 @@ class LRUCacheShard : public CacheShard { void TEST_GetLRUList(LRUHandle** lru, LRUHandle** lru_low_pri); + // Retrieves number of elements in LRU, for unit test purpose only + // not threadsafe + size_t TEST_GetLRUSize(); + + // Overloading to aligned it to cache line size + void* operator new(size_t); + + void* operator new[](size_t); + + void operator delete(void *); + + void operator delete[](void*); + private: void LRU_Remove(LRUHandle* e); void LRU_Insert(LRUHandle* e); @@ -219,12 +232,6 @@ class LRUCacheShard : public CacheShard { // Initialized before use. size_t capacity_; - // Memory size for entries residing in the cache - size_t usage_; - - // Memory size for entries residing only in the LRU list - size_t lru_usage_; - // Memory size for entries in high-pri pool. size_t high_pri_pool_usage_; @@ -238,11 +245,6 @@ class LRUCacheShard : public CacheShard { // Remember the value to avoid recomputing each time. double high_pri_pool_capacity_; - // mutex_ protects the following state. - // We don't count mutex_ as the cache's internal state so semantically we - // don't mind mutex_ invoking the non-const actions. - mutable port::Mutex mutex_; - // Dummy head of LRU list. // lru.prev is newest entry, lru.next is oldest entry. // LRU contains items which can be evicted, ie reference only by cache @@ -251,7 +253,29 @@ class LRUCacheShard : public CacheShard { // Pointer to head of low-pri pool in LRU list. 
LRUHandle* lru_low_pri_; + // ------------^^^^^^^^^^^^^----------- + // Not frequently modified data members + // ------------------------------------ + // + // We separate data members that are updated frequently from the ones that + // are not frequently updated so that they don't share the same cache line + // which will lead into false cache sharing + // + // ------------------------------------ + // Frequently modified data members + // ------------vvvvvvvvvvvvv----------- LRUHandleTable table_; + + // Memory size for entries residing in the cache + size_t usage_; + + // Memory size for entries residing only in the LRU list + size_t lru_usage_; + + // mutex_ protects the following state. + // We don't count mutex_ as the cache's internal state so semantically we + // don't mind mutex_ invoking the non-const actions. + mutable port::Mutex mutex_; }; class LRUCache : public ShardedCache { @@ -267,8 +291,12 @@ class LRUCache : public ShardedCache { virtual uint32_t GetHash(Handle* handle) const override; virtual void DisownData() override; + // Retrieves number of elements in LRU, for unit test purpose only + size_t TEST_GetLRUSize(); + private: LRUCacheShard* shards_; + int num_shards_ = 0; }; } // namespace rocksdb diff --git a/cache/lru_cache_test.cc b/cache/lru_cache_test.cc index 87794fd16..1b83033c3 100644 --- a/cache/lru_cache_test.cc +++ b/cache/lru_cache_test.cc @@ -17,7 +17,16 @@ class LRUCacheTest : public testing::Test { ~LRUCacheTest() {} void NewCache(size_t capacity, double high_pri_pool_ratio = 0.0) { - cache_.reset(new LRUCacheShard()); + cache_.reset( +#if defined(_MSC_VER) +#pragma warning(push) +#pragma warning(disable: 4316) // We've validated the alignment with the new operators +#endif + new LRUCacheShard() +#if defined(_MSC_VER) +#pragma warning(pop) +#endif + ); cache_->SetCapacity(capacity); cache_->SetStrictCapacityLimit(false); cache_->SetHighPriorityPoolRatio(high_pri_pool_ratio); diff --git a/cmake/RocksDBConfig.cmake.in 
b/cmake/RocksDBConfig.cmake.in new file mode 100644 index 000000000..b3cb2b27a --- /dev/null +++ b/cmake/RocksDBConfig.cmake.in @@ -0,0 +1,3 @@ +@PACKAGE_INIT@ +include("${CMAKE_CURRENT_LIST_DIR}/RocksDBTargets.cmake") +check_required_components(RocksDB) diff --git a/coverage/coverage_test.sh b/coverage/coverage_test.sh index 4d8052c9e..6d87ae908 100755 --- a/coverage/coverage_test.sh +++ b/coverage/coverage_test.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # Exit on error. set -e diff --git a/db/builder.cc b/db/builder.cc index 6f973fdbd..7cfa7800c 100644 --- a/db/builder.cc +++ b/db/builder.cc @@ -47,15 +47,15 @@ TableBuilder* NewTableBuilder( WritableFileWriter* file, const CompressionType compression_type, const CompressionOptions& compression_opts, int level, const std::string* compression_dict, const bool skip_filters, - const uint64_t creation_time) { + const uint64_t creation_time, const uint64_t oldest_key_time) { assert((column_family_id == TablePropertiesCollectorFactory::Context::kUnknownColumnFamily) == column_family_name.empty()); return ioptions.table_factory->NewTableBuilder( - TableBuilderOptions(ioptions, internal_comparator, - int_tbl_prop_collector_factories, compression_type, - compression_opts, compression_dict, skip_filters, - column_family_name, level, creation_time), + TableBuilderOptions( + ioptions, internal_comparator, int_tbl_prop_collector_factories, + compression_type, compression_opts, compression_dict, skip_filters, + column_family_name, level, creation_time, oldest_key_time), column_family_id, file); } @@ -74,8 +74,8 @@ Status BuildTable( const CompressionOptions& compression_opts, bool paranoid_file_checks, InternalStats* internal_stats, TableFileCreationReason reason, EventLogger* event_logger, int job_id, const Env::IOPriority io_priority, - TableProperties* table_properties, int level, - const uint64_t creation_time) { + TableProperties* table_properties, int level, const uint64_t creation_time, + const uint64_t 
oldest_key_time) { assert((column_family_id == TablePropertiesCollectorFactory::Context::kUnknownColumnFamily) == column_family_name.empty()); @@ -120,12 +120,11 @@ Status BuildTable( file_writer.reset(new WritableFileWriter(std::move(file), env_options, ioptions.statistics)); - builder = NewTableBuilder( ioptions, internal_comparator, int_tbl_prop_collector_factories, column_family_id, column_family_name, file_writer.get(), compression, compression_opts, level, nullptr /* compression_dict */, - false /* skip_filters */, creation_time); + false /* skip_filters */, creation_time, oldest_key_time); } MergeHelper merge(env, internal_comparator.user_comparator(), diff --git a/db/builder.h b/db/builder.h index a432a7531..5a5081c64 100644 --- a/db/builder.h +++ b/db/builder.h @@ -50,7 +50,8 @@ TableBuilder* NewTableBuilder( WritableFileWriter* file, const CompressionType compression_type, const CompressionOptions& compression_opts, int level, const std::string* compression_dict = nullptr, - const bool skip_filters = false, const uint64_t creation_time = 0); + const bool skip_filters = false, const uint64_t creation_time = 0, + const uint64_t oldest_key_time = 0); // Build a Table file from the contents of *iter. The generated file // will be named according to number specified in meta. 
On success, the rest of @@ -77,6 +78,6 @@ extern Status BuildTable( EventLogger* event_logger = nullptr, int job_id = 0, const Env::IOPriority io_priority = Env::IO_HIGH, TableProperties* table_properties = nullptr, int level = -1, - const uint64_t creation_time = 0); + const uint64_t creation_time = 0, const uint64_t oldest_key_time = 0); } // namespace rocksdb diff --git a/db/c.cc b/db/c.cc index 441ffade3..cbfb8557d 100644 --- a/db/c.cc +++ b/db/c.cc @@ -36,6 +36,7 @@ #include "utilities/merge_operators.h" #include "rocksdb/utilities/transaction.h" #include "rocksdb/utilities/transaction_db.h" +#include "rocksdb/utilities/optimistic_transaction_db.h" #include "rocksdb/utilities/checkpoint.h" using rocksdb::BytewiseComparator; @@ -52,6 +53,7 @@ using rocksdb::CompressionType; using rocksdb::WALRecoveryMode; using rocksdb::DB; using rocksdb::DBOptions; +using rocksdb::DbPath; using rocksdb::Env; using rocksdb::EnvOptions; using rocksdb::InfoLogLevel; @@ -94,6 +96,8 @@ using rocksdb::PinnableSlice; using rocksdb::TransactionDBOptions; using rocksdb::TransactionDB; using rocksdb::TransactionOptions; +using rocksdb::OptimisticTransactionDB; +using rocksdb::OptimisticTransactionOptions; using rocksdb::Transaction; using rocksdb::Checkpoint; @@ -152,6 +156,12 @@ struct rocksdb_transaction_t { struct rocksdb_checkpoint_t { Checkpoint* rep; }; +struct rocksdb_optimistictransactiondb_t { + OptimisticTransactionDB* rep; +}; +struct rocksdb_optimistictransaction_options_t { + OptimisticTransactionOptions rep; +}; struct rocksdb_compactionfiltercontext_t { CompactionFilter::Context rep; @@ -382,6 +392,10 @@ struct rocksdb_mergeoperator_t : public MergeOperator { } }; +struct rocksdb_dbpath_t { + DbPath rep; +}; + struct rocksdb_env_t { Env* rep; bool is_default; @@ -2009,6 +2023,16 @@ void rocksdb_options_set_paranoid_checks( opt->rep.paranoid_checks = v; } +void rocksdb_options_set_db_paths(rocksdb_options_t* opt, + const rocksdb_dbpath_t** dbpath_values, + size_t 
num_paths) { + std::vector db_paths(num_paths); + for (size_t i = 0; i < num_paths; ++i) { + db_paths[i] = dbpath_values[i]->rep; + } + opt->rep.db_paths = db_paths; +} + void rocksdb_options_set_env(rocksdb_options_t* opt, rocksdb_env_t* env) { opt->rep.env = (env ? env->rep : nullptr); } @@ -2087,6 +2111,10 @@ void rocksdb_options_enable_statistics(rocksdb_options_t* opt) { opt->rep.statistics = rocksdb::CreateDBStatistics(); } +void rocksdb_options_set_skip_stats_update_on_db_open(rocksdb_options_t* opt, unsigned char val) { + opt->rep.skip_stats_update_on_db_open = val; +} + void rocksdb_options_set_num_levels(rocksdb_options_t* opt, int n) { opt->rep.num_levels = n; } @@ -2789,6 +2817,17 @@ size_t rocksdb_cache_get_pinned_usage(rocksdb_cache_t* cache) { return cache->rep->GetPinnedUsage(); } +rocksdb_dbpath_t* rocksdb_dbpath_create(const char* path, uint64_t target_size) { + rocksdb_dbpath_t* result = new rocksdb_dbpath_t; + result->rep.path = std::string(path); + result->rep.target_size = target_size; + return result; +} + +void rocksdb_dbpath_destroy(rocksdb_dbpath_t* dbpath) { + delete dbpath; +} + rocksdb_env_t* rocksdb_create_default_env() { rocksdb_env_t* result = new rocksdb_env_t; result->rep = Env::Default(); @@ -3223,6 +3262,32 @@ void rocksdb_transaction_options_set_max_write_batch_size( opt->rep.max_write_batch_size = size; } +rocksdb_optimistictransaction_options_t* +rocksdb_optimistictransaction_options_create() { + return new rocksdb_optimistictransaction_options_t; +} + +void rocksdb_optimistictransaction_options_destroy( + rocksdb_optimistictransaction_options_t* opt) { + delete opt; +} + +void rocksdb_optimistictransaction_options_set_set_snapshot( + rocksdb_optimistictransaction_options_t* opt, unsigned char v) { + opt->rep.set_snapshot = v; +} + +rocksdb_column_family_handle_t* rocksdb_transactiondb_create_column_family( + rocksdb_transactiondb_t* txn_db, + const rocksdb_options_t* column_family_options, + const char* column_family_name, 
char** errptr) { + rocksdb_column_family_handle_t* handle = new rocksdb_column_family_handle_t; + SaveError(errptr, txn_db->rep->CreateColumnFamily( + ColumnFamilyOptions(column_family_options->rep), + std::string(column_family_name), &(handle->rep))); + return handle; +} + rocksdb_transactiondb_t* rocksdb_transactiondb_open( const rocksdb_options_t* options, const rocksdb_transactiondb_options_t* txn_db_options, const char* name, @@ -3279,7 +3344,14 @@ void rocksdb_transaction_destroy(rocksdb_transaction_t* txn) { delete txn; } -//Read a key inside a transaction +const rocksdb_snapshot_t* rocksdb_transaction_get_snapshot( + rocksdb_transaction_t* txn) { + rocksdb_snapshot_t* result = new rocksdb_snapshot_t; + result->rep = txn->rep->GetSnapshot(); + return result; +} + +// Read a key inside a transaction char* rocksdb_transaction_get(rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options, const char* key, size_t klen, size_t* vlen, @@ -3299,6 +3371,49 @@ char* rocksdb_transaction_get(rocksdb_transaction_t* txn, return result; } +char* rocksdb_transaction_get_cf(rocksdb_transaction_t* txn, + const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen, size_t* vlen, + char** errptr) { + char* result = nullptr; + std::string tmp; + Status s = + txn->rep->Get(options->rep, column_family->rep, Slice(key, klen), &tmp); + if (s.ok()) { + *vlen = tmp.size(); + result = CopyString(tmp); + } else { + *vlen = 0; + if (!s.IsNotFound()) { + SaveError(errptr, s); + } + } + return result; +} + +// Read a key inside a transaction +char* rocksdb_transaction_get_for_update(rocksdb_transaction_t* txn, + const rocksdb_readoptions_t* options, + const char* key, size_t klen, + size_t* vlen, unsigned char exclusive, + char** errptr) { + char* result = nullptr; + std::string tmp; + Status s = + txn->rep->GetForUpdate(options->rep, Slice(key, klen), &tmp, exclusive); + if (s.ok()) { + *vlen = tmp.size(); + result = 
CopyString(tmp); + } else { + *vlen = 0; + if (!s.IsNotFound()) { + SaveError(errptr, s); + } + } + return result; +} + // Read a key outside a transaction char* rocksdb_transactiondb_get( rocksdb_transactiondb_t* txn_db, @@ -3321,6 +3436,26 @@ char* rocksdb_transactiondb_get( return result; } +char* rocksdb_transactiondb_get_cf( + rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, + size_t keylen, size_t* vallen, char** errptr) { + char* result = nullptr; + std::string tmp; + Status s = txn_db->rep->Get(options->rep, column_family->rep, + Slice(key, keylen), &tmp); + if (s.ok()) { + *vallen = tmp.size(); + result = CopyString(tmp); + } else { + *vallen = 0; + if (!s.IsNotFound()) { + SaveError(errptr, s); + } + } + return result; +} + // Put a key inside a transaction void rocksdb_transaction_put(rocksdb_transaction_t* txn, const char* key, size_t klen, const char* val, size_t vlen, @@ -3328,13 +3463,56 @@ void rocksdb_transaction_put(rocksdb_transaction_t* txn, const char* key, SaveError(errptr, txn->rep->Put(Slice(key, klen), Slice(val, vlen))); } -//Put a key outside a transaction +void rocksdb_transaction_put_cf(rocksdb_transaction_t* txn, + rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen, const char* val, + size_t vlen, char** errptr) { + SaveError(errptr, txn->rep->Put(column_family->rep, Slice(key, klen), + Slice(val, vlen))); +} + +// Put a key outside a transaction void rocksdb_transactiondb_put(rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* options, const char* key, size_t klen, const char* val, size_t vlen, char** errptr) { + SaveError(errptr, txn_db->rep->Put(options->rep, Slice(key, klen), + Slice(val, vlen))); +} + +void rocksdb_transactiondb_put_cf(rocksdb_transactiondb_t* txn_db, + const rocksdb_writeoptions_t* options, + rocksdb_column_family_handle_t* column_family, + const char* key, size_t keylen, + const char* 
val, size_t vallen, + char** errptr) { + SaveError(errptr, txn_db->rep->Put(options->rep, column_family->rep, + Slice(key, keylen), Slice(val, vallen))); +} + +// Write batch into transaction db +void rocksdb_transactiondb_write( + rocksdb_transactiondb_t* db, + const rocksdb_writeoptions_t* options, + rocksdb_writebatch_t* batch, + char** errptr) { + SaveError(errptr, db->rep->Write(options->rep, &batch->rep)); +} + +// Merge a key inside a transaction +void rocksdb_transaction_merge(rocksdb_transaction_t* txn, const char* key, + size_t klen, const char* val, size_t vlen, + char** errptr) { + SaveError(errptr, txn->rep->Merge(Slice(key, klen), Slice(val, vlen))); +} + +// Merge a key outside a transaction +void rocksdb_transactiondb_merge(rocksdb_transactiondb_t* txn_db, + const rocksdb_writeoptions_t* options, + const char* key, size_t klen, const char* val, + size_t vlen, char** errptr) { SaveError(errptr, - txn_db->rep->Put(options->rep, Slice(key, klen), Slice(val, vlen))); + txn_db->rep->Merge(options->rep, Slice(key, klen), Slice(val, vlen))); } // Delete a key inside a transaction @@ -3343,6 +3521,12 @@ void rocksdb_transaction_delete(rocksdb_transaction_t* txn, const char* key, SaveError(errptr, txn->rep->Delete(Slice(key, klen))); } +void rocksdb_transaction_delete_cf( + rocksdb_transaction_t* txn, rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen, char** errptr) { + SaveError(errptr, txn->rep->Delete(column_family->rep, Slice(key, klen))); +} + // Delete a key outside a transaction void rocksdb_transactiondb_delete(rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* options, @@ -3350,6 +3534,14 @@ void rocksdb_transactiondb_delete(rocksdb_transactiondb_t* txn_db, SaveError(errptr, txn_db->rep->Delete(options->rep, Slice(key, klen))); } +void rocksdb_transactiondb_delete_cf( + rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, + 
size_t keylen, char** errptr) { + SaveError(errptr, txn_db->rep->Delete(options->rep, column_family->rep, + Slice(key, keylen))); +} + // Create an iterator inside a transaction rocksdb_iterator_t* rocksdb_transaction_create_iterator( rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options) { @@ -3358,6 +3550,14 @@ rocksdb_iterator_t* rocksdb_transaction_create_iterator( return result; } +// Create an iterator outside a transaction +rocksdb_iterator_t* rocksdb_transactiondb_create_iterator( + rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options) { + rocksdb_iterator_t* result = new rocksdb_iterator_t; + result->rep = txn_db->rep->NewIterator(options->rep); + return result; +} + void rocksdb_transactiondb_close(rocksdb_transactiondb_t* txn_db) { delete txn_db->rep; delete txn_db; @@ -3374,6 +3574,42 @@ rocksdb_checkpoint_t* rocksdb_transactiondb_checkpoint_object_create( return result; } +rocksdb_optimistictransactiondb_t* rocksdb_optimistictransactiondb_open( + const rocksdb_options_t* options, const char* name, + char** errptr) { + OptimisticTransactionDB* otxn_db; + if (SaveError(errptr, OptimisticTransactionDB::Open( + options->rep, std::string(name), &otxn_db))) { + return nullptr; + } + rocksdb_optimistictransactiondb_t* result = + new rocksdb_optimistictransactiondb_t; + result->rep = otxn_db; + return result; +} + +rocksdb_transaction_t* rocksdb_optimistictransaction_begin( + rocksdb_optimistictransactiondb_t* otxn_db, + const rocksdb_writeoptions_t* write_options, + const rocksdb_optimistictransaction_options_t* otxn_options, + rocksdb_transaction_t* old_txn) { + if (old_txn == nullptr) { + rocksdb_transaction_t* result = new rocksdb_transaction_t; + result->rep = otxn_db->rep->BeginTransaction(write_options->rep, + otxn_options->rep, nullptr); + return result; + } + old_txn->rep = otxn_db->rep->BeginTransaction( + write_options->rep, otxn_options->rep, old_txn->rep); + return old_txn; +} + +void 
rocksdb_optimistictransactiondb_close( + rocksdb_optimistictransactiondb_t* otxn_db) { + delete otxn_db->rep; + delete otxn_db; +} + void rocksdb_free(void* ptr) { free(ptr); } rocksdb_pinnableslice_t* rocksdb_get_pinned( diff --git a/db/c_test.c b/db/c_test.c index 57f19aa96..7b76badf1 100644 --- a/db/c_test.c +++ b/db/c_test.c @@ -41,6 +41,7 @@ static char dbname[200]; static char sstfilename[200]; static char dbbackupname[200]; static char dbcheckpointname[200]; +static char dbpathname[200]; static void StartPhase(const char* name) { fprintf(stderr, "=== Test %s\n", name); @@ -347,10 +348,25 @@ static void CheckTxnDBGet( Free(&val); } +static void CheckTxnDBGetCF(rocksdb_transactiondb_t* txn_db, + const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family, + const char* key, const char* expected) { + char* err = NULL; + size_t val_len; + char* val; + val = rocksdb_transactiondb_get_cf(txn_db, options, column_family, key, + strlen(key), &val_len, &err); + CheckNoError(err); + CheckEqual(expected, val, val_len); + Free(&val); +} + int main(int argc, char** argv) { rocksdb_t* db; rocksdb_comparator_t* cmp; rocksdb_cache_t* cache; + rocksdb_dbpath_t *dbpath; rocksdb_env_t* env; rocksdb_options_t* options; rocksdb_compactoptions_t* coptions; @@ -385,8 +401,14 @@ int main(int argc, char** argv) { GetTempDir(), ((int)geteuid())); + snprintf(dbpathname, sizeof(dbpathname), + "%s/rocksdb_c_test-%d-dbpath", + GetTempDir(), + ((int) geteuid())); + StartPhase("create_objects"); cmp = rocksdb_comparator_create(NULL, CmpDestroy, CmpCompare, CmpName); + dbpath = rocksdb_dbpath_create(dbpathname, 1024 * 1024); env = rocksdb_create_default_env(); cache = rocksdb_cache_create_lru(100000); @@ -1357,6 +1379,18 @@ int main(int argc, char** argv) { CheckNoError(err); CheckTxnDBGet(txn_db, roptions, "foo", NULL); + // write batch into TransactionDB + rocksdb_writebatch_t* wb = rocksdb_writebatch_create(); + rocksdb_writebatch_put(wb, "foo", 3, "a", 1); + 
rocksdb_writebatch_clear(wb); + rocksdb_writebatch_put(wb, "bar", 3, "b", 1); + rocksdb_writebatch_put(wb, "box", 3, "c", 1); + rocksdb_writebatch_delete(wb, "bar", 3); + rocksdb_transactiondb_write(txn_db, woptions, wb, &err); + rocksdb_writebatch_destroy(wb); + CheckTxnDBGet(txn_db, roptions, "box", "c"); + CheckNoError(err); + // begin a transaction txn = rocksdb_transaction_begin(txn_db, woptions, txn_options, NULL); // put @@ -1413,6 +1447,23 @@ int main(int argc, char** argv) { CheckNoError(err); CheckTxnDBGet(txn_db, roptions, "bar", NULL); + // Column families. + rocksdb_column_family_handle_t* cfh; + cfh = rocksdb_transactiondb_create_column_family(txn_db, options, + "txn_db_cf", &err); + CheckNoError(err); + + rocksdb_transactiondb_put_cf(txn_db, woptions, cfh, "cf_foo", 6, "cf_hello", + 8, &err); + CheckNoError(err); + CheckTxnDBGetCF(txn_db, roptions, cfh, "cf_foo", "cf_hello"); + + rocksdb_transactiondb_delete_cf(txn_db, woptions, cfh, "cf_foo", 6, &err); + CheckNoError(err); + CheckTxnDBGetCF(txn_db, roptions, cfh, "cf_foo", NULL); + + rocksdb_column_family_handle_destroy(cfh); + // close and destroy rocksdb_transaction_destroy(txn); rocksdb_transactiondb_close(txn_db); @@ -1440,6 +1491,18 @@ int main(int argc, char** argv) { CheckNoError(err); } + // Simple sanity check that options setting db_paths work. 
+ StartPhase("open_db_paths"); + { + rocksdb_close(db); + rocksdb_destroy_db(options, dbname, &err); + + const rocksdb_dbpath_t* paths[1] = {dbpath}; + rocksdb_options_set_db_paths(options, paths, 1); + db = rocksdb_open(options, dbname, &err); + CheckNoError(err); + } + StartPhase("cleanup"); rocksdb_close(db); rocksdb_options_destroy(options); @@ -1449,6 +1512,7 @@ int main(int argc, char** argv) { rocksdb_compactoptions_destroy(coptions); rocksdb_cache_destroy(cache); rocksdb_comparator_destroy(cmp); + rocksdb_dbpath_destroy(dbpath); rocksdb_env_destroy(env); fprintf(stderr, "PASS\n"); diff --git a/db/column_family.cc b/db/column_family.cc index b00eda074..6fd078784 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -930,6 +930,13 @@ SuperVersion* ColumnFamilyData::InstallSuperVersion( super_version_ = new_superversion; ++super_version_number_; super_version_->version_number = super_version_number_; + if (old_superversion != nullptr) { + if (old_superversion->mutable_cf_options.write_buffer_size != + mutable_cf_options.write_buffer_size) { + mem_->UpdateWriteBufferSize(mutable_cf_options.write_buffer_size); + } + } + // Reset SuperVersions cached in thread local storage ResetThreadLocalSuperVersions(); diff --git a/db/column_family_test.cc b/db/column_family_test.cc index 0d5f2dcf2..88786d469 100644 --- a/db/column_family_test.cc +++ b/db/column_family_test.cc @@ -521,6 +521,7 @@ TEST_F(ColumnFamilyTest, DontReuseColumnFamilyID) { } } +#ifndef ROCKSDB_LITE TEST_F(ColumnFamilyTest, CreateCFRaceWithGetAggProperty) { Open(); @@ -542,6 +543,7 @@ TEST_F(ColumnFamilyTest, CreateCFRaceWithGetAggProperty) { rocksdb::SyncPoint::GetInstance()->DisableProcessing(); } +#endif // !ROCKSDB_LITE class FlushEmptyCFTestWithParam : public ColumnFamilyTest, public testing::WithParamInterface { diff --git a/db/compaction.cc b/db/compaction.cc index f4a82ed33..706eb3be0 100644 --- a/db/compaction.cc +++ b/db/compaction.cc @@ -169,6 +169,7 @@ 
Compaction::Compaction(VersionStorageInfo* vstorage, bottommost_level_(IsBottommostLevel(output_level_, vstorage, inputs_)), is_full_compaction_(IsFullCompaction(vstorage, inputs_)), is_manual_compaction_(_manual_compaction), + is_trivial_move_(false), compaction_reason_(_compaction_reason) { MarkFilesBeingCompacted(true); if (is_manual_compaction_) { diff --git a/db/compaction_iteration_stats.h b/db/compaction_iteration_stats.h index 52a666e4e..ddb534622 100644 --- a/db/compaction_iteration_stats.h +++ b/db/compaction_iteration_stats.h @@ -16,6 +16,8 @@ struct CompactionIterationStats { int64_t num_record_drop_obsolete = 0; int64_t num_record_drop_range_del = 0; int64_t num_range_del_drop_obsolete = 0; + // Deletions obsoleted before bottom level due to file gap optimization. + int64_t num_optimized_del_drop_obsolete = 0; uint64_t total_filter_time = 0; // Input statistics diff --git a/db/compaction_iterator.cc b/db/compaction_iterator.cc index 08ae19734..ae63f04d8 100644 --- a/db/compaction_iterator.cc +++ b/db/compaction_iterator.cc @@ -25,6 +25,8 @@ CompactionEventListener::CompactionListenerValueType fromInternalValueType( kSingleDelete; case kTypeRangeDeletion: return CompactionEventListener::CompactionListenerValueType::kRangeDelete; + case kTypeBlobIndex: + return CompactionEventListener::CompactionListenerValueType::kBlobIndex; default: assert(false); return CompactionEventListener::CompactionListenerValueType::kInvalid; @@ -111,6 +113,7 @@ void CompactionIterator::ResetRecordCounts() { iter_stats_.num_record_drop_obsolete = 0; iter_stats_.num_record_drop_range_del = 0; iter_stats_.num_range_del_drop_obsolete = 0; + iter_stats_.num_optimized_del_drop_obsolete = 0; } void CompactionIterator::SeekToFirst() { @@ -227,7 +230,8 @@ void CompactionIterator::NextFromInput() { #endif // ROCKSDB_LITE // apply the compaction filter to the first occurrence of the user key - if (compaction_filter_ != nullptr && ikey_.type == kTypeValue && + if (compaction_filter_ != 
nullptr && + (ikey_.type == kTypeValue || ikey_.type == kTypeBlobIndex) && (visible_at_tip_ || ikey_.sequence > latest_snapshot_ || ignore_snapshots_)) { // If the user has specified a compaction filter and the sequence @@ -237,11 +241,13 @@ void CompactionIterator::NextFromInput() { CompactionFilter::Decision filter; compaction_filter_value_.clear(); compaction_filter_skip_until_.Clear(); + CompactionFilter::ValueType value_type = + ikey_.type == kTypeValue ? CompactionFilter::ValueType::kValue + : CompactionFilter::ValueType::kBlobIndex; { StopWatchNano timer(env_, true); filter = compaction_filter_->FilterV2( - compaction_->level(), ikey_.user_key, - CompactionFilter::ValueType::kValue, value_, + compaction_->level(), ikey_.user_key, value_type, value_, &compaction_filter_value_, compaction_filter_skip_until_.rep()); iter_stats_.total_filter_time += env_ != nullptr ? timer.ElapsedNanos() : 0; @@ -426,6 +432,9 @@ void CompactionIterator::NextFromInput() { // Can compact out this SingleDelete. ++iter_stats_.num_record_drop_obsolete; ++iter_stats_.num_single_del_fallthru; + if (!bottommost_level_) { + ++iter_stats_.num_optimized_del_drop_obsolete; + } } else { // Output SingleDelete valid_ = true; @@ -467,6 +476,9 @@ void CompactionIterator::NextFromInput() { // Note: Dropping this Delete will not affect TransactionDB // write-conflict checking since it is earlier than any snapshot. 
++iter_stats_.num_record_drop_obsolete; + if (!bottommost_level_) { + ++iter_stats_.num_optimized_del_drop_obsolete; + } input_->Next(); } else if (ikey_.type == kTypeMerge) { if (!merge_helper_->HasOperator()) { diff --git a/db/compaction_iterator_test.cc b/db/compaction_iterator_test.cc index b625c99ff..dfc413936 100644 --- a/db/compaction_iterator_test.cc +++ b/db/compaction_iterator_test.cc @@ -455,6 +455,111 @@ TEST_F(CompactionIteratorTest, ShuttingDownInMerge) { EXPECT_EQ(2, filter.last_seen.load()); } +TEST_F(CompactionIteratorTest, SingleMergeOperand) { + class Filter : public CompactionFilter { + virtual Decision FilterV2(int level, const Slice& key, ValueType t, + const Slice& existing_value, + std::string* new_value, + std::string* skip_until) const override { + std::string k = key.ToString(); + std::string v = existing_value.ToString(); + + // See InitIterators() call below for the sequence of keys and their + // filtering decisions. Here we closely assert that compaction filter is + // called with the expected keys and only them, and with the right values. + if (k == "a") { + EXPECT_EQ(ValueType::kMergeOperand, t); + EXPECT_EQ("av1", v); + return Decision::kKeep; + } else if (k == "b") { + EXPECT_EQ(ValueType::kMergeOperand, t); + return Decision::kKeep; + } else if (k == "c") { + return Decision::kKeep; + } + + ADD_FAILURE(); + return Decision::kKeep; + } + + const char* Name() const override { + return "CompactionIteratorTest.SingleMergeOperand::Filter"; + } + }; + + class SingleMergeOp : public MergeOperator { + public: + bool FullMergeV2(const MergeOperationInput& merge_in, + MergeOperationOutput* merge_out) const override { + // See InitIterators() call below for why "c" is the only key for which + // FullMergeV2 should be called. 
+ EXPECT_EQ("c", merge_in.key.ToString()); + + std::string temp_value; + if (merge_in.existing_value != nullptr) { + temp_value = merge_in.existing_value->ToString(); + } + + for (auto& operand : merge_in.operand_list) { + temp_value.append(operand.ToString()); + } + merge_out->new_value = temp_value; + + return true; + } + + bool PartialMergeMulti(const Slice& key, + const std::deque& operand_list, + std::string* new_value, + Logger* logger) const override { + std::string string_key = key.ToString(); + EXPECT_TRUE(string_key == "a" || string_key == "b"); + + if (string_key == "a") { + EXPECT_EQ(1, operand_list.size()); + } else if (string_key == "b") { + EXPECT_EQ(2, operand_list.size()); + } + + std::string temp_value; + for (auto& operand : operand_list) { + temp_value.append(operand.ToString()); + } + swap(temp_value, *new_value); + + return true; + } + + const char* Name() const override { + return "CompactionIteratorTest SingleMergeOp"; + } + + bool AllowSingleOperand() const override { return true; } + }; + + SingleMergeOp merge_op; + Filter filter; + InitIterators( + // a should invoke PartialMergeMulti with a single merge operand. + {test::KeyStr("a", 50, kTypeMerge), + // b should invoke PartialMergeMulti with two operands. + test::KeyStr("b", 70, kTypeMerge), test::KeyStr("b", 60, kTypeMerge), + // c should invoke FullMerge due to kTypeValue at the beginning. 
+ test::KeyStr("c", 90, kTypeMerge), test::KeyStr("c", 80, kTypeValue)}, + {"av1", "bv2", "bv1", "cv2", "cv1"}, {}, {}, kMaxSequenceNumber, + &merge_op, &filter); + + c_iter_->SeekToFirst(); + ASSERT_TRUE(c_iter_->Valid()); + ASSERT_EQ(test::KeyStr("a", 50, kTypeMerge), c_iter_->key().ToString()); + ASSERT_EQ("av1", c_iter_->value().ToString()); + c_iter_->Next(); + ASSERT_TRUE(c_iter_->Valid()); + ASSERT_EQ("bv1bv2", c_iter_->value().ToString()); + c_iter_->Next(); + ASSERT_EQ("cv1cv2", c_iter_->value().ToString()); +} + } // namespace rocksdb int main(int argc, char** argv) { diff --git a/db/compaction_job.cc b/db/compaction_job.cc index 636cdbea1..7419e0a54 100644 --- a/db/compaction_job.cc +++ b/db/compaction_job.cc @@ -594,6 +594,9 @@ Status CompactionJob::Install(const MutableCFOptions& mutable_cf_options) { double read_write_amp = 0.0; double write_amp = 0.0; + double bytes_read_per_sec = 0; + double bytes_written_per_sec = 0; + if (stats.bytes_read_non_output_levels > 0) { read_write_amp = (stats.bytes_written + stats.bytes_read_output_level + stats.bytes_read_non_output_levels) / @@ -601,17 +604,22 @@ Status CompactionJob::Install(const MutableCFOptions& mutable_cf_options) { write_amp = stats.bytes_written / static_cast(stats.bytes_read_non_output_levels); } + if (stats.micros > 0) { + bytes_read_per_sec = + (stats.bytes_read_non_output_levels + stats.bytes_read_output_level) / + static_cast(stats.micros); + bytes_written_per_sec = + stats.bytes_written / static_cast(stats.micros); + } + ROCKS_LOG_BUFFER( log_buffer_, "[%s] compacted to: %s, MB/sec: %.1f rd, %.1f wr, level %d, " "files in(%d, %d) out(%d) " "MB in(%.1f, %.1f) out(%.1f), read-write-amplify(%.1f) " "write-amplify(%.1f) %s, records in: %d, records dropped: %d\n", - cfd->GetName().c_str(), vstorage->LevelSummary(&tmp), - (stats.bytes_read_non_output_levels + stats.bytes_read_output_level) / - static_cast(stats.micros), - stats.bytes_written / static_cast(stats.micros), - 
compact_->compaction->output_level(), + cfd->GetName().c_str(), vstorage->LevelSummary(&tmp), bytes_read_per_sec, + bytes_written_per_sec, compact_->compaction->output_level(), stats.num_input_files_in_non_output_levels, stats.num_input_files_in_output_level, stats.num_output_files, stats.bytes_read_non_output_levels / 1048576.0, @@ -1006,6 +1014,10 @@ void CompactionJob::RecordDroppedKeys( RecordTick(stats_, COMPACTION_RANGE_DEL_DROP_OBSOLETE, c_iter_stats.num_range_del_drop_obsolete); } + if (c_iter_stats.num_optimized_del_drop_obsolete > 0) { + RecordTick(stats_, COMPACTION_OPTIMIZED_DEL_DROP_OBSOLETE, + c_iter_stats.num_optimized_del_drop_obsolete); + } } Status CompactionJob::FinishCompactionOutputFile( @@ -1028,6 +1040,7 @@ Status CompactionJob::FinishCompactionOutputFile( auto meta = &sub_compact->current_output()->meta; if (s.ok()) { Slice lower_bound_guard, upper_bound_guard; + std::string smallest_user_key; const Slice *lower_bound, *upper_bound; if (sub_compact->outputs.size() == 1) { // For the first output table, include range tombstones before the min key @@ -1037,7 +1050,8 @@ Status CompactionJob::FinishCompactionOutputFile( // For subsequent output tables, only include range tombstones from min // key onwards since the previous file was extended to contain range // tombstones falling before min key. - lower_bound_guard = meta->smallest.user_key(); + smallest_user_key = meta->smallest.user_key().ToString(false /*hex*/); + lower_bound_guard = Slice(smallest_user_key); lower_bound = &lower_bound_guard; } else { lower_bound = nullptr; diff --git a/db/compaction_picker.cc b/db/compaction_picker.cc index 6795227b5..79af3ed9f 100644 --- a/db/compaction_picker.cc +++ b/db/compaction_picker.cc @@ -234,11 +234,6 @@ bool CompactionPicker::ExpandInputsToCleanCut(const std::string& cf_name, // If, after the expansion, there are files that are already under // compaction, then we must drop/cancel this compaction. 
if (AreFilesInCompaction(inputs->files)) { - ROCKS_LOG_WARN( - ioptions_.info_log, - "[%s] ExpandWhileOverlapping() failure because some of the necessary" - " compaction input files are currently being compacted.", - cf_name.c_str()); return false; } return true; @@ -1106,11 +1101,7 @@ void LevelCompactionBuilder::SetupInitialFiles() { } output_level_ = (start_level_ == 0) ? vstorage_->base_level() : start_level_ + 1; - if (PickFileToCompact() && - compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_, - &start_level_inputs_) && - !compaction_picker_->FilesRangeOverlapWithCompaction( - {start_level_inputs_}, output_level_)) { + if (PickFileToCompact()) { // found the compaction! if (start_level_ == 0) { // L0 score = `num L0 files` / `level0_file_num_compaction_trigger` @@ -1332,12 +1323,10 @@ bool LevelCompactionBuilder::PickFileToCompact() { const std::vector& level_files = vstorage_->LevelFiles(start_level_); - // record the first file that is not yet compacted - int nextIndex = -1; - - for (unsigned int i = vstorage_->NextCompactionIndex(start_level_); - i < file_size.size(); i++) { - int index = file_size[i]; + unsigned int cmp_idx; + for (cmp_idx = vstorage_->NextCompactionIndex(start_level_); + cmp_idx < file_size.size(); cmp_idx++) { + int index = file_size[cmp_idx]; auto* f = level_files[index]; // do not pick a file to compact if it is being compacted @@ -1346,27 +1335,42 @@ bool LevelCompactionBuilder::PickFileToCompact() { continue; } - // remember the startIndex for the next call to PickCompaction - if (nextIndex == -1) { - nextIndex = i; + start_level_inputs_.files.push_back(f); + start_level_inputs_.level = start_level_; + if (!compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_, + &start_level_inputs_) || + compaction_picker_->FilesRangeOverlapWithCompaction( + {start_level_inputs_}, output_level_)) { + // A locked (pending compaction) input-level file was pulled in due to + // user-key overlap. 
+ start_level_inputs_.clear(); + continue; } - // Do not pick this file if its parents at level+1 are being compacted. - // Maybe we can avoid redoing this work in SetupOtherInputs - parent_index_ = -1; - if (compaction_picker_->IsRangeInCompaction(vstorage_, &f->smallest, - &f->largest, output_level_, - &parent_index_)) { + // Now that input level is fully expanded, we check whether any output files + // are locked due to pending compaction. + // + // Note we rely on ExpandInputsToCleanCut() to tell us whether any output- + // level files are locked, not just the extra ones pulled in for user-key + // overlap. + InternalKey smallest, largest; + compaction_picker_->GetRange(start_level_inputs_, &smallest, &largest); + CompactionInputFiles output_level_inputs; + output_level_inputs.level = output_level_; + vstorage_->GetOverlappingInputs(output_level_, &smallest, &largest, + &output_level_inputs.files); + if (!output_level_inputs.empty() && + !compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_, + &output_level_inputs)) { + start_level_inputs_.clear(); continue; } - start_level_inputs_.files.push_back(f); - start_level_inputs_.level = start_level_; base_index_ = index; break; } // store where to start the iteration in the next call to PickCompaction - vstorage_->SetNextCompactionIndex(start_level_, nextIndex); + vstorage_->SetNextCompactionIndex(start_level_, cmp_idx); return start_level_inputs_.size() > 0; } @@ -1438,19 +1442,22 @@ Compaction* FIFOCompactionPicker::PickTTLCompaction( inputs.emplace_back(); inputs[0].level = 0; - for (auto ritr = level_files.rbegin(); ritr != level_files.rend(); ++ritr) { - auto f = *ritr; - if (f->fd.table_reader != nullptr && - f->fd.table_reader->GetTableProperties() != nullptr) { - auto creation_time = - f->fd.table_reader->GetTableProperties()->creation_time; - if (creation_time == 0 || - creation_time >= - (current_time - ioptions_.compaction_options_fifo.ttl)) { - break; + // avoid underflow + if (current_time > 
ioptions_.compaction_options_fifo.ttl) { + for (auto ritr = level_files.rbegin(); ritr != level_files.rend(); ++ritr) { + auto f = *ritr; + if (f->fd.table_reader != nullptr && + f->fd.table_reader->GetTableProperties() != nullptr) { + auto creation_time = + f->fd.table_reader->GetTableProperties()->creation_time; + if (creation_time == 0 || + creation_time >= + (current_time - ioptions_.compaction_options_fifo.ttl)) { + break; + } + total_size -= f->compensated_file_size; + inputs[0].files.push_back(f); } - total_size -= f->compensated_file_size; - inputs[0].files.push_back(f); } } diff --git a/db/compaction_picker_test.cc b/db/compaction_picker_test.cc index 2e34e9ab2..bba2d073d 100644 --- a/db/compaction_picker_test.cc +++ b/db/compaction_picker_test.cc @@ -512,7 +512,7 @@ TEST_F(CompactionPickerTest, NeedsCompactionFIFO) { kFileSize, 0, i * 100, i * 100 + 99); current_size += kFileSize; UpdateVersionStorageInfo(); - ASSERT_EQ(level_compaction_picker.NeedsCompaction(vstorage_.get()), + ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), vstorage_->CompactionScore(0) >= 1); } } @@ -852,6 +852,80 @@ TEST_F(CompactionPickerTest, OverlappingUserKeys9) { ASSERT_EQ(8U, compaction->input(1, 1)->fd.GetNumber()); } +TEST_F(CompactionPickerTest, OverlappingUserKeys10) { + // Locked file encountered when pulling in extra input-level files with same + // user keys. Verify we pick the next-best file from the same input level. + NewVersionStorage(6, kCompactionStyleLevel); + mutable_cf_options_.max_compaction_bytes = 100000000000u; + + // file_number 2U is largest and thus first choice. But it overlaps with + // file_number 1U which is being compacted. So instead we pick the next- + // biggest file, 3U, which is eligible for compaction. 
+ Add(1 /* level */, 1U /* file_number */, "100" /* smallest */, + "150" /* largest */, 1U /* file_size */); + file_map_[1U].first->being_compacted = true; + Add(1 /* level */, 2U /* file_number */, "150" /* smallest */, + "200" /* largest */, 1000000000U /* file_size */, 0 /* smallest_seq */, + 0 /* largest_seq */); + Add(1 /* level */, 3U /* file_number */, "201" /* smallest */, + "250" /* largest */, 900000000U /* file_size */); + Add(2 /* level */, 4U /* file_number */, "100" /* smallest */, + "150" /* largest */, 1U /* file_size */); + Add(2 /* level */, 5U /* file_number */, "151" /* smallest */, + "200" /* largest */, 1U /* file_size */); + Add(2 /* level */, 6U /* file_number */, "201" /* smallest */, + "250" /* largest */, 1U /* file_size */); + + UpdateVersionStorageInfo(); + + std::unique_ptr compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(2U, compaction->num_input_levels()); + ASSERT_EQ(1U, compaction->num_input_files(0)); + ASSERT_EQ(1U, compaction->num_input_files(1)); + ASSERT_EQ(3U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(6U, compaction->input(1, 0)->fd.GetNumber()); +} + +TEST_F(CompactionPickerTest, OverlappingUserKeys11) { + // Locked file encountered when pulling in extra output-level files with same + // user keys. Expected to skip that compaction and pick the next-best choice. + NewVersionStorage(6, kCompactionStyleLevel); + mutable_cf_options_.max_compaction_bytes = 100000000000u; + + // score(L1) = 3.7 + // score(L2) = 1.85 + // There is no eligible file in L1 to compact since both candidates pull in + // file_number 5U, which overlaps with a file pending compaction (6U). The + // first eligible compaction is from L2->L3. 
+ Add(1 /* level */, 2U /* file_number */, "151" /* smallest */, + "200" /* largest */, 1000000000U /* file_size */); + Add(1 /* level */, 3U /* file_number */, "201" /* smallest */, + "250" /* largest */, 1U /* file_size */); + Add(2 /* level */, 4U /* file_number */, "100" /* smallest */, + "149" /* largest */, 5000000000U /* file_size */); + Add(2 /* level */, 5U /* file_number */, "150" /* smallest */, + "201" /* largest */, 1U /* file_size */); + Add(2 /* level */, 6U /* file_number */, "201" /* smallest */, + "249" /* largest */, 1U /* file_size */, 0 /* smallest_seq */, + 0 /* largest_seq */); + file_map_[6U].first->being_compacted = true; + Add(3 /* level */, 7U /* file_number */, "100" /* smallest */, + "149" /* largest */, 1U /* file_size */); + + UpdateVersionStorageInfo(); + + std::unique_ptr compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(2U, compaction->num_input_levels()); + ASSERT_EQ(1U, compaction->num_input_files(0)); + ASSERT_EQ(1U, compaction->num_input_files(1)); + ASSERT_EQ(4U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(7U, compaction->input(1, 0)->fd.GetNumber()); +} + TEST_F(CompactionPickerTest, NotScheduleL1IfL0WithHigherPri1) { NewVersionStorage(6, kCompactionStyleLevel); mutable_cf_options_.level0_file_num_compaction_trigger = 2; @@ -1316,6 +1390,49 @@ TEST_F(CompactionPickerTest, IsTrivialMoveOff) { ASSERT_FALSE(compaction->IsTrivialMove()); } +TEST_F(CompactionPickerTest, CacheNextCompactionIndex) { + NewVersionStorage(6, kCompactionStyleLevel); + mutable_cf_options_.max_compaction_bytes = 100000000000u; + + Add(1 /* level */, 1U /* file_number */, "100" /* smallest */, + "149" /* largest */, 1000000000U /* file_size */); + file_map_[1U].first->being_compacted = true; + Add(1 /* level */, 2U /* file_number */, "150" /* smallest */, + "199" /* largest */, 900000000U /* file_size */); + Add(1 /* level 
*/, 3U /* file_number */, "200" /* smallest */, + "249" /* largest */, 800000000U /* file_size */); + Add(1 /* level */, 4U /* file_number */, "250" /* smallest */, + "299" /* largest */, 700000000U /* file_size */); + Add(2 /* level */, 5U /* file_number */, "150" /* smallest */, + "199" /* largest */, 1U /* file_size */); + file_map_[5U].first->being_compacted = true; + + UpdateVersionStorageInfo(); + + std::unique_ptr compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(1U, compaction->num_input_levels()); + ASSERT_EQ(1U, compaction->num_input_files(0)); + ASSERT_EQ(0U, compaction->num_input_files(1)); + ASSERT_EQ(3U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(2, vstorage_->NextCompactionIndex(1 /* level */)); + + compaction.reset(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(1U, compaction->num_input_levels()); + ASSERT_EQ(1U, compaction->num_input_files(0)); + ASSERT_EQ(0U, compaction->num_input_files(1)); + ASSERT_EQ(4U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(3, vstorage_->NextCompactionIndex(1 /* level */)); + + compaction.reset(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction.get() == nullptr); + ASSERT_EQ(4, vstorage_->NextCompactionIndex(1 /* level */)); +} + } // namespace rocksdb int main(int argc, char** argv) { diff --git a/db/compaction_picker_universal.cc b/db/compaction_picker_universal.cc index ce480267c..14533fbcd 100644 --- a/db/compaction_picker_universal.cc +++ b/db/compaction_picker_universal.cc @@ -373,6 +373,7 @@ Compaction* UniversalCompactionPicker::PickCompaction( c->inputs(0)->size()); RegisterCompaction(c); + vstorage->ComputeCompactionScore(ioptions_, mutable_cf_options); 
TEST_SYNC_POINT_CALLBACK("UniversalCompactionPicker::PickCompaction:Return", c); diff --git a/db/convenience.cc b/db/convenience.cc index 6568b1fff..8ee31caca 100644 --- a/db/convenience.cc +++ b/db/convenience.cc @@ -9,19 +9,50 @@ #include "rocksdb/convenience.h" #include "db/db_impl.h" +#include "util/cast_util.h" namespace rocksdb { void CancelAllBackgroundWork(DB* db, bool wait) { - (dynamic_cast(db->GetRootDB()))->CancelAllBackgroundWork(wait); + (static_cast_with_check(db->GetRootDB())) + ->CancelAllBackgroundWork(wait); } Status DeleteFilesInRange(DB* db, ColumnFamilyHandle* column_family, const Slice* begin, const Slice* end) { - return (dynamic_cast(db->GetRootDB())) + return (static_cast_with_check(db->GetRootDB())) ->DeleteFilesInRange(column_family, begin, end); } +Status VerifySstFileChecksum(const Options& options, + const EnvOptions& env_options, + const std::string& file_path) { + unique_ptr file; + uint64_t file_size; + InternalKeyComparator internal_comparator(options.comparator); + ImmutableCFOptions ioptions(options); + + Status s = ioptions.env->NewRandomAccessFile(file_path, &file, env_options); + if (s.ok()) { + s = ioptions.env->GetFileSize(file_path, &file_size); + } else { + return s; + } + unique_ptr table_reader; + std::unique_ptr file_reader( + new RandomAccessFileReader(std::move(file), file_path)); + s = ioptions.table_factory->NewTableReader( + TableReaderOptions(ioptions, env_options, internal_comparator, + false /* skip_filters */, -1 /* level */), + std::move(file_reader), file_size, &table_reader, + false /* prefetch_index_and_filter_in_cache */); + if (!s.ok()) { + return s; + } + s = table_reader->VerifyChecksum(); + return s; +} + } // namespace rocksdb #endif // ROCKSDB_LITE diff --git a/db/corruption_test.cc b/db/corruption_test.cc index f9ab8302c..56e157832 100644 --- a/db/corruption_test.cc +++ b/db/corruption_test.cc @@ -20,6 +20,7 @@ #include "db/log_format.h" #include "db/version_set.h" #include "rocksdb/cache.h" 
+#include "rocksdb/convenience.h" #include "rocksdb/env.h" #include "rocksdb/table.h" #include "rocksdb/write_batch.h" @@ -152,7 +153,7 @@ class CorruptionTest : public testing::Test { struct stat sbuf; if (stat(fname.c_str(), &sbuf) != 0) { const char* msg = strerror(errno); - ASSERT_TRUE(false) << fname << ": " << msg; + FAIL() << fname << ": " << msg; } if (offset < 0) { @@ -179,6 +180,9 @@ class CorruptionTest : public testing::Test { } s = WriteStringToFile(Env::Default(), contents, fname); ASSERT_TRUE(s.ok()) << s.ToString(); + Options options; + EnvOptions env_options; + ASSERT_NOK(VerifySstFileChecksum(options, env_options, fname)); } void Corrupt(FileType filetype, int offset, int bytes_to_corrupt) { @@ -213,7 +217,7 @@ class CorruptionTest : public testing::Test { return; } } - ASSERT_TRUE(false) << "no file found at level"; + FAIL() << "no file found at level"; } @@ -312,6 +316,7 @@ TEST_F(CorruptionTest, TableFile) { Corrupt(kTableFile, 100, 1); Check(99, 99); + ASSERT_NOK(dbi->VerifyChecksum()); } TEST_F(CorruptionTest, TableFileIndexData) { @@ -327,9 +332,11 @@ TEST_F(CorruptionTest, TableFileIndexData) { // corrupt an index block of an entire file Corrupt(kTableFile, -2000, 500); Reopen(); + dbi = reinterpret_cast(db_); // one full file should be readable, since only one was corrupted // the other file should be fully non-readable, since index was corrupted Check(5000, 5000); + ASSERT_NOK(dbi->VerifyChecksum()); } TEST_F(CorruptionTest, MissingDescriptor) { @@ -389,10 +396,12 @@ TEST_F(CorruptionTest, CompactionInputError) { Corrupt(kTableFile, 100, 1); Check(9, 9); + ASSERT_NOK(dbi->VerifyChecksum()); // Force compactions by writing lots of values Build(10000); Check(10000, 10000); + ASSERT_NOK(dbi->VerifyChecksum()); } TEST_F(CorruptionTest, CompactionInputErrorParanoid) { @@ -424,6 +433,7 @@ TEST_F(CorruptionTest, CompactionInputErrorParanoid) { CorruptTableFileAtLevel(0, 100, 1); Check(9, 9); + ASSERT_NOK(dbi->VerifyChecksum()); // Write must 
eventually fail because of corrupted table Status s; @@ -445,6 +455,7 @@ TEST_F(CorruptionTest, UnrelatedKeys) { DBImpl* dbi = reinterpret_cast(db_); dbi->TEST_FlushMemTable(); Corrupt(kTableFile, 100, 1); + ASSERT_NOK(dbi->VerifyChecksum()); std::string tmp1, tmp2; ASSERT_OK(db_->Put(WriteOptions(), Key(1000, &tmp1), Value(1000, &tmp2))); diff --git a/db/db_basic_test.cc b/db/db_basic_test.cc index 3d732f573..654a457ef 100644 --- a/db/db_basic_test.cc +++ b/db/db_basic_test.cc @@ -175,6 +175,8 @@ TEST_F(DBBasicTest, LevelLimitReopen) { int i = 0; while (NumTableFilesAtLevel(2, 1) == 0) { ASSERT_OK(Put(1, Key(i++), value)); + dbfull()->TEST_WaitForFlushMemTable(); + dbfull()->TEST_WaitForCompact(); } options.num_levels = 1; @@ -358,7 +360,6 @@ TEST_F(DBBasicTest, FLUSH) { WriteOptions writeOpt = WriteOptions(); writeOpt.disableWAL = true; SetPerfLevel(kEnableTime); - ; ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v1")); // this will now also flush the last 2 writes ASSERT_OK(Flush(1)); @@ -367,6 +368,7 @@ TEST_F(DBBasicTest, FLUSH) { get_perf_context()->Reset(); Get(1, "foo"); ASSERT_TRUE((int)get_perf_context()->get_from_output_files_time > 0); + ASSERT_EQ(2, (int)get_perf_context()->get_read_bytes); ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); ASSERT_EQ("v1", Get(1, "foo")); @@ -723,6 +725,7 @@ TEST_F(DBBasicTest, FlushOneColumnFamily) { TEST_F(DBBasicTest, MultiGetSimple) { do { CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + SetPerfLevel(kEnableCount); ASSERT_OK(Put(1, "k1", "v1")); ASSERT_OK(Put(1, "k2", "v2")); ASSERT_OK(Put(1, "k3", "v3")); @@ -736,12 +739,15 @@ TEST_F(DBBasicTest, MultiGetSimple) { std::vector values(20, "Temporary data to be overwritten"); std::vector cfs(keys.size(), handles_[1]); + get_perf_context()->Reset(); std::vector s = db_->MultiGet(ReadOptions(), cfs, keys, &values); ASSERT_EQ(values.size(), keys.size()); ASSERT_EQ(values[0], "v1"); ASSERT_EQ(values[1], "v2"); ASSERT_EQ(values[2], "v3"); 
ASSERT_EQ(values[4], "v5"); + // four kv pairs * two bytes per value + ASSERT_EQ(8, (int)get_perf_context()->multiget_read_bytes); ASSERT_OK(s[0]); ASSERT_OK(s[1]); @@ -749,6 +755,7 @@ TEST_F(DBBasicTest, MultiGetSimple) { ASSERT_TRUE(s[3].IsNotFound()); ASSERT_OK(s[4]); ASSERT_TRUE(s[5].IsNotFound()); + SetPerfLevel(kDisable); } while (ChangeCompactOptions()); } @@ -785,36 +792,30 @@ TEST_F(DBBasicTest, MultiGetEmpty) { TEST_F(DBBasicTest, ChecksumTest) { BlockBasedTableOptions table_options; Options options = CurrentOptions(); + // change when new checksum type added + int max_checksum = static_cast(kxxHash); + const int kNumPerFile = 2; + + // generate one table with each type of checksum + for (int i = 0; i <= max_checksum; ++i) { + table_options.checksum = static_cast(i); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + Reopen(options); + for (int j = 0; j < kNumPerFile; ++j) { + ASSERT_OK(Put(Key(i * kNumPerFile + j), Key(i * kNumPerFile + j))); + } + ASSERT_OK(Flush()); + } - table_options.checksum = kCRC32c; - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - Reopen(options); - ASSERT_OK(Put("a", "b")); - ASSERT_OK(Put("c", "d")); - ASSERT_OK(Flush()); // table with crc checksum - - table_options.checksum = kxxHash; - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - Reopen(options); - ASSERT_OK(Put("e", "f")); - ASSERT_OK(Put("g", "h")); - ASSERT_OK(Flush()); // table with xxhash checksum - - table_options.checksum = kCRC32c; - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - Reopen(options); - ASSERT_EQ("b", Get("a")); - ASSERT_EQ("d", Get("c")); - ASSERT_EQ("f", Get("e")); - ASSERT_EQ("h", Get("g")); - - table_options.checksum = kCRC32c; - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - Reopen(options); - ASSERT_EQ("b", Get("a")); - ASSERT_EQ("d", Get("c")); - ASSERT_EQ("f", Get("e")); - ASSERT_EQ("h", Get("g")); + // verify data with 
each type of checksum + for (int i = 0; i <= kxxHash; ++i) { + table_options.checksum = static_cast(i); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + Reopen(options); + for (int j = 0; j < (max_checksum + 1) * kNumPerFile; ++j) { + ASSERT_EQ(Key(j), Get(Key(j))); + } + } } // On Windows you can have either memory mapped file or a file diff --git a/db/db_blob_index_test.cc b/db/db_blob_index_test.cc new file mode 100644 index 000000000..e71b511df --- /dev/null +++ b/db/db_blob_index_test.cc @@ -0,0 +1,409 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include +#include +#include +#include + +#include "db/column_family.h" +#include "db/db_iter.h" +#include "db/db_test_util.h" +#include "db/dbformat.h" +#include "db/write_batch_internal.h" +#include "port/port.h" +#include "port/stack_trace.h" +#include "util/string_util.h" +#include "utilities/merge_operators.h" + +namespace rocksdb { + +// kTypeBlobIndex is a value type used by BlobDB only. The base rocksdb +// should accept the value type on write, and report not supported value +// for reads, unless caller request for it explicitly. The base rocksdb +// doesn't understand format of actual blob index (the value). 
+class DBBlobIndexTest : public DBTestBase { + public: + enum Tier { + kMemtable = 0, + kImmutableMemtables = 1, + kL0SstFile = 2, + kLnSstFile = 3, + }; + const std::vector kAllTiers = {Tier::kMemtable, + Tier::kImmutableMemtables, + Tier::kL0SstFile, Tier::kLnSstFile}; + + DBBlobIndexTest() : DBTestBase("/db_blob_index_test") {} + + ColumnFamilyHandle* cfh() { return dbfull()->DefaultColumnFamily(); } + + ColumnFamilyData* cfd() { + return reinterpret_cast(cfh())->cfd(); + } + + Status PutBlobIndex(WriteBatch* batch, const Slice& key, + const Slice& blob_index) { + return WriteBatchInternal::PutBlobIndex(batch, cfd()->GetID(), key, + blob_index); + } + + Status Write(WriteBatch* batch) { + return dbfull()->Write(WriteOptions(), batch); + } + + std::string GetImpl(const Slice& key, bool* is_blob_index = nullptr, + const Snapshot* snapshot = nullptr) { + ReadOptions read_options; + read_options.snapshot = snapshot; + PinnableSlice value; + auto s = dbfull()->GetImpl(read_options, cfh(), key, &value, + nullptr /*value_found*/, is_blob_index); + if (s.IsNotFound()) { + return "NOT_FOUND"; + } + if (s.IsNotSupported()) { + return "NOT_SUPPORTED"; + } + if (!s.ok()) { + return s.ToString(); + } + return value.ToString(); + } + + std::string GetBlobIndex(const Slice& key, + const Snapshot* snapshot = nullptr) { + bool is_blob_index = false; + std::string value = GetImpl(key, &is_blob_index, snapshot); + if (!is_blob_index) { + return "NOT_BLOB"; + } + return value; + } + + ArenaWrappedDBIter* GetBlobIterator() { + return dbfull()->NewIteratorImpl(ReadOptions(), cfd(), + dbfull()->GetLatestSequenceNumber(), + true /*allow_blob*/); + } + + Options GetTestOptions() { + Options options; + options.create_if_missing = true; + options.num_levels = 2; + options.disable_auto_compactions = true; + // Disable auto flushes. 
+ options.max_write_buffer_number = 10; + options.min_write_buffer_number_to_merge = 10; + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + return options; + } + + void MoveDataTo(Tier tier) { + switch (tier) { + case Tier::kMemtable: + break; + case Tier::kImmutableMemtables: + ASSERT_OK(dbfull()->TEST_SwitchMemtable()); + break; + case Tier::kL0SstFile: + ASSERT_OK(Flush()); + break; + case Tier::kLnSstFile: + ASSERT_OK(Flush()); + ASSERT_OK(Put("a", "dummy")); + ASSERT_OK(Put("z", "dummy")); + ASSERT_OK(Flush()); + ASSERT_OK( + dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); +#ifndef ROCKSDB_LITE + ASSERT_EQ("0,1", FilesPerLevel()); +#endif // !ROCKSDB_LITE + break; + } + } +}; + +// Should be able to write kTypeBlobIndex to memtables and SST files. +TEST_F(DBBlobIndexTest, Write) { + for (auto tier : kAllTiers) { + DestroyAndReopen(GetTestOptions()); + for (int i = 1; i <= 5; i++) { + std::string index = ToString(i); + WriteBatch batch; + ASSERT_OK(PutBlobIndex(&batch, "key" + index, "blob" + index)); + ASSERT_OK(Write(&batch)); + } + MoveDataTo(tier); + for (int i = 1; i <= 5; i++) { + std::string index = ToString(i); + ASSERT_EQ("blob" + index, GetBlobIndex("key" + index)); + } + } +} + +// Get should be able to return blob index if is_blob_index is provided, +// otherwise return Status::NotSupported status. 
+TEST_F(DBBlobIndexTest, Get) { + for (auto tier : kAllTiers) { + DestroyAndReopen(GetTestOptions()); + WriteBatch batch; + ASSERT_OK(batch.Put("key", "value")); + ASSERT_OK(PutBlobIndex(&batch, "blob_key", "blob_index")); + ASSERT_OK(Write(&batch)); + MoveDataTo(tier); + // Verify normal value + bool is_blob_index = false; + PinnableSlice value; + ASSERT_EQ("value", Get("key")); + ASSERT_EQ("value", GetImpl("key")); + ASSERT_EQ("value", GetImpl("key", &is_blob_index)); + ASSERT_FALSE(is_blob_index); + // Verify blob index + ASSERT_TRUE(Get("blob_key", &value).IsNotSupported()); + ASSERT_EQ("NOT_SUPPORTED", GetImpl("blob_key")); + ASSERT_EQ("blob_index", GetImpl("blob_key", &is_blob_index)); + ASSERT_TRUE(is_blob_index); + } +} + +// Get should NOT return Status::NotSupported if blob index is updated with +// a normal value. +TEST_F(DBBlobIndexTest, Updated) { + for (auto tier : kAllTiers) { + DestroyAndReopen(GetTestOptions()); + WriteBatch batch; + for (int i = 0; i < 10; i++) { + ASSERT_OK(PutBlobIndex(&batch, "key" + ToString(i), "blob_index")); + } + ASSERT_OK(Write(&batch)); + // Avoid blob values from being purged. 
+ const Snapshot* snapshot = dbfull()->GetSnapshot(); + ASSERT_OK(Put("key1", "new_value")); + ASSERT_OK(Merge("key2", "a")); + ASSERT_OK(Merge("key2", "b")); + ASSERT_OK(Merge("key2", "c")); + ASSERT_OK(Delete("key3")); + ASSERT_OK(SingleDelete("key4")); + ASSERT_OK(Delete("key5")); + ASSERT_OK(Merge("key5", "a")); + ASSERT_OK(Merge("key5", "b")); + ASSERT_OK(Merge("key5", "c")); + ASSERT_OK(dbfull()->DeleteRange(WriteOptions(), cfh(), "key6", "key9")); + MoveDataTo(tier); + for (int i = 0; i < 10; i++) { + ASSERT_EQ("blob_index", GetBlobIndex("key" + ToString(i), snapshot)); + } + ASSERT_EQ("new_value", Get("key1")); + ASSERT_EQ("NOT_SUPPORTED", GetImpl("key2")); + ASSERT_EQ("NOT_FOUND", Get("key3")); + ASSERT_EQ("NOT_FOUND", Get("key4")); + ASSERT_EQ("a,b,c", GetImpl("key5")); + for (int i = 6; i < 9; i++) { + ASSERT_EQ("NOT_FOUND", Get("key" + ToString(i))); + } + ASSERT_EQ("blob_index", GetBlobIndex("key9")); + dbfull()->ReleaseSnapshot(snapshot); + } +} + +// Iterator should get blob value if allow_blob flag is set, +// otherwise return Status::NotSupported status. 
+TEST_F(DBBlobIndexTest, Iterate) { + const std::vector> data = { + /*00*/ {kTypeValue}, + /*01*/ {kTypeBlobIndex}, + /*02*/ {kTypeValue}, + /*03*/ {kTypeBlobIndex, kTypeValue}, + /*04*/ {kTypeValue}, + /*05*/ {kTypeValue, kTypeBlobIndex}, + /*06*/ {kTypeValue}, + /*07*/ {kTypeDeletion, kTypeBlobIndex}, + /*08*/ {kTypeValue}, + /*09*/ {kTypeSingleDeletion, kTypeBlobIndex}, + /*10*/ {kTypeValue}, + /*11*/ {kTypeMerge, kTypeMerge, kTypeMerge, kTypeBlobIndex}, + /*12*/ {kTypeValue}, + /*13*/ + {kTypeMerge, kTypeMerge, kTypeMerge, kTypeDeletion, kTypeBlobIndex}, + /*14*/ {kTypeValue}, + /*15*/ {kTypeBlobIndex}, + /*16*/ {kTypeValue}, + }; + + auto get_key = [](int index) { + char buf[20]; + snprintf(buf, sizeof(buf), "%02d", index); + return "key" + std::string(buf); + }; + + auto get_value = [&](int index, int version) { + return get_key(index) + "_value" + ToString(version); + }; + + auto check_iterator = [&](Iterator* iterator, Status::Code expected_status, + const Slice& expected_value) { + ASSERT_EQ(expected_status, iterator->status().code()); + if (expected_status == Status::kOk) { + ASSERT_TRUE(iterator->Valid()); + ASSERT_EQ(expected_value, iterator->value()); + } else { + ASSERT_FALSE(iterator->Valid()); + } + }; + + auto create_normal_iterator = [&]() -> Iterator* { + return dbfull()->NewIterator(ReadOptions()); + }; + + auto create_blob_iterator = [&]() -> Iterator* { return GetBlobIterator(); }; + + auto check_is_blob = [&](bool is_blob) { + return [is_blob](Iterator* iterator) { + ASSERT_EQ(is_blob, + reinterpret_cast(iterator)->IsBlob()); + }; + }; + + auto verify = [&](int index, Status::Code expected_status, + const Slice& forward_value, const Slice& backward_value, + std::function create_iterator, + std::function extra_check = nullptr) { + // Seek + auto* iterator = create_iterator(); + ASSERT_OK(iterator->Refresh()); + iterator->Seek(get_key(index)); + check_iterator(iterator, expected_status, forward_value); + if (extra_check) { + 
extra_check(iterator); + } + delete iterator; + + // Next + iterator = create_iterator(); + ASSERT_OK(iterator->Refresh()); + iterator->Seek(get_key(index - 1)); + ASSERT_TRUE(iterator->Valid()); + iterator->Next(); + check_iterator(iterator, expected_status, forward_value); + if (extra_check) { + extra_check(iterator); + } + delete iterator; + + // SeekForPrev + iterator = create_iterator(); + ASSERT_OK(iterator->Refresh()); + iterator->SeekForPrev(get_key(index)); + check_iterator(iterator, expected_status, backward_value); + if (extra_check) { + extra_check(iterator); + } + delete iterator; + + // Prev + iterator = create_iterator(); + iterator->Seek(get_key(index + 1)); + ASSERT_TRUE(iterator->Valid()); + iterator->Prev(); + check_iterator(iterator, expected_status, backward_value); + if (extra_check) { + extra_check(iterator); + } + delete iterator; + }; + + for (auto tier : {Tier::kMemtable} /*kAllTiers*/) { + // Avoid values from being purged. + std::vector snapshots; + DestroyAndReopen(GetTestOptions()); + + // fill data + for (int i = 0; i < static_cast(data.size()); i++) { + for (int j = static_cast(data[i].size()) - 1; j >= 0; j--) { + std::string key = get_key(i); + std::string value = get_value(i, j); + WriteBatch batch; + switch (data[i][j]) { + case kTypeValue: + ASSERT_OK(Put(key, value)); + break; + case kTypeDeletion: + ASSERT_OK(Delete(key)); + break; + case kTypeSingleDeletion: + ASSERT_OK(SingleDelete(key)); + break; + case kTypeMerge: + ASSERT_OK(Merge(key, value)); + break; + case kTypeBlobIndex: + ASSERT_OK(PutBlobIndex(&batch, key, value)); + ASSERT_OK(Write(&batch)); + break; + default: + assert(false); + }; + } + snapshots.push_back(dbfull()->GetSnapshot()); + } + ASSERT_OK( + dbfull()->DeleteRange(WriteOptions(), cfh(), get_key(15), get_key(16))); + snapshots.push_back(dbfull()->GetSnapshot()); + MoveDataTo(tier); + + // Normal iterator + verify(1, Status::kNotSupported, "", "", create_normal_iterator); + verify(3, Status::kNotSupported, 
"", "", create_normal_iterator); + verify(5, Status::kOk, get_value(5, 0), get_value(5, 0), + create_normal_iterator); + verify(7, Status::kOk, get_value(8, 0), get_value(6, 0), + create_normal_iterator); + verify(9, Status::kOk, get_value(10, 0), get_value(8, 0), + create_normal_iterator); + verify(11, Status::kNotSupported, "", "", create_normal_iterator); + verify(13, Status::kOk, + get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0), + get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0), + create_normal_iterator); + verify(15, Status::kOk, get_value(16, 0), get_value(14, 0), + create_normal_iterator); + + // Iterator with blob support + verify(1, Status::kOk, get_value(1, 0), get_value(1, 0), + create_blob_iterator, check_is_blob(true)); + verify(3, Status::kOk, get_value(3, 0), get_value(3, 0), + create_blob_iterator, check_is_blob(true)); + verify(5, Status::kOk, get_value(5, 0), get_value(5, 0), + create_blob_iterator, check_is_blob(false)); + verify(7, Status::kOk, get_value(8, 0), get_value(6, 0), + create_blob_iterator, check_is_blob(false)); + verify(9, Status::kOk, get_value(10, 0), get_value(8, 0), + create_blob_iterator, check_is_blob(false)); + verify(11, Status::kNotSupported, "", "", create_blob_iterator); + verify(13, Status::kOk, + get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0), + get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0), + create_blob_iterator, check_is_blob(false)); + verify(15, Status::kOk, get_value(16, 0), get_value(14, 0), + create_blob_iterator, check_is_blob(false)); + + for (auto* snapshot : snapshots) { + dbfull()->ReleaseSnapshot(snapshot); + } + } +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + rocksdb::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/db/db_block_cache_test.cc b/db/db_block_cache_test.cc index 317597cb6..169cadc85 100644 --- a/db/db_block_cache_test.cc +++ 
b/db/db_block_cache_test.cc @@ -497,7 +497,7 @@ TEST_F(DBBlockCacheTest, CompressedCache) { options.compression = kNoCompression; break; default: - ASSERT_TRUE(false); + FAIL(); } CreateAndReopenWithCF({"pikachu"}, options); // default column family doesn't have block cache @@ -560,7 +560,7 @@ TEST_F(DBBlockCacheTest, CompressedCache) { ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_HIT), 0); break; default: - ASSERT_TRUE(false); + FAIL(); } options.create_if_missing = true; diff --git a/db/db_compaction_test.cc b/db/db_compaction_test.cc index bca188a98..93fbc0d37 100644 --- a/db/db_compaction_test.cc +++ b/db/db_compaction_test.cc @@ -1439,6 +1439,60 @@ TEST_F(DBCompactionTest, DeleteFileRange) { ASSERT_GT(old_num_files, new_num_files); } +TEST_F(DBCompactionTest, DeleteFileRangeFileEndpointsOverlapBug) { + // regression test for #2833: groups of files whose user-keys overlap at the + // endpoints could be split by `DeleteFilesInRange`. This caused old data to + // reappear, either because a new version of the key was removed, or a range + // deletion was partially dropped. It could also cause non-overlapping + // invariant to be violated if the files dropped by DeleteFilesInRange were + // a subset of files that a range deletion spans. + const int kNumL0Files = 2; + const int kValSize = 8 << 10; // 8KB + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = kNumL0Files; + options.target_file_size_base = 1 << 10; // 1KB + DestroyAndReopen(options); + + // The snapshot prevents key 1 from having its old version dropped. The low + // `target_file_size_base` ensures two keys will be in each output file. + const Snapshot* snapshot = nullptr; + Random rnd(301); + // The value indicates which flush the key belonged to, which is enough + // for us to determine the keys' relative ages. 
After L0 flushes finish, + // files look like: + // + // File 0: 0 -> vals[0], 1 -> vals[0] + // File 1: 1 -> vals[1], 2 -> vals[1] + // + // Then L0->L1 compaction happens, which outputs keys as follows: + // + // File 0: 0 -> vals[0], 1 -> vals[1] + // File 1: 1 -> vals[0], 2 -> vals[1] + // + // DeleteFilesInRange shouldn't be allowed to drop just file 0, as that + // would cause `1 -> vals[0]` (an older key) to reappear. + std::string vals[kNumL0Files]; + for (int i = 0; i < kNumL0Files; ++i) { + vals[i] = RandomString(&rnd, kValSize); + Put(Key(i), vals[i]); + Put(Key(i + 1), vals[i]); + Flush(); + if (i == 0) { + snapshot = db_->GetSnapshot(); + } + } + dbfull()->TEST_WaitForCompact(); + + // Verify `DeleteFilesInRange` can't drop only file 0 which would cause + // "1 -> vals[0]" to reappear. + Slice begin = Key(0); + Slice end = Key(1); + ASSERT_OK(DeleteFilesInRange(db_, db_->DefaultColumnFamily(), &begin, &end)); + ASSERT_EQ(vals[1], Get(Key(1))); + + db_->ReleaseSnapshot(snapshot); +} + TEST_P(DBCompactionTestWithParam, TrivialMoveToLastLevelWithFiles) { int32_t trivial_move = 0; int32_t non_trivial_move = 0; @@ -2684,6 +2738,46 @@ TEST_P(DBCompactionTestWithParam, IntraL0CompactionDoesNotObsoleteDeletions) { ASSERT_TRUE(db_->Get(roptions, Key(0), &result).IsNotFound()); } +TEST_F(DBCompactionTest, OptimizedDeletionObsoleting) { + // Deletions can be dropped when compacted to non-last level if they fall + // outside the lower-level files' key-ranges. + const int kNumL0Files = 4; + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = kNumL0Files; + options.statistics = rocksdb::CreateDBStatistics(); + DestroyAndReopen(options); + + // put key 1 and 3 in separate L1, L2 files. + // So key 0, 2, and 4+ fall outside these levels' key-ranges. 
+ for (int level = 2; level >= 1; --level) { + for (int i = 0; i < 2; ++i) { + Put(Key(2 * i + 1), "val"); + Flush(); + } + MoveFilesToLevel(level); + ASSERT_EQ(2, NumTableFilesAtLevel(level)); + } + + // Delete keys in range [1, 4]. These L0 files will be compacted with L1: + // - Tombstones for keys 2 and 4 can be dropped early. + // - Tombstones for keys 1 and 3 must be kept due to L2 files' key-ranges. + for (int i = 0; i < kNumL0Files; ++i) { + Put(Key(0), "val"); // sentinel to prevent trivial move + Delete(Key(i + 1)); + Flush(); + } + dbfull()->TEST_WaitForCompact(); + + for (int i = 0; i < kNumL0Files; ++i) { + std::string value; + ASSERT_TRUE(db_->Get(ReadOptions(), Key(i + 1), &value).IsNotFound()); + } + ASSERT_EQ(2, options.statistics->getTickerCount( + COMPACTION_OPTIMIZED_DEL_DROP_OBSOLETE)); + ASSERT_EQ(2, + options.statistics->getTickerCount(COMPACTION_KEY_DROP_OBSOLETE)); +} + INSTANTIATE_TEST_CASE_P(DBCompactionTestWithParam, DBCompactionTestWithParam, ::testing::Values(std::make_tuple(1, true), std::make_tuple(1, false), diff --git a/db/db_encryption_test.cc b/db/db_encryption_test.cc index 49c432f39..38eee5645 100644 --- a/db/db_encryption_test.cc +++ b/db/db_encryption_test.cc @@ -1,9 +1,7 @@ // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. -// This source code is also licensed under the GPLv2 license found in the -// COPYING file in the root directory of this source tree. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
// #include "db/db_test_util.h" #include "port/stack_trace.h" diff --git a/db/db_impl.cc b/db/db_impl.cc index f770b51ae..c9f4702c3 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -67,6 +67,7 @@ #include "port/port.h" #include "rocksdb/cache.h" #include "rocksdb/compaction_filter.h" +#include "rocksdb/convenience.h" #include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/merge_operator.h" @@ -80,6 +81,7 @@ #include "table/merging_iterator.h" #include "table/table_builder.h" #include "table/two_level_iterator.h" +#include "tools/sst_dump_tool_imp.h" #include "util/auto_roll_logger.h" #include "util/autovector.h" #include "util/build_version.h" @@ -168,6 +170,7 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname) last_batch_group_size_(0), unscheduled_flushes_(0), unscheduled_compactions_(0), + bg_bottom_compaction_scheduled_(0), bg_compaction_scheduled_(0), num_running_compactions_(0), bg_flush_scheduled_(0), @@ -242,7 +245,8 @@ void DBImpl::CancelAllBackgroundWork(bool wait) { return; } // Wait for background work to finish - while (bg_compaction_scheduled_ || bg_flush_scheduled_) { + while (bg_bottom_compaction_scheduled_ || bg_compaction_scheduled_ || + bg_flush_scheduled_) { bg_cv_.Wait(); } } @@ -252,15 +256,18 @@ DBImpl::~DBImpl() { // marker. 
After this we do a variant of the waiting and unschedule work // (to consider: moving all the waiting into CancelAllBackgroundWork(true)) CancelAllBackgroundWork(false); + int bottom_compactions_unscheduled = + env_->UnSchedule(this, Env::Priority::BOTTOM); int compactions_unscheduled = env_->UnSchedule(this, Env::Priority::LOW); int flushes_unscheduled = env_->UnSchedule(this, Env::Priority::HIGH); mutex_.Lock(); + bg_bottom_compaction_scheduled_ -= bottom_compactions_unscheduled; bg_compaction_scheduled_ -= compactions_unscheduled; bg_flush_scheduled_ -= flushes_unscheduled; // Wait for background work to finish - while (bg_compaction_scheduled_ || bg_flush_scheduled_ || - bg_purge_scheduled_) { + while (bg_bottom_compaction_scheduled_ || bg_compaction_scheduled_ || + bg_flush_scheduled_ || bg_purge_scheduled_) { TEST_SYNC_POINT("DBImpl::~DBImpl:WaitJob"); bg_cv_.Wait(); } @@ -902,7 +909,8 @@ Status DBImpl::Get(const ReadOptions& read_options, Status DBImpl::GetImpl(const ReadOptions& read_options, ColumnFamilyHandle* column_family, const Slice& key, - PinnableSlice* pinnable_val, bool* value_found) { + PinnableSlice* pinnable_val, bool* value_found, + bool* is_blob_index) { assert(pinnable_val != nullptr); StopWatch sw(env_, stats_, DB_GET); PERF_TIMER_GUARD(get_snapshot_time); @@ -952,13 +960,13 @@ Status DBImpl::GetImpl(const ReadOptions& read_options, bool done = false; if (!skip_memtable) { if (sv->mem->Get(lkey, pinnable_val->GetSelf(), &s, &merge_context, - &range_del_agg, read_options)) { + &range_del_agg, read_options, is_blob_index)) { done = true; pinnable_val->PinSelf(); RecordTick(stats_, MEMTABLE_HIT); } else if ((s.ok() || s.IsMergeInProgress()) && sv->imm->Get(lkey, pinnable_val->GetSelf(), &s, &merge_context, - &range_del_agg, read_options)) { + &range_del_agg, read_options, is_blob_index)) { done = true; pinnable_val->PinSelf(); RecordTick(stats_, MEMTABLE_HIT); @@ -970,7 +978,8 @@ Status DBImpl::GetImpl(const ReadOptions& read_options, if 
(!done) { PERF_TIMER_GUARD(get_from_output_files_time); sv->current->Get(read_options, lkey, pinnable_val, &s, &merge_context, - &range_del_agg, value_found); + &range_del_agg, value_found, nullptr, nullptr, + is_blob_index); RecordTick(stats_, MEMTABLE_MISS); } @@ -983,6 +992,7 @@ Status DBImpl::GetImpl(const ReadOptions& read_options, size_t size = pinnable_val->size(); RecordTick(stats_, BYTES_READ, size); MeasureTime(stats_, BYTES_PER_READ, size); + PERF_COUNTER_ADD(get_read_bytes, size); } return s; } @@ -1110,6 +1120,7 @@ std::vector DBImpl::MultiGet( RecordTick(stats_, NUMBER_MULTIGET_KEYS_READ, num_keys); RecordTick(stats_, NUMBER_MULTIGET_BYTES_READ, bytes_read); MeasureTime(stats_, BYTES_PER_MULTIGET, bytes_read); + PERF_COUNTER_ADD(multiget_read_bytes, bytes_read); PERF_TIMER_STOP(get_post_process_time); return stat_list; @@ -1315,6 +1326,11 @@ Status DBImpl::DropColumnFamilyImpl(ColumnFamilyHandle* column_family) { &edit, &mutex_); write_thread_.ExitUnbatched(&w); } + if (s.ok()) { + auto* mutable_cf_options = cfd->GetLatestMutableCFOptions(); + max_total_in_memory_state_ -= mutable_cf_options->write_buffer_size * + mutable_cf_options->max_write_buffer_number; + } if (!cf_support_snapshot) { // Dropped Column Family doesn't support snapshot. Need to recalculate @@ -1336,9 +1352,6 @@ Status DBImpl::DropColumnFamilyImpl(ColumnFamilyHandle* column_family) { // later inside db_mutex. 
EraseThreadStatusCfInfo(cfd); assert(cfd->IsDropped()); - auto* mutable_cf_options = cfd->GetLatestMutableCFOptions(); - max_total_in_memory_state_ -= mutable_cf_options->write_buffer_size * - mutable_cf_options->max_write_buffer_number; ROCKS_LOG_INFO(immutable_db_options_.info_log, "Dropped column family with id %u\n", cfd->GetID()); } else { @@ -1402,77 +1415,83 @@ Iterator* DBImpl::NewIterator(const ReadOptions& read_options, return NewDBIterator( env_, read_options, *cfd->ioptions(), cfd->user_comparator(), iter, kMaxSequenceNumber, - sv->mutable_cf_options.max_sequential_skip_in_iterations, - sv->version_number); + sv->mutable_cf_options.max_sequential_skip_in_iterations); #endif } else { SequenceNumber latest_snapshot = versions_->LastSequence(); - SuperVersion* sv = cfd->GetReferencedSuperVersion(&mutex_); - auto snapshot = read_options.snapshot != nullptr - ? reinterpret_cast( - read_options.snapshot)->number_ + ? reinterpret_cast(read_options.snapshot) + ->number_ : latest_snapshot; - - // Try to generate a DB iterator tree in continuous memory area to be - // cache friendly. 
Here is an example of result: - // +-------------------------------+ - // | | - // | ArenaWrappedDBIter | - // | + | - // | +---> Inner Iterator ------------+ - // | | | | - // | | +-- -- -- -- -- -- -- --+ | - // | +--- | Arena | | - // | | | | - // | Allocated Memory: | | - // | | +-------------------+ | - // | | | DBIter | <---+ - // | | + | - // | | | +-> iter_ ------------+ - // | | | | | - // | | +-------------------+ | - // | | | MergingIterator | <---+ - // | | + | - // | | | +->child iter1 ------------+ - // | | | | | | - // | | +->child iter2 ----------+ | - // | | | | | | | - // | | | +->child iter3 --------+ | | - // | | | | | | - // | | +-------------------+ | | | - // | | | Iterator1 | <--------+ - // | | +-------------------+ | | - // | | | Iterator2 | <------+ - // | | +-------------------+ | - // | | | Iterator3 | <----+ - // | | +-------------------+ - // | | | - // +-------+-----------------------+ - // - // ArenaWrappedDBIter inlines an arena area where all the iterators in - // the iterator tree are allocated in the order of being accessed when - // querying. - // Laying out the iterators in the order of being accessed makes it more - // likely that any iterator pointer is close to the iterator it points to so - // that they are likely to be in the same cache line and/or page. 
- ArenaWrappedDBIter* db_iter = NewArenaWrappedDbIterator( - env_, read_options, *cfd->ioptions(), cfd->user_comparator(), snapshot, - sv->mutable_cf_options.max_sequential_skip_in_iterations, - sv->version_number); - - InternalIterator* internal_iter = - NewInternalIterator(read_options, cfd, sv, db_iter->GetArena(), - db_iter->GetRangeDelAggregator()); - db_iter->SetIterUnderDBIter(internal_iter); - - return db_iter; + return NewIteratorImpl(read_options, cfd, snapshot); } // To stop compiler from complaining return nullptr; } +ArenaWrappedDBIter* DBImpl::NewIteratorImpl(const ReadOptions& read_options, + ColumnFamilyData* cfd, + SequenceNumber snapshot, + bool allow_blob) { + SuperVersion* sv = cfd->GetReferencedSuperVersion(&mutex_); + + // Try to generate a DB iterator tree in continuous memory area to be + // cache friendly. Here is an example of result: + // +-------------------------------+ + // | | + // | ArenaWrappedDBIter | + // | + | + // | +---> Inner Iterator ------------+ + // | | | | + // | | +-- -- -- -- -- -- -- --+ | + // | +--- | Arena | | + // | | | | + // | Allocated Memory: | | + // | | +-------------------+ | + // | | | DBIter | <---+ + // | | + | + // | | | +-> iter_ ------------+ + // | | | | | + // | | +-------------------+ | + // | | | MergingIterator | <---+ + // | | + | + // | | | +->child iter1 ------------+ + // | | | | | | + // | | +->child iter2 ----------+ | + // | | | | | | | + // | | | +->child iter3 --------+ | | + // | | | | | | + // | | +-------------------+ | | | + // | | | Iterator1 | <--------+ + // | | +-------------------+ | | + // | | | Iterator2 | <------+ + // | | +-------------------+ | + // | | | Iterator3 | <----+ + // | | +-------------------+ + // | | | + // +-------+-----------------------+ + // + // ArenaWrappedDBIter inlines an arena area where all the iterators in + // the iterator tree are allocated in the order of being accessed when + // querying. 
+ // Laying out the iterators in the order of being accessed makes it more + // likely that any iterator pointer is close to the iterator it points to so + // that they are likely to be in the same cache line and/or page. + ArenaWrappedDBIter* db_iter = NewArenaWrappedDbIterator( + env_, read_options, *cfd->ioptions(), snapshot, + sv->mutable_cf_options.max_sequential_skip_in_iterations, + sv->version_number, ((read_options.snapshot != nullptr) ? nullptr : this), + cfd, allow_blob); + + InternalIterator* internal_iter = + NewInternalIterator(read_options, cfd, sv, db_iter->GetArena(), + db_iter->GetRangeDelAggregator()); + db_iter->SetIterUnderDBIter(internal_iter); + + return db_iter; +} + Status DBImpl::NewIterators( const ReadOptions& read_options, const std::vector& column_families, @@ -1511,33 +1530,21 @@ Status DBImpl::NewIterators( iterators->push_back(NewDBIterator( env_, read_options, *cfd->ioptions(), cfd->user_comparator(), iter, kMaxSequenceNumber, - sv->mutable_cf_options.max_sequential_skip_in_iterations, - sv->version_number)); + sv->mutable_cf_options.max_sequential_skip_in_iterations)); } #endif } else { SequenceNumber latest_snapshot = versions_->LastSequence(); + auto snapshot = + read_options.snapshot != nullptr + ? reinterpret_cast(read_options.snapshot) + ->number_ + : latest_snapshot; for (size_t i = 0; i < column_families.size(); ++i) { auto* cfd = reinterpret_cast( column_families[i])->cfd(); - SuperVersion* sv = cfd->GetReferencedSuperVersion(&mutex_); - - auto snapshot = - read_options.snapshot != nullptr - ? 
reinterpret_cast( - read_options.snapshot)->number_ - : latest_snapshot; - - ArenaWrappedDBIter* db_iter = NewArenaWrappedDbIterator( - env_, read_options, *cfd->ioptions(), cfd->user_comparator(), - snapshot, sv->mutable_cf_options.max_sequential_skip_in_iterations, - sv->version_number); - InternalIterator* internal_iter = - NewInternalIterator(read_options, cfd, sv, db_iter->GetArena(), - db_iter->GetRangeDelAggregator()); - db_iter->SetIterUnderDBIter(internal_iter); - iterators->push_back(db_iter); + iterators->push_back(NewIteratorImpl(read_options, cfd, snapshot)); } } @@ -1576,12 +1583,10 @@ void DBImpl::ReleaseSnapshot(const Snapshot* s) { delete casted_s; } -bool DBImpl::HasActiveSnapshotLaterThanSN(SequenceNumber sn) { +bool DBImpl::HasActiveSnapshotInRange(SequenceNumber lower_bound, + SequenceNumber upper_bound) { InstrumentedMutexLock l(&mutex_); - if (snapshots_.empty()) { - return false; - } - return (snapshots_.newest()->GetSequenceNumber() > sn); + return snapshots_.HasSnapshotInRange(lower_bound, upper_bound); } #ifndef ROCKSDB_LITE @@ -2057,25 +2062,19 @@ Status DBImpl::DeleteFilesInRange(ColumnFamilyHandle* column_family, end_key = &end_storage; } - vstorage->GetOverlappingInputs(i, begin_key, end_key, &level_files, -1, - nullptr, false); + vstorage->GetCleanInputsWithinInterval(i, begin_key, end_key, + &level_files, -1 /* hint_index */, + nullptr /* file_index */); FileMetaData* level_file; for (uint32_t j = 0; j < level_files.size(); j++) { level_file = level_files[j]; - if (((begin == nullptr) || - (cfd->internal_comparator().user_comparator()->Compare( - level_file->smallest.user_key(), *begin) >= 0)) && - ((end == nullptr) || - (cfd->internal_comparator().user_comparator()->Compare( - level_file->largest.user_key(), *end) <= 0))) { - if (level_file->being_compacted) { - continue; - } - edit.SetColumnFamily(cfd->GetID()); - edit.DeleteFile(i, level_file->fd.GetNumber()); - deleted_files.push_back(level_file); - level_file->being_compacted = 
true; + if (level_file->being_compacted) { + continue; } + edit.SetColumnFamily(cfd->GetID()); + edit.DeleteFile(i, level_file->fd.GetNumber()); + deleted_files.push_back(level_file); + level_file->being_compacted = true; } } if (edit.GetDeletedFiles().empty()) { @@ -2293,7 +2292,7 @@ Status DestroyDB(const std::string& dbname, const Options& options) { // Delete log files in the WAL dir for (const auto& file : walDirFiles) { if (ParseFileName(file, &number, &type) && type == kLogFile) { - Status del = env->DeleteFile(soptions.wal_dir + "/" + file); + Status del = env->DeleteFile(LogFileName(soptions.wal_dir, number)); if (result.ok() && !del.ok()) { result = del; } @@ -2521,7 +2520,8 @@ SequenceNumber DBImpl::GetEarliestMemTableSequenceNumber(SuperVersion* sv, #ifndef ROCKSDB_LITE Status DBImpl::GetLatestSequenceForKey(SuperVersion* sv, const Slice& key, bool cache_only, SequenceNumber* seq, - bool* found_record_for_key) { + bool* found_record_for_key, + bool* is_blob_index) { Status s; MergeContext merge_context; RangeDelAggregator range_del_agg(sv->mem->GetInternalKeyComparator(), @@ -2536,7 +2536,7 @@ Status DBImpl::GetLatestSequenceForKey(SuperVersion* sv, const Slice& key, // Check if there is a record for this key in the latest memtable sv->mem->Get(lkey, nullptr, &s, &merge_context, &range_del_agg, seq, - read_options); + read_options, is_blob_index); if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) { // unexpected error reading memtable. @@ -2555,7 +2555,7 @@ Status DBImpl::GetLatestSequenceForKey(SuperVersion* sv, const Slice& key, // Check if there is a record for this key in the immutable memtables sv->imm->Get(lkey, nullptr, &s, &merge_context, &range_del_agg, seq, - read_options); + read_options, is_blob_index); if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) { // unexpected error reading memtable. 
@@ -2574,7 +2574,7 @@ Status DBImpl::GetLatestSequenceForKey(SuperVersion* sv, const Slice& key, // Check if there is a record for this key in the immutable memtables sv->imm->GetFromHistory(lkey, nullptr, &s, &merge_context, &range_del_agg, - seq, read_options); + seq, read_options, is_blob_index); if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) { // unexpected error reading memtable. @@ -2598,7 +2598,7 @@ Status DBImpl::GetLatestSequenceForKey(SuperVersion* sv, const Slice& key, // Check tables sv->current->Get(read_options, lkey, nullptr, &s, &merge_context, &range_del_agg, nullptr /* value_found */, - found_record_for_key, seq); + found_record_for_key, seq, is_blob_index); if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) { // unexpected error reading SST files @@ -2735,6 +2735,54 @@ Status DBImpl::IngestExternalFile( return status; } +Status DBImpl::VerifyChecksum() { + Status s; + Options options; + EnvOptions env_options; + std::vector cfd_list; + { + InstrumentedMutexLock l(&mutex_); + for (auto cfd : *versions_->GetColumnFamilySet()) { + if (!cfd->IsDropped() && cfd->initialized()) { + cfd->Ref(); + cfd_list.push_back(cfd); + } + } + } + std::vector sv_list; + for (auto cfd : cfd_list) { + sv_list.push_back(cfd->GetReferencedSuperVersion(&mutex_)); + } + for (auto& sv : sv_list) { + VersionStorageInfo* vstorage = sv->current->storage_info(); + for (int i = 0; i < vstorage->num_non_empty_levels() && s.ok(); i++) { + for (size_t j = 0; j < vstorage->LevelFilesBrief(i).num_files && s.ok(); + j++) { + const auto& fd = vstorage->LevelFilesBrief(i).files[j].fd; + std::string fname = TableFileName(immutable_db_options_.db_paths, + fd.GetNumber(), fd.GetPathId()); + s = rocksdb::VerifySstFileChecksum(options, env_options, fname); + } + } + if (!s.ok()) { + break; + } + } + { + InstrumentedMutexLock l(&mutex_); + for (auto sv : sv_list) { + if (sv && sv->Unref()) { + sv->Cleanup(); + delete sv; + } + } + for (auto cfd : cfd_list) { + cfd->Unref(); 
+ } + } + return s; +} + void DBImpl::NotifyOnExternalFileIngested( ColumnFamilyData* cfd, const ExternalSstFileIngestionJob& ingestion_job) { #ifndef ROCKSDB_LITE diff --git a/db/db_impl.h b/db/db_impl.h index bc2072d7e..f1730f9ad 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -52,6 +52,7 @@ namespace rocksdb { +class ArenaWrappedDBIter; class MemTable; class TableCache; class Version; @@ -93,6 +94,13 @@ class DBImpl : public DB { virtual Status Get(const ReadOptions& options, ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* value) override; + + // Function that Get and KeyMayExist call with no_io true or false + // Note: 'value_found' from KeyMayExist propagates here + Status GetImpl(const ReadOptions& options, ColumnFamilyHandle* column_family, + const Slice& key, PinnableSlice* value, + bool* value_found = nullptr, bool* is_blob_index = nullptr); + using DB::MultiGet; virtual std::vector MultiGet( const ReadOptions& options, @@ -123,6 +131,7 @@ class DBImpl : public DB { ColumnFamilyHandle* column_family, const Slice& key, std::string* value, bool* value_found = nullptr) override; + using DB::NewIterator; virtual Iterator* NewIterator(const ReadOptions& options, ColumnFamilyHandle* column_family) override; @@ -130,6 +139,11 @@ class DBImpl : public DB { const ReadOptions& options, const std::vector& column_families, std::vector* iterators) override; + ArenaWrappedDBIter* NewIteratorImpl(const ReadOptions& options, + ColumnFamilyData* cfd, + SequenceNumber snapshot, + bool allow_blob = false); + virtual const Snapshot* GetSnapshot() override; virtual void ReleaseSnapshot(const Snapshot* snapshot) override; using DB::GetProperty; @@ -202,7 +216,9 @@ class DBImpl : public DB { virtual SequenceNumber GetLatestSequenceNumber() const override; - bool HasActiveSnapshotLaterThanSN(SequenceNumber sn); + // Whether there is an active snapshot in range [lower_bound, upper_bound). 
+ bool HasActiveSnapshotInRange(SequenceNumber lower_bound, + SequenceNumber upper_bound); #ifndef ROCKSDB_LITE using DB::ResetStats; @@ -235,11 +251,11 @@ class DBImpl : public DB { ColumnFamilyHandle* column_family, ColumnFamilyMetaData* metadata) override; - // experimental API Status SuggestCompactRange(ColumnFamilyHandle* column_family, - const Slice* begin, const Slice* end); + const Slice* begin, const Slice* end) override; - Status PromoteL0(ColumnFamilyHandle* column_family, int target_level); + Status PromoteL0(ColumnFamilyHandle* column_family, + int target_level) override; // Similar to Write() but will call the callback once on the single write // thread to determine whether it is safe to perform the write. @@ -285,7 +301,8 @@ class DBImpl : public DB { // TODO(andrewkr): this API need to be aware of range deletion operations Status GetLatestSequenceForKey(SuperVersion* sv, const Slice& key, bool cache_only, SequenceNumber* seq, - bool* found_record_for_key); + bool* found_record_for_key, + bool* is_blob_index = nullptr); using DB::IngestExternalFile; virtual Status IngestExternalFile( @@ -293,6 +310,8 @@ class DBImpl : public DB { const std::vector& external_files, const IngestExternalFileOptions& ingestion_options) override; + virtual Status VerifyChecksum() override; + #endif // ROCKSDB_LITE // Similar to GetSnapshot(), but also lets the db know that this snapshot @@ -339,6 +358,8 @@ class DBImpl : public DB { return alive_log_files_.begin()->getting_flushed; } + Status TEST_SwitchMemtable(ColumnFamilyData* cfd = nullptr); + // Force current memtable contents to be flushed. 
Status TEST_FlushMemTable(bool wait = true, ColumnFamilyHandle* cfh = nullptr); @@ -494,6 +515,12 @@ class DBImpl : public DB { const WriteController& write_controller() { return write_controller_; } + InternalIterator* NewInternalIterator(const ReadOptions&, + ColumnFamilyData* cfd, + SuperVersion* super_version, + Arena* arena, + RangeDelAggregator* range_del_agg); + // hollow transactions shell used for recovery. // these will then be passed to TransactionDB so that // locks can be reacquired before writing can resume. @@ -552,6 +579,7 @@ class DBImpl : public DB { void AddToLogsToFreeQueue(log::Writer* log_writer) { logs_to_free_queue_.push_back(log_writer); } + InstrumentedMutex* mutex() { return &mutex_; } Status NewDB(); @@ -566,12 +594,6 @@ class DBImpl : public DB { std::unordered_map recovered_transactions_; - InternalIterator* NewInternalIterator(const ReadOptions&, - ColumnFamilyData* cfd, - SuperVersion* super_version, - Arena* arena, - RangeDelAggregator* range_del_agg); - // Except in DB::Open(), WriteOptionsFile can only be called when: // Persist options to options file. // If need_mutex_lock = false, the method will lock DB mutex. 
@@ -613,16 +635,18 @@ class DBImpl : public DB { Status WriteImpl(const WriteOptions& options, WriteBatch* updates, WriteCallback* callback = nullptr, uint64_t* log_used = nullptr, uint64_t log_ref = 0, - bool disable_memtable = false); + bool disable_memtable = false, uint64_t* seq_used = nullptr); Status PipelinedWriteImpl(const WriteOptions& options, WriteBatch* updates, WriteCallback* callback = nullptr, uint64_t* log_used = nullptr, uint64_t log_ref = 0, - bool disable_memtable = false); + bool disable_memtable = false, + uint64_t* seq_used = nullptr); Status WriteImplWALOnly(const WriteOptions& options, WriteBatch* updates, WriteCallback* callback = nullptr, - uint64_t* log_used = nullptr, uint64_t log_ref = 0); + uint64_t* log_used = nullptr, uint64_t log_ref = 0, + uint64_t* seq_used = nullptr); uint64_t FindMinLogContainingOutstandingPrep(); uint64_t FindMinPrepLogReferencedByMemTable(); @@ -630,14 +654,18 @@ class DBImpl : public DB { private: friend class DB; friend class InternalStats; - friend class TransactionImpl; + friend class PessimisticTransaction; + friend class WriteCommittedTxn; + friend class WritePreparedTxn; #ifndef ROCKSDB_LITE friend class ForwardIterator; #endif friend struct SuperVersion; friend class CompactedDBImpl; #ifndef NDEBUG + friend class DBTest2_ReadCallbackTest_Test; friend class XFTransactionWriteHandler; + friend class DBBlobIndexTest; #endif struct CompactionState; @@ -655,6 +683,7 @@ class DBImpl : public DB { } }; + struct PrepickedCompaction; struct PurgeFileInfo; // Recover the descriptor from persistent storage. May do a significant @@ -796,14 +825,19 @@ class DBImpl : public DB { void SchedulePendingPurge(std::string fname, FileType type, uint64_t number, uint32_t path_id, int job_id); static void BGWorkCompaction(void* arg); + // Runs a pre-chosen universal compaction involving bottom level in a + // separate, bottom-pri thread pool. 
+ static void BGWorkBottomCompaction(void* arg); static void BGWorkFlush(void* db); static void BGWorkPurge(void* arg); static void UnscheduleCallback(void* arg); - void BackgroundCallCompaction(void* arg); + void BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction, + Env::Priority bg_thread_pri); void BackgroundCallFlush(); void BackgroundCallPurge(); Status BackgroundCompaction(bool* madeProgress, JobContext* job_context, - LogBuffer* log_buffer, void* m = 0); + LogBuffer* log_buffer, + PrepickedCompaction* prepicked_compaction); Status BackgroundFlush(bool* madeProgress, JobContext* job_context, LogBuffer* log_buffer); @@ -1056,6 +1090,10 @@ class DBImpl : public DB { int unscheduled_flushes_; int unscheduled_compactions_; + // count how many background compactions are running or have been scheduled in + // the BOTTOM pool + int bg_bottom_compaction_scheduled_; + // count how many background compactions are running or have been scheduled int bg_compaction_scheduled_; @@ -1072,7 +1110,7 @@ class DBImpl : public DB { int bg_purge_scheduled_; // Information for a manual compaction - struct ManualCompaction { + struct ManualCompactionState { ColumnFamilyData* cfd; int input_level; int output_level; @@ -1088,13 +1126,21 @@ class DBImpl : public DB { InternalKey* manual_end; // how far we are compacting InternalKey tmp_storage; // Used to keep track of compaction progress InternalKey tmp_storage1; // Used to keep track of compaction progress + }; + struct PrepickedCompaction { + // background compaction takes ownership of `compaction`. Compaction* compaction; + // caller retains ownership of `manual_compaction_state` as it is reused + // across background compactions. + ManualCompactionState* manual_compaction_state; // nullptr if non-manual }; - std::deque manual_compaction_dequeue_; + std::deque manual_compaction_dequeue_; struct CompactionArg { + // caller retains ownership of `db`. 
DBImpl* db; - ManualCompaction* m; + // background compaction takes ownership of `prepicked_compaction`. + PrepickedCompaction* prepicked_compaction; }; // Have we encountered a background error in paranoid mode? @@ -1216,23 +1262,17 @@ class DBImpl : public DB { #endif // ROCKSDB_LITE - // Function that Get and KeyMayExist call with no_io true or false - // Note: 'value_found' from KeyMayExist propagates here - Status GetImpl(const ReadOptions& options, ColumnFamilyHandle* column_family, - const Slice& key, PinnableSlice* value, - bool* value_found = nullptr); - bool GetIntPropertyInternal(ColumnFamilyData* cfd, const DBPropertyInfo& property_info, bool is_locked, uint64_t* value); bool HasPendingManualCompaction(); bool HasExclusiveManualCompaction(); - void AddManualCompaction(ManualCompaction* m); - void RemoveManualCompaction(ManualCompaction* m); - bool ShouldntRunManualCompaction(ManualCompaction* m); + void AddManualCompaction(ManualCompactionState* m); + void RemoveManualCompaction(ManualCompactionState* m); + bool ShouldntRunManualCompaction(ManualCompactionState* m); bool HaveManualCompaction(ColumnFamilyData* cfd); - bool MCOverlap(ManualCompaction* m, ManualCompaction* m1); + bool MCOverlap(ManualCompactionState* m, ManualCompactionState* m1); size_t GetWalPreallocateBlockSize(uint64_t write_buffer_size) const; diff --git a/db/db_impl_compaction_flush.cc b/db/db_impl_compaction_flush.cc index 68d283123..3e686fe70 100644 --- a/db/db_impl_compaction_flush.cc +++ b/db/db_impl_compaction_flush.cc @@ -612,7 +612,8 @@ Status DBImpl::CompactFilesImpl( Status DBImpl::PauseBackgroundWork() { InstrumentedMutexLock guard_lock(&mutex_); bg_compaction_paused_++; - while (bg_compaction_scheduled_ > 0 || bg_flush_scheduled_ > 0) { + while (bg_bottom_compaction_scheduled_ > 0 || bg_compaction_scheduled_ > 0 || + bg_flush_scheduled_ > 0) { bg_cv_.Wait(); } bg_work_paused_++; @@ -808,7 +809,7 @@ Status DBImpl::RunManualCompaction(ColumnFamilyData* cfd, int input_level, 
bool scheduled = false; bool manual_conflict = false; - ManualCompaction manual; + ManualCompactionState manual; manual.cfd = cfd; manual.input_level = input_level; manual.output_level = output_level; @@ -858,7 +859,8 @@ Status DBImpl::RunManualCompaction(ColumnFamilyData* cfd, int input_level, AddManualCompaction(&manual); TEST_SYNC_POINT_CALLBACK("DBImpl::RunManualCompaction:NotScheduled", &mutex_); if (exclusive) { - while (bg_compaction_scheduled_ > 0) { + while (bg_bottom_compaction_scheduled_ > 0 || + bg_compaction_scheduled_ > 0) { TEST_SYNC_POINT("DBImpl::RunManualCompaction:WaitScheduled"); ROCKS_LOG_INFO( immutable_db_options_.info_log, @@ -878,14 +880,14 @@ Status DBImpl::RunManualCompaction(ColumnFamilyData* cfd, int input_level, while (!manual.done) { assert(HasPendingManualCompaction()); manual_conflict = false; + Compaction* compaction; if (ShouldntRunManualCompaction(&manual) || (manual.in_progress == true) || scheduled || - ((manual.manual_end = &manual.tmp_storage1)&&( - (manual.compaction = manual.cfd->CompactRange( - *manual.cfd->GetLatestMutableCFOptions(), manual.input_level, - manual.output_level, manual.output_path_id, manual.begin, - manual.end, &manual.manual_end, &manual_conflict)) == - nullptr) && + ((manual.manual_end = &manual.tmp_storage1) && + ((compaction = manual.cfd->CompactRange( + *manual.cfd->GetLatestMutableCFOptions(), manual.input_level, + manual.output_level, manual.output_path_id, manual.begin, + manual.end, &manual.manual_end, &manual_conflict)) == nullptr) && manual_conflict)) { // exclusive manual compactions should not see a conflict during // CompactRange @@ -898,14 +900,16 @@ Status DBImpl::RunManualCompaction(ColumnFamilyData* cfd, int input_level, manual.incomplete = false; } } else if (!scheduled) { - if (manual.compaction == nullptr) { + if (compaction == nullptr) { manual.done = true; bg_cv_.SignalAll(); continue; } ca = new CompactionArg; ca->db = this; - ca->m = &manual; + ca->prepicked_compaction = new 
PrepickedCompaction; + ca->prepicked_compaction->manual_compaction_state = &manual; + ca->prepicked_compaction->compaction = compaction; manual.incomplete = false; bg_compaction_scheduled_++; env_->Schedule(&DBImpl::BGWorkCompaction, ca, Env::Priority::LOW, this, @@ -1047,7 +1051,7 @@ void DBImpl::MaybeScheduleFlushOrCompaction() { unscheduled_compactions_ > 0) { CompactionArg* ca = new CompactionArg; ca->db = this; - ca->m = nullptr; + ca->prepicked_compaction = nullptr; bg_compaction_scheduled_++; unscheduled_compactions_--; env_->Schedule(&DBImpl::BGWorkCompaction, ca, Env::Priority::LOW, this, @@ -1152,7 +1156,23 @@ void DBImpl::BGWorkCompaction(void* arg) { delete reinterpret_cast(arg); IOSTATS_SET_THREAD_POOL_ID(Env::Priority::LOW); TEST_SYNC_POINT("DBImpl::BGWorkCompaction"); - reinterpret_cast(ca.db)->BackgroundCallCompaction(ca.m); + auto prepicked_compaction = + static_cast(ca.prepicked_compaction); + reinterpret_cast(ca.db)->BackgroundCallCompaction( + prepicked_compaction, Env::Priority::LOW); + delete prepicked_compaction; +} + +void DBImpl::BGWorkBottomCompaction(void* arg) { + CompactionArg ca = *(static_cast(arg)); + delete static_cast(arg); + IOSTATS_SET_THREAD_POOL_ID(Env::Priority::BOTTOM); + TEST_SYNC_POINT("DBImpl::BGWorkBottomCompaction"); + auto* prepicked_compaction = ca.prepicked_compaction; + assert(prepicked_compaction && prepicked_compaction->compaction && + !prepicked_compaction->manual_compaction_state); + ca.db->BackgroundCallCompaction(prepicked_compaction, Env::Priority::BOTTOM); + delete prepicked_compaction; } void DBImpl::BGWorkPurge(void* db) { @@ -1165,8 +1185,11 @@ void DBImpl::BGWorkPurge(void* db) { void DBImpl::UnscheduleCallback(void* arg) { CompactionArg ca = *(reinterpret_cast(arg)); delete reinterpret_cast(arg); - if ((ca.m != nullptr) && (ca.m->compaction != nullptr)) { - delete ca.m->compaction; + if (ca.prepicked_compaction != nullptr) { + if (ca.prepicked_compaction->compaction != nullptr) { + delete 
ca.prepicked_compaction->compaction; + } + delete ca.prepicked_compaction; } TEST_SYNC_POINT("DBImpl::UnscheduleCallback"); } @@ -1293,9 +1316,9 @@ void DBImpl::BackgroundCallFlush() { } } -void DBImpl::BackgroundCallCompaction(void* arg) { +void DBImpl::BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction, + Env::Priority bg_thread_pri) { bool made_progress = false; - ManualCompaction* m = reinterpret_cast(arg); JobContext job_context(next_job_id_.fetch_add(1), true); TEST_SYNC_POINT("BackgroundCallCompaction:0"); MaybeDumpStats(); @@ -1313,9 +1336,11 @@ void DBImpl::BackgroundCallCompaction(void* arg) { auto pending_outputs_inserted_elem = CaptureCurrentFileNumberInPendingOutputs(); - assert(bg_compaction_scheduled_); - Status s = - BackgroundCompaction(&made_progress, &job_context, &log_buffer, m); + assert((bg_thread_pri == Env::Priority::BOTTOM && + bg_bottom_compaction_scheduled_) || + (bg_thread_pri == Env::Priority::LOW && bg_compaction_scheduled_)); + Status s = BackgroundCompaction(&made_progress, &job_context, &log_buffer, + prepicked_compaction); TEST_SYNC_POINT("BackgroundCallCompaction:1"); if (!s.ok() && !s.IsShutdownInProgress()) { // Wait a little bit before retrying background compaction in @@ -1361,17 +1386,24 @@ void DBImpl::BackgroundCallCompaction(void* arg) { assert(num_running_compactions_ > 0); num_running_compactions_--; - bg_compaction_scheduled_--; + if (bg_thread_pri == Env::Priority::LOW) { + bg_compaction_scheduled_--; + } else { + assert(bg_thread_pri == Env::Priority::BOTTOM); + bg_bottom_compaction_scheduled_--; + } versions_->GetColumnFamilySet()->FreeDeadColumnFamilies(); // See if there's more work to be done MaybeScheduleFlushOrCompaction(); - if (made_progress || bg_compaction_scheduled_ == 0 || + if (made_progress || + (bg_compaction_scheduled_ == 0 && + bg_bottom_compaction_scheduled_ == 0) || HasPendingManualCompaction()) { // signal if // * made_progress -- need to wakeup DelayWrite - // * 
bg_compaction_scheduled_ == 0 -- need to wakeup ~DBImpl + // * bg_{bottom,}_compaction_scheduled_ == 0 -- need to wakeup ~DBImpl // * HasPendingManualCompaction -- need to wakeup RunManualCompaction // If none of this is true, there is no need to signal since nobody is // waiting for it @@ -1386,14 +1418,23 @@ void DBImpl::BackgroundCallCompaction(void* arg) { Status DBImpl::BackgroundCompaction(bool* made_progress, JobContext* job_context, - LogBuffer* log_buffer, void* arg) { - ManualCompaction* manual_compaction = - reinterpret_cast(arg); + LogBuffer* log_buffer, + PrepickedCompaction* prepicked_compaction) { + ManualCompactionState* manual_compaction = + prepicked_compaction == nullptr + ? nullptr + : prepicked_compaction->manual_compaction_state; *made_progress = false; mutex_.AssertHeld(); TEST_SYNC_POINT("DBImpl::BackgroundCompaction:Start"); bool is_manual = (manual_compaction != nullptr); + unique_ptr c; + if (prepicked_compaction != nullptr && + prepicked_compaction->compaction != nullptr) { + c.reset(prepicked_compaction->compaction); + } + bool is_prepicked = is_manual || c; // (manual_compaction->in_progress == false); bool trivial_move_disallowed = @@ -1410,7 +1451,6 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, manual_compaction->status = status; manual_compaction->done = true; manual_compaction->in_progress = false; - delete manual_compaction->compaction; manual_compaction = nullptr; } return status; @@ -1421,13 +1461,11 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, manual_compaction->in_progress = true; } - unique_ptr c; // InternalKey manual_end_storage; // InternalKey* manual_end = &manual_end_storage; if (is_manual) { - ManualCompaction* m = manual_compaction; + ManualCompactionState* m = manual_compaction; assert(m->in_progress); - c.reset(std::move(m->compaction)); if (!c) { m->done = true; m->manual_end = nullptr; @@ -1449,7 +1487,7 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, ? 
"(end)" : m->manual_end->DebugString().c_str())); } - } else if (!compaction_queue_.empty()) { + } else if (!is_prepicked && !compaction_queue_.empty()) { if (HaveManualCompaction(compaction_queue_.front())) { // Can't compact right now, but try again later TEST_SYNC_POINT("DBImpl::BackgroundCompaction()::Conflict"); @@ -1601,6 +1639,28 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, // Clear Instrument ThreadStatusUtil::ResetThreadStatus(); + } else if (c->column_family_data()->ioptions()->compaction_style == + kCompactionStyleUniversal && + !is_prepicked && c->output_level() > 0 && + c->output_level() == + c->column_family_data() + ->current() + ->storage_info() + ->MaxOutputLevel( + immutable_db_options_.allow_ingest_behind) && + env_->GetBackgroundThreads(Env::Priority::BOTTOM) > 0) { + // Forward universal compactions involving last level to the bottom pool + // if it exists, such that long-running compactions can't block short- + // lived ones, like L0->L0s. + TEST_SYNC_POINT("DBImpl::BackgroundCompaction:ForwardToBottomPriPool"); + CompactionArg* ca = new CompactionArg; + ca->db = this; + ca->prepicked_compaction = new PrepickedCompaction; + ca->prepicked_compaction->compaction = c.release(); + ca->prepicked_compaction->manual_compaction_state = nullptr; + ++bg_bottom_compaction_scheduled_; + env_->Schedule(&DBImpl::BGWorkBottomCompaction, ca, Env::Priority::BOTTOM, + this, &DBImpl::UnscheduleCallback); } else { int output_level __attribute__((unused)) = c->output_level(); TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:NonTrivial", @@ -1664,7 +1724,7 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, } if (is_manual) { - ManualCompaction* m = manual_compaction; + ManualCompactionState* m = manual_compaction; if (!status.ok()) { m->status = status; m->done = true; @@ -1707,13 +1767,13 @@ bool DBImpl::HasPendingManualCompaction() { return (!manual_compaction_dequeue_.empty()); } -void 
DBImpl::AddManualCompaction(DBImpl::ManualCompaction* m) { +void DBImpl::AddManualCompaction(DBImpl::ManualCompactionState* m) { manual_compaction_dequeue_.push_back(m); } -void DBImpl::RemoveManualCompaction(DBImpl::ManualCompaction* m) { +void DBImpl::RemoveManualCompaction(DBImpl::ManualCompactionState* m) { // Remove from queue - std::deque::iterator it = + std::deque::iterator it = manual_compaction_dequeue_.begin(); while (it != manual_compaction_dequeue_.end()) { if (m == (*it)) { @@ -1726,16 +1786,17 @@ void DBImpl::RemoveManualCompaction(DBImpl::ManualCompaction* m) { return; } -bool DBImpl::ShouldntRunManualCompaction(ManualCompaction* m) { +bool DBImpl::ShouldntRunManualCompaction(ManualCompactionState* m) { if (num_running_ingest_file_ > 0) { // We need to wait for other IngestExternalFile() calls to finish // before running a manual compaction. return true; } if (m->exclusive) { - return (bg_compaction_scheduled_ > 0); + return (bg_bottom_compaction_scheduled_ > 0 || + bg_compaction_scheduled_ > 0); } - std::deque::iterator it = + std::deque::iterator it = manual_compaction_dequeue_.begin(); bool seen = false; while (it != manual_compaction_dequeue_.end()) { @@ -1756,7 +1817,7 @@ bool DBImpl::ShouldntRunManualCompaction(ManualCompaction* m) { bool DBImpl::HaveManualCompaction(ColumnFamilyData* cfd) { // Remove from priority queue - std::deque::iterator it = + std::deque::iterator it = manual_compaction_dequeue_.begin(); while (it != manual_compaction_dequeue_.end()) { if ((*it)->exclusive) { @@ -1774,7 +1835,7 @@ bool DBImpl::HaveManualCompaction(ColumnFamilyData* cfd) { bool DBImpl::HasExclusiveManualCompaction() { // Remove from priority queue - std::deque::iterator it = + std::deque::iterator it = manual_compaction_dequeue_.begin(); while (it != manual_compaction_dequeue_.end()) { if ((*it)->exclusive) { @@ -1785,7 +1846,7 @@ bool DBImpl::HasExclusiveManualCompaction() { return false; } -bool DBImpl::MCOverlap(ManualCompaction* m, ManualCompaction* 
m1) { +bool DBImpl::MCOverlap(ManualCompactionState* m, ManualCompactionState* m1) { if ((m->exclusive) || (m1->exclusive)) { return true; } diff --git a/db/db_impl_debug.cc b/db/db_impl_debug.cc index 9f4fccabc..a4b378020 100644 --- a/db/db_impl_debug.cc +++ b/db/db_impl_debug.cc @@ -80,6 +80,15 @@ Status DBImpl::TEST_CompactRange(int level, const Slice* begin, disallow_trivial_move); } +Status DBImpl::TEST_SwitchMemtable(ColumnFamilyData* cfd) { + WriteContext write_context; + InstrumentedMutexLock l(&mutex_); + if (cfd == nullptr) { + cfd = default_cf_handle_->cfd(); + } + return SwitchMemtable(cfd, &write_context); +} + Status DBImpl::TEST_FlushMemTable(bool wait, ColumnFamilyHandle* cfh) { FlushOptions fo; fo.wait = wait; @@ -112,7 +121,9 @@ Status DBImpl::TEST_WaitForCompact() { // OR flush to finish. InstrumentedMutexLock l(&mutex_); - while ((bg_compaction_scheduled_ || bg_flush_scheduled_) && bg_error_.ok()) { + while ((bg_bottom_compaction_scheduled_ || bg_compaction_scheduled_ || + bg_flush_scheduled_) && + bg_error_.ok()) { bg_cv_.Wait(); } return bg_error_; diff --git a/db/db_impl_files.cc b/db/db_impl_files.cc index 3bbf94c29..e44e42318 100644 --- a/db/db_impl_files.cc +++ b/db/db_impl_files.cc @@ -368,6 +368,9 @@ void DBImpl::PurgeObsoleteFiles(const JobContext& state, bool schedule_only) { candidate_files.emplace_back( MakeTableFileName(kDumbDbName, file->fd.GetNumber()), file->fd.GetPathId()); + if (file->table_reader_handle) { + table_cache_->Release(file->table_reader_handle); + } delete file; } diff --git a/db/db_impl_readonly.cc b/db/db_impl_readonly.cc index d4fe7e702..d69eecb98 100644 --- a/db/db_impl_readonly.cc +++ b/db/db_impl_readonly.cc @@ -58,7 +58,7 @@ Iterator* DBImplReadOnly::NewIterator(const ReadOptions& read_options, SuperVersion* super_version = cfd->GetSuperVersion()->Ref(); SequenceNumber latest_snapshot = versions_->LastSequence(); auto db_iter = NewArenaWrappedDbIterator( - env_, read_options, *cfd->ioptions(), 
cfd->user_comparator(), + env_, read_options, *cfd->ioptions(), (read_options.snapshot != nullptr ? reinterpret_cast(read_options.snapshot) ->number_ @@ -87,7 +87,7 @@ Status DBImplReadOnly::NewIterators( auto* cfd = reinterpret_cast(cfh)->cfd(); auto* sv = cfd->GetSuperVersion()->Ref(); auto* db_iter = NewArenaWrappedDbIterator( - env_, read_options, *cfd->ioptions(), cfd->user_comparator(), + env_, read_options, *cfd->ioptions(), (read_options.snapshot != nullptr ? reinterpret_cast(read_options.snapshot) ->number_ diff --git a/db/db_impl_write.cc b/db/db_impl_write.cc index d60784ad2..0731d6b1d 100644 --- a/db/db_impl_write.cc +++ b/db/db_impl_write.cc @@ -60,7 +60,7 @@ Status DBImpl::WriteWithCallback(const WriteOptions& write_options, Status DBImpl::WriteImpl(const WriteOptions& write_options, WriteBatch* my_batch, WriteCallback* callback, uint64_t* log_used, uint64_t log_ref, - bool disable_memtable) { + bool disable_memtable, uint64_t* seq_used) { if (my_batch == nullptr) { return Status::Corruption("Batch is nullptr!"); } @@ -79,12 +79,12 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, if (concurrent_prepare_ && disable_memtable) { return WriteImplWALOnly(write_options, my_batch, callback, log_used, - log_ref); + log_ref, seq_used); } if (immutable_db_options_.enable_pipelined_write) { return PipelinedWriteImpl(write_options, my_batch, callback, log_used, - log_ref, disable_memtable); + log_ref, disable_memtable, seq_used); } PERF_TIMER_GUARD(write_pre_and_post_process_time); @@ -127,6 +127,9 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, if (log_used != nullptr) { *log_used = w.log_used; } + if (seq_used != nullptr) { + *seq_used = w.sequence; + } // write is complete and leader has updated sequence return w.FinalStatus(); } @@ -278,6 +281,9 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, write_options.ignore_missing_column_families, 0 /*log_number*/, this, true /*concurrent_memtable_writes*/); } + if (seq_used 
!= nullptr) { + *seq_used = w.sequence; + } } } } @@ -313,7 +319,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, versions_->SetLastSequence(last_sequence); } MemTableInsertStatusCheck(w.status); - write_thread_.ExitAsBatchGroupLeader(write_group, w.status); + write_thread_.ExitAsBatchGroupLeader(write_group, status); } if (status.ok()) { @@ -325,7 +331,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options, WriteBatch* my_batch, WriteCallback* callback, uint64_t* log_used, uint64_t log_ref, - bool disable_memtable) { + bool disable_memtable, uint64_t* seq_used) { PERF_TIMER_GUARD(write_pre_and_post_process_time); StopWatch write_sw(env_, immutable_db_options_.statistics.get(), DB_WRITE); @@ -441,6 +447,9 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options, write_thread_.ExitAsMemTableWriter(&w, *w.write_group); } } + if (seq_used != nullptr) { + *seq_used = w.sequence; + } assert(w.state == WriteThread::STATE_COMPLETED); return w.FinalStatus(); @@ -448,7 +457,8 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options, Status DBImpl::WriteImplWALOnly(const WriteOptions& write_options, WriteBatch* my_batch, WriteCallback* callback, - uint64_t* log_used, uint64_t log_ref) { + uint64_t* log_used, uint64_t log_ref, + uint64_t* seq_used) { Status status; PERF_TIMER_GUARD(write_pre_and_post_process_time); WriteThread::Writer w(write_options, my_batch, callback, log_ref, @@ -465,6 +475,9 @@ Status DBImpl::WriteImplWALOnly(const WriteOptions& write_options, if (log_used != nullptr) { *log_used = w.log_used; } + if (seq_used != nullptr) { + *seq_used = w.sequence; + } return w.FinalStatus(); } // else we are the leader of the write batch group @@ -510,6 +523,13 @@ Status DBImpl::WriteImplWALOnly(const WriteOptions& write_options, // wal_write_mutex_ to ensure ordered events in WAL status = ConcurrentWriteToWAL(write_group, log_used, &last_sequence, 
0 /*total_count*/); + auto curr_seq = last_sequence + 1; + for (auto* writer : write_group) { + if (writer->CheckCallback(this)) { + writer->sequence = curr_seq; + curr_seq += WriteBatchInternal::Count(writer->batch); + } + } if (status.ok() && write_options.sync) { // Requesting sync with concurrent_prepare_ is expected to be very rare. We // hance provide a simple implementation that is not necessarily efficient. @@ -524,10 +544,13 @@ Status DBImpl::WriteImplWALOnly(const WriteOptions& write_options, if (!w.CallbackFailed()) { WriteCallbackStatusCheck(status); } - nonmem_write_thread_.ExitAsBatchGroupLeader(write_group, w.status); + nonmem_write_thread_.ExitAsBatchGroupLeader(write_group, status); if (status.ok()) { status = w.FinalStatus(); } + if (seq_used != nullptr) { + *seq_used = w.sequence; + } return status; } @@ -1059,7 +1082,7 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) { SuperVersion* new_superversion = nullptr; const MutableCFOptions mutable_cf_options = *cfd->GetLatestMutableCFOptions(); - // Set current_memtble_info for memtable sealed callback + // Set memtable_info for memtable sealed callback #ifndef ROCKSDB_LITE MemTableInfo memtable_info; memtable_info.cf_name = cfd->GetName(); diff --git a/db/db_io_failure_test.cc b/db/db_io_failure_test.cc index e93961c13..9f4dcc5d0 100644 --- a/db/db_io_failure_test.cc +++ b/db/db_io_failure_test.cc @@ -204,7 +204,6 @@ TEST_F(DBIOFailureTest, ManifestWriteError) { ASSERT_EQ("bar2", Get("foo2")); } } -#endif // ROCKSDB_LITE TEST_F(DBIOFailureTest, PutFailsParanoid) { // Test the following: @@ -559,6 +558,7 @@ TEST_F(DBIOFailureTest, CompactionSstSyncError) { ASSERT_EQ("bar3", Get(1, "foo")); } #endif // !(defined NDEBUG) || !defined(OS_WIN) +#endif // ROCKSDB_LITE } // namespace rocksdb int main(int argc, char** argv) { diff --git a/db/db_iter.cc b/db/db_iter.cc index c821476c4..8cd804430 100644 --- a/db/db_iter.cc +++ b/db/db_iter.cc @@ -8,8 +8,6 @@ // found in the LICENSE 
file. See the AUTHORS file for names of contributors. #include "db/db_iter.h" -#include -#include #include #include @@ -18,7 +16,6 @@ #include "db/merge_helper.h" #include "db/pinned_iterators_manager.h" #include "monitoring/perf_context_imp.h" -#include "port/port.h" #include "rocksdb/env.h" #include "rocksdb/iterator.h" #include "rocksdb/merge_operator.h" @@ -86,6 +83,7 @@ class DBIter: public Iterator { RecordTick(global_statistics, NUMBER_DB_PREV, prev_count_); RecordTick(global_statistics, NUMBER_DB_PREV_FOUND, prev_found_count_); RecordTick(global_statistics, ITER_BYTES_READ, bytes_read_); + PERF_COUNTER_ADD(iter_read_bytes, bytes_read_); ResetCounters(); } @@ -101,12 +99,12 @@ class DBIter: public Iterator { uint64_t bytes_read_; }; - DBIter(Env* env, const ReadOptions& read_options, + DBIter(Env* _env, const ReadOptions& read_options, const ImmutableCFOptions& cf_options, const Comparator* cmp, InternalIterator* iter, SequenceNumber s, bool arena_mode, - uint64_t max_sequential_skip_in_iterations, uint64_t version_number) + uint64_t max_sequential_skip_in_iterations, bool allow_blob) : arena_mode_(arena_mode), - env_(env), + env_(_env), logger_(cf_options.info_log), user_comparator_(cmp), merge_operator_(cf_options.merge_operator), @@ -116,14 +114,14 @@ class DBIter: public Iterator { valid_(false), current_entry_is_merged_(false), statistics_(cf_options.statistics), - version_number_(version_number), iterate_lower_bound_(read_options.iterate_lower_bound), iterate_upper_bound_(read_options.iterate_upper_bound), prefix_same_as_start_(read_options.prefix_same_as_start), pin_thru_lifetime_(read_options.pin_data), total_order_seek_(read_options.total_order_seek), range_del_agg_(cf_options.internal_comparator, s, - true /* collapse_deletions */) { + true /* collapse_deletions */), + allow_blob_(allow_blob) { RecordTick(statistics_, NO_ITERATORS); prefix_extractor_ = cf_options.prefix_extractor; max_skip_ = max_sequential_skip_in_iterations; @@ -181,6 +179,10 @@ 
class DBIter: public Iterator { return status_; } } + bool IsBlob() const { + assert(valid_ && (allow_blob_ || !is_blob_)); + return is_blob_; + } virtual Status GetProperty(std::string prop_name, std::string* prop) override { @@ -189,10 +191,7 @@ class DBIter: public Iterator { } if (prop_name == "rocksdb.iterator.super-version-number") { // First try to pass the value returned from inner iterator. - if (!iter_->GetProperty(prop_name, prop).ok()) { - *prop = ToString(version_number_); - } - return Status::OK(); + return iter_->GetProperty(prop_name, prop); } else if (prop_name == "rocksdb.iterator.is-key-pinned") { if (valid_) { *prop = (pin_thru_lifetime_ && saved_key_.IsKeyPinned()) ? "1" : "0"; @@ -210,6 +209,9 @@ class DBIter: public Iterator { virtual void SeekForPrev(const Slice& target) override; virtual void SeekToFirst() override; virtual void SeekToLast() override; + Env* env() { return env_; } + void set_sequence(uint64_t s) { sequence_ = s; } + void set_valid(bool v) { valid_ = v; } private: void ReverseToForward(); @@ -261,7 +263,7 @@ class DBIter: public Iterator { const Comparator* const user_comparator_; const MergeOperator* const merge_operator_; InternalIterator* iter_; - SequenceNumber const sequence_; + SequenceNumber sequence_; Status status_; IterKey saved_key_; @@ -275,7 +277,6 @@ class DBIter: public Iterator { uint64_t max_skip_; uint64_t max_skippable_internal_keys_; uint64_t num_internal_keys_skipped_; - uint64_t version_number_; const Slice* iterate_lower_bound_; const Slice* iterate_upper_bound_; IterKey prefix_start_buf_; @@ -290,6 +291,8 @@ class DBIter: public Iterator { RangeDelAggregator range_del_agg_; LocalStatistics local_stats_; PinnedIteratorsManager pinned_iters_mgr_; + bool allow_blob_; + bool is_blob_; // No copying allowed DBIter(const DBIter&); @@ -379,6 +382,8 @@ void DBIter::FindNextUserEntryInternal(bool skipping, bool prefix_check) { // - none of the above : saved_key_ can contain anything, it doesn't matter. 
uint64_t num_skipped = 0; + is_blob_ = false; + do { ParsedInternalKey ikey; @@ -423,6 +428,7 @@ void DBIter::FindNextUserEntryInternal(bool skipping, bool prefix_check) { PERF_COUNTER_ADD(internal_delete_skipped_count, 1); break; case kTypeValue: + case kTypeBlobIndex: saved_key_.SetUserKey( ikey.user_key, !iter_->IsKeyPinned() || !pin_thru_lifetime_ /* copy */); @@ -434,6 +440,18 @@ void DBIter::FindNextUserEntryInternal(bool skipping, bool prefix_check) { skipping = true; num_skipped = 0; PERF_COUNTER_ADD(internal_delete_skipped_count, 1); + } else if (ikey.type == kTypeBlobIndex) { + if (!allow_blob_) { + ROCKS_LOG_ERROR(logger_, "Encounter unexpected blob index."); + status_ = Status::NotSupported( + "Encounter unexpected blob index. Please open DB with " + "rocksdb::blob_db::BlobDB instead."); + valid_ = false; + } else { + is_blob_ = true; + valid_ = true; + } + return; } else { valid_ = true; return; @@ -575,6 +593,18 @@ void DBIter::MergeValuesNewToOld() { merge_context_.PushOperand(iter_->value(), iter_->IsValuePinned() /* operand_pinned */); PERF_COUNTER_ADD(internal_merge_count, 1); + } else if (kTypeBlobIndex == ikey.type) { + if (!allow_blob_) { + ROCKS_LOG_ERROR(logger_, "Encounter unexpected blob index."); + status_ = Status::NotSupported( + "Encounter unexpected blob index. 
Please open DB with " + "rocksdb::blob_db::BlobDB instead."); + } else { + status_ = + Status::NotSupported("Blob DB does not support merge operator."); + } + valid_ = false; + return; } else { assert(false); } @@ -697,7 +727,6 @@ void DBIter::PrevInternal() { } if (FindValueForCurrentKey()) { - valid_ = true; if (!iter_->Valid()) { return; } @@ -759,6 +788,7 @@ bool DBIter::FindValueForCurrentKey() { last_key_entry_type = ikey.type; switch (last_key_entry_type) { case kTypeValue: + case kTypeBlobIndex: if (range_del_agg_.ShouldDelete( ikey, RangeDelAggregator::RangePositioningMode::kBackwardTraversal)) { @@ -804,6 +834,7 @@ bool DBIter::FindValueForCurrentKey() { } Status s; + is_blob_ = false; switch (last_key_entry_type) { case kTypeDeletion: case kTypeSingleDeletion: @@ -819,6 +850,18 @@ bool DBIter::FindValueForCurrentKey() { merge_operator_, saved_key_.GetUserKey(), nullptr, merge_context_.GetOperands(), &saved_value_, logger_, statistics_, env_, &pinned_value_, true); + } else if (last_not_merge_type == kTypeBlobIndex) { + if (!allow_blob_) { + ROCKS_LOG_ERROR(logger_, "Encounter unexpected blob index."); + status_ = Status::NotSupported( + "Encounter unexpected blob index. Please open DB with " + "rocksdb::blob_db::BlobDB instead."); + } else { + status_ = + Status::NotSupported("Blob DB does not support merge operator."); + } + valid_ = false; + return true; } else { assert(last_not_merge_type == kTypeValue); s = MergeHelper::TimedFullMerge( @@ -830,6 +873,17 @@ bool DBIter::FindValueForCurrentKey() { case kTypeValue: // do nothing - we've already has value in saved_value_ break; + case kTypeBlobIndex: + if (!allow_blob_) { + ROCKS_LOG_ERROR(logger_, "Encounter unexpected blob index."); + status_ = Status::NotSupported( + "Encounter unexpected blob index. 
Please open DB with " + "rocksdb::blob_db::BlobDB instead."); + valid_ = false; + return true; + } + is_blob_ = true; + break; default: assert(false); break; @@ -863,7 +917,15 @@ bool DBIter::FindValueForCurrentKeyUsingSeek() { valid_ = false; return false; } - if (ikey.type == kTypeValue) { + if (ikey.type == kTypeBlobIndex && !allow_blob_) { + ROCKS_LOG_ERROR(logger_, "Encounter unexpected blob index."); + status_ = Status::NotSupported( + "Encounter unexpected blob index. Please open DB with " + "rocksdb::blob_db::BlobDB instead."); + valid_ = false; + return true; + } + if (ikey.type == kTypeValue || ikey.type == kTypeBlobIndex) { assert(iter_->IsValuePinned()); pinned_value_ = iter_->value(); valid_ = true; @@ -1029,6 +1091,7 @@ void DBIter::Seek(const Slice& target) { if (valid_) { RecordTick(statistics_, NUMBER_DB_SEEK_FOUND); RecordTick(statistics_, ITER_BYTES_READ, key().size() + value().size()); + PERF_COUNTER_ADD(iter_read_bytes, key().size() + value().size()); } } } else { @@ -1071,6 +1134,7 @@ void DBIter::SeekForPrev(const Slice& target) { if (valid_) { RecordTick(statistics_, NUMBER_DB_SEEK_FOUND); RecordTick(statistics_, ITER_BYTES_READ, key().size() + value().size()); + PERF_COUNTER_ADD(iter_read_bytes, key().size() + value().size()); } } } else { @@ -1113,6 +1177,7 @@ void DBIter::SeekToFirst() { if (valid_) { RecordTick(statistics_, NUMBER_DB_SEEK_FOUND); RecordTick(statistics_, ITER_BYTES_READ, key().size() + value().size()); + PERF_COUNTER_ADD(iter_read_bytes, key().size() + value().size()); } } } else { @@ -1160,6 +1225,7 @@ void DBIter::SeekToLast() { if (valid_) { RecordTick(statistics_, NUMBER_DB_SEEK_FOUND); RecordTick(statistics_, ITER_BYTES_READ, key().size() + value().size()); + PERF_COUNTER_ADD(iter_read_bytes, key().size() + value().size()); } } if (valid_ && prefix_extractor_ && prefix_same_as_start_) { @@ -1175,17 +1241,15 @@ Iterator* NewDBIterator(Env* env, const ReadOptions& read_options, InternalIterator* internal_iter, const 
SequenceNumber& sequence, uint64_t max_sequential_skip_in_iterations, - uint64_t version_number) { + bool allow_blob) { DBIter* db_iter = new DBIter( env, read_options, cf_options, user_key_comparator, internal_iter, - sequence, false, max_sequential_skip_in_iterations, version_number); + sequence, false, max_sequential_skip_in_iterations, allow_blob); return db_iter; } ArenaWrappedDBIter::~ArenaWrappedDBIter() { db_iter_->~DBIter(); } -void ArenaWrappedDBIter::SetDBIter(DBIter* iter) { db_iter_ = iter; } - RangeDelAggregator* ArenaWrappedDBIter::GetRangeDelAggregator() { return db_iter_->GetRangeDelAggregator(); } @@ -1208,28 +1272,70 @@ inline void ArenaWrappedDBIter::Prev() { db_iter_->Prev(); } inline Slice ArenaWrappedDBIter::key() const { return db_iter_->key(); } inline Slice ArenaWrappedDBIter::value() const { return db_iter_->value(); } inline Status ArenaWrappedDBIter::status() const { return db_iter_->status(); } +bool ArenaWrappedDBIter::IsBlob() const { return db_iter_->IsBlob(); } inline Status ArenaWrappedDBIter::GetProperty(std::string prop_name, std::string* prop) { + if (prop_name == "rocksdb.iterator.super-version-number") { + // First try to pass the value returned from inner iterator. 
+ if (!db_iter_->GetProperty(prop_name, prop).ok()) { + *prop = ToString(sv_number_); + } + return Status::OK(); + } return db_iter_->GetProperty(prop_name, prop); } -void ArenaWrappedDBIter::RegisterCleanup(CleanupFunction function, void* arg1, - void* arg2) { - db_iter_->RegisterCleanup(function, arg1, arg2); + +void ArenaWrappedDBIter::Init(Env* env, const ReadOptions& read_options, + const ImmutableCFOptions& cf_options, + const SequenceNumber& sequence, + uint64_t max_sequential_skip_in_iteration, + uint64_t version_number, bool allow_blob) { + auto mem = arena_.AllocateAligned(sizeof(DBIter)); + db_iter_ = new (mem) + DBIter(env, read_options, cf_options, cf_options.user_comparator, nullptr, + sequence, true, max_sequential_skip_in_iteration, allow_blob); + sv_number_ = version_number; +} + +Status ArenaWrappedDBIter::Refresh() { + if (cfd_ == nullptr || db_impl_ == nullptr) { + return Status::NotSupported("Creating renew iterator is not allowed."); + } + assert(db_iter_ != nullptr); + SequenceNumber latest_seq = db_impl_->GetLatestSequenceNumber(); + uint64_t cur_sv_number = cfd_->GetSuperVersionNumber(); + if (sv_number_ != cur_sv_number) { + Env* env = db_iter_->env(); + db_iter_->~DBIter(); + arena_.~Arena(); + new (&arena_) Arena(); + + SuperVersion* sv = cfd_->GetReferencedSuperVersion(db_impl_->mutex()); + Init(env, read_options_, *(cfd_->ioptions()), latest_seq, + sv->mutable_cf_options.max_sequential_skip_in_iterations, + cur_sv_number, allow_blob_); + + InternalIterator* internal_iter = db_impl_->NewInternalIterator( + read_options_, cfd_, sv, &arena_, db_iter_->GetRangeDelAggregator()); + SetIterUnderDBIter(internal_iter); + } else { + db_iter_->set_sequence(latest_seq); + db_iter_->set_valid(false); + } + return Status::OK(); } ArenaWrappedDBIter* NewArenaWrappedDbIterator( Env* env, const ReadOptions& read_options, - const ImmutableCFOptions& cf_options, const Comparator* user_key_comparator, - const SequenceNumber& sequence, uint64_t 
max_sequential_skip_in_iterations, - uint64_t version_number) { + const ImmutableCFOptions& cf_options, const SequenceNumber& sequence, + uint64_t max_sequential_skip_in_iterations, uint64_t version_number, + DBImpl* db_impl, ColumnFamilyData* cfd, bool allow_blob) { ArenaWrappedDBIter* iter = new ArenaWrappedDBIter(); - Arena* arena = iter->GetArena(); - auto mem = arena->AllocateAligned(sizeof(DBIter)); - DBIter* db_iter = new (mem) - DBIter(env, read_options, cf_options, user_key_comparator, nullptr, - sequence, true, max_sequential_skip_in_iterations, version_number); - - iter->SetDBIter(db_iter); + iter->Init(env, read_options, cf_options, sequence, + max_sequential_skip_in_iterations, version_number, allow_blob); + if (db_impl != nullptr && cfd != nullptr) { + iter->StoreRefreshInfo(read_options, db_impl, cfd, allow_blob); + } return iter; } diff --git a/db/db_iter.h b/db/db_iter.h index 833526440..26fcd44cb 100644 --- a/db/db_iter.h +++ b/db/db_iter.h @@ -10,6 +10,7 @@ #pragma once #include #include +#include "db/db_impl.h" #include "db/dbformat.h" #include "db/range_del_aggregator.h" #include "options/cf_options.h" @@ -33,7 +34,7 @@ extern Iterator* NewDBIterator(Env* env, const ReadOptions& read_options, InternalIterator* internal_iter, const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iterations, - uint64_t version_number); + bool allow_blob = false); // A wrapper iterator which wraps DB Iterator and the arena, with which the DB // iterator is supposed be allocated. This class is used as an entry point of @@ -49,10 +50,6 @@ class ArenaWrappedDBIter : public Iterator { virtual Arena* GetArena() { return &arena_; } virtual RangeDelAggregator* GetRangeDelAggregator(); - // Set the DB Iterator to be wrapped - - virtual void SetDBIter(DBIter* iter); - // Set the internal iterator wrapped inside the DB Iterator. Usually it is // a merging iterator. 
virtual void SetIterUnderDBIter(InternalIterator* iter); @@ -66,20 +63,43 @@ class ArenaWrappedDBIter : public Iterator { virtual Slice key() const override; virtual Slice value() const override; virtual Status status() const override; + virtual Status Refresh() override; + bool IsBlob() const; - void RegisterCleanup(CleanupFunction function, void* arg1, void* arg2); virtual Status GetProperty(std::string prop_name, std::string* prop) override; + void Init(Env* env, const ReadOptions& read_options, + const ImmutableCFOptions& cf_options, + const SequenceNumber& sequence, + uint64_t max_sequential_skip_in_iterations, uint64_t version_number, + bool allow_blob); + + void StoreRefreshInfo(const ReadOptions& read_options, DBImpl* db_impl, + ColumnFamilyData* cfd, bool allow_blob) { + read_options_ = read_options; + db_impl_ = db_impl; + cfd_ = cfd; + allow_blob_ = allow_blob; + } + private: DBIter* db_iter_; Arena arena_; + uint64_t sv_number_; + ColumnFamilyData* cfd_ = nullptr; + DBImpl* db_impl_ = nullptr; + ReadOptions read_options_; + bool allow_blob_ = false; }; // Generate the arena wrapped iterator class. +// `db_impl` and `cfd` are used for reneweal. If left null, renewal will not +// be supported. 
extern ArenaWrappedDBIter* NewArenaWrappedDbIterator( Env* env, const ReadOptions& read_options, - const ImmutableCFOptions& cf_options, const Comparator* user_key_comparator, - const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iterations, - uint64_t version_number); + const ImmutableCFOptions& cf_options, const SequenceNumber& sequence, + uint64_t max_sequential_skip_in_iterations, uint64_t version_number, + DBImpl* db_impl = nullptr, ColumnFamilyData* cfd = nullptr, + bool allow_blob = false); } // namespace rocksdb diff --git a/db/db_iter_test.cc b/db/db_iter_test.cc index d6abfc2a4..6b2cb9af9 100644 --- a/db/db_iter_test.cc +++ b/db/db_iter_test.cc @@ -193,7 +193,7 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { ReadOptions ro; std::unique_ptr db_iter( NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 10, options.max_sequential_skip_in_iterations, 0)); + 10, options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -225,7 +225,7 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { ReadOptions ro; std::unique_ptr db_iter( NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 10, options.max_sequential_skip_in_iterations, 0)); + 10, options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -251,7 +251,7 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { std::unique_ptr db_iter( NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 10, options.max_sequential_skip_in_iterations, 0)); + 10, options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -283,7 +283,7 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { std::unique_ptr db_iter( NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 10, options.max_sequential_skip_in_iterations, 0)); + 10, options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ 
-318,7 +318,7 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { std::unique_ptr db_iter( NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 10, options.max_sequential_skip_in_iterations, 0)); + 10, options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(!db_iter->Valid()); @@ -347,7 +347,7 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { std::unique_ptr db_iter( NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 7, options.max_sequential_skip_in_iterations, 0)); + 7, options.max_sequential_skip_in_iterations)); SetPerfLevel(kEnableCount); ASSERT_TRUE(GetPerfLevel() == kEnableCount); @@ -384,7 +384,7 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { std::unique_ptr db_iter( NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 4, options.max_sequential_skip_in_iterations, 0)); + 4, options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -409,7 +409,7 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { std::unique_ptr db_iter( NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 10, options.max_sequential_skip_in_iterations, 0)); + 10, options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(!db_iter->Valid()); @@ -431,7 +431,7 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { std::unique_ptr db_iter( NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 10, options.max_sequential_skip_in_iterations, 0)); + 10, options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -466,7 +466,7 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { std::unique_ptr db_iter( NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 7, options.max_sequential_skip_in_iterations, 0)); + 7, options.max_sequential_skip_in_iterations)); SetPerfLevel(kEnableCount); ASSERT_TRUE(GetPerfLevel() == kEnableCount); @@ -495,7 +495,7 @@ 
TEST_F(DBIteratorTest, DBIteratorPrevNext) { ReadOptions ro; std::unique_ptr db_iter( NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 10, options.max_sequential_skip_in_iterations, 0)); + 10, options.max_sequential_skip_in_iterations)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); @@ -537,7 +537,7 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { ReadOptions ro; std::unique_ptr db_iter( NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 2, options.max_sequential_skip_in_iterations, 0)); + 2, options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "b"); @@ -568,7 +568,7 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { ReadOptions ro; std::unique_ptr db_iter( NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 10, options.max_sequential_skip_in_iterations, 0)); + 10, options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "c"); @@ -597,7 +597,7 @@ TEST_F(DBIteratorTest, DBIteratorEmpty) { std::unique_ptr db_iter( NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 0, options.max_sequential_skip_in_iterations, 0)); + 0, options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(!db_iter->Valid()); } @@ -608,7 +608,7 @@ TEST_F(DBIteratorTest, DBIteratorEmpty) { std::unique_ptr db_iter( NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 0, options.max_sequential_skip_in_iterations, 0)); + 0, options.max_sequential_skip_in_iterations)); db_iter->SeekToFirst(); ASSERT_TRUE(!db_iter->Valid()); } @@ -630,7 +630,7 @@ TEST_F(DBIteratorTest, DBIteratorUseSkipCountSkips) { std::unique_ptr db_iter(NewDBIterator( env_, ro, ImmutableCFOptions(options), BytewiseComparator(), - internal_iter, 2, options.max_sequential_skip_in_iterations, 0)); + internal_iter, 2, 
options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "c"); @@ -673,7 +673,7 @@ TEST_F(DBIteratorTest, DBIteratorUseSkip) { options.statistics = rocksdb::CreateDBStatistics(); std::unique_ptr db_iter(NewDBIterator( env_, ro, cf_options, BytewiseComparator(), internal_iter, i + 2, - options.max_sequential_skip_in_iterations, 0)); + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -708,7 +708,7 @@ TEST_F(DBIteratorTest, DBIteratorUseSkip) { std::unique_ptr db_iter(NewDBIterator( env_, ro, cf_options, BytewiseComparator(), internal_iter, i + 2, - options.max_sequential_skip_in_iterations, 0)); + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -736,7 +736,7 @@ TEST_F(DBIteratorTest, DBIteratorUseSkip) { std::unique_ptr db_iter(NewDBIterator( env_, ro, cf_options, BytewiseComparator(), internal_iter, 202, - options.max_sequential_skip_in_iterations, 0)); + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -768,7 +768,7 @@ TEST_F(DBIteratorTest, DBIteratorUseSkip) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( env_, ro, cf_options, BytewiseComparator(), internal_iter, i, - options.max_sequential_skip_in_iterations, 0)); + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(!db_iter->Valid()); @@ -784,7 +784,7 @@ TEST_F(DBIteratorTest, DBIteratorUseSkip) { internal_iter->Finish(); std::unique_ptr db_iter( NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 200, options.max_sequential_skip_in_iterations, 0)); + 200, options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "c"); @@ -818,7 +818,7 @@ TEST_F(DBIteratorTest, DBIteratorUseSkip) { std::unique_ptr db_iter(NewDBIterator( env_, ro, 
cf_options, BytewiseComparator(), internal_iter, i + 2, - options.max_sequential_skip_in_iterations, 0)); + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -852,7 +852,7 @@ TEST_F(DBIteratorTest, DBIteratorUseSkip) { std::unique_ptr db_iter(NewDBIterator( env_, ro, cf_options, BytewiseComparator(), internal_iter, i + 2, - options.max_sequential_skip_in_iterations, 0)); + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -900,7 +900,7 @@ TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) { ro.max_skippable_internal_keys = 0; std::unique_ptr db_iter( NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 10, options.max_sequential_skip_in_iterations, 0)); + 10, options.max_sequential_skip_in_iterations)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); @@ -946,7 +946,7 @@ TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) { ro.max_skippable_internal_keys = 2; std::unique_ptr db_iter( NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 10, options.max_sequential_skip_in_iterations, 0)); + 10, options.max_sequential_skip_in_iterations)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); @@ -990,7 +990,7 @@ TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) { ro.max_skippable_internal_keys = 2; std::unique_ptr db_iter( NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 10, options.max_sequential_skip_in_iterations, 0)); + 10, options.max_sequential_skip_in_iterations)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); @@ -1028,7 +1028,7 @@ TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) { ro.max_skippable_internal_keys = 2; std::unique_ptr db_iter( NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 10, options.max_sequential_skip_in_iterations, 0)); + 10, options.max_sequential_skip_in_iterations)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); @@ 
-1063,7 +1063,7 @@ TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) { ro.max_skippable_internal_keys = 2; std::unique_ptr db_iter( NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 10, options.max_sequential_skip_in_iterations, 0)); + 10, options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -1093,7 +1093,7 @@ TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) { ro.max_skippable_internal_keys = 2; std::unique_ptr db_iter( NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 10, options.max_sequential_skip_in_iterations, 0)); + 10, options.max_sequential_skip_in_iterations)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); @@ -1130,7 +1130,7 @@ TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) { ro.max_skippable_internal_keys = 2; std::unique_ptr db_iter( NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 10, options.max_sequential_skip_in_iterations, 0)); + 10, options.max_sequential_skip_in_iterations)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); @@ -1167,7 +1167,7 @@ TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) { ro.max_skippable_internal_keys = i; std::unique_ptr db_iter(NewDBIterator( env_, ro, cf_options, BytewiseComparator(), internal_iter, 2 * i + 1, - options.max_sequential_skip_in_iterations, 0)); + options.max_sequential_skip_in_iterations)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); @@ -1220,7 +1220,7 @@ TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) { ro.max_skippable_internal_keys = i; std::unique_ptr db_iter(NewDBIterator( env_, ro, cf_options, BytewiseComparator(), internal_iter, 2 * i + 1, - options.max_sequential_skip_in_iterations, 0)); + options.max_sequential_skip_in_iterations)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); @@ -1258,7 +1258,7 @@ TEST_F(DBIteratorTest, DBIterator1) { std::unique_ptr db_iter(NewDBIterator( env_, ro, ImmutableCFOptions(options), 
BytewiseComparator(), - internal_iter, 1, options.max_sequential_skip_in_iterations, 0)); + internal_iter, 1, options.max_sequential_skip_in_iterations)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1285,7 +1285,7 @@ TEST_F(DBIteratorTest, DBIterator2) { std::unique_ptr db_iter(NewDBIterator( env_, ro, ImmutableCFOptions(options), BytewiseComparator(), - internal_iter, 0, options.max_sequential_skip_in_iterations, 0)); + internal_iter, 0, options.max_sequential_skip_in_iterations)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1309,7 +1309,7 @@ TEST_F(DBIteratorTest, DBIterator3) { std::unique_ptr db_iter(NewDBIterator( env_, ro, ImmutableCFOptions(options), BytewiseComparator(), - internal_iter, 2, options.max_sequential_skip_in_iterations, 0)); + internal_iter, 2, options.max_sequential_skip_in_iterations)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1333,7 +1333,7 @@ TEST_F(DBIteratorTest, DBIterator4) { std::unique_ptr db_iter(NewDBIterator( env_, ro, ImmutableCFOptions(options), BytewiseComparator(), - internal_iter, 4, options.max_sequential_skip_in_iterations, 0)); + internal_iter, 4, options.max_sequential_skip_in_iterations)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1365,7 +1365,7 @@ TEST_F(DBIteratorTest, DBIterator5) { std::unique_ptr db_iter( NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 0, options.max_sequential_skip_in_iterations, 0)); + 0, options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1387,7 +1387,7 @@ TEST_F(DBIteratorTest, DBIterator5) { std::unique_ptr db_iter( NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 1, options.max_sequential_skip_in_iterations, 0)); + 1, 
options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1409,7 +1409,7 @@ TEST_F(DBIteratorTest, DBIterator5) { std::unique_ptr db_iter( NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 2, options.max_sequential_skip_in_iterations, 0)); + 2, options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1431,7 +1431,7 @@ TEST_F(DBIteratorTest, DBIterator5) { std::unique_ptr db_iter( NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 3, options.max_sequential_skip_in_iterations, 0)); + 3, options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1453,7 +1453,7 @@ TEST_F(DBIteratorTest, DBIterator5) { std::unique_ptr db_iter( NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 4, options.max_sequential_skip_in_iterations, 0)); + 4, options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1475,7 +1475,7 @@ TEST_F(DBIteratorTest, DBIterator5) { std::unique_ptr db_iter( NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 5, options.max_sequential_skip_in_iterations, 0)); + 5, options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1497,7 +1497,7 @@ TEST_F(DBIteratorTest, DBIterator5) { std::unique_ptr db_iter( NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 6, options.max_sequential_skip_in_iterations, 0)); + 6, options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1517,7 +1517,7 @@ TEST_F(DBIteratorTest, DBIterator5) { 
internal_iter->Finish(); std::unique_ptr db_iter( NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 10, options.max_sequential_skip_in_iterations, 0)); + 10, options.max_sequential_skip_in_iterations)); db_iter->Seek("b"); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "b"); @@ -1546,7 +1546,7 @@ TEST_F(DBIteratorTest, DBIterator6) { std::unique_ptr db_iter( NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 0, options.max_sequential_skip_in_iterations, 0)); + 0, options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1568,7 +1568,7 @@ TEST_F(DBIteratorTest, DBIterator6) { std::unique_ptr db_iter( NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 1, options.max_sequential_skip_in_iterations, 0)); + 1, options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1590,7 +1590,7 @@ TEST_F(DBIteratorTest, DBIterator6) { std::unique_ptr db_iter( NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 2, options.max_sequential_skip_in_iterations, 0)); + 2, options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1612,7 +1612,7 @@ TEST_F(DBIteratorTest, DBIterator6) { std::unique_ptr db_iter( NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 3, options.max_sequential_skip_in_iterations, 0)); + 3, options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(!db_iter->Valid()); } @@ -1630,7 +1630,7 @@ TEST_F(DBIteratorTest, DBIterator6) { std::unique_ptr db_iter( NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 4, options.max_sequential_skip_in_iterations, 0)); + 4, options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); 
ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1652,7 +1652,7 @@ TEST_F(DBIteratorTest, DBIterator6) { std::unique_ptr db_iter( NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 5, options.max_sequential_skip_in_iterations, 0)); + 5, options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1674,7 +1674,7 @@ TEST_F(DBIteratorTest, DBIterator6) { std::unique_ptr db_iter( NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 6, options.max_sequential_skip_in_iterations, 0)); + 6, options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1715,7 +1715,7 @@ TEST_F(DBIteratorTest, DBIterator7) { std::unique_ptr db_iter( NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 0, options.max_sequential_skip_in_iterations, 0)); + 0, options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1749,7 +1749,7 @@ TEST_F(DBIteratorTest, DBIterator7) { std::unique_ptr db_iter( NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 2, options.max_sequential_skip_in_iterations, 0)); + 2, options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -1789,7 +1789,7 @@ TEST_F(DBIteratorTest, DBIterator7) { std::unique_ptr db_iter( NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 4, options.max_sequential_skip_in_iterations, 0)); + 4, options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -1829,7 +1829,7 @@ TEST_F(DBIteratorTest, DBIterator7) { std::unique_ptr db_iter( NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 5, options.max_sequential_skip_in_iterations, 0)); + 5, 
options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -1874,7 +1874,7 @@ TEST_F(DBIteratorTest, DBIterator7) { std::unique_ptr db_iter( NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 6, options.max_sequential_skip_in_iterations, 0)); + 6, options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -1920,7 +1920,7 @@ TEST_F(DBIteratorTest, DBIterator7) { std::unique_ptr db_iter( NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 7, options.max_sequential_skip_in_iterations, 0)); + 7, options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -1960,7 +1960,7 @@ TEST_F(DBIteratorTest, DBIterator7) { std::unique_ptr db_iter( NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 9, options.max_sequential_skip_in_iterations, 0)); + 9, options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -2006,7 +2006,7 @@ TEST_F(DBIteratorTest, DBIterator7) { std::unique_ptr db_iter( NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 13, options.max_sequential_skip_in_iterations, 0)); + 13, options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -2053,7 +2053,7 @@ TEST_F(DBIteratorTest, DBIterator7) { std::unique_ptr db_iter( NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 14, options.max_sequential_skip_in_iterations, 0)); + 14, options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -2083,7 +2083,7 @@ TEST_F(DBIteratorTest, DBIterator8) { std::unique_ptr db_iter(NewDBIterator( env_, ro, ImmutableCFOptions(options), BytewiseComparator(), - internal_iter, 10, options.max_sequential_skip_in_iterations, 0)); + internal_iter, 10, options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); 
ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "b"); @@ -2113,7 +2113,7 @@ TEST_F(DBIteratorTest, DBIterator9) { std::unique_ptr db_iter(NewDBIterator( env_, ro, ImmutableCFOptions(options), BytewiseComparator(), - internal_iter, 10, options.max_sequential_skip_in_iterations, 0)); + internal_iter, 10, options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -2179,7 +2179,7 @@ TEST_F(DBIteratorTest, DBIterator10) { std::unique_ptr db_iter(NewDBIterator( env_, ro, ImmutableCFOptions(options), BytewiseComparator(), - internal_iter, 10, options.max_sequential_skip_in_iterations, 0)); + internal_iter, 10, options.max_sequential_skip_in_iterations)); db_iter->Seek("c"); ASSERT_TRUE(db_iter->Valid()); @@ -2218,7 +2218,7 @@ TEST_F(DBIteratorTest, SeekToLastOccurrenceSeq0) { std::unique_ptr db_iter( NewDBIterator(env_, ro, ImmutableCFOptions(options), BytewiseComparator(), - internal_iter, 10, 0 /* force seek */, 0)); + internal_iter, 10, 0 /* force seek */)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -2246,7 +2246,7 @@ TEST_F(DBIteratorTest, DBIterator11) { std::unique_ptr db_iter(NewDBIterator( env_, ro, ImmutableCFOptions(options), BytewiseComparator(), - internal_iter, 1, options.max_sequential_skip_in_iterations, 0)); + internal_iter, 1, options.max_sequential_skip_in_iterations)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -2272,7 +2272,7 @@ TEST_F(DBIteratorTest, DBIterator12) { std::unique_ptr db_iter( NewDBIterator(env_, ro, ImmutableCFOptions(options), BytewiseComparator(), - internal_iter, 10, 0, 0)); + internal_iter, 10, 0)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "c"); @@ -2309,7 +2309,7 @@ TEST_F(DBIteratorTest, DBIterator13) { std::unique_ptr db_iter( NewDBIterator(env_, ro, ImmutableCFOptions(options), BytewiseComparator(), - 
internal_iter, 2, 3, 0)); + internal_iter, 2, 3)); db_iter->Seek("b"); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), key); @@ -2337,7 +2337,7 @@ TEST_F(DBIteratorTest, DBIterator14) { std::unique_ptr db_iter( NewDBIterator(env_, ro, ImmutableCFOptions(options), BytewiseComparator(), - internal_iter, 4, 1, 0)); + internal_iter, 4, 1)); db_iter->Seek("b"); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "b"); @@ -2376,7 +2376,7 @@ class DBIterWithMergeIterTest : public testing::Test { db_iter_.reset(NewDBIterator(env_, ro_, ImmutableCFOptions(options_), BytewiseComparator(), merge_iter, 8 /* read data earlier than seqId 8 */, - 3 /* max iterators before reseek */, 0)); + 3 /* max iterators before reseek */)); } Env* env_; diff --git a/db/db_iterator_test.cc b/db/db_iterator_test.cc index 90f43ea37..5cd48fb8e 100644 --- a/db/db_iterator_test.cc +++ b/db/db_iterator_test.cc @@ -1719,12 +1719,15 @@ TEST_F(DBIteratorTest, IteratorWithLocalStatistics) { std::vector threads; std::function reader_func_next = [&]() { + SetPerfLevel(kEnableCount); + get_perf_context()->Reset(); Iterator* iter = db_->NewIterator(ReadOptions()); iter->SeekToFirst(); // Seek will bump ITER_BYTES_READ - total_bytes += iter->key().size(); - total_bytes += iter->value().size(); + uint64_t bytes = 0; + bytes += iter->key().size(); + bytes += iter->value().size(); while (true) { iter->Next(); total_next++; @@ -1733,20 +1736,25 @@ TEST_F(DBIteratorTest, IteratorWithLocalStatistics) { break; } total_next_found++; - total_bytes += iter->key().size(); - total_bytes += iter->value().size(); + bytes += iter->key().size(); + bytes += iter->value().size(); } delete iter; + ASSERT_EQ(bytes, get_perf_context()->iter_read_bytes); + SetPerfLevel(kDisable); + total_bytes += bytes; }; std::function reader_func_prev = [&]() { + SetPerfLevel(kEnableCount); Iterator* iter = db_->NewIterator(ReadOptions()); iter->SeekToLast(); // Seek will bump ITER_BYTES_READ - total_bytes += 
iter->key().size(); - total_bytes += iter->value().size(); + uint64_t bytes = 0; + bytes += iter->key().size(); + bytes += iter->value().size(); while (true) { iter->Prev(); total_prev++; @@ -1755,11 +1763,14 @@ TEST_F(DBIteratorTest, IteratorWithLocalStatistics) { break; } total_prev_found++; - total_bytes += iter->key().size(); - total_bytes += iter->value().size(); + bytes += iter->key().size(); + bytes += iter->value().size(); } delete iter; + ASSERT_EQ(bytes, get_perf_context()->iter_read_bytes); + SetPerfLevel(kDisable); + total_bytes += bytes; }; for (int i = 0; i < 10; i++) { @@ -1909,6 +1920,130 @@ TEST_F(DBIteratorTest, DBIteratorSkipRecentDuplicatesTest) { NUMBER_OF_RESEEKS_IN_ITERATION)); } +TEST_F(DBIteratorTest, Refresh) { + ASSERT_OK(Put("x", "y")); + + std::unique_ptr iter(db_->NewIterator(ReadOptions())); + iter->Seek(Slice("a")); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(Slice("x")), 0); + iter->Next(); + ASSERT_FALSE(iter->Valid()); + + ASSERT_OK(Put("c", "d")); + + iter->Seek(Slice("a")); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(Slice("x")), 0); + iter->Next(); + ASSERT_FALSE(iter->Valid()); + + iter->Refresh(); + + iter->Seek(Slice("a")); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(Slice("c")), 0); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(Slice("x")), 0); + iter->Next(); + ASSERT_FALSE(iter->Valid()); + + dbfull()->Flush(FlushOptions()); + + ASSERT_OK(Put("m", "n")); + + iter->Seek(Slice("a")); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(Slice("c")), 0); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(Slice("x")), 0); + iter->Next(); + ASSERT_FALSE(iter->Valid()); + + iter->Refresh(); + + iter->Seek(Slice("a")); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(Slice("c")), 0); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(Slice("m")), 0); + iter->Next(); + 
ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(Slice("x")), 0); + iter->Next(); + ASSERT_FALSE(iter->Valid()); + + iter.reset(); +} + +TEST_F(DBIteratorTest, TableFilter) { + ASSERT_OK(Put("a", "1")); + dbfull()->Flush(FlushOptions()); + ASSERT_OK(Put("b", "2")); + ASSERT_OK(Put("c", "3")); + dbfull()->Flush(FlushOptions()); + ASSERT_OK(Put("d", "4")); + ASSERT_OK(Put("e", "5")); + ASSERT_OK(Put("f", "6")); + dbfull()->Flush(FlushOptions()); + + // Ensure the table_filter callback is called once for each table. + { + std::set unseen {1, 2, 3}; + ReadOptions opts; + opts.table_filter = [&](const TableProperties& props) { + auto it = unseen.find(props.num_entries); + if (it == unseen.end()) { + ADD_FAILURE() << "saw table properties with an unexpected " << props.num_entries << " entries"; + } else { + unseen.erase(it); + } + return true; + }; + auto iter = db_->NewIterator(opts); + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "a->1"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "b->2"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "c->3"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "d->4"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "e->5"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "f->6"); + iter->Next(); + ASSERT_FALSE(iter->Valid()); + ASSERT_TRUE(unseen.empty()); + delete iter; + } + + // Ensure returning false in the table_filter hides the keys from that table + // during iteration. 
+ { + ReadOptions opts; + opts.table_filter = [](const TableProperties& props) { + return props.num_entries != 2; + }; + auto iter = db_->NewIterator(opts); + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "a->1"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "d->4"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "e->5"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "f->6"); + iter->Next(); + ASSERT_FALSE(iter->Valid()); + delete iter; + } +} + } // namespace rocksdb int main(int argc, char** argv) { diff --git a/db/db_properties_test.cc b/db/db_properties_test.cc index b09fe1ffa..0da64b136 100644 --- a/db/db_properties_test.cc +++ b/db/db_properties_test.cc @@ -1309,6 +1309,80 @@ TEST_F(DBPropertiesTest, EstimateNumKeysUnderflow) { ASSERT_EQ(0, num_keys); } +TEST_F(DBPropertiesTest, EstimateOldestKeyTime) { + std::unique_ptr mock_env(new MockTimeEnv(Env::Default())); + uint64_t oldest_key_time = 0; + Options options; + options.env = mock_env.get(); + + // "rocksdb.estimate-oldest-key-time" only available to fifo compaction. 
+ mock_env->set_current_time(100); + for (auto compaction : {kCompactionStyleLevel, kCompactionStyleUniversal, + kCompactionStyleNone}) { + options.compaction_style = compaction; + options.create_if_missing = true; + DestroyAndReopen(options); + ASSERT_OK(Put("foo", "bar")); + ASSERT_FALSE(dbfull()->GetIntProperty( + DB::Properties::kEstimateOldestKeyTime, &oldest_key_time)); + } + + options.compaction_style = kCompactionStyleFIFO; + options.compaction_options_fifo.ttl = 300; + options.compaction_options_fifo.allow_compaction = false; + DestroyAndReopen(options); + + mock_env->set_current_time(100); + ASSERT_OK(Put("k1", "v1")); + ASSERT_TRUE(dbfull()->GetIntProperty(DB::Properties::kEstimateOldestKeyTime, + &oldest_key_time)); + ASSERT_EQ(100, oldest_key_time); + ASSERT_OK(Flush()); + ASSERT_EQ("1", FilesPerLevel()); + ASSERT_TRUE(dbfull()->GetIntProperty(DB::Properties::kEstimateOldestKeyTime, + &oldest_key_time)); + ASSERT_EQ(100, oldest_key_time); + + mock_env->set_current_time(200); + ASSERT_OK(Put("k2", "v2")); + ASSERT_OK(Flush()); + ASSERT_EQ("2", FilesPerLevel()); + ASSERT_TRUE(dbfull()->GetIntProperty(DB::Properties::kEstimateOldestKeyTime, + &oldest_key_time)); + ASSERT_EQ(100, oldest_key_time); + + mock_env->set_current_time(300); + ASSERT_OK(Put("k3", "v3")); + ASSERT_OK(Flush()); + ASSERT_EQ("3", FilesPerLevel()); + ASSERT_TRUE(dbfull()->GetIntProperty(DB::Properties::kEstimateOldestKeyTime, + &oldest_key_time)); + ASSERT_EQ(100, oldest_key_time); + + mock_env->set_current_time(450); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_EQ("2", FilesPerLevel()); + ASSERT_TRUE(dbfull()->GetIntProperty(DB::Properties::kEstimateOldestKeyTime, + &oldest_key_time)); + ASSERT_EQ(200, oldest_key_time); + + mock_env->set_current_time(550); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_EQ("1", FilesPerLevel()); + 
ASSERT_TRUE(dbfull()->GetIntProperty(DB::Properties::kEstimateOldestKeyTime, + &oldest_key_time)); + ASSERT_EQ(300, oldest_key_time); + + mock_env->set_current_time(650); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_EQ("", FilesPerLevel()); + ASSERT_FALSE(dbfull()->GetIntProperty(DB::Properties::kEstimateOldestKeyTime, + &oldest_key_time)); + + // Close before mock_env destructs. + Close(); +} + #endif // ROCKSDB_LITE } // namespace rocksdb diff --git a/db/db_range_del_test.cc b/db/db_range_del_test.cc index d0a60e330..982cbb85a 100644 --- a/db/db_range_del_test.cc +++ b/db/db_range_del_test.cc @@ -894,6 +894,74 @@ TEST_F(DBRangeDelTest, MemtableBloomFilter) { } } +TEST_F(DBRangeDelTest, CompactionTreatsSplitInputLevelDeletionAtomically) { + // make sure compaction treats files containing a split range deletion in the + // input level as an atomic unit. I.e., compacting any input-level file(s) + // containing a portion of the range deletion causes all other input-level + // files containing portions of that same range deletion to be included in the + // compaction. + const int kNumFilesPerLevel = 4, kValueBytes = 4 << 10; + Options options = CurrentOptions(); + options.compression = kNoCompression; + options.level0_file_num_compaction_trigger = kNumFilesPerLevel; + options.memtable_factory.reset( + new SpecialSkipListFactory(2 /* num_entries_flush */)); + options.target_file_size_base = kValueBytes; + // i == 0: CompactFiles + // i == 1: CompactRange + // i == 2: automatic compaction + for (int i = 0; i < 3; ++i) { + DestroyAndReopen(options); + + ASSERT_OK(Put(Key(0), "")); + ASSERT_OK(db_->Flush(FlushOptions())); + MoveFilesToLevel(2); + ASSERT_EQ(1, NumTableFilesAtLevel(2)); + + // snapshot protects range tombstone from dropping due to becoming obsolete. 
+ const Snapshot* snapshot = db_->GetSnapshot(); + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0), + Key(2 * kNumFilesPerLevel)); + + Random rnd(301); + std::string value = RandomString(&rnd, kValueBytes); + for (int j = 0; j < kNumFilesPerLevel; ++j) { + // give files overlapping key-ranges to prevent trivial move + ASSERT_OK(Put(Key(j), value)); + ASSERT_OK(Put(Key(2 * kNumFilesPerLevel - 1 - j), value)); + if (j > 0) { + dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_EQ(j, NumTableFilesAtLevel(0)); + } + } + // put extra key to trigger final flush + ASSERT_OK(Put("", "")); + dbfull()->TEST_WaitForFlushMemTable(); + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(0, NumTableFilesAtLevel(0)); + ASSERT_EQ(kNumFilesPerLevel, NumTableFilesAtLevel(1)); + + ColumnFamilyMetaData meta; + db_->GetColumnFamilyMetaData(&meta); + if (i == 0) { + ASSERT_OK(db_->CompactFiles( + CompactionOptions(), {meta.levels[1].files[0].name}, 2 /* level */)); + } else if (i == 1) { + auto begin_str = Key(0), end_str = Key(1); + Slice begin = begin_str, end = end_str; + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &begin, &end)); + } else if (i == 2) { + ASSERT_OK(db_->SetOptions(db_->DefaultColumnFamily(), + {{"max_bytes_for_level_base", "10000"}})); + dbfull()->TEST_WaitForCompact(); + } + ASSERT_EQ(0, NumTableFilesAtLevel(1)); + ASSERT_GT(NumTableFilesAtLevel(2), 0); + + db_->ReleaseSnapshot(snapshot); + } +} + TEST_F(DBRangeDelTest, UnorderedTombstones) { // Regression test for #2752. Range delete tombstones between // different snapshot stripes are not stored in order, so the first diff --git a/db/db_sst_test.cc b/db/db_sst_test.cc index 73c6fe801..e01754c44 100644 --- a/db/db_sst_test.cc +++ b/db/db_sst_test.cc @@ -650,9 +650,18 @@ TEST_F(DBSSTTest, OpenDBWithInfiniteMaxOpenFiles) { } TEST_F(DBSSTTest, GetTotalSstFilesSize) { + // We don't propagate oldest-key-time table property on compaction and + // just write 0 as default value. 
This affect the exact table size, since + // we encode table properties as varint64. Force time to be 0 to work around + // it. Should remove the workaround after we propagate the property on + // compaction. + std::unique_ptr mock_env(new MockTimeEnv(Env::Default())); + mock_env->set_current_time(0); + Options options = CurrentOptions(); options.disable_auto_compactions = true; options.compression = kNoCompression; + options.env = mock_env.get(); DestroyAndReopen(options); // Generate 5 files in L0 for (int i = 0; i < 5; i++) { @@ -739,6 +748,9 @@ TEST_F(DBSSTTest, GetTotalSstFilesSize) { // Live SST files = 0 // Total SST files = 0 ASSERT_EQ(total_sst_files_size, 0); + + // Close db before mock_env destruct. + Close(); } TEST_F(DBSSTTest, GetTotalSstFilesSizeVersionsFilesShared) { diff --git a/db/db_test.cc b/db/db_test.cc index 6e93b39b5..193101d46 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -23,6 +23,7 @@ #include #endif +#include "cache/lru_cache.h" #include "db/db_impl.h" #include "db/db_test_util.h" #include "db/dbformat.h" @@ -2234,6 +2235,10 @@ class ModelDB : public DB { return Status::NotSupported("Not implemented."); } + virtual Status VerifyChecksum() override { + return Status::NotSupported("Not implemented."); + } + using DB::GetPropertiesOfAllTables; virtual Status GetPropertiesOfAllTables( ColumnFamilyHandle* column_family, @@ -2886,20 +2891,23 @@ TEST_F(DBTest, FIFOCompactionWithTTLAndVariousTableFormatsTest) { ASSERT_TRUE(TryReopen(options).IsNotSupported()); } -TEST_F(DBTest, DISABLED_FIFOCompactionWithTTLTest) { +TEST_F(DBTest, FIFOCompactionWithTTLTest) { Options options; options.compaction_style = kCompactionStyleFIFO; options.write_buffer_size = 10 << 10; // 10KB options.arena_block_size = 4096; options.compression = kNoCompression; options.create_if_missing = true; + env_->time_elapse_only_sleep_ = false; + options.env = env_; // Test to make sure that all files with expired ttl are deleted on next // manual compaction. 
{ + env_->addon_time_.store(0); options.compaction_options_fifo.max_table_files_size = 150 << 10; // 150KB options.compaction_options_fifo.allow_compaction = false; - options.compaction_options_fifo.ttl = 600; // seconds + options.compaction_options_fifo.ttl = 1 * 60 * 60 ; // 1 hour options = CurrentOptions(options); DestroyAndReopen(options); @@ -2910,19 +2918,21 @@ TEST_F(DBTest, DISABLED_FIFOCompactionWithTTLTest) { ASSERT_OK(Put(ToString(i * 20 + j), RandomString(&rnd, 980))); } Flush(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); } - ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(NumTableFilesAtLevel(0), 10); - // sleep for 5 seconds - env_->SleepForMicroseconds(5 * 1000 * 1000); + // Sleep for 2 hours -- which is much greater than TTL. + // Note: Couldn't use SleepForMicroseconds because it takes an int instead + // of uint64_t. Hence used addon_time_ directly. + // env_->SleepForMicroseconds(2 * 60 * 60 * 1000 * 1000); + env_->addon_time_.fetch_add(2 * 60 * 60); + + // Since no flushes and compactions have run, the db should still be in + // the same state even after considerable time has passed. ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(NumTableFilesAtLevel(0), 10); - // change ttl to 1 sec. So all files should be deleted on next compaction. 
- options.compaction_options_fifo.ttl = 1; - Reopen(options); - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); ASSERT_EQ(NumTableFilesAtLevel(0), 0); } @@ -2932,7 +2942,7 @@ TEST_F(DBTest, DISABLED_FIFOCompactionWithTTLTest) { { options.compaction_options_fifo.max_table_files_size = 150 << 10; // 150KB options.compaction_options_fifo.allow_compaction = false; - options.compaction_options_fifo.ttl = 5; // seconds + options.compaction_options_fifo.ttl = 1 * 60 * 60; // 1 hour options = CurrentOptions(options); DestroyAndReopen(options); @@ -2943,11 +2953,13 @@ TEST_F(DBTest, DISABLED_FIFOCompactionWithTTLTest) { ASSERT_OK(Put(ToString(i * 20 + j), RandomString(&rnd, 980))); } Flush(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); } - ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(NumTableFilesAtLevel(0), 10); - env_->SleepForMicroseconds(6 * 1000 * 1000); + // Sleep for 2 hours -- which is much greater than TTL. + env_->addon_time_.fetch_add(2 * 60 * 60); + // Just to make sure that we are in the same state even after sleeping. ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(NumTableFilesAtLevel(0), 10); @@ -2969,10 +2981,10 @@ TEST_F(DBTest, DISABLED_FIFOCompactionWithTTLTest) { // Test that shows the fall back to size-based FIFO compaction if TTL-based // deletion doesn't move the total size to be less than max_table_files_size. 
{ - options.write_buffer_size = 110 << 10; // 10KB + options.write_buffer_size = 10 << 10; // 10KB options.compaction_options_fifo.max_table_files_size = 150 << 10; // 150KB options.compaction_options_fifo.allow_compaction = false; - options.compaction_options_fifo.ttl = 5; // seconds + options.compaction_options_fifo.ttl = 1 * 60 * 60; // 1 hour options = CurrentOptions(options); DestroyAndReopen(options); @@ -2983,11 +2995,13 @@ TEST_F(DBTest, DISABLED_FIFOCompactionWithTTLTest) { ASSERT_OK(Put(ToString(i * 20 + j), RandomString(&rnd, 980))); } Flush(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); } - ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(NumTableFilesAtLevel(0), 3); - env_->SleepForMicroseconds(6 * 1000 * 1000); + // Sleep for 2 hours -- which is much greater than TTL. + env_->addon_time_.fetch_add(2 * 60 * 60); + // Just to make sure that we are in the same state even after sleeping. ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(NumTableFilesAtLevel(0), 3); @@ -2996,8 +3010,8 @@ TEST_F(DBTest, DISABLED_FIFOCompactionWithTTLTest) { ASSERT_OK(Put(ToString(i * 20 + j), RandomString(&rnd, 980))); } Flush(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); } - ASSERT_OK(dbfull()->TEST_WaitForCompact()); // Size limit is still guaranteed. 
ASSERT_LE(SizeAtLevel(0), options.compaction_options_fifo.max_table_files_size); @@ -3007,7 +3021,7 @@ TEST_F(DBTest, DISABLED_FIFOCompactionWithTTLTest) { { options.compaction_options_fifo.max_table_files_size = 150 << 10; // 150KB options.compaction_options_fifo.allow_compaction = true; - options.compaction_options_fifo.ttl = 5; // seconds + options.compaction_options_fifo.ttl = 1 * 60 * 60; // 1 hour options.level0_file_num_compaction_trigger = 6; options = CurrentOptions(options); DestroyAndReopen(options); @@ -3019,15 +3033,16 @@ TEST_F(DBTest, DISABLED_FIFOCompactionWithTTLTest) { ASSERT_OK(Put(ToString(i * 20 + j), RandomString(&rnd, 980))); } Flush(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); } // With Intra-L0 compaction, out of 10 files, 6 files will be compacted to 1 // (due to level0_file_num_compaction_trigger = 6). // So total files = 1 + remaining 4 = 5. - ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(NumTableFilesAtLevel(0), 5); - // Sleep for a little over ttl time. - env_->SleepForMicroseconds(6 * 1000 * 1000); + // Sleep for 2 hours -- which is much greater than TTL. + env_->addon_time_.fetch_add(2 * 60 * 60); + // Just to make sure that we are in the same state even after sleeping. 
ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(NumTableFilesAtLevel(0), 5); @@ -3037,8 +3052,8 @@ TEST_F(DBTest, DISABLED_FIFOCompactionWithTTLTest) { ASSERT_OK(Put(ToString(i * 20 + j), RandomString(&rnd, 980))); } Flush(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); } - ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(NumTableFilesAtLevel(0), 5); ASSERT_LE(SizeAtLevel(0), options.compaction_options_fifo.max_table_files_size); @@ -3050,7 +3065,7 @@ TEST_F(DBTest, DISABLED_FIFOCompactionWithTTLTest) { options.write_buffer_size = 20 << 10; // 20K options.compaction_options_fifo.max_table_files_size = 1500 << 10; // 1.5MB options.compaction_options_fifo.allow_compaction = true; - options.compaction_options_fifo.ttl = 60 * 60; // 1 hour + options.compaction_options_fifo.ttl = 1 * 60 * 60; // 1 hour options.level0_file_num_compaction_trigger = 6; options = CurrentOptions(options); DestroyAndReopen(options); @@ -3062,8 +3077,8 @@ TEST_F(DBTest, DISABLED_FIFOCompactionWithTTLTest) { ASSERT_OK(Put(ToString(i * 20 + j), RandomString(&rnd, 980))); } Flush(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); } - ASSERT_OK(dbfull()->TEST_WaitForCompact()); // It should be compacted to 10 files. ASSERT_EQ(NumTableFilesAtLevel(0), 10); @@ -3073,8 +3088,8 @@ TEST_F(DBTest, DISABLED_FIFOCompactionWithTTLTest) { ASSERT_OK(Put(ToString(i * 20 + j + 2000), RandomString(&rnd, 980))); } Flush(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); } - ASSERT_OK(dbfull()->TEST_WaitForCompact()); // It should be compacted to no more than 20 files. ASSERT_GT(NumTableFilesAtLevel(0), 10); @@ -3332,11 +3347,23 @@ TEST_F(DBTest, DynamicMemtableOptions) { {"write_buffer_size", "131072"}, })); - // The existing memtable is still 64KB in size, after it becomes immutable, - // the next memtable will be 128KB in size. 
Write 256KB total, we should - // have a 64KB L0 file, a 128KB L0 file, and a memtable with 64KB data - gen_l0_kb(256); - ASSERT_EQ(NumTableFilesAtLevel(0), 2); // (A) + // The existing memtable inflated 64KB->128KB when we invoked SetOptions(). + // Write 192KB, we should have a 128KB L0 file and a memtable with 64KB data. + gen_l0_kb(192); + ASSERT_EQ(NumTableFilesAtLevel(0), 1); // (A) + ASSERT_LT(SizeAtLevel(0), k128KB + 2 * k5KB); + ASSERT_GT(SizeAtLevel(0), k128KB - 4 * k5KB); + + // Decrease buffer size below current usage + ASSERT_OK(dbfull()->SetOptions({ + {"write_buffer_size", "65536"}, + })); + // The existing memtable became eligible for flush when we reduced its + // capacity to 64KB. Two keys need to be added to trigger flush: first causes + // memtable to be marked full, second schedules the flush. Then we should have + // a 128KB L0 file, a 64KB L0 file, and a memtable with just one key. + gen_l0_kb(2); + ASSERT_EQ(NumTableFilesAtLevel(0), 2); ASSERT_LT(SizeAtLevel(0), k128KB + k64KB + 2 * k5KB); ASSERT_GT(SizeAtLevel(0), k128KB + k64KB - 4 * k5KB); @@ -5360,6 +5387,36 @@ TEST_F(DBTest, RowCache) { ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 1); ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 1); } + +TEST_F(DBTest, PinnableSliceAndRowCache) { + Options options = CurrentOptions(); + options.statistics = rocksdb::CreateDBStatistics(); + options.row_cache = NewLRUCache(8192); + DestroyAndReopen(options); + + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Flush()); + + ASSERT_EQ(Get("foo"), "bar"); + ASSERT_EQ( + reinterpret_cast(options.row_cache.get())->TEST_GetLRUSize(), + 1); + + { + PinnableSlice pin_slice; + ASSERT_EQ(Get("foo", &pin_slice), Status::OK()); + ASSERT_EQ(pin_slice.ToString(), "bar"); + // Entry is already in cache, lookup will remove the element from lru + ASSERT_EQ( + reinterpret_cast(options.row_cache.get())->TEST_GetLRUSize(), + 0); + } + // After PinnableSlice destruction element is added back in LRU + ASSERT_EQ( + 
reinterpret_cast(options.row_cache.get())->TEST_GetLRUSize(), + 1); +} + #endif // ROCKSDB_LITE TEST_F(DBTest, DeletingOldWalAfterDrop) { diff --git a/db/db_test2.cc b/db/db_test2.cc index aa10789c8..30afd5a69 100644 --- a/db/db_test2.cc +++ b/db/db_test2.cc @@ -2236,6 +2236,7 @@ TEST_F(DBTest2, LowPriWrite) { ASSERT_EQ(1, rate_limit_count.load()); } +#ifndef ROCKSDB_LITE TEST_F(DBTest2, RateLimitedCompactionReads) { // compaction input has 512KB data const int kNumKeysPerFile = 128; @@ -2281,12 +2282,15 @@ TEST_F(DBTest2, RateLimitedCompactionReads) { // chose 1MB as the upper bound on the total bytes read. size_t rate_limited_bytes = options.rate_limiter->GetTotalBytesThrough(Env::IO_LOW); - ASSERT_GE( - rate_limited_bytes, - static_cast(kNumKeysPerFile * kBytesPerKey * kNumL0Files)); + // Include the explict prefetch of the footer in direct I/O case. + size_t direct_io_extra = use_direct_io ? 512 * 1024 : 0; + ASSERT_GE(rate_limited_bytes, + static_cast(kNumKeysPerFile * kBytesPerKey * kNumL0Files + + direct_io_extra)); ASSERT_LT( rate_limited_bytes, - static_cast(2 * kNumKeysPerFile * kBytesPerKey * kNumL0Files)); + static_cast(2 * kNumKeysPerFile * kBytesPerKey * kNumL0Files + + direct_io_extra)); Iterator* iter = db_->NewIterator(ReadOptions()); for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { @@ -2299,7 +2303,34 @@ TEST_F(DBTest2, RateLimitedCompactionReads) { options.rate_limiter->GetTotalBytesThrough(Env::IO_LOW))); } } +#endif // ROCKSDB_LITE +// Make sure DB can be reopen with reduced number of levels, given no file +// is on levels higher than the new num_levels. 
+TEST_F(DBTest2, ReduceLevel) { + Options options; + options.disable_auto_compactions = true; + options.num_levels = 7; + Reopen(options); + Put("foo", "bar"); + Flush(); + MoveFilesToLevel(6); +#ifndef ROCKSDB_LITE + ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel()); +#endif // !ROCKSDB_LITE + CompactRangeOptions compact_options; + compact_options.change_level = true; + compact_options.target_level = 1; + dbfull()->CompactRange(compact_options, nullptr, nullptr); +#ifndef ROCKSDB_LITE + ASSERT_EQ("0,1", FilesPerLevel()); +#endif // !ROCKSDB_LITE + options.num_levels = 3; + Reopen(options); +#ifndef ROCKSDB_LITE + ASSERT_EQ("0,1", FilesPerLevel()); +#endif // !ROCKSDB_LITE +} } // namespace rocksdb int main(int argc, char** argv) { diff --git a/db/db_test_util.cc b/db/db_test_util.cc index 7de6cff3e..c4d465ba1 100644 --- a/db/db_test_util.cc +++ b/db/db_test_util.cc @@ -42,13 +42,24 @@ SpecialEnv::SpecialEnv(Env* base) non_writable_count_ = 0; table_write_callback_ = nullptr; } - +#ifndef ROCKSDB_LITE ROT13BlockCipher rot13Cipher_(16); +#endif // ROCKSDB_LITE DBTestBase::DBTestBase(const std::string path) : mem_env_(!getenv("MEM_ENV") ? nullptr : new MockEnv(Env::Default())), - encrypted_env_(!getenv("ENCRYPTED_ENV") ? nullptr : NewEncryptedEnv(mem_env_ ? mem_env_ : Env::Default(), new CTREncryptionProvider(rot13Cipher_))), - env_(new SpecialEnv(encrypted_env_ ? encrypted_env_ : (mem_env_ ? mem_env_ : Env::Default()))), +#ifndef ROCKSDB_LITE + encrypted_env_( + !getenv("ENCRYPTED_ENV") + ? nullptr + : NewEncryptedEnv(mem_env_ ? mem_env_ : Env::Default(), + new CTREncryptionProvider(rot13Cipher_))), +#else + encrypted_env_(nullptr), +#endif // ROCKSDB_LITE + env_(new SpecialEnv(encrypted_env_ + ? encrypted_env_ + : (mem_env_ ? 
mem_env_ : Env::Default()))), option_config_(kDefault) { env_->SetBackgroundThreads(1, Env::LOW); env_->SetBackgroundThreads(1, Env::HIGH); @@ -686,6 +697,13 @@ std::string DBTestBase::Get(int cf, const std::string& k, return result; } +Status DBTestBase::Get(const std::string& k, PinnableSlice* v) { + ReadOptions options; + options.verify_checksums = true; + Status s = dbfull()->Get(options, dbfull()->DefaultColumnFamily(), k, v); + return s; +} + uint64_t DBTestBase::GetNumSnapshots() { uint64_t int_num; EXPECT_TRUE(dbfull()->GetIntProperty("rocksdb.num-snapshots", &int_num)); diff --git a/db/db_test_util.h b/db/db_test_util.h index 5fb3f0c81..f2caa46ca 100644 --- a/db/db_test_util.h +++ b/db/db_test_util.h @@ -572,6 +572,37 @@ class SpecialEnv : public EnvWrapper { std::atomic is_wal_sync_thread_safe_{true}; }; +class MockTimeEnv : public EnvWrapper { + public: + explicit MockTimeEnv(Env* base) : EnvWrapper(base) {} + + virtual Status GetCurrentTime(int64_t* time) override { + assert(time != nullptr); + assert(current_time_ <= + static_cast(std::numeric_limits::max())); + *time = static_cast(current_time_); + return Status::OK(); + } + + virtual uint64_t NowMicros() override { + assert(current_time_ <= std::numeric_limits::max() / 1000000); + return current_time_ * 1000000; + } + + virtual uint64_t NowNanos() override { + assert(current_time_ <= std::numeric_limits::max() / 1000000000); + return current_time_ * 1000000000; + } + + void set_current_time(uint64_t time) { + assert(time >= current_time_); + current_time_ = time; + } + + private: + uint64_t current_time_ = 0; +}; + #ifndef ROCKSDB_LITE class OnFileDeletionListener : public EventListener { public: @@ -803,6 +834,8 @@ class DBTestBase : public testing::Test { std::string Get(int cf, const std::string& k, const Snapshot* snapshot = nullptr); + Status Get(const std::string& k, PinnableSlice* v); + uint64_t GetNumSnapshots(); uint64_t GetTimeOldestSnapshots(); diff --git 
a/db/db_universal_compaction_test.cc b/db/db_universal_compaction_test.cc index c6334f8e0..58fda80d5 100644 --- a/db/db_universal_compaction_test.cc +++ b/db/db_universal_compaction_test.cc @@ -696,17 +696,12 @@ TEST_P(DBTestUniversalCompactionParallel, PickByFileNumberBug) { num_keys -= 100; } - // Wait for the 2nd background compaction process to start - TEST_SYNC_POINT("DBTestUniversalCompactionParallel::PickByFileNumberBug:0"); - TEST_SYNC_POINT("DBTestUniversalCompactionParallel::PickByFileNumberBug:1"); - - // Hold the 1st and 2nd compaction from finishing + // Hold the 1st compaction from finishing TEST_SYNC_POINT("DBTestUniversalCompactionParallel::PickByFileNumberBug:2"); dbfull()->TEST_WaitForCompact(); - // Although 2 compaction threads started, the second one did not compact - // anything because the number of files not being compacted is less than - // level0_file_num_compaction_trigger + // There should only be one picked compaction as the score drops below one + // after the first one is picked. 
EXPECT_EQ(total_picked_compactions, 1); EXPECT_EQ(TotalTableFiles(), 4); @@ -1370,6 +1365,140 @@ TEST_P(DBTestUniversalCompaction, UniversalCompactionSecondPathRatio) { Destroy(options); } +TEST_P(DBTestUniversalCompaction, FullCompactionInBottomPriThreadPool) { + const int kNumFilesTrigger = 3; + Env::Default()->SetBackgroundThreads(1, Env::Priority::BOTTOM); + for (bool allow_ingest_behind : {false, true}) { + Options options = CurrentOptions(); + options.allow_ingest_behind = allow_ingest_behind; + options.compaction_style = kCompactionStyleUniversal; + options.num_levels = num_levels_; + options.write_buffer_size = 100 << 10; // 100KB + options.target_file_size_base = 32 << 10; // 32KB + options.level0_file_num_compaction_trigger = kNumFilesTrigger; + // Trigger compaction if size amplification exceeds 110% + options.compaction_options_universal.max_size_amplification_percent = 110; + DestroyAndReopen(options); + + int num_bottom_pri_compactions = 0; + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BGWorkBottomCompaction", + [&](void* arg) { ++num_bottom_pri_compactions; }); + SyncPoint::GetInstance()->EnableProcessing(); + + Random rnd(301); + for (int num = 0; num < kNumFilesTrigger; num++) { + ASSERT_EQ(NumSortedRuns(), num); + int key_idx = 0; + GenerateNewFile(&rnd, &key_idx); + } + dbfull()->TEST_WaitForCompact(); + + if (allow_ingest_behind || num_levels_ > 1) { + // allow_ingest_behind increases number of levels while sanitizing. + ASSERT_EQ(1, num_bottom_pri_compactions); + } else { + // for single-level universal, everything's bottom level so nothing should + // be executed in bottom-pri thread pool. 
+ ASSERT_EQ(0, num_bottom_pri_compactions); + } + // Verify that size amplification did occur + ASSERT_EQ(NumSortedRuns(), 1); + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + } + Env::Default()->SetBackgroundThreads(0, Env::Priority::BOTTOM); +} + +TEST_P(DBTestUniversalCompaction, ConcurrentBottomPriLowPriCompactions) { + if (num_levels_ == 1) { + // for single-level universal, everything's bottom level so nothing should + // be executed in bottom-pri thread pool. + return; + } + const int kNumFilesTrigger = 3; + Env::Default()->SetBackgroundThreads(1, Env::Priority::BOTTOM); + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleUniversal; + options.num_levels = num_levels_; + options.write_buffer_size = 100 << 10; // 100KB + options.target_file_size_base = 32 << 10; // 32KB + options.level0_file_num_compaction_trigger = kNumFilesTrigger; + // Trigger compaction if size amplification exceeds 110% + options.compaction_options_universal.max_size_amplification_percent = 110; + DestroyAndReopen(options); + + rocksdb::SyncPoint::GetInstance()->LoadDependency( + {// wait for the full compaction to be picked before adding files intended + // for the second one. + {"DBImpl::BackgroundCompaction:ForwardToBottomPriPool", + "DBTestUniversalCompaction:ConcurrentBottomPriLowPriCompactions:0"}, + // the full (bottom-pri) compaction waits until a partial (low-pri) + // compaction has started to verify they can run in parallel. + {"DBImpl::BackgroundCompaction:NonTrivial", + "DBImpl::BGWorkBottomCompaction"}}); + SyncPoint::GetInstance()->EnableProcessing(); + + Random rnd(301); + for (int i = 0; i < 2; ++i) { + for (int num = 0; num < kNumFilesTrigger; num++) { + int key_idx = 0; + GenerateNewFile(&rnd, &key_idx, true /* no_wait */); + // use no_wait above because that one waits for flush and compaction. We + // don't want to wait for compaction because the full compaction is + // intentionally blocked while more files are flushed. 
+ dbfull()->TEST_WaitForFlushMemTable(); + } + if (i == 0) { + TEST_SYNC_POINT( + "DBTestUniversalCompaction:ConcurrentBottomPriLowPriCompactions:0"); + } + } + dbfull()->TEST_WaitForCompact(); + + // First compaction should output to bottom level. Second should output to L0 + // since older L0 files pending compaction prevent it from being placed lower. + ASSERT_EQ(NumSortedRuns(), 2); + ASSERT_GT(NumTableFilesAtLevel(0), 0); + ASSERT_GT(NumTableFilesAtLevel(num_levels_ - 1), 0); + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + Env::Default()->SetBackgroundThreads(0, Env::Priority::BOTTOM); +} + +TEST_P(DBTestUniversalCompaction, RecalculateScoreAfterPicking) { + // Regression test for extra compactions scheduled. Once enough compactions + // have been scheduled to bring the score below one, we should stop + // scheduling more; otherwise, other CFs/DBs may be delayed unnecessarily. + const int kNumFilesTrigger = 8; + Options options = CurrentOptions(); + options.compaction_options_universal.max_merge_width = kNumFilesTrigger / 2; + options.compaction_options_universal.max_size_amplification_percent = + static_cast(-1); + options.compaction_style = kCompactionStyleUniversal; + options.level0_file_num_compaction_trigger = kNumFilesTrigger; + options.num_levels = num_levels_; + options.write_buffer_size = 100 << 10; // 100KB + Reopen(options); + + std::atomic num_compactions_attempted(0); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:Start", [&](void* arg) { + ++num_compactions_attempted; + }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + Random rnd(301); + for (int num = 0; num < kNumFilesTrigger; num++) { + ASSERT_EQ(NumSortedRuns(), num); + int key_idx = 0; + GenerateNewFile(&rnd, &key_idx); + } + dbfull()->TEST_WaitForCompact(); + // Compacting the first four files was enough to bring the score below one so + // there's no need to schedule any more compactions. 
+ ASSERT_EQ(1, num_compactions_attempted); + ASSERT_EQ(NumSortedRuns(), 5); +} + INSTANTIATE_TEST_CASE_P(UniversalCompactionNumLevels, DBTestUniversalCompaction, ::testing::Combine(::testing::Values(1, 3, 5), ::testing::Bool())); diff --git a/db/db_write_test.cc b/db/db_write_test.cc index 726f444fa..e3e8ad829 100644 --- a/db/db_write_test.cc +++ b/db/db_write_test.cc @@ -3,12 +3,17 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). +#include #include #include #include #include "db/db_test_util.h" #include "db/write_batch_internal.h" +#include "db/write_thread.h" +#include "port/port.h" #include "port/stack_trace.h" +#include "util/fault_injection_test_env.h" +#include "util/string_util.h" #include "util/sync_point.h" namespace rocksdb { @@ -18,7 +23,9 @@ class DBWriteTest : public DBTestBase, public testing::WithParamInterface { public: DBWriteTest() : DBTestBase("/db_write_test") {} - void Open() { DBTestBase::Reopen(GetOptions(GetParam())); } + Options GetOptions() { return DBTestBase::GetOptions(GetParam()); } + + void Open() { DBTestBase::Reopen(GetOptions()); } }; // Sequence number should be return through input write batch. @@ -67,6 +74,47 @@ TEST_P(DBWriteTest, ReturnSeuqneceNumberMultiThreaded) { } } +TEST_P(DBWriteTest, IOErrorOnWALWritePropagateToWriteThreadFollower) { + constexpr int kNumThreads = 5; + std::unique_ptr mock_env( + new FaultInjectionTestEnv(Env::Default())); + Options options = GetOptions(); + options.env = mock_env.get(); + Reopen(options); + std::atomic ready_count{0}; + std::atomic leader_count{0}; + std::vector threads; + mock_env->SetFilesystemActive(false); + // Wait until all threads linked to write threads, to make sure + // all threads join the same batch group. 
+ SyncPoint::GetInstance()->SetCallBack( + "WriteThread::JoinBatchGroup:Wait", [&](void* arg) { + ready_count++; + auto* w = reinterpret_cast(arg); + if (w->state == WriteThread::STATE_GROUP_LEADER) { + leader_count++; + while (ready_count < kNumThreads) { + // busy waiting + } + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + for (int i = 0; i < kNumThreads; i++) { + threads.push_back(port::Thread( + [&](int index) { + // All threads should fail. + ASSERT_FALSE(Put("key" + ToString(index), "value").ok()); + }, + i)); + } + for (int i = 0; i < kNumThreads; i++) { + threads[i].join(); + } + ASSERT_EQ(1, leader_count); + // Close before mock_env destruct. + Close(); +} + INSTANTIATE_TEST_CASE_P(DBWriteTestInstance, DBWriteTest, testing::Values(DBTestBase::kDefault, DBTestBase::kConcurrentWALWrites, diff --git a/db/dbformat.cc b/db/dbformat.cc index 20c54495a..f287ae9f4 100644 --- a/db/dbformat.cc +++ b/db/dbformat.cc @@ -27,7 +27,7 @@ namespace rocksdb { // and the value type is embedded as the low 8 bits in the sequence // number in internal keys, we need to use the highest-numbered // ValueType, not the lowest). -const ValueType kValueTypeForSeek = kTypeSingleDeletion; +const ValueType kValueTypeForSeek = kTypeBlobIndex; const ValueType kValueTypeForSeekForPrev = kTypeDeletion; uint64_t PackSequenceAndType(uint64_t seq, ValueType t) { diff --git a/db/dbformat.h b/db/dbformat.h index 0ffffc88f..c58b8363a 100644 --- a/db/dbformat.h +++ b/db/dbformat.h @@ -47,6 +47,8 @@ enum ValueType : unsigned char { kTypeNoop = 0xD, // WAL only. kTypeColumnFamilyRangeDeletion = 0xE, // WAL only. kTypeRangeDeletion = 0xF, // meta block + kTypeColumnFamilyBlobIndex = 0x10, // Blob DB only + kTypeBlobIndex = 0x11, // Blob DB only kMaxValue = 0x7F // Not used for storing records. }; @@ -57,7 +59,7 @@ extern const ValueType kValueTypeForSeekForPrev; // Checks whether a type is an inline value type // (i.e. a type used in memtable skiplist and sst file datablock). 
inline bool IsValueType(ValueType t) { - return t <= kTypeMerge || t == kTypeSingleDeletion; + return t <= kTypeMerge || t == kTypeSingleDeletion || t == kTypeBlobIndex; } // Checks whether a type is from user operation @@ -84,6 +86,12 @@ struct ParsedInternalKey { ParsedInternalKey(const Slice& u, const SequenceNumber& seq, ValueType t) : user_key(u), sequence(seq), type(t) { } std::string DebugString(bool hex = false) const; + + void clear() { + user_key.clear(); + sequence = 0; + type = kTypeDeletion; + } }; // Return the length of the encoding of "key". @@ -151,6 +159,9 @@ class InternalKeyComparator : public Comparator { int Compare(const InternalKey& a, const InternalKey& b) const; int Compare(const ParsedInternalKey& a, const ParsedInternalKey& b) const; + virtual const Comparator* GetRootComparator() const override { + return user_comparator_->GetRootComparator(); + } }; // Modules in this directory should keep internal keys wrapped inside diff --git a/db/experimental.cc b/db/experimental.cc index 45d4d70aa..effe9d7c3 100644 --- a/db/experimental.cc +++ b/db/experimental.cc @@ -14,20 +14,18 @@ namespace experimental { Status SuggestCompactRange(DB* db, ColumnFamilyHandle* column_family, const Slice* begin, const Slice* end) { - auto dbimpl = dynamic_cast(db); - if (dbimpl == nullptr) { - return Status::InvalidArgument("Didn't recognize DB object"); + if (db == nullptr) { + return Status::InvalidArgument("DB is empty"); } - return dbimpl->SuggestCompactRange(column_family, begin, end); + return db->SuggestCompactRange(column_family, begin, end); } Status PromoteL0(DB* db, ColumnFamilyHandle* column_family, int target_level) { - auto dbimpl = dynamic_cast(db); - if (dbimpl == nullptr) { + if (db == nullptr) { return Status::InvalidArgument("Didn't recognize DB object"); } - return dbimpl->PromoteL0(column_family, target_level); + return db->PromoteL0(column_family, target_level); } #else // ROCKSDB_LITE diff --git a/db/external_sst_file_basic_test.cc 
b/db/external_sst_file_basic_test.cc index 534e8a0bf..65f2f7510 100644 --- a/db/external_sst_file_basic_test.cc +++ b/db/external_sst_file_basic_test.cc @@ -558,8 +558,10 @@ TEST_F(ExternalSSTFileBasicTest, FadviseTrigger) { } TEST_F(ExternalSSTFileBasicTest, IngestionWithRangeDeletions) { + int kNumLevels = 7; Options options = CurrentOptions(); options.disable_auto_compactions = true; + options.num_levels = kNumLevels; Reopen(options); std::map true_data; @@ -567,43 +569,65 @@ TEST_F(ExternalSSTFileBasicTest, IngestionWithRangeDeletions) { // prevent range deletions from being dropped due to becoming obsolete. const Snapshot* snapshot = db_->GetSnapshot(); - // range del [0, 50) in L0 file, [50, 100) in memtable - for (int i = 0; i < 2; i++) { - if (i == 1) { + // range del [0, 50) in L6 file, [50, 100) in L0 file, [100, 150) in memtable + for (int i = 0; i < 3; i++) { + if (i != 0) { db_->Flush(FlushOptions()); + if (i == 1) { + MoveFilesToLevel(kNumLevels - 1); + } } ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(50 * i), Key(50 * (i + 1)))); } ASSERT_EQ(1, NumTableFilesAtLevel(0)); + ASSERT_EQ(0, NumTableFilesAtLevel(kNumLevels - 2)); + ASSERT_EQ(1, NumTableFilesAtLevel(kNumLevels - 1)); - // overlaps with L0 file but not memtable, so flush is skipped + // overlaps with L0 file but not memtable, so flush is skipped and file is + // ingested into L0 SequenceNumber last_seqno = dbfull()->GetLatestSequenceNumber(); + ASSERT_OK(GenerateAndAddExternalFile( + options, {60, 90}, {ValueType::kTypeValue, ValueType::kTypeValue}, + file_id++, &true_data)); + ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), ++last_seqno); + ASSERT_EQ(2, NumTableFilesAtLevel(0)); + ASSERT_EQ(0, NumTableFilesAtLevel(kNumLevels - 2)); + ASSERT_EQ(1, NumTableFilesAtLevel(options.num_levels - 1)); + + // overlaps with L6 file but not memtable or L0 file, so flush is skipped and + // file is ingested into L5 ASSERT_OK(GenerateAndAddExternalFile( options, {10, 40}, 
{ValueType::kTypeValue, ValueType::kTypeValue}, file_id++, &true_data)); ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), ++last_seqno); ASSERT_EQ(2, NumTableFilesAtLevel(0)); + ASSERT_EQ(1, NumTableFilesAtLevel(kNumLevels - 2)); + ASSERT_EQ(1, NumTableFilesAtLevel(options.num_levels - 1)); - // overlaps with memtable, so flush is triggered (thus file count increases by - // two at this step). + // ingested file overlaps with memtable, so flush is triggered before the file + // is ingested such that the ingested data is considered newest. So L0 file + // count increases by two. ASSERT_OK(GenerateAndAddExternalFile( - options, {50, 90}, {ValueType::kTypeValue, ValueType::kTypeValue}, + options, {100, 140}, {ValueType::kTypeValue, ValueType::kTypeValue}, file_id++, &true_data)); ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), ++last_seqno); ASSERT_EQ(4, NumTableFilesAtLevel(0)); + ASSERT_EQ(1, NumTableFilesAtLevel(kNumLevels - 2)); + ASSERT_EQ(1, NumTableFilesAtLevel(options.num_levels - 1)); - // snapshot unneeded now that both range deletions are persisted + // snapshot unneeded now that all range deletions are persisted db_->ReleaseSnapshot(snapshot); // overlaps with nothing, so places at bottom level and skips incrementing // seqnum. 
ASSERT_OK(GenerateAndAddExternalFile( - options, {101, 125}, {ValueType::kTypeValue, ValueType::kTypeValue}, + options, {151, 175}, {ValueType::kTypeValue, ValueType::kTypeValue}, file_id++, &true_data)); ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno); ASSERT_EQ(4, NumTableFilesAtLevel(0)); - ASSERT_EQ(1, NumTableFilesAtLevel(options.num_levels - 1)); + ASSERT_EQ(1, NumTableFilesAtLevel(kNumLevels - 2)); + ASSERT_EQ(2, NumTableFilesAtLevel(options.num_levels - 1)); } #endif // ROCKSDB_LITE diff --git a/db/external_sst_file_ingestion_job.cc b/db/external_sst_file_ingestion_job.cc index 58fa35446..5ea8ecb61 100644 --- a/db/external_sst_file_ingestion_job.cc +++ b/db/external_sst_file_ingestion_job.cc @@ -376,6 +376,7 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo( Status ExternalSstFileIngestionJob::IngestedFilesOverlapWithMemtables( SuperVersion* sv, bool* overlap) { + *overlap = false; // Create an InternalIterator over all memtables Arena arena; ReadOptions ro; @@ -391,26 +392,33 @@ Status ExternalSstFileIngestionJob::IngestedFilesOverlapWithMemtables( memtable_range_del_iters.push_back(active_range_del_iter); } sv->imm->AddRangeTombstoneIterators(ro, &memtable_range_del_iters); - std::unique_ptr memtable_range_del_iter(NewMergingIterator( - &cfd_->internal_comparator(), - memtable_range_del_iters.empty() ? 
nullptr : &memtable_range_del_iters[0], - static_cast(memtable_range_del_iters.size()))); - + RangeDelAggregator range_del_agg(cfd_->internal_comparator(), + {} /* snapshots */, + false /* collapse_deletions */); Status status; - *overlap = false; - for (IngestedFileInfo& f : files_to_ingest_) { - status = - IngestedFileOverlapWithIteratorRange(&f, memtable_iter.get(), overlap); - if (!status.ok() || *overlap == true) { - break; - } - status = IngestedFileOverlapWithRangeDeletions( - &f, memtable_range_del_iter.get(), overlap); - if (!status.ok() || *overlap == true) { - break; + { + std::unique_ptr memtable_range_del_iter( + NewMergingIterator(&cfd_->internal_comparator(), + memtable_range_del_iters.empty() + ? nullptr + : &memtable_range_del_iters[0], + static_cast(memtable_range_del_iters.size()))); + status = range_del_agg.AddTombstones(std::move(memtable_range_del_iter)); + } + if (status.ok()) { + for (IngestedFileInfo& f : files_to_ingest_) { + status = IngestedFileOverlapWithIteratorRange(&f, memtable_iter.get(), + overlap); + if (!status.ok() || *overlap == true) { + break; + } + if (range_del_agg.IsRangeOverlapped(f.smallest_user_key, + f.largest_user_key)) { + *overlap = true; + break; + } } } - return status; } @@ -575,34 +583,6 @@ Status ExternalSstFileIngestionJob::IngestedFileOverlapWithIteratorRange( return iter->status(); } -Status ExternalSstFileIngestionJob::IngestedFileOverlapWithRangeDeletions( - const IngestedFileInfo* file_to_ingest, InternalIterator* range_del_iter, - bool* overlap) { - auto* vstorage = cfd_->current()->storage_info(); - auto* ucmp = vstorage->InternalComparator()->user_comparator(); - - *overlap = false; - if (range_del_iter != nullptr) { - for (range_del_iter->SeekToFirst(); range_del_iter->Valid(); - range_del_iter->Next()) { - ParsedInternalKey parsed_key; - if (!ParseInternalKey(range_del_iter->key(), &parsed_key)) { - return Status::Corruption("corrupted range deletion key: " + - range_del_iter->key().ToString()); - } 
- RangeTombstone range_del(parsed_key, range_del_iter->value()); - if (ucmp->Compare(range_del.start_key_, - file_to_ingest->largest_user_key) <= 0 && - ucmp->Compare(file_to_ingest->smallest_user_key, - range_del.end_key_) <= 0) { - *overlap = true; - break; - } - } - } - return Status::OK(); -} - bool ExternalSstFileIngestionJob::IngestedFileFitInLevel( const IngestedFileInfo* file_to_ingest, int level) { if (level == 0) { @@ -639,23 +619,26 @@ Status ExternalSstFileIngestionJob::IngestedFileOverlapWithLevel( ro.total_order_seek = true; MergeIteratorBuilder merge_iter_builder(&cfd_->internal_comparator(), &arena); + // Files are opened lazily when the iterator needs them, thus range deletions + // are also added lazily to the aggregator. We need to check for range + // deletion overlap only in the case where there's no point-key overlap. Then, + // we've already opened the file with range containing the ingested file's + // begin key, and iterated through all files until the one containing the + // ingested file's end key. So any files maybe containing range deletions + // overlapping the ingested file must have been opened and had their range + // deletions added to the aggregator. + RangeDelAggregator range_del_agg(cfd_->internal_comparator(), + {} /* snapshots */, + false /* collapse_deletions */); sv->current->AddIteratorsForLevel(ro, env_options_, &merge_iter_builder, lvl, - nullptr /* range_del_agg */); + &range_del_agg); ScopedArenaIterator level_iter(merge_iter_builder.Finish()); - - std::vector level_range_del_iters; - sv->current->AddRangeDelIteratorsForLevel(ro, env_options_, lvl, - &level_range_del_iters); - std::unique_ptr level_range_del_iter(NewMergingIterator( - &cfd_->internal_comparator(), - level_range_del_iters.empty() ? 
nullptr : &level_range_del_iters[0], - static_cast(level_range_del_iters.size()))); - Status status = IngestedFileOverlapWithIteratorRange( file_to_ingest, level_iter.get(), overlap_with_level); - if (status.ok() && *overlap_with_level == false) { - status = IngestedFileOverlapWithRangeDeletions( - file_to_ingest, level_range_del_iter.get(), overlap_with_level); + if (status.ok() && *overlap_with_level == false && + range_del_agg.IsRangeOverlapped(file_to_ingest->smallest_user_key, + file_to_ingest->largest_user_key)) { + *overlap_with_level = true; } return status; } diff --git a/db/external_sst_file_ingestion_job.h b/db/external_sst_file_ingestion_job.h index 2d0fadeed..e42c50603 100644 --- a/db/external_sst_file_ingestion_job.h +++ b/db/external_sst_file_ingestion_job.h @@ -139,13 +139,6 @@ class ExternalSstFileIngestionJob { const IngestedFileInfo* file_to_ingest, InternalIterator* iter, bool* overlap); - // Check if `file_to_ingest` key range overlaps with any range deletions - // specified by `iter`. 
- // REQUIRES: Mutex held - Status IngestedFileOverlapWithRangeDeletions( - const IngestedFileInfo* file_to_ingest, InternalIterator* range_del_iter, - bool* overlap); - // Check if `file_to_ingest` key range overlap with level // REQUIRES: Mutex held Status IngestedFileOverlapWithLevel(SuperVersion* sv, diff --git a/db/flush_job.cc b/db/flush_job.cc index 846edb407..778c9eca1 100644 --- a/db/flush_job.cc +++ b/db/flush_job.cc @@ -301,6 +301,8 @@ Status FlushJob::WriteLevel0Table() { db_options_.env->GetCurrentTime(&_current_time); // ignore error const uint64_t current_time = static_cast(_current_time); + uint64_t oldest_key_time = mems_.front()->ApproximateOldestKeyTime(); + s = BuildTable( dbname_, db_options_.env, *cfd_->ioptions(), mutable_cf_options_, optimized_env_options, cfd_->table_cache(), iter.get(), @@ -311,7 +313,8 @@ Status FlushJob::WriteLevel0Table() { cfd_->ioptions()->compression_opts, mutable_cf_options_.paranoid_file_checks, cfd_->internal_stats(), TableFileCreationReason::kFlush, event_logger_, job_context_->job_id, - Env::IO_HIGH, &table_properties_, 0 /* level */, current_time); + Env::IO_HIGH, &table_properties_, 0 /* level */, current_time, + oldest_key_time); LogFlush(db_options_.info_log); } ROCKS_LOG_INFO(db_options_.info_log, diff --git a/db/internal_stats.cc b/db/internal_stats.cc index 54723ea91..e98bd98cf 100644 --- a/db/internal_stats.cc +++ b/db/internal_stats.cc @@ -13,8 +13,9 @@ #endif #include -#include #include +#include +#include #include #include #include "db/column_family.h" @@ -243,6 +244,7 @@ static const std::string num_running_flushes = "num-running-flushes"; static const std::string actual_delayed_write_rate = "actual-delayed-write-rate"; static const std::string is_write_stopped = "is-write-stopped"; +static const std::string estimate_oldest_key_time = "estimate-oldest-key-time"; const std::string DB::Properties::kNumFilesAtLevelPrefix = rocksdb_prefix + num_files_at_level_prefix; @@ -316,6 +318,8 @@ const std::string 
DB::Properties::kActualDelayedWriteRate = rocksdb_prefix + actual_delayed_write_rate; const std::string DB::Properties::kIsWriteStopped = rocksdb_prefix + is_write_stopped; +const std::string DB::Properties::kEstimateOldestKeyTime = + rocksdb_prefix + estimate_oldest_key_time; const std::unordered_map InternalStats::ppt_name_to_info = { @@ -414,6 +418,9 @@ const std::unordered_map nullptr}}, {DB::Properties::kIsWriteStopped, {false, nullptr, &InternalStats::HandleIsWriteStopped, nullptr}}, + {DB::Properties::kEstimateOldestKeyTime, + {false, nullptr, &InternalStats::HandleEstimateOldestKeyTime, + nullptr}}, }; const DBPropertyInfo* GetPropertyInfo(const Slice& property) { @@ -775,6 +782,35 @@ bool InternalStats::HandleIsWriteStopped(uint64_t* value, DBImpl* db, return true; } +bool InternalStats::HandleEstimateOldestKeyTime(uint64_t* value, DBImpl* /*db*/, + Version* /*version*/) { + // TODO(yiwu): The property is currently available for fifo compaction + // with allow_compaction = false. This is because we don't propagate + // oldest_key_time on compaction. 
+ if (cfd_->ioptions()->compaction_style != kCompactionStyleFIFO || + cfd_->ioptions()->compaction_options_fifo.allow_compaction) { + return false; + } + + TablePropertiesCollection collection; + auto s = cfd_->current()->GetPropertiesOfAllTables(&collection); + if (!s.ok()) { + return false; + } + *value = std::numeric_limits::max(); + for (auto& p : collection) { + *value = std::min(*value, p.second->oldest_key_time); + if (*value == 0) { + break; + } + } + if (*value > 0) { + *value = std::min({cfd_->mem()->ApproximateOldestKeyTime(), + cfd_->imm()->ApproximateOldestKeyTime(), *value}); + } + return *value > 0 && *value < std::numeric_limits::max(); +} + void InternalStats::DumpDBStats(std::string* value) { char buf[1000]; // DB-level stats, only available from default column family diff --git a/db/internal_stats.h b/db/internal_stats.h index 1dd393f73..a0b8a9027 100644 --- a/db/internal_stats.h +++ b/db/internal_stats.h @@ -475,6 +475,8 @@ class InternalStats { bool HandleActualDelayedWriteRate(uint64_t* value, DBImpl* db, Version* version); bool HandleIsWriteStopped(uint64_t* value, DBImpl* db, Version* version); + bool HandleEstimateOldestKeyTime(uint64_t* value, DBImpl* db, + Version* version); // Total number of background errors encountered. Every time a flush task // or compaction task fails, this counter is incremented. 
The failure can diff --git a/db/memtable.cc b/db/memtable.cc index a24989123..d51b26187 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -9,8 +9,9 @@ #include "db/memtable.h" -#include #include +#include +#include #include "db/dbformat.h" #include "db/merge_context.h" @@ -37,10 +38,10 @@ namespace rocksdb { -MemTableOptions::MemTableOptions(const ImmutableCFOptions& ioptions, - const MutableCFOptions& mutable_cf_options) - : write_buffer_size(mutable_cf_options.write_buffer_size), - arena_block_size(mutable_cf_options.arena_block_size), +ImmutableMemTableOptions::ImmutableMemTableOptions( + const ImmutableCFOptions& ioptions, + const MutableCFOptions& mutable_cf_options) + : arena_block_size(mutable_cf_options.arena_block_size), memtable_prefix_bloom_bits( static_cast( static_cast(mutable_cf_options.write_buffer_size) * @@ -81,6 +82,7 @@ MemTable::MemTable(const InternalKeyComparator& cmp, data_size_(0), num_entries_(0), num_deletes_(0), + write_buffer_size_(mutable_cf_options.write_buffer_size), flush_in_progress_(false), flush_completed_(false), file_number_(0), @@ -96,7 +98,8 @@ MemTable::MemTable(const InternalKeyComparator& cmp, flush_state_(FLUSH_NOT_REQUESTED), env_(ioptions.env), insert_with_hint_prefix_extractor_( - ioptions.memtable_insert_with_hint_prefix_extractor) { + ioptions.memtable_insert_with_hint_prefix_extractor), + oldest_key_time_(std::numeric_limits::max()) { UpdateFlushState(); // something went wrong if we need to flush before inserting anything assert(!ShouldScheduleFlush()); @@ -133,6 +136,7 @@ size_t MemTable::ApproximateMemoryUsage() { } bool MemTable::ShouldFlushNow() const { + size_t write_buffer_size = write_buffer_size_.load(std::memory_order_relaxed); // In a lot of times, we cannot allocate arena blocks that exactly matches the // buffer size. Thus we have to decide if we should over-allocate or // under-allocate. 
@@ -150,16 +154,14 @@ bool MemTable::ShouldFlushNow() const { // if we can still allocate one more block without exceeding the // over-allocation ratio, then we should not flush. if (allocated_memory + kArenaBlockSize < - moptions_.write_buffer_size + - kArenaBlockSize * kAllowOverAllocationRatio) { + write_buffer_size + kArenaBlockSize * kAllowOverAllocationRatio) { return false; } - // if user keeps adding entries that exceeds moptions.write_buffer_size, - // we need to flush earlier even though we still have much available - // memory left. - if (allocated_memory > moptions_.write_buffer_size + - kArenaBlockSize * kAllowOverAllocationRatio) { + // if user keeps adding entries that exceeds write_buffer_size, we need to + // flush earlier even though we still have much available memory left. + if (allocated_memory > + write_buffer_size + kArenaBlockSize * kAllowOverAllocationRatio) { return true; } @@ -202,6 +204,21 @@ void MemTable::UpdateFlushState() { } } +void MemTable::UpdateOldestKeyTime() { + uint64_t oldest_key_time = oldest_key_time_.load(std::memory_order_relaxed); + if (oldest_key_time == std::numeric_limits::max()) { + int64_t current_time = 0; + auto s = env_->GetCurrentTime(¤t_time); + if (s.ok()) { + assert(current_time >= 0); + // If fail, the timestamp is already set. + oldest_key_time_.compare_exchange_strong( + oldest_key_time, static_cast(current_time), + std::memory_order_relaxed, std::memory_order_relaxed); + } + } +} + int MemTable::KeyComparator::operator()(const char* prefix_len_key1, const char* prefix_len_key2) const { // Internal keys are encoded as length-prefixed strings. 
@@ -247,7 +264,8 @@ class MemTableIterator : public InternalIterator { comparator_(mem.comparator_), valid_(false), arena_mode_(arena != nullptr), - value_pinned_(!mem.GetMemTableOptions()->inplace_update_support) { + value_pinned_( + !mem.GetImmutableMemTableOptions()->inplace_update_support) { if (use_range_del_table) { iter_ = mem.range_del_table_->GetIterator(arena); } else if (prefix_extractor_ != nullptr && !read_options.total_order_seek) { @@ -516,6 +534,7 @@ void MemTable::Add(SequenceNumber s, ValueType type, if (is_range_del_table_empty_ && type == kTypeRangeDeletion) { is_range_del_table_empty_ = false; } + UpdateOldestKeyTime(); } // Callback from MemTable::Get() @@ -537,6 +556,7 @@ struct Saver { Statistics* statistics; bool inplace_update_support; Env* env_; + bool* is_blob_index; }; } // namespace @@ -566,11 +586,26 @@ static bool SaveValue(void* arg, const char* entry) { ValueType type; UnPackSequenceAndType(tag, &s->seq, &type); - if ((type == kTypeValue || type == kTypeMerge) && + if ((type == kTypeValue || type == kTypeMerge || type == kTypeBlobIndex) && range_del_agg->ShouldDelete(Slice(key_ptr, key_length))) { type = kTypeRangeDeletion; } switch (type) { + case kTypeBlobIndex: + if (s->is_blob_index == nullptr) { + ROCKS_LOG_ERROR(s->logger, "Encounter unexpected blob index."); + *(s->status) = Status::NotSupported( + "Encounter unsupported blob value. 
Please open DB with " + "rocksdb::blob_db::BlobDB instead."); + } else if (*(s->merge_in_progress)) { + *(s->status) = + Status::NotSupported("Blob DB does not support merge operator."); + } + if (!s->status->ok()) { + *(s->found_final_value) = true; + return false; + } + // intentional fallthrough case kTypeValue: { if (s->inplace_update_support) { s->mem->GetLock(s->key->user_key())->ReadLock(); @@ -589,6 +624,9 @@ static bool SaveValue(void* arg, const char* entry) { s->mem->GetLock(s->key->user_key())->ReadUnlock(); } *(s->found_final_value) = true; + if (s->is_blob_index != nullptr) { + *(s->is_blob_index) = (type == kTypeBlobIndex); + } return false; } case kTypeDeletion: @@ -635,7 +673,7 @@ static bool SaveValue(void* arg, const char* entry) { bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, MergeContext* merge_context, RangeDelAggregator* range_del_agg, SequenceNumber* seq, - const ReadOptions& read_opts) { + const ReadOptions& read_opts, bool* is_blob_index) { // The sequence number is updated synchronously in version_set.h if (IsEmpty()) { // Avoiding recording stats for speed. 
@@ -681,6 +719,7 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, saver.inplace_update_support = moptions_.inplace_update_support; saver.statistics = moptions_.statistics; saver.env_ = env_; + saver.is_blob_index = is_blob_index; table_->Get(key, &saver, SaveValue); *seq = saver.seq; diff --git a/db/memtable.h b/db/memtable.h index fe9feaf57..4f63818ee 100644 --- a/db/memtable.h +++ b/db/memtable.h @@ -35,11 +35,9 @@ class MemTableIterator; class MergeContext; class InternalIterator; -struct MemTableOptions { - explicit MemTableOptions( - const ImmutableCFOptions& ioptions, - const MutableCFOptions& mutable_cf_options); - size_t write_buffer_size; +struct ImmutableMemTableOptions { + explicit ImmutableMemTableOptions(const ImmutableCFOptions& ioptions, + const MutableCFOptions& mutable_cf_options); size_t arena_block_size; uint32_t memtable_prefix_bloom_bits; size_t memtable_huge_page_size; @@ -187,13 +185,15 @@ class MemTable { // status returned indicates a corruption or other unexpected error. bool Get(const LookupKey& key, std::string* value, Status* s, MergeContext* merge_context, RangeDelAggregator* range_del_agg, - SequenceNumber* seq, const ReadOptions& read_opts); + SequenceNumber* seq, const ReadOptions& read_opts, + bool* is_blob_index = nullptr); bool Get(const LookupKey& key, std::string* value, Status* s, MergeContext* merge_context, RangeDelAggregator* range_del_agg, - const ReadOptions& read_opts) { + const ReadOptions& read_opts, bool* is_blob_index = nullptr) { SequenceNumber seq; - return Get(key, value, s, merge_context, range_del_agg, &seq, read_opts); + return Get(key, value, s, merge_context, range_del_agg, &seq, read_opts, + is_blob_index); } // Attempts to update the new_value inplace, else does normal Add @@ -258,6 +258,18 @@ class MemTable { return num_deletes_.load(std::memory_order_relaxed); } + // Dynamically change the memtable's capacity. 
If set below the current usage, + // the next key added will trigger a flush. Can only increase size when + // memtable prefix bloom is disabled, since we can't easily allocate more + // space. + void UpdateWriteBufferSize(size_t new_write_buffer_size) { + if (prefix_bloom_ == nullptr || + new_write_buffer_size < write_buffer_size_) { + write_buffer_size_.store(new_write_buffer_size, + std::memory_order_relaxed); + } + } + // Returns the edits area that is needed for flushing the memtable VersionEdit* GetEdits() { return &edit_; } @@ -346,7 +358,13 @@ class MemTable { return comparator_.comparator; } - const MemTableOptions* GetMemTableOptions() const { return &moptions_; } + const ImmutableMemTableOptions* GetImmutableMemTableOptions() const { + return &moptions_; + } + + uint64_t ApproximateOldestKeyTime() const { + return oldest_key_time_.load(std::memory_order_relaxed); + } private: enum FlushStateEnum { FLUSH_NOT_REQUESTED, FLUSH_REQUESTED, FLUSH_SCHEDULED }; @@ -356,7 +374,7 @@ class MemTable { friend class MemTableList; KeyComparator comparator_; - const MemTableOptions moptions_; + const ImmutableMemTableOptions moptions_; int refs_; const size_t kArenaBlockSize; AllocTracker mem_tracker_; @@ -370,6 +388,9 @@ class MemTable { std::atomic num_entries_; std::atomic num_deletes_; + // Dynamically changeable memtable option + std::atomic write_buffer_size_; + // These are used to manage memtable flushes to storage bool flush_in_progress_; // started the flush bool flush_completed_; // finished the flush @@ -411,12 +432,17 @@ class MemTable { // Insert hints for each prefix. 
std::unordered_map insert_hints_; + // Timestamp of oldest key + std::atomic oldest_key_time_; + // Returns a heuristic flush decision bool ShouldFlushNow() const; // Updates flush_state_ using ShouldFlushNow() void UpdateFlushState(); + void UpdateOldestKeyTime(); + // No copying allowed MemTable(const MemTable&); MemTable& operator=(const MemTable&); diff --git a/db/memtable_list.cc b/db/memtable_list.cc index 8f710c2e9..5921a50b3 100644 --- a/db/memtable_list.cc +++ b/db/memtable_list.cc @@ -10,6 +10,7 @@ #endif #include +#include #include #include "db/memtable.h" #include "db/version_set.h" @@ -103,35 +104,31 @@ int MemTableList::NumFlushed() const { bool MemTableListVersion::Get(const LookupKey& key, std::string* value, Status* s, MergeContext* merge_context, RangeDelAggregator* range_del_agg, - SequenceNumber* seq, - const ReadOptions& read_opts) { + SequenceNumber* seq, const ReadOptions& read_opts, + bool* is_blob_index) { return GetFromList(&memlist_, key, value, s, merge_context, range_del_agg, - seq, read_opts); + seq, read_opts, is_blob_index); } -bool MemTableListVersion::GetFromHistory(const LookupKey& key, - std::string* value, Status* s, - MergeContext* merge_context, - RangeDelAggregator* range_del_agg, - SequenceNumber* seq, - const ReadOptions& read_opts) { +bool MemTableListVersion::GetFromHistory( + const LookupKey& key, std::string* value, Status* s, + MergeContext* merge_context, RangeDelAggregator* range_del_agg, + SequenceNumber* seq, const ReadOptions& read_opts, bool* is_blob_index) { return GetFromList(&memlist_history_, key, value, s, merge_context, - range_del_agg, seq, read_opts); + range_del_agg, seq, read_opts, is_blob_index); } -bool MemTableListVersion::GetFromList(std::list* list, - const LookupKey& key, std::string* value, - Status* s, MergeContext* merge_context, - RangeDelAggregator* range_del_agg, - SequenceNumber* seq, - const ReadOptions& read_opts) { +bool MemTableListVersion::GetFromList( + std::list* list, const 
LookupKey& key, std::string* value, + Status* s, MergeContext* merge_context, RangeDelAggregator* range_del_agg, + SequenceNumber* seq, const ReadOptions& read_opts, bool* is_blob_index) { *seq = kMaxSequenceNumber; for (auto& memtable : *list) { SequenceNumber current_seq = kMaxSequenceNumber; bool done = memtable->Get(key, value, s, merge_context, range_del_agg, - ¤t_seq, read_opts); + ¤t_seq, read_opts, is_blob_index); if (*seq == kMaxSequenceNumber) { // Store the most recent sequence number of any operation on this key. // Since we only care about the most recent change, we only need to @@ -447,6 +444,13 @@ size_t MemTableList::ApproximateUnflushedMemTablesMemoryUsage() { size_t MemTableList::ApproximateMemoryUsage() { return current_memory_usage_; } +uint64_t MemTableList::ApproximateOldestKeyTime() const { + if (!current_->memlist_.empty()) { + return current_->memlist_.back()->ApproximateOldestKeyTime(); + } + return std::numeric_limits::max(); +} + void MemTableList::InstallNewVersion() { if (current_->refs_ == 1) { // we're the only one using the version, just keep using it diff --git a/db/memtable_list.h b/db/memtable_list.h index ed475b83a..69038af50 100644 --- a/db/memtable_list.h +++ b/db/memtable_list.h @@ -54,13 +54,15 @@ class MemTableListVersion { // returned). Otherwise, *seq will be set to kMaxSequenceNumber. 
bool Get(const LookupKey& key, std::string* value, Status* s, MergeContext* merge_context, RangeDelAggregator* range_del_agg, - SequenceNumber* seq, const ReadOptions& read_opts); + SequenceNumber* seq, const ReadOptions& read_opts, + bool* is_blob_index = nullptr); bool Get(const LookupKey& key, std::string* value, Status* s, MergeContext* merge_context, RangeDelAggregator* range_del_agg, - const ReadOptions& read_opts) { + const ReadOptions& read_opts, bool* is_blob_index = nullptr) { SequenceNumber seq; - return Get(key, value, s, merge_context, range_del_agg, &seq, read_opts); + return Get(key, value, s, merge_context, range_del_agg, &seq, read_opts, + is_blob_index); } // Similar to Get(), but searches the Memtable history of memtables that @@ -70,14 +72,16 @@ class MemTableListVersion { bool GetFromHistory(const LookupKey& key, std::string* value, Status* s, MergeContext* merge_context, RangeDelAggregator* range_del_agg, SequenceNumber* seq, - const ReadOptions& read_opts); + const ReadOptions& read_opts, + bool* is_blob_index = nullptr); bool GetFromHistory(const LookupKey& key, std::string* value, Status* s, MergeContext* merge_context, RangeDelAggregator* range_del_agg, - const ReadOptions& read_opts) { + const ReadOptions& read_opts, + bool* is_blob_index = nullptr) { SequenceNumber seq; return GetFromHistory(key, value, s, merge_context, range_del_agg, &seq, - read_opts); + read_opts, is_blob_index); } Status AddRangeTombstoneIterators(const ReadOptions& read_opts, Arena* arena, @@ -117,7 +121,7 @@ class MemTableListVersion { bool GetFromList(std::list* list, const LookupKey& key, std::string* value, Status* s, MergeContext* merge_context, RangeDelAggregator* range_del_agg, SequenceNumber* seq, - const ReadOptions& read_opts); + const ReadOptions& read_opts, bool* is_blob_index = nullptr); void AddMemTable(MemTable* m); @@ -217,6 +221,9 @@ class MemTableList { // the unflushed mem-tables. 
size_t ApproximateUnflushedMemTablesMemoryUsage(); + // Returns an estimate of the timestamp of the earliest key. + uint64_t ApproximateOldestKeyTime() const; + // Request a flush of all existing memtables to storage. This will // cause future calls to IsFlushPending() to return true if this list is // non-empty (regardless of the min_write_buffer_number_to_merge diff --git a/db/merge_helper.cc b/db/merge_helper.cc index 142486e5e..55f8254cf 100644 --- a/db/merge_helper.cc +++ b/db/merge_helper.cc @@ -18,6 +18,33 @@ namespace rocksdb { +MergeHelper::MergeHelper(Env* env, const Comparator* user_comparator, + const MergeOperator* user_merge_operator, + const CompactionFilter* compaction_filter, + Logger* logger, bool assert_valid_internal_key, + SequenceNumber latest_snapshot, int level, + Statistics* stats, + const std::atomic* shutting_down) + : env_(env), + user_comparator_(user_comparator), + user_merge_operator_(user_merge_operator), + compaction_filter_(compaction_filter), + shutting_down_(shutting_down), + logger_(logger), + assert_valid_internal_key_(assert_valid_internal_key), + allow_single_operand_(false), + latest_snapshot_(latest_snapshot), + level_(level), + keys_(), + filter_timer_(env_), + total_filter_time_(0U), + stats_(stats) { + assert(user_comparator_ != nullptr); + if (user_merge_operator_) { + allow_single_operand_ = user_merge_operator_->AllowSingleOperand(); + } +} + Status MergeHelper::TimedFullMerge(const MergeOperator* merge_operator, const Slice& key, const Slice* value, const std::vector& operands, @@ -201,12 +228,11 @@ Status MergeHelper::MergeUntil(InternalIterator* iter, ikey.sequence <= latest_snapshot_ ? 
CompactionFilter::Decision::kKeep : FilterMerge(orig_ikey.user_key, value_slice); - if (range_del_agg != nullptr && - + if (filter != CompactionFilter::Decision::kRemoveAndSkipUntil && + range_del_agg != nullptr && range_del_agg->ShouldDelete( iter->key(), - RangeDelAggregator::RangePositioningMode::kForwardTraversal) && - filter != CompactionFilter::Decision::kRemoveAndSkipUntil) { + RangeDelAggregator::RangePositioningMode::kForwardTraversal)) { filter = CompactionFilter::Decision::kRemove; } if (filter == CompactionFilter::Decision::kKeep || @@ -289,7 +315,8 @@ Status MergeHelper::MergeUntil(InternalIterator* iter, // Attempt to use the user's associative merge function to // merge the stacked merge operands into a single operand. s = Status::MergeInProgress(); - if (merge_context_.GetNumOperands() >= 2) { + if (merge_context_.GetNumOperands() >= 2 || + (allow_single_operand_ && merge_context_.GetNumOperands() == 1)) { bool merge_success = false; std::string merge_result; { diff --git a/db/merge_helper.h b/db/merge_helper.h index 59da47a6b..b9ef12a4c 100644 --- a/db/merge_helper.h +++ b/db/merge_helper.h @@ -34,22 +34,7 @@ class MergeHelper { const CompactionFilter* compaction_filter, Logger* logger, bool assert_valid_internal_key, SequenceNumber latest_snapshot, int level = 0, Statistics* stats = nullptr, - const std::atomic* shutting_down = nullptr) - : env_(env), - user_comparator_(user_comparator), - user_merge_operator_(user_merge_operator), - compaction_filter_(compaction_filter), - shutting_down_(shutting_down), - logger_(logger), - assert_valid_internal_key_(assert_valid_internal_key), - latest_snapshot_(latest_snapshot), - level_(level), - keys_(), - filter_timer_(env_), - total_filter_time_(0U), - stats_(stats) { - assert(user_comparator_ != nullptr); - } + const std::atomic* shutting_down = nullptr); // Wrapper around MergeOperator::FullMergeV2() that records perf statistics. // Result of merge will be written to result if status returned is OK. 
@@ -158,6 +143,7 @@ class MergeHelper { const std::atomic* shutting_down_; Logger* logger_; bool assert_valid_internal_key_; // enforce no internal key corruption? + bool allow_single_operand_; SequenceNumber latest_snapshot_; int level_; diff --git a/db/range_del_aggregator.cc b/db/range_del_aggregator.cc index a4fb64294..6caf89b31 100644 --- a/db/range_del_aggregator.cc +++ b/db/range_del_aggregator.cc @@ -144,6 +144,29 @@ bool RangeDelAggregator::ShouldDelete( return parsed.sequence < tombstone_map_iter->second.seq_; } +bool RangeDelAggregator::IsRangeOverlapped(const Slice& start, + const Slice& end) { + // so far only implemented for non-collapsed mode since file ingestion (only + // client) doesn't use collapsing + assert(!collapse_deletions_); + if (rep_ == nullptr) { + return false; + } + for (const auto& seqnum_and_tombstone_map : rep_->stripe_map_) { + for (const auto& start_key_and_tombstone : + seqnum_and_tombstone_map.second.raw_map) { + const auto& tombstone = start_key_and_tombstone.second; + if (icmp_.user_comparator()->Compare(start, tombstone.end_key_) < 0 && + icmp_.user_comparator()->Compare(tombstone.start_key_, end) <= 0 && + icmp_.user_comparator()->Compare(tombstone.start_key_, + tombstone.end_key_) < 0) { + return true; + } + } + } + return false; +} + bool RangeDelAggregator::ShouldAddTombstones( bool bottommost_level /* = false */) { // TODO(andrewkr): can we just open a file and throw it away if it ends up @@ -357,7 +380,8 @@ Status RangeDelAggregator::AddTombstone(RangeTombstone tombstone) { ++new_range_dels_iter; } } else { - tombstone_map.emplace(tombstone.start_key_, std::move(tombstone)); + auto start_key = tombstone.start_key_; + tombstone_map.emplace(start_key, std::move(tombstone)); } return Status::OK(); } diff --git a/db/range_del_aggregator.h b/db/range_del_aggregator.h index 9d4b8ca16..6a5a266ef 100644 --- a/db/range_del_aggregator.h +++ b/db/range_del_aggregator.h @@ -77,6 +77,16 @@ class RangeDelAggregator { 
RangePositioningMode mode = kFullScan); bool ShouldDelete(const Slice& internal_key, RangePositioningMode mode = kFullScan); + + // Checks whether range deletions cover any keys between `start` and `end`, + // inclusive. + // + // @param start User key representing beginning of range to check for overlap. + // @param end User key representing end of range to check for overlap. This + // argument is inclusive, so the existence of a range deletion covering + // `end` causes this to return true. + bool IsRangeOverlapped(const Slice& start, const Slice& end); + bool ShouldAddTombstones(bool bottommost_level = false); // Adds tombstones to the tombstone aggregation structure maintained by this diff --git a/db/range_del_aggregator_test.cc b/db/range_del_aggregator_test.cc index 39029bd2a..5896a5638 100644 --- a/db/range_del_aggregator_test.cc +++ b/db/range_del_aggregator_test.cc @@ -28,9 +28,9 @@ enum Direction { void VerifyRangeDels(const std::vector& range_dels, const std::vector& expected_points) { + auto icmp = InternalKeyComparator(BytewiseComparator()); // Test same result regardless of which order the range deletions are added. 
for (Direction dir : {kForward, kReverse}) { - auto icmp = InternalKeyComparator(BytewiseComparator()); RangeDelAggregator range_del_agg(icmp, {} /* snapshots */, true); std::vector keys, values; for (const auto& range_del : range_dels) { @@ -62,6 +62,27 @@ void VerifyRangeDels(const std::vector& range_dels, } } } + + RangeDelAggregator range_del_agg(icmp, {} /* snapshots */, + false /* collapse_deletions */); + std::vector keys, values; + for (const auto& range_del : range_dels) { + auto key_and_value = range_del.Serialize(); + keys.push_back(key_and_value.first.Encode().ToString()); + values.push_back(key_and_value.second.ToString()); + } + std::unique_ptr range_del_iter( + new test::VectorIterator(keys, values)); + range_del_agg.AddTombstones(std::move(range_del_iter)); + for (size_t i = 1; i < expected_points.size(); ++i) { + bool overlapped = range_del_agg.IsRangeOverlapped( + expected_points[i - 1].begin, expected_points[i].begin); + if (expected_points[i - 1].seq > 0 || expected_points[i].seq > 0) { + ASSERT_TRUE(overlapped); + } else { + ASSERT_FALSE(overlapped); + } + } } } // anonymous namespace @@ -112,9 +133,14 @@ TEST_F(RangeDelAggregatorTest, SameEndKey) { } TEST_F(RangeDelAggregatorTest, GapsBetweenRanges) { - VerifyRangeDels( - {{"a", "b", 5}, {"c", "d", 10}, {"e", "f", 15}}, - {{" ", 0}, {"a", 5}, {"b", 0}, {"c", 10}, {"d", 0}, {"e", 15}, {"f", 0}}); + VerifyRangeDels({{"a", "b", 5}, {"c", "d", 10}, {"e", "f", 15}}, {{" ", 0}, + {"a", 5}, + {"b", 0}, + {"c", 10}, + {"d", 0}, + {"da", 0}, + {"e", 15}, + {"f", 0}}); } // Note the Cover* tests also test cases where tombstones are inserted under a diff --git a/db/repair.cc b/db/repair.cc index c248e6f43..9ed326032 100644 --- a/db/repair.cc +++ b/db/repair.cc @@ -176,6 +176,7 @@ class Repairer { status = db_impl->NewDB(); delete db_impl; } + if (status.ok()) { // Recover using the fresh manifest created by NewDB() status = @@ -246,9 +247,21 @@ class Repairer { Status FindFiles() { std::vector filenames; 
bool found_file = false; + std::vector to_search_paths; + for (size_t path_id = 0; path_id < db_options_.db_paths.size(); path_id++) { + to_search_paths.push_back(db_options_.db_paths[path_id].path); + } + + // search wal_dir if user uses a customize wal_dir + if (!db_options_.wal_dir.empty() && + db_options_.wal_dir != dbname_) { + to_search_paths.push_back(db_options_.wal_dir); + } + + for (size_t path_id = 0; path_id < to_search_paths.size(); path_id++) { Status status = - env_->GetChildren(db_options_.db_paths[path_id].path, &filenames); + env_->GetChildren(to_search_paths[path_id], &filenames); if (!status.ok()) { return status; } @@ -261,14 +274,12 @@ class Repairer { for (size_t i = 0; i < filenames.size(); i++) { if (ParseFileName(filenames[i], &number, &type)) { if (type == kDescriptorFile) { - assert(path_id == 0); manifests_.push_back(filenames[i]); } else { if (number + 1 > next_file_number_) { next_file_number_ = number + 1; } if (type == kLogFile) { - assert(path_id == 0); logs_.push_back(number); } else if (type == kTableFile) { table_fds_.emplace_back(number, static_cast(path_id), @@ -288,7 +299,8 @@ class Repairer { void ConvertLogFilesToTables() { for (size_t i = 0; i < logs_.size(); i++) { - std::string logname = LogFileName(dbname_, logs_[i]); + // we should use LogFileName(wal_dir, logs_[i]) here. user might uses wal_dir option. 
+ std::string logname = LogFileName(db_options_.wal_dir, logs_[i]); Status status = ConvertLogToTable(logs_[i]); if (!status.ok()) { ROCKS_LOG_WARN(db_options_.info_log, @@ -312,7 +324,7 @@ class Repairer { }; // Open the log file - std::string logname = LogFileName(dbname_, log); + std::string logname = LogFileName(db_options_.wal_dir, log); unique_ptr lfile; Status status = env_->NewSequentialFile( logname, &lfile, env_->OptimizeForLogRead(env_options_)); diff --git a/db/repair_test.cc b/db/repair_test.cc index 226e4e6d0..b267c6d16 100644 --- a/db/repair_test.cc +++ b/db/repair_test.cc @@ -174,6 +174,40 @@ TEST_F(RepairTest, UnflushedSst) { ASSERT_EQ(Get("key"), "val"); } +TEST_F(RepairTest, SeparateWalDir) { + do { + Options options = CurrentOptions(); + DestroyAndReopen(options); + Put("key", "val"); + Put("foo", "bar"); + VectorLogPtr wal_files; + ASSERT_OK(dbfull()->GetSortedWalFiles(wal_files)); + ASSERT_EQ(wal_files.size(), 1); + uint64_t total_ssts_size; + GetAllSSTFiles(&total_ssts_size); + ASSERT_EQ(total_ssts_size, 0); + std::string manifest_path = + DescriptorFileName(dbname_, dbfull()->TEST_Current_Manifest_FileNo()); + + Close(); + ASSERT_OK(env_->FileExists(manifest_path)); + ASSERT_OK(env_->DeleteFile(manifest_path)); + ASSERT_OK(RepairDB(dbname_, options)); + + // make sure that all WALs are converted to SSTables. + options.wal_dir = ""; + + Reopen(options); + ASSERT_OK(dbfull()->GetSortedWalFiles(wal_files)); + ASSERT_EQ(wal_files.size(), 0); + GetAllSSTFiles(&total_ssts_size); + ASSERT_GT(total_ssts_size, 0); + ASSERT_EQ(Get("key"), "val"); + ASSERT_EQ(Get("foo"), "bar"); + + } while(ChangeWalOptions()); +} + TEST_F(RepairTest, RepairMultipleColumnFamilies) { // Verify repair logic associates SST files with their original column // families. 
diff --git a/db/snapshot_impl.h b/db/snapshot_impl.h index b94602f2a..7dc405931 100644 --- a/db/snapshot_impl.h +++ b/db/snapshot_impl.h @@ -74,9 +74,11 @@ class SnapshotList { count_--; } - // retrieve all snapshot numbers. They are sorted in ascending order. + // retrieve all snapshot numbers up until max_seq. They are sorted in + // ascending order. std::vector GetAll( - SequenceNumber* oldest_write_conflict_snapshot = nullptr) { + SequenceNumber* oldest_write_conflict_snapshot = nullptr, + const SequenceNumber& max_seq = kMaxSequenceNumber) const { std::vector ret; if (oldest_write_conflict_snapshot != nullptr) { @@ -86,8 +88,11 @@ class SnapshotList { if (empty()) { return ret; } - SnapshotImpl* s = &list_; + const SnapshotImpl* s = &list_; while (s->next_ != &list_) { + if (s->next_->number_ > max_seq) { + break; + } ret.push_back(s->next_->number_); if (oldest_write_conflict_snapshot != nullptr && @@ -103,6 +108,22 @@ class SnapshotList { return ret; } + // Whether there is an active snapshot in range [lower_bound, upper_bound). 
+ bool HasSnapshotInRange(SequenceNumber lower_bound, + SequenceNumber upper_bound) { + if (empty()) { + return false; + } + const SnapshotImpl* s = &list_; + while (s->next_ != &list_) { + if (s->next_->number_ >= lower_bound) { + return s->next_->number_ < upper_bound; + } + s = s->next_; + } + return false; + } + // get the sequence number of the most recent snapshot SequenceNumber GetNewest() { if (empty()) { diff --git a/db/table_cache.cc b/db/table_cache.cc index 398556a08..25c3befa4 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -106,10 +106,10 @@ Status TableCache::GetTableReader( } StopWatch sw(ioptions_.env, ioptions_.statistics, TABLE_OPEN_IO_MICROS); std::unique_ptr file_reader( - new RandomAccessFileReader(std::move(file), fname, ioptions_.env, - ioptions_.statistics, record_read_stats, - file_read_hist, ioptions_.rate_limiter, - for_compaction)); + new RandomAccessFileReader( + std::move(file), fname, ioptions_.env, + record_read_stats ? ioptions_.statistics : nullptr, SST_READ_MICROS, + file_read_hist, ioptions_.rate_limiter, for_compaction)); s = ioptions_.table_factory->NewTableReader( TableReaderOptions(ioptions_, env_options, internal_comparator, skip_filters, level), @@ -225,8 +225,12 @@ InternalIterator* TableCache::NewIterator( } InternalIterator* result = nullptr; if (s.ok()) { - result = - table_reader->NewIterator(options, arena, &icomparator, skip_filters); + if (options.table_filter && + !options.table_filter(*table_reader->GetTableProperties())) { + result = NewEmptyInternalIterator(arena); + } else { + result = table_reader->NewIterator(options, arena, skip_filters); + } if (create_new_table_reader) { assert(handle == nullptr); result->RegisterCleanup(&DeleteTableReader, table_reader, nullptr); @@ -311,6 +315,7 @@ Status TableCache::Get(const ReadOptions& options, #ifndef ROCKSDB_LITE IterKey row_cache_key; std::string row_cache_entry_buffer; + // Check row cache if enabled. 
Since row cache does not currently store // sequence numbers, we cannot use it if we need to fetch the sequence. if (ioptions_.row_cache && !get_context->NeedToReadSequence()) { @@ -334,10 +339,26 @@ Status TableCache::Get(const ReadOptions& options, if (auto row_handle = ioptions_.row_cache->Lookup(row_cache_key.GetUserKey())) { + // Cleanable routine to release the cache entry + Cleanable value_pinner; + auto release_cache_entry_func = [](void* cache_to_clean, + void* cache_handle) { + ((Cache*)cache_to_clean)->Release((Cache::Handle*)cache_handle); + }; auto found_row_cache_entry = static_cast( ioptions_.row_cache->Value(row_handle)); - replayGetContextLog(*found_row_cache_entry, user_key, get_context); - ioptions_.row_cache->Release(row_handle); + // If it comes here value is located on the cache. + // found_row_cache_entry points to the value on cache, + // and value_pinner has cleanup procedure for the cached entry. + // After replayGetContextLog() returns, get_context.pinnable_slice_ + // will point to cache entry buffer (or a copy based on that) and + // cleanup routine under value_pinner will be delegated to + // get_context.pinnable_slice_. Cache entry is released when + // get_context.pinnable_slice_ is reset. + value_pinner.RegisterCleanup(release_cache_entry_func, + ioptions_.row_cache.get(), row_handle); + replayGetContextLog(*found_row_cache_entry, user_key, get_context, + &value_pinner); RecordTick(ioptions_.statistics, ROW_CACHE_HIT); done = true; } else { diff --git a/db/version_builder.cc b/db/version_builder.cc index bab8d11f5..e8db67527 100644 --- a/db/version_builder.cc +++ b/db/version_builder.cc @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -87,7 +88,16 @@ class VersionBuilder::Rep { Logger* info_log_; TableCache* table_cache_; VersionStorageInfo* base_vstorage_; + int num_levels_; LevelState* levels_; + // Store states of levels larger than num_levels_. 
We do this instead of + // storing them in levels_ to avoid regression in case there are no files + // on invalid levels. The version is not consistent if in the end the files + // on invalid levels don't cancel out. + std::map> invalid_levels_; + // Whether there are invalid new files or invalid deletion on levels larger + // than num_levels_. + bool has_invalid_levels_; FileComparator level_zero_cmp_; FileComparator level_nonzero_cmp_; @@ -97,8 +107,10 @@ class VersionBuilder::Rep { : env_options_(env_options), info_log_(info_log), table_cache_(table_cache), - base_vstorage_(base_vstorage) { - levels_ = new LevelState[base_vstorage_->num_levels()]; + base_vstorage_(base_vstorage), + num_levels_(base_vstorage->num_levels()), + has_invalid_levels_(false) { + levels_ = new LevelState[num_levels_]; level_zero_cmp_.sort_method = FileComparator::kLevel0; level_nonzero_cmp_.sort_method = FileComparator::kLevelNon0; level_nonzero_cmp_.internal_comparator = @@ -106,7 +118,7 @@ class VersionBuilder::Rep { } ~Rep() { - for (int level = 0; level < base_vstorage_->num_levels(); level++) { + for (int level = 0; level < num_levels_; level++) { const auto& added = levels_[level].added_files; for (auto& pair : added) { UnrefFile(pair.second); @@ -137,7 +149,7 @@ class VersionBuilder::Rep { } #endif // make sure the files are sorted correctly - for (int level = 0; level < vstorage->num_levels(); level++) { + for (int level = 0; level < num_levels_; level++) { auto& level_files = vstorage->LevelFiles(level); for (size_t i = 1; i < level_files.size(); i++) { auto f1 = level_files[i - 1]; @@ -196,7 +208,7 @@ class VersionBuilder::Rep { #endif // a file to be deleted better exist in the previous version bool found = false; - for (int l = 0; !found && l < base_vstorage_->num_levels(); l++) { + for (int l = 0; !found && l < num_levels_; l++) { const std::vector& base_files = base_vstorage_->LevelFiles(l); for (size_t i = 0; i < base_files.size(); i++) { @@ -210,7 +222,7 @@ class 
VersionBuilder::Rep { // if the file did not exist in the previous version, then it // is possibly moved from lower level to higher level in current // version - for (int l = level + 1; !found && l < base_vstorage_->num_levels(); l++) { + for (int l = level + 1; !found && l < num_levels_; l++) { auto& level_added = levels_[l].added_files; auto got = level_added.find(number); if (got != level_added.end()) { @@ -233,6 +245,19 @@ class VersionBuilder::Rep { } } + bool CheckConsistencyForNumLevels() { + // Make sure there are no files on or beyond num_levels(). + if (has_invalid_levels_) { + return false; + } + for (auto& level : invalid_levels_) { + if (level.second.size() > 0) { + return false; + } + } + return true; + } + // Apply all of the edits in *edit to the current state. void Apply(VersionEdit* edit) { CheckConsistency(base_vstorage_); @@ -242,26 +267,45 @@ class VersionBuilder::Rep { for (const auto& del_file : del) { const auto level = del_file.first; const auto number = del_file.second; - levels_[level].deleted_files.insert(number); - CheckConsistencyForDeletes(edit, number, level); - - auto exising = levels_[level].added_files.find(number); - if (exising != levels_[level].added_files.end()) { - UnrefFile(exising->second); - levels_[level].added_files.erase(number); + if (level < num_levels_) { + levels_[level].deleted_files.insert(number); + CheckConsistencyForDeletes(edit, number, level); + + auto exising = levels_[level].added_files.find(number); + if (exising != levels_[level].added_files.end()) { + UnrefFile(exising->second); + levels_[level].added_files.erase(number); + } + } else { + if (invalid_levels_[level].count(number) > 0) { + invalid_levels_[level].erase(number); + } else { + // Deleting an non-existing file on invalid level. 
+ has_invalid_levels_ = true; + } } } // Add new files for (const auto& new_file : edit->GetNewFiles()) { const int level = new_file.first; - FileMetaData* f = new FileMetaData(new_file.second); - f->refs = 1; - - assert(levels_[level].added_files.find(f->fd.GetNumber()) == - levels_[level].added_files.end()); - levels_[level].deleted_files.erase(f->fd.GetNumber()); - levels_[level].added_files[f->fd.GetNumber()] = f; + if (level < num_levels_) { + FileMetaData* f = new FileMetaData(new_file.second); + f->refs = 1; + + assert(levels_[level].added_files.find(f->fd.GetNumber()) == + levels_[level].added_files.end()); + levels_[level].deleted_files.erase(f->fd.GetNumber()); + levels_[level].added_files[f->fd.GetNumber()] = f; + } else { + uint64_t number = new_file.second.fd.GetNumber(); + if (invalid_levels_[level].count(number) == 0) { + invalid_levels_[level].insert(number); + } else { + // Creating an already existing file on invalid level. + has_invalid_levels_ = true; + } + } } } @@ -270,7 +314,7 @@ class VersionBuilder::Rep { CheckConsistency(base_vstorage_); CheckConsistency(vstorage); - for (int level = 0; level < base_vstorage_->num_levels(); level++) { + for (int level = 0; level < num_levels_; level++) { const auto& cmp = (level == 0) ? level_zero_cmp_ : level_nonzero_cmp_; // Merge the set of added files with the set of pre-existing files. // Drop any deleted files. Store the result in *v. 
@@ -325,7 +369,7 @@ class VersionBuilder::Rep { assert(table_cache_ != nullptr); // std::vector> files_meta; - for (int level = 0; level < base_vstorage_->num_levels(); level++) { + for (int level = 0; level < num_levels_; level++) { for (auto& file_meta_pair : levels_[level].added_files) { auto* file_meta = file_meta_pair.second; assert(!file_meta->table_reader_handle); @@ -386,24 +430,35 @@ VersionBuilder::VersionBuilder(const EnvOptions& env_options, VersionStorageInfo* base_vstorage, Logger* info_log) : rep_(new Rep(env_options, info_log, table_cache, base_vstorage)) {} + VersionBuilder::~VersionBuilder() { delete rep_; } + void VersionBuilder::CheckConsistency(VersionStorageInfo* vstorage) { rep_->CheckConsistency(vstorage); } + void VersionBuilder::CheckConsistencyForDeletes(VersionEdit* edit, uint64_t number, int level) { rep_->CheckConsistencyForDeletes(edit, number, level); } + +bool VersionBuilder::CheckConsistencyForNumLevels() { + return rep_->CheckConsistencyForNumLevels(); +} + void VersionBuilder::Apply(VersionEdit* edit) { rep_->Apply(edit); } + void VersionBuilder::SaveTo(VersionStorageInfo* vstorage) { rep_->SaveTo(vstorage); } + void VersionBuilder::LoadTableHandlers( InternalStats* internal_stats, int max_threads, bool prefetch_index_and_filter_in_cache) { rep_->LoadTableHandlers(internal_stats, max_threads, prefetch_index_and_filter_in_cache); } + void VersionBuilder::MaybeAddFile(VersionStorageInfo* vstorage, int level, FileMetaData* f) { rep_->MaybeAddFile(vstorage, level, f); diff --git a/db/version_builder.h b/db/version_builder.h index 235f79d7f..440d4eaf6 100644 --- a/db/version_builder.h +++ b/db/version_builder.h @@ -29,6 +29,7 @@ class VersionBuilder { void CheckConsistency(VersionStorageInfo* vstorage); void CheckConsistencyForDeletes(VersionEdit* edit, uint64_t number, int level); + bool CheckConsistencyForNumLevels(); void Apply(VersionEdit* edit); void SaveTo(VersionStorageInfo* vstorage); void LoadTableHandlers(InternalStats* 
internal_stats, int max_threads, diff --git a/db/version_set.cc b/db/version_set.cc index 0069d86c1..da55ad57a 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -328,10 +328,6 @@ Version::~Version() { assert(f->refs > 0); f->refs--; if (f->refs <= 0) { - if (f->table_reader_handle) { - cfd_->table_cache()->EraseHandle(f->fd, f->table_reader_handle); - f->table_reader_handle = nullptr; - } vset_->obsolete_files_.push_back(f); } } @@ -879,22 +875,6 @@ void Version::AddIteratorsForLevel(const ReadOptions& read_options, } } -void Version::AddRangeDelIteratorsForLevel( - const ReadOptions& read_options, const EnvOptions& soptions, int level, - std::vector* range_del_iters) { - range_del_iters->clear(); - for (size_t i = 0; i < storage_info_.LevelFilesBrief(level).num_files; i++) { - const auto& file = storage_info_.LevelFilesBrief(level).files[i]; - auto* range_del_iter = cfd_->table_cache()->NewRangeTombstoneIterator( - read_options, soptions, cfd_->internal_comparator(), file.fd, - cfd_->internal_stats()->GetFileReadHist(level), - false /* skip_filters */, level); - if (range_del_iter != nullptr) { - range_del_iters->push_back(range_del_iter); - } - } -} - VersionStorageInfo::VersionStorageInfo( const InternalKeyComparator* internal_comparator, const Comparator* user_comparator, int levels, @@ -969,7 +949,7 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k, PinnableSlice* value, Status* status, MergeContext* merge_context, RangeDelAggregator* range_del_agg, bool* value_found, - bool* key_exists, SequenceNumber* seq) { + bool* key_exists, SequenceNumber* seq, bool* is_blob) { Slice ikey = k.internal_key(); Slice user_key = k.user_key(); @@ -985,7 +965,7 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k, user_comparator(), merge_operator_, info_log_, db_statistics_, status->ok() ? 
GetContext::kNotFound : GetContext::kMerge, user_key, value, value_found, merge_context, range_del_agg, this->env_, seq, - merge_operator_ ? &pinned_iters_mgr : nullptr); + merge_operator_ ? &pinned_iters_mgr : nullptr, is_blob); // Pin blocks that we read to hold merge operands if (merge_operator_) { @@ -1034,6 +1014,12 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k, return; case GetContext::kMerge: break; + case GetContext::kBlobIndex: + ROCKS_LOG_ERROR(info_log_, "Encounter unexpected blob index."); + *status = Status::NotSupported( + "Encounter unexpected blob index. Please open DB with " + "rocksdb::blob_db::BlobDB instead."); + return; } f = fp.GetNextFile(); } @@ -1231,6 +1217,14 @@ int VersionStorageInfo::MaxInputLevel() const { return 0; } +int VersionStorageInfo::MaxOutputLevel(bool allow_ingest_behind) const { + if (allow_ingest_behind) { + assert(num_levels() > 1); + return num_levels() - 2; + } + return num_levels() - 1; +} + void VersionStorageInfo::EstimateCompactionBytesNeeded( const MutableCFOptions& mutable_cf_options) { // Only implemented for level-based compaction @@ -1783,27 +1777,33 @@ void VersionStorageInfo::GetOverlappingInputs( void VersionStorageInfo::GetCleanInputsWithinInterval( int level, const InternalKey* begin, const InternalKey* end, std::vector* inputs, int hint_index, int* file_index) const { - if (level >= num_non_empty_levels_) { + inputs->clear(); + if (file_index) { + *file_index = -1; + } + if (level >= num_non_empty_levels_ || level == 0 || + level_files_brief_[level].num_files == 0) { // this level is empty, no inputs within range + // also don't support clean input interval within L0 return; } - inputs->clear(); Slice user_begin, user_end; - if (begin != nullptr) { + const auto& level_files = level_files_brief_[level]; + if (begin == nullptr) { + user_begin = ExtractUserKey(level_files.files[0].smallest_key); + } else { user_begin = begin->user_key(); } - if (end != nullptr) { + if (end == 
nullptr) { + user_end = ExtractUserKey( + level_files.files[level_files.num_files - 1].largest_key); + } else { user_end = end->user_key(); } - if (file_index) { - *file_index = -1; - } - if (begin != nullptr && end != nullptr && level > 0) { - GetOverlappingInputsRangeBinarySearch(level, user_begin, user_end, inputs, - hint_index, file_index, - true /* within_interval */); - } + GetOverlappingInputsRangeBinarySearch(level, user_begin, user_end, inputs, + hint_index, file_index, + true /* within_interval */); } // Store in "*inputs" all files in "level" that overlap [begin,end] @@ -1866,8 +1866,8 @@ void VersionStorageInfo::GetOverlappingInputsRangeBinarySearch( } else { ExtendFileRangeOverlappingInterval(level, user_begin, user_end, mid, &start_index, &end_index); + assert(end_index >= start_index); } - assert(end_index >= start_index); // insert overlapping files into vector for (int i = start_index; i <= end_index; i++) { inputs->push_back(files_[level][i]); @@ -2331,10 +2331,14 @@ void CloseTables(void* ptr, size_t) { VersionSet::~VersionSet() { // we need to delete column_family_set_ because its destructor depends on // VersionSet - column_family_set_->get_table_cache()->ApplyToAllCacheEntries(&CloseTables, - false); + Cache* table_cache = column_family_set_->get_table_cache(); + table_cache->ApplyToAllCacheEntries(&CloseTables, false /* thread_safe */); column_family_set_.reset(); for (auto file : obsolete_files_) { + if (file->table_reader_handle) { + table_cache->Release(file->table_reader_handle); + TableCache::Evict(table_cache, file->fd.GetNumber()); + } delete file; } obsolete_files_.clear(); @@ -2834,11 +2838,6 @@ Status VersionSet::Recover( cfd = column_family_set_->GetColumnFamily(edit.column_family_); // this should never happen since cf_in_builders is true assert(cfd != nullptr); - if (edit.max_level_ >= cfd->current()->storage_info()->num_levels()) { - s = Status::InvalidArgument( - "db has more levels than options.num_levels"); - break; - } // if 
it is not column family add or column family drop, // then it's a file add/delete, which should be forwarded @@ -2922,6 +2921,18 @@ Status VersionSet::Recover( list_of_not_found); } + if (s.ok()) { + for (auto cfd : *column_family_set_) { + assert(builders.count(cfd->GetID()) > 0); + auto* builder = builders[cfd->GetID()]->version_builder(); + if (!builder->CheckConsistencyForNumLevels()) { + s = Status::InvalidArgument( + "db has more levels than options.num_levels"); + break; + } + } + } + if (s.ok()) { for (auto cfd : *column_family_set_) { if (cfd->IsDropped()) { diff --git a/db/version_set.h b/db/version_set.h index 5a1f8d07d..8b1fe2d64 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -147,6 +147,7 @@ class VersionStorageInfo { } int MaxInputLevel() const; + int MaxOutputLevel(bool allow_ingest_behind) const; // Return level number that has idx'th highest score int CompactionScoreLevel(int idx) const { return compaction_level_[idx]; } @@ -461,10 +462,6 @@ class Version { MergeIteratorBuilder* merger_iter_builder, int level, RangeDelAggregator* range_del_agg); - void AddRangeDelIteratorsForLevel( - const ReadOptions& read_options, const EnvOptions& soptions, int level, - std::vector* range_del_iters); - // Lookup the value for key. If found, store it in *val and // return OK. Else return a non-OK status. // Uses *operands to store merge_operator operations to apply later. @@ -484,7 +481,8 @@ class Version { void Get(const ReadOptions&, const LookupKey& key, PinnableSlice* value, Status* status, MergeContext* merge_context, RangeDelAggregator* range_del_agg, bool* value_found = nullptr, - bool* key_exists = nullptr, SequenceNumber* seq = nullptr); + bool* key_exists = nullptr, SequenceNumber* seq = nullptr, + bool* is_blob = nullptr); // Loads some stats information from files. Call without mutex held. It needs // to be called before applying the version to the version set. 
diff --git a/db/wal_manager.cc b/db/wal_manager.cc index 7ee2dd017..4a9ecbfdd 100644 --- a/db/wal_manager.cc +++ b/db/wal_manager.cc @@ -26,6 +26,7 @@ #include "rocksdb/env.h" #include "rocksdb/options.h" #include "rocksdb/write_batch.h" +#include "util/cast_util.h" #include "util/coding.h" #include "util/file_reader_writer.h" #include "util/filename.h" @@ -273,8 +274,8 @@ namespace { struct CompareLogByPointer { bool operator()(const std::unique_ptr& a, const std::unique_ptr& b) { - LogFileImpl* a_impl = dynamic_cast(a.get()); - LogFileImpl* b_impl = dynamic_cast(b.get()); + LogFileImpl* a_impl = static_cast_with_check(a.get()); + LogFileImpl* b_impl = static_cast_with_check(b.get()); return *a_impl < *b_impl; } }; diff --git a/db/write_batch.cc b/db/write_batch.cc index 91be9a0df..76fc94844 100644 --- a/db/write_batch.cc +++ b/db/write_batch.cc @@ -67,6 +67,7 @@ enum ContentFlags : uint32_t { HAS_COMMIT = 1 << 7, HAS_ROLLBACK = 1 << 8, HAS_DELETE_RANGE = 1 << 9, + HAS_BLOB_INDEX = 1 << 10, }; struct BatchContentClassifier : public WriteBatch::Handler { @@ -97,6 +98,11 @@ struct BatchContentClassifier : public WriteBatch::Handler { return Status::OK(); } + Status PutBlobIndexCF(uint32_t, const Slice&, const Slice&) override { + content_flags |= ContentFlags::HAS_BLOB_INDEX; + return Status::OK(); + } + Status MarkBeginPrepare() override { content_flags |= ContentFlags::HAS_BEGIN_PREPARE; return Status::OK(); @@ -328,6 +334,17 @@ Status ReadRecordFromWriteBatch(Slice* input, char* tag, return Status::Corruption("bad WriteBatch Merge"); } break; + case kTypeColumnFamilyBlobIndex: + if (!GetVarint32(input, column_family)) { + return Status::Corruption("bad WriteBatch BlobIndex"); + } + // intentional fallthrough + case kTypeBlobIndex: + if (!GetLengthPrefixedSlice(input, key) || + !GetLengthPrefixedSlice(input, value)) { + return Status::Corruption("bad WriteBatch BlobIndex"); + } + break; case kTypeLogData: assert(blob != nullptr); if (!GetLengthPrefixedSlice(input, 
blob)) { @@ -414,6 +431,13 @@ Status WriteBatch::Iterate(Handler* handler) const { s = handler->MergeCF(column_family, key, value); found++; break; + case kTypeColumnFamilyBlobIndex: + case kTypeBlobIndex: + assert(content_flags_.load(std::memory_order_relaxed) & + (ContentFlags::DEFERRED | ContentFlags::HAS_BLOB_INDEX)); + s = handler->PutBlobIndexCF(column_family, key, value); + found++; + break; case kTypeLogData: handler->LogData(blob); break; @@ -759,6 +783,25 @@ Status WriteBatch::Merge(ColumnFamilyHandle* column_family, value); } +Status WriteBatchInternal::PutBlobIndex(WriteBatch* b, + uint32_t column_family_id, + const Slice& key, const Slice& value) { + LocalSavePoint save(b); + WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1); + if (column_family_id == 0) { + b->rep_.push_back(static_cast(kTypeBlobIndex)); + } else { + b->rep_.push_back(static_cast(kTypeColumnFamilyBlobIndex)); + PutVarint32(&b->rep_, column_family_id); + } + PutLengthPrefixedSlice(&b->rep_, key); + PutLengthPrefixedSlice(&b->rep_, value); + b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | + ContentFlags::HAS_BLOB_INDEX, + std::memory_order_relaxed); + return save.commit(); +} + Status WriteBatch::PutLogData(const Slice& blob) { LocalSavePoint save(this); rep_.push_back(static_cast(kTypeLogData)); @@ -935,8 +978,8 @@ class MemTableInserter : public WriteBatch::Handler { return true; } - virtual Status PutCF(uint32_t column_family_id, const Slice& key, - const Slice& value) override { + Status PutCFImpl(uint32_t column_family_id, const Slice& key, + const Slice& value, ValueType value_type) { if (rebuilding_trx_ != nullptr) { WriteBatchInternal::Put(rebuilding_trx_, column_family_id, key, value); return Status::OK(); @@ -949,9 +992,9 @@ class MemTableInserter : public WriteBatch::Handler { } MemTable* mem = cf_mems_->GetMemTable(); - auto* moptions = mem->GetMemTableOptions(); + auto* moptions = mem->GetImmutableMemTableOptions(); if 
(!moptions->inplace_update_support) { - mem->Add(sequence_, kTypeValue, key, value, concurrent_memtable_writes_, + mem->Add(sequence_, value_type, key, value, concurrent_memtable_writes_, get_post_process_info(mem)); } else if (moptions->inplace_callback == nullptr) { assert(!concurrent_memtable_writes_); @@ -986,11 +1029,11 @@ class MemTableInserter : public WriteBatch::Handler { value, &merged_value); if (status == UpdateStatus::UPDATED_INPLACE) { // prev_value is updated in-place with final value. - mem->Add(sequence_, kTypeValue, key, Slice(prev_buffer, prev_size)); + mem->Add(sequence_, value_type, key, Slice(prev_buffer, prev_size)); RecordTick(moptions->statistics, NUMBER_KEYS_WRITTEN); } else if (status == UpdateStatus::UPDATED) { // merged_value contains the final value. - mem->Add(sequence_, kTypeValue, key, Slice(merged_value)); + mem->Add(sequence_, value_type, key, Slice(merged_value)); RecordTick(moptions->statistics, NUMBER_KEYS_WRITTEN); } } @@ -1003,6 +1046,11 @@ class MemTableInserter : public WriteBatch::Handler { return Status::OK(); } + virtual Status PutCF(uint32_t column_family_id, const Slice& key, + const Slice& value) override { + return PutCFImpl(column_family_id, key, value, kTypeValue); + } + Status DeleteImpl(uint32_t column_family_id, const Slice& key, const Slice& value, ValueType delete_type) { MemTable* mem = cf_mems_->GetMemTable(); @@ -1091,7 +1139,7 @@ class MemTableInserter : public WriteBatch::Handler { } MemTable* mem = cf_mems_->GetMemTable(); - auto* moptions = mem->GetMemTableOptions(); + auto* moptions = mem->GetImmutableMemTableOptions(); bool perform_merge = false; // If we pass DB through and options.max_successive_merges is hit @@ -1159,6 +1207,12 @@ class MemTableInserter : public WriteBatch::Handler { return Status::OK(); } + virtual Status PutBlobIndexCF(uint32_t column_family_id, const Slice& key, + const Slice& value) override { + // Same as PutCF except for value type. 
+ return PutCFImpl(column_family_id, key, value, kTypeBlobIndex); + } + void CheckMemtableFull() { if (flush_scheduler_ != nullptr) { auto* cfd = cf_mems_->current(); @@ -1303,6 +1357,7 @@ Status WriteBatchInternal::InsertInto(WriteThread::WriteGroup& write_group, continue; } SetSequence(w->batch, inserter.sequence()); + w->sequence = inserter.sequence(); inserter.set_log_number_ref(w->log_ref); w->status = w->batch->Iterate(&inserter); if (!w->status.ok()) { diff --git a/db/write_batch_internal.h b/db/write_batch_internal.h index 48a417ce8..2408686f1 100644 --- a/db/write_batch_internal.h +++ b/db/write_batch_internal.h @@ -99,6 +99,9 @@ class WriteBatchInternal { static Status Merge(WriteBatch* batch, uint32_t column_family_id, const SliceParts& key, const SliceParts& value); + static Status PutBlobIndex(WriteBatch* batch, uint32_t column_family_id, + const Slice& key, const Slice& value); + static Status MarkEndPrepare(WriteBatch* batch, const Slice& xid); static Status MarkRollback(WriteBatch* batch, const Slice& xid); diff --git a/db/write_batch_test.cc b/db/write_batch_test.cc index 388155b63..4584793ab 100644 --- a/db/write_batch_test.cc +++ b/db/write_batch_test.cc @@ -60,7 +60,7 @@ static std::string PrintContents(WriteBatch* b) { } for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { ParsedInternalKey ikey; - memset((void*)&ikey, 0, sizeof(ikey)); + ikey.clear(); EXPECT_TRUE(ParseInternalKey(iter->key(), &ikey)); switch (ikey.type) { case kTypeValue: @@ -451,20 +451,20 @@ TEST_F(WriteBatchTest, DISABLED_ManyUpdates) { } virtual Status DeleteCF(uint32_t column_family_id, const Slice& key) override { - EXPECT_TRUE(false); + ADD_FAILURE(); return Status::OK(); } virtual Status SingleDeleteCF(uint32_t column_family_id, const Slice& key) override { - EXPECT_TRUE(false); + ADD_FAILURE(); return Status::OK(); } virtual Status MergeCF(uint32_t column_family_id, const Slice& key, const Slice& value) override { - EXPECT_TRUE(false); + ADD_FAILURE(); return 
Status::OK(); } - virtual void LogData(const Slice& blob) override { EXPECT_TRUE(false); } + virtual void LogData(const Slice& blob) override { ADD_FAILURE(); } virtual bool Continue() override { return num_seen < kNumUpdates; } } handler; @@ -502,20 +502,20 @@ TEST_F(WriteBatchTest, DISABLED_LargeKeyValue) { } virtual Status DeleteCF(uint32_t column_family_id, const Slice& key) override { - EXPECT_TRUE(false); + ADD_FAILURE(); return Status::OK(); } virtual Status SingleDeleteCF(uint32_t column_family_id, const Slice& key) override { - EXPECT_TRUE(false); + ADD_FAILURE(); return Status::OK(); } virtual Status MergeCF(uint32_t column_family_id, const Slice& key, const Slice& value) override { - EXPECT_TRUE(false); + ADD_FAILURE(); return Status::OK(); } - virtual void LogData(const Slice& blob) override { EXPECT_TRUE(false); } + virtual void LogData(const Slice& blob) override { ADD_FAILURE(); } virtual bool Continue() override { return num_seen < 2; } } handler; diff --git a/db/write_callback_test.cc b/db/write_callback_test.cc index 9edf1c158..d2bf30a09 100644 --- a/db/write_callback_test.cc +++ b/db/write_callback_test.cc @@ -16,7 +16,6 @@ #include "rocksdb/db.h" #include "rocksdb/write_batch.h" #include "port/port.h" -#include "util/logging.h" #include "util/random.h" #include "util/sync_point.h" #include "util/testharness.h" @@ -107,6 +106,10 @@ TEST_F(WriteCallbackTest, WriteWithCallbackTest) { std::vector> kvs_; }; + // In each scenario we'll launch multiple threads to write. + // The size of each array equals the number of threads, and + // each boolean in it denotes whether the callback of the corresponding + // thread should succeed or fail. std::vector> write_scenarios = { {true}, {false}, @@ -145,23 +148,37 @@ TEST_F(WriteCallbackTest, WriteWithCallbackTest) { db_impl = dynamic_cast(db); ASSERT_TRUE(db_impl); - std::atomic threads_waiting(0); + // Writers that have called JoinBatchGroup. 
+ std::atomic threads_joining(0); + // Writers that have linked to the queue + std::atomic threads_linked(0); + // Writers that pass WriteThread::JoinBatchGroup:Wait sync-point. + std::atomic threads_verified(0); + std::atomic seq(db_impl->GetLatestSequenceNumber()); ASSERT_EQ(db_impl->GetLatestSequenceNumber(), 0); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "WriteThread::JoinBatchGroup:Start", [&](void*) { + uint64_t cur_threads_joining = threads_joining.fetch_add(1); + // Wait for the last joined writer to link to the queue. + // In this way the writers link to the queue one by one. + // This allows us to confidently detect the first writer + // who increases threads_linked as the leader. + while (threads_linked.load() < cur_threads_joining) { + } + }); + + // Verification once writers call JoinBatchGroup. rocksdb::SyncPoint::GetInstance()->SetCallBack( "WriteThread::JoinBatchGroup:Wait", [&](void* arg) { - uint64_t cur_threads_waiting = 0; + uint64_t cur_threads_linked = threads_linked.fetch_add(1); bool is_leader = false; bool is_last = false; // who am i - do { - cur_threads_waiting = threads_waiting.load(); - is_leader = (cur_threads_waiting == 0); - is_last = (cur_threads_waiting == write_group.size() - 1); - } while (!threads_waiting.compare_exchange_strong( - cur_threads_waiting, cur_threads_waiting + 1)); + is_leader = (cur_threads_linked == 0); + is_last = (cur_threads_linked == write_group.size() - 1); // check my state auto* writer = reinterpret_cast(arg); @@ -185,8 +202,10 @@ TEST_F(WriteCallbackTest, WriteWithCallbackTest) { !write_group.back().callback_.should_fail_); } - // wait for friends - while (threads_waiting.load() < write_group.size()) { + threads_verified.fetch_add(1); + // Wait here until all verification in this sync-point + // callback finish for all writers. 
+ while (threads_verified.load() < write_group.size()) { } }); @@ -211,17 +230,20 @@ TEST_F(WriteCallbackTest, WriteWithCallbackTest) { std::atomic thread_num(0); std::atomic dummy_key(0); + + // Each write thread creates a random write batch and writes to DB + // with a write callback. std::function write_with_callback_func = [&]() { uint32_t i = thread_num.fetch_add(1); Random rnd(i); // leaders gotta lead - while (i > 0 && threads_waiting.load() < 1) { + while (i > 0 && threads_verified.load() < 1) { } // loser has to lose while (i == write_group.size() - 1 && - threads_waiting.load() < write_group.size() - 1) { + threads_verified.load() < write_group.size() - 1) { } auto& write_op = write_group.at(i); @@ -231,11 +253,7 @@ TEST_F(WriteCallbackTest, WriteWithCallbackTest) { // insert some keys for (uint32_t j = 0; j < rnd.Next() % 50; j++) { // grab unique key - char my_key = 0; - do { - my_key = dummy_key.load(); - } while ( - !dummy_key.compare_exchange_strong(my_key, my_key + 1)); + char my_key = dummy_key.fetch_add(1); string skey(5, my_key); string sval(10, my_key); diff --git a/db/write_thread.cc b/db/write_thread.cc index 022f4e646..afe2f2797 100644 --- a/db/write_thread.cc +++ b/db/write_thread.cc @@ -57,6 +57,10 @@ uint8_t WriteThread::AwaitState(Writer* w, uint8_t goal_mask, AdaptationContext* ctx) { uint8_t state; + // 1. Busy loop using "pause" for 1 micro sec + // 2. Else SOMETIMES busy loop using "yield" for 100 micro sec (default) + // 3. Else blocking wait + // On a modern Xeon each loop takes about 7 nanoseconds (most of which // is the effect of the pause instruction), so 200 iterations is a bit // more than a microsecond. This is long enough that waits longer than @@ -114,13 +118,21 @@ uint8_t WriteThread::AwaitState(Writer* w, uint8_t goal_mask, const size_t kMaxSlowYieldsWhileSpinning = 3; + // Whether the yield approach has any credit in this context. 
The credit is + // added by yield being succesfull before timing out, and decreased otherwise. + auto& yield_credit = ctx->value; + // Update the yield_credit based on sample runs or right after a hard failure bool update_ctx = false; + // Should we reinforce the yield credit bool would_spin_again = false; + // The samling base for updating the yeild credit. The sampling rate would be + // 1/sampling_base. + const int sampling_base = 256; if (max_yield_usec_ > 0) { - update_ctx = Random::GetTLSInstance()->OneIn(256); + update_ctx = Random::GetTLSInstance()->OneIn(sampling_base); - if (update_ctx || ctx->value.load(std::memory_order_relaxed) >= 0) { + if (update_ctx || yield_credit.load(std::memory_order_relaxed) >= 0) { // we're updating the adaptation statistics, or spinning has > // 50% chance of being shorter than max_yield_usec_ and causing no // involuntary context switches @@ -149,7 +161,7 @@ uint8_t WriteThread::AwaitState(Writer* w, uint8_t goal_mask, // accurate enough to measure the yield duration ++slow_yield_count; if (slow_yield_count >= kMaxSlowYieldsWhileSpinning) { - // Not just one ivcsw, but several. Immediately update ctx + // Not just one ivcsw, but several. Immediately update yield_credit // and fall back to blocking update_ctx = true; break; @@ -165,11 +177,19 @@ uint8_t WriteThread::AwaitState(Writer* w, uint8_t goal_mask, } if (update_ctx) { - auto v = ctx->value.load(std::memory_order_relaxed); + // Since our update is sample based, it is ok if a thread overwrites the + // updates by other threads. Thus the update does not have to be atomic. + auto v = yield_credit.load(std::memory_order_relaxed); // fixed point exponential decay with decay constant 1/1024, with +1 // and -1 scaled to avoid overflow for int32_t - v = v + (v / 1024) + (would_spin_again ? 1 : -1) * 16384; - ctx->value.store(v, std::memory_order_relaxed); + // + // On each update the positive credit is decayed by a facor of 1/1024 (i.e., + // 0.1%). 
If the sampled yield was successful, the credit is also increased + // by X. Setting X=2^17 ensures that the credit never exceeds + // 2^17*2^10=2^27, which is lower than 2^31 the upperbound of int32_t. Same + // logic applies to negative credits. + v = v - (v / 1024) + (would_spin_again ? 1 : -1) * 131072; + yield_credit.store(v, std::memory_order_relaxed); } assert((state & goal_mask) != 0); @@ -267,10 +287,11 @@ void WriteThread::CompleteFollower(Writer* w, WriteGroup& write_group) { SetState(w, STATE_COMPLETED); } +static WriteThread::AdaptationContext jbg_ctx("JoinBatchGroup"); void WriteThread::JoinBatchGroup(Writer* w) { - static AdaptationContext ctx("JoinBatchGroup"); - + TEST_SYNC_POINT_CALLBACK("WriteThread::JoinBatchGroup:Start", w); assert(w->batch != nullptr); + bool linked_as_leader = LinkOne(w, &newest_writer_); if (linked_as_leader) { SetState(w, STATE_GROUP_LEADER); @@ -294,7 +315,7 @@ void WriteThread::JoinBatchGroup(Writer* w) { */ AwaitState(w, STATE_GROUP_LEADER | STATE_MEMTABLE_WRITER_LEADER | STATE_PARALLEL_MEMTABLE_WRITER | STATE_COMPLETED, - &ctx); + &jbg_ctx); TEST_SYNC_POINT_CALLBACK("WriteThread::JoinBatchGroup:DoneWaiting", w); } } @@ -473,9 +494,9 @@ void WriteThread::LaunchParallelMemTableWriters(WriteGroup* write_group) { } } +static WriteThread::AdaptationContext cpmtw_ctx("CompleteParallelMemTableWriter"); // This method is called by both the leader and parallel followers bool WriteThread::CompleteParallelMemTableWriter(Writer* w) { - static AdaptationContext ctx("CompleteParallelMemTableWriter"); auto* write_group = w->write_group; if (!w->status.ok()) { @@ -485,7 +506,7 @@ bool WriteThread::CompleteParallelMemTableWriter(Writer* w) { if (write_group->running-- > 1) { // we're not the last one - AwaitState(w, STATE_COMPLETED, &ctx); + AwaitState(w, STATE_COMPLETED, &cpmtw_ctx); return false; } // else we're the last parallel worker and should perform exit duties. 
@@ -504,13 +525,18 @@ void WriteThread::ExitAsBatchGroupFollower(Writer* w) { SetState(write_group->leader, STATE_COMPLETED); } +static WriteThread::AdaptationContext eabgl_ctx("ExitAsBatchGroupLeader"); void WriteThread::ExitAsBatchGroupLeader(WriteGroup& write_group, Status status) { - static AdaptationContext ctx("ExitAsBatchGroupLeader"); Writer* leader = write_group.leader; Writer* last_writer = write_group.last_writer; assert(leader->link_older == nullptr); + // Propagate memtable write error to the whole group. + if (status.ok() && !write_group.status.ok()) { + status = write_group.status; + } + if (enable_pipelined_write_) { // Notify writers don't write to memtable to exit. for (Writer* w = last_writer; w != leader;) { @@ -544,7 +570,7 @@ void WriteThread::ExitAsBatchGroupLeader(WriteGroup& write_group, } AwaitState(leader, STATE_MEMTABLE_WRITER_LEADER | STATE_PARALLEL_MEMTABLE_WRITER | STATE_COMPLETED, - &ctx); + &eabgl_ctx); } else { Writer* head = newest_writer_.load(std::memory_order_acquire); if (head != last_writer || @@ -591,15 +617,15 @@ void WriteThread::ExitAsBatchGroupLeader(WriteGroup& write_group, } } +static WriteThread::AdaptationContext eu_ctx("EnterUnbatched"); void WriteThread::EnterUnbatched(Writer* w, InstrumentedMutex* mu) { - static AdaptationContext ctx("EnterUnbatched"); assert(w != nullptr && w->batch == nullptr); mu->Unlock(); bool linked_as_leader = LinkOne(w, &newest_writer_); if (!linked_as_leader) { TEST_SYNC_POINT("WriteThread::EnterUnbatched:Wait"); // Last leader will not pick us as a follower since our batch is nullptr - AwaitState(w, STATE_GROUP_LEADER, &ctx); + AwaitState(w, STATE_GROUP_LEADER, &eu_ctx); } if (enable_pipelined_write_) { WaitForMemTableWriters(); @@ -619,15 +645,15 @@ void WriteThread::ExitUnbatched(Writer* w) { } } +static WriteThread::AdaptationContext wfmw_ctx("WaitForMemTableWriters"); void WriteThread::WaitForMemTableWriters() { - static AdaptationContext ctx("WaitForMemTableWriters"); 
assert(enable_pipelined_write_); if (newest_memtable_writer_.load() == nullptr) { return; } Writer w; if (!LinkOne(&w, &newest_memtable_writer_)) { - AwaitState(&w, STATE_MEMTABLE_WRITER_LEADER, &ctx); + AwaitState(&w, STATE_MEMTABLE_WRITER_LEADER, &wfmw_ctx); } newest_memtable_writer_.store(nullptr); } diff --git a/db/write_thread.h b/db/write_thread.h index 51bb97f2a..57ce71e08 100644 --- a/db/write_thread.h +++ b/db/write_thread.h @@ -55,7 +55,7 @@ class WriteThread { // The state used to inform a waiting writer that it has become a // parallel memtable writer. It can be the group leader who launch the - // praallel writer group, or one of the followers. The writer should then + // parallel writer group, or one of the followers. The writer should then // apply its batch to the memtable concurrently and call // CompleteParallelMemTableWriter. STATE_PARALLEL_MEMTABLE_WRITER = 8, diff --git a/docs/_docs/faq.md b/docs/_docs/faq.md index 6253afeea..0887a0987 100644 --- a/docs/_docs/faq.md +++ b/docs/_docs/faq.md @@ -13,7 +13,7 @@ RocksDB is an embeddable persistent key-value store for fast storage. RocksDB ca RocksDB builds on [LevelDB](https://code.google.com/p/leveldb/) to be scalable to run on servers with many CPU cores, to efficiently use fast storage, to support IO-bound, in-memory and write-once workloads, and to be flexible to allow for innovation. -For the latest details, watch [Mark Callaghan’s and Igor Canadi’s talk at CMU on 10/2015](https://scs.hosted.panopto.com/Panopto/Pages/Viewer.aspx?id=f4e0eb37-ae18-468f-9248-cb73edad3e56). [Dhruba Borthakur’s introductory talk](https://github.com/facebook/rocksdb/blob/gh-pages/intro.pdf?raw=true) from the Data @ Scale 2013 conference provides some perspective about how RocksDB has evolved. +For the latest details, watch [Mark Callaghan’s and Igor Canadi’s talk at CMU on 10/2015](https://scs.hosted.panopto.com/Panopto/Pages/Viewer.aspx?id=f4e0eb37-ae18-468f-9248-cb73edad3e56). 
[Dhruba Borthakur’s introductory talk](https://github.com/facebook/rocksdb/blob/gh-pages-old/intro.pdf?raw=true) from the Data @ Scale 2013 conference provides some perspective about how RocksDB has evolved. ## How does performance compare? diff --git a/docs/_docs/getting-started.md b/docs/_docs/getting-started.md index 0d5360932..8b01dfefd 100644 --- a/docs/_docs/getting-started.md +++ b/docs/_docs/getting-started.md @@ -11,7 +11,7 @@ The RocksDB library provides a persistent key value store. Keys and values are a The library is maintained by the Facebook Database Engineering Team, and is based on [LevelDB](https://github.com/google/leveldb), by Sanjay Ghemawat and Jeff Dean at Google. -This overview gives some simple examples of how RocksDB is used. For the story of why RocksDB was created in the first place, see [Dhruba Borthakur’s introductory talk](https://github.com/facebook/rocksdb/blob/gh-pages/intro.pdf?raw=true) from the Data @ Scale 2013 conference. +This overview gives some simple examples of how RocksDB is used. For the story of why RocksDB was created in the first place, see [Dhruba Borthakur’s introductory talk](https://github.com/facebook/rocksdb/blob/gh-pages-old/intro.pdf?raw=true) from the Data @ Scale 2013 conference. ## Opening A Database diff --git a/docs/_posts/2017-05-12-partitioned-index-filter.markdown b/docs/_posts/2017-05-12-partitioned-index-filter.markdown index fb4f62cd8..a537feb0c 100644 --- a/docs/_posts/2017-05-12-partitioned-index-filter.markdown +++ b/docs/_posts/2017-05-12-partitioned-index-filter.markdown @@ -31,4 +31,4 @@ In this example we have a DB of size 86G on HDD and emulate the small memory tha In this example we have a DB of size 300G on SSD and emulate the small memory that would be available in presence of other DBs on the same node by by using direct IO (skipping OS file cache) and block cache of size 6G and 2G. 
Without partitioning the linkbench throughput drops from 38k tps to 23k when reducing block cache size from 6G to 2G. With partitioning the throughput drops from 38k to only 30k. -Learn more (here)[https://github.com/facebook/rocksdb/wiki/Partitioned-Index-Filters]. +Learn more [here](https://github.com/facebook/rocksdb/wiki/Partitioned-Index-Filters). diff --git a/docs/_posts/2017-07-25-rocksdb-5-6-1-released.markdown b/docs/_posts/2017-07-25-rocksdb-5-6-1-released.markdown new file mode 100644 index 000000000..3b54ffd5a --- /dev/null +++ b/docs/_posts/2017-07-25-rocksdb-5-6-1-released.markdown @@ -0,0 +1,22 @@ +--- +title: RocksDB 5.6.1 Released! +layout: post +author: yiwu +category: blog +--- + +### Public API Change +* Scheduling flushes and compactions in the same thread pool is no longer supported by setting `max_background_flushes=0`. Instead, users can achieve this by configuring their high-pri thread pool to have zero threads. See https://github.com/facebook/rocksdb/wiki/Thread-Pool for more details. +* Replace `Options::max_background_flushes`, `Options::max_background_compactions`, and `Options::base_background_compactions` all with `Options::max_background_jobs`, which automatically decides how many threads to allocate towards flush/compaction. +* options.delayed_write_rate by default take the value of options.rate_limiter rate. +* Replace global variable `IOStatsContext iostats_context` with `IOStatsContext* get_iostats_context()`; replace global variable `PerfContext perf_context` with `PerfContext* get_perf_context()`. + +### New Features +* Change ticker/histogram statistics implementations to use core-local storage. This improves aggregation speed compared to our previous thread-local approach, particularly for applications with many threads. See http://rocksdb.org/blog/2017/05/14/core-local-stats.html for more details. 
+* Users can pass a cache object to write buffer manager, so that they can cap memory usage for memtable and block cache using one single limit. +* Flush will be triggered when 7/8 of the limit introduced by write_buffer_manager or db_write_buffer_size is triggered, so that the hard threshold is hard to hit. See https://github.com/facebook/rocksdb/wiki/Write-Buffer-Manager for more details. +* Introduce WriteOptions.low_pri. If it is true, low priority writes will be throttled if the compaction is behind. See https://github.com/facebook/rocksdb/wiki/Low-Priority-Write for more details. +* `DB::IngestExternalFile()` now supports ingesting files into a database containing range deletions. + +### Bug Fixes +* Shouldn't ignore return value of fsync() in flush. diff --git a/docs/_posts/2017-08-24-pinnableslice.markdown b/docs/_posts/2017-08-24-pinnableslice.markdown new file mode 100644 index 000000000..7ac2fec34 --- /dev/null +++ b/docs/_posts/2017-08-24-pinnableslice.markdown @@ -0,0 +1,37 @@ +--- +title: PinnableSlice; less memcpy with point lookups +layout: post +author: maysamyabandeh +category: blog +--- + +The classic API for [DB::Get](https://github.com/facebook/rocksdb/blob/9e583711144f580390ce21a49a8ceacca338fcd5/include/rocksdb/db.h#L310) receives a std::string as argument to which it will copy the value. The memcpy overhead could be non-trivial when the value is large. The [new API](https://github.com/facebook/rocksdb/blob/9e583711144f580390ce21a49a8ceacca338fcd5/include/rocksdb/db.h#L322) receives a PinnableSlice instead, which avoids memcpy in most of the cases. + +### What is PinnableSlice? + +Similarly to Slice, PinnableSlice refers to some in-memory data so it does not incur the memcpy cost. To ensure that the data will not be erased while it is being processed by the user, PinnableSlice, as its name suggests, has the data pinned in memory. The pinned data are released when PinnableSlice object is destructed or when ::Reset is invoked explicitly on it. 
+ +### How good is it? + +Here are the improvements in throughput for an [in-memory benchmark](https://github.com/facebook/rocksdb/pull/1756#issuecomment-286201693): +* value 1k byte: 14% +* value 10k byte: 34% + +### Any limitations? + +PinnableSlice tries to avoid memcpy as much as possible. The primary gain is when reading large values from the block cache. There are however cases that it would still have to copy the data into its internal buffer. The reason is mainly the complexity of implementation and if there is enough motivation on the application side. the scope of PinnableSlice could be extended to such cases too. These include: +* Merged values +* Reads from memtables + +### How to use it? + +```cpp +PinnableSlice pinnable_val; +while (!stopped) { + auto s = db->Get(opt, cf, key, &pinnable_val); + // ... use it + pinnable_val.Reset(); // then release it immediately +} +``` + +You can also [initialize the internal buffer](https://github.com/facebook/rocksdb/blob/9e583711144f580390ce21a49a8ceacca338fcd5/include/rocksdb/db.h#L314) of PinnableSlice by passing your own string in the constructor. [simple_example.cc](https://github.com/facebook/rocksdb/blob/master/examples/simple_example.cc) demonstrates that with more examples. diff --git a/docs/_posts/2017-08-25-flushwal.markdown b/docs/_posts/2017-08-25-flushwal.markdown new file mode 100644 index 000000000..2dc5626ad --- /dev/null +++ b/docs/_posts/2017-08-25-flushwal.markdown @@ -0,0 +1,26 @@ +--- +title: FlushWAL; less fwrite, faster writes +layout: post +author: maysamyabandeh +category: blog +--- + +When `DB::Put` is called, the data is written to both memtable (to be flushed to SST files later) and the WAL (write-ahead log) if it is enabled. In the case of a crash, RocksDB can recover as much as the memtable state that is reflected into the WAL. By default RocksDB automatically flushes the WAL from the application memory to the OS buffer after each `::Put`. 
It however can be configured to perform the flush manually after an explicit call to `::FlushWAL`. Not doing fwrite syscall after each `::Put` offers a tradeoff between reliability and write latency for the general case. As we explain below, some applications such as MyRocks benefit from this API to gain higher write throughput with however no compromise in reliability. + +### How much is the gain? + +Using `::FlushWAL` API along with setting `DBOptions.concurrent_prepare`, MyRocks achieves 40% higher throughput in Sysbench's [update-nonindex](https://github.com/akopytov/sysbench/blob/master/src/lua/oltp_update_non_index.lua) benchmark. + +### Write, Flush, and Sync + +The write to the WAL is first written to the application memory buffer. The buffer in the next step is "flushed" to OS buffer by calling fwrite syscall. The OS buffer is later "synced" to the persistent storage. The data in the OS buffer, although not persisted yet, will survive the application crash. By default, the flush occurs automatically upon each call to `DB::Put` or `DB::Write`. The user can additionally request sync after each write by setting `WriteOptions::sync`. + +### FlushWAL API + +The user can turn off the automatic flush of the WAL by setting `DBOptions::manual_wal_flush`. In that case, the WAL buffer is flushed when it is either full or `DB::FlushWAL` is called by the user. The API also accepts a boolean argument should we want to sync right after the flush: `::FlushWAL(true)`. + +### Success story: MyRocks + +Some applications that use RocksDB, already have other machinsims in place to provide reliability. MySQL for example uses 2PC (two-phase commit) to write to both binlog as well as the storage engine such as InnoDB and MyRocks. The group commit logic in MySQL allows the 1st phase (Prepare) to be run in parallel but after a commit group is formed performs the 2nd phase (Commit) in a serial manner. 
This makes low commit latency in the storage engine essential for acheiving high throughput. The commit in MyRocks includes writing to the RocksDB WAL, which as explaiend above, by default incures the latency of flushing the WAL new appends to the OS buffer. + +Since binlog helps in recovering from some failure scenarios, MySQL can provide reliability without however needing a storage WAL flush after each individual commit. MyRocks benefits from this property, disables automatic WAL flush in RocksDB, and manually calls `::FlushWAL` when requested by MySQL. diff --git a/env/env_encryption.cc b/env/env_encryption.cc index 9c9fcd3fc..6b688a660 100644 --- a/env/env_encryption.cc +++ b/env/env_encryption.cc @@ -1,9 +1,7 @@ // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. -// This source code is also licensed under the GPLv2 license found in the -// COPYING file in the root directory of this source tree. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). #ifndef ROCKSDB_LITE diff --git a/env/env_posix.cc b/env/env_posix.cc index 7f2bc3b85..5a671d72f 100644 --- a/env/env_posix.cc +++ b/env/env_posix.cc @@ -761,23 +761,23 @@ class PosixEnv : public Env { // Allow increasing the number of worker threads. 
virtual void SetBackgroundThreads(int num, Priority pri) override { - assert(pri >= Priority::LOW && pri <= Priority::HIGH); + assert(pri >= Priority::BOTTOM && pri <= Priority::HIGH); thread_pools_[pri].SetBackgroundThreads(num); } virtual int GetBackgroundThreads(Priority pri) override { - assert(pri >= Priority::LOW && pri <= Priority::HIGH); + assert(pri >= Priority::BOTTOM && pri <= Priority::HIGH); return thread_pools_[pri].GetBackgroundThreads(); } // Allow increasing the number of worker threads. virtual void IncBackgroundThreadsIfNeeded(int num, Priority pri) override { - assert(pri >= Priority::LOW && pri <= Priority::HIGH); + assert(pri >= Priority::BOTTOM && pri <= Priority::HIGH); thread_pools_[pri].IncBackgroundThreadsIfNeeded(num); } virtual void LowerThreadPoolIOPriority(Priority pool = LOW) override { - assert(pool >= Priority::LOW && pool <= Priority::HIGH); + assert(pool >= Priority::BOTTOM && pool <= Priority::HIGH); #ifdef OS_LINUX thread_pools_[pool].LowerIOPriority(); #endif @@ -883,7 +883,7 @@ PosixEnv::PosixEnv() void PosixEnv::Schedule(void (*function)(void* arg1), void* arg, Priority pri, void* tag, void (*unschedFunction)(void* arg)) { - assert(pri >= Priority::LOW && pri <= Priority::HIGH); + assert(pri >= Priority::BOTTOM && pri <= Priority::HIGH); thread_pools_[pri].Schedule(function, arg, tag, unschedFunction); } @@ -892,7 +892,7 @@ int PosixEnv::UnSchedule(void* arg, Priority pri) { } unsigned int PosixEnv::GetThreadPoolQueueLen(Priority pri) const { - assert(pri >= Priority::LOW && pri <= Priority::HIGH); + assert(pri >= Priority::BOTTOM && pri <= Priority::HIGH); return thread_pools_[pri].GetQueueLen(); } diff --git a/env/env_test.cc b/env/env_test.cc index 7fd71a3c4..9ec2f142e 100644 --- a/env/env_test.cc +++ b/env/env_test.cc @@ -125,12 +125,14 @@ static void SetBool(void* ptr) { reinterpret_cast*>(ptr)->store(true); } -TEST_P(EnvPosixTestWithParam, RunImmediately) { - std::atomic called(false); - env_->Schedule(&SetBool, 
&called); - Env::Default()->SleepForMicroseconds(kDelayMicros); - ASSERT_TRUE(called.load()); - WaitThreadPoolsEmpty(); +TEST_F(EnvPosixTest, RunImmediately) { + for (int pri = Env::BOTTOM; pri < Env::TOTAL; ++pri) { + std::atomic called(false); + env_->SetBackgroundThreads(1, static_cast(pri)); + env_->Schedule(&SetBool, &called, static_cast(pri)); + Env::Default()->SleepForMicroseconds(kDelayMicros); + ASSERT_TRUE(called.load()); + } } TEST_P(EnvPosixTestWithParam, UnSchedule) { diff --git a/env/mock_env.cc b/env/mock_env.cc index 79a4f8c44..669011c4e 100644 --- a/env/mock_env.cc +++ b/env/mock_env.cc @@ -11,6 +11,7 @@ #include #include #include "port/sys_time.h" +#include "util/cast_util.h" #include "util/murmurhash.h" #include "util/random.h" #include "util/rate_limiter.h" @@ -711,7 +712,8 @@ Status MockEnv::LockFile(const std::string& fname, FileLock** flock) { } Status MockEnv::UnlockFile(FileLock* flock) { - std::string fn = dynamic_cast(flock)->FileName(); + std::string fn = + static_cast_with_check(flock)->FileName(); { MutexLock lock(&mutex_); if (file_map_.find(fn) != file_map_.end()) { diff --git a/examples/Makefile b/examples/Makefile index a3a786e83..57cd1a75a 100644 --- a/examples/Makefile +++ b/examples/Makefile @@ -8,6 +8,10 @@ ifndef DISABLE_JEMALLOC PLATFORM_CXXFLAGS += $(JEMALLOC_INCLUDE) endif +ifneq ($(USE_RTTI), 1) + CXXFLAGS += -fno-rtti +endif + .PHONY: clean librocksdb all: simple_example column_families_example compact_files_example c_simple_example optimistic_transaction_example transaction_example compaction_filter_example options_file_example diff --git a/examples/compaction_filter_example.cc b/examples/compaction_filter_example.cc index 7a78244a0..226dfe790 100644 --- a/examples/compaction_filter_example.cc +++ b/examples/compaction_filter_example.cc @@ -59,7 +59,11 @@ int main() { MyFilter filter; - system("rm -rf /tmp/rocksmergetest"); + int ret = system("rm -rf /tmp/rocksmergetest"); + if (ret != 0) { + fprintf(stderr, "Error 
deleting /tmp/rocksmergetest, code: %d\n", ret); + return ret; + } rocksdb::Options options; options.create_if_missing = true; options.merge_operator.reset(new MyMerge); diff --git a/examples/simple_example.cc b/examples/simple_example.cc index 52fffff5b..a8f80f091 100644 --- a/examples/simple_example.cc +++ b/examples/simple_example.cc @@ -50,6 +50,33 @@ int main() { db->Get(ReadOptions(), "key2", &value); assert(value == "value"); + { + PinnableSlice pinnable_val; + db->Get(ReadOptions(), db->DefaultColumnFamily(), "key2", &pinnable_val); + assert(pinnable_val == "value"); + } + + { + std::string string_val; + // If it cannot pin the value, it copies the value to its internal buffer. + // The intenral buffer could be set during construction. + PinnableSlice pinnable_val(&string_val); + db->Get(ReadOptions(), db->DefaultColumnFamily(), "key2", &pinnable_val); + assert(pinnable_val == "value"); + // If the value is not pinned, the internal buffer must have the value. + assert(pinnable_val.IsPinned() || string_val == "value"); + } + + PinnableSlice pinnable_val; + db->Get(ReadOptions(), db->DefaultColumnFamily(), "key1", &pinnable_val); + assert(s.IsNotFound()); + // Reset PinnableSlice after each use and before each reuse + pinnable_val.Reset(); + db->Get(ReadOptions(), db->DefaultColumnFamily(), "key2", &pinnable_val); + assert(pinnable_val == "value"); + pinnable_val.Reset(); + // The Slice pointed by pinnable_val is not valid after this point + delete db; return 0; diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h index 34364deac..2269f7261 100644 --- a/include/rocksdb/c.h +++ b/include/rocksdb/c.h @@ -82,6 +82,7 @@ typedef struct rocksdb_compactionfiltercontext_t typedef struct rocksdb_compactionfilterfactory_t rocksdb_compactionfilterfactory_t; typedef struct rocksdb_comparator_t rocksdb_comparator_t; +typedef struct rocksdb_dbpath_t rocksdb_dbpath_t; typedef struct rocksdb_env_t rocksdb_env_t; typedef struct rocksdb_fifo_compaction_options_t 
rocksdb_fifo_compaction_options_t; typedef struct rocksdb_filelock_t rocksdb_filelock_t; @@ -116,6 +117,8 @@ typedef struct rocksdb_pinnableslice_t rocksdb_pinnableslice_t; typedef struct rocksdb_transactiondb_options_t rocksdb_transactiondb_options_t; typedef struct rocksdb_transactiondb_t rocksdb_transactiondb_t; typedef struct rocksdb_transaction_options_t rocksdb_transaction_options_t; +typedef struct rocksdb_optimistictransactiondb_t rocksdb_optimistictransactiondb_t; +typedef struct rocksdb_optimistictransaction_options_t rocksdb_optimistictransaction_options_t; typedef struct rocksdb_transaction_t rocksdb_transaction_t; typedef struct rocksdb_checkpoint_t rocksdb_checkpoint_t; @@ -713,6 +716,9 @@ extern ROCKSDB_LIBRARY_API void rocksdb_options_set_error_if_exists( rocksdb_options_t*, unsigned char); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_paranoid_checks( rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_db_paths(rocksdb_options_t*, + const rocksdb_dbpath_t** path_values, + size_t num_paths); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_env(rocksdb_options_t*, rocksdb_env_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_info_log(rocksdb_options_t*, @@ -759,6 +765,8 @@ rocksdb_options_set_max_bytes_for_level_multiplier_additional( rocksdb_options_t*, int* level_values, size_t num_levels); extern ROCKSDB_LIBRARY_API void rocksdb_options_enable_statistics( rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_skip_stats_update_on_db_open( + rocksdb_options_t* opt, unsigned char val); /* returns a pointer to a malloc()-ed, null terminated string */ extern ROCKSDB_LIBRARY_API char* rocksdb_options_statistics_get_string( @@ -1089,6 +1097,11 @@ rocksdb_cache_get_usage(rocksdb_cache_t* cache); extern ROCKSDB_LIBRARY_API size_t rocksdb_cache_get_pinned_usage(rocksdb_cache_t* cache); +/* DBPath */ + +extern ROCKSDB_LIBRARY_API rocksdb_dbpath_t* rocksdb_dbpath_create(const char* path, 
uint64_t target_size); +extern ROCKSDB_LIBRARY_API void rocksdb_dbpath_destroy(rocksdb_dbpath_t*); + /* Env */ extern ROCKSDB_LIBRARY_API rocksdb_env_t* rocksdb_create_default_env(); @@ -1247,6 +1260,12 @@ extern ROCKSDB_LIBRARY_API void rocksdb_delete_file_in_range_cf( /* Transactions */ +extern ROCKSDB_LIBRARY_API rocksdb_column_family_handle_t* +rocksdb_transactiondb_create_column_family( + rocksdb_transactiondb_t* txn_db, + const rocksdb_options_t* column_family_options, + const char* column_family_name, char** errptr); + extern ROCKSDB_LIBRARY_API rocksdb_transactiondb_t* rocksdb_transactiondb_open( const rocksdb_options_t* options, const rocksdb_transactiondb_options_t* txn_db_options, const char* name, @@ -1273,33 +1292,86 @@ extern ROCKSDB_LIBRARY_API void rocksdb_transaction_rollback( extern ROCKSDB_LIBRARY_API void rocksdb_transaction_destroy( rocksdb_transaction_t* txn); +// This snapshot should be freed using rocksdb_free +extern ROCKSDB_LIBRARY_API const rocksdb_snapshot_t* +rocksdb_transaction_get_snapshot(rocksdb_transaction_t* txn); + extern ROCKSDB_LIBRARY_API char* rocksdb_transaction_get( rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options, const char* key, size_t klen, size_t* vlen, char** errptr); +extern ROCKSDB_LIBRARY_API char* rocksdb_transaction_get_cf( + rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, size_t klen, + size_t* vlen, char** errptr); + +extern ROCKSDB_LIBRARY_API char* rocksdb_transaction_get_for_update( + rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options, + const char* key, size_t klen, size_t* vlen, unsigned char exclusive, + char** errptr); + extern ROCKSDB_LIBRARY_API char* rocksdb_transactiondb_get( rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options, const char* key, size_t klen, size_t* vlen, char** errptr); +extern ROCKSDB_LIBRARY_API char* rocksdb_transactiondb_get_cf( + 
rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, + size_t keylen, size_t* vallen, char** errptr); + extern ROCKSDB_LIBRARY_API void rocksdb_transaction_put( rocksdb_transaction_t* txn, const char* key, size_t klen, const char* val, size_t vlen, char** errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_put_cf( + rocksdb_transaction_t* txn, rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen, const char* val, size_t vlen, char** errptr); + extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_put( rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* options, const char* key, size_t klen, const char* val, size_t vlen, char** errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_put_cf( + rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, + size_t keylen, const char* val, size_t vallen, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_write( + rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* options, + rocksdb_writebatch_t *batch, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_merge( + rocksdb_transaction_t* txn, const char* key, size_t klen, const char* val, + size_t vlen, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_merge( + rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* options, + const char* key, size_t klen, const char* val, size_t vlen, char** errptr); + extern ROCKSDB_LIBRARY_API void rocksdb_transaction_delete( rocksdb_transaction_t* txn, const char* key, size_t klen, char** errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_delete_cf( + rocksdb_transaction_t* txn, rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen, char** errptr); + extern ROCKSDB_LIBRARY_API void 
rocksdb_transactiondb_delete( rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* options, const char* key, size_t klen, char** errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_delete_cf( + rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, + size_t keylen, char** errptr); + extern ROCKSDB_LIBRARY_API rocksdb_iterator_t* rocksdb_transaction_create_iterator(rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options); +extern ROCKSDB_LIBRARY_API rocksdb_iterator_t* +rocksdb_transactiondb_create_iterator(rocksdb_transactiondb_t* txn_db, + const rocksdb_readoptions_t* options); + extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_close( rocksdb_transactiondb_t* txn_db); @@ -1307,6 +1379,20 @@ extern ROCKSDB_LIBRARY_API rocksdb_checkpoint_t* rocksdb_transactiondb_checkpoint_object_create(rocksdb_transactiondb_t* txn_db, char** errptr); +extern ROCKSDB_LIBRARY_API rocksdb_optimistictransactiondb_t* +rocksdb_optimistictransactiondb_open(const rocksdb_options_t* options, + const char* name, char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_transaction_t* +rocksdb_optimistictransaction_begin( + rocksdb_optimistictransactiondb_t* otxn_db, + const rocksdb_writeoptions_t* write_options, + const rocksdb_optimistictransaction_options_t* otxn_options, + rocksdb_transaction_t* old_txn); + +extern ROCKSDB_LIBRARY_API void rocksdb_optimistictransactiondb_close( + rocksdb_optimistictransactiondb_t* otxn_db); + /* Transaction Options */ extern ROCKSDB_LIBRARY_API rocksdb_transactiondb_options_t* @@ -1355,6 +1441,17 @@ extern ROCKSDB_LIBRARY_API void rocksdb_transaction_options_set_max_write_batch_size( rocksdb_transaction_options_t* opt, size_t size); + +extern ROCKSDB_LIBRARY_API rocksdb_optimistictransaction_options_t* +rocksdb_optimistictransaction_options_create(); + +extern ROCKSDB_LIBRARY_API void rocksdb_optimistictransaction_options_destroy( + 
rocksdb_optimistictransaction_options_t* opt); + +extern ROCKSDB_LIBRARY_API void +rocksdb_optimistictransaction_options_set_set_snapshot( + rocksdb_optimistictransaction_options_t* opt, unsigned char v); + // referring to convention (3), this should be used by client // to free memory that was malloc()ed extern ROCKSDB_LIBRARY_API void rocksdb_free(void* ptr); diff --git a/include/rocksdb/cleanable.h b/include/rocksdb/cleanable.h index ecc172b44..cd2e9425f 100644 --- a/include/rocksdb/cleanable.h +++ b/include/rocksdb/cleanable.h @@ -25,6 +25,15 @@ class Cleanable { public: Cleanable(); ~Cleanable(); + + // No copy constructor and copy assignment allowed. + Cleanable(Cleanable&) = delete; + Cleanable& operator=(Cleanable&) = delete; + + // Move consturctor and move assignment is allowed. + Cleanable(Cleanable&&); + Cleanable& operator=(Cleanable&&); + // Clients are allowed to register function/arg1/arg2 triples that // will be invoked when this iterator is destroyed. // @@ -33,7 +42,7 @@ class Cleanable { typedef void (*CleanupFunction)(void* arg1, void* arg2); void RegisterCleanup(CleanupFunction function, void* arg1, void* arg2); void DelegateCleanupsTo(Cleanable* other); - // DoCkeanup and also resets the pointers for reuse + // DoCleanup and also resets the pointers for reuse inline void Reset() { DoCleanup(); cleanup_.function = nullptr; diff --git a/include/rocksdb/compaction_filter.h b/include/rocksdb/compaction_filter.h index 9a8c0318c..64f61a35e 100644 --- a/include/rocksdb/compaction_filter.h +++ b/include/rocksdb/compaction_filter.h @@ -36,6 +36,7 @@ class CompactionFilter { enum ValueType { kValue, kMergeOperand, + kBlobIndex, // used internally by BlobDB. }; enum class Decision { @@ -171,6 +172,8 @@ class CompactionFilter { bool rv = FilterMergeOperand(level, key, existing_value); return rv ? 
Decision::kRemove : Decision::kKeep; } + case ValueType::kBlobIndex: + return Decision::kKeep; } assert(false); return Decision::kKeep; diff --git a/include/rocksdb/comparator.h b/include/rocksdb/comparator.h index ac6e4a9b0..64db73a72 100644 --- a/include/rocksdb/comparator.h +++ b/include/rocksdb/comparator.h @@ -64,6 +64,10 @@ class Comparator { // Simple comparator implementations may return with *key unchanged, // i.e., an implementation of this method that does nothing is correct. virtual void FindShortSuccessor(std::string* key) const = 0; + + // if it is a wrapped comparator, may return the root one. + // return itself it is not wrapped. + virtual const Comparator* GetRootComparator() const { return this; } }; // Return a builtin comparator that uses lexicographic byte-wise diff --git a/include/rocksdb/convenience.h b/include/rocksdb/convenience.h index cb0c6f56b..b09ac4816 100644 --- a/include/rocksdb/convenience.h +++ b/include/rocksdb/convenience.h @@ -325,10 +325,16 @@ void CancelAllBackgroundWork(DB* db, bool wait = false); // Delete files which are entirely in the given range // Could leave some keys in the range which are in files which are not -// entirely in the range. +// entirely in the range. Also leaves L0 files regardless of whether they're +// in the range. // Snapshots before the delete might not see the data in the given range. Status DeleteFilesInRange(DB* db, ColumnFamilyHandle* column_family, const Slice* begin, const Slice* end); + +// Verify the checksum of file +Status VerifySstFileChecksum(const Options& options, + const EnvOptions& env_options, + const std::string& file_path); #endif // ROCKSDB_LITE } // namespace rocksdb diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h index ee5706b4c..964f7b1db 100644 --- a/include/rocksdb/db.h +++ b/include/rocksdb/db.h @@ -582,6 +582,12 @@ class DB { // "rocksdb.is-write-stopped" - Return 1 if write has been stopped. 
static const std::string kIsWriteStopped; + + // "rocksdb.estimate-oldest-key-time" - returns an estimation of + // oldest key timestamp in the DB. Currently only available for + // FIFO compaction with + // compaction_options_fifo.allow_compaction = false. + static const std::string kEstimateOldestKeyTime; }; #endif /* ROCKSDB_LITE */ @@ -632,6 +638,7 @@ class DB { // "rocksdb.num-running-flushes" // "rocksdb.actual-delayed-write-rate" // "rocksdb.is-write-stopped" + // "rocksdb.estimate-oldest-key-time" virtual bool GetIntProperty(ColumnFamilyHandle* column_family, const Slice& property, uint64_t* value) = 0; virtual bool GetIntProperty(const Slice& property, uint64_t* value) { @@ -976,6 +983,8 @@ class DB { return IngestExternalFile(DefaultColumnFamily(), external_files, options); } + virtual Status VerifyChecksum() = 0; + // AddFile() is deprecated, please use IngestExternalFile() ROCKSDB_DEPRECATED_FUNC virtual Status AddFile( ColumnFamilyHandle* column_family, @@ -1097,6 +1106,17 @@ class DB { virtual Status GetPropertiesOfTablesInRange( ColumnFamilyHandle* column_family, const Range* range, std::size_t n, TablePropertiesCollection* props) = 0; + + virtual Status SuggestCompactRange(ColumnFamilyHandle* column_family, + const Slice* begin, const Slice* end) { + return Status::NotSupported("SuggestCompactRange() is not implemented."); + } + + virtual Status PromoteL0(ColumnFamilyHandle* column_family, + int target_level) { + return Status::NotSupported("PromoteL0() is not implemented."); + } + #endif // ROCKSDB_LITE // Needed for StackableDB diff --git a/include/rocksdb/env.h b/include/rocksdb/env.h index 869073899..709d50366 100644 --- a/include/rocksdb/env.h +++ b/include/rocksdb/env.h @@ -283,7 +283,7 @@ class Env { virtual Status UnlockFile(FileLock* lock) = 0; // Priority for scheduling job in thread pool - enum Priority { LOW, HIGH, TOTAL }; + enum Priority { BOTTOM, LOW, HIGH, TOTAL }; // Priority for requesting bytes in rate limiter scheduler enum 
IOPriority { @@ -793,7 +793,7 @@ enum InfoLogLevel : unsigned char { // An interface for writing log messages. class Logger { public: - size_t kDoNotSupportGetLogFileSize = std::numeric_limits::max(); + size_t kDoNotSupportGetLogFileSize = (std::numeric_limits::max)(); explicit Logger(const InfoLogLevel log_level = InfoLogLevel::INFO_LEVEL) : log_level_(log_level) {} diff --git a/include/rocksdb/env_encryption.h b/include/rocksdb/env_encryption.h index 764fffba7..e4c924a4b 100644 --- a/include/rocksdb/env_encryption.h +++ b/include/rocksdb/env_encryption.h @@ -1,9 +1,7 @@ // Copyright (c) 2016-present, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. -// This source code is also licensed under the GPLv2 license found in the -// COPYING file in the root directory of this source tree. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). #pragma once diff --git a/include/rocksdb/iterator.h b/include/rocksdb/iterator.h index 4e09f64e9..d4ac52818 100644 --- a/include/rocksdb/iterator.h +++ b/include/rocksdb/iterator.h @@ -80,6 +80,13 @@ class Iterator : public Cleanable { // satisfied without doing some IO, then this returns Status::Incomplete(). virtual Status status() const = 0; + // If supported, renew the iterator to represent the latest state. The + // iterator will be invalidated after the call. Not supported if + // ReadOptions.snapshot is given when creating the iterator. 
+ virtual Status Refresh() { + return Status::NotSupported("Refresh() is not supported"); + } + // Property "rocksdb.iterator.is-key-pinned": // If returning "1", this means that the Slice returned by key() is valid // as long as the iterator is not deleted. diff --git a/include/rocksdb/listener.h b/include/rocksdb/listener.h index 40d318e09..e132033db 100644 --- a/include/rocksdb/listener.h +++ b/include/rocksdb/listener.h @@ -206,6 +206,7 @@ class CompactionEventListener { kDelete, kSingleDelete, kRangeDelete, + kBlobIndex, kInvalid, }; diff --git a/include/rocksdb/merge_operator.h b/include/rocksdb/merge_operator.h index 5fe3e0bfd..f29471005 100644 --- a/include/rocksdb/merge_operator.h +++ b/include/rocksdb/merge_operator.h @@ -183,6 +183,13 @@ class MergeOperator { // no checking is enforced. Client is responsible for providing // consistent MergeOperator between DB opens. virtual const char* Name() const = 0; + + // Determines whether the MergeOperator can be called with just a single + // merge operand. + // Override and return true for allowing a single operand. FullMergeV2 and + // PartialMerge/PartialMergeMulti should be implemented accordingly to handle + // a single operand. + virtual bool AllowSingleOperand() const { return false; } }; // The simpler, associative merge operator. diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 57fba6ad3..dbd7e64a8 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -1063,6 +1063,14 @@ struct ReadOptions { // Default: false bool ignore_range_deletions; + // A callback to determine whether relevant keys for this scan exist in a + // given table based on the table's properties. The callback is passed the + // properties of each table during iteration. If the callback returns false, + // the table will not be scanned. This option only affects Iterators and has + // no impact on point lookups. 
+ // Default: empty (every table will be scanned) + std::function table_filter; + ReadOptions(); ReadOptions(bool cksum, bool cache); }; diff --git a/include/rocksdb/perf_context.h b/include/rocksdb/perf_context.h index ff1a0cacc..1095d063b 100644 --- a/include/rocksdb/perf_context.h +++ b/include/rocksdb/perf_context.h @@ -30,6 +30,11 @@ struct PerfContext { uint64_t block_read_time; // total nanos spent on block reads uint64_t block_checksum_time; // total nanos spent on block checksum uint64_t block_decompress_time; // total nanos spent on block decompression + + uint64_t get_read_bytes; // bytes for vals returned by Get + uint64_t multiget_read_bytes; // bytes for vals returned by MultiGet + uint64_t iter_read_bytes; // bytes for keys/vals decoded by iterator + // total number of internal keys skipped over during iteration. // There are several reasons for it: // 1. when calling Next(), the iterator is in the position of the previous diff --git a/include/rocksdb/slice.h b/include/rocksdb/slice.h index fe8dee00f..4f24c8a22 100644 --- a/include/rocksdb/slice.h +++ b/include/rocksdb/slice.h @@ -129,6 +129,10 @@ class PinnableSlice : public Slice, public Cleanable { PinnableSlice() { buf_ = &self_space_; } explicit PinnableSlice(std::string* buf) { buf_ = buf; } + // No copy constructor and copy assignment allowed. + PinnableSlice(PinnableSlice&) = delete; + PinnableSlice& operator=(PinnableSlice&) = delete; + inline void PinSlice(const Slice& s, CleanupFunction f, void* arg1, void* arg2) { assert(!pinned_); @@ -214,6 +218,7 @@ inline bool operator!=(const Slice& x, const Slice& y) { } inline int Slice::compare(const Slice& b) const { + assert(data_ != nullptr && b.data_ != nullptr); const size_t min_len = (size_ < b.size_) ? 
size_ : b.size_; int r = memcmp(data_, b.data_, min_len); if (r == 0) { diff --git a/include/rocksdb/statistics.h b/include/rocksdb/statistics.h index b4629358e..731ff7809 100644 --- a/include/rocksdb/statistics.h +++ b/include/rocksdb/statistics.h @@ -105,8 +105,9 @@ enum Tickers : uint32_t { COMPACTION_KEY_DROP_OBSOLETE, // The key is obsolete. COMPACTION_KEY_DROP_RANGE_DEL, // key was covered by a range tombstone. COMPACTION_KEY_DROP_USER, // user compaction function has dropped the key. - COMPACTION_RANGE_DEL_DROP_OBSOLETE, // all keys in range were deleted. + // Deletions obsoleted before bottom level due to file gap optimization. + COMPACTION_OPTIMIZED_DEL_DROP_OBSOLETE, // Number of keys written to the database via the Put and Write call's NUMBER_KEYS_WRITTEN, @@ -264,7 +265,9 @@ const std::vector> TickersNameMap = { {COMPACTION_KEY_DROP_RANGE_DEL, "rocksdb.compaction.key.drop.range_del"}, {COMPACTION_KEY_DROP_USER, "rocksdb.compaction.key.drop.user"}, {COMPACTION_RANGE_DEL_DROP_OBSOLETE, - "rocksdb.compaction.range_del.drop.obsolete"}, + "rocksdb.compaction.range_del.drop.obsolete"}, + {COMPACTION_OPTIMIZED_DEL_DROP_OBSOLETE, + "rocksdb.compaction.optimized.del.drop.obsolete"}, {NUMBER_KEYS_WRITTEN, "rocksdb.number.keys.written"}, {NUMBER_KEYS_READ, "rocksdb.number.keys.read"}, {NUMBER_KEYS_UPDATED, "rocksdb.number.keys.updated"}, diff --git a/include/rocksdb/table.h b/include/rocksdb/table.h index 40e4d88b6..1b4c0ced9 100644 --- a/include/rocksdb/table.h +++ b/include/rocksdb/table.h @@ -43,7 +43,7 @@ struct Options; using std::unique_ptr; enum ChecksumType : char { - kNoChecksum = 0x0, // not yet supported. Will fail + kNoChecksum = 0x0, kCRC32c = 0x1, kxxHash = 0x2, }; @@ -467,6 +467,12 @@ class TableFactory { // RocksDB prints configurations at DB Open(). 
virtual std::string GetPrintableTableOptions() const = 0; + virtual Status GetOptionString(std::string* opt_string, + const std::string& delimiter) const { + return Status::NotSupported( + "The table factory doesn't implement GetOptionString()."); + } + // Returns the raw pointer of the table options that is used by this // TableFactory, or nullptr if this function is not supported. // Since the return value is a raw pointer, the TableFactory owns the diff --git a/include/rocksdb/table_properties.h b/include/rocksdb/table_properties.h index 08360d179..2605fadd2 100644 --- a/include/rocksdb/table_properties.h +++ b/include/rocksdb/table_properties.h @@ -4,8 +4,8 @@ #pragma once #include -#include #include +#include #include "rocksdb/status.h" #include "rocksdb/types.h" @@ -49,6 +49,7 @@ struct TablePropertiesNames { static const std::string kPropertyCollectors; static const std::string kCompression; static const std::string kCreationTime; + static const std::string kOldestKeyTime; }; extern const std::string kPropertiesBlock; @@ -162,6 +163,8 @@ struct TableProperties { // The time when the SST file was created. // Since SST files are immutable, this is equivalent to last modified time. uint64_t creation_time = 0; + // Timestamp of the earliest key. 0 means unknown. + uint64_t oldest_key_time = 0; // Name of the column family with which this SST file is associated. // If column family is unknown, `column_family_name` will be an empty string. diff --git a/include/rocksdb/utilities/debug.h b/include/rocksdb/utilities/debug.h index f29fa045c..bc5b9bf03 100644 --- a/include/rocksdb/utilities/debug.h +++ b/include/rocksdb/utilities/debug.h @@ -1,9 +1,7 @@ -// Copyright (c) 2017-present, Facebook, Inc. All rights reserved. +// Copyright (c) 2017-present, Facebook, Inc. All rights reserved. 
// This source code is licensed under both the GPLv2 (found in the // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -// This source code is also licensed under the GPLv2 license found in the -// COPYING file in the root directory of this source tree. #pragma once @@ -18,6 +16,8 @@ namespace rocksdb { // store multiple versions of a same user key due to snapshots, compaction not // happening yet, etc. struct KeyVersion { + KeyVersion() : user_key(""), value(""), sequence(0), type(0) {} + KeyVersion(const std::string& _user_key, const std::string& _value, SequenceNumber _sequence, int _type) : user_key(_user_key), value(_value), sequence(_sequence), type(_type) {} diff --git a/include/rocksdb/utilities/sim_cache.h b/include/rocksdb/utilities/sim_cache.h index 60c73ec5d..f29fd5e8f 100644 --- a/include/rocksdb/utilities/sim_cache.h +++ b/include/rocksdb/utilities/sim_cache.h @@ -9,6 +9,7 @@ #include #include #include "rocksdb/cache.h" +#include "rocksdb/env.h" #include "rocksdb/slice.h" #include "rocksdb/statistics.h" #include "rocksdb/status.h" @@ -67,6 +68,19 @@ class SimCache : public Cache { // String representation of the statistics of the simcache virtual std::string ToString() const = 0; + // Start storing logs of the cache activity (Add/Lookup) into + // a file located at activity_log_file, max_logging_size option can be used to + // stop logging to the file automatically after reaching a specific size in + // bytes, a values of 0 disable this feature + virtual Status StartActivityLogging(const std::string& activity_log_file, + Env* env, uint64_t max_logging_size = 0) = 0; + + // Stop cache activity logging if any + virtual void StopActivityLogging() = 0; + + // Status of cache logging happening in background + virtual Status GetActivityLoggingStatus() = 0; + private: SimCache(const SimCache&); SimCache& operator=(const SimCache&); diff --git a/include/rocksdb/utilities/stackable_db.h 
b/include/rocksdb/utilities/stackable_db.h index db5068b1d..991de90aa 100644 --- a/include/rocksdb/utilities/stackable_db.h +++ b/include/rocksdb/utilities/stackable_db.h @@ -95,6 +95,8 @@ class StackableDB : public DB { return db_->IngestExternalFile(column_family, external_files, options); } + virtual Status VerifyChecksum() override { return db_->VerifyChecksum(); } + using DB::KeyMayExist; virtual bool KeyMayExist(const ReadOptions& options, ColumnFamilyHandle* column_family, const Slice& key, @@ -350,6 +352,17 @@ class StackableDB : public DB { return db_->GetUpdatesSince(seq_number, iter, read_options); } + virtual Status SuggestCompactRange(ColumnFamilyHandle* column_family, + const Slice* begin, + const Slice* end) override { + return db_->SuggestCompactRange(column_family, begin, end); + } + + virtual Status PromoteL0(ColumnFamilyHandle* column_family, + int target_level) override { + return db_->PromoteL0(column_family, target_level); + } + virtual ColumnFamilyHandle* DefaultColumnFamily() const override { return db_->DefaultColumnFamily(); } diff --git a/include/rocksdb/utilities/transaction.h b/include/rocksdb/utilities/transaction.h index 8507ef133..a3519739c 100644 --- a/include/rocksdb/utilities/transaction.h +++ b/include/rocksdb/utilities/transaction.h @@ -169,8 +169,26 @@ class Transaction { ColumnFamilyHandle* column_family, const Slice& key, std::string* value) = 0; + // An overload of the the above method that receives a PinnableSlice + // For backward compatiblity a default implementation is provided + virtual Status Get(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* pinnable_val) { + assert(pinnable_val != nullptr); + auto s = Get(options, column_family, key, pinnable_val->GetSelf()); + pinnable_val->PinSelf(); + return s; + } + virtual Status Get(const ReadOptions& options, const Slice& key, std::string* value) = 0; + virtual Status Get(const ReadOptions& options, const Slice& key, + 
PinnableSlice* pinnable_val) { + assert(pinnable_val != nullptr); + auto s = Get(options, key, pinnable_val->GetSelf()); + pinnable_val->PinSelf(); + return s; + } virtual std::vector MultiGet( const ReadOptions& options, @@ -212,6 +230,22 @@ class Transaction { const Slice& key, std::string* value, bool exclusive = true) = 0; + // An overload of the the above method that receives a PinnableSlice + // For backward compatiblity a default implementation is provided + virtual Status GetForUpdate(const ReadOptions& options, + ColumnFamilyHandle* column_family, + const Slice& key, PinnableSlice* pinnable_val, + bool exclusive = true) { + if (pinnable_val == nullptr) { + std::string* null_str = nullptr; + return GetForUpdate(options, key, null_str); + } else { + auto s = GetForUpdate(options, key, pinnable_val->GetSelf()); + pinnable_val->PinSelf(); + return s; + } + } + virtual Status GetForUpdate(const ReadOptions& options, const Slice& key, std::string* value, bool exclusive = true) = 0; diff --git a/include/rocksdb/utilities/transaction_db.h b/include/rocksdb/utilities/transaction_db.h index 259f50fe6..77043897a 100644 --- a/include/rocksdb/utilities/transaction_db.h +++ b/include/rocksdb/utilities/transaction_db.h @@ -23,6 +23,16 @@ namespace rocksdb { class TransactionDBMutexFactory; +enum TxnDBWritePolicy { + WRITE_COMMITTED = 0, // write only the committed data + // TODO(myabandeh): Not implemented yet + WRITE_PREPARED, // write data after the prepare phase of 2pc + // TODO(myabandeh): Not implemented yet + WRITE_UNPREPARED // write data before the prepare phase of 2pc +}; + +const uint32_t kInitialMaxDeadlocks = 5; + struct TransactionDBOptions { // Specifies the maximum number of keys that can be locked at the same time // per column family. @@ -31,6 +41,9 @@ struct TransactionDBOptions { // If this value is not positive, no limit will be enforced. 
int64_t max_num_locks = -1; + // Stores the number of latest deadlocks to track + uint32_t max_num_deadlocks = kInitialMaxDeadlocks; + // Increasing this value will increase the concurrency by dividing the lock // table (per column family) into more sub-tables, each with their own // separate @@ -66,6 +79,12 @@ struct TransactionDBOptions { // condition variable for all transaction locking instead of the default // mutex/condvar implementation. std::shared_ptr custom_mutex_factory; + + // The policy for when to write the data into the DB. The default policy is to + // write only the committed data (WRITE_COMMITTED). The data could be written + // before the commit phase. The DB then needs to provide the mechanisms to + // tell apart committed from uncommitted data. + TxnDBWritePolicy write_policy = TxnDBWritePolicy::WRITE_COMMITTED; }; struct TransactionOptions { @@ -111,6 +130,26 @@ struct KeyLockInfo { bool exclusive; }; +struct DeadlockInfo { + TransactionID m_txn_id; + uint32_t m_cf_id; + std::string m_waiting_key; + bool m_exclusive; +}; + +struct DeadlockPath { + std::vector path; + bool limit_exceeded; + + explicit DeadlockPath(std::vector path_entry) + : path(path_entry), limit_exceeded(false) {} + + // empty path, limit exceeded constructor and default constructor + explicit DeadlockPath(bool limit = false) : path(0), limit_exceeded(limit) {} + + bool empty() { return path.empty() && !limit_exceeded; } +}; + class TransactionDB : public StackableDB { public: // Open a TransactionDB similar to DB::Open(). 
@@ -169,6 +208,8 @@ class TransactionDB : public StackableDB { // The mapping is column family id -> KeyLockInfo virtual std::unordered_multimap GetLockStatusData() = 0; + virtual std::vector GetDeadlockInfoBuffer() = 0; + virtual void SetDeadlockInfoBufferSize(uint32_t target_size) = 0; protected: // To Create an TransactionDB, call Open() diff --git a/include/rocksdb/utilities/write_batch_with_index.h b/include/rocksdb/utilities/write_batch_with_index.h index 38809e1c7..24d8f30aa 100644 --- a/include/rocksdb/utilities/write_batch_with_index.h +++ b/include/rocksdb/utilities/write_batch_with_index.h @@ -186,10 +186,20 @@ class WriteBatchWithIndex : public WriteBatchBase { // regardless). Status GetFromBatchAndDB(DB* db, const ReadOptions& read_options, const Slice& key, std::string* value); + + // An overload of the the above method that receives a PinnableSlice + Status GetFromBatchAndDB(DB* db, const ReadOptions& read_options, + const Slice& key, PinnableSlice* value); + Status GetFromBatchAndDB(DB* db, const ReadOptions& read_options, ColumnFamilyHandle* column_family, const Slice& key, std::string* value); + // An overload of the the above method that receives a PinnableSlice + Status GetFromBatchAndDB(DB* db, const ReadOptions& read_options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* value); + // Records the state of the batch for future calls to RollbackToSavePoint(). // May be called multiple times to set multiple save points. void SetSavePoint() override; diff --git a/include/rocksdb/version.h b/include/rocksdb/version.h index 8d67af801..d592e6dae 100644 --- a/include/rocksdb/version.h +++ b/include/rocksdb/version.h @@ -5,8 +5,8 @@ #pragma once #define ROCKSDB_MAJOR 5 -#define ROCKSDB_MINOR 7 -#define ROCKSDB_PATCH 3 +#define ROCKSDB_MINOR 8 +#define ROCKSDB_PATCH 7 // Do not use these. We made the mistake of declaring macros starting with // double underscore. Now we have to live with our choice. 
We'll deprecate these diff --git a/include/rocksdb/write_batch.h b/include/rocksdb/write_batch.h index 8bd93d36c..336391ead 100644 --- a/include/rocksdb/write_batch.h +++ b/include/rocksdb/write_batch.h @@ -233,6 +233,12 @@ class WriteBatch : public WriteBatchBase { } virtual void Merge(const Slice& /*key*/, const Slice& /*value*/) {} + virtual Status PutBlobIndexCF(uint32_t /*column_family_id*/, + const Slice& /*key*/, + const Slice& /*value*/) { + return Status::InvalidArgument("PutBlobIndexCF not implemented"); + } + // The default implementation of LogData does nothing. virtual void LogData(const Slice& blob); diff --git a/java/CMakeLists.txt b/java/CMakeLists.txt index 6a22cee26..d67896c2c 100644 --- a/java/CMakeLists.txt +++ b/java/CMakeLists.txt @@ -24,6 +24,7 @@ set(JNI_NATIVE_SOURCES rocksjni/options.cc rocksjni/ratelimiterjni.cc rocksjni/remove_emptyvalue_compactionfilterjni.cc + rocksjni/cassandra_compactionfilterjni.cc rocksjni/restorejni.cc rocksjni/rocksdb_exception_test.cc rocksjni/rocksjni.cc @@ -55,6 +56,8 @@ set(NATIVE_JAVA_CLASSES org.rocksdb.BlockBasedTableConfig org.rocksdb.BloomFilter org.rocksdb.Cache + org.rocksdb.CassandraCompactionFilter + org.rocksdb.CassandraValueMergeOperator org.rocksdb.Checkpoint org.rocksdb.ClockCache org.rocksdb.ColumnFamilyHandle @@ -74,6 +77,7 @@ set(NATIVE_JAVA_CLASSES org.rocksdb.FlushOptions org.rocksdb.HashLinkedListMemTableConfig org.rocksdb.HashSkipListMemTableConfig + org.rocksdb.IngestExternalFileOptions org.rocksdb.Logger org.rocksdb.LRUCache org.rocksdb.MemTableConfig diff --git a/java/Makefile b/java/Makefile index 1210159af..b29447bd8 100644 --- a/java/Makefile +++ b/java/Makefile @@ -7,6 +7,7 @@ NATIVE_JAVA_CLASSES = org.rocksdb.AbstractCompactionFilter\ org.rocksdb.BloomFilter\ org.rocksdb.Checkpoint\ org.rocksdb.ClockCache\ + org.rocksdb.CassandraCompactionFilter\ org.rocksdb.CassandraValueMergeOperator\ org.rocksdb.ColumnFamilyHandle\ org.rocksdb.ColumnFamilyOptions\ diff --git 
a/java/rocksjni/cassandra_compactionfilterjni.cc b/java/rocksjni/cassandra_compactionfilterjni.cc new file mode 100644 index 000000000..9d77559ab --- /dev/null +++ b/java/rocksjni/cassandra_compactionfilterjni.cc @@ -0,0 +1,22 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include + +#include "include/org_rocksdb_CassandraCompactionFilter.h" +#include "utilities/cassandra/cassandra_compaction_filter.h" + +/* + * Class: org_rocksdb_CassandraCompactionFilter + * Method: createNewCassandraCompactionFilter0 + * Signature: ()J + */ +jlong Java_org_rocksdb_CassandraCompactionFilter_createNewCassandraCompactionFilter0( + JNIEnv* env, jclass jcls, jboolean purge_ttl_on_expiration) { + auto* compaction_filter = + new rocksdb::cassandra::CassandraCompactionFilter(purge_ttl_on_expiration); + // set the native handle to our native compaction filter + return reinterpret_cast(compaction_filter); +} diff --git a/java/rocksjni/cassandra_value_operator.cc b/java/rocksjni/cassandra_value_operator.cc index 889213b9c..aa58eccc2 100644 --- a/java/rocksjni/cassandra_value_operator.cc +++ b/java/rocksjni/cassandra_value_operator.cc @@ -1,9 +1,7 @@ -// Copyright (c) 2017-present, Facebook, Inc. All rights reserved. +// Copyright (c) 2017-present, Facebook, Inc. All rights reserved. // This source code is licensed under both the GPLv2 (found in the // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -// This source code is also licensed under the GPLv2 license found in the -// COPYING file in the root directory of this source tree. 
#include #include @@ -20,7 +18,7 @@ #include "rocksdb/table.h" #include "rocksdb/slice_transform.h" #include "rocksdb/merge_operator.h" -#include "utilities/merge_operators/cassandra/merge_operator.h" +#include "utilities/cassandra/merge_operator.h" /* * Class: org_rocksdb_CassandraValueMergeOperator diff --git a/java/rocksjni/sst_file_writerjni.cc b/java/rocksjni/sst_file_writerjni.cc index 40595fb95..ceb93384a 100644 --- a/java/rocksjni/sst_file_writerjni.cc +++ b/java/rocksjni/sst_file_writerjni.cc @@ -77,9 +77,9 @@ void Java_org_rocksdb_SstFileWriter_open(JNIEnv *env, jobject jobj, * Method: put * Signature: (JJJ)V */ -void Java_org_rocksdb_SstFileWriter_put(JNIEnv *env, jobject jobj, - jlong jhandle, jlong jkey_handle, - jlong jvalue_handle) { +void Java_org_rocksdb_SstFileWriter_put__JJJ(JNIEnv *env, jobject jobj, + jlong jhandle, jlong jkey_handle, + jlong jvalue_handle) { auto *key_slice = reinterpret_cast(jkey_handle); auto *value_slice = reinterpret_cast(jvalue_handle); rocksdb::Status s = @@ -90,14 +90,51 @@ void Java_org_rocksdb_SstFileWriter_put(JNIEnv *env, jobject jobj, } } +/* + * Class: org_rocksdb_SstFileWriter + * Method: put + * Signature: (JJJ)V + */ + void Java_org_rocksdb_SstFileWriter_put__J_3B_3B(JNIEnv *env, jobject jobj, + jlong jhandle, jbyteArray jkey, + jbyteArray jval) { + jbyte* key = env->GetByteArrayElements(jkey, nullptr); + if(key == nullptr) { + // exception thrown: OutOfMemoryError + return; + } + rocksdb::Slice key_slice( + reinterpret_cast(key), env->GetArrayLength(jkey)); + + jbyte* value = env->GetByteArrayElements(jval, nullptr); + if(value == nullptr) { + // exception thrown: OutOfMemoryError + env->ReleaseByteArrayElements(jkey, key, JNI_ABORT); + return; + } + rocksdb::Slice value_slice( + reinterpret_cast(value), env->GetArrayLength(jval)); + + rocksdb::Status s = + reinterpret_cast(jhandle)->Put(key_slice, + value_slice); + + env->ReleaseByteArrayElements(jkey, key, JNI_ABORT); + env->ReleaseByteArrayElements(jval, 
value, JNI_ABORT); + + if (!s.ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + } +} + /* * Class: org_rocksdb_SstFileWriter * Method: merge * Signature: (JJJ)V */ -void Java_org_rocksdb_SstFileWriter_merge(JNIEnv *env, jobject jobj, - jlong jhandle, jlong jkey_handle, - jlong jvalue_handle) { +void Java_org_rocksdb_SstFileWriter_merge__JJJ(JNIEnv *env, jobject jobj, + jlong jhandle, jlong jkey_handle, + jlong jvalue_handle) { auto *key_slice = reinterpret_cast(jkey_handle); auto *value_slice = reinterpret_cast(jvalue_handle); rocksdb::Status s = @@ -108,13 +145,76 @@ void Java_org_rocksdb_SstFileWriter_merge(JNIEnv *env, jobject jobj, } } +/* + * Class: org_rocksdb_SstFileWriter + * Method: merge + * Signature: (J[B[B)V + */ +void Java_org_rocksdb_SstFileWriter_merge__J_3B_3B(JNIEnv *env, jobject jobj, + jlong jhandle, jbyteArray jkey, + jbyteArray jval) { + + jbyte* key = env->GetByteArrayElements(jkey, nullptr); + if(key == nullptr) { + // exception thrown: OutOfMemoryError + return; + } + rocksdb::Slice key_slice( + reinterpret_cast(key), env->GetArrayLength(jkey)); + + jbyte* value = env->GetByteArrayElements(jval, nullptr); + if(value == nullptr) { + // exception thrown: OutOfMemoryError + env->ReleaseByteArrayElements(jkey, key, JNI_ABORT); + return; + } + rocksdb::Slice value_slice( + reinterpret_cast(value), env->GetArrayLength(jval)); + + rocksdb::Status s = + reinterpret_cast(jhandle)->Merge(key_slice, + value_slice); + + env->ReleaseByteArrayElements(jkey, key, JNI_ABORT); + env->ReleaseByteArrayElements(jval, value, JNI_ABORT); + + if (!s.ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + } +} + +/* + * Class: org_rocksdb_SstFileWriter + * Method: delete + * Signature: (JJJ)V + */ +void Java_org_rocksdb_SstFileWriter_delete__J_3B(JNIEnv *env, jobject jobj, + jlong jhandle, jbyteArray jkey) { + jbyte* key = env->GetByteArrayElements(jkey, nullptr); + if(key == nullptr) { + // exception thrown: OutOfMemoryError + return; + } + 
rocksdb::Slice key_slice( + reinterpret_cast(key), env->GetArrayLength(jkey)); + + rocksdb::Status s = + reinterpret_cast(jhandle)->Delete(key_slice); + + env->ReleaseByteArrayElements(jkey, key, JNI_ABORT); + + if (!s.ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + } +} + /* * Class: org_rocksdb_SstFileWriter * Method: delete * Signature: (JJJ)V */ -void Java_org_rocksdb_SstFileWriter_delete(JNIEnv *env, jobject jobj, - jlong jhandle, jlong jkey_handle) { + void Java_org_rocksdb_SstFileWriter_delete__JJ(JNIEnv *env, jobject jobj, + jlong jhandle, jlong jkey_handle) { auto *key_slice = reinterpret_cast(jkey_handle); rocksdb::Status s = reinterpret_cast(jhandle)->Delete(*key_slice); diff --git a/java/rocksjni/statisticsjni.cc b/java/rocksjni/statisticsjni.cc index dc1d8f9f8..584ab5aa6 100644 --- a/java/rocksjni/statisticsjni.cc +++ b/java/rocksjni/statisticsjni.cc @@ -1,7 +1,7 @@ // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). // // This file implements the callback "bridge" between Java and C++ for // rocksdb::Statistics diff --git a/java/rocksjni/statisticsjni.h b/java/rocksjni/statisticsjni.h index d7c3ef3aa..600d9a676 100644 --- a/java/rocksjni/statisticsjni.h +++ b/java/rocksjni/statisticsjni.h @@ -1,7 +1,7 @@ // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). // // This file implements the callback "bridge" between Java and C++ for // rocksdb::Statistics diff --git a/java/rocksjni/write_batch_test.cc b/java/rocksjni/write_batch_test.cc index 0654e0158..199ad239d 100644 --- a/java/rocksjni/write_batch_test.cc +++ b/java/rocksjni/write_batch_test.cc @@ -59,7 +59,7 @@ jbyteArray Java_org_rocksdb_WriteBatchTest_getContents( rocksdb::ReadOptions(), &arena)); for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { rocksdb::ParsedInternalKey ikey; - memset(reinterpret_cast(&ikey), 0, sizeof(ikey)); + ikey.clear(); bool parsed = rocksdb::ParseInternalKey(iter->key(), &ikey); if (!parsed) { assert(parsed); diff --git a/java/samples/src/main/java/RocksDBSample.java b/java/samples/src/main/java/RocksDBSample.java index b1f980555..f61995ed9 100644 --- a/java/samples/src/main/java/RocksDBSample.java +++ b/java/samples/src/main/java/RocksDBSample.java @@ -31,6 +31,7 @@ public static void main(final String[] args) { final Filter bloomFilter = new BloomFilter(10); final ReadOptions readOptions = new ReadOptions() .setFillCache(false); + final Statistics stats = new Statistics(); final RateLimiter rateLimiter = new RateLimiter(10000000,10000, 10)) { try (final RocksDB db = RocksDB.open(options, db_path_not_found)) { @@ -41,7 +42,7 @@ public static void main(final String[] args) { try { options.setCreateIfMissing(true) - .createStatistics() + .setStatistics(stats) .setWriteBufferSize(8 * SizeUnit.KB) .setMaxWriteBufferNumber(3) .setMaxBackgroundCompactions(10) @@ -51,8 +52,6 @@ public static void main(final String[] args) { assert (false); } - final Statistics stats = options.statisticsPtr(); - assert (options.createIfMissing() == true); assert (options.writeBufferSize() == 8 * SizeUnit.KB); assert (options.maxWriteBufferNumber() == 3); @@ -221,7 
+220,9 @@ public static void main(final String[] args) { try { for (final TickerType statsType : TickerType.values()) { - stats.getTickerCount(statsType); + if (statsType != TickerType.TICKER_ENUM_MAX) { + stats.getTickerCount(statsType); + } } System.out.println("getTickerCount() passed."); } catch (final Exception e) { @@ -231,7 +232,9 @@ public static void main(final String[] args) { try { for (final HistogramType histogramType : HistogramType.values()) { - HistogramData data = stats.getHistogramData(histogramType); + if (histogramType != HistogramType.HISTOGRAM_ENUM_MAX) { + HistogramData data = stats.getHistogramData(histogramType); + } } System.out.println("getHistogramData() passed."); } catch (final Exception e) { diff --git a/java/src/main/java/org/rocksdb/CassandraCompactionFilter.java b/java/src/main/java/org/rocksdb/CassandraCompactionFilter.java new file mode 100644 index 000000000..26bf35883 --- /dev/null +++ b/java/src/main/java/org/rocksdb/CassandraCompactionFilter.java @@ -0,0 +1,18 @@ +// Copyright (c) 2017-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +package org.rocksdb; + +/** + * Just a Java wrapper around CassandraCompactionFilter implemented in C++ + */ +public class CassandraCompactionFilter + extends AbstractCompactionFilter { + public CassandraCompactionFilter(boolean purgeTtlOnExpiration) { + super(createNewCassandraCompactionFilter0(purgeTtlOnExpiration)); + } + + private native static long createNewCassandraCompactionFilter0(boolean purgeTtlOnExpiration); +} diff --git a/java/src/main/java/org/rocksdb/CassandraValueMergeOperator.java b/java/src/main/java/org/rocksdb/CassandraValueMergeOperator.java index 55d67a3a3..a09556a2b 100644 --- a/java/src/main/java/org/rocksdb/CassandraValueMergeOperator.java +++ b/java/src/main/java/org/rocksdb/CassandraValueMergeOperator.java @@ -1,9 +1,7 @@ -// Copyright (c) 2017-present, Facebook, Inc. All rights reserved. +// Copyright (c) 2017-present, Facebook, Inc. All rights reserved. // This source code is licensed under both the GPLv2 (found in the // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -// This source code is also licensed under the GPLv2 license found in the -// COPYING file in the root directory of this source tree. package org.rocksdb; diff --git a/java/src/main/java/org/rocksdb/SstFileWriter.java b/java/src/main/java/org/rocksdb/SstFileWriter.java index 8fe576082..5f35f0f61 100644 --- a/java/src/main/java/org/rocksdb/SstFileWriter.java +++ b/java/src/main/java/org/rocksdb/SstFileWriter.java @@ -117,6 +117,20 @@ public void put(final DirectSlice key, final DirectSlice value) put(nativeHandle_, key.getNativeHandle(), value.getNativeHandle()); } + /** + * Add a Put key with value to currently opened file. + * + * @param key the specified key to be inserted. + * @param value the value associated with the specified key. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. 
+ */ +public void put(final byte[] key, final byte[] value) + throws RocksDBException { + put(nativeHandle_, key, value); +} + /** * Add a Merge key with value to currently opened file. * @@ -132,6 +146,21 @@ public void merge(final Slice key, final Slice value) merge(nativeHandle_, key.getNativeHandle(), value.getNativeHandle()); } + /** + * Add a Merge key with value to currently opened file. + * + * @param key the specified key to be merged. + * @param value the value to be merged with the current value for + * the specified key. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public void merge(final byte[] key, final byte[] value) + throws RocksDBException { + merge(nativeHandle_, key, value); + } + /** * Add a Merge key with value to currently opened file. * @@ -171,6 +200,18 @@ public void delete(final DirectSlice key) throws RocksDBException { delete(nativeHandle_, key.getNativeHandle()); } + /** + * Add a deletion key to currently opened file. + * + * @param key the specified key to be deleted. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public void delete(final byte[] key) throws RocksDBException { + delete(nativeHandle_, key); + } + /** * Finish the process and close the sst file. 
* @@ -193,13 +234,22 @@ private native void open(final long handle, final String filePath) private native void put(final long handle, final long keyHandle, final long valueHandle) throws RocksDBException; + + private native void put(final long handle, final byte[] key, + final byte[] value) throws RocksDBException; private native void merge(final long handle, final long keyHandle, final long valueHandle) throws RocksDBException; + private native void merge(final long handle, final byte[] key, + final byte[] value) throws RocksDBException; + private native void delete(final long handle, final long keyHandle) throws RocksDBException; + private native void delete(final long handle, final byte[] key) + throws RocksDBException; + private native void finish(final long handle) throws RocksDBException; @Override protected final native void disposeInternal(final long handle); diff --git a/java/src/main/java/org/rocksdb/StatsLevel.java b/java/src/main/java/org/rocksdb/StatsLevel.java index 023d4e1a2..cc2a87c6a 100644 --- a/java/src/main/java/org/rocksdb/StatsLevel.java +++ b/java/src/main/java/org/rocksdb/StatsLevel.java @@ -1,7 +1,7 @@ // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
package org.rocksdb; diff --git a/java/src/test/java/org/rocksdb/SstFileWriterTest.java b/java/src/test/java/org/rocksdb/SstFileWriterTest.java index 8c3b0c3d9..6261210b1 100644 --- a/java/src/test/java/org/rocksdb/SstFileWriterTest.java +++ b/java/src/test/java/org/rocksdb/SstFileWriterTest.java @@ -30,7 +30,7 @@ public class SstFileWriterTest { @Rule public TemporaryFolder parentFolder = new TemporaryFolder(); - enum OpType { PUT, MERGE, DELETE } + enum OpType { PUT, PUT_BYTES, MERGE, MERGE_BYTES, DELETE, DELETE_BYTES} class KeyValueWithOp { KeyValueWithOp(String key, String value, OpType opType) { @@ -79,16 +79,27 @@ private File newSstFile(final List keyValues, for (KeyValueWithOp keyValue : keyValues) { Slice keySlice = new Slice(keyValue.getKey()); Slice valueSlice = new Slice(keyValue.getValue()); + byte[] keyBytes = keyValue.getKey().getBytes(); + byte[] valueBytes = keyValue.getValue().getBytes(); switch (keyValue.getOpType()) { case PUT: sstFileWriter.put(keySlice, valueSlice); break; + case PUT_BYTES: + sstFileWriter.put(keyBytes, valueBytes); + break; case MERGE: sstFileWriter.merge(keySlice, valueSlice); break; + case MERGE_BYTES: + sstFileWriter.merge(keyBytes, valueBytes); + break; case DELETE: sstFileWriter.delete(keySlice); break; + case DELETE_BYTES: + sstFileWriter.delete(keyBytes); + break; default: fail("Unsupported op type"); } @@ -142,8 +153,12 @@ public void ingestSstFile() throws RocksDBException, IOException { final List keyValues = new ArrayList<>(); keyValues.add(new KeyValueWithOp("key1", "value1", OpType.PUT)); keyValues.add(new KeyValueWithOp("key2", "value2", OpType.PUT)); - keyValues.add(new KeyValueWithOp("key3", "value3", OpType.MERGE)); - keyValues.add(new KeyValueWithOp("key4", "", OpType.DELETE)); + keyValues.add(new KeyValueWithOp("key3", "value3", OpType.PUT_BYTES)); + keyValues.add(new KeyValueWithOp("key4", "value4", OpType.MERGE)); + keyValues.add(new KeyValueWithOp("key5", "value5", OpType.MERGE_BYTES)); + 
keyValues.add(new KeyValueWithOp("key6", "", OpType.DELETE)); + keyValues.add(new KeyValueWithOp("key7", "", OpType.DELETE)); + final File sstFile = newSstFile(keyValues, false); final File dbFolder = parentFolder.newFolder(DB_DIRECTORY_NAME); @@ -161,7 +176,10 @@ public void ingestSstFile() throws RocksDBException, IOException { assertThat(db.get("key1".getBytes())).isEqualTo("value1".getBytes()); assertThat(db.get("key2".getBytes())).isEqualTo("value2".getBytes()); assertThat(db.get("key3".getBytes())).isEqualTo("value3".getBytes()); - assertThat(db.get("key4".getBytes())).isEqualTo(null); + assertThat(db.get("key4".getBytes())).isEqualTo("value4".getBytes()); + assertThat(db.get("key5".getBytes())).isEqualTo("value5".getBytes()); + assertThat(db.get("key6".getBytes())).isEqualTo(null); + assertThat(db.get("key7".getBytes())).isEqualTo(null); } } diff --git a/java/src/test/java/org/rocksdb/StatisticsTest.java b/java/src/test/java/org/rocksdb/StatisticsTest.java index 6b1d0f16c..2103c2fc7 100644 --- a/java/src/test/java/org/rocksdb/StatisticsTest.java +++ b/java/src/test/java/org/rocksdb/StatisticsTest.java @@ -1,7 +1,7 @@ // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
package org.rocksdb; diff --git a/memtable/inlineskiplist_test.cc b/memtable/inlineskiplist_test.cc index 46d6c0fa9..5803e5b0f 100644 --- a/memtable/inlineskiplist_test.cc +++ b/memtable/inlineskiplist_test.cc @@ -571,6 +571,7 @@ static void RunConcurrentRead(int run) { fprintf(stderr, "Run %d of %d\n", i, N); } TestState state(seed + 1); + Env::Default()->SetBackgroundThreads(1); Env::Default()->Schedule(ConcurrentReader, &state); state.Wait(TestState::RUNNING); for (int k = 0; k < kSize; ++k) { diff --git a/memtable/skiplist_test.cc b/memtable/skiplist_test.cc index 2f4af1788..50c3588bb 100644 --- a/memtable/skiplist_test.cc +++ b/memtable/skiplist_test.cc @@ -363,6 +363,7 @@ static void RunConcurrent(int run) { fprintf(stderr, "Run %d of %d\n", i, N); } TestState state(seed + 1); + Env::Default()->SetBackgroundThreads(1); Env::Default()->Schedule(ConcurrentReader, &state); state.Wait(TestState::RUNNING); for (int k = 0; k < kSize; k++) { diff --git a/monitoring/file_read_sample.h b/monitoring/file_read_sample.h index 2cefe5522..9ad7d2f56 100644 --- a/monitoring/file_read_sample.h +++ b/monitoring/file_read_sample.h @@ -1,9 +1,7 @@ // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. -// This source code is also licensed under the GPLv2 license found in the -// COPYING file in the root directory of this source tree. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
// #pragma once #include "db/version_edit.h" diff --git a/monitoring/histogram.cc b/monitoring/histogram.cc index 835ffc88a..b3c01a78e 100644 --- a/monitoring/histogram.cc +++ b/monitoring/histogram.cc @@ -19,44 +19,30 @@ #include #include "port/port.h" +#include "util/cast_util.h" namespace rocksdb { -HistogramBucketMapper::HistogramBucketMapper() - : - // Add newer bucket index here. - // Should be always added in sorted order. - // If you change this, you also need to change - // size of array buckets_ in HistogramImpl - bucketValues_( - {1, 2, 3, 4, 5, 6, - 7, 8, 9, 10, 12, 14, - 16, 18, 20, 25, 30, 35, - 40, 45, 50, 60, 70, 80, - 90, 100, 120, 140, 160, 180, - 200, 250, 300, 350, 400, 450, - 500, 600, 700, 800, 900, 1000, - 1200, 1400, 1600, 1800, 2000, 2500, - 3000, 3500, 4000, 4500, 5000, 6000, - 7000, 8000, 9000, 10000, 12000, 14000, - 16000, 18000, 20000, 25000, 30000, 35000, - 40000, 45000, 50000, 60000, 70000, 80000, - 90000, 100000, 120000, 140000, 160000, 180000, - 200000, 250000, 300000, 350000, 400000, 450000, - 500000, 600000, 700000, 800000, 900000, 1000000, - 1200000, 1400000, 1600000, 1800000, 2000000, 2500000, - 3000000, 3500000, 4000000, 4500000, 5000000, 6000000, - 7000000, 8000000, 9000000, 10000000, 12000000, 14000000, - 16000000, 18000000, 20000000, 25000000, 30000000, 35000000, - 40000000, 45000000, 50000000, 60000000, 70000000, 80000000, - 90000000, 100000000, 120000000, 140000000, 160000000, 180000000, - 200000000, 250000000, 300000000, 350000000, 400000000, 450000000, - 500000000, 600000000, 700000000, 800000000, 900000000, 1000000000}), - maxBucketValue_(bucketValues_.back()), - minBucketValue_(bucketValues_.front()) { - for (size_t i =0; i < bucketValues_.size(); ++i) { - valueIndexMap_[bucketValues_[i]] = i; +HistogramBucketMapper::HistogramBucketMapper() { + // If you change this, you also need to change + // size of array buckets_ in HistogramImpl + bucketValues_ = {1, 2}; + valueIndexMap_ = {{1, 0}, {2, 1}}; + double bucket_val = 
static_cast(bucketValues_.back()); + while ((bucket_val = 1.5 * bucket_val) <= static_cast(port::kMaxUint64)) { + bucketValues_.push_back(static_cast(bucket_val)); + // Extracts two most significant digits to make histogram buckets more + // human-readable. E.g., 172 becomes 170. + uint64_t pow_of_ten = 1; + while (bucketValues_.back() / 10 > 10) { + bucketValues_.back() /= 10; + pow_of_ten *= 10; + } + bucketValues_.back() *= pow_of_ten; + valueIndexMap_[bucketValues_.back()] = bucketValues_.size() - 1; } + maxBucketValue_ = bucketValues_.back(); + minBucketValue_ = bucketValues_.front(); } size_t HistogramBucketMapper::IndexForValue(const uint64_t value) const { @@ -104,17 +90,26 @@ void HistogramStat::Add(uint64_t value) { // by concurrent threads is tolerable. const size_t index = bucketMapper.IndexForValue(value); assert(index < num_buckets_); - buckets_[index].fetch_add(1, std::memory_order_relaxed); + buckets_[index].store(buckets_[index].load(std::memory_order_relaxed) + 1, + std::memory_order_relaxed); uint64_t old_min = min(); - while (value < old_min && !min_.compare_exchange_weak(old_min, value)) {} + if (value < old_min) { + min_.store(value, std::memory_order_relaxed); + } uint64_t old_max = max(); - while (value > old_max && !max_.compare_exchange_weak(old_max, value)) {} + if (value > old_max) { + max_.store(value, std::memory_order_relaxed); + } - num_.fetch_add(1, std::memory_order_relaxed); - sum_.fetch_add(value, std::memory_order_relaxed); - sum_squares_.fetch_add(value * value, std::memory_order_relaxed); + num_.store(num_.load(std::memory_order_relaxed) + 1, + std::memory_order_relaxed); + sum_.store(sum_.load(std::memory_order_relaxed) + value, + std::memory_order_relaxed); + sum_squares_.store( + sum_squares_.load(std::memory_order_relaxed) + value * value, + std::memory_order_relaxed); } void HistogramStat::Merge(const HistogramStat& other) { @@ -255,7 +250,8 @@ void HistogramImpl::Add(uint64_t value) { void HistogramImpl::Merge(const 
Histogram& other) { if (strcmp(Name(), other.Name()) == 0) { - Merge(dynamic_cast(other)); + Merge( + *static_cast_with_check(&other)); } } diff --git a/monitoring/histogram.h b/monitoring/histogram.h index 6a1ebbf04..6bf2e9e93 100644 --- a/monitoring/histogram.h +++ b/monitoring/histogram.h @@ -45,9 +45,9 @@ class HistogramBucketMapper { } private: - const std::vector bucketValues_; - const uint64_t maxBucketValue_; - const uint64_t minBucketValue_; + std::vector bucketValues_; + uint64_t maxBucketValue_; + uint64_t minBucketValue_; std::map valueIndexMap_; }; @@ -89,7 +89,7 @@ struct HistogramStat { std::atomic_uint_fast64_t num_; std::atomic_uint_fast64_t sum_; std::atomic_uint_fast64_t sum_squares_; - std::atomic_uint_fast64_t buckets_[138]; // 138==BucketMapper::BucketCount() + std::atomic_uint_fast64_t buckets_[109]; // 109==BucketMapper::BucketCount() const uint64_t num_buckets_; }; @@ -146,4 +146,4 @@ class HistogramImpl : public Histogram { std::mutex mutex_; }; -} // namespace rocksdb \ No newline at end of file +} // namespace rocksdb diff --git a/monitoring/histogram_test.cc b/monitoring/histogram_test.cc index 70147af72..b4e3c981c 100644 --- a/monitoring/histogram_test.cc +++ b/monitoring/histogram_test.cc @@ -29,33 +29,31 @@ void PopulateHistogram(Histogram& histogram, } void BasicOperation(Histogram& histogram) { - PopulateHistogram(histogram, 1, 100, 10); + PopulateHistogram(histogram, 1, 110, 10); // fill up to bucket [70, 110) HistogramData data; histogram.Data(&data); - ASSERT_LE(fabs(histogram.Percentile(100.0) - 100.0), kIota); - ASSERT_LE(fabs(data.percentile99 - 99.0), kIota); - ASSERT_LE(fabs(data.percentile95 - 95.0), kIota); - ASSERT_LE(fabs(data.median - 50.0), kIota); - ASSERT_EQ(data.average, 50.5); // avg is acurately calculated. 
- ASSERT_LT(fabs(data.standard_deviation- 28.86), kIota); //sd is ~= 28.86 + ASSERT_LE(fabs(histogram.Percentile(100.0) - 110.0), kIota); + ASSERT_LE(fabs(data.percentile99 - 108.9), kIota); // 99 * 110 / 100 + ASSERT_LE(fabs(data.percentile95 - 104.5), kIota); // 95 * 110 / 100 + ASSERT_LE(fabs(data.median - 55.0), kIota); // 50 * 110 / 100 + ASSERT_EQ(data.average, 55.5); // (1 + 110) / 2 } void MergeHistogram(Histogram& histogram, Histogram& other) { PopulateHistogram(histogram, 1, 100); - PopulateHistogram(other, 101, 200); + PopulateHistogram(other, 101, 250); histogram.Merge(other); HistogramData data; histogram.Data(&data); - ASSERT_LE(fabs(histogram.Percentile(100.0) - 200.0), kIota); - ASSERT_LE(fabs(data.percentile99 - 198.0), kIota); - ASSERT_LE(fabs(data.percentile95 - 190.0), kIota); - ASSERT_LE(fabs(data.median - 100.0), kIota); - ASSERT_EQ(data.average, 100.5); // avg is acurately calculated. - ASSERT_LT(fabs(data.standard_deviation - 57.73), kIota); //sd is ~= 57.73 + ASSERT_LE(fabs(histogram.Percentile(100.0) - 250.0), kIota); + ASSERT_LE(fabs(data.percentile99 - 247.5), kIota); // 99 * 250 / 100 + ASSERT_LE(fabs(data.percentile95 - 237.5), kIota); // 95 * 250 / 100 + ASSERT_LE(fabs(data.median - 125.0), kIota); // 50 * 250 / 100 + ASSERT_EQ(data.average, 125.5); // (1 + 250) / 2 } void EmptyHistogram(Histogram& histogram) { diff --git a/monitoring/histogram_windowing.cc b/monitoring/histogram_windowing.cc index 20ee983f1..28d8265f2 100644 --- a/monitoring/histogram_windowing.cc +++ b/monitoring/histogram_windowing.cc @@ -9,6 +9,7 @@ #include "monitoring/histogram_windowing.h" #include "monitoring/histogram.h" +#include "util/cast_util.h" #include @@ -64,7 +65,9 @@ void HistogramWindowingImpl::Add(uint64_t value){ void HistogramWindowingImpl::Merge(const Histogram& other) { if (strcmp(Name(), other.Name()) == 0) { - Merge(dynamic_cast(other)); + Merge( + *static_cast_with_check( + &other)); } } diff --git a/monitoring/perf_context.cc 
b/monitoring/perf_context.cc index 55df0459b..791f4bdbe 100644 --- a/monitoring/perf_context.cc +++ b/monitoring/perf_context.cc @@ -40,6 +40,9 @@ void PerfContext::Reset() { block_read_time = 0; block_checksum_time = 0; block_decompress_time = 0; + get_read_bytes = 0; + multiget_read_bytes = 0; + iter_read_bytes = 0; internal_key_skipped_count = 0; internal_delete_skipped_count = 0; internal_recent_skipped_count = 0; @@ -117,6 +120,9 @@ std::string PerfContext::ToString(bool exclude_zero_counters) const { PERF_CONTEXT_OUTPUT(block_read_time); PERF_CONTEXT_OUTPUT(block_checksum_time); PERF_CONTEXT_OUTPUT(block_decompress_time); + PERF_CONTEXT_OUTPUT(get_read_bytes); + PERF_CONTEXT_OUTPUT(multiget_read_bytes); + PERF_CONTEXT_OUTPUT(iter_read_bytes); PERF_CONTEXT_OUTPUT(internal_key_skipped_count); PERF_CONTEXT_OUTPUT(internal_delete_skipped_count); PERF_CONTEXT_OUTPUT(internal_recent_skipped_count); diff --git a/options/cf_options.h b/options/cf_options.h index df5b460fc..f376729f8 100644 --- a/options/cf_options.h +++ b/options/cf_options.h @@ -83,7 +83,7 @@ struct ImmutableCFOptions { bool advise_random_on_open; // This options is required by PlainTableReader. 
May need to move it - // to PlainTalbeOptions just like bloom_bits_per_key + // to PlainTableOptions just like bloom_bits_per_key uint32_t bloom_locality; bool purge_redundant_kvs_while_flush; diff --git a/options/db_options.cc b/options/db_options.cc index 2a7860450..61775757d 100644 --- a/options/db_options.cc +++ b/options/db_options.cc @@ -162,6 +162,8 @@ void ImmutableDBOptions::Dump(Logger* log) const { ROCKS_LOG_HEADER( log, " Options.db_write_buffer_size: %" ROCKSDB_PRIszt, db_write_buffer_size); + ROCKS_LOG_HEADER(log, " Options.write_buffer_manager: %p", + write_buffer_manager.get()); ROCKS_LOG_HEADER(log, " Options.access_hint_on_compaction_start: %d", static_cast(access_hint_on_compaction_start)); ROCKS_LOG_HEADER(log, " Options.new_table_reader_for_compaction_inputs: %d", diff --git a/options/options_helper.cc b/options/options_helper.cc index 9e984f6e3..5cf548fb9 100644 --- a/options/options_helper.cc +++ b/options/options_helper.cc @@ -21,6 +21,7 @@ #include "rocksdb/table.h" #include "table/block_based_table_factory.h" #include "table/plain_table_factory.h" +#include "util/cast_util.h" #include "util/string_util.h" namespace rocksdb { @@ -303,6 +304,7 @@ bool ParseSliceTransform( // SliceTransforms here. return false; } +} // anonymouse namespace bool ParseOptionHelper(char* opt_address, const OptionType& opt_type, const std::string& value) { @@ -383,8 +385,6 @@ bool ParseOptionHelper(char* opt_address, const OptionType& opt_type, return true; } -} // anonymouse namespace - bool SerializeSingleOptionHelper(const char* opt_address, const OptionType opt_type, std::string* value) { @@ -466,12 +466,14 @@ bool SerializeSingleOptionHelper(const char* opt_address, // Since the user-specified comparator will be wrapped by // InternalKeyComparator, we should persist the user-specified one // instead of InternalKeyComparator. 
- const auto* internal_comparator = - dynamic_cast(*ptr); - if (internal_comparator != nullptr) { - *value = internal_comparator->user_comparator()->Name(); + if (*ptr == nullptr) { + *value = kNullptrString; } else { - *value = *ptr ? (*ptr)->Name() : kNullptrString; + const Comparator* root_comp = (*ptr)->GetRootComparator(); + if (root_comp == nullptr) { + root_comp = (*ptr); + } + *value = root_comp->Name(); } break; } @@ -693,8 +695,9 @@ Status ParseColumnFamilyOption(const std::string& name, if (name == "block_based_table_factory") { // Nested options BlockBasedTableOptions table_opt, base_table_options; - auto block_based_table_factory = dynamic_cast( - new_options->table_factory.get()); + BlockBasedTableFactory* block_based_table_factory = + static_cast_with_check( + new_options->table_factory.get()); if (block_based_table_factory != nullptr) { base_table_options = block_based_table_factory->table_options(); } @@ -708,8 +711,9 @@ Status ParseColumnFamilyOption(const std::string& name, } else if (name == "plain_table_factory") { // Nested options PlainTableOptions table_opt, base_table_options; - auto plain_table_factory = dynamic_cast( - new_options->table_factory.get()); + PlainTableFactory* plain_table_factory = + static_cast_with_check( + new_options->table_factory.get()); if (plain_table_factory != nullptr) { base_table_options = plain_table_factory->table_options(); } @@ -909,59 +913,6 @@ std::vector GetSupportedCompressions() { return supported_compressions; } -bool SerializeSingleBlockBasedTableOption( - std::string* opt_string, const BlockBasedTableOptions& bbt_options, - const std::string& name, const std::string& delimiter) { - auto iter = block_based_table_type_info.find(name); - if (iter == block_based_table_type_info.end()) { - return false; - } - auto& opt_info = iter->second; - const char* opt_address = - reinterpret_cast(&bbt_options) + opt_info.offset; - std::string value; - bool result = SerializeSingleOptionHelper(opt_address, 
opt_info.type, &value); - if (result) { - *opt_string = name + "=" + value + delimiter; - } - return result; -} - -Status GetStringFromBlockBasedTableOptions( - std::string* opt_string, const BlockBasedTableOptions& bbt_options, - const std::string& delimiter) { - assert(opt_string); - opt_string->clear(); - for (auto iter = block_based_table_type_info.begin(); - iter != block_based_table_type_info.end(); ++iter) { - if (iter->second.verification == OptionVerificationType::kDeprecated) { - // If the option is no longer used in rocksdb and marked as deprecated, - // we skip it in the serialization. - continue; - } - std::string single_output; - bool result = SerializeSingleBlockBasedTableOption( - &single_output, bbt_options, iter->first, delimiter); - assert(result); - if (result) { - opt_string->append(single_output); - } - } - return Status::OK(); -} - -Status GetStringFromTableFactory(std::string* opts_str, const TableFactory* tf, - const std::string& delimiter) { - const auto* bbtf = dynamic_cast(tf); - opts_str->clear(); - if (bbtf != nullptr) { - return GetStringFromBlockBasedTableOptions(opts_str, bbtf->table_options(), - delimiter); - } - - return Status::OK(); -} - Status ParseDBOption(const std::string& name, const std::string& org_value, DBOptions* new_options, @@ -1003,242 +954,6 @@ Status ParseDBOption(const std::string& name, return Status::OK(); } -std::string ParseBlockBasedTableOption(const std::string& name, - const std::string& org_value, - BlockBasedTableOptions* new_options, - bool input_strings_escaped = false, - bool ignore_unknown_options = false) { - const std::string& value = - input_strings_escaped ? UnescapeOptionString(org_value) : org_value; - if (!input_strings_escaped) { - // if the input string is not escaped, it means this function is - // invoked from SetOptions, which takes the old format. 
- if (name == "block_cache") { - new_options->block_cache = NewLRUCache(ParseSizeT(value)); - return ""; - } else if (name == "block_cache_compressed") { - new_options->block_cache_compressed = NewLRUCache(ParseSizeT(value)); - return ""; - } else if (name == "filter_policy") { - // Expect the following format - // bloomfilter:int:bool - const std::string kName = "bloomfilter:"; - if (value.compare(0, kName.size(), kName) != 0) { - return "Invalid filter policy name"; - } - size_t pos = value.find(':', kName.size()); - if (pos == std::string::npos) { - return "Invalid filter policy config, missing bits_per_key"; - } - int bits_per_key = - ParseInt(trim(value.substr(kName.size(), pos - kName.size()))); - bool use_block_based_builder = - ParseBoolean("use_block_based_builder", trim(value.substr(pos + 1))); - new_options->filter_policy.reset( - NewBloomFilterPolicy(bits_per_key, use_block_based_builder)); - return ""; - } - } - const auto iter = block_based_table_type_info.find(name); - if (iter == block_based_table_type_info.end()) { - if (ignore_unknown_options) { - return ""; - } else { - return "Unrecognized option"; - } - } - const auto& opt_info = iter->second; - if (opt_info.verification != OptionVerificationType::kDeprecated && - !ParseOptionHelper(reinterpret_cast(new_options) + opt_info.offset, - opt_info.type, value)) { - return "Invalid value"; - } - return ""; -} - -std::string ParsePlainTableOptions(const std::string& name, - const std::string& org_value, - PlainTableOptions* new_options, - bool input_strings_escaped = false, - bool ignore_unknown_options = false) { - const std::string& value = - input_strings_escaped ? 
UnescapeOptionString(org_value) : org_value; - const auto iter = plain_table_type_info.find(name); - if (iter == plain_table_type_info.end()) { - if (ignore_unknown_options) { - return ""; - } else { - return "Unrecognized option"; - } - } - const auto& opt_info = iter->second; - if (opt_info.verification != OptionVerificationType::kDeprecated && - !ParseOptionHelper(reinterpret_cast(new_options) + opt_info.offset, - opt_info.type, value)) { - return "Invalid value"; - } - return ""; -} - -Status GetBlockBasedTableOptionsFromMap( - const BlockBasedTableOptions& table_options, - const std::unordered_map& opts_map, - BlockBasedTableOptions* new_table_options, bool input_strings_escaped, - bool ignore_unknown_options) { - assert(new_table_options); - *new_table_options = table_options; - for (const auto& o : opts_map) { - auto error_message = ParseBlockBasedTableOption( - o.first, o.second, new_table_options, input_strings_escaped, - ignore_unknown_options); - if (error_message != "") { - const auto iter = block_based_table_type_info.find(o.first); - if (iter == block_based_table_type_info.end() || - !input_strings_escaped || // !input_strings_escaped indicates - // the old API, where everything is - // parsable. - (iter->second.verification != OptionVerificationType::kByName && - iter->second.verification != - OptionVerificationType::kByNameAllowNull && - iter->second.verification != - OptionVerificationType::kDeprecated)) { - // Restore "new_options" to the default "base_options". 
- *new_table_options = table_options; - return Status::InvalidArgument("Can't parse BlockBasedTableOptions:", - o.first + " " + error_message); - } - } - } - return Status::OK(); -} - -Status GetBlockBasedTableOptionsFromString( - const BlockBasedTableOptions& table_options, - const std::string& opts_str, - BlockBasedTableOptions* new_table_options) { - std::unordered_map opts_map; - Status s = StringToMap(opts_str, &opts_map); - if (!s.ok()) { - return s; - } - return GetBlockBasedTableOptionsFromMap(table_options, opts_map, - new_table_options); -} - -Status GetPlainTableOptionsFromMap( - const PlainTableOptions& table_options, - const std::unordered_map& opts_map, - PlainTableOptions* new_table_options, bool input_strings_escaped, - bool ignore_unknown_options) { - assert(new_table_options); - *new_table_options = table_options; - for (const auto& o : opts_map) { - auto error_message = ParsePlainTableOptions( - o.first, o.second, new_table_options, input_strings_escaped); - if (error_message != "") { - const auto iter = plain_table_type_info.find(o.first); - if (iter == plain_table_type_info.end() || - !input_strings_escaped || // !input_strings_escaped indicates - // the old API, where everything is - // parsable. - (iter->second.verification != OptionVerificationType::kByName && - iter->second.verification != - OptionVerificationType::kByNameAllowNull && - iter->second.verification != - OptionVerificationType::kDeprecated)) { - // Restore "new_options" to the default "base_options". 
- *new_table_options = table_options; - return Status::InvalidArgument("Can't parse PlainTableOptions:", - o.first + " " + error_message); - } - } - } - return Status::OK(); -} - -Status GetPlainTableOptionsFromString( - const PlainTableOptions& table_options, - const std::string& opts_str, - PlainTableOptions* new_table_options) { - std::unordered_map opts_map; - Status s = StringToMap(opts_str, &opts_map); - if (!s.ok()) { - return s; - } - return GetPlainTableOptionsFromMap(table_options, opts_map, - new_table_options); -} - -Status GetMemTableRepFactoryFromString(const std::string& opts_str, - std::unique_ptr* new_mem_factory) { - std::vector opts_list = StringSplit(opts_str, ':'); - size_t len = opts_list.size(); - - if (opts_list.size() <= 0 || opts_list.size() > 2) { - return Status::InvalidArgument("Can't parse memtable_factory option ", - opts_str); - } - - MemTableRepFactory* mem_factory = nullptr; - - if (opts_list[0] == "skip_list") { - // Expecting format - // skip_list: - if (2 == len) { - size_t lookahead = ParseSizeT(opts_list[1]); - mem_factory = new SkipListFactory(lookahead); - } else if (1 == len) { - mem_factory = new SkipListFactory(); - } - } else if (opts_list[0] == "prefix_hash") { - // Expecting format - // prfix_hash: - if (2 == len) { - size_t hash_bucket_count = ParseSizeT(opts_list[1]); - mem_factory = NewHashSkipListRepFactory(hash_bucket_count); - } else if (1 == len) { - mem_factory = NewHashSkipListRepFactory(); - } - } else if (opts_list[0] == "hash_linkedlist") { - // Expecting format - // hash_linkedlist: - if (2 == len) { - size_t hash_bucket_count = ParseSizeT(opts_list[1]); - mem_factory = NewHashLinkListRepFactory(hash_bucket_count); - } else if (1 == len) { - mem_factory = NewHashLinkListRepFactory(); - } - } else if (opts_list[0] == "vector") { - // Expecting format - // vector: - if (2 == len) { - size_t count = ParseSizeT(opts_list[1]); - mem_factory = new VectorRepFactory(count); - } else if (1 == len) { - mem_factory = 
new VectorRepFactory(); - } - } else if (opts_list[0] == "cuckoo") { - // Expecting format - // cuckoo: - if (2 == len) { - size_t write_buffer_size = ParseSizeT(opts_list[1]); - mem_factory= NewHashCuckooRepFactory(write_buffer_size); - } else if (1 == len) { - return Status::InvalidArgument("Can't parse memtable_factory option ", - opts_str); - } - } else { - return Status::InvalidArgument("Unrecognized memtable_factory option ", - opts_str); - } - - if (mem_factory != nullptr){ - new_mem_factory->reset(mem_factory); - } - - return Status::OK(); -} - Status GetColumnFamilyOptionsFromMap( const ColumnFamilyOptions& base_options, const std::unordered_map& opts_map, diff --git a/options/options_helper.h b/options/options_helper.h index b15faa74f..67b04271f 100644 --- a/options/options_helper.h +++ b/options/options_helper.h @@ -42,6 +42,9 @@ static std::map {kCompactionStopStyleSimilarSize, "kCompactionStopStyleSimilarSize"}, {kCompactionStopStyleTotalSize, "kCompactionStopStyleTotalSize"}}; +static std::unordered_map checksum_type_string_map = + {{"kNoChecksum", kNoChecksum}, {"kCRC32c", kCRC32c}, {"kxxHash", kxxHash}}; + #ifndef ROCKSDB_LITE Status GetMutableOptionsFromStrings( @@ -60,9 +63,6 @@ Status GetTableFactoryFromMap( std::shared_ptr* table_factory, bool ignore_unknown_options = false); -Status GetStringFromTableFactory(std::string* opts_str, const TableFactory* tf, - const std::string& delimiter = "; "); - enum class OptionType { kBoolean, kInt, @@ -580,109 +580,6 @@ static std::unordered_map cf_options_type_info = { {offset_of(&ColumnFamilyOptions::compaction_pri), OptionType::kCompactionPri, OptionVerificationType::kNormal, false, 0}}}; -static std::unordered_map - block_based_table_type_info = { - /* currently not supported - std::shared_ptr block_cache = nullptr; - std::shared_ptr block_cache_compressed = nullptr; - */ - {"flush_block_policy_factory", - {offsetof(struct BlockBasedTableOptions, flush_block_policy_factory), - 
OptionType::kFlushBlockPolicyFactory, OptionVerificationType::kByName, - false, 0}}, - {"cache_index_and_filter_blocks", - {offsetof(struct BlockBasedTableOptions, - cache_index_and_filter_blocks), - OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, - {"cache_index_and_filter_blocks_with_high_priority", - {offsetof(struct BlockBasedTableOptions, - cache_index_and_filter_blocks_with_high_priority), - OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, - {"pin_l0_filter_and_index_blocks_in_cache", - {offsetof(struct BlockBasedTableOptions, - pin_l0_filter_and_index_blocks_in_cache), - OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, - {"index_type", - {offsetof(struct BlockBasedTableOptions, index_type), - OptionType::kBlockBasedTableIndexType, - OptionVerificationType::kNormal, false, 0}}, - {"hash_index_allow_collision", - {offsetof(struct BlockBasedTableOptions, hash_index_allow_collision), - OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, - {"checksum", - {offsetof(struct BlockBasedTableOptions, checksum), - OptionType::kChecksumType, OptionVerificationType::kNormal, false, - 0}}, - {"no_block_cache", - {offsetof(struct BlockBasedTableOptions, no_block_cache), - OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, - {"block_size", - {offsetof(struct BlockBasedTableOptions, block_size), - OptionType::kSizeT, OptionVerificationType::kNormal, false, 0}}, - {"block_size_deviation", - {offsetof(struct BlockBasedTableOptions, block_size_deviation), - OptionType::kInt, OptionVerificationType::kNormal, false, 0}}, - {"block_restart_interval", - {offsetof(struct BlockBasedTableOptions, block_restart_interval), - OptionType::kInt, OptionVerificationType::kNormal, false, 0}}, - {"index_block_restart_interval", - {offsetof(struct BlockBasedTableOptions, index_block_restart_interval), - OptionType::kInt, OptionVerificationType::kNormal, false, 0}}, - {"index_per_partition", - {0, 
OptionType::kUInt64T, OptionVerificationType::kDeprecated, false, - 0}}, - {"metadata_block_size", - {offsetof(struct BlockBasedTableOptions, metadata_block_size), - OptionType::kUInt64T, OptionVerificationType::kNormal, false, 0}}, - {"partition_filters", - {offsetof(struct BlockBasedTableOptions, partition_filters), - OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, - {"filter_policy", - {offsetof(struct BlockBasedTableOptions, filter_policy), - OptionType::kFilterPolicy, OptionVerificationType::kByName, false, - 0}}, - {"whole_key_filtering", - {offsetof(struct BlockBasedTableOptions, whole_key_filtering), - OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, - {"skip_table_builder_flush", - {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, false, - 0}}, - {"format_version", - {offsetof(struct BlockBasedTableOptions, format_version), - OptionType::kUInt32T, OptionVerificationType::kNormal, false, 0}}, - {"verify_compression", - {offsetof(struct BlockBasedTableOptions, verify_compression), - OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, - {"read_amp_bytes_per_bit", - {offsetof(struct BlockBasedTableOptions, read_amp_bytes_per_bit), - OptionType::kSizeT, OptionVerificationType::kNormal, false, 0}}}; - -static std::unordered_map plain_table_type_info = { - {"user_key_len", - {offsetof(struct PlainTableOptions, user_key_len), OptionType::kUInt32T, - OptionVerificationType::kNormal, false, 0}}, - {"bloom_bits_per_key", - {offsetof(struct PlainTableOptions, bloom_bits_per_key), OptionType::kInt, - OptionVerificationType::kNormal, false, 0}}, - {"hash_table_ratio", - {offsetof(struct PlainTableOptions, hash_table_ratio), OptionType::kDouble, - OptionVerificationType::kNormal, false, 0}}, - {"index_sparseness", - {offsetof(struct PlainTableOptions, index_sparseness), OptionType::kSizeT, - OptionVerificationType::kNormal, false, 0}}, - {"huge_page_tlb_size", - {offsetof(struct PlainTableOptions, 
huge_page_tlb_size), - OptionType::kSizeT, OptionVerificationType::kNormal, false, 0}}, - {"encoding_type", - {offsetof(struct PlainTableOptions, encoding_type), - OptionType::kEncodingType, OptionVerificationType::kByName, false, 0}}, - {"full_scan_mode", - {offsetof(struct PlainTableOptions, full_scan_mode), OptionType::kBoolean, - OptionVerificationType::kNormal, false, 0}}, - {"store_index_in_file", - {offsetof(struct PlainTableOptions, store_index_in_file), - OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}}; - static std::unordered_map compression_type_string_map = { {"kNoCompression", kNoCompression}, @@ -706,9 +603,6 @@ static std::unordered_map static std::unordered_map encoding_type_string_map = {{"kPlain", kPlain}, {"kPrefix", kPrefix}}; -static std::unordered_map checksum_type_string_map = - {{"kNoChecksum", kNoChecksum}, {"kCRC32c", kCRC32c}, {"kxxHash", kxxHash}}; - static std::unordered_map compaction_style_string_map = { {"kCompactionStyleLevel", kCompactionStyleLevel}, @@ -745,6 +639,12 @@ static std::unordered_map info_log_level_string_map = {"FATAL_LEVEL", InfoLogLevel::FATAL_LEVEL}, {"HEADER_LEVEL", InfoLogLevel::HEADER_LEVEL}}; +extern Status StringToMap( + const std::string& opts_str, + std::unordered_map* opts_map); + +extern bool ParseOptionHelper(char* opt_address, const OptionType& opt_type, + const std::string& value); #endif // !ROCKSDB_LITE } // namespace rocksdb diff --git a/options/options_parser.cc b/options/options_parser.cc index d5a3fec6e..2cb60a068 100644 --- a/options/options_parser.cc +++ b/options/options_parser.cc @@ -16,6 +16,7 @@ #include "options/options_helper.h" #include "rocksdb/convenience.h" #include "rocksdb/db.h" +#include "util/cast_util.h" #include "util/string_util.h" #include "util/sync_point.h" @@ -84,7 +85,8 @@ Status PersistRocksDBOptions(const DBOptions& db_opt, writable->Append("[" + opt_section_titles[kOptionSectionTableOptions] + tf->Name() + " \"" + EscapeOptionString(cf_names[i]) + 
"\"]\n "); - s = GetStringFromTableFactory(&options_file_content, tf, "\n "); + options_file_content.clear(); + s = tf->GetOptionString(&options_file_content, "\n "); if (!s.ok()) { return s; } @@ -507,6 +509,7 @@ namespace { bool AreEqualDoubles(const double a, const double b) { return (fabs(a - b) < 0.00001); } +} // namespace bool AreEqualOptions( const char* opt1, const char* opt2, const OptionTypeInfo& type_info, @@ -613,8 +616,6 @@ bool AreEqualOptions( } } -} // namespace - Status RocksDBOptionsParser::VerifyRocksDBOptionsFromFile( const DBOptions& db_opt, const std::vector& cf_names, const std::vector& cf_opts, @@ -762,59 +763,23 @@ Status RocksDBOptionsParser::VerifyCFOptions( return Status::OK(); } -Status RocksDBOptionsParser::VerifyBlockBasedTableFactory( - const BlockBasedTableFactory* base_tf, - const BlockBasedTableFactory* file_tf, - OptionsSanityCheckLevel sanity_check_level) { - if ((base_tf != nullptr) != (file_tf != nullptr) && - sanity_check_level > kSanityLevelNone) { - return Status::Corruption( - "[RocksDBOptionsParser]: Inconsistent TableFactory class type"); - } - if (base_tf == nullptr) { - return Status::OK(); - } - assert(file_tf != nullptr); - - const auto& base_opt = base_tf->table_options(); - const auto& file_opt = file_tf->table_options(); - - for (auto& pair : block_based_table_type_info) { - if (pair.second.verification == OptionVerificationType::kDeprecated) { - // We skip checking deprecated variables as they might - // contain random values since they might not be initialized - continue; - } - if (BBTOptionSanityCheckLevel(pair.first) <= sanity_check_level) { - if (!AreEqualOptions(reinterpret_cast(&base_opt), - reinterpret_cast(&file_opt), - pair.second, pair.first, nullptr)) { - return Status::Corruption( - "[RocksDBOptionsParser]: " - "failed the verification on BlockBasedTableOptions::", - pair.first); - } - } - } - return Status::OK(); -} - Status RocksDBOptionsParser::VerifyTableFactory( const TableFactory* base_tf, 
const TableFactory* file_tf, OptionsSanityCheckLevel sanity_check_level) { if (base_tf && file_tf) { if (sanity_check_level > kSanityLevelNone && - base_tf->Name() != file_tf->Name()) { + std::string(base_tf->Name()) != std::string(file_tf->Name())) { return Status::Corruption( "[RocksDBOptionsParser]: " "failed the verification on TableFactory->Name()"); } - auto s = VerifyBlockBasedTableFactory( - dynamic_cast(base_tf), - dynamic_cast(file_tf), - sanity_check_level); - if (!s.ok()) { - return s; + if (base_tf->Name() == BlockBasedTableFactory::kName) { + return VerifyBlockBasedTableFactory( + static_cast_with_check(base_tf), + static_cast_with_check(file_tf), + sanity_check_level); } // TODO(yhchiang): add checks for other table factory types } else { diff --git a/options/options_parser.h b/options/options_parser.h index cae3dbba9..5545c0b0f 100644 --- a/options/options_parser.h +++ b/options/options_parser.h @@ -38,6 +38,11 @@ Status PersistRocksDBOptions(const DBOptions& db_opt, const std::vector& cf_opts, const std::string& file_name, Env* env); +extern bool AreEqualOptions( + const char* opt1, const char* opt2, const OptionTypeInfo& type_info, + const std::string& opt_name, + const std::unordered_map* opt_map); + class RocksDBOptionsParser { public: explicit RocksDBOptionsParser(); @@ -86,11 +91,6 @@ class RocksDBOptionsParser { const TableFactory* base_tf, const TableFactory* file_tf, OptionsSanityCheckLevel sanity_check_level = kSanityLevelExactMatch); - static Status VerifyBlockBasedTableFactory( - const BlockBasedTableFactory* base_tf, - const BlockBasedTableFactory* file_tf, - OptionsSanityCheckLevel sanity_check_level); - static Status ExtraParserCheck(const RocksDBOptionsParser& input_parser); protected: diff --git a/options/options_test.cc b/options/options_test.cc index d5eb42b09..fc4939beb 100644 --- a/options/options_test.cc +++ b/options/options_test.cc @@ -889,11 +889,11 @@ TEST_F(OptionsTest, ConvertOptionsTest) { 
ASSERT_EQ(converted_opt.max_open_files, leveldb_opt.max_open_files); ASSERT_EQ(converted_opt.compression, leveldb_opt.compression); - std::shared_ptr table_factory = - std::dynamic_pointer_cast( - converted_opt.table_factory); + std::shared_ptr tb_guard = converted_opt.table_factory; + BlockBasedTableFactory* table_factory = + dynamic_cast(converted_opt.table_factory.get()); - ASSERT_TRUE(table_factory.get() != nullptr); + ASSERT_TRUE(table_factory != nullptr); const BlockBasedTableOptions table_opt = table_factory->table_options(); @@ -1278,6 +1278,11 @@ TEST_F(OptionsParserTest, DumpAndParse) { Random rnd(302); test::RandomInitDBOptions(&base_db_opt, &rnd); base_db_opt.db_log_dir += "/#odd #but #could #happen #path #/\\\\#OMG"; + + BlockBasedTableOptions special_bbto; + special_bbto.cache_index_and_filter_blocks = true; + special_bbto.block_size = 999999; + for (int c = 0; c < num_cf; ++c) { ColumnFamilyOptions cf_opt; Random cf_rnd(0xFB + c); @@ -1287,6 +1292,8 @@ TEST_F(OptionsParserTest, DumpAndParse) { } if (c < 3) { cf_opt.table_factory.reset(test::RandomTableFactory(&rnd, c)); + } else if (c == 4) { + cf_opt.table_factory.reset(NewBlockBasedTableFactory(special_bbto)); } base_cf_opts.emplace_back(cf_opt); } @@ -1298,6 +1305,15 @@ TEST_F(OptionsParserTest, DumpAndParse) { RocksDBOptionsParser parser; ASSERT_OK(parser.Parse(kOptionsFileName, env_.get())); + // Make sure block-based table factory options was deserialized correctly + std::shared_ptr ttf = (*parser.cf_opts())[4].table_factory; + ASSERT_EQ(BlockBasedTableFactory::kName, std::string(ttf->Name())); + const BlockBasedTableOptions& parsed_bbto = + static_cast(ttf.get())->table_options(); + ASSERT_EQ(special_bbto.block_size, parsed_bbto.block_size); + ASSERT_EQ(special_bbto.cache_index_and_filter_blocks, + parsed_bbto.cache_index_and_filter_blocks); + ASSERT_OK(RocksDBOptionsParser::VerifyRocksDBOptionsFromFile( base_db_opt, cf_names, base_cf_opts, kOptionsFileName, env_.get())); diff --git 
a/port/port_posix.cc b/port/port_posix.cc index 59241daff..129933bb1 100644 --- a/port/port_posix.cc +++ b/port/port_posix.cc @@ -184,5 +184,24 @@ int GetMaxOpenFiles() { return -1; } +void *cacheline_aligned_alloc(size_t size) { +#if __GNUC__ < 5 && defined(__SANITIZE_ADDRESS__) + return malloc(size); +#elif defined(_ISOC11_SOURCE) + return aligned_alloc(CACHE_LINE_SIZE, size); +#elif ( _POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600 || defined(__APPLE__)) + void *m; + errno = posix_memalign(&m, CACHE_LINE_SIZE, size); + return errno ? NULL : m; +#else + return malloc(size); +#endif +} + +void cacheline_aligned_free(void *memblock) { + free(memblock); +} + + } // namespace port } // namespace rocksdb diff --git a/port/port_posix.h b/port/port_posix.h index 72beb0409..fe0d42644 100644 --- a/port/port_posix.h +++ b/port/port_posix.h @@ -193,6 +193,13 @@ extern void InitOnce(OnceType* once, void (*initializer)()); #endif #endif + +extern void *cacheline_aligned_alloc(size_t size); + +extern void cacheline_aligned_free(void *memblock); + +#define ALIGN_AS(n) alignas(n) + #define PREFETCH(addr, rw, locality) __builtin_prefetch(addr, rw, locality) extern void Crash(const std::string& srcfile, int srcline); diff --git a/port/win/env_win.cc b/port/win/env_win.cc index 1e7ea0cb8..462148893 100644 --- a/port/win/env_win.cc +++ b/port/win/env_win.cc @@ -829,7 +829,7 @@ WinEnvThreads::~WinEnvThreads() { void WinEnvThreads::Schedule(void(*function)(void*), void* arg, Env::Priority pri, void* tag, void(*unschedFunction)(void* arg)) { - assert(pri >= Env::Priority::LOW && pri <= Env::Priority::HIGH); + assert(pri >= Env::Priority::BOTTOM && pri <= Env::Priority::HIGH); thread_pools_[pri].Schedule(function, arg, tag, unschedFunction); } @@ -878,7 +878,7 @@ void WinEnvThreads::WaitForJoin() { } unsigned int WinEnvThreads::GetThreadPoolQueueLen(Env::Priority pri) const { - assert(pri >= Env::Priority::LOW && pri <= Env::Priority::HIGH); + assert(pri >= Env::Priority::BOTTOM && 
pri <= Env::Priority::HIGH); return thread_pools_[pri].GetQueueLen(); } @@ -894,17 +894,17 @@ void WinEnvThreads::SleepForMicroseconds(int micros) { } void WinEnvThreads::SetBackgroundThreads(int num, Env::Priority pri) { - assert(pri >= Env::Priority::LOW && pri <= Env::Priority::HIGH); + assert(pri >= Env::Priority::BOTTOM && pri <= Env::Priority::HIGH); thread_pools_[pri].SetBackgroundThreads(num); } int WinEnvThreads::GetBackgroundThreads(Env::Priority pri) { - assert(pri >= Env::Priority::LOW && pri <= Env::Priority::HIGH); + assert(pri >= Env::Priority::BOTTOM && pri <= Env::Priority::HIGH); return thread_pools_[pri].GetBackgroundThreads(); } void WinEnvThreads::IncBackgroundThreadsIfNeeded(int num, Env::Priority pri) { - assert(pri >= Env::Priority::LOW && pri <= Env::Priority::HIGH); + assert(pri >= Env::Priority::BOTTOM && pri <= Env::Priority::HIGH); thread_pools_[pri].IncBackgroundThreadsIfNeeded(num); } diff --git a/port/win/port_win.cc b/port/win/port_win.cc index e5d5a44d6..b3fccbd93 100644 --- a/port/win/port_win.cc +++ b/port/win/port_win.cc @@ -228,80 +228,3 @@ int GetMaxOpenFiles() { return -1; } } // namespace port } // namespace rocksdb - -#ifdef JEMALLOC - -#include "jemalloc/jemalloc.h" - -#ifndef JEMALLOC_NON_INIT - -namespace rocksdb { - -namespace port { - -__declspec(noinline) void WINAPI InitializeJemalloc() { - je_init(); - atexit(je_uninit); -} - -} // port -} // rocksdb - -extern "C" { - -#ifdef _WIN64 - -#pragma comment(linker, "/INCLUDE:p_rocksdb_init_jemalloc") - -typedef void(WINAPI* CRT_Startup_Routine)(void); - -// .CRT section is merged with .rdata on x64 so it must be constant data. 
-// must be of external linkage -// We put this into XCT since we want to run this earlier than C++ static -// constructors -// which are placed into XCU -#pragma const_seg(".CRT$XCT") -extern const CRT_Startup_Routine p_rocksdb_init_jemalloc; -const CRT_Startup_Routine p_rocksdb_init_jemalloc = - rocksdb::port::InitializeJemalloc; -#pragma const_seg() - -#else // _WIN64 - -// x86 untested - -#pragma comment(linker, "/INCLUDE:_p_rocksdb_init_jemalloc") - -#pragma section(".CRT$XCT", read) -JEMALLOC_SECTION(".CRT$XCT") JEMALLOC_ATTR(used) static const void( - WINAPI* p_rocksdb_init_jemalloc)(void) = rocksdb::port::InitializeJemalloc; - -#endif // _WIN64 - -} // extern "C" - -#endif // JEMALLOC_NON_INIT - -// Global operators to be replaced by a linker - -void* operator new(size_t size) { - void* p = je_malloc(size); - if (!p) { - throw std::bad_alloc(); - } - return p; -} - -void* operator new[](size_t size) { - void* p = je_malloc(size); - if (!p) { - throw std::bad_alloc(); - } - return p; -} - -void operator delete(void* p) { je_free(p); } - -void operator delete[](void* p) { je_free(p); } - -#endif // JEMALLOC diff --git a/port/win/port_win.h b/port/win/port_win.h index bbc5feec3..f3c866905 100644 --- a/port/win/port_win.h +++ b/port/win/port_win.h @@ -27,6 +27,7 @@ #include #include #include +#include #include @@ -239,6 +240,41 @@ extern void InitOnce(OnceType* once, void (*initializer)()); #define CACHE_LINE_SIZE 64U #endif +#ifdef ROCKSDB_JEMALLOC +#include "jemalloc/jemalloc.h" +// Separate inlines so they can be replaced if needed +inline void* jemalloc_aligned_alloc( size_t size, size_t alignment) { + return je_aligned_alloc(alignment, size); +} +inline void jemalloc_aligned_free(void* p) { + je_free(p); +} +#endif + +inline void *cacheline_aligned_alloc(size_t size) { +#ifdef ROCKSDB_JEMALLOC + return jemalloc_aligned_alloc(size, CACHE_LINE_SIZE); +#else + return _aligned_malloc(size, CACHE_LINE_SIZE); +#endif +} + +inline void cacheline_aligned_free(void 
*memblock) { +#ifdef ROCKSDB_JEMALLOC + jemalloc_aligned_free(memblock); +#else + _aligned_free(memblock); +#endif +} + +// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=52991 for MINGW32 +// could not be worked around with by -mno-ms-bitfields +#ifndef __MINGW32__ +#define ALIGN_AS(n) __declspec(align(n)) +#else +#define ALIGN_AS(n) +#endif + static inline void AsmVolatilePause() { #if defined(_M_IX86) || defined(_M_X64) YieldProcessor(); diff --git a/port/win/win_jemalloc.cc b/port/win/win_jemalloc.cc new file mode 100644 index 000000000..fc46e189c --- /dev/null +++ b/port/win/win_jemalloc.cc @@ -0,0 +1,47 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#ifndef ROCKSDB_JEMALLOC +# error This file can only be part of jemalloc aware build +#endif + +#include +#include "jemalloc/jemalloc.h" + +// Global operators to be replaced by a linker when this file is +// a part of the build + +void* operator new(size_t size) { + void* p = je_malloc(size); + if (!p) { + throw std::bad_alloc(); + } + return p; +} + +void* operator new[](size_t size) { + void* p = je_malloc(size); + if (!p) { + throw std::bad_alloc(); + } + return p; +} + +void operator delete(void* p) { + if (p) { + je_free(p); + } +} + +void operator delete[](void* p) { + if (p) { + je_free(p); + } +} + diff --git a/port/win/xpress_win.cc b/port/win/xpress_win.cc index e16ca9864..9ab23c534 100644 --- a/port/win/xpress_win.cc +++ b/port/win/xpress_win.cc @@ -17,10 +17,6 @@ #ifdef XPRESS -#ifdef JEMALLOC -#include -#endif - // Put this under ifdef so windows systems w/o this // can still build #include @@ -43,22 +39,6 @@ auto CloseDecompressorFun = [](void* h) { ::CloseDecompressor(reinterpret_cast(h)); } }; - - -#ifdef JEMALLOC -// Make sure compressors use our jemalloc if redirected -PVOID CompressorAlloc(PVOID, SIZE_T size) { - return je_malloc(size); -} - -VOID CompressorFree(PVOID, PVOID p) { - if (p != NULL) { - je_free(p); - } -} - -#endif - } bool Compress(const char* input, size_t length, std::string* output) { @@ -73,17 +53,6 @@ bool Compress(const char* input, size_t length, std::string* output) { COMPRESS_ALLOCATION_ROUTINES* allocRoutinesPtr = nullptr; -#ifdef JEMALLOC - COMPRESS_ALLOCATION_ROUTINES allocationRoutines; - - // Init. 
allocation routines - allocationRoutines.Allocate = CompressorAlloc; - allocationRoutines.Free = CompressorFree; - allocationRoutines.UserContext = NULL; - - allocRoutinesPtr = &allocationRoutines; -#endif - COMPRESSOR_HANDLE compressor = NULL; BOOL success = CreateCompressor( @@ -94,17 +63,17 @@ bool Compress(const char* input, size_t length, std::string* output) { if (!success) { #ifdef _DEBUG std::cerr << "XPRESS: Failed to create Compressor LastError: " << - GetLastError() << std::endl; + GetLastError() << std::endl; #endif return false; } std::unique_ptr - compressorGuard(compressor, CloseCompressorFun); + compressorGuard(compressor, CloseCompressorFun); SIZE_T compressedBufferSize = 0; - // Query compressed buffer size. + // Query compressed buffer size. success = ::Compress( compressor, // Compressor Handle const_cast(input), // Input buffer @@ -123,8 +92,8 @@ bool Compress(const char* input, size_t length, std::string* output) { "XPRESS: Failed to estimate compressed buffer size LastError " << lastError << std::endl; #endif - return false; - } + return false; + } } assert(compressedBufferSize > 0); @@ -146,7 +115,7 @@ bool Compress(const char* input, size_t length, std::string* output) { if (!success) { #ifdef _DEBUG std::cerr << "XPRESS: Failed to compress LastError " << - GetLastError() << std::endl; + GetLastError() << std::endl; #endif return false; } @@ -169,16 +138,6 @@ char* Decompress(const char* input_data, size_t input_length, COMPRESS_ALLOCATION_ROUTINES* allocRoutinesPtr = nullptr; -#ifdef JEMALLOC - COMPRESS_ALLOCATION_ROUTINES allocationRoutines; - - // Init. 
allocation routines - allocationRoutines.Allocate = CompressorAlloc; - allocationRoutines.Free = CompressorFree; - allocationRoutines.UserContext = NULL; - allocRoutinesPtr = &allocationRoutines; -#endif - DECOMPRESSOR_HANDLE decompressor = NULL; BOOL success = CreateDecompressor( @@ -190,7 +149,7 @@ char* Decompress(const char* input_data, size_t input_length, if (!success) { #ifdef _DEBUG std::cerr << "XPRESS: Failed to create Decompressor LastError " - << GetLastError() << std::endl; + << GetLastError() << std::endl; #endif return nullptr; } @@ -215,8 +174,8 @@ char* Decompress(const char* input_data, size_t input_length, if (lastError != ERROR_INSUFFICIENT_BUFFER) { #ifdef _DEBUG std::cerr - << "XPRESS: Failed to estimate decompressed buffer size LastError " - << lastError << std::endl; + << "XPRESS: Failed to estimate decompressed buffer size LastError " + << lastError << std::endl; #endif return nullptr; } diff --git a/src.mk b/src.mk index cb3383ff0..5bd5236fa 100644 --- a/src.mk +++ b/src.mk @@ -154,12 +154,14 @@ LIB_SOURCES = \ utilities/backupable/backupable_db.cc \ utilities/blob_db/blob_db.cc \ utilities/blob_db/blob_db_impl.cc \ - utilities/blob_db/blob_db_options_impl.cc \ utilities/blob_db/blob_file.cc \ utilities/blob_db/blob_log_reader.cc \ utilities/blob_db/blob_log_writer.cc \ utilities/blob_db/blob_log_format.cc \ utilities/blob_db/ttl_extractor.cc \ + utilities/cassandra/cassandra_compaction_filter.cc \ + utilities/cassandra/format.cc \ + utilities/cassandra/merge_operator.cc \ utilities/checkpoint/checkpoint_impl.cc \ utilities/compaction_filters/remove_emptyvalue_compactionfilter.cc \ utilities/convenience/info_log_finder.cc \ @@ -174,8 +176,6 @@ LIB_SOURCES = \ utilities/leveldb_options/leveldb_options.cc \ utilities/lua/rocks_lua_compaction_filter.cc \ utilities/memory/memory_util.cc \ - utilities/merge_operators/cassandra/format.cc \ - utilities/merge_operators/cassandra/merge_operator.cc \ utilities/merge_operators/max.cc \ 
utilities/merge_operators/put.cc \ utilities/merge_operators/string_append/stringappend.cc \ @@ -193,13 +193,14 @@ LIB_SOURCES = \ utilities/spatialdb/spatial_db.cc \ utilities/table_properties_collectors/compact_on_deletion_collector.cc \ utilities/transactions/optimistic_transaction_db_impl.cc \ - utilities/transactions/optimistic_transaction_impl.cc \ + utilities/transactions/optimistic_transaction.cc \ utilities/transactions/transaction_base.cc \ - utilities/transactions/transaction_db_impl.cc \ + utilities/transactions/pessimistic_transaction_db.cc \ utilities/transactions/transaction_db_mutex_impl.cc \ - utilities/transactions/transaction_impl.cc \ + utilities/transactions/pessimistic_transaction.cc \ utilities/transactions/transaction_lock_mgr.cc \ utilities/transactions/transaction_util.cc \ + utilities/transactions/write_prepared_txn.cc \ utilities/ttl/db_ttl_impl.cc \ utilities/write_batch_with_index/write_batch_with_index.cc \ utilities/write_batch_with_index/write_batch_with_index_internal.cc \ @@ -226,7 +227,7 @@ TEST_LIB_SOURCES = \ util/testharness.cc \ util/testutil.cc \ db/db_test_util.cc \ - utilities/merge_operators/cassandra/test_utils.cc \ + utilities/cassandra/test_utils.cc \ MAIN_SOURCES = \ cache/cache_bench.cc \ @@ -300,6 +301,7 @@ MAIN_SOURCES = \ options/options_test.cc \ table/block_based_filter_block_test.cc \ table/block_test.cc \ + table/cleanable_test.cc \ table/cuckoo_table_builder_test.cc \ table/cuckoo_table_reader_test.cc \ table/full_filter_block_test.cc \ @@ -330,6 +332,10 @@ MAIN_SOURCES = \ util/thread_local_test.cc \ utilities/backupable/backupable_db_test.cc \ utilities/blob_db/blob_db_test.cc \ + utilities/cassandra/cassandra_format_test.cc \ + utilities/cassandra/cassandra_functional_test.cc \ + utilities/cassandra/cassandra_row_merge_test.cc \ + utilities/cassandra/cassandra_serialize_test.cc \ utilities/checkpoint/checkpoint_test.cc \ utilities/column_aware_encoding_exp.cc \ utilities/column_aware_encoding_test.cc \ @@ 
-340,10 +346,6 @@ MAIN_SOURCES = \ utilities/lua/rocks_lua_test.cc \ utilities/memory/memory_test.cc \ utilities/merge_operators/string_append/stringappend_test.cc \ - utilities/merge_operators/cassandra/cassandra_merge_test.cc \ - utilities/merge_operators/cassandra/cassandra_format_test.cc \ - utilities/merge_operators/cassandra/cassandra_row_merge_test.cc \ - utilities/merge_operators/cassandra/cassandra_serialize_test.cc \ utilities/object_registry_test.cc \ utilities/option_change_migration/option_change_migration_test.cc \ utilities/options/options_util_test.cc \ @@ -380,6 +382,7 @@ JNI_NATIVE_SOURCES = \ java/rocksjni/options.cc \ java/rocksjni/ratelimiterjni.cc \ java/rocksjni/remove_emptyvalue_compactionfilterjni.cc \ + java/rocksjni/cassandra_compactionfilterjni.cc \ java/rocksjni/restorejni.cc \ java/rocksjni/rocksjni.cc \ java/rocksjni/rocksdb_exception_test.cc \ diff --git a/table/adaptive_table_factory.cc b/table/adaptive_table_factory.cc index f83905dff..47069f866 100644 --- a/table/adaptive_table_factory.cc +++ b/table/adaptive_table_factory.cc @@ -46,7 +46,8 @@ Status AdaptiveTableFactory::NewTableReader( unique_ptr* table, bool prefetch_index_and_filter_in_cache) const { Footer footer; - auto s = ReadFooterFromFile(file.get(), file_size, &footer); + auto s = ReadFooterFromFile(file.get(), nullptr /* prefetch_buffer */, + file_size, &footer); if (!s.ok()) { return s; } diff --git a/table/block.h b/table/block.h index 044e07662..59dc16743 100644 --- a/table/block.h +++ b/table/block.h @@ -67,8 +67,7 @@ class BlockReadAmpBitmap { size_t bitmap_size = (num_bits_needed - 1) / kBitsPerEntry + 1; // Create bitmap and set all the bits to 0 - bitmap_ = new std::atomic[bitmap_size]; - memset(bitmap_, 0, bitmap_size * kBytesPersEntry); + bitmap_ = new std::atomic[bitmap_size](); RecordTick(GetStatistics(), READ_AMP_TOTAL_READ_BYTES, block_size); } diff --git a/table/block_based_table_builder.cc b/table/block_based_table_builder.cc index e87def73e..e82f91aec 
100644 --- a/table/block_based_table_builder.cc +++ b/table/block_based_table_builder.cc @@ -10,7 +10,6 @@ #include "table/block_based_table_builder.h" #include -#include #include #include @@ -276,6 +275,7 @@ struct BlockBasedTableBuilder::Rep { uint32_t column_family_id; const std::string& column_family_name; uint64_t creation_time = 0; + uint64_t oldest_key_time = 0; std::vector> table_properties_collectors; @@ -288,7 +288,8 @@ struct BlockBasedTableBuilder::Rep { const CompressionType _compression_type, const CompressionOptions& _compression_opts, const std::string* _compression_dict, const bool skip_filters, - const std::string& _column_family_name, const uint64_t _creation_time) + const std::string& _column_family_name, const uint64_t _creation_time, + const uint64_t _oldest_key_time) : ioptions(_ioptions), table_options(table_opt), internal_comparator(icomparator), @@ -305,7 +306,8 @@ struct BlockBasedTableBuilder::Rep { table_options, data_block)), column_family_id(_column_family_id), column_family_name(_column_family_name), - creation_time(_creation_time) { + creation_time(_creation_time), + oldest_key_time(_oldest_key_time) { if (table_options.index_type == BlockBasedTableOptions::kTwoLevelIndexSearch) { p_index_builder_ = PartitionedIndexBuilder::CreateIndexBuilder( @@ -344,7 +346,8 @@ BlockBasedTableBuilder::BlockBasedTableBuilder( const CompressionType compression_type, const CompressionOptions& compression_opts, const std::string* compression_dict, const bool skip_filters, - const std::string& column_family_name, const uint64_t creation_time) { + const std::string& column_family_name, const uint64_t creation_time, + const uint64_t oldest_key_time) { BlockBasedTableOptions sanitized_table_options(table_options); if (sanitized_table_options.format_version == 0 && sanitized_table_options.checksum != kCRC32c) { @@ -357,10 +360,11 @@ BlockBasedTableBuilder::BlockBasedTableBuilder( sanitized_table_options.format_version = 1; } - rep_ = new Rep(ioptions, 
sanitized_table_options, internal_comparator, - int_tbl_prop_collector_factories, column_family_id, file, - compression_type, compression_opts, compression_dict, - skip_filters, column_family_name, creation_time); + rep_ = + new Rep(ioptions, sanitized_table_options, internal_comparator, + int_tbl_prop_collector_factories, column_family_id, file, + compression_type, compression_opts, compression_dict, + skip_filters, column_family_name, creation_time, oldest_key_time); if (rep_->filter_builder != nullptr) { rep_->filter_builder->StartBlock(0); @@ -551,9 +555,8 @@ void BlockBasedTableBuilder::WriteRawBlock(const Slice& block_contents, char* trailer_without_type = trailer + 1; switch (r->table_options.checksum) { case kNoChecksum: - // we don't support no checksum yet - assert(false); - // intentional fallthrough + EncodeFixed32(trailer_without_type, 0); + break; case kCRC32c: { auto crc = crc32c::Value(block_contents.data(), block_contents.size()); crc = crc32c::Extend(crc, trailer, 1); // Extend to cover block type @@ -739,6 +742,7 @@ Status BlockBasedTableBuilder::Finish() { r->p_index_builder_->EstimateTopLevelIndexSize(r->offset); } r->props.creation_time = r->creation_time; + r->props.oldest_key_time = r->oldest_key_time; // Add basic properties property_block_builder.AddTableProperty(r->props); diff --git a/table/block_based_table_builder.h b/table/block_based_table_builder.h index 2e8606271..36dfce1f0 100644 --- a/table/block_based_table_builder.h +++ b/table/block_based_table_builder.h @@ -47,7 +47,8 @@ class BlockBasedTableBuilder : public TableBuilder { const CompressionType compression_type, const CompressionOptions& compression_opts, const std::string* compression_dict, const bool skip_filters, - const std::string& column_family_name, const uint64_t creation_time = 0); + const std::string& column_family_name, const uint64_t creation_time = 0, + const uint64_t oldest_key_time = 0); // REQUIRES: Either Finish() or Abandon() has been called. 
~BlockBasedTableBuilder(); diff --git a/table/block_based_table_factory.cc b/table/block_based_table_factory.cc index 4705046bf..0c6bbbcb6 100644 --- a/table/block_based_table_factory.cc +++ b/table/block_based_table_factory.cc @@ -13,12 +13,15 @@ #include #include +#include "options/options_helper.h" #include "port/port.h" -#include "rocksdb/flush_block_policy.h" #include "rocksdb/cache.h" +#include "rocksdb/convenience.h" +#include "rocksdb/flush_block_policy.h" #include "table/block_based_table_builder.h" #include "table/block_based_table_reader.h" #include "table/format.h" +#include "util/string_util.h" namespace rocksdb { @@ -76,7 +79,8 @@ TableBuilder* BlockBasedTableFactory::NewTableBuilder( table_builder_options.compression_dict, table_builder_options.skip_filters, table_builder_options.column_family_name, - table_builder_options.creation_time); + table_builder_options.creation_time, + table_builder_options.oldest_key_time); return table_builder; } @@ -201,15 +205,203 @@ std::string BlockBasedTableFactory::GetPrintableTableOptions() const { return ret; } +#ifndef ROCKSDB_LITE +namespace { +bool SerializeSingleBlockBasedTableOption( + std::string* opt_string, const BlockBasedTableOptions& bbt_options, + const std::string& name, const std::string& delimiter) { + auto iter = block_based_table_type_info.find(name); + if (iter == block_based_table_type_info.end()) { + return false; + } + auto& opt_info = iter->second; + const char* opt_address = + reinterpret_cast(&bbt_options) + opt_info.offset; + std::string value; + bool result = SerializeSingleOptionHelper(opt_address, opt_info.type, &value); + if (result) { + *opt_string = name + "=" + value + delimiter; + } + return result; +} +} // namespace + +Status BlockBasedTableFactory::GetOptionString( + std::string* opt_string, const std::string& delimiter) const { + assert(opt_string); + opt_string->clear(); + for (auto iter = block_based_table_type_info.begin(); + iter != block_based_table_type_info.end(); 
++iter) { + if (iter->second.verification == OptionVerificationType::kDeprecated) { + // If the option is no longer used in rocksdb and marked as deprecated, + // we skip it in the serialization. + continue; + } + std::string single_output; + bool result = SerializeSingleBlockBasedTableOption( + &single_output, table_options_, iter->first, delimiter); + assert(result); + if (result) { + opt_string->append(single_output); + } + } + return Status::OK(); +} +#else +Status BlockBasedTableFactory::GetOptionString( + std::string* opt_string, const std::string& delimiter) const { + return Status::OK(); +} +#endif // !ROCKSDB_LITE + const BlockBasedTableOptions& BlockBasedTableFactory::table_options() const { return table_options_; } +#ifndef ROCKSDB_LITE +namespace { +std::string ParseBlockBasedTableOption(const std::string& name, + const std::string& org_value, + BlockBasedTableOptions* new_options, + bool input_strings_escaped = false, + bool ignore_unknown_options = false) { + const std::string& value = + input_strings_escaped ? UnescapeOptionString(org_value) : org_value; + if (!input_strings_escaped) { + // if the input string is not escaped, it means this function is + // invoked from SetOptions, which takes the old format. 
+ if (name == "block_cache") { + new_options->block_cache = NewLRUCache(ParseSizeT(value)); + return ""; + } else if (name == "block_cache_compressed") { + new_options->block_cache_compressed = NewLRUCache(ParseSizeT(value)); + return ""; + } else if (name == "filter_policy") { + // Expect the following format + // bloomfilter:int:bool + const std::string kName = "bloomfilter:"; + if (value.compare(0, kName.size(), kName) != 0) { + return "Invalid filter policy name"; + } + size_t pos = value.find(':', kName.size()); + if (pos == std::string::npos) { + return "Invalid filter policy config, missing bits_per_key"; + } + int bits_per_key = + ParseInt(trim(value.substr(kName.size(), pos - kName.size()))); + bool use_block_based_builder = + ParseBoolean("use_block_based_builder", trim(value.substr(pos + 1))); + new_options->filter_policy.reset( + NewBloomFilterPolicy(bits_per_key, use_block_based_builder)); + return ""; + } + } + const auto iter = block_based_table_type_info.find(name); + if (iter == block_based_table_type_info.end()) { + if (ignore_unknown_options) { + return ""; + } else { + return "Unrecognized option"; + } + } + const auto& opt_info = iter->second; + if (opt_info.verification != OptionVerificationType::kDeprecated && + !ParseOptionHelper(reinterpret_cast(new_options) + opt_info.offset, + opt_info.type, value)) { + return "Invalid value"; + } + return ""; +} +} // namespace + +Status GetBlockBasedTableOptionsFromString( + const BlockBasedTableOptions& table_options, const std::string& opts_str, + BlockBasedTableOptions* new_table_options) { + std::unordered_map opts_map; + Status s = StringToMap(opts_str, &opts_map); + if (!s.ok()) { + return s; + } + + return GetBlockBasedTableOptionsFromMap(table_options, opts_map, + new_table_options); +} + +Status GetBlockBasedTableOptionsFromMap( + const BlockBasedTableOptions& table_options, + const std::unordered_map& opts_map, + BlockBasedTableOptions* new_table_options, bool input_strings_escaped, + bool 
ignore_unknown_options) { + assert(new_table_options); + *new_table_options = table_options; + for (const auto& o : opts_map) { + auto error_message = ParseBlockBasedTableOption( + o.first, o.second, new_table_options, input_strings_escaped, + ignore_unknown_options); + if (error_message != "") { + const auto iter = block_based_table_type_info.find(o.first); + if (iter == block_based_table_type_info.end() || + !input_strings_escaped || // !input_strings_escaped indicates + // the old API, where everything is + // parsable. + (iter->second.verification != OptionVerificationType::kByName && + iter->second.verification != + OptionVerificationType::kByNameAllowNull && + iter->second.verification != OptionVerificationType::kDeprecated)) { + // Restore "new_options" to the default "base_options". + *new_table_options = table_options; + return Status::InvalidArgument("Can't parse BlockBasedTableOptions:", + o.first + " " + error_message); + } + } + } + return Status::OK(); +} + +Status VerifyBlockBasedTableFactory( + const BlockBasedTableFactory* base_tf, + const BlockBasedTableFactory* file_tf, + OptionsSanityCheckLevel sanity_check_level) { + if ((base_tf != nullptr) != (file_tf != nullptr) && + sanity_check_level > kSanityLevelNone) { + return Status::Corruption( + "[RocksDBOptionsParser]: Inconsistent TableFactory class type"); + } + if (base_tf == nullptr) { + return Status::OK(); + } + assert(file_tf != nullptr); + + const auto& base_opt = base_tf->table_options(); + const auto& file_opt = file_tf->table_options(); + + for (auto& pair : block_based_table_type_info) { + if (pair.second.verification == OptionVerificationType::kDeprecated) { + // We skip checking deprecated variables as they might + // contain random values since they might not be initialized + continue; + } + if (BBTOptionSanityCheckLevel(pair.first) <= sanity_check_level) { + if (!AreEqualOptions(reinterpret_cast(&base_opt), + reinterpret_cast(&file_opt), + pair.second, pair.first, nullptr)) { + 
return Status::Corruption( + "[RocksDBOptionsParser]: " + "failed the verification on BlockBasedTableOptions::", + pair.first); + } + } + } + return Status::OK(); +} +#endif // !ROCKSDB_LITE + TableFactory* NewBlockBasedTableFactory( const BlockBasedTableOptions& _table_options) { return new BlockBasedTableFactory(_table_options); } +const std::string BlockBasedTableFactory::kName = "BlockBasedTable"; const std::string BlockBasedTablePropertyNames::kIndexType = "rocksdb.block.based.table.index.type"; const std::string BlockBasedTablePropertyNames::kWholeKeyFiltering = diff --git a/table/block_based_table_factory.h b/table/block_based_table_factory.h index bdff00d1e..39e3eac0b 100644 --- a/table/block_based_table_factory.h +++ b/table/block_based_table_factory.h @@ -13,9 +13,11 @@ #include #include +#include "db/dbformat.h" +#include "options/options_helper.h" +#include "options/options_parser.h" #include "rocksdb/flush_block_policy.h" #include "rocksdb/table.h" -#include "db/dbformat.h" namespace rocksdb { @@ -31,7 +33,7 @@ class BlockBasedTableFactory : public TableFactory { ~BlockBasedTableFactory() {} - const char* Name() const override { return "BlockBasedTable"; } + const char* Name() const override { return kName.c_str(); } Status NewTableReader( const TableReaderOptions& table_reader_options, @@ -49,12 +51,17 @@ class BlockBasedTableFactory : public TableFactory { std::string GetPrintableTableOptions() const override; + Status GetOptionString(std::string* opt_string, + const std::string& delimiter) const override; + const BlockBasedTableOptions& table_options() const; void* GetOptions() override { return &table_options_; } bool IsDeleteRangeSupported() const override { return true; } + static const std::string kName; + private: BlockBasedTableOptions table_options_; }; @@ -64,4 +71,87 @@ extern const std::string kHashIndexPrefixesMetadataBlock; extern const std::string kPropTrue; extern const std::string kPropFalse; +#ifndef ROCKSDB_LITE +extern Status 
VerifyBlockBasedTableFactory( + const BlockBasedTableFactory* base_tf, + const BlockBasedTableFactory* file_tf, + OptionsSanityCheckLevel sanity_check_level); + +static std::unordered_map + block_based_table_type_info = { + /* currently not supported + std::shared_ptr block_cache = nullptr; + std::shared_ptr block_cache_compressed = nullptr; + */ + {"flush_block_policy_factory", + {offsetof(struct BlockBasedTableOptions, flush_block_policy_factory), + OptionType::kFlushBlockPolicyFactory, OptionVerificationType::kByName, + false, 0}}, + {"cache_index_and_filter_blocks", + {offsetof(struct BlockBasedTableOptions, + cache_index_and_filter_blocks), + OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, + {"cache_index_and_filter_blocks_with_high_priority", + {offsetof(struct BlockBasedTableOptions, + cache_index_and_filter_blocks_with_high_priority), + OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, + {"pin_l0_filter_and_index_blocks_in_cache", + {offsetof(struct BlockBasedTableOptions, + pin_l0_filter_and_index_blocks_in_cache), + OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, + {"index_type", + {offsetof(struct BlockBasedTableOptions, index_type), + OptionType::kBlockBasedTableIndexType, + OptionVerificationType::kNormal, false, 0}}, + {"hash_index_allow_collision", + {offsetof(struct BlockBasedTableOptions, hash_index_allow_collision), + OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, + {"checksum", + {offsetof(struct BlockBasedTableOptions, checksum), + OptionType::kChecksumType, OptionVerificationType::kNormal, false, + 0}}, + {"no_block_cache", + {offsetof(struct BlockBasedTableOptions, no_block_cache), + OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, + {"block_size", + {offsetof(struct BlockBasedTableOptions, block_size), + OptionType::kSizeT, OptionVerificationType::kNormal, false, 0}}, + {"block_size_deviation", + {offsetof(struct BlockBasedTableOptions, 
block_size_deviation), + OptionType::kInt, OptionVerificationType::kNormal, false, 0}}, + {"block_restart_interval", + {offsetof(struct BlockBasedTableOptions, block_restart_interval), + OptionType::kInt, OptionVerificationType::kNormal, false, 0}}, + {"index_block_restart_interval", + {offsetof(struct BlockBasedTableOptions, index_block_restart_interval), + OptionType::kInt, OptionVerificationType::kNormal, false, 0}}, + {"index_per_partition", + {0, OptionType::kUInt64T, OptionVerificationType::kDeprecated, false, + 0}}, + {"metadata_block_size", + {offsetof(struct BlockBasedTableOptions, metadata_block_size), + OptionType::kUInt64T, OptionVerificationType::kNormal, false, 0}}, + {"partition_filters", + {offsetof(struct BlockBasedTableOptions, partition_filters), + OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, + {"filter_policy", + {offsetof(struct BlockBasedTableOptions, filter_policy), + OptionType::kFilterPolicy, OptionVerificationType::kByName, false, + 0}}, + {"whole_key_filtering", + {offsetof(struct BlockBasedTableOptions, whole_key_filtering), + OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, + {"skip_table_builder_flush", + {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, false, + 0}}, + {"format_version", + {offsetof(struct BlockBasedTableOptions, format_version), + OptionType::kUInt32T, OptionVerificationType::kNormal, false, 0}}, + {"verify_compression", + {offsetof(struct BlockBasedTableOptions, verify_compression), + OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, + {"read_amp_bytes_per_bit", + {offsetof(struct BlockBasedTableOptions, read_amp_bytes_per_bit), + OptionType::kSizeT, OptionVerificationType::kNormal, false, 0}}}; +#endif // !ROCKSDB_LITE } // namespace rocksdb diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc index 123e1814a..d8c6d807c 100644 --- a/table/block_based_table_reader.cc +++ b/table/block_based_table_reader.cc @@ -70,17 
+70,17 @@ namespace { // On success fill *result and return OK - caller owns *result // @param compression_dict Data for presetting the compression library's // dictionary. -Status ReadBlockFromFile(RandomAccessFileReader* file, const Footer& footer, - const ReadOptions& options, const BlockHandle& handle, - std::unique_ptr* result, - const ImmutableCFOptions& ioptions, bool do_uncompress, - const Slice& compression_dict, - const PersistentCacheOptions& cache_options, - SequenceNumber global_seqno, - size_t read_amp_bytes_per_bit) { +Status ReadBlockFromFile( + RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer, + const Footer& footer, const ReadOptions& options, const BlockHandle& handle, + std::unique_ptr* result, const ImmutableCFOptions& ioptions, + bool do_uncompress, const Slice& compression_dict, + const PersistentCacheOptions& cache_options, SequenceNumber global_seqno, + size_t read_amp_bytes_per_bit) { BlockContents contents; - Status s = ReadBlockContents(file, footer, options, handle, &contents, ioptions, - do_uncompress, compression_dict, cache_options); + Status s = ReadBlockContents(file, prefetch_buffer, footer, options, handle, + &contents, ioptions, do_uncompress, + compression_dict, cache_options); if (s.ok()) { result->reset(new Block(std::move(contents), global_seqno, read_amp_bytes_per_bit, ioptions.statistics)); @@ -157,6 +157,7 @@ class PartitionIndexReader : public IndexReader, public Cleanable { // On success, index_reader will be populated; otherwise it will remain // unmodified. 
static Status Create(BlockBasedTable* table, RandomAccessFileReader* file, + FilePrefetchBuffer* prefetch_buffer, const Footer& footer, const BlockHandle& index_handle, const ImmutableCFOptions& ioptions, const InternalKeyComparator* icomparator, @@ -165,8 +166,9 @@ class PartitionIndexReader : public IndexReader, public Cleanable { const int level) { std::unique_ptr index_block; auto s = ReadBlockFromFile( - file, footer, ReadOptions(), index_handle, &index_block, ioptions, - true /* decompress */, Slice() /*compression dict*/, cache_options, + file, prefetch_buffer, footer, ReadOptions(), index_handle, + &index_block, ioptions, true /* decompress */, + Slice() /*compression dict*/, cache_options, kDisableGlobalSequenceNumber, 0 /* read_amp_bytes_per_bit */); if (s.ok()) { @@ -184,23 +186,88 @@ class PartitionIndexReader : public IndexReader, public Cleanable { // Filters are already checked before seeking the index const bool skip_filters = true; const bool is_index = true; - Cleanable* block_cache_cleaner = nullptr; - const bool pin_cached_indexes = - level_ == 0 && - table_->rep_->table_options.pin_l0_filter_and_index_blocks_in_cache; - if (pin_cached_indexes) { - // Keep partition indexes into the cache as long as the partition index - // reader object is alive - block_cache_cleaner = this; - } return NewTwoLevelIterator( new BlockBasedTable::BlockEntryIteratorState( table_, ReadOptions(), icomparator_, skip_filters, is_index, - block_cache_cleaner), + partition_map_.size() ? &partition_map_ : nullptr), index_block_->NewIterator(icomparator_, nullptr, true)); // TODO(myabandeh): Update TwoLevelIterator to be able to make use of - // on-stack - // BlockIter while the state is on heap + // on-stack BlockIter while the state is on heap. Currentlly it assumes + // the first level iter is always on heap and will attempt to delete it + // in its destructor. 
+ } + + virtual void CacheDependencies(bool pin) override { + // Before read partitions, prefetch them to avoid lots of IOs + auto rep = table_->rep_; + BlockIter biter; + BlockHandle handle; + index_block_->NewIterator(icomparator_, &biter, true); + // Index partitions are assumed to be consecuitive. Prefetch them all. + // Read the first block offset + biter.SeekToFirst(); + Slice input = biter.value(); + Status s = handle.DecodeFrom(&input); + assert(s.ok()); + if (!s.ok()) { + ROCKS_LOG_WARN(rep->ioptions.info_log, + "Could not read first index partition"); + return; + } + uint64_t prefetch_off = handle.offset(); + + // Read the last block's offset + biter.SeekToLast(); + input = biter.value(); + s = handle.DecodeFrom(&input); + assert(s.ok()); + if (!s.ok()) { + ROCKS_LOG_WARN(rep->ioptions.info_log, + "Could not read last index partition"); + return; + } + uint64_t last_off = handle.offset() + handle.size() + kBlockTrailerSize; + uint64_t prefetch_len = last_off - prefetch_off; + std::unique_ptr prefetch_buffer; + auto& file = table_->rep_->file; + prefetch_buffer.reset(new FilePrefetchBuffer()); + s = prefetch_buffer->Prefetch(file.get(), prefetch_off, prefetch_len); + + // After prefetch, read the partitions one by one + biter.SeekToFirst(); + auto ro = ReadOptions(); + Cache* block_cache = rep->table_options.block_cache.get(); + for (; biter.Valid(); biter.Next()) { + input = biter.value(); + s = handle.DecodeFrom(&input); + assert(s.ok()); + if (!s.ok()) { + ROCKS_LOG_WARN(rep->ioptions.info_log, + "Could not read index partition"); + continue; + } + + BlockBasedTable::CachableEntry block; + Slice compression_dict; + if (rep->compression_dict_block) { + compression_dict = rep->compression_dict_block->data; + } + const bool is_index = true; + s = table_->MaybeLoadDataBlockToCache(prefetch_buffer.get(), rep, ro, + handle, compression_dict, &block, + is_index); + + assert(s.ok() || block.value == nullptr); + if (s.ok() && block.value != nullptr) { + 
assert(block.cache_handle != nullptr); + if (pin) { + partition_map_[handle.offset()] = block; + RegisterCleanup(&ReleaseCachedEntry, block_cache, block.cache_handle); + } else { + block_cache->Release(block.cache_handle); + } + } + } } virtual size_t size() const override { return index_block_->size(); } @@ -220,13 +287,13 @@ class PartitionIndexReader : public IndexReader, public Cleanable { const int level) : IndexReader(icomparator, stats), table_(table), - index_block_(std::move(index_block)), - level_(level) { + index_block_(std::move(index_block)) { assert(index_block_ != nullptr); } BlockBasedTable* table_; std::unique_ptr index_block_; - int level_; + std::unordered_map> + partition_map_; }; // Index that allows binary search lookup for the first key of each block. @@ -238,16 +305,18 @@ class BinarySearchIndexReader : public IndexReader { // `BinarySearchIndexReader`. // On success, index_reader will be populated; otherwise it will remain // unmodified. - static Status Create(RandomAccessFileReader* file, const Footer& footer, - const BlockHandle& index_handle, + static Status Create(RandomAccessFileReader* file, + FilePrefetchBuffer* prefetch_buffer, + const Footer& footer, const BlockHandle& index_handle, const ImmutableCFOptions& ioptions, const InternalKeyComparator* icomparator, IndexReader** index_reader, const PersistentCacheOptions& cache_options) { std::unique_ptr index_block; auto s = ReadBlockFromFile( - file, footer, ReadOptions(), index_handle, &index_block, ioptions, - true /* decompress */, Slice() /*compression dict*/, cache_options, + file, prefetch_buffer, footer, ReadOptions(), index_handle, + &index_block, ioptions, true /* decompress */, + Slice() /*compression dict*/, cache_options, kDisableGlobalSequenceNumber, 0 /* read_amp_bytes_per_bit */); if (s.ok()) { @@ -289,6 +358,7 @@ class HashIndexReader : public IndexReader { public: static Status Create(const SliceTransform* hash_key_extractor, const Footer& footer, 
RandomAccessFileReader* file, + FilePrefetchBuffer* prefetch_buffer, const ImmutableCFOptions& ioptions, const InternalKeyComparator* icomparator, const BlockHandle& index_handle, @@ -298,8 +368,9 @@ class HashIndexReader : public IndexReader { const PersistentCacheOptions& cache_options) { std::unique_ptr index_block; auto s = ReadBlockFromFile( - file, footer, ReadOptions(), index_handle, &index_block, ioptions, - true /* decompress */, Slice() /*compression dict*/, cache_options, + file, prefetch_buffer, footer, ReadOptions(), index_handle, + &index_block, ioptions, true /* decompress */, + Slice() /*compression dict*/, cache_options, kDisableGlobalSequenceNumber, 0 /* read_amp_bytes_per_bit */); if (!s.ok()) { @@ -335,15 +406,17 @@ class HashIndexReader : public IndexReader { // Read contents for the blocks BlockContents prefixes_contents; - s = ReadBlockContents(file, footer, ReadOptions(), prefixes_handle, - &prefixes_contents, ioptions, true /* decompress */, - Slice() /*compression dict*/, cache_options); + s = ReadBlockContents(file, prefetch_buffer, footer, ReadOptions(), + prefixes_handle, &prefixes_contents, ioptions, + true /* decompress */, Slice() /*compression dict*/, + cache_options); if (!s.ok()) { return s; } BlockContents prefixes_meta_contents; - s = ReadBlockContents(file, footer, ReadOptions(), prefixes_meta_handle, - &prefixes_meta_contents, ioptions, true /* decompress */, + s = ReadBlockContents(file, prefetch_buffer, footer, ReadOptions(), + prefixes_meta_handle, &prefixes_meta_contents, + ioptions, true /* decompress */, Slice() /*compression dict*/, cache_options); if (!s.ok()) { // TODO: log error @@ -535,12 +608,29 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, Footer footer; + std::unique_ptr prefetch_buffer; + // Before read footer, readahead backwards to prefetch data - Status s = - file->Prefetch((file_size < 512 * 1024 ? 
0 : file_size - 512 * 1024), - 512 * 1024 /* 512 KB prefetching */); - s = ReadFooterFromFile(file.get(), file_size, &footer, - kBlockBasedTableMagicNumber); + const size_t kTailPrefetchSize = 512 * 1024; + size_t prefetch_off; + size_t prefetch_len; + if (file_size < kTailPrefetchSize) { + prefetch_off = 0; + prefetch_len = file_size; + } else { + prefetch_off = file_size - kTailPrefetchSize; + prefetch_len = kTailPrefetchSize; + } + Status s; + // TODO should not have this special logic in the future. + if (!file->use_direct_io()) { + s = file->Prefetch(prefetch_off, prefetch_len); + } else { + prefetch_buffer.reset(new FilePrefetchBuffer()); + s = prefetch_buffer->Prefetch(file.get(), prefetch_off, prefetch_len); + } + s = ReadFooterFromFile(file.get(), prefetch_buffer.get(), file_size, &footer, + kBlockBasedTableMagicNumber); if (!s.ok()) { return s; } @@ -577,7 +667,7 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, // Read meta index std::unique_ptr meta; std::unique_ptr meta_iter; - s = ReadMetaBlock(rep, &meta, &meta_iter); + s = ReadMetaBlock(rep, prefetch_buffer.get(), &meta, &meta_iter); if (!s.ok()) { return s; } @@ -623,8 +713,9 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, s = meta_iter->status(); TableProperties* table_properties = nullptr; if (s.ok()) { - s = ReadProperties(meta_iter->value(), rep->file.get(), rep->footer, - rep->ioptions, &table_properties); + s = ReadProperties(meta_iter->value(), rep->file.get(), + prefetch_buffer.get(), rep->footer, rep->ioptions, + &table_properties); } if (!s.ok()) { @@ -655,9 +746,9 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, // TODO(andrewkr): ReadMetaBlock repeats SeekToCompressionDictBlock(). 
// maybe decode a handle from meta_iter // and do ReadBlockContents(handle) instead - s = rocksdb::ReadMetaBlock(rep->file.get(), file_size, - kBlockBasedTableMagicNumber, rep->ioptions, - rocksdb::kCompressionDictBlock, + s = rocksdb::ReadMetaBlock(rep->file.get(), prefetch_buffer.get(), + file_size, kBlockBasedTableMagicNumber, + rep->ioptions, rocksdb::kCompressionDictBlock, compression_dict_block.get()); if (!s.ok()) { ROCKS_LOG_WARN( @@ -682,9 +773,9 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, } else { if (found_range_del_block && !rep->range_del_handle.IsNull()) { ReadOptions read_options; - s = MaybeLoadDataBlockToCache(rep, read_options, rep->range_del_handle, - Slice() /* compression_dict */, - &rep->range_del_entry); + s = MaybeLoadDataBlockToCache( + prefetch_buffer.get(), rep, read_options, rep->range_del_handle, + Slice() /* compression_dict */, &rep->range_del_entry); if (!s.ok()) { ROCKS_LOG_WARN( rep->ioptions.info_log, @@ -708,7 +799,9 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, rep->ioptions.info_log); } - // pre-fetching of blocks is turned on + const bool pin = + rep->table_options.pin_l0_filter_and_index_blocks_in_cache && level == 0; + // pre-fetching of blocks is turned on // Will use block cache for index/filter blocks access // Always prefetch index and filter for level 0 if (table_options.cache_index_and_filter_blocks) { @@ -717,32 +810,29 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, // Hack: Call NewIndexIterator() to implicitly add index to the // block_cache - // if pin_l0_filter_and_index_blocks_in_cache is true and this is - // a level0 file, then we will pass in this pointer to rep->index - // to NewIndexIterator(), which will save the index block in there - // else it's a nullptr and nothing special happens - CachableEntry* index_entry = nullptr; - if (rep->table_options.pin_l0_filter_and_index_blocks_in_cache && - level == 0) { - index_entry = &rep->index_entry; 
- } + CachableEntry index_entry; unique_ptr iter( - new_table->NewIndexIterator(ReadOptions(), nullptr, index_entry)); + new_table->NewIndexIterator(ReadOptions(), nullptr, &index_entry)); + index_entry.value->CacheDependencies(pin); + if (pin) { + rep->index_entry = std::move(index_entry); + } else { + index_entry.Release(table_options.block_cache.get()); + } s = iter->status(); if (s.ok()) { // Hack: Call GetFilter() to implicitly add filter to the block_cache auto filter_entry = new_table->GetFilter(); + if (filter_entry.value != nullptr) { + filter_entry.value->CacheDependencies(pin); + } // if pin_l0_filter_and_index_blocks_in_cache is true, and this is // a level0 file, then save it in rep_->filter_entry; it will be // released in the destructor only, hence it will be pinned in the // cache while this reader is alive - if (rep->table_options.pin_l0_filter_and_index_blocks_in_cache && - level == 0) { + if (pin) { rep->filter_entry = filter_entry; - if (rep->filter_entry.value != nullptr) { - rep->filter_entry.value->SetLevel(level); - } } else { filter_entry.Release(table_options.block_cache.get()); } @@ -753,18 +843,27 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, // pre-load these blocks, which will kept in member variables in Rep // and with a same life-time as this table object. IndexReader* index_reader = nullptr; - s = new_table->CreateIndexReader(&index_reader, meta_iter.get(), level); - + s = new_table->CreateIndexReader(prefetch_buffer.get(), &index_reader, + meta_iter.get(), level); if (s.ok()) { rep->index_reader.reset(index_reader); + // The partitions of partitioned index are always stored in cache. 
They + // are hence follow the configuration for pin and prefetch regardless of + // the value of cache_index_and_filter_blocks + if (prefetch_index_and_filter_in_cache || level == 0) { + rep->index_reader->CacheDependencies(pin); + } // Set filter block if (rep->filter_policy) { const bool is_a_filter_partition = true; - rep->filter.reset( - new_table->ReadFilter(rep->filter_handle, !is_a_filter_partition)); - if (rep->filter.get()) { - rep->filter->SetLevel(level); + auto filter = new_table->ReadFilter( + prefetch_buffer.get(), rep->filter_handle, !is_a_filter_partition); + rep->filter.reset(filter); + // Refer to the comment above about paritioned indexes always being + // cached + if (filter && (prefetch_index_and_filter_in_cache || level == 0)) { + filter->CacheDependencies(pin); } } } else { @@ -816,14 +915,14 @@ size_t BlockBasedTable::ApproximateMemoryUsage() const { // Load the meta-block from the file. On success, return the loaded meta block // and its iterator. Status BlockBasedTable::ReadMetaBlock(Rep* rep, + FilePrefetchBuffer* prefetch_buffer, std::unique_ptr* meta_block, std::unique_ptr* iter) { // TODO(sanjay): Skip this if footer.metaindex_handle() size indicates // it is an empty block. 
- // TODO: we never really verify check sum for meta index block std::unique_ptr meta; Status s = ReadBlockFromFile( - rep->file.get(), rep->footer, ReadOptions(), + rep->file.get(), prefetch_buffer, rep->footer, ReadOptions(), rep->footer.metaindex_handle(), &meta, rep->ioptions, true /* decompress */, Slice() /*compression dict*/, rep->persistent_cache_options, kDisableGlobalSequenceNumber, @@ -1022,7 +1121,8 @@ Status BlockBasedTable::PutDataBlockToCache( } FilterBlockReader* BlockBasedTable::ReadFilter( - const BlockHandle& filter_handle, const bool is_a_filter_partition) const { + FilePrefetchBuffer* prefetch_buffer, const BlockHandle& filter_handle, + const bool is_a_filter_partition) const { auto& rep = rep_; // TODO: We might want to unify with ReadBlockFromFile() if we start // requiring checksum verification in Table::Open. @@ -1030,8 +1130,8 @@ FilterBlockReader* BlockBasedTable::ReadFilter( return nullptr; } BlockContents block; - if (!ReadBlockContents(rep->file.get(), rep->footer, ReadOptions(), - filter_handle, &block, rep->ioptions, + if (!ReadBlockContents(rep->file.get(), prefetch_buffer, rep->footer, + ReadOptions(), filter_handle, &block, rep->ioptions, false /* decompress */, Slice() /*compression dict*/, rep->persistent_cache_options) .ok()) { @@ -1080,15 +1180,16 @@ FilterBlockReader* BlockBasedTable::ReadFilter( } BlockBasedTable::CachableEntry BlockBasedTable::GetFilter( - bool no_io) const { + FilePrefetchBuffer* prefetch_buffer, bool no_io) const { const BlockHandle& filter_blk_handle = rep_->filter_handle; const bool is_a_filter_partition = true; - return GetFilter(filter_blk_handle, !is_a_filter_partition, no_io); + return GetFilter(prefetch_buffer, filter_blk_handle, !is_a_filter_partition, + no_io); } BlockBasedTable::CachableEntry BlockBasedTable::GetFilter( - const BlockHandle& filter_blk_handle, const bool is_a_filter_partition, - bool no_io) const { + FilePrefetchBuffer* prefetch_buffer, const BlockHandle& filter_blk_handle, + 
const bool is_a_filter_partition, bool no_io) const { // If cache_index_and_filter_blocks is false, filter should be pre-populated. // We will return rep_->filter anyway. rep_->filter can be nullptr if filter // read fails at Open() time. We don't want to reload again since it will @@ -1128,7 +1229,8 @@ BlockBasedTable::CachableEntry BlockBasedTable::GetFilter( // Do not invoke any io. return CachableEntry(); } else { - filter = ReadFilter(filter_blk_handle, is_a_filter_partition); + filter = + ReadFilter(prefetch_buffer, filter_blk_handle, is_a_filter_partition); if (filter != nullptr) { assert(filter->size() > 0); Status s = block_cache->Insert( @@ -1196,7 +1298,7 @@ InternalIterator* BlockBasedTable::NewIndexIterator( // Create index reader and put it in the cache. Status s; TEST_SYNC_POINT("BlockBasedTable::NewIndexIterator::thread2:2"); - s = CreateIndexReader(&index_reader); + s = CreateIndexReader(nullptr /* prefetch_buffer */, &index_reader); TEST_SYNC_POINT("BlockBasedTable::NewIndexIterator::thread1:1"); TEST_SYNC_POINT("BlockBasedTable::NewIndexIterator::thread2:3"); TEST_SYNC_POINT("BlockBasedTable::NewIndexIterator::thread1:4"); @@ -1275,8 +1377,8 @@ InternalIterator* BlockBasedTable::NewDataBlockIterator( if (rep->compression_dict_block) { compression_dict = rep->compression_dict_block->data; } - s = MaybeLoadDataBlockToCache(rep, ro, handle, compression_dict, &block, - is_index); + s = MaybeLoadDataBlockToCache(nullptr /*prefetch_buffer*/, rep, ro, handle, + compression_dict, &block, is_index); } // Didn't get any data from block caches. 
@@ -1291,10 +1393,11 @@ InternalIterator* BlockBasedTable::NewDataBlockIterator( } } std::unique_ptr block_value; - s = ReadBlockFromFile( - rep->file.get(), rep->footer, ro, handle, &block_value, rep->ioptions, - true /* compress */, compression_dict, rep->persistent_cache_options, - rep->global_seqno, rep->table_options.read_amp_bytes_per_bit); + s = ReadBlockFromFile(rep->file.get(), nullptr /* prefetch_buffer */, + rep->footer, ro, handle, &block_value, rep->ioptions, + true /* compress */, compression_dict, + rep->persistent_cache_options, rep->global_seqno, + rep->table_options.read_amp_bytes_per_bit); if (s.ok()) { block.value = block_value.release(); } @@ -1324,8 +1427,10 @@ InternalIterator* BlockBasedTable::NewDataBlockIterator( } Status BlockBasedTable::MaybeLoadDataBlockToCache( - Rep* rep, const ReadOptions& ro, const BlockHandle& handle, - Slice compression_dict, CachableEntry* block_entry, bool is_index) { + FilePrefetchBuffer* prefetch_buffer, Rep* rep, const ReadOptions& ro, + const BlockHandle& handle, Slice compression_dict, + CachableEntry* block_entry, bool is_index) { + assert(block_entry != nullptr); const bool no_io = (ro.read_tier == kBlockCacheTier); Cache* block_cache = rep->table_options.block_cache.get(); Cache* block_cache_compressed = @@ -1362,9 +1467,9 @@ Status BlockBasedTable::MaybeLoadDataBlockToCache( { StopWatch sw(rep->ioptions.env, statistics, READ_BLOCK_GET_MICROS); s = ReadBlockFromFile( - rep->file.get(), rep->footer, ro, handle, &raw_block, rep->ioptions, - block_cache_compressed == nullptr, compression_dict, - rep->persistent_cache_options, rep->global_seqno, + rep->file.get(), prefetch_buffer, rep->footer, ro, handle, + &raw_block, rep->ioptions, block_cache_compressed == nullptr, + compression_dict, rep->persistent_cache_options, rep->global_seqno, rep->table_options.read_amp_bytes_per_bit); } @@ -1382,20 +1487,21 @@ Status BlockBasedTable::MaybeLoadDataBlockToCache( } } } + assert(s.ok() || block_entry->value == 
nullptr); return s; } BlockBasedTable::BlockEntryIteratorState::BlockEntryIteratorState( BlockBasedTable* table, const ReadOptions& read_options, const InternalKeyComparator* icomparator, bool skip_filters, bool is_index, - Cleanable* block_cache_cleaner) + std::unordered_map>* block_map) : TwoLevelIteratorState(table->rep_->ioptions.prefix_extractor != nullptr), table_(table), read_options_(read_options), icomparator_(icomparator), skip_filters_(skip_filters), is_index_(is_index), - block_cache_cleaner_(block_cache_cleaner) {} + block_map_(block_map) {} InternalIterator* BlockBasedTable::BlockEntryIteratorState::NewSecondaryIterator( @@ -1404,23 +1510,25 @@ BlockBasedTable::BlockEntryIteratorState::NewSecondaryIterator( BlockHandle handle; Slice input = index_value; Status s = handle.DecodeFrom(&input); - auto iter = NewDataBlockIterator(table_->rep_, read_options_, handle, nullptr, - is_index_, s); - if (block_cache_cleaner_) { - uint64_t offset = handle.offset(); - { - ReadLock rl(&cleaner_mu); - if (cleaner_set.find(offset) != cleaner_set.end()) { - // already have a reference to the block cache objects - return iter; - } - } - WriteLock wl(&cleaner_mu); - cleaner_set.insert(offset); - // Keep the data into cache until the cleaner cleansup - iter->DelegateCleanupsTo(block_cache_cleaner_); - } - return iter; + auto rep = table_->rep_; + if (block_map_) { + auto block = block_map_->find(handle.offset()); + // This is a possible scenario since block cache might not have had space + // for the partition + if (block != block_map_->end()) { + PERF_COUNTER_ADD(block_cache_hit_count, 1); + RecordTick(rep->ioptions.statistics, BLOCK_CACHE_INDEX_HIT); + RecordTick(rep->ioptions.statistics, BLOCK_CACHE_HIT); + Cache* block_cache = rep->table_options.block_cache.get(); + assert(block_cache); + RecordTick(rep->ioptions.statistics, BLOCK_CACHE_BYTES_READ, + block_cache->GetUsage(block->second.cache_handle)); + return block->second.value->NewIterator( + 
&rep->internal_comparator, nullptr, true, rep->ioptions.statistics); + } + } + return NewDataBlockIterator(rep, read_options_, handle, nullptr, is_index_, + s); } bool BlockBasedTable::BlockEntryIteratorState::PrefixMayMatch( @@ -1545,11 +1653,12 @@ bool BlockBasedTable::PrefixMayMatch(const Slice& internal_key) { return may_match; } -InternalIterator* BlockBasedTable::NewIterator( - const ReadOptions& read_options, Arena* arena, - const InternalKeyComparator* icomp, bool skip_filters) { +InternalIterator* BlockBasedTable::NewIterator(const ReadOptions& read_options, + Arena* arena, + bool skip_filters) { return NewTwoLevelIterator( - new BlockEntryIteratorState(this, read_options, icomp, skip_filters), + new BlockEntryIteratorState(this, read_options, + &rep_->internal_comparator, skip_filters), NewIndexIterator(read_options), arena); } @@ -1613,7 +1722,8 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, const bool no_io = read_options.read_tier == kBlockCacheTier; CachableEntry filter_entry; if (!skip_filters) { - filter_entry = GetFilter(read_options.read_tier == kBlockCacheTier); + filter_entry = GetFilter(/*prefetch_buffer*/ nullptr, + read_options.read_tier == kBlockCacheTier); } FilterBlockReader* filter = filter_entry.value; @@ -1745,6 +1855,61 @@ Status BlockBasedTable::Prefetch(const Slice* const begin, return Status::OK(); } +Status BlockBasedTable::VerifyChecksum() { + Status s; + // Check Meta blocks + std::unique_ptr meta; + std::unique_ptr meta_iter; + s = ReadMetaBlock(rep_, nullptr /* prefetch buffer */, &meta, &meta_iter); + if (s.ok()) { + s = VerifyChecksumInBlocks(meta_iter.get()); + if (!s.ok()) { + return s; + } + } else { + return s; + } + // Check Data blocks + BlockIter iiter_on_stack; + InternalIterator* iiter = NewIndexIterator(ReadOptions(), &iiter_on_stack); + std::unique_ptr iiter_unique_ptr; + if (iiter != &iiter_on_stack) { + iiter_unique_ptr = std::unique_ptr(iiter); + } + if (!iiter->status().ok()) 
{ + // error opening index iterator + return iiter->status(); + } + s = VerifyChecksumInBlocks(iiter); + return s; +} + +Status BlockBasedTable::VerifyChecksumInBlocks(InternalIterator* index_iter) { + Status s; + for (index_iter->SeekToFirst(); index_iter->Valid(); index_iter->Next()) { + s = index_iter->status(); + if (!s.ok()) { + break; + } + BlockHandle handle; + Slice input = index_iter->value(); + s = handle.DecodeFrom(&input); + if (!s.ok()) { + break; + } + BlockContents contents; + s = ReadBlockContents(rep_->file.get(), nullptr /* prefetch buffer */, + rep_->footer, ReadOptions(), handle, &contents, + rep_->ioptions, false /* decompress */, + Slice() /*compression dict*/, + rep_->persistent_cache_options); + if (!s.ok()) { + break; + } + } + return s; +} + bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options, const Slice& key) { std::unique_ptr iiter(NewIndexIterator(options)); @@ -1786,8 +1951,8 @@ bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options, // 4. internal_comparator // 5. index_type Status BlockBasedTable::CreateIndexReader( - IndexReader** index_reader, InternalIterator* preloaded_meta_index_iter, - int level) { + FilePrefetchBuffer* prefetch_buffer, IndexReader** index_reader, + InternalIterator* preloaded_meta_index_iter, int level) { // Some old version of block-based tables don't have index type present in // table properties. If that's the case we can safely use the kBinarySearch. 
auto index_type_on_file = BlockBasedTableOptions::kBinarySearch; @@ -1815,20 +1980,22 @@ Status BlockBasedTable::CreateIndexReader( switch (index_type_on_file) { case BlockBasedTableOptions::kTwoLevelIndexSearch: { return PartitionIndexReader::Create( - this, file, footer, footer.index_handle(), rep_->ioptions, - icomparator, index_reader, rep_->persistent_cache_options, level); + this, file, prefetch_buffer, footer, footer.index_handle(), + rep_->ioptions, icomparator, index_reader, + rep_->persistent_cache_options, level); } case BlockBasedTableOptions::kBinarySearch: { return BinarySearchIndexReader::Create( - file, footer, footer.index_handle(), rep_->ioptions, icomparator, - index_reader, rep_->persistent_cache_options); + file, prefetch_buffer, footer, footer.index_handle(), rep_->ioptions, + icomparator, index_reader, rep_->persistent_cache_options); } case BlockBasedTableOptions::kHashSearch: { std::unique_ptr meta_guard; std::unique_ptr meta_iter_guard; auto meta_index_iter = preloaded_meta_index_iter; if (meta_index_iter == nullptr) { - auto s = ReadMetaBlock(rep_, &meta_guard, &meta_iter_guard); + auto s = + ReadMetaBlock(rep_, prefetch_buffer, &meta_guard, &meta_iter_guard); if (!s.ok()) { // we simply fall back to binary search in case there is any // problem with prefix hash index loading. @@ -1836,16 +2003,18 @@ Status BlockBasedTable::CreateIndexReader( "Unable to read the metaindex block." 
" Fall back to binary search index."); return BinarySearchIndexReader::Create( - file, footer, footer.index_handle(), rep_->ioptions, icomparator, - index_reader, rep_->persistent_cache_options); + file, prefetch_buffer, footer, footer.index_handle(), + rep_->ioptions, icomparator, index_reader, + rep_->persistent_cache_options); } meta_index_iter = meta_iter_guard.get(); } return HashIndexReader::Create( - rep_->internal_prefix_transform.get(), footer, file, rep_->ioptions, - icomparator, footer.index_handle(), meta_index_iter, index_reader, - rep_->hash_index_allow_collision, rep_->persistent_cache_options); + rep_->internal_prefix_transform.get(), footer, file, prefetch_buffer, + rep_->ioptions, icomparator, footer.index_handle(), meta_index_iter, + index_reader, rep_->hash_index_allow_collision, + rep_->persistent_cache_options); } default: { std::string error_message = @@ -1961,7 +2130,8 @@ Status BlockBasedTable::DumpTable(WritableFile* out_file) { "--------------------------------------\n"); std::unique_ptr meta; std::unique_ptr meta_iter; - Status s = ReadMetaBlock(rep_, &meta, &meta_iter); + Status s = + ReadMetaBlock(rep_, nullptr /* prefetch_buffer */, &meta, &meta_iter); if (s.ok()) { for (meta_iter->SeekToFirst(); meta_iter->Valid(); meta_iter->Next()) { s = meta_iter->status(); @@ -2017,10 +2187,11 @@ Status BlockBasedTable::DumpTable(WritableFile* out_file) { BlockHandle handle; if (FindMetaBlock(meta_iter.get(), filter_block_key, &handle).ok()) { BlockContents block; - if (ReadBlockContents( - rep_->file.get(), rep_->footer, ReadOptions(), handle, &block, - rep_->ioptions, false /*decompress*/, - Slice() /*compression dict*/, rep_->persistent_cache_options) + if (ReadBlockContents(rep_->file.get(), nullptr /* prefetch_buffer */, + rep_->footer, ReadOptions(), handle, &block, + rep_->ioptions, false /*decompress*/, + Slice() /*compression dict*/, + rep_->persistent_cache_options) .ok()) { rep_->filter.reset(new BlockBasedFilterBlockReader( 
rep_->ioptions.prefix_extractor, table_options, @@ -2082,6 +2253,9 @@ Status BlockBasedTable::DumpTable(WritableFile* out_file) { } void BlockBasedTable::Close() { + if (rep_->closed) { + return; + } rep_->filter_entry.Release(rep_->table_options.block_cache.get()); rep_->index_entry.Release(rep_->table_options.block_cache.get()); rep_->range_del_entry.Release(rep_->table_options.block_cache.get()); @@ -2098,6 +2272,7 @@ void BlockBasedTable::Close() { rep_->dummy_index_reader_offset, cache_key); rep_->table_options.block_cache.get()->Erase(key); } + rep_->closed = true; } Status BlockBasedTable::DumpIndexBlock(WritableFile* out_file) { diff --git a/table/block_based_table_reader.h b/table/block_based_table_reader.h index 857ea5605..a5426cded 100644 --- a/table/block_based_table_reader.h +++ b/table/block_based_table_reader.h @@ -102,7 +102,6 @@ class BlockBasedTable : public TableReader { // @param skip_filters Disables loading/accessing the filter block InternalIterator* NewIterator( const ReadOptions&, Arena* arena = nullptr, - const InternalKeyComparator* icomparator = nullptr, bool skip_filters = false) override; InternalIterator* NewRangeTombstoneIterator( @@ -140,6 +139,8 @@ class BlockBasedTable : public TableReader { // convert SST file to a human readable form Status DumpTable(WritableFile* out_file) override; + Status VerifyChecksum() override; + void Close() override; ~BlockBasedTable(); @@ -180,6 +181,11 @@ class BlockBasedTable : public TableReader { // that was allocated in block cache. 
virtual size_t ApproximateMemoryUsage() const = 0; + virtual void CacheDependencies(bool /* unused */) {} + + // Prefetch all the blocks referenced by this index to the buffer + void PrefetchBlocks(FilePrefetchBuffer* buf); + protected: const InternalKeyComparator* icomparator_; @@ -207,6 +213,7 @@ class BlockBasedTable : public TableReader { explicit BlockBasedTable(Rep* rep) : rep_(rep) {} private: + friend class MockedBlockBasedTable; // input_iter: if it is not null, update this one and return it as Iterator static InternalIterator* NewDataBlockIterator(Rep* rep, const ReadOptions& ro, const Slice& index_value, @@ -226,7 +233,8 @@ class BlockBasedTable : public TableReader { // @param block_entry value is set to the uncompressed block if found. If // in uncompressed block cache, also sets cache_handle to reference that // block. - static Status MaybeLoadDataBlockToCache(Rep* rep, const ReadOptions& ro, + static Status MaybeLoadDataBlockToCache(FilePrefetchBuffer* prefetch_buffer, + Rep* rep, const ReadOptions& ro, const BlockHandle& handle, Slice compression_dict, CachableEntry* block_entry, @@ -235,10 +243,11 @@ class BlockBasedTable : public TableReader { // For the following two functions: // if `no_io == true`, we will not try to read filter/index from sst file // were they not present in cache yet. - CachableEntry GetFilter(bool no_io = false) const; + CachableEntry GetFilter( + FilePrefetchBuffer* prefetch_buffer = nullptr, bool no_io = false) const; virtual CachableEntry GetFilter( - const BlockHandle& filter_blk_handle, const bool is_a_filter_partition, - bool no_io) const; + FilePrefetchBuffer* prefetch_buffer, const BlockHandle& filter_blk_handle, + const bool is_a_filter_partition, bool no_io) const; // Get the iterator from the index reader. // If input_iter is not set, return new Iterator @@ -299,7 +308,7 @@ class BlockBasedTable : public TableReader { // need to access extra meta blocks for index construction. 
This parameter // helps avoid re-reading meta index block if caller already created one. Status CreateIndexReader( - IndexReader** index_reader, + FilePrefetchBuffer* prefetch_buffer, IndexReader** index_reader, InternalIterator* preloaded_meta_index_iter = nullptr, const int level = -1); @@ -308,11 +317,15 @@ class BlockBasedTable : public TableReader { const bool no_io) const; // Read the meta block from sst. - static Status ReadMetaBlock(Rep* rep, std::unique_ptr* meta_block, + static Status ReadMetaBlock(Rep* rep, FilePrefetchBuffer* prefetch_buffer, + std::unique_ptr* meta_block, std::unique_ptr* iter); + Status VerifyChecksumInBlocks(InternalIterator* index_iter); + // Create the filter from the filter block. - FilterBlockReader* ReadFilter(const BlockHandle& filter_handle, + FilterBlockReader* ReadFilter(FilePrefetchBuffer* prefetch_buffer, + const BlockHandle& filter_handle, const bool is_a_filter_partition) const; static void SetupCacheKeyPrefix(Rep* rep, uint64_t file_size); @@ -340,11 +353,11 @@ class BlockBasedTable : public TableReader { // Maitaning state of a two-level iteration on a partitioned index structure class BlockBasedTable::BlockEntryIteratorState : public TwoLevelIteratorState { public: - BlockEntryIteratorState(BlockBasedTable* table, - const ReadOptions& read_options, - const InternalKeyComparator* icomparator, - bool skip_filters, bool is_index = false, - Cleanable* block_cache_cleaner = nullptr); + BlockEntryIteratorState( + BlockBasedTable* table, const ReadOptions& read_options, + const InternalKeyComparator* icomparator, bool skip_filters, + bool is_index = false, + std::unordered_map>* block_map = nullptr); InternalIterator* NewSecondaryIterator(const Slice& index_value) override; bool PrefixMayMatch(const Slice& internal_key) override; bool KeyReachedUpperBound(const Slice& internal_key) override; @@ -357,8 +370,7 @@ class BlockBasedTable::BlockEntryIteratorState : public TwoLevelIteratorState { bool skip_filters_; // true if the 
2nd level iterator is on indexes instead of on user data. bool is_index_; - Cleanable* block_cache_cleaner_; - std::set cleaner_set; + std::unordered_map>* block_map_; port::RWMutex cleaner_mu; }; @@ -468,6 +480,7 @@ struct BlockBasedTable::Rep { // A value of kDisableGlobalSequenceNumber means that this feature is disabled // and every key have it's own seqno. SequenceNumber global_seqno; + bool closed = false; }; } // namespace rocksdb diff --git a/table/cuckoo_table_builder_test.cc b/table/cuckoo_table_builder_test.cc index ec282b4b5..93daaca47 100644 --- a/table/cuckoo_table_builder_test.cc +++ b/table/cuckoo_table_builder_test.cc @@ -109,7 +109,11 @@ class CuckooBuilderTest : public testing::Test { expected_locations.begin(); if (key_idx == keys.size()) { // i is not one of the expected locations. Empty bucket. - ASSERT_EQ(read_slice.compare(expected_unused_bucket), 0); + if (read_slice.data() == nullptr) { + ASSERT_EQ(0, expected_unused_bucket.size()); + } else { + ASSERT_EQ(read_slice.compare(expected_unused_bucket), 0); + } } else { keys_found[key_idx] = true; ASSERT_EQ(read_slice.compare(keys[key_idx] + values[key_idx]), 0); diff --git a/table/cuckoo_table_factory.h b/table/cuckoo_table_factory.h index 774dc3c3e..db860c3d0 100644 --- a/table/cuckoo_table_factory.h +++ b/table/cuckoo_table_factory.h @@ -76,6 +76,11 @@ class CuckooTableFactory : public TableFactory { void* GetOptions() override { return &table_options_; } + Status GetOptionString(std::string* opt_string, + const std::string& delimiter) const override { + return Status::OK(); + } + private: CuckooTableOptions table_options_; }; diff --git a/table/cuckoo_table_reader.cc b/table/cuckoo_table_reader.cc index 85670ad1d..9cecebaeb 100644 --- a/table/cuckoo_table_reader.cc +++ b/table/cuckoo_table_reader.cc @@ -364,8 +364,7 @@ extern InternalIterator* NewErrorInternalIterator(const Status& status, Arena* arena); InternalIterator* CuckooTableReader::NewIterator( - const ReadOptions& read_options, 
Arena* arena, - const InternalKeyComparator* icomp, bool skip_filters) { + const ReadOptions& read_options, Arena* arena, bool skip_filters) { if (!status().ok()) { return NewErrorInternalIterator( Status::Corruption("CuckooTableReader status is not okay."), arena); diff --git a/table/cuckoo_table_reader.h b/table/cuckoo_table_reader.h index f2b6d1a9c..4beac8f9d 100644 --- a/table/cuckoo_table_reader.h +++ b/table/cuckoo_table_reader.h @@ -47,7 +47,6 @@ class CuckooTableReader: public TableReader { InternalIterator* NewIterator( const ReadOptions&, Arena* arena = nullptr, - const InternalKeyComparator* icomparator = nullptr, bool skip_filters = false) override; void Prepare(const Slice& target) override; diff --git a/table/filter_block.h b/table/filter_block.h index 94136f659..7bf3b3132 100644 --- a/table/filter_block.h +++ b/table/filter_block.h @@ -108,15 +108,14 @@ class FilterBlockReader { bool whole_key_filtering() const { return whole_key_filtering_; } - int GetLevel() const { return level_; } - void SetLevel(int level) { level_ = level; } - // convert this object to a human readable form virtual std::string ToString() const { std::string error_msg("Unsupported filter \n"); return error_msg; } + virtual void CacheDependencies(bool pin) {} + protected: bool whole_key_filtering_; diff --git a/table/format.cc b/table/format.cc index 3e5a191bb..364766e9a 100644 --- a/table/format.cc +++ b/table/format.cc @@ -102,7 +102,7 @@ inline uint64_t UpconvertLegacyFooterFormat(uint64_t magic_number) { // to make the total size 2 * BlockHandle::kMaxEncodedLength // table_magic_number (8 bytes) // new footer format: -// checksum (char, 1 byte) +// checksum type (char, 1 byte) // metaindex handle (varint64 offset, varint64 size) // index handle (varint64 offset, varint64 size) // to make the total size 2 * BlockHandle::kMaxEncodedLength + 1 @@ -216,8 +216,10 @@ std::string Footer::ToString() const { return result; } -Status ReadFooterFromFile(RandomAccessFileReader* file, 
uint64_t file_size, - Footer* footer, uint64_t enforce_table_magic_number) { +Status ReadFooterFromFile(RandomAccessFileReader* file, + FilePrefetchBuffer* prefetch_buffer, + uint64_t file_size, Footer* footer, + uint64_t enforce_table_magic_number) { if (file_size < Footer::kMinEncodedLength) { return Status::Corruption( "file is too short (" + ToString(file_size) + " bytes) to be an " @@ -230,9 +232,14 @@ Status ReadFooterFromFile(RandomAccessFileReader* file, uint64_t file_size, (file_size > Footer::kMaxEncodedLength) ? static_cast(file_size - Footer::kMaxEncodedLength) : 0; - Status s = file->Read(read_offset, Footer::kMaxEncodedLength, &footer_input, - footer_space); - if (!s.ok()) return s; + Status s; + if (prefetch_buffer == nullptr || + !prefetch_buffer->TryReadFromCache(read_offset, Footer::kMaxEncodedLength, + &footer_input)) { + s = file->Read(read_offset, Footer::kMaxEncodedLength, &footer_input, + footer_space); + if (!s.ok()) return s; + } // Check that we actually read the whole footer from the file. It may be // that size isn't correct. 
@@ -259,6 +266,45 @@ Status ReadFooterFromFile(RandomAccessFileReader* file, uint64_t file_size, // Without anonymous namespace here, we fail the warning -Wmissing-prototypes namespace { +Status CheckBlockChecksum(const ReadOptions& options, const Footer& footer, + const Slice& contents, size_t block_size, + RandomAccessFileReader* file, + const BlockHandle& handle) { + Status s; + // Check the crc of the type and the block contents + if (options.verify_checksums) { + const char* data = contents.data(); // Pointer to where Read put the data + PERF_TIMER_GUARD(block_checksum_time); + uint32_t value = DecodeFixed32(data + block_size + 1); + uint32_t actual = 0; + switch (footer.checksum()) { + case kNoChecksum: + break; + case kCRC32c: + value = crc32c::Unmask(value); + actual = crc32c::Value(data, block_size + 1); + break; + case kxxHash: + actual = XXH32(data, static_cast(block_size) + 1, 0); + break; + default: + s = Status::Corruption( + "unknown checksum type " + ToString(footer.checksum()) + " in " + + file->file_name() + " offset " + ToString(handle.offset()) + + " size " + ToString(block_size)); + } + if (s.ok() && actual != value) { + s = Status::Corruption( + "block checksum mismatch: expected " + ToString(actual) + ", got " + + ToString(value) + " in " + file->file_name() + " offset " + + ToString(handle.offset()) + " size " + ToString(block_size)); + } + if (!s.ok()) { + return s; + } + } + return s; +} // Read a block and check its CRC // contents is the result of reading. 
@@ -281,53 +327,21 @@ Status ReadBlock(RandomAccessFileReader* file, const Footer& footer, return s; } if (contents->size() != n + kBlockTrailerSize) { - return Status::Corruption( - "truncated block read from " + file->file_name() + " offset " - + ToString(handle.offset()) + ", expected " - + ToString(n + kBlockTrailerSize) + " bytes, got " - + ToString(contents->size())); + return Status::Corruption("truncated block read from " + file->file_name() + + " offset " + ToString(handle.offset()) + + ", expected " + ToString(n + kBlockTrailerSize) + + " bytes, got " + ToString(contents->size())); } - - // Check the crc of the type and the block contents - const char* data = contents->data(); // Pointer to where Read put the data - if (options.verify_checksums) { - PERF_TIMER_GUARD(block_checksum_time); - uint32_t value = DecodeFixed32(data + n + 1); - uint32_t actual = 0; - switch (footer.checksum()) { - case kCRC32c: - value = crc32c::Unmask(value); - actual = crc32c::Value(data, n + 1); - break; - case kxxHash: - actual = XXH32(data, static_cast(n) + 1, 0); - break; - default: - s = Status::Corruption( - "unknown checksum type " + ToString(footer.checksum()) - + " in " + file->file_name() + " offset " - + ToString(handle.offset()) + " size " + ToString(n)); - } - if (s.ok() && actual != value) { - s = Status::Corruption( - "block checksum mismatch: expected " + ToString(actual) - + ", got " + ToString(value) + " in " + file->file_name() - + " offset " + ToString(handle.offset()) - + " size " + ToString(n)); - } - if (!s.ok()) { - return s; - } - } - return s; + return CheckBlockChecksum(options, footer, *contents, n, file, handle); } } // namespace -Status ReadBlockContents(RandomAccessFileReader* file, const Footer& footer, - const ReadOptions& read_options, +Status ReadBlockContents(RandomAccessFileReader* file, + FilePrefetchBuffer* prefetch_buffer, + const Footer& footer, const ReadOptions& read_options, const BlockHandle& handle, BlockContents* contents, - const 
ImmutableCFOptions &ioptions, + const ImmutableCFOptions& ioptions, bool decompression_requested, const Slice& compression_dict, const PersistentCacheOptions& cache_options) { @@ -357,8 +371,21 @@ Status ReadBlockContents(RandomAccessFileReader* file, const Footer& footer, } } - if (cache_options.persistent_cache && - cache_options.persistent_cache->IsCompressed()) { + bool got_from_prefetch_buffer = false; + if (prefetch_buffer != nullptr && + prefetch_buffer->TryReadFromCache( + handle.offset(), + static_cast(handle.size()) + kBlockTrailerSize, &slice)) { + status = + CheckBlockChecksum(read_options, footer, slice, + static_cast(handle.size()), file, handle); + if (!status.ok()) { + return status; + } + got_from_prefetch_buffer = true; + used_buf = const_cast(slice.data()); + } else if (cache_options.persistent_cache && + cache_options.persistent_cache->IsCompressed()) { // lookup uncompressed cache mode p-cache status = PersistentCacheHelper::LookupRawPage( cache_options, handle, &heap_buf, n + kBlockTrailerSize); @@ -366,40 +393,42 @@ Status ReadBlockContents(RandomAccessFileReader* file, const Footer& footer, status = Status::NotFound(); } - if (status.ok()) { - // cache hit - used_buf = heap_buf.get(); - slice = Slice(heap_buf.get(), n); - } else { - if (ioptions.info_log && !status.IsNotFound()) { - assert(!status.ok()); - ROCKS_LOG_INFO(ioptions.info_log, - "Error reading from persistent cache. 
%s", - status.ToString().c_str()); - } - // cache miss read from device - if (decompression_requested && - n + kBlockTrailerSize < DefaultStackBufferSize) { - // If we've got a small enough hunk of data, read it in to the - // trivially allocated stack buffer instead of needing a full malloc() - used_buf = &stack_buf[0]; - } else { - heap_buf = std::unique_ptr(new char[n + kBlockTrailerSize]); + if (!got_from_prefetch_buffer) { + if (status.ok()) { + // cache hit used_buf = heap_buf.get(); - } + slice = Slice(heap_buf.get(), n); + } else { + if (ioptions.info_log && !status.IsNotFound()) { + assert(!status.ok()); + ROCKS_LOG_INFO(ioptions.info_log, + "Error reading from persistent cache. %s", + status.ToString().c_str()); + } + // cache miss read from device + if (decompression_requested && + n + kBlockTrailerSize < DefaultStackBufferSize) { + // If we've got a small enough hunk of data, read it in to the + // trivially allocated stack buffer instead of needing a full malloc() + used_buf = &stack_buf[0]; + } else { + heap_buf = std::unique_ptr(new char[n + kBlockTrailerSize]); + used_buf = heap_buf.get(); + } - status = ReadBlock(file, footer, read_options, handle, &slice, used_buf); - if (status.ok() && read_options.fill_cache && - cache_options.persistent_cache && - cache_options.persistent_cache->IsCompressed()) { - // insert to raw cache - PersistentCacheHelper::InsertRawPage(cache_options, handle, used_buf, - n + kBlockTrailerSize); + status = ReadBlock(file, footer, read_options, handle, &slice, used_buf); + if (status.ok() && read_options.fill_cache && + cache_options.persistent_cache && + cache_options.persistent_cache->IsCompressed()) { + // insert to raw cache + PersistentCacheHelper::InsertRawPage(cache_options, handle, used_buf, + n + kBlockTrailerSize); + } } - } - if (!status.ok()) { - return status; + if (!status.ok()) { + return status; + } } PERF_TIMER_GUARD(block_decompress_time); @@ -416,14 +445,14 @@ Status 
ReadBlockContents(RandomAccessFileReader* file, const Footer& footer, *contents = BlockContents(Slice(slice.data(), n), false, compression_type); } else { // page is uncompressed, the buffer either stack or heap provided - if (used_buf == &stack_buf[0]) { + if (got_from_prefetch_buffer || used_buf == &stack_buf[0]) { heap_buf = std::unique_ptr(new char[n]); - memcpy(heap_buf.get(), stack_buf, n); + memcpy(heap_buf.get(), used_buf, n); } *contents = BlockContents(std::move(heap_buf), n, true, compression_type); } - if (status.ok() && read_options.fill_cache && + if (status.ok() && !got_from_prefetch_buffer && read_options.fill_cache && cache_options.persistent_cache && !cache_options.persistent_cache->IsCompressed()) { // insert to uncompressed cache diff --git a/table/format.h b/table/format.h index d89b1d312..512b4a32b 100644 --- a/table/format.h +++ b/table/format.h @@ -18,6 +18,7 @@ #include "options/cf_options.h" #include "port/port.h" // noexcept #include "table/persistent_cache_options.h" +#include "util/file_reader_writer.h" namespace rocksdb { @@ -173,8 +174,9 @@ class Footer { // Read the footer from file // If enforce_table_magic_number != 0, ReadFooterFromFile() will return // corruption if table_magic number is not equal to enforce_table_magic_number -Status ReadFooterFromFile(RandomAccessFileReader* file, uint64_t file_size, - Footer* footer, +Status ReadFooterFromFile(RandomAccessFileReader* file, + FilePrefetchBuffer* prefetch_buffer, + uint64_t file_size, Footer* footer, uint64_t enforce_table_magic_number = 0); // 1-byte type + 32-bit crc @@ -213,9 +215,9 @@ struct BlockContents { // Read the block identified by "handle" from "file". On failure // return non-OK. On success fill *result and return OK. 
extern Status ReadBlockContents( - RandomAccessFileReader* file, const Footer& footer, - const ReadOptions& options, const BlockHandle& handle, - BlockContents* contents, const ImmutableCFOptions &ioptions, + RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer, + const Footer& footer, const ReadOptions& options, const BlockHandle& handle, + BlockContents* contents, const ImmutableCFOptions& ioptions, bool do_uncompress = true, const Slice& compression_dict = Slice(), const PersistentCacheOptions& cache_options = PersistentCacheOptions()); diff --git a/table/full_filter_bits_builder.h b/table/full_filter_bits_builder.h index c47a74754..b3be7e897 100644 --- a/table/full_filter_bits_builder.h +++ b/table/full_filter_bits_builder.h @@ -1,7 +1,7 @@ // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). // Copyright (c) 2012 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
diff --git a/table/get_context.cc b/table/get_context.cc index 2b49eba6a..258891ec4 100644 --- a/table/get_context.cc +++ b/table/get_context.cc @@ -33,14 +33,12 @@ void appendToReplayLog(std::string* replay_log, ValueType type, Slice value) { } // namespace -GetContext::GetContext(const Comparator* ucmp, - const MergeOperator* merge_operator, Logger* logger, - Statistics* statistics, GetState init_state, - const Slice& user_key, PinnableSlice* pinnable_val, - bool* value_found, MergeContext* merge_context, - RangeDelAggregator* _range_del_agg, Env* env, - SequenceNumber* seq, - PinnedIteratorsManager* _pinned_iters_mgr) +GetContext::GetContext( + const Comparator* ucmp, const MergeOperator* merge_operator, Logger* logger, + Statistics* statistics, GetState init_state, const Slice& user_key, + PinnableSlice* pinnable_val, bool* value_found, MergeContext* merge_context, + RangeDelAggregator* _range_del_agg, Env* env, SequenceNumber* seq, + PinnedIteratorsManager* _pinned_iters_mgr, bool* is_blob_index) : ucmp_(ucmp), merge_operator_(merge_operator), logger_(logger), @@ -54,7 +52,8 @@ GetContext::GetContext(const Comparator* ucmp, env_(env), seq_(seq), replay_log_(nullptr), - pinned_iters_mgr_(_pinned_iters_mgr) { + pinned_iters_mgr_(_pinned_iters_mgr), + is_blob_index_(is_blob_index) { if (seq_) { *seq_ = kMaxSequenceNumber; } @@ -99,13 +98,19 @@ bool GetContext::SaveValue(const ParsedInternalKey& parsed_key, auto type = parsed_key.type; // Key matches. Process it - if ((type == kTypeValue || type == kTypeMerge) && + if ((type == kTypeValue || type == kTypeMerge || type == kTypeBlobIndex) && range_del_agg_ != nullptr && range_del_agg_->ShouldDelete(parsed_key)) { type = kTypeRangeDeletion; } switch (type) { case kTypeValue: + case kTypeBlobIndex: assert(state_ == kNotFound || state_ == kMerge); + if (type == kTypeBlobIndex && is_blob_index_ == nullptr) { + // Blob value not supported. Stop. 
+ state_ = kBlobIndex; + return false; + } if (kNotFound == state_) { state_ = kFound; if (LIKELY(pinnable_val_ != nullptr)) { @@ -131,6 +136,9 @@ bool GetContext::SaveValue(const ParsedInternalKey& parsed_key, } } } + if (is_blob_index_ != nullptr) { + *is_blob_index_ = (type == kTypeBlobIndex); + } return false; case kTypeDeletion: @@ -180,7 +188,7 @@ bool GetContext::SaveValue(const ParsedInternalKey& parsed_key, } void replayGetContextLog(const Slice& replay_log, const Slice& user_key, - GetContext* get_context) { + GetContext* get_context, Cleanable* value_pinner) { #ifndef ROCKSDB_LITE Slice s = replay_log; while (s.size()) { @@ -194,7 +202,8 @@ void replayGetContextLog(const Slice& replay_log, const Slice& user_key, // Since SequenceNumber is not stored and unknown, we will use // kMaxSequenceNumber. get_context->SaveValue( - ParsedInternalKey(user_key, kMaxSequenceNumber, type), value, nullptr); + ParsedInternalKey(user_key, kMaxSequenceNumber, type), value, + value_pinner); } #else // ROCKSDB_LITE assert(false); diff --git a/table/get_context.h b/table/get_context.h index ee8a3beab..a708f6be7 100644 --- a/table/get_context.h +++ b/table/get_context.h @@ -22,7 +22,8 @@ class GetContext { kFound, kDeleted, kCorrupt, - kMerge // saver contains the current merge result (the operands) + kMerge, // saver contains the current merge result (the operands) + kBlobIndex, }; GetContext(const Comparator* ucmp, const MergeOperator* merge_operator, @@ -30,7 +31,8 @@ class GetContext { const Slice& user_key, PinnableSlice* value, bool* value_found, MergeContext* merge_context, RangeDelAggregator* range_del_agg, Env* env, SequenceNumber* seq = nullptr, - PinnedIteratorsManager* _pinned_iters_mgr = nullptr); + PinnedIteratorsManager* _pinned_iters_mgr = nullptr, + bool* is_blob_index = nullptr); void MarkKeyMayExist(); @@ -83,9 +85,11 @@ class GetContext { // Used to temporarily pin blocks when state_ == GetContext::kMerge PinnedIteratorsManager* pinned_iters_mgr_; bool 
sample_; + bool* is_blob_index_; }; void replayGetContextLog(const Slice& replay_log, const Slice& user_key, - GetContext* get_context); + GetContext* get_context, + Cleanable* value_pinner = nullptr); } // namespace rocksdb diff --git a/table/iterator.cc b/table/iterator.cc index 23a84b59e..ed6a2cdea 100644 --- a/table/iterator.cc +++ b/table/iterator.cc @@ -21,6 +21,19 @@ Cleanable::Cleanable() { Cleanable::~Cleanable() { DoCleanup(); } +Cleanable::Cleanable(Cleanable&& other) { + *this = std::move(other); +} + +Cleanable& Cleanable::operator=(Cleanable&& other) { + if (this != &other) { + cleanup_ = other.cleanup_; + other.cleanup_.function = nullptr; + other.cleanup_.next = nullptr; + } + return *this; +} + // If the entire linked list was on heap we could have simply add attach one // link list to another. However the head is an embeded object to avoid the cost // of creating objects for most of the use cases when the Cleanable has only one diff --git a/table/meta_blocks.cc b/table/meta_blocks.cc index 5946e40fe..19925d788 100644 --- a/table/meta_blocks.cc +++ b/table/meta_blocks.cc @@ -16,6 +16,7 @@ #include "table/persistent_cache_helper.h" #include "table/table_properties_internal.h" #include "util/coding.h" +#include "util/file_reader_writer.h" namespace rocksdb { @@ -76,6 +77,7 @@ void PropertyBlockBuilder::AddTableProperty(const TableProperties& props) { Add(TablePropertiesNames::kFixedKeyLen, props.fixed_key_len); Add(TablePropertiesNames::kColumnFamilyId, props.column_family_id); Add(TablePropertiesNames::kCreationTime, props.creation_time); + Add(TablePropertiesNames::kOldestKeyTime, props.oldest_key_time); if (!props.filter_policy_name.empty()) { Add(TablePropertiesNames::kFilterPolicy, props.filter_policy_name); @@ -159,7 +161,8 @@ bool NotifyCollectTableCollectorsOnFinish( } Status ReadProperties(const Slice& handle_value, RandomAccessFileReader* file, - const Footer& footer, const ImmutableCFOptions& ioptions, + FilePrefetchBuffer* 
prefetch_buffer, const Footer& footer, + const ImmutableCFOptions& ioptions, TableProperties** table_properties) { assert(table_properties); @@ -173,8 +176,8 @@ Status ReadProperties(const Slice& handle_value, RandomAccessFileReader* file, ReadOptions read_options; read_options.verify_checksums = false; Status s; - s = ReadBlockContents(file, footer, read_options, handle, &block_contents, - ioptions, false /* decompress */); + s = ReadBlockContents(file, prefetch_buffer, footer, read_options, handle, + &block_contents, ioptions, false /* decompress */); if (!s.ok()) { return s; @@ -209,6 +212,8 @@ Status ReadProperties(const Slice& handle_value, RandomAccessFileReader* file, &new_table_properties->column_family_id}, {TablePropertiesNames::kCreationTime, &new_table_properties->creation_time}, + {TablePropertiesNames::kOldestKeyTime, + &new_table_properties->oldest_key_time}, }; std::string last_key; @@ -277,7 +282,8 @@ Status ReadTableProperties(RandomAccessFileReader* file, uint64_t file_size, TableProperties** properties) { // -- Read metaindex block Footer footer; - auto s = ReadFooterFromFile(file, file_size, &footer, table_magic_number); + auto s = ReadFooterFromFile(file, nullptr /* prefetch_buffer */, file_size, + &footer, table_magic_number); if (!s.ok()) { return s; } @@ -286,8 +292,9 @@ Status ReadTableProperties(RandomAccessFileReader* file, uint64_t file_size, BlockContents metaindex_contents; ReadOptions read_options; read_options.verify_checksums = false; - s = ReadBlockContents(file, footer, read_options, metaindex_handle, - &metaindex_contents, ioptions, false /* decompress */); + s = ReadBlockContents(file, nullptr /* prefetch_buffer */, footer, + read_options, metaindex_handle, &metaindex_contents, + ioptions, false /* decompress */); if (!s.ok()) { return s; } @@ -305,7 +312,8 @@ Status ReadTableProperties(RandomAccessFileReader* file, uint64_t file_size, TableProperties table_properties; if (found_properties_block == true) { - s = 
ReadProperties(meta_iter->value(), file, footer, ioptions, properties); + s = ReadProperties(meta_iter->value(), file, nullptr /* prefetch_buffer */, + footer, ioptions, properties); } else { s = Status::NotFound(); } @@ -332,7 +340,8 @@ Status FindMetaBlock(RandomAccessFileReader* file, uint64_t file_size, const std::string& meta_block_name, BlockHandle* block_handle) { Footer footer; - auto s = ReadFooterFromFile(file, file_size, &footer, table_magic_number); + auto s = ReadFooterFromFile(file, nullptr /* prefetch_buffer */, file_size, + &footer, table_magic_number); if (!s.ok()) { return s; } @@ -341,8 +350,9 @@ Status FindMetaBlock(RandomAccessFileReader* file, uint64_t file_size, BlockContents metaindex_contents; ReadOptions read_options; read_options.verify_checksums = false; - s = ReadBlockContents(file, footer, read_options, metaindex_handle, - &metaindex_contents, ioptions, false /* do decompression */); + s = ReadBlockContents(file, nullptr /* prefetch_buffer */, footer, + read_options, metaindex_handle, &metaindex_contents, + ioptions, false /* do decompression */); if (!s.ok()) { return s; } @@ -355,14 +365,16 @@ Status FindMetaBlock(RandomAccessFileReader* file, uint64_t file_size, return FindMetaBlock(meta_iter.get(), meta_block_name, block_handle); } -Status ReadMetaBlock(RandomAccessFileReader* file, uint64_t file_size, +Status ReadMetaBlock(RandomAccessFileReader* file, + FilePrefetchBuffer* prefetch_buffer, uint64_t file_size, uint64_t table_magic_number, - const ImmutableCFOptions &ioptions, + const ImmutableCFOptions& ioptions, const std::string& meta_block_name, BlockContents* contents) { Status status; Footer footer; - status = ReadFooterFromFile(file, file_size, &footer, table_magic_number); + status = ReadFooterFromFile(file, prefetch_buffer, file_size, &footer, + table_magic_number); if (!status.ok()) { return status; } @@ -372,8 +384,8 @@ Status ReadMetaBlock(RandomAccessFileReader* file, uint64_t file_size, BlockContents 
metaindex_contents; ReadOptions read_options; read_options.verify_checksums = false; - status = ReadBlockContents(file, footer, read_options, metaindex_handle, - &metaindex_contents, ioptions, + status = ReadBlockContents(file, prefetch_buffer, footer, read_options, + metaindex_handle, &metaindex_contents, ioptions, false /* decompress */); if (!status.ok()) { return status; @@ -394,8 +406,9 @@ Status ReadMetaBlock(RandomAccessFileReader* file, uint64_t file_size, } // Reading metablock - return ReadBlockContents(file, footer, read_options, block_handle, contents, - ioptions, false /* decompress */); + return ReadBlockContents(file, prefetch_buffer, footer, read_options, + block_handle, contents, ioptions, + false /* decompress */); } } // namespace rocksdb diff --git a/table/meta_blocks.h b/table/meta_blocks.h index ddb685360..220985d9e 100644 --- a/table/meta_blocks.h +++ b/table/meta_blocks.h @@ -94,7 +94,8 @@ bool NotifyCollectTableCollectorsOnFinish( // *table_properties will point to a heap-allocated TableProperties // object, otherwise value of `table_properties` will not be modified. Status ReadProperties(const Slice& handle_value, RandomAccessFileReader* file, - const Footer& footer, const ImmutableCFOptions &ioptions, + FilePrefetchBuffer* prefetch_buffer, const Footer& footer, + const ImmutableCFOptions& ioptions, TableProperties** table_properties); // Directly read the properties from the properties block of a plain table. @@ -121,9 +122,10 @@ Status FindMetaBlock(RandomAccessFileReader* file, uint64_t file_size, // Read the specified meta block with name meta_block_name // from `file` and initialize `contents` with contents of this block. // Return Status::OK in case of success. 
-Status ReadMetaBlock(RandomAccessFileReader* file, uint64_t file_size, +Status ReadMetaBlock(RandomAccessFileReader* file, + FilePrefetchBuffer* prefetch_buffer, uint64_t file_size, uint64_t table_magic_number, - const ImmutableCFOptions &ioptions, + const ImmutableCFOptions& ioptions, const std::string& meta_block_name, BlockContents* contents); diff --git a/table/mock_table.cc b/table/mock_table.cc index d50a42127..86c380865 100644 --- a/table/mock_table.cc +++ b/table/mock_table.cc @@ -28,7 +28,6 @@ stl_wrappers::KVMap MakeMockFile( InternalIterator* MockTableReader::NewIterator(const ReadOptions&, Arena* arena, - const InternalKeyComparator*, bool skip_filters) { return new MockTableIterator(table_); } @@ -135,7 +134,7 @@ void MockTableFactory::AssertLatestFile( ParseInternalKey(Slice(key), &ikey); std::cout << ikey.DebugString(false) << " -> " << value << std::endl; } - ASSERT_TRUE(false); + FAIL(); } } diff --git a/table/mock_table.h b/table/mock_table.h index 9e5396341..71609a173 100644 --- a/table/mock_table.h +++ b/table/mock_table.h @@ -40,7 +40,6 @@ class MockTableReader : public TableReader { InternalIterator* NewIterator(const ReadOptions&, Arena* arena, - const InternalKeyComparator* = nullptr, bool skip_filters = false) override; Status Get(const ReadOptions&, const Slice& key, GetContext* get_context, diff --git a/table/partitioned_filter_block.cc b/table/partitioned_filter_block.cc index 2b330039e..202245939 100644 --- a/table/partitioned_filter_block.cc +++ b/table/partitioned_filter_block.cc @@ -7,6 +7,7 @@ #include +#include "monitoring/perf_context_imp.h" #include "port/port.h" #include "rocksdb/filter_policy.h" #include "table/block.h" @@ -100,19 +101,29 @@ PartitionedFilterBlockReader::PartitionedFilterBlockReader( } PartitionedFilterBlockReader::~PartitionedFilterBlockReader() { - { - ReadLock rl(&mu_); - for (auto it = handle_list_.begin(); it != handle_list_.end(); ++it) { - table_->rep_->table_options.block_cache.get()->Release(*it); - } 
+ // TODO(myabandeh): if instead of filter object we store only the blocks in + // block cache, then we don't have to manually earse them from block cache + // here. + auto block_cache = table_->rep_->table_options.block_cache.get(); + if (UNLIKELY(block_cache == nullptr)) { + return; } char cache_key[BlockBasedTable::kMaxCacheKeyPrefixSize + kMaxVarint64Length]; - for (auto it = filter_block_set_.begin(); it != filter_block_set_.end(); - ++it) { + BlockIter biter; + BlockHandle handle; + idx_on_fltr_blk_->NewIterator(&comparator_, &biter, true); + biter.SeekToFirst(); + for (; biter.Valid(); biter.Next()) { + auto input = biter.value(); + auto s = handle.DecodeFrom(&input); + assert(s.ok()); + if (!s.ok()) { + continue; + } auto key = BlockBasedTable::GetCacheKey(table_->rep_->cache_key_prefix, table_->rep_->cache_key_prefix_size, - *it, cache_key); - table_->rep_->table_options.block_cache.get()->Erase(key); + handle, cache_key); + block_cache->Erase(key); } } @@ -132,7 +143,8 @@ bool PartitionedFilterBlockReader::KeyMayMatch( return false; } bool cached = false; - auto filter_partition = GetFilterPartition(&filter_handle, no_io, &cached); + auto filter_partition = GetFilterPartition(nullptr /* prefetch_buffer */, + &filter_handle, no_io, &cached); if (UNLIKELY(!filter_partition.value)) { return true; } @@ -164,7 +176,8 @@ bool PartitionedFilterBlockReader::PrefixMayMatch( return false; } bool cached = false; - auto filter_partition = GetFilterPartition(&filter_handle, no_io, &cached); + auto filter_partition = GetFilterPartition(nullptr /* prefetch_buffer */, + &filter_handle, no_io, &cached); if (UNLIKELY(!filter_partition.value)) { return true; } @@ -194,45 +207,34 @@ Slice PartitionedFilterBlockReader::GetFilterPartitionHandle( } BlockBasedTable::CachableEntry -PartitionedFilterBlockReader::GetFilterPartition(Slice* handle_value, - const bool no_io, - bool* cached) { +PartitionedFilterBlockReader::GetFilterPartition( + FilePrefetchBuffer* prefetch_buffer, 
Slice* handle_value, const bool no_io, + bool* cached) { BlockHandle fltr_blk_handle; auto s = fltr_blk_handle.DecodeFrom(handle_value); assert(s.ok()); const bool is_a_filter_partition = true; auto block_cache = table_->rep_->table_options.block_cache.get(); if (LIKELY(block_cache != nullptr)) { - bool pin_cached_filters = - GetLevel() == 0 && - table_->rep_->table_options.pin_l0_filter_and_index_blocks_in_cache; - if (pin_cached_filters) { - ReadLock rl(&mu_); - auto iter = filter_cache_.find(fltr_blk_handle.offset()); - if (iter != filter_cache_.end()) { + if (filter_map_.size() != 0) { + auto iter = filter_map_.find(fltr_blk_handle.offset()); + // This is a possible scenario since block cache might not have had space + // for the partition + if (iter != filter_map_.end()) { + PERF_COUNTER_ADD(block_cache_hit_count, 1); RecordTick(statistics(), BLOCK_CACHE_FILTER_HIT); + RecordTick(statistics(), BLOCK_CACHE_HIT); + RecordTick(statistics(), BLOCK_CACHE_BYTES_READ, + block_cache->GetUsage(iter->second.cache_handle)); *cached = true; - return {iter->second, nullptr}; - } - } - auto filter = - table_->GetFilter(fltr_blk_handle, is_a_filter_partition, no_io); - if (filter.IsSet()) { - WriteLock wl(&mu_); - filter_block_set_.insert(fltr_blk_handle); - if (pin_cached_filters) { - std::pair pair(fltr_blk_handle.offset(), - filter.value); - auto succ = filter_cache_.insert(pair).second; - if (succ) { - handle_list_.push_back(filter.cache_handle); - } // Otherwise it is already inserted by a concurrent thread - *cached = true; + return iter->second; } } - return filter; + return table_->GetFilter(/*prefetch_buffer*/ nullptr, fltr_blk_handle, + is_a_filter_partition, no_io); } else { - auto filter = table_->ReadFilter(fltr_blk_handle, is_a_filter_partition); + auto filter = table_->ReadFilter(prefetch_buffer, fltr_blk_handle, + is_a_filter_partition); return {filter, nullptr}; } } @@ -241,4 +243,69 @@ size_t PartitionedFilterBlockReader::ApproximateMemoryUsage() const { 
return idx_on_fltr_blk_->size(); } +// TODO(myabandeh): merge this with the same function in IndexReader +void PartitionedFilterBlockReader::CacheDependencies(bool pin) { + // Before read partitions, prefetch them to avoid lots of IOs + auto rep = table_->rep_; + BlockIter biter; + BlockHandle handle; + idx_on_fltr_blk_->NewIterator(&comparator_, &biter, true); + // Index partitions are assumed to be consecuitive. Prefetch them all. + // Read the first block offset + biter.SeekToFirst(); + Slice input = biter.value(); + Status s = handle.DecodeFrom(&input); + assert(s.ok()); + if (!s.ok()) { + ROCKS_LOG_WARN(rep->ioptions.info_log, + "Could not read first index partition"); + return; + } + uint64_t prefetch_off = handle.offset(); + + // Read the last block's offset + biter.SeekToLast(); + input = biter.value(); + s = handle.DecodeFrom(&input); + assert(s.ok()); + if (!s.ok()) { + ROCKS_LOG_WARN(rep->ioptions.info_log, + "Could not read last index partition"); + return; + } + uint64_t last_off = handle.offset() + handle.size() + kBlockTrailerSize; + uint64_t prefetch_len = last_off - prefetch_off; + std::unique_ptr prefetch_buffer; + auto& file = table_->rep_->file; + prefetch_buffer.reset(new FilePrefetchBuffer()); + s = prefetch_buffer->Prefetch(file.get(), prefetch_off, prefetch_len); + + // After prefetch, read the partitions one by one + biter.SeekToFirst(); + Cache* block_cache = rep->table_options.block_cache.get(); + for (; biter.Valid(); biter.Next()) { + input = biter.value(); + s = handle.DecodeFrom(&input); + assert(s.ok()); + if (!s.ok()) { + ROCKS_LOG_WARN(rep->ioptions.info_log, "Could not read index partition"); + continue; + } + + const bool no_io = true; + const bool is_a_filter_partition = true; + auto filter = table_->GetFilter(prefetch_buffer.get(), handle, + is_a_filter_partition, !no_io); + if (LIKELY(filter.IsSet())) { + if (pin) { + filter_map_[handle.offset()] = std::move(filter); + } else { + block_cache->Release(filter.cache_handle); + } 
+ } else { + delete filter.value; + } + } +} + } // namespace rocksdb diff --git a/table/partitioned_filter_block.h b/table/partitioned_filter_block.h index 6c4a5d7b9..1a00a86e6 100644 --- a/table/partitioned_filter_block.h +++ b/table/partitioned_filter_block.h @@ -86,21 +86,17 @@ class PartitionedFilterBlockReader : public FilterBlockReader { private: Slice GetFilterPartitionHandle(const Slice& entry); BlockBasedTable::CachableEntry GetFilterPartition( - Slice* handle, const bool no_io, bool* cached); + FilePrefetchBuffer* prefetch_buffer, Slice* handle, const bool no_io, + bool* cached); + virtual void CacheDependencies(bool pin) override; const SliceTransform* prefix_extractor_; std::unique_ptr idx_on_fltr_blk_; const Comparator& comparator_; const BlockBasedTable* table_; - std::unordered_map filter_cache_; - autovector handle_list_; - struct BlockHandleCmp { - bool operator()(const BlockHandle& lhs, const BlockHandle& rhs) const { - return lhs.offset() < rhs.offset(); - } - }; - std::set filter_block_set_; - port::RWMutex mu_; + std::unordered_map> + filter_map_; }; } // namespace rocksdb diff --git a/table/partitioned_filter_block_test.cc b/table/partitioned_filter_block_test.cc index a49143dae..1bc529ed9 100644 --- a/table/partitioned_filter_block_test.cc +++ b/table/partitioned_filter_block_test.cc @@ -22,11 +22,14 @@ std::map slices; class MockedBlockBasedTable : public BlockBasedTable { public: - explicit MockedBlockBasedTable(Rep* rep) : BlockBasedTable(rep) {} + explicit MockedBlockBasedTable(Rep* rep) : BlockBasedTable(rep) { + // Initialize what Open normally does as much as necessary for the test + rep->cache_key_prefix_size = 10; + } virtual CachableEntry GetFilter( - const BlockHandle& filter_blk_handle, const bool is_a_filter_partition, - bool no_io) const override { + FilePrefetchBuffer*, const BlockHandle& filter_blk_handle, + const bool /* unused */, bool /* unused */) const override { Slice slice = slices[filter_blk_handle.offset()]; auto obj 
= new FullFilterBlockReader( nullptr, true, BlockContents(slice, false, kNoCompression), diff --git a/table/plain_table_factory.cc b/table/plain_table_factory.cc index eadc2c099..5f7809b96 100644 --- a/table/plain_table_factory.cc +++ b/table/plain_table_factory.cc @@ -5,12 +5,15 @@ #ifndef ROCKSDB_LITE #include "table/plain_table_factory.h" -#include #include +#include #include "db/dbformat.h" +#include "options/options_helper.h" +#include "port/port.h" +#include "rocksdb/convenience.h" #include "table/plain_table_builder.h" #include "table/plain_table_reader.h" -#include "port/port.h" +#include "util/string_util.h" namespace rocksdb { @@ -81,6 +84,143 @@ const PlainTableOptions& PlainTableFactory::table_options() const { return table_options_; } +Status GetPlainTableOptionsFromString(const PlainTableOptions& table_options, + const std::string& opts_str, + PlainTableOptions* new_table_options) { + std::unordered_map opts_map; + Status s = StringToMap(opts_str, &opts_map); + if (!s.ok()) { + return s; + } + return GetPlainTableOptionsFromMap(table_options, opts_map, + new_table_options); +} + +Status GetMemTableRepFactoryFromString( + const std::string& opts_str, + std::unique_ptr* new_mem_factory) { + std::vector opts_list = StringSplit(opts_str, ':'); + size_t len = opts_list.size(); + + if (opts_list.size() <= 0 || opts_list.size() > 2) { + return Status::InvalidArgument("Can't parse memtable_factory option ", + opts_str); + } + + MemTableRepFactory* mem_factory = nullptr; + + if (opts_list[0] == "skip_list") { + // Expecting format + // skip_list: + if (2 == len) { + size_t lookahead = ParseSizeT(opts_list[1]); + mem_factory = new SkipListFactory(lookahead); + } else if (1 == len) { + mem_factory = new SkipListFactory(); + } + } else if (opts_list[0] == "prefix_hash") { + // Expecting format + // prfix_hash: + if (2 == len) { + size_t hash_bucket_count = ParseSizeT(opts_list[1]); + mem_factory = NewHashSkipListRepFactory(hash_bucket_count); + } else if (1 == 
len) { + mem_factory = NewHashSkipListRepFactory(); + } + } else if (opts_list[0] == "hash_linkedlist") { + // Expecting format + // hash_linkedlist: + if (2 == len) { + size_t hash_bucket_count = ParseSizeT(opts_list[1]); + mem_factory = NewHashLinkListRepFactory(hash_bucket_count); + } else if (1 == len) { + mem_factory = NewHashLinkListRepFactory(); + } + } else if (opts_list[0] == "vector") { + // Expecting format + // vector: + if (2 == len) { + size_t count = ParseSizeT(opts_list[1]); + mem_factory = new VectorRepFactory(count); + } else if (1 == len) { + mem_factory = new VectorRepFactory(); + } + } else if (opts_list[0] == "cuckoo") { + // Expecting format + // cuckoo: + if (2 == len) { + size_t write_buffer_size = ParseSizeT(opts_list[1]); + mem_factory = NewHashCuckooRepFactory(write_buffer_size); + } else if (1 == len) { + return Status::InvalidArgument("Can't parse memtable_factory option ", + opts_str); + } + } else { + return Status::InvalidArgument("Unrecognized memtable_factory option ", + opts_str); + } + + if (mem_factory != nullptr) { + new_mem_factory->reset(mem_factory); + } + + return Status::OK(); +} + +std::string ParsePlainTableOptions(const std::string& name, + const std::string& org_value, + PlainTableOptions* new_options, + bool input_strings_escaped = false, + bool ignore_unknown_options = false) { + const std::string& value = + input_strings_escaped ? 
UnescapeOptionString(org_value) : org_value; + const auto iter = plain_table_type_info.find(name); + if (iter == plain_table_type_info.end()) { + if (ignore_unknown_options) { + return ""; + } else { + return "Unrecognized option"; + } + } + const auto& opt_info = iter->second; + if (opt_info.verification != OptionVerificationType::kDeprecated && + !ParseOptionHelper(reinterpret_cast(new_options) + opt_info.offset, + opt_info.type, value)) { + return "Invalid value"; + } + return ""; +} + +Status GetPlainTableOptionsFromMap( + const PlainTableOptions& table_options, + const std::unordered_map& opts_map, + PlainTableOptions* new_table_options, bool input_strings_escaped, + bool ignore_unknown_options) { + assert(new_table_options); + *new_table_options = table_options; + for (const auto& o : opts_map) { + auto error_message = ParsePlainTableOptions( + o.first, o.second, new_table_options, input_strings_escaped); + if (error_message != "") { + const auto iter = plain_table_type_info.find(o.first); + if (iter == plain_table_type_info.end() || + !input_strings_escaped || // !input_strings_escaped indicates + // the old API, where everything is + // parsable. + (iter->second.verification != OptionVerificationType::kByName && + iter->second.verification != + OptionVerificationType::kByNameAllowNull && + iter->second.verification != OptionVerificationType::kDeprecated)) { + // Restore "new_options" to the default "base_options". 
+ *new_table_options = table_options; + return Status::InvalidArgument("Can't parse PlainTableOptions:", + o.first + " " + error_message); + } + } + } + return Status::OK(); +} + extern TableFactory* NewPlainTableFactory(const PlainTableOptions& options) { return new PlainTableFactory(options); } diff --git a/table/plain_table_factory.h b/table/plain_table_factory.h index 33cd31347..6c9ca44f3 100644 --- a/table/plain_table_factory.h +++ b/table/plain_table_factory.h @@ -9,6 +9,7 @@ #include #include +#include "options/options_helper.h" #include "rocksdb/options.h" #include "rocksdb/table.h" @@ -170,9 +171,40 @@ class PlainTableFactory : public TableFactory { void* GetOptions() override { return &table_options_; } + Status GetOptionString(std::string* opt_string, + const std::string& delimiter) const override { + return Status::OK(); + } + private: PlainTableOptions table_options_; }; +static std::unordered_map plain_table_type_info = { + {"user_key_len", + {offsetof(struct PlainTableOptions, user_key_len), OptionType::kUInt32T, + OptionVerificationType::kNormal, false, 0}}, + {"bloom_bits_per_key", + {offsetof(struct PlainTableOptions, bloom_bits_per_key), OptionType::kInt, + OptionVerificationType::kNormal, false, 0}}, + {"hash_table_ratio", + {offsetof(struct PlainTableOptions, hash_table_ratio), OptionType::kDouble, + OptionVerificationType::kNormal, false, 0}}, + {"index_sparseness", + {offsetof(struct PlainTableOptions, index_sparseness), OptionType::kSizeT, + OptionVerificationType::kNormal, false, 0}}, + {"huge_page_tlb_size", + {offsetof(struct PlainTableOptions, huge_page_tlb_size), + OptionType::kSizeT, OptionVerificationType::kNormal, false, 0}}, + {"encoding_type", + {offsetof(struct PlainTableOptions, encoding_type), + OptionType::kEncodingType, OptionVerificationType::kByName, false, 0}}, + {"full_scan_mode", + {offsetof(struct PlainTableOptions, full_scan_mode), OptionType::kBoolean, + OptionVerificationType::kNormal, false, 0}}, + 
{"store_index_in_file", + {offsetof(struct PlainTableOptions, store_index_in_file), + OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}}; + } // namespace rocksdb #endif // ROCKSDB_LITE diff --git a/table/plain_table_reader.cc b/table/plain_table_reader.cc index 0f9449e86..d4d9edb74 100644 --- a/table/plain_table_reader.cc +++ b/table/plain_table_reader.cc @@ -191,7 +191,6 @@ void PlainTableReader::SetupForCompaction() { InternalIterator* PlainTableReader::NewIterator(const ReadOptions& options, Arena* arena, - const InternalKeyComparator*, bool skip_filters) { bool use_prefix_seek = !IsTotalOrderMode() && !options.total_order_seek; if (arena == nullptr) { @@ -292,9 +291,10 @@ Status PlainTableReader::PopulateIndex(TableProperties* props, table_properties_.reset(props); BlockContents index_block_contents; - Status s = ReadMetaBlock( - file_info_.file.get(), file_size_, kPlainTableMagicNumber, ioptions_, - PlainTableIndexBuilder::kPlainTableIndexBlock, &index_block_contents); + Status s = ReadMetaBlock(file_info_.file.get(), nullptr /* prefetch_buffer */, + file_size_, kPlainTableMagicNumber, ioptions_, + PlainTableIndexBuilder::kPlainTableIndexBlock, + &index_block_contents); bool index_in_file = s.ok(); @@ -302,9 +302,9 @@ Status PlainTableReader::PopulateIndex(TableProperties* props, bool bloom_in_file = false; // We only need to read the bloom block if index block is in file. 
if (index_in_file) { - s = ReadMetaBlock(file_info_.file.get(), file_size_, kPlainTableMagicNumber, - ioptions_, BloomBlockBuilder::kBloomBlock, - &bloom_block_contents); + s = ReadMetaBlock(file_info_.file.get(), nullptr /* prefetch_buffer */, + file_size_, kPlainTableMagicNumber, ioptions_, + BloomBlockBuilder::kBloomBlock, &bloom_block_contents); bloom_in_file = s.ok() && bloom_block_contents.data.size() > 0; } diff --git a/table/plain_table_reader.h b/table/plain_table_reader.h index 236bab4fd..6bf8da2f9 100644 --- a/table/plain_table_reader.h +++ b/table/plain_table_reader.h @@ -71,7 +71,7 @@ class PlainTableReader: public TableReader { public: static Status Open(const ImmutableCFOptions& ioptions, const EnvOptions& env_options, - const InternalKeyComparator& icomparator, + const InternalKeyComparator& internal_comparator, unique_ptr&& file, uint64_t file_size, unique_ptr* table, const int bloom_bits_per_key, double hash_table_ratio, @@ -80,7 +80,6 @@ class PlainTableReader: public TableReader { InternalIterator* NewIterator(const ReadOptions&, Arena* arena = nullptr, - const InternalKeyComparator* = nullptr, bool skip_filters = false) override; void Prepare(const Slice& target) override; diff --git a/table/table_builder.h b/table/table_builder.h index ef2e608ed..e5e7d6e22 100644 --- a/table/table_builder.h +++ b/table/table_builder.h @@ -55,7 +55,7 @@ struct TableBuilderOptions { const CompressionOptions& _compression_opts, const std::string* _compression_dict, bool _skip_filters, const std::string& _column_family_name, int _level, - const uint64_t _creation_time = 0) + const uint64_t _creation_time = 0, const int64_t _oldest_key_time = 0) : ioptions(_ioptions), internal_comparator(_internal_comparator), int_tbl_prop_collector_factories(_int_tbl_prop_collector_factories), @@ -65,7 +65,8 @@ struct TableBuilderOptions { skip_filters(_skip_filters), column_family_name(_column_family_name), level(_level), - creation_time(_creation_time) {} + 
creation_time(_creation_time), + oldest_key_time(_oldest_key_time) {} const ImmutableCFOptions& ioptions; const InternalKeyComparator& internal_comparator; const std::vector>* @@ -78,6 +79,7 @@ struct TableBuilderOptions { const std::string& column_family_name; int level; // what level this table/file is on, -1 for "not set, don't know" const uint64_t creation_time; + const int64_t oldest_key_time; }; // TableBuilder provides the interface used to build a Table diff --git a/table/table_properties.cc b/table/table_properties.cc index ef77ae566..24453f6f9 100644 --- a/table/table_properties.cc +++ b/table/table_properties.cc @@ -139,6 +139,9 @@ std::string TableProperties::ToString( AppendProperty(result, "creation time", creation_time, prop_delim, kv_delim); + AppendProperty(result, "time stamp of earliest key", oldest_key_time, + prop_delim, kv_delim); + return result; } @@ -191,6 +194,8 @@ const std::string TablePropertiesNames::kPropertyCollectors = "rocksdb.property.collectors"; const std::string TablePropertiesNames::kCompression = "rocksdb.compression"; const std::string TablePropertiesNames::kCreationTime = "rocksdb.creation.time"; +const std::string TablePropertiesNames::kOldestKeyTime = + "rocksdb.oldest.key.time"; extern const std::string kPropertiesBlock = "rocksdb.properties"; // Old property block name for backward compatibility diff --git a/table/table_reader.h b/table/table_reader.h index 5f47468e6..18fcda273 100644 --- a/table/table_reader.h +++ b/table/table_reader.h @@ -40,7 +40,6 @@ class TableReader { // option is effective only for block-based table format. 
virtual InternalIterator* NewIterator(const ReadOptions&, Arena* arena = nullptr, - const InternalKeyComparator* = nullptr, bool skip_filters = false) = 0; virtual InternalIterator* NewRangeTombstoneIterator( @@ -99,6 +98,11 @@ class TableReader { return Status::NotSupported("DumpTable() not supported"); } + // check whether there is corruption in this db file + virtual Status VerifyChecksum() { + return Status::NotSupported("VerifyChecksum() not supported"); + } + virtual void Close() {} }; diff --git a/table/table_test.cc b/table/table_test.cc index c55eb4255..178cf4243 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -2174,7 +2174,7 @@ std::map MockCache::marked_data_in_cache_; // Block cache can contain raw data blocks as well as general objects. If an // object depends on the table to be live, it then must be destructed before the -// table is closed. This test makese sure that the only items remains in the +// table is closed. This test makes sure that the only items remains in the // cache after the table is closed are raw data blocks. 
TEST_F(BlockBasedTableTest, NoObjectInCacheAfterTableClose) { for (auto index_type : diff --git a/thirdparty.inc b/thirdparty.inc index 9fffd9bff..a364d1d44 100644 --- a/thirdparty.inc +++ b/thirdparty.inc @@ -8,8 +8,6 @@ set(USE_SNAPPY_DEFAULT 0) # SNAPPY is disabled by default, enable with -D set(USE_LZ4_DEFAULT 0) # LZ4 is disabled by default, enable with -DLZ4=1 cmake command line agrument set(USE_ZLIB_DEFAULT 0) # ZLIB is disabled by default, enable with -DZLIB=1 cmake command line agrument set(USE_XPRESS_DEFAULT 0) # XPRESS is disabled by default, enable with -DXPRESS=1 cmake command line agrument -set(USE_JEMALLOC_DEFAULT 0) # JEMALLOC is disabled by default, enable with -DJEMALLOC=1 cmake command line agrument -set(USE_JENONINIT_DEFAULT 1) # Default is enabled do not call je_init/je_uninit as the newer versions do not have it disable with -DJENONINIT=0 # # This example assumes all the libraries locate in directories under THIRDPARTY_HOME environment variable @@ -219,15 +217,15 @@ set(JEMALLOC_LIB_RELEASE ${JEMALLOC_HOME}/bin/retail/amd64/jemalloc.lib) # # Don't touch these lines # -if (DEFINED JEMALLOC) - set(USE_JEMALLOC ${JEMALLOC}) -else () - set(USE_JEMALLOC ${USE_JEMALLOC_DEFAULT}) -endif () -if (${USE_JEMALLOC} EQUAL 1) +# For compatibilty with previous +if(JEMALLOC) + set(WITH_JEMALLOC ON) +endif() + +if (WITH_JEMALLOC) message(STATUS "JEMALLOC library is enabled") - set(JEMALLOC_CXX_FLAGS "-DJEMALLOC -DJEMALLOC_EXPORT= ") + set(JEMALLOC_CXX_FLAGS "-DROCKSDB_JEMALLOC -DJEMALLOC_EXPORT= ") if(DEFINED ENV{JEMALLOC_INCLUDE}) set(JEMALLOC_INCLUDE $ENV{JEMALLOC_INCLUDE}) @@ -248,16 +246,7 @@ if (${USE_JEMALLOC} EQUAL 1) set (THIRDPARTY_LIBS ${THIRDPARTY_LIBS} ${JEMALLOC_LIBS}) set (ARTIFACT_SUFFIX "_je") - set(USE_JENONINIT USE_JENONINIT_DEFAULT) - - if(JENONINIT) - set(USE_JENONINIT ${JENONINIT}) - endif() - - if(${USE_JENONINIT} EQUAL 1) - add_definitions(-DJEMALLOC_NON_INIT) - message(STATUS "JEMALLOC NONINIT version") - endif() + set(WITH_JEMALLOC ON) 
else () set (ARTIFACT_SUFFIX "") diff --git a/tools/benchmark.sh b/tools/benchmark.sh index 46e1c6567..1a2c38439 100755 --- a/tools/benchmark.sh +++ b/tools/benchmark.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # REQUIRE: db_bench binary exists in the current directory if [ $# -ne 1 ]; then diff --git a/tools/benchmark_leveldb.sh b/tools/benchmark_leveldb.sh index dce66d47a..776996980 100755 --- a/tools/benchmark_leveldb.sh +++ b/tools/benchmark_leveldb.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # REQUIRE: db_bench binary exists in the current directory # # This should be used with the LevelDB fork listed here to use additional test options. diff --git a/tools/check_format_compatible.sh b/tools/check_format_compatible.sh index 300aaf089..801648963 100755 --- a/tools/check_format_compatible.sh +++ b/tools/check_format_compatible.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # # A shell script to load some pre generated data file to a DB using ldb tool # ./ldb needs to be avaible to be executed. 
@@ -41,8 +41,9 @@ with open('${input_data[$i]}', 'w') as f: EOF done -declare -a checkout_objs=("2.2.fb.branch" "2.3.fb.branch" "2.4.fb.branch" "2.5.fb.branch" "2.6.fb.branch" "2.7.fb.branch" "2.8.1.fb" "3.0.fb.branch" "3.1.fb" "3.2.fb" "3.3.fb" "3.4.fb" "3.5.fb" "3.6.fb" "3.7.fb" "3.8.fb" "3.9.fb" "3.10.fb" "3.11.fb" "3.12.fb" "3.13.fb" "4.0.fb" "4.1.fb" "4.2.fb" "4.3.fb" "4.4.fb" "4.5.fb" "4.6.fb" "4.7.fb" "4.8.fb" "4.9.fb" "4.10.fb" "4.11.fb" "4.12.fb" "4.13.fb" "5.0.fb" "5.1.fb" "5.2.fb" "5.3.fb" "5.4.fb" "5.5.fb" "5.6.fb") +declare -a backward_compatible_checkout_objs=("2.2.fb.branch" "2.3.fb.branch" "2.4.fb.branch" "2.5.fb.branch" "2.6.fb.branch" "2.7.fb.branch" "2.8.1.fb" "3.0.fb.branch" "3.1.fb" "3.2.fb" "3.3.fb" "3.4.fb" "3.5.fb" "3.6.fb" "3.7.fb" "3.8.fb" "3.9.fb") declare -a forward_compatible_checkout_objs=("3.10.fb" "3.11.fb" "3.12.fb" "3.13.fb" "4.0.fb" "4.1.fb" "4.2.fb" "4.3.fb" "4.4.fb" "4.5.fb" "4.6.fb" "4.7.fb" "4.8.fb" "4.9.fb" "4.10.fb" "4.11.fb" "4.12.fb" "4.13.fb" "5.0.fb" "5.1.fb" "5.2.fb" "5.3.fb" "5.4.fb" "5.5.fb" "5.6.fb") +declare -a checkout_objs=(${backward_compatible_checkout_objs[@]} ${forward_compatible_checkout_objs[@]}) generate_db() { @@ -76,7 +77,7 @@ https_proxy="fwdproxy:8080" git fetch github_origin for checkout_obj in "${checkout_objs[@]}" do echo == Generating DB from "$checkout_obj" ... 
- git checkout $checkout_obj + https_proxy="fwdproxy:8080" git checkout github_origin/$checkout_obj -b $checkout_obj make clean make ldb -j32 generate_db $input_data_path $test_dir/$checkout_obj @@ -85,7 +86,7 @@ done checkout_flag=${1:-"master"} echo == Building $checkout_flag debug -git checkout $checkout_flag +https_proxy="fwdproxy:8080" git checkout github_origin/$checkout_flag -b tmp-$checkout_flag make clean make ldb -j32 compare_base_db_dir=$test_dir"/base_db_dir" diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc index 0cc424eea..0f8909543 100644 --- a/tools/db_bench_tool.cc +++ b/tools/db_bench_tool.cc @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -57,6 +58,7 @@ #include "rocksdb/utilities/transaction.h" #include "rocksdb/utilities/transaction_db.h" #include "rocksdb/write_batch.h" +#include "util/cast_util.h" #include "util/compression.h" #include "util/crc32c.h" #include "util/mutexlock.h" @@ -204,6 +206,15 @@ DEFINE_int32( "create new set of column families and insert to them. Only used " "when num_column_families > 1."); +DEFINE_string(column_family_distribution, "", + "Comma-separated list of percentages, where the ith element " + "indicates the probability of an op using the ith column family. " + "The number of elements must be `num_hot_column_families` if " + "specified; otherwise, it must be `num_column_families`. The " + "sum of elements must be 100. E.g., if `num_column_families=4`, " + "and `num_hot_column_families=0`, a valid list could be " + "\"10,20,30,40\"."); + DEFINE_int64(reads, -1, "Number of read operations to do. 
" "If negative, do FLAGS_num reads."); @@ -316,6 +327,18 @@ DEFINE_int32(max_background_jobs, "The maximum number of concurrent background jobs that can occur " "in parallel."); +DEFINE_int32(num_bottom_pri_threads, 0, + "The number of threads in the bottom-priority thread pool (used " + "by universal compaction only)."); + +DEFINE_int32(num_high_pri_threads, 0, + "The maximum number of concurrent background compactions" + " that can occur in parallel."); + +DEFINE_int32(num_low_pri_threads, 0, + "The maximum number of concurrent background compactions" + " that can occur in parallel."); + DEFINE_int32(max_background_compactions, rocksdb::Options().max_background_compactions, "The maximum number of concurrent background compactions" @@ -434,7 +457,7 @@ DEFINE_int32(file_opening_threads, rocksdb::Options().max_file_opening_threads, "If open_files is set to -1, this option set the number of " "threads that will be used to open files during DB::Open()"); -DEFINE_int32(new_table_reader_for_compaction_inputs, true, +DEFINE_bool(new_table_reader_for_compaction_inputs, true, "If true, uses a separate file handle for compaction inputs"); DEFINE_int32(compaction_readahead_size, 0, "Compaction readahead size"); @@ -1184,6 +1207,8 @@ struct DBWithColumnFamilies { // After each CreateNewCf(), another num_hot number of new // Column families will be created and used to be queried. port::Mutex create_cf_mutex; // Only one thread can execute CreateNewCf() + std::vector cfh_idx_to_prob; // ith index holds probability of operating + // on cfh[i]. 
DBWithColumnFamilies() : db(nullptr) @@ -1203,7 +1228,9 @@ struct DBWithColumnFamilies { opt_txn_db(other.opt_txn_db), #endif // ROCKSDB_LITE num_created(other.num_created.load()), - num_hot(other.num_hot) {} + num_hot(other.num_hot), + cfh_idx_to_prob(other.cfh_idx_to_prob) { + } void DeleteDBs() { std::for_each(cfh.begin(), cfh.end(), @@ -1225,8 +1252,20 @@ struct DBWithColumnFamilies { ColumnFamilyHandle* GetCfh(int64_t rand_num) { assert(num_hot > 0); + size_t rand_offset = 0; + if (!cfh_idx_to_prob.empty()) { + assert(cfh_idx_to_prob.size() == num_hot); + int sum = 0; + while (sum + cfh_idx_to_prob[rand_offset] < rand_num % 100) { + sum += cfh_idx_to_prob[rand_offset]; + ++rand_offset; + } + assert(rand_offset < cfh_idx_to_prob.size()); + } else { + rand_offset = rand_num % num_hot; + } return cfh[num_created.load(std::memory_order_acquire) - num_hot + - rand_num % num_hot]; + rand_offset]; } // stage: assume CF from 0 to stage * num_hot has be created. Need to create @@ -2551,7 +2590,9 @@ void VerifyDBFromDB(std::string& truth_db_name) { } if (FLAGS_simcache_size >= 0) { fprintf(stdout, "SIMULATOR CACHE STATISTICS:\n%s\n", - std::dynamic_pointer_cast(cache_)->ToString().c_str()); + static_cast_with_check(cache_.get()) + ->ToString() + .c_str()); } } @@ -2852,7 +2893,6 @@ void VerifyDBFromDB(std::string& truth_db_name) { assert(db_.db == nullptr); - options.create_missing_column_families = FLAGS_num_column_families > 1; options.max_open_files = FLAGS_open_files; if (FLAGS_cost_write_buffer_to_cache || FLAGS_db_write_buffer_size != 0) { options.write_buffer_manager.reset( @@ -3199,6 +3239,7 @@ void VerifyDBFromDB(std::string& truth_db_name) { void InitializeOptionsGeneral(Options* opts) { Options& options = *opts; + options.create_missing_column_families = FLAGS_num_column_families > 1; options.statistics = dbstats; options.wal_dir = FLAGS_wal_dir; options.create_if_missing = !FLAGS_use_existing_db; @@ -3259,6 +3300,28 @@ void VerifyDBFromDB(std::string& 
truth_db_name) { column_families.push_back(ColumnFamilyDescriptor( ColumnFamilyName(i), ColumnFamilyOptions(options))); } + std::vector cfh_idx_to_prob; + if (!FLAGS_column_family_distribution.empty()) { + std::stringstream cf_prob_stream(FLAGS_column_family_distribution); + std::string cf_prob; + int sum = 0; + while (std::getline(cf_prob_stream, cf_prob, ',')) { + cfh_idx_to_prob.push_back(std::stoi(cf_prob)); + sum += cfh_idx_to_prob.back(); + } + if (sum != 100) { + fprintf(stderr, "column_family_distribution items must sum to 100\n"); + exit(1); + } + if (cfh_idx_to_prob.size() != num_hot) { + fprintf(stderr, + "got %" ROCKSDB_PRIszt + " column_family_distribution items; expected " + "%" ROCKSDB_PRIszt "\n", + cfh_idx_to_prob.size(), num_hot); + exit(1); + } + } #ifndef ROCKSDB_LITE if (FLAGS_readonly) { s = DB::OpenForReadOnly(options, db_name, column_families, @@ -3286,6 +3349,7 @@ void VerifyDBFromDB(std::string& truth_db_name) { db->cfh.resize(FLAGS_num_column_families); db->num_created = num_hot; db->num_hot = num_hot; + db->cfh_idx_to_prob = std::move(cfh_idx_to_prob); #ifndef ROCKSDB_LITE } else if (FLAGS_readonly) { s = DB::OpenForReadOnly(options, db_name, &db->db); @@ -3307,10 +3371,7 @@ void VerifyDBFromDB(std::string& truth_db_name) { } else if (FLAGS_use_blob_db) { blob_db::BlobDBOptions blob_db_options; blob_db::BlobDB* ptr; - s = CreateLoggerFromOptions(db_name, options, &options.info_log); - if (s.ok()) { - s = blob_db::BlobDB::Open(options, blob_db_options, db_name, &ptr); - } + s = blob_db::BlobDB::Open(options, blob_db_options, db_name, &ptr); if (s.ok()) { db->db = ptr; } @@ -5236,11 +5297,14 @@ int db_bench_tool(int argc, char** argv) { FLAGS_rep_factory = StringToRepFactory(FLAGS_memtablerep.c_str()); - // The number of background threads should be at least as much the - // max number of concurrent compactions. 
- FLAGS_env->SetBackgroundThreads(FLAGS_max_background_compactions); - FLAGS_env->SetBackgroundThreads(FLAGS_max_background_flushes, + // Note options sanitization may increase thread pool sizes according to + // max_background_flushes/max_background_compactions/max_background_jobs + FLAGS_env->SetBackgroundThreads(FLAGS_num_high_pri_threads, rocksdb::Env::Priority::HIGH); + FLAGS_env->SetBackgroundThreads(FLAGS_num_bottom_pri_threads, + rocksdb::Env::Priority::BOTTOM); + FLAGS_env->SetBackgroundThreads(FLAGS_num_low_pri_threads, + rocksdb::Env::Priority::LOW); // Choose a location for the test database if none given with --db= if (FLAGS_db.empty()) { diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py index 83c1e013b..d64da7ac1 100644 --- a/tools/db_crashtest.py +++ b/tools/db_crashtest.py @@ -44,6 +44,7 @@ "verify_checksum": 1, "write_buffer_size": 4 * 1024 * 1024, "writepercent": 35, + "log2_keys_per_lock": 2, "subcompactions": lambda: random.randint(1, 4), "use_merge": lambda: random.randint(0, 1), "use_full_merge_v1": lambda: random.randint(0, 1), diff --git a/tools/db_stress.cc b/tools/db_stress.cc index db905f0c8..d18eeab0c 100644 --- a/tools/db_stress.cc +++ b/tools/db_stress.cc @@ -44,6 +44,7 @@ int main() { #include "db/version_set.h" #include "hdfs/env_hdfs.h" #include "monitoring/histogram.h" +#include "options/options_helper.h" #include "port/port.h" #include "rocksdb/cache.h" #include "rocksdb/env.h" @@ -72,6 +73,8 @@ using GFLAGS::RegisterFlagValidator; using GFLAGS::SetUsageMessage; static const long KB = 1024; +static const int kRandomValueMaxFactor = 3; +static const int kValueMaxLen = 100; static bool ValidateUint32Range(const char* flagname, uint64_t value) { if (value > std::numeric_limits::max()) { @@ -93,6 +96,13 @@ DEFINE_int64(max_key, 1 * KB* KB, DEFINE_int32(column_families, 10, "Number of column families"); +DEFINE_int64( + active_width, 0, + "Number of keys in active span of the key-range at any given time. 
The " + "span begins with its left endpoint at key 0, gradually moves rightwards, " + "and ends with its right endpoint at max_key. If set to 0, active_width " + "will be sanitized to be equal to max_key."); + // TODO(noetzli) Add support for single deletes DEFINE_bool(test_batches_snapshots, false, "If set, the test uses MultiGet(), MultiPut() and MultiDelete()" @@ -167,6 +177,11 @@ DEFINE_int32(max_write_buffer_number_to_maintain, "after they are flushed. If this value is set to -1, " "'max_write_buffer_number' will be used."); +DEFINE_double(memtable_prefix_bloom_size_ratio, + rocksdb::Options().memtable_prefix_bloom_size_ratio, + "creates prefix blooms for memtables, each with size " + "`write_buffer_size * memtable_prefix_bloom_size_ratio`."); + DEFINE_int32(open_files, rocksdb::Options().max_open_files, "Maximum number of files to keep open at the same time " "(use default if == 0)"); @@ -198,6 +213,10 @@ DEFINE_int32(max_background_compactions, "The maximum number of concurrent background compactions " "that can occur in parallel."); +DEFINE_int32(num_bottom_pri_threads, 0, + "The number of threads in the bottom-priority thread pool (used " + "by universal compaction only)."); + DEFINE_int32(compaction_thread_pool_adjust_interval, 0, "The interval (in milliseconds) to adjust compaction thread pool " "size. 
Don't change it periodically if the value is 0."); @@ -310,13 +329,15 @@ extern std::vector rocksdb_kill_prefix_blacklist; DEFINE_bool(disable_wal, false, "If true, do not write WAL for write."); -DEFINE_int32(target_file_size_base, 64 * KB, +DEFINE_int64(target_file_size_base, rocksdb::Options().target_file_size_base, "Target level-1 file size for compaction"); DEFINE_int32(target_file_size_multiplier, 1, "A multiplier to compute target level-N file size (N >= 2)"); -DEFINE_uint64(max_bytes_for_level_base, 256 * KB, "Max bytes for level-1"); +DEFINE_uint64(max_bytes_for_level_base, + rocksdb::Options().max_bytes_for_level_base, + "Max bytes for level-1"); DEFINE_double(max_bytes_for_level_multiplier, 2, "A multiplier to compute max bytes for level-N (N >= 2)"); @@ -406,10 +427,30 @@ enum rocksdb::CompressionType StringToCompressionType(const char* ctype) { else if (!strcasecmp(ctype, "zstd")) return rocksdb::kZSTD; - fprintf(stdout, "Cannot parse compression type '%s'\n", ctype); + fprintf(stderr, "Cannot parse compression type '%s'\n", ctype); return rocksdb::kSnappyCompression; //default value } +enum rocksdb::ChecksumType StringToChecksumType(const char* ctype) { + assert(ctype); + auto iter = rocksdb::checksum_type_string_map.find(ctype); + if (iter != rocksdb::checksum_type_string_map.end()) { + return iter->second; + } + fprintf(stderr, "Cannot parse checksum type '%s'\n", ctype); + return rocksdb::kCRC32c; +} + +std::string ChecksumTypeToString(rocksdb::ChecksumType ctype) { + auto iter = std::find_if( + rocksdb::checksum_type_string_map.begin(), + rocksdb::checksum_type_string_map.end(), + [&](const std::pair& + name_and_enum_val) { return name_and_enum_val.second == ctype; }); + assert(iter != rocksdb::checksum_type_string_map.end()); + return iter->first; +} + std::vector SplitString(std::string src) { std::vector ret; if (src.empty()) { @@ -431,6 +472,9 @@ DEFINE_string(compression_type, "snappy", static enum rocksdb::CompressionType 
FLAGS_compression_type_e = rocksdb::kSnappyCompression; +DEFINE_string(checksum_type, "kCRC32c", "Algorithm to use to checksum blocks"); +static enum rocksdb::ChecksumType FLAGS_checksum_type_e = rocksdb::kCRC32c; + DEFINE_string(hdfs, "", "Name of hdfs environment"); // posix or hdfs environment static rocksdb::Env* FLAGS_env = rocksdb::Env::Default(); @@ -1114,8 +1158,6 @@ class StressTest { ToString(FLAGS_write_buffer_size / 4), ToString(FLAGS_write_buffer_size / 8), }}, - {"memtable_prefix_bloom_bits", {"0", "8", "10"}}, - {"memtable_prefix_bloom_probes", {"4", "5", "6"}}, {"memtable_huge_page_size", {"0", ToString(2 * 1024 * 1024)}}, {"max_successive_merges", {"0", "2", "4"}}, {"inplace_update_num_locks", {"100", "200", "300"}}, @@ -1243,7 +1285,7 @@ class StressTest { threads[i] = nullptr; } auto now = FLAGS_env->NowMicros(); - if (!FLAGS_test_batches_snapshots) { + if (!FLAGS_test_batches_snapshots && !shared.HasVerificationFailedYet()) { fprintf(stdout, "%s Verification successful\n", FLAGS_env->TimeToString(now/1000000).c_str()); } @@ -1727,7 +1769,11 @@ class StressTest { } #endif // !ROCKSDB_LITE - long rand_key = thread->rand.Next() % max_key; + const double completed_ratio = + static_cast(i) / FLAGS_ops_per_thread; + const int64_t base_key = static_cast( + completed_ratio * (FLAGS_max_key - FLAGS_active_width)); + long rand_key = base_key + thread->rand.Next() % FLAGS_active_width; int rand_column_family = thread->rand.Next() % FLAGS_column_families; std::string keystr = Key(rand_key); Slice key = keystr; @@ -2010,7 +2056,7 @@ class StressTest { return false; } // compare value_from_db with the value in the shared state - char value[100]; + char value[kValueMaxLen]; uint32_t value_base = shared->Get(cf, key); if (value_base == SharedState::SENTINEL && !strict) { return true; @@ -2053,7 +2099,8 @@ class StressTest { } static size_t GenerateValue(uint32_t rand, char *v, size_t max_sz) { - size_t value_sz = ((rand % 3) + 1) * FLAGS_value_size_mult; + 
size_t value_sz = + ((rand % kRandomValueMaxFactor) + 1) * FLAGS_value_size_mult; assert(value_sz <= max_sz && value_sz >= sizeof(uint32_t)); *((uint32_t*)v) = rand; for (size_t i=sizeof(uint32_t); i < value_sz; i++) { @@ -2105,6 +2152,8 @@ class StressTest { 1 << FLAGS_log2_keys_per_lock); std::string compression = CompressionTypeToString(FLAGS_compression_type_e); fprintf(stdout, "Compression : %s\n", compression.c_str()); + std::string checksum = ChecksumTypeToString(FLAGS_checksum_type_e); + fprintf(stdout, "Checksum type : %s\n", checksum.c_str()); fprintf(stdout, "Max subcompactions : %" PRIu64 "\n", FLAGS_subcompactions); @@ -2139,6 +2188,7 @@ class StressTest { BlockBasedTableOptions block_based_options; block_based_options.block_cache = cache_; block_based_options.block_cache_compressed = compressed_cache_; + block_based_options.checksum = FLAGS_checksum_type_e; block_based_options.block_size = FLAGS_block_size; block_based_options.format_version = 2; block_based_options.filter_policy = filter_policy_; @@ -2151,6 +2201,8 @@ class StressTest { FLAGS_min_write_buffer_number_to_merge; options_.max_write_buffer_number_to_maintain = FLAGS_max_write_buffer_number_to_maintain; + options_.memtable_prefix_bloom_size_ratio = + FLAGS_memtable_prefix_bloom_size_ratio; options_.max_background_compactions = FLAGS_max_background_compactions; options_.max_background_flushes = FLAGS_max_background_flushes; options_.compaction_style = @@ -2387,6 +2439,7 @@ int main(int argc, char** argv) { } FLAGS_compression_type_e = StringToCompressionType(FLAGS_compression_type.c_str()); + FLAGS_checksum_type_e = StringToChecksumType(FLAGS_checksum_type.c_str()); if (!FLAGS_hdfs.empty()) { FLAGS_env = new rocksdb::HdfsEnv(FLAGS_hdfs); } @@ -2395,7 +2448,8 @@ int main(int argc, char** argv) { // The number of background threads should be at least as much the // max number of concurrent compactions. 
FLAGS_env->SetBackgroundThreads(FLAGS_max_background_compactions); - + FLAGS_env->SetBackgroundThreads(FLAGS_num_bottom_pri_threads, + rocksdb::Env::Priority::BOTTOM); if (FLAGS_prefixpercent > 0 && FLAGS_prefix_size <= 0) { fprintf(stderr, "Error: prefixpercent is non-zero while prefix_size is " @@ -2408,6 +2462,12 @@ int main(int argc, char** argv) { "test_batches_snapshots test!\n"); exit(1); } + if (FLAGS_memtable_prefix_bloom_size_ratio > 0.0 && FLAGS_prefix_size <= 0) { + fprintf(stderr, + "Error: please specify positive prefix_size in order to use " + "memtable_prefix_bloom_size_ratio\n"); + exit(1); + } if ((FLAGS_readpercent + FLAGS_prefixpercent + FLAGS_writepercent + FLAGS_delpercent + FLAGS_delrangepercent + FLAGS_iterpercent) != 100) { @@ -2433,6 +2493,17 @@ int main(int argc, char** argv) { "test_batches_snapshots mode\n"); exit(1); } + if (FLAGS_active_width > FLAGS_max_key) { + fprintf(stderr, "Error: active_width can be at most max_key\n"); + exit(1); + } else if (FLAGS_active_width == 0) { + FLAGS_active_width = FLAGS_max_key; + } + if (FLAGS_value_size_mult * kRandomValueMaxFactor > kValueMaxLen) { + fprintf(stderr, "Error: value_size_mult can be at most %d\n", + kValueMaxLen / kRandomValueMaxFactor); + exit(1); + } // Choose a location for the test database if none given with --db= if (FLAGS_db.empty()) { diff --git a/tools/dbench_monitor b/tools/dbench_monitor index 10726dc23..d85f9d070 100755 --- a/tools/dbench_monitor +++ b/tools/dbench_monitor @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # #(c) 2004-present, Facebook Inc. All rights reserved. # diff --git a/tools/generate_random_db.sh b/tools/generate_random_db.sh index 28bdceb2b..e10843bab 100755 --- a/tools/generate_random_db.sh +++ b/tools/generate_random_db.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # # A shell script to load some pre generated data file to a DB using ldb tool # ./ldb needs to be avaible to be executed. 
diff --git a/tools/ldb_cmd.cc b/tools/ldb_cmd.cc index 2cd4d94d1..c8b6221a5 100644 --- a/tools/ldb_cmd.cc +++ b/tools/ldb_cmd.cc @@ -29,6 +29,7 @@ #include "table/scoped_arena_iterator.h" #include "tools/ldb_cmd_impl.h" #include "tools/sst_dump_tool_imp.h" +#include "util/cast_util.h" #include "util/coding.h" #include "util/filename.h" #include "util/stderr_logger.h" @@ -1493,8 +1494,7 @@ void DBDumperCommand::DoDumpCommand() { if (max_keys == 0) break; if (is_db_ttl_) { - TtlIterator* it_ttl = dynamic_cast(iter); - assert(it_ttl); + TtlIterator* it_ttl = static_cast_with_check(iter); rawtime = it_ttl->timestamp(); if (rawtime < ttl_start || rawtime >= ttl_end) { continue; @@ -2291,8 +2291,7 @@ void ScanCommand::DoCommand() { it->Valid() && (!end_key_specified_ || it->key().ToString() < end_key_); it->Next()) { if (is_db_ttl_) { - TtlIterator* it_ttl = dynamic_cast(it); - assert(it_ttl); + TtlIterator* it_ttl = static_cast_with_check(it); int rawtime = it_ttl->timestamp(); if (rawtime < ttl_start || rawtime >= ttl_end) { continue; diff --git a/tools/pflag b/tools/pflag index adfac23bc..f3394a666 100755 --- a/tools/pflag +++ b/tools/pflag @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # #(c) 2004-present, Facebook, all rights reserved. # See the LICENSE file for usage and distribution rights. diff --git a/tools/rdb/rdb b/tools/rdb/rdb index 82cd17fb7..05da1158b 100755 --- a/tools/rdb/rdb +++ b/tools/rdb/rdb @@ -1,3 +1,3 @@ -#!/bin/bash +#!/usr/bin/env bash node -e "RDB = require('./build/Release/rdb').DBWrapper; console.log('Loaded rocksdb in variable RDB'); repl = require('repl').start('> ');" diff --git a/tools/regression_test.sh b/tools/regression_test.sh index 7801da14f..58558bbe4 100755 --- a/tools/regression_test.sh +++ b/tools/regression_test.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # The RocksDB regression test script. 
# REQUIREMENT: must be able to run make db_bench in the current directory # diff --git a/tools/run_flash_bench.sh b/tools/run_flash_bench.sh index 76c16bb59..4d9d0d557 100755 --- a/tools/run_flash_bench.sh +++ b/tools/run_flash_bench.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # REQUIRE: benchmark.sh exists in the current directory # After execution of this script, log files are generated in $output_dir. # report.txt provides a high level statistics diff --git a/tools/run_leveldb.sh b/tools/run_leveldb.sh index 884312e3d..de628c310 100755 --- a/tools/run_leveldb.sh +++ b/tools/run_leveldb.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # REQUIRE: benchmark_leveldb.sh exists in the current directory # After execution of this script, log files are generated in $output_dir. # report.txt provides a high level statistics diff --git a/tools/sst_dump_test.cc b/tools/sst_dump_test.cc index 0a222afa3..460b5a2cc 100644 --- a/tools/sst_dump_test.cc +++ b/tools/sst_dump_test.cc @@ -195,7 +195,7 @@ TEST_F(SSTDumpToolTest, CompressedSizes) { } snprintf(usage[0], optLength, "./sst_dump"); - snprintf(usage[1], optLength, "--show_compression_sizes"); + snprintf(usage[1], optLength, "--command=recompress"); snprintf(usage[2], optLength, "--file=rocksdb_sst_test.sst"); rocksdb::SSTDumpTool tool; ASSERT_TRUE(!tool.Run(3, usage)); diff --git a/tools/sst_dump_tool.cc b/tools/sst_dump_tool.cc index 2a1729c76..4dca284cc 100644 --- a/tools/sst_dump_tool.cc +++ b/tools/sst_dump_tool.cc @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -42,8 +43,6 @@ namespace rocksdb { -using std::dynamic_pointer_cast; - SstFileReader::SstFileReader(const std::string& file_path, bool verify_checksum, bool output_hex) @@ -61,6 +60,17 @@ extern const uint64_t kLegacyPlainTableMagicNumber; const char* testFileName = "test_file_name"; +static const std::vector> + kCompressions = { + {CompressionType::kNoCompression, "kNoCompression"}, + 
{CompressionType::kSnappyCompression, "kSnappyCompression"}, + {CompressionType::kZlibCompression, "kZlibCompression"}, + {CompressionType::kBZip2Compression, "kBZip2Compression"}, + {CompressionType::kLZ4Compression, "kLZ4Compression"}, + {CompressionType::kLZ4HCCompression, "kLZ4HCCompression"}, + {CompressionType::kXpressCompression, "kXpressCompression"}, + {CompressionType::kZSTD, "kZSTD"}}; + Status SstFileReader::GetTableReader(const std::string& file_path) { // Warning about 'magic_number' being uninitialized shows up only in UBsan // builds. Though access is guarded by 's.ok()' checks, fix the issue to @@ -80,7 +90,8 @@ Status SstFileReader::GetTableReader(const std::string& file_path) { file_.reset(new RandomAccessFileReader(std::move(file), file_path)); if (s.ok()) { - s = ReadFooterFromFile(file_.get(), file_size, &footer); + s = ReadFooterFromFile(file_.get(), nullptr /* prefetch_buffer */, + file_size, &footer); } if (s.ok()) { magic_number = footer.table_magic_number(); @@ -115,24 +126,23 @@ Status SstFileReader::NewTableReader( unique_ptr* table_reader) { // We need to turn off pre-fetching of index and filter nodes for // BlockBasedTable - shared_ptr block_table_factory = - dynamic_pointer_cast(options_.table_factory); - - if (block_table_factory) { - return block_table_factory->NewTableReader( + if (BlockBasedTableFactory::kName == options_.table_factory->Name()) { + return options_.table_factory->NewTableReader( TableReaderOptions(ioptions_, soptions_, internal_comparator_, /*skip_filters=*/false), std::move(file_), file_size, &table_reader_, /*enable_prefetch=*/false); } - assert(!block_table_factory); - // For all other factory implementation return options_.table_factory->NewTableReader( TableReaderOptions(ioptions_, soptions_, internal_comparator_), std::move(file_), file_size, &table_reader_); } +Status SstFileReader::VerifyChecksum() { + return table_reader_->VerifyChecksum(); +} + Status SstFileReader::DumpTable(const std::string& 
out_filename) { unique_ptr out_file; Env* env = Env::Default(); @@ -175,7 +185,10 @@ uint64_t SstFileReader::CalculateCompressedTableSize( return size; } -int SstFileReader::ShowAllCompressionSizes(size_t block_size) { +int SstFileReader::ShowAllCompressionSizes( + size_t block_size, + const std::vector>& + compression_types) { ReadOptions read_options; Options opts; const ImmutableCFOptions imoptions(opts); @@ -185,17 +198,7 @@ int SstFileReader::ShowAllCompressionSizes(size_t block_size) { fprintf(stdout, "Block Size: %" ROCKSDB_PRIszt "\n", block_size); - std::pair compressions[] = { - {CompressionType::kNoCompression, "kNoCompression"}, - {CompressionType::kSnappyCompression, "kSnappyCompression"}, - {CompressionType::kZlibCompression, "kZlibCompression"}, - {CompressionType::kBZip2Compression, "kBZip2Compression"}, - {CompressionType::kLZ4Compression, "kLZ4Compression"}, - {CompressionType::kLZ4HCCompression, "kLZ4HCCompression"}, - {CompressionType::kXpressCompression, "kXpressCompression"}, - {CompressionType::kZSTD, "kZSTD"}}; - - for (auto& i : compressions) { + for (auto& i : compression_types) { if (CompressionTypeSupported(i.first)) { CompressionOptions compress_opt; std::string column_family_name; @@ -355,10 +358,13 @@ void print_help() { --file= Path to SST file or directory containing SST files - --command=check|scan|raw + --command=check|scan|raw|verify check: Iterate over entries in files but dont print anything except if an error is encounterd (default command) scan: Iterate over entries in files and print them to screen raw: Dump all the table contents to _dump.txt + verify: Iterate all the blocks in files verifying checksum to detect possible coruption but dont print anything except if a corruption is encountered + recompress: reports the SST file size if recompressed with different + compression types --output_hex Can be combined with scan command to print the keys and values in Hex @@ -383,15 +389,17 @@ void print_help() { Can be combined with 
--from and --to to indicate that these values are encoded in Hex --show_properties - Print table properties after iterating over the file - - --show_compression_sizes - Independent command that will recreate the SST file using 16K block size with different - compressions and report the size of the file using such compression + Print table properties after iterating over the file when executing + check|scan|raw --set_block_size= - Can be combined with --show_compression_sizes to set the block size that will be used - when trying different compression algorithms + Can be combined with --command=recompress to set the block size that will + be used when trying different compression algorithms + + --compression_types= + Can be combined with --command=recompress to run recompression for this + list of compression types --parse_internal_key=<0xKEY> Convenience option to parse an internal key on the command line. Dumps the @@ -415,13 +423,13 @@ int SSTDumpTool::Run(int argc, char** argv) { bool has_to = false; bool use_from_as_prefix = false; bool show_properties = false; - bool show_compression_sizes = false; bool show_summary = false; bool set_block_size = false; std::string from_key; std::string to_key; std::string block_size_str; size_t block_size; + std::vector> compression_types; uint64_t total_num_files = 0; uint64_t total_num_data_blocks = 0; uint64_t total_data_block_size = 0; @@ -453,19 +461,34 @@ int SSTDumpTool::Run(int argc, char** argv) { use_from_as_prefix = true; } else if (strcmp(argv[i], "--show_properties") == 0) { show_properties = true; - } else if (strcmp(argv[i], "--show_compression_sizes") == 0) { - show_compression_sizes = true; } else if (strcmp(argv[i], "--show_summary") == 0) { show_summary = true; } else if (strncmp(argv[i], "--set_block_size=", 17) == 0) { set_block_size = true; block_size_str = argv[i] + 17; std::istringstream iss(block_size_str); + iss >> block_size; if (iss.fail()) { - fprintf(stderr, "block size must be numeric"); + 
fprintf(stderr, "block size must be numeric\n"); exit(1); } - iss >> block_size; + } else if (strncmp(argv[i], "--compression_types=", 20) == 0) { + std::string compression_types_csv = argv[i] + 20; + std::istringstream iss(compression_types_csv); + std::string compression_type; + while (std::getline(iss, compression_type, ',')) { + auto iter = std::find_if( + kCompressions.begin(), kCompressions.end(), + [&compression_type](std::pair curr) { + return curr.second == compression_type; + }); + if (iter == kCompressions.end()) { + fprintf(stderr, "%s is not a valid CompressionType\n", + compression_type.c_str()); + exit(1); + } + compression_types.emplace_back(*iter); + } } else if (strncmp(argv[i], "--parse_internal_key=", 21) == 0) { std::string in_key(argv[i] + 21); try { @@ -547,12 +570,10 @@ int SSTDumpTool::Run(int argc, char** argv) { continue; } - if (show_compression_sizes) { - if (set_block_size) { - reader.ShowAllCompressionSizes(block_size); - } else { - reader.ShowAllCompressionSizes(16384); - } + if (command == "recompress") { + reader.ShowAllCompressionSizes( + set_block_size ? block_size : 16384, + compression_types.empty() ? 
kCompressions : compression_types); return 0; } @@ -586,6 +607,17 @@ int SSTDumpTool::Run(int argc, char** argv) { } } + if (command == "verify") { + st = reader.VerifyChecksum(); + if (!st.ok()) { + fprintf(stderr, "%s is corrupted: %s\n", filename.c_str(), + st.ToString().c_str()); + } else { + fprintf(stdout, "The file is ok\n"); + } + continue; + } + if (show_properties || show_summary) { const rocksdb::TableProperties* table_properties; diff --git a/tools/sst_dump_tool_imp.h b/tools/sst_dump_tool_imp.h index 0129d98eb..9531b5415 100644 --- a/tools/sst_dump_tool_imp.h +++ b/tools/sst_dump_tool_imp.h @@ -30,10 +30,14 @@ class SstFileReader { uint64_t GetReadNumber() { return read_num_; } TableProperties* GetInitTableProperties() { return table_properties_.get(); } + Status VerifyChecksum(); Status DumpTable(const std::string& out_filename); Status getStatus() { return init_result_; } - int ShowAllCompressionSizes(size_t block_size); + int ShowAllCompressionSizes( + size_t block_size, + const std::vector>& + compression_types); private: // Get the TableReader implementation for the sst file diff --git a/tools/verify_random_db.sh b/tools/verify_random_db.sh index 8ff6a3fd1..7000f5a1a 100755 --- a/tools/verify_random_db.sh +++ b/tools/verify_random_db.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # # A shell script to verify DB generated by generate_random_db.sh cannot opened and read correct data. # ./ldb needs to be avaible to be executed. 
diff --git a/tools/write_stress.cc b/tools/write_stress.cc index 597e93798..e5e4204a8 100644 --- a/tools/write_stress.cc +++ b/tools/write_stress.cc @@ -135,8 +135,8 @@ class WriteStress { // compactions options.create_if_missing = true; options.write_buffer_size = 256 * 1024; // 256k - options.max_bytes_for_level_base = 1 * 1024 * 1204; // 1MB - options.target_file_size_base = 100 * 1204; // 100k + options.max_bytes_for_level_base = 1 * 1024 * 1024; // 1MB + options.target_file_size_base = 100 * 1024; // 100k options.max_write_buffer_number = 16; options.max_background_compactions = 16; options.max_background_flushes = 16; diff --git a/util/arena.h b/util/arena.h index a20935171..af53a2ff8 100644 --- a/util/arena.h +++ b/util/arena.h @@ -77,6 +77,10 @@ class Arena : public Allocator { size_t BlockSize() const override { return kBlockSize; } + bool IsInInlineBlock() const { + return blocks_.empty(); + } + private: char inline_block_[kInlineSize] __attribute__((__aligned__(sizeof(void*)))); // Number of bytes allocated in one block diff --git a/util/arena_test.cc b/util/arena_test.cc index a033765ad..53777a20b 100644 --- a/util/arena_test.cc +++ b/util/arena_test.cc @@ -91,9 +91,13 @@ static void ApproximateMemoryUsageTest(size_t huge_page_size) { ASSERT_EQ(kZero, arena.ApproximateMemoryUsage()); // allocate inline bytes + EXPECT_TRUE(arena.IsInInlineBlock()); arena.AllocateAligned(8); + EXPECT_TRUE(arena.IsInInlineBlock()); arena.AllocateAligned(Arena::kInlineSize / 2 - 16); + EXPECT_TRUE(arena.IsInInlineBlock()); arena.AllocateAligned(Arena::kInlineSize / 2); + EXPECT_TRUE(arena.IsInInlineBlock()); ASSERT_EQ(arena.ApproximateMemoryUsage(), Arena::kInlineSize - 8); ASSERT_PRED2(CheckMemoryAllocated, arena.MemoryAllocatedBytes(), Arena::kInlineSize); @@ -102,6 +106,7 @@ static void ApproximateMemoryUsageTest(size_t huge_page_size) { // first allocation arena.AllocateAligned(kEntrySize); + EXPECT_FALSE(arena.IsInInlineBlock()); auto mem_usage = 
arena.MemoryAllocatedBytes(); if (huge_page_size) { ASSERT_TRUE( @@ -117,6 +122,7 @@ static void ApproximateMemoryUsageTest(size_t huge_page_size) { arena.AllocateAligned(kEntrySize); ASSERT_EQ(mem_usage, arena.MemoryAllocatedBytes()); ASSERT_EQ(arena.ApproximateMemoryUsage(), usage + kEntrySize); + EXPECT_FALSE(arena.IsInInlineBlock()); usage = arena.ApproximateMemoryUsage(); } if (huge_page_size) { diff --git a/util/cast_util.h b/util/cast_util.h new file mode 100644 index 000000000..2dc8138ab --- /dev/null +++ b/util/cast_util.h @@ -0,0 +1,21 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +namespace rocksdb { +// The helper function to assert the move from dynamic_cast<> to +// static_cast<> is correct. This function is to deal with legacy code. +// It is not recommanded to add new code to issue class casting. The preferred +// solution is to implement the functionality without a need of casting. +template +inline DestClass* static_cast_with_check(SrcClass* x) { + DestClass* ret = static_cast(x); +#ifdef ROCKSDB_USE_RTTI + assert(ret == dynamic_cast(x)); +#endif + return ret; +} +} // namespace rocksdb diff --git a/util/concurrent_arena.h b/util/concurrent_arena.h index a79fb95fe..1ab88c7ff 100644 --- a/util/concurrent_arena.h +++ b/util/concurrent_arena.h @@ -164,6 +164,21 @@ class ConcurrentArena : public Allocator { // size, we adjust our request to avoid arena waste. auto exact = arena_allocated_and_unused_.load(std::memory_order_relaxed); assert(exact == arena_.AllocatedAndUnused()); + + if (exact >= bytes && arena_.IsInInlineBlock()) { + // If we haven't exhausted arena's inline block yet, allocate from arena + // directly. This ensures that we'll do the first few small allocations + // without allocating any blocks. 
+ // In particular this prevents empty memtables from using + // disproportionately large amount of memory: a memtable allocates on + // the order of 1 KB of memory when created; we wouldn't want to + // allocate a full arena block (typically a few megabytes) for that, + // especially if there are thousands of empty memtables. + auto rv = func(); + Fixup(); + return rv; + } + avail = exact >= shard_block_size_ / 2 && exact < shard_block_size_ * 2 ? exact : shard_block_size_; diff --git a/util/delete_scheduler_test.cc b/util/delete_scheduler_test.cc index 666728918..208bdd741 100644 --- a/util/delete_scheduler_test.cc +++ b/util/delete_scheduler_test.cc @@ -541,10 +541,9 @@ TEST_F(DeleteSchedulerTest, ImmediateDeleteOn25PercDBSize) { delete_scheduler_->DeleteFile(file_name); } - // When we end up with 24 files in trash we will start + // When we end up with 26 files in trash we will start // deleting new files immediately ASSERT_EQ(fg_delete_file, 74); - ASSERT_EQ(CountFilesInDir(trash_dir_), 25); rocksdb::SyncPoint::GetInstance()->DisableProcessing(); } diff --git a/util/file_reader_writer.cc b/util/file_reader_writer.cc index 22ab71287..f46b78fa0 100644 --- a/util/file_reader_writer.cc +++ b/util/file_reader_writer.cc @@ -603,6 +603,34 @@ class ReadaheadRandomAccessFile : public RandomAccessFile { }; } // namespace +Status FilePrefetchBuffer::Prefetch(RandomAccessFileReader* reader, + uint64_t offset, size_t n) { + size_t alignment = reader->file()->GetRequiredBufferAlignment(); + uint64_t roundup_offset = Roundup(offset, alignment); + uint64_t roundup_len = Roundup(n, alignment); + buffer_.Alignment(alignment); + buffer_.AllocateNewBuffer(roundup_len); + + Slice result; + Status s = + reader->Read(roundup_offset, roundup_len, &result, buffer_.BufferStart()); + if (s.ok()) { + buffer_offset_ = roundup_offset; + buffer_len_ = result.size(); + } + return s; +} + +bool FilePrefetchBuffer::TryReadFromCache(uint64_t offset, size_t n, + Slice* result) const { + if 
(offset < buffer_offset_ || offset + n > buffer_offset_ + buffer_len_) { + return false; + } + uint64_t offset_in_buffer = offset - buffer_offset_; + *result = Slice(buffer_.BufferStart() + offset_in_buffer, n); + return true; +} + std::unique_ptr NewReadaheadRandomAccessFile( std::unique_ptr&& file, size_t readahead_size) { std::unique_ptr result( diff --git a/util/file_reader_writer.h b/util/file_reader_writer.h index deed73c38..9be692458 100644 --- a/util/file_reader_writer.h +++ b/util/file_reader_writer.h @@ -196,6 +196,17 @@ class WritableFileWriter { Status SyncInternal(bool use_fsync); }; +class FilePrefetchBuffer { + public: + Status Prefetch(RandomAccessFileReader* reader, uint64_t offset, size_t n); + bool TryReadFromCache(uint64_t offset, size_t n, Slice* result) const; + + private: + AlignedBuffer buffer_; + uint64_t buffer_offset_; + size_t buffer_len_; +}; + extern Status NewWritableFile(Env* env, const std::string& fname, unique_ptr* result, const EnvOptions& options); diff --git a/util/file_reader_writer_test.cc b/util/file_reader_writer_test.cc index dac518245..45675e9dd 100644 --- a/util/file_reader_writer_test.cc +++ b/util/file_reader_writer_test.cc @@ -143,7 +143,13 @@ TEST_F(WritableFileWriterTest, IncrementalBuffer) { env_options.writable_file_max_buffer_size = (attempt < kNumAttempts / 2) ? 
512 * 1024 : 700 * 1024; std::string actual; - unique_ptr wf(new FakeWF(&actual, attempt % 2 == 1, no_flush)); + unique_ptr wf(new FakeWF(&actual, +#ifndef ROCKSDB_LITE + attempt % 2 == 1, +#else + false, +#endif + no_flush)); unique_ptr writer( new WritableFileWriter(std::move(wf), env_options)); diff --git a/util/murmurhash.cc b/util/murmurhash.cc index 334ed898e..4d71d5890 100644 --- a/util/murmurhash.cc +++ b/util/murmurhash.cc @@ -113,8 +113,8 @@ unsigned int MurmurHash2 ( const void * key, int len, unsigned int seed ) switch(len) { - case 3: h ^= data[2] << 16; - case 2: h ^= data[1] << 8; + case 3: h ^= data[2] << 16; // fallthrough + case 2: h ^= data[1] << 8; // fallthrough case 1: h ^= data[0]; h *= m; }; diff --git a/util/thread_local_test.cc b/util/thread_local_test.cc index 5806cf265..6fee5eaa5 100644 --- a/util/thread_local_test.cc +++ b/util/thread_local_test.cc @@ -569,7 +569,7 @@ TEST_F(ThreadLocalTest, DISABLED_MainThreadDiesFirst) { #ifndef ROCKSDB_LITE } catch (const std::system_error& ex) { std::cerr << "Start thread: " << ex.code() << std::endl; - ASSERT_TRUE(false); + FAIL(); } #endif // ROCKSDB_LITE } diff --git a/util/threadpool_imp.cc b/util/threadpool_imp.cc index aa40ab9cd..f38e6422b 100644 --- a/util/threadpool_imp.cc +++ b/util/threadpool_imp.cc @@ -123,11 +123,11 @@ struct ThreadPoolImpl::Impl { inline ThreadPoolImpl::Impl::Impl() - : + : low_io_priority_(false), priority_(Env::LOW), env_(nullptr), - total_threads_limit_(1), + total_threads_limit_(0), queue_len_(), exit_all_threads_(false), wait_for_jobs_to_complete_(false), @@ -372,7 +372,7 @@ int ThreadPoolImpl::Impl::UnSchedule(void* arg) { return count; } -ThreadPoolImpl::ThreadPoolImpl() : +ThreadPoolImpl::ThreadPoolImpl() : impl_(new Impl()) { } diff --git a/utilities/backupable/backupable_db_test.cc b/utilities/backupable/backupable_db_test.cc index d1a4bc60a..be20a8d9b 100644 --- a/utilities/backupable/backupable_db_test.cc +++ b/utilities/backupable/backupable_db_test.cc @@ 
-101,7 +101,7 @@ class DummyDB : public StackableDB { virtual uint64_t LogNumber() const override { // what business do you have calling this method? - EXPECT_TRUE(false); + ADD_FAILURE(); return 0; } diff --git a/utilities/blob_db/blob_compaction_filter.h b/utilities/blob_db/blob_compaction_filter.h new file mode 100644 index 000000000..26cd188fe --- /dev/null +++ b/utilities/blob_db/blob_compaction_filter.h @@ -0,0 +1,78 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +#pragma once +#ifndef ROCKSDB_LITE + +#include "rocksdb/compaction_filter.h" +#include "rocksdb/env.h" +#include "utilities/blob_db/blob_index.h" + +namespace rocksdb { +namespace blob_db { + +// CompactionFilter to delete expired blob index from base DB. +class BlobIndexCompactionFilter : public CompactionFilter { + public: + explicit BlobIndexCompactionFilter(uint64_t current_time) + : current_time_(current_time) {} + + virtual const char* Name() const override { + return "BlobIndexCompactionFilter"; + } + + // Filter expired blob indexes regardless of snapshots. + virtual bool IgnoreSnapshots() const override { return true; } + + virtual Decision FilterV2(int /*level*/, const Slice& /*key*/, + ValueType value_type, const Slice& value, + std::string* /*new_value*/, + std::string* /*skip_until*/) const override { + if (value_type != kBlobIndex) { + return Decision::kKeep; + } + BlobIndex blob_index; + Status s = blob_index.DecodeFrom(value); + if (!s.ok()) { + // Unable to decode blob index. Keeping the value. 
+ return Decision::kKeep; + } + if (blob_index.HasTTL() && blob_index.expiration() <= current_time_) { + // Expired + return Decision::kRemove; + } + return Decision::kKeep; + } + + private: + const uint64_t current_time_; +}; + +class BlobIndexCompactionFilterFactory : public CompactionFilterFactory { + public: + explicit BlobIndexCompactionFilterFactory(Env* env) : env_(env) {} + + virtual const char* Name() const override { + return "BlobIndexCompactionFilterFactory"; + } + + virtual std::unique_ptr CreateCompactionFilter( + const CompactionFilter::Context& /*context*/) override { + int64_t current_time = 0; + Status s = env_->GetCurrentTime(¤t_time); + if (!s.ok()) { + return nullptr; + } + assert(current_time >= 0); + return std::unique_ptr( + new BlobIndexCompactionFilter(static_cast(current_time))); + } + + private: + Env* env_; +}; + +} // namespace blob_db +} // namespace rocksdb +#endif // ROCKSDB_LITE diff --git a/utilities/blob_db/blob_db.cc b/utilities/blob_db/blob_db.cc index ea60ad59b..b278df77f 100644 --- a/utilities/blob_db/blob_db.cc +++ b/utilities/blob_db/blob_db.cc @@ -5,7 +5,14 @@ // #ifndef ROCKSDB_LITE +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS +#endif + #include "utilities/blob_db/blob_db.h" + +#include + #include "db/write_batch_internal.h" #include "monitoring/instrumented_mutex.h" #include "options/cf_options.h" @@ -19,6 +26,7 @@ #include "table/block_builder.h" #include "util/file_reader_writer.h" #include "util/filename.h" +#include "utilities/blob_db/blob_compaction_filter.h" #include "utilities/blob_db/blob_db_impl.h" namespace rocksdb { @@ -38,6 +46,11 @@ Status BlobDB::OpenAndLoad(const Options& options, const BlobDBOptions& bdb_options, const std::string& dbname, BlobDB** blob_db, Options* changed_options) { + if (options.compaction_filter != nullptr || + options.compaction_filter_factory != nullptr) { + return Status::NotSupported("Blob DB doesn't support compaction filter."); + } + *changed_options = options; 
*blob_db = nullptr; @@ -50,12 +63,18 @@ Status BlobDB::OpenAndLoad(const Options& options, { MutexLock l(&listener_mutex); all_blobdb_listeners.push_back(fblistener); - all_blobdb_listeners.push_back(ce_listener); + if (bdb_options.enable_garbage_collection) { + all_blobdb_listeners.push_back(ce_listener); + } all_wal_filters.push_back(rw_filter); } + changed_options->compaction_filter_factory.reset( + new BlobIndexCompactionFilterFactory(options.env)); changed_options->listeners.emplace_back(fblistener); - changed_options->listeners.emplace_back(ce_listener); + if (bdb_options.enable_garbage_collection) { + changed_options->listeners.emplace_back(ce_listener); + } changed_options->wal_filter = rw_filter.get(); DBOptions db_options(*changed_options); @@ -64,7 +83,9 @@ Status BlobDB::OpenAndLoad(const Options& options, BlobDBImpl* bdb = new BlobDBImpl(dbname, bdb_options, db_options); fblistener->SetImplPtr(bdb); - ce_listener->SetImplPtr(bdb); + if (bdb_options.enable_garbage_collection) { + ce_listener->SetImplPtr(bdb); + } rw_filter->SetImplPtr(bdb); Status s = bdb->OpenPhase1(); @@ -94,45 +115,82 @@ Status BlobDB::Open(const Options& options, const BlobDBOptions& bdb_options, return s; } -Status BlobDB::Open(const DBOptions& db_options, +Status BlobDB::Open(const DBOptions& db_options_input, const BlobDBOptions& bdb_options, const std::string& dbname, const std::vector& column_families, std::vector* handles, BlobDB** blob_db, bool no_base_db) { + if (column_families.size() != 1 || + column_families[0].name != kDefaultColumnFamilyName) { + return Status::NotSupported( + "Blob DB doesn't support non-default column family."); + } *blob_db = nullptr; + Status s; + + DBOptions db_options(db_options_input); + if (db_options.info_log == nullptr) { + s = CreateLoggerFromOptions(dbname, db_options, &db_options.info_log); + if (!s.ok()) { + return s; + } + } - DBOptions my_db_options(db_options); FlushBeginListener_t fblistener = std::make_shared(); CompactionListener_t 
ce_listener = std::make_shared(); ReconcileWalFilter_t rw_filter = std::make_shared(); - my_db_options.listeners.emplace_back(fblistener); - my_db_options.listeners.emplace_back(ce_listener); - my_db_options.wal_filter = rw_filter.get(); + db_options.listeners.emplace_back(fblistener); + if (bdb_options.enable_garbage_collection) { + db_options.listeners.emplace_back(ce_listener); + } + db_options.wal_filter = rw_filter.get(); { MutexLock l(&listener_mutex); all_blobdb_listeners.push_back(fblistener); - all_blobdb_listeners.push_back(ce_listener); + if (bdb_options.enable_garbage_collection) { + all_blobdb_listeners.push_back(ce_listener); + } all_wal_filters.push_back(rw_filter); } + ColumnFamilyOptions cf_options(column_families[0].options); + if (cf_options.compaction_filter != nullptr || + cf_options.compaction_filter_factory != nullptr) { + return Status::NotSupported("Blob DB doesn't support compaction filter."); + } + cf_options.compaction_filter_factory.reset( + new BlobIndexCompactionFilterFactory(db_options.env)); + ColumnFamilyDescriptor cf_descriptor(kDefaultColumnFamilyName, cf_options); + // we need to open blob db first so that recovery can happen - BlobDBImpl* bdb = new BlobDBImpl(dbname, bdb_options, my_db_options); + BlobDBImpl* bdb = new BlobDBImpl(dbname, bdb_options, db_options); fblistener->SetImplPtr(bdb); - ce_listener->SetImplPtr(bdb); + if (bdb_options.enable_garbage_collection) { + ce_listener->SetImplPtr(bdb); + } rw_filter->SetImplPtr(bdb); - Status s = bdb->OpenPhase1(); - if (!s.ok()) return s; + s = bdb->OpenPhase1(); + if (!s.ok()) { + delete bdb; + return s; + } - if (no_base_db) return s; + if (no_base_db) { + *blob_db = bdb; + return s; + } DB* db = nullptr; - s = DB::Open(my_db_options, dbname, column_families, handles, &db); - if (!s.ok()) return s; + s = DB::Open(db_options, dbname, {cf_descriptor}, handles, &db); + if (!s.ok()) { + delete bdb; + return s; + } // set the implementation pointer s = bdb->LinkToBaseDB(db); @@ 
-141,28 +199,36 @@ Status BlobDB::Open(const DBOptions& db_options, bdb = nullptr; } *blob_db = bdb; + bdb_options.Dump(db_options.info_log.get()); return s; } BlobDB::BlobDB(DB* db) : StackableDB(db) {} -//////////////////////////////////////////////////////////////////////////////// -// -// -// std::function fnCaller = -// std::bind(&A::fn, &anInstance, std::placeholders::_1); -//////////////////////////////////////////////////////////////////////////////// -BlobDBOptions::BlobDBOptions() - : blob_dir("blob_dir"), - path_relative(true), - is_fifo(false), - blob_dir_size(1000ULL * 1024ULL * 1024ULL * 1024ULL), - ttl_range_secs(3600), - min_blob_size(512), - bytes_per_sync(0), - blob_file_size(256 * 1024 * 1024), - num_concurrent_simple_blobs(4), - compression(kNoCompression) {} +void BlobDBOptions::Dump(Logger* log) const { + ROCKS_LOG_HEADER(log, " blob_db_options.blob_dir: %s", + blob_dir.c_str()); + ROCKS_LOG_HEADER(log, " blob_db_options.path_relative: %d", + path_relative); + ROCKS_LOG_HEADER(log, " blob_db_options.is_fifo: %d", + is_fifo); + ROCKS_LOG_HEADER(log, " blob_db_options.blob_dir_size: %" PRIu64, + blob_dir_size); + ROCKS_LOG_HEADER(log, " blob_db_options.ttl_range_secs: %" PRIu32, + ttl_range_secs); + ROCKS_LOG_HEADER(log, " blob_db_options.bytes_per_sync: %" PRIu64, + bytes_per_sync); + ROCKS_LOG_HEADER(log, " blob_db_options.blob_file_size: %" PRIu64, + blob_file_size); + ROCKS_LOG_HEADER(log, " blob_db_options.ttl_extractor: %p", + ttl_extractor.get()); + ROCKS_LOG_HEADER(log, " blob_db_options.compression: %d", + static_cast(compression)); + ROCKS_LOG_HEADER(log, "blob_db_options.enable_garbage_collection: %d", + enable_garbage_collection); + ROCKS_LOG_HEADER(log, " blob_db_options.disable_background_tasks: %d", + disable_background_tasks); +} } // namespace blob_db } // namespace rocksdb diff --git a/utilities/blob_db/blob_db.h b/utilities/blob_db/blob_db.h index e68b40a0a..3ade460eb 100644 --- a/utilities/blob_db/blob_db.h +++ 
b/utilities/blob_db/blob_db.h @@ -31,18 +31,18 @@ class TTLExtractor; struct BlobDBOptions { // name of the directory under main db, where blobs will be stored. // default is "blob_dir" - std::string blob_dir; + std::string blob_dir = "blob_dir"; // whether the blob_dir path is relative or absolute. - bool path_relative; + bool path_relative = true; // is the eviction strategy fifo based - bool is_fifo; + bool is_fifo = false; // maximum size of the blob dir. Once this gets used, up // evict the blob file which is oldest (is_fifo ) // 0 means no limits - uint64_t blob_dir_size; + uint64_t blob_dir_size = 0; // a new bucket is opened, for ttl_range. So if ttl_range is 600seconds // (10 minutes), and the first bucket starts at 1471542000 @@ -50,75 +50,90 @@ struct BlobDBOptions { // first bucket is 1471542000 - 1471542600 // second bucket is 1471542600 - 1471543200 // and so on - uint32_t ttl_range_secs; + uint64_t ttl_range_secs = 3600; - // at what size will the blobs be stored in separate log rather than - // inline - uint64_t min_blob_size; + // The smallest value to store in blob log. Value larger than this threshold + // will be inlined in base DB together with the key. + uint64_t min_blob_size = 0; // at what bytes will the blob files be synced to blob log. - uint64_t bytes_per_sync; + uint64_t bytes_per_sync = 0; // the target size of each blob file. File will become immutable // after it exceeds that size - uint64_t blob_file_size; - - // how many files to use for simple blobs at one time - uint32_t num_concurrent_simple_blobs; + uint64_t blob_file_size = 256 * 1024 * 1024; // Instead of setting TTL explicitly by calling PutWithTTL or PutUntil, // applications can set a TTLExtractor which can extract TTL from key-value // pairs. - std::shared_ptr ttl_extractor; - - // eviction callback. - // this function will be called for every blob that is getting - // evicted. 
- std::function - gc_evict_cb_fn; + std::shared_ptr ttl_extractor = nullptr; // what compression to use for Blob's - CompressionType compression; + CompressionType compression = kNoCompression; - // default constructor - BlobDBOptions(); + // If enabled, blob DB periodically cleanup stale data by rewriting remaining + // live data in blob files to new files. If garbage collection is not enabled, + // blob files will be cleanup based on TTL. + bool enable_garbage_collection = false; - BlobDBOptions(const BlobDBOptions& in) = default; + // Disable all background job. Used for test only. + bool disable_background_tasks = false; - virtual ~BlobDBOptions() = default; + void Dump(Logger* log) const; }; class BlobDB : public StackableDB { public: using rocksdb::StackableDB::Put; - + virtual Status Put(const WriteOptions& options, const Slice& key, + const Slice& value) override = 0; virtual Status Put(const WriteOptions& options, ColumnFamilyHandle* column_family, const Slice& key, - const Slice& value) override = 0; + const Slice& value) override { + if (column_family != DefaultColumnFamily()) { + return Status::NotSupported( + "Blob DB doesn't support non-default column family."); + } + return Put(options, key, value); + } using rocksdb::StackableDB::Delete; virtual Status Delete(const WriteOptions& options, - ColumnFamilyHandle* column_family, const Slice& key) override = 0; + virtual Status Delete(const WriteOptions& options, + ColumnFamilyHandle* column_family, + const Slice& key) override { + if (column_family != DefaultColumnFamily()) { + return Status::NotSupported( + "Blob DB doesn't support non-default column family."); + } + return Delete(options, key); + } + virtual Status PutWithTTL(const WriteOptions& options, const Slice& key, + const Slice& value, uint64_t ttl) = 0; virtual Status PutWithTTL(const WriteOptions& options, ColumnFamilyHandle* column_family, const Slice& key, - const Slice& value, int32_t ttl) = 0; - - virtual Status PutWithTTL(const 
WriteOptions& options, const Slice& key, - const Slice& value, int32_t ttl) { - return PutWithTTL(options, DefaultColumnFamily(), key, value, ttl); + const Slice& value, uint64_t ttl) { + if (column_family != DefaultColumnFamily()) { + return Status::NotSupported( + "Blob DB doesn't support non-default column family."); + } + return PutWithTTL(options, key, value, ttl); } - // Put with expiration. Key with expiration time equal to -1 - // means the key don't expire. + // Put with expiration. Key with expiration time equal to + // std::numeric_limits::max() means the key don't expire. + virtual Status PutUntil(const WriteOptions& options, const Slice& key, + const Slice& value, uint64_t expiration) = 0; virtual Status PutUntil(const WriteOptions& options, ColumnFamilyHandle* column_family, const Slice& key, - const Slice& value, int32_t expiration) = 0; - - virtual Status PutUntil(const WriteOptions& options, const Slice& key, - const Slice& value, int32_t expiration) { - return PutUntil(options, DefaultColumnFamily(), key, value, expiration); + const Slice& value, uint64_t expiration) { + if (column_family != DefaultColumnFamily()) { + return Status::NotSupported( + "Blob DB doesn't support non-default column family."); + } + return PutUntil(options, key, value, expiration); } using rocksdb::StackableDB::Get; @@ -129,25 +144,52 @@ class BlobDB : public StackableDB { using rocksdb::StackableDB::MultiGet; virtual std::vector MultiGet( const ReadOptions& options, - const std::vector& column_family, const std::vector& keys, std::vector* values) override = 0; + virtual std::vector MultiGet( + const ReadOptions& options, + const std::vector& column_families, + const std::vector& keys, + std::vector* values) override { + for (auto column_family : column_families) { + if (column_family != DefaultColumnFamily()) { + return std::vector( + column_families.size(), + Status::NotSupported( + "Blob DB doesn't support non-default column family.")); + } + } + return 
MultiGet(options, keys, values); + } using rocksdb::StackableDB::SingleDelete; - virtual Status SingleDelete(const WriteOptions& wopts, - ColumnFamilyHandle* column_family, - const Slice& key) override = 0; + virtual Status SingleDelete(const WriteOptions& /*wopts*/, + ColumnFamilyHandle* /*column_family*/, + const Slice& /*key*/) override { + return Status::NotSupported("Not supported operation in blob db."); + } using rocksdb::StackableDB::Merge; - virtual Status Merge(const WriteOptions& options, - ColumnFamilyHandle* column_family, const Slice& key, - const Slice& value) override { + virtual Status Merge(const WriteOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const Slice& /*key*/, const Slice& /*value*/) override { return Status::NotSupported("Not supported operation in blob db."); } virtual Status Write(const WriteOptions& opts, WriteBatch* updates) override = 0; + using rocksdb::StackableDB::NewIterator; + virtual Iterator* NewIterator(const ReadOptions& options) override = 0; + virtual Iterator* NewIterator(const ReadOptions& options, + ColumnFamilyHandle* column_family) override { + if (column_family != DefaultColumnFamily()) { + // Blob DB doesn't support non-default column family. + return nullptr; + } + return NewIterator(options); + } + // Starting point for opening a Blob DB. // changed_options - critical. 
Blob DB loads and inserts listeners // into options which are necessary for recovery and atomicity diff --git a/utilities/blob_db/blob_db_impl.cc b/utilities/blob_db/blob_db_impl.cc index 95deda5b0..23f173fd9 100644 --- a/utilities/blob_db/blob_db_impl.cc +++ b/utilities/blob_db/blob_db_impl.cc @@ -23,106 +23,25 @@ #include "table/block_based_table_builder.h" #include "table/block_builder.h" #include "table/meta_blocks.h" +#include "util/cast_util.h" #include "util/crc32c.h" #include "util/file_reader_writer.h" #include "util/filename.h" #include "util/logging.h" +#include "util/mutexlock.h" #include "util/random.h" +#include "util/sync_point.h" #include "util/timer_queue.h" -#include "utilities/transactions/optimistic_transaction_db_impl.h" -#include "utilities/transactions/optimistic_transaction_impl.h" +#include "utilities/blob_db/blob_db_iterator.h" +#include "utilities/blob_db/blob_index.h" namespace { int kBlockBasedTableVersionFormat = 2; - -void extendTTL(rocksdb::blob_db::ttlrange_t* ttl_range, uint32_t ttl) { - ttl_range->first = std::min(ttl_range->first, ttl); - ttl_range->second = std::max(ttl_range->second, ttl); -} - -void extendTimestamps(rocksdb::blob_db::tsrange_t* ts_range, uint64_t ts) { - ts_range->first = std::min(ts_range->first, ts); - ts_range->second = std::max(ts_range->second, ts); -} - -void extendSN(rocksdb::blob_db::snrange_t* sn_range, - rocksdb::SequenceNumber sn) { - sn_range->first = std::min(sn_range->first, sn); - sn_range->second = std::max(sn_range->second, sn); -} } // end namespace namespace rocksdb { - namespace blob_db { -// BlobHandle is a pointer to the blob that is stored in the LSM -class BlobHandle { - public: - BlobHandle() - : file_number_(std::numeric_limits::max()), - offset_(std::numeric_limits::max()), - size_(std::numeric_limits::max()), - compression_(kNoCompression) {} - - uint64_t filenumber() const { return file_number_; } - void set_filenumber(uint64_t fn) { file_number_ = fn; } - - // The offset of the 
block in the file. - uint64_t offset() const { return offset_; } - void set_offset(uint64_t _offset) { offset_ = _offset; } - - // The size of the stored block - uint64_t size() const { return size_; } - void set_size(uint64_t _size) { size_ = _size; } - - CompressionType compression() const { return compression_; } - void set_compression(CompressionType t) { compression_ = t; } - - void EncodeTo(std::string* dst) const; - - Status DecodeFrom(Slice* input); - - void clear(); - - private: - uint64_t file_number_; - uint64_t offset_; - uint64_t size_; - CompressionType compression_; -}; - -void BlobHandle::EncodeTo(std::string* dst) const { - // Sanity check that all fields have been set - assert(offset_ != std::numeric_limits::max()); - assert(size_ != std::numeric_limits::max()); - assert(file_number_ != std::numeric_limits::max()); - - dst->reserve(30); - PutVarint64(dst, file_number_); - PutVarint64(dst, offset_); - PutVarint64(dst, size_); - dst->push_back(static_cast(compression_)); -} - -void BlobHandle::clear() { - file_number_ = std::numeric_limits::max(); - offset_ = std::numeric_limits::max(); - size_ = std::numeric_limits::max(); - compression_ = kNoCompression; -} - -Status BlobHandle::DecodeFrom(Slice* input) { - if (GetVarint64(input, &file_number_) && GetVarint64(input, &offset_) && - GetVarint64(input, &size_)) { - compression_ = static_cast(input->data()[0]); - return Status::OK(); - } else { - clear(); - return Status::Corruption("bad blob handle"); - } -} - Random blob_rgen(static_cast(time(nullptr))); void BlobDBFlushBeginListener::OnFlushBegin(DB* db, const FlushJobInfo& info) { @@ -137,34 +56,37 @@ WalFilter::WalProcessingOption BlobReconcileWalFilter::LogRecordFound( bool blobf_compare_ttl::operator()(const std::shared_ptr& lhs, const std::shared_ptr& rhs) const { - if (lhs->ttl_range_.first < rhs->ttl_range_.first) return true; - - if (lhs->ttl_range_.first > rhs->ttl_range_.first) return false; - - return lhs->BlobFileNumber() > 
rhs->BlobFileNumber(); + if (lhs->expiration_range_.first < rhs->expiration_range_.first) { + return true; + } + if (lhs->expiration_range_.first > rhs->expiration_range_.first) { + return false; + } + return lhs->BlobFileNumber() < rhs->BlobFileNumber(); } void EvictAllVersionsCompactionListener::InternalListener::OnCompaction( int level, const Slice& key, CompactionEventListener::CompactionListenerValueType value_type, const Slice& existing_value, const SequenceNumber& sn, bool is_new) { + assert(impl_->bdb_options_.enable_garbage_collection); if (!is_new && value_type == CompactionEventListener::CompactionListenerValueType::kValue) { - BlobHandle handle; - Slice lsmval(existing_value); - Status s = handle.DecodeFrom(&lsmval); + BlobIndex blob_index; + Status s = blob_index.DecodeFrom(existing_value); if (s.ok()) { if (impl_->debug_level_ >= 3) - ROCKS_LOG_INFO(impl_->db_options_.info_log, - "CALLBACK COMPACTED OUT KEY: %s SN: %d " - "NEW: %d FN: %" PRIu64 " OFFSET: %" PRIu64 - " SIZE: %" PRIu64, - key.ToString().c_str(), sn, is_new, handle.filenumber(), - handle.offset(), handle.size()); - - impl_->override_vals_q_.enqueue({handle.filenumber(), key.size(), - handle.offset(), handle.size(), sn}); + ROCKS_LOG_INFO( + impl_->db_options_.info_log, + "CALLBACK COMPACTED OUT KEY: %s SN: %d " + "NEW: %d FN: %" PRIu64 " OFFSET: %" PRIu64 " SIZE: %" PRIu64, + key.ToString().c_str(), sn, is_new, blob_index.file_number(), + blob_index.offset(), blob_index.size()); + + impl_->override_vals_q_.enqueue({blob_index.file_number(), key.size(), + blob_index.offset(), blob_index.size(), + sn}); } } else { if (impl_->debug_level_ >= 3) @@ -181,7 +103,6 @@ BlobDBImpl::BlobDBImpl(const std::string& dbname, db_impl_(nullptr), env_(db_options.env), ttl_extractor_(blob_db_options.ttl_extractor.get()), - wo_set_(false), bdb_options_(blob_db_options), db_options_(db_options), env_options_(db_options), @@ -197,12 +118,8 @@ BlobDBImpl::BlobDBImpl(const std::string& dbname, 
total_periods_ampl_(0), total_blob_space_(0), open_p1_done_(false), - debug_level_(0) { - const BlobDBOptionsImpl* options_impl = - dynamic_cast(&blob_db_options); - if (options_impl) { - bdb_options_ = *options_impl; - } + debug_level_(0), + oldest_file_evicted_(false) { blob_dir_ = (bdb_options_.path_relative) ? dbname + "/" + bdb_options_.blob_dir : bdb_options_.blob_dir; @@ -215,17 +132,10 @@ Status BlobDBImpl::LinkToBaseDB(DB* db) { db_ = db; // the Base DB in-itself can be a stackable DB - StackableDB* sdb = dynamic_cast(db_); - if (sdb) { - db_impl_ = dynamic_cast(sdb->GetBaseDB()); - } else { - db_impl_ = dynamic_cast(db); - } + db_impl_ = static_cast_with_check(db_->GetRootDB()); env_ = db_->GetEnv(); - opt_db_.reset(new OptimisticTransactionDBImpl(db, false)); - Status s = env_->CreateDirIfMissing(blob_dir_); if (!s.ok()) { ROCKS_LOG_WARN(db_options_.info_log, @@ -249,9 +159,7 @@ BlobDBOptions BlobDBImpl::GetBlobDBOptions() const { return bdb_options_; } BlobDBImpl::BlobDBImpl(DB* db, const BlobDBOptions& blob_db_options) : BlobDB(db), - db_impl_(dynamic_cast(db)), - opt_db_(new OptimisticTransactionDBImpl(db, false)), - wo_set_(false), + db_impl_(static_cast_with_check(db)), bdb_options_(blob_db_options), db_options_(db->GetOptions()), env_options_(db_->GetOptions()), @@ -265,14 +173,8 @@ BlobDBImpl::BlobDBImpl(DB* db, const BlobDBOptions& blob_db_options) last_period_ampl_(0), total_periods_write_(0), total_periods_ampl_(0), - total_blob_space_(0) { - assert(db_impl_ != nullptr); - const BlobDBOptionsImpl* options_impl = - dynamic_cast(&blob_db_options); - if (options_impl) { - bdb_options_ = *options_impl; - } - + total_blob_space_(0), + oldest_file_evicted_(false) { if (!bdb_options_.blob_dir.empty()) blob_dir_ = (bdb_options_.path_relative) ? 
db_->GetName() + "/" + bdb_options_.blob_dir @@ -308,27 +210,29 @@ Status BlobDBImpl::OpenPhase1() { void BlobDBImpl::StartBackgroundTasks() { // store a call to a member function and object tqueue_.add( - bdb_options_.reclaim_of_period_millisecs, + kReclaimOpenFilesPeriodMillisecs, std::bind(&BlobDBImpl::ReclaimOpenFiles, this, std::placeholders::_1)); - tqueue_.add(bdb_options_.gc_check_period_millisecs, + tqueue_.add(kGCCheckPeriodMillisecs, std::bind(&BlobDBImpl::RunGC, this, std::placeholders::_1)); + if (bdb_options_.enable_garbage_collection) { + tqueue_.add( + kDeleteCheckPeriodMillisecs, + std::bind(&BlobDBImpl::EvictDeletions, this, std::placeholders::_1)); + tqueue_.add( + kDeleteCheckPeriodMillisecs, + std::bind(&BlobDBImpl::EvictCompacted, this, std::placeholders::_1)); + } tqueue_.add( - bdb_options_.deletion_check_period_millisecs, - std::bind(&BlobDBImpl::EvictDeletions, this, std::placeholders::_1)); - tqueue_.add( - bdb_options_.deletion_check_period_millisecs, - std::bind(&BlobDBImpl::EvictCompacted, this, std::placeholders::_1)); - tqueue_.add( - bdb_options_.delete_obsf_period_millisecs, - std::bind(&BlobDBImpl::DeleteObsFiles, this, std::placeholders::_1)); - tqueue_.add(bdb_options_.sanity_check_period_millisecs, + kDeleteObsoleteFilesPeriodMillisecs, + std::bind(&BlobDBImpl::DeleteObsoleteFiles, this, std::placeholders::_1)); + tqueue_.add(kSanityCheckPeriodMillisecs, std::bind(&BlobDBImpl::SanityCheck, this, std::placeholders::_1)); - tqueue_.add(bdb_options_.wa_stats_period_millisecs, + tqueue_.add(kWriteAmplificationStatsPeriodMillisecs, std::bind(&BlobDBImpl::WaStats, this, std::placeholders::_1)); - tqueue_.add(bdb_options_.fsync_files_period_millisecs, + tqueue_.add(kFSyncFilesPeriodMillisecs, std::bind(&BlobDBImpl::FsyncFiles, this, std::placeholders::_1)); tqueue_.add( - bdb_options_.check_seqf_period_millisecs, + kCheckSeqFilesPeriodMillisecs, std::bind(&BlobDBImpl::CheckSeqFiles, this, std::placeholders::_1)); } @@ -425,6 +329,8 @@ 
Status BlobDBImpl::OpenAllFiles() { bfpath.c_str(), s1.ToString().c_str(), size_bytes); continue; } + bfptr->SetHasTTL(bfptr->header_.has_ttl); + bfptr->SetCompression(bfptr->header_.compression); bfptr->header_valid_ = true; std::shared_ptr ra_reader = @@ -448,28 +354,23 @@ Status BlobDBImpl::OpenAllFiles() { "File found incomplete (w/o footer) %s", bfpath.c_str()); // sequentially iterate over the file and read all the records - ttlrange_t ttl_range(std::numeric_limits::max(), - std::numeric_limits::min()); - tsrange_t ts_range(std::numeric_limits::max(), - std::numeric_limits::min()); - snrange_t sn_range(std::numeric_limits::max(), - std::numeric_limits::min()); + ExpirationRange expiration_range(std::numeric_limits::max(), + std::numeric_limits::min()); uint64_t blob_count = 0; BlobLogRecord record; - Reader::ReadLevel shallow = Reader::kReadHdrKeyFooter; + Reader::ReadLevel shallow = Reader::kReadHeaderKey; uint64_t record_start = reader->GetNextByte(); // TODO(arahut) - when we detect corruption, we should truncate while (reader->ReadRecord(&record, shallow).ok()) { ++blob_count; if (bfptr->HasTTL()) { - extendTTL(&ttl_range, record.GetTTL()); - } - if (bfptr->HasTimestamp()) { - extendTimestamps(&ts_range, record.GetTimeVal()); + expiration_range.first = + std::min(expiration_range.first, record.expiration); + expiration_range.second = + std::max(expiration_range.second, record.expiration); } - extendSN(&sn_range, record.GetSN()); record_start = reader->GetNextByte(); } @@ -487,26 +388,21 @@ Status BlobDBImpl::OpenAllFiles() { } bfptr->SetBlobCount(blob_count); - bfptr->SetSNRange(sn_range); - - if (bfptr->HasTimestamp()) bfptr->set_time_range(ts_range); + bfptr->SetSequenceRange({0, 0}); ROCKS_LOG_INFO(db_options_.info_log, "Blob File: %s blob_count: %" PRIu64 - " size_bytes: %" PRIu64 - " sn_range: (%d, %d) ts: %d ttl: %d", - bfpath.c_str(), blob_count, size_bytes, sn_range.first, - sn_range.second, bfptr->HasTimestamp(), bfptr->HasTTL()); + " size_bytes: 
%" PRIu64 " has_ttl: %d", + bfpath.c_str(), blob_count, size_bytes, bfptr->HasTTL()); if (bfptr->HasTTL()) { - ttl_range.second = - std::max(ttl_range.second, - ttl_range.first + (uint32_t)bdb_options_.ttl_range_secs); - bfptr->set_ttl_range(ttl_range); - - std::time_t epoch_now = std::chrono::system_clock::to_time_t( - std::chrono::system_clock::now()); - if (ttl_range.second < epoch_now) { + expiration_range.second = std::max( + expiration_range.second, + expiration_range.first + (uint32_t)bdb_options_.ttl_range_secs); + bfptr->set_expiration_range(expiration_range); + + uint64_t now = EpochNow(); + if (expiration_range.second < now) { Status fstatus = CreateWriterLocked(bfptr); if (fstatus.ok()) fstatus = bfptr->WriteFooterAndCloseLocked(); if (!fstatus.ok()) { @@ -516,13 +412,14 @@ Status BlobDBImpl::OpenAllFiles() { bfpath.c_str(), fstatus.ToString().c_str()); continue; } else { - ROCKS_LOG_ERROR(db_options_.info_log, - "Blob File Closed: %s now: %d ttl_range: (%d, %d)", - bfpath.c_str(), epoch_now, ttl_range.first, - ttl_range.second); + ROCKS_LOG_ERROR( + db_options_.info_log, + "Blob File Closed: %s now: %d expiration_range: (%d, %d)", + bfpath.c_str(), now, expiration_range.first, + expiration_range.second); } } else { - open_blob_files_.insert(bfptr); + open_ttl_files_.insert(bfptr); } } } @@ -561,12 +458,7 @@ Status BlobDBImpl::CreateWriterLocked(const std::shared_ptr& bfile) { std::string fpath(bfile->PathName()); std::unique_ptr wfile; - // We are having issue that we write duplicate blob to blob file and the bug - // is related to writable file buffer. Force no buffer until we fix the bug. 
- EnvOptions env_options = env_options_; - env_options.writable_file_max_buffer_size = 0; - - Status s = env_->ReopenWritableFile(fpath, &wfile, env_options); + Status s = env_->ReopenWritableFile(fpath, &wfile, env_options_); if (!s.ok()) { ROCKS_LOG_ERROR(db_options_.info_log, "Failed to open blob file for write: %s status: '%s'" @@ -577,7 +469,7 @@ Status BlobDBImpl::CreateWriterLocked(const std::shared_ptr& bfile) { } std::unique_ptr fwriter; - fwriter.reset(new WritableFileWriter(std::move(wfile), env_options)); + fwriter.reset(new WritableFileWriter(std::move(wfile), env_options_)); uint64_t boffset = bfile->GetFileSize(); if (debug_level_ >= 2 && boffset) { @@ -586,11 +478,11 @@ Status BlobDBImpl::CreateWriterLocked(const std::shared_ptr& bfile) { } Writer::ElemType et = Writer::kEtNone; - if (bfile->file_size_ == BlobLogHeader::kHeaderSize) + if (bfile->file_size_ == BlobLogHeader::kSize) { et = Writer::kEtFileHdr; - else if (bfile->file_size_ > BlobLogHeader::kHeaderSize) - et = Writer::kEtFooter; - else if (bfile->file_size_) { + } else if (bfile->file_size_ > BlobLogHeader::kSize) { + et = Writer::kEtRecord; + } else if (bfile->file_size_) { ROCKS_LOG_WARN(db_options_.info_log, "Open blob file: %s with wrong size: %d", fpath.c_str(), boffset); @@ -606,27 +498,27 @@ Status BlobDBImpl::CreateWriterLocked(const std::shared_ptr& bfile) { } std::shared_ptr BlobDBImpl::FindBlobFileLocked( - uint32_t expiration) const { - if (open_blob_files_.empty()) return nullptr; + uint64_t expiration) const { + if (open_ttl_files_.empty()) return nullptr; std::shared_ptr tmp = std::make_shared(); - tmp->ttl_range_ = std::make_pair(expiration, 0); + tmp->expiration_range_ = std::make_pair(expiration, 0); - auto citr = open_blob_files_.equal_range(tmp); - if (citr.first == open_blob_files_.end()) { - assert(citr.second == open_blob_files_.end()); + auto citr = open_ttl_files_.equal_range(tmp); + if (citr.first == open_ttl_files_.end()) { + assert(citr.second == 
open_ttl_files_.end()); - std::shared_ptr check = *(open_blob_files_.rbegin()); - return (check->ttl_range_.second < expiration) ? nullptr : check; + std::shared_ptr check = *(open_ttl_files_.rbegin()); + return (check->expiration_range_.second < expiration) ? nullptr : check; } if (citr.first != citr.second) return *(citr.first); auto finditr = citr.second; - if (finditr != open_blob_files_.begin()) --finditr; + if (finditr != open_ttl_files_.begin()) --finditr; - bool b2 = (*finditr)->ttl_range_.second < expiration; - bool b1 = (*finditr)->ttl_range_.first > expiration; + bool b2 = (*finditr)->expiration_range_.second < expiration; + bool b1 = (*finditr)->expiration_range_.first > expiration; return (b1 || b2) ? nullptr : (*finditr); } @@ -643,23 +535,18 @@ std::shared_ptr BlobDBImpl::CheckOrCreateWriterLocked( return writer; } -void BlobDBImpl::UpdateWriteOptions(const WriteOptions& options) { - if (!wo_set_.load(std::memory_order_relaxed)) { - // DCLP - WriteLock wl(&mutex_); - if (!wo_set_.load(std::memory_order_acquire)) { - wo_set_.store(true, std::memory_order_release); - write_options_ = options; - } - } -} - std::shared_ptr BlobDBImpl::SelectBlobFile() { - uint32_t val = blob_rgen.Next(); { ReadLock rl(&mutex_); - if (open_simple_files_.size() == bdb_options_.num_concurrent_simple_blobs) - return open_simple_files_[val % bdb_options_.num_concurrent_simple_blobs]; + if (open_non_ttl_file_ != nullptr) { + return open_non_ttl_file_; + } + } + + // CHECK again + WriteLock wl(&mutex_); + if (open_non_ttl_file_ != nullptr) { + return open_non_ttl_file_; } std::shared_ptr bfile = NewBlobFile("SelectBlobFile"); @@ -674,15 +561,14 @@ std::shared_ptr BlobDBImpl::SelectBlobFile() { return nullptr; } - bfile->file_size_ = BlobLogHeader::kHeaderSize; - bfile->header_.compression_ = bdb_options_.compression; + bfile->file_size_ = BlobLogHeader::kSize; + bfile->header_.compression = bdb_options_.compression; + bfile->header_.has_ttl = false; + 
bfile->header_.column_family_id = + reinterpret_cast(DefaultColumnFamily())->GetID(); bfile->header_valid_ = true; - - // CHECK again - WriteLock wl(&mutex_); - if (open_simple_files_.size() == bdb_options_.num_concurrent_simple_blobs) { - return open_simple_files_[val % bdb_options_.num_concurrent_simple_blobs]; - } + bfile->SetHasTTL(false); + bfile->SetCompression(bdb_options_.compression); Status s = writer->WriteHeader(bfile->header_); if (!s.ok()) { @@ -695,11 +581,12 @@ std::shared_ptr BlobDBImpl::SelectBlobFile() { dir_change_.store(true); blob_files_.insert(std::make_pair(bfile->BlobFileNumber(), bfile)); - open_simple_files_.push_back(bfile); + open_non_ttl_file_ = bfile; return bfile; } -std::shared_ptr BlobDBImpl::SelectBlobFileTTL(uint32_t expiration) { +std::shared_ptr BlobDBImpl::SelectBlobFileTTL(uint64_t expiration) { + assert(expiration != kNoExpiration); uint64_t epoch_read = 0; std::shared_ptr bfile; { @@ -713,10 +600,10 @@ std::shared_ptr BlobDBImpl::SelectBlobFileTTL(uint32_t expiration) { return bfile; } - uint32_t exp_low = + uint64_t exp_low = (expiration / bdb_options_.ttl_range_secs) * bdb_options_.ttl_range_secs; - uint32_t exp_high = exp_low + bdb_options_.ttl_range_secs; - ttlrange_t ttl_guess = std::make_pair(exp_low, exp_high); + uint64_t exp_high = exp_low + bdb_options_.ttl_range_secs; + ExpirationRange expiration_range = std::make_pair(exp_low, exp_high); bfile = NewBlobFile("SelectBlobFileTTL"); assert(bfile); @@ -734,14 +621,20 @@ std::shared_ptr BlobDBImpl::SelectBlobFileTTL(uint32_t expiration) { return nullptr; } - bfile->header_.set_ttl_guess(ttl_guess); - bfile->header_.compression_ = bdb_options_.compression; + bfile->header_.expiration_range = expiration_range; + bfile->header_.compression = bdb_options_.compression; + bfile->header_.has_ttl = true; + bfile->header_.column_family_id = + reinterpret_cast(DefaultColumnFamily())->GetID(); + ; bfile->header_valid_ = true; - bfile->file_size_ = BlobLogHeader::kHeaderSize; + 
bfile->SetHasTTL(true); + bfile->SetCompression(bdb_options_.compression); + bfile->file_size_ = BlobLogHeader::kSize; // set the first value of the range, since that is - // concrete at this time. also necessary to add to open_blob_files_ - bfile->ttl_range_ = ttl_guess; + // concrete at this time. also necessary to add to open_ttl_files_ + bfile->expiration_range_ = expiration_range; WriteLock wl(&mutex_); // in case the epoch has shifted in the interim, then check @@ -762,153 +655,118 @@ std::shared_ptr BlobDBImpl::SelectBlobFileTTL(uint32_t expiration) { dir_change_.store(true); blob_files_.insert(std::make_pair(bfile->BlobFileNumber(), bfile)); - open_blob_files_.insert(bfile); + open_ttl_files_.insert(bfile); epoch_of_++; return bfile; } -Status BlobDBImpl::Put(const WriteOptions& options, - ColumnFamilyHandle* column_family, const Slice& key, - const Slice& value) { - std::string new_value; - Slice value_slice; - int32_t expiration = ExtractExpiration(key, value, &value_slice, &new_value); - return PutUntil(options, column_family, key, value_slice, expiration); -} - -Status BlobDBImpl::Delete(const WriteOptions& options, - ColumnFamilyHandle* column_family, const Slice& key) { - SequenceNumber lsn = db_impl_->GetLatestSequenceNumber(); - Status s = db_->Delete(options, column_family, key); - - // add deleted key to list of keys that have been deleted for book-keeping - delete_keys_q_.enqueue({column_family, key.ToString(), lsn}); - return s; -} - -Status BlobDBImpl::SingleDelete(const WriteOptions& wopts, - ColumnFamilyHandle* column_family, - const Slice& key) { +Status BlobDBImpl::Delete(const WriteOptions& options, const Slice& key) { SequenceNumber lsn = db_impl_->GetLatestSequenceNumber(); - Status s = db_->SingleDelete(wopts, column_family, key); + Status s = db_->Delete(options, key); - delete_keys_q_.enqueue({column_family, key.ToString(), lsn}); + if (bdb_options_.enable_garbage_collection) { + // add deleted key to list of keys that have been 
deleted for book-keeping + delete_keys_q_.enqueue({DefaultColumnFamily(), key.ToString(), lsn}); + } return s; } -Status BlobDBImpl::Write(const WriteOptions& opts, WriteBatch* updates) { - class BlobInserter : public WriteBatch::Handler { - private: - BlobDBImpl* impl_; - SequenceNumber sequence_; - WriteBatch updates_blob_; - Status batch_rewrite_status_; - std::shared_ptr last_file_; - bool has_put_; - std::string new_value_; - - public: - explicit BlobInserter(BlobDBImpl* impl, SequenceNumber seq) - : impl_(impl), sequence_(seq), has_put_(false) {} - - WriteBatch& updates_blob() { return updates_blob_; } - - Status batch_rewrite_status() { return batch_rewrite_status_; } - - std::shared_ptr& last_file() { return last_file_; } - - bool has_put() { return has_put_; } - - virtual Status PutCF(uint32_t column_family_id, const Slice& key, - const Slice& value_slice) override { - Slice value_unc; - int32_t expiration = - impl_->ExtractExpiration(key, value_slice, &value_unc, &new_value_); - - std::shared_ptr bfile = - (expiration != -1) - ? impl_->SelectBlobFileTTL(expiration) - : ((last_file_) ? 
last_file_ : impl_->SelectBlobFile()); - if (last_file_ && last_file_ != bfile) { - batch_rewrite_status_ = Status::NotFound("too many blob files"); - return batch_rewrite_status_; - } - - if (!bfile) { - batch_rewrite_status_ = Status::NotFound("blob file not found"); - return batch_rewrite_status_; - } - - last_file_ = bfile; - has_put_ = true; - - std::string compression_output; - Slice value = impl_->GetCompressedSlice(value_unc, &compression_output); - - std::string headerbuf; - Writer::ConstructBlobHeader(&headerbuf, key, value, expiration, -1); - std::string index_entry; - Status st = impl_->AppendBlob(bfile, headerbuf, key, value, &index_entry); - if (st.ok()) { - impl_->AppendSN(last_file_, sequence_); - sequence_++; - } - - if (expiration != -1) { - extendTTL(&(bfile->ttl_range_), (uint32_t)expiration); - } +class BlobDBImpl::BlobInserter : public WriteBatch::Handler { + private: + const WriteOptions& options_; + BlobDBImpl* blob_db_impl_; + uint32_t default_cf_id_; + SequenceNumber sequence_; + WriteBatch batch_; - if (!st.ok()) { - batch_rewrite_status_ = st; - } else { - WriteBatchInternal::Put(&updates_blob_, column_family_id, key, - index_entry); - } - return Status::OK(); + public: + BlobInserter(const WriteOptions& options, BlobDBImpl* blob_db_impl, + uint32_t default_cf_id, SequenceNumber seq) + : options_(options), + blob_db_impl_(blob_db_impl), + default_cf_id_(default_cf_id), + sequence_(seq) {} + + SequenceNumber sequence() { return sequence_; } + + WriteBatch* batch() { return &batch_; } + + virtual Status PutCF(uint32_t column_family_id, const Slice& key, + const Slice& value) override { + if (column_family_id != default_cf_id_) { + return Status::NotSupported( + "Blob DB doesn't support non-default column family."); } + std::string new_value; + Slice value_slice; + uint64_t expiration = + blob_db_impl_->ExtractExpiration(key, value, &value_slice, &new_value); + Status s = blob_db_impl_->PutBlobValue(options_, key, value_slice, + expiration, 
sequence_, &batch_); + sequence_++; + return s; + } - virtual Status DeleteCF(uint32_t column_family_id, - const Slice& key) override { - WriteBatchInternal::Delete(&updates_blob_, column_family_id, key); - sequence_++; - return Status::OK(); + virtual Status DeleteCF(uint32_t column_family_id, + const Slice& key) override { + if (column_family_id != default_cf_id_) { + return Status::NotSupported( + "Blob DB doesn't support non-default column family."); } + Status s = WriteBatchInternal::Delete(&batch_, column_family_id, key); + sequence_++; + return s; + } - virtual Status SingleDeleteCF(uint32_t /*column_family_id*/, - const Slice& /*key*/) override { - batch_rewrite_status_ = - Status::NotSupported("Not supported operation in blob db."); - return batch_rewrite_status_; + virtual Status DeleteRange(uint32_t column_family_id, const Slice& begin_key, + const Slice& end_key) { + if (column_family_id != default_cf_id_) { + return Status::NotSupported( + "Blob DB doesn't support non-default column family."); } + Status s = WriteBatchInternal::DeleteRange(&batch_, column_family_id, + begin_key, end_key); + sequence_++; + return s; + } - virtual Status MergeCF(uint32_t /*column_family_id*/, const Slice& /*key*/, - const Slice& /*value*/) override { - batch_rewrite_status_ = - Status::NotSupported("Not supported operation in blob db."); - return batch_rewrite_status_; - } + virtual Status SingleDeleteCF(uint32_t /*column_family_id*/, + const Slice& /*key*/) override { + return Status::NotSupported("Not supported operation in blob db."); + } - virtual void LogData(const Slice& blob) override { - updates_blob_.PutLogData(blob); - } - }; + virtual Status MergeCF(uint32_t /*column_family_id*/, const Slice& /*key*/, + const Slice& /*value*/) override { + return Status::NotSupported("Not supported operation in blob db."); + } - SequenceNumber sequence = db_impl_->GetLatestSequenceNumber() + 1; - BlobInserter blob_inserter(this, sequence); - updates->Iterate(&blob_inserter); + 
virtual void LogData(const Slice& blob) override { batch_.PutLogData(blob); } +}; - if (!blob_inserter.batch_rewrite_status().ok()) { - return blob_inserter.batch_rewrite_status(); - } +Status BlobDBImpl::Write(const WriteOptions& options, WriteBatch* updates) { - Status s = db_->Write(opts, &(blob_inserter.updates_blob())); + uint32_t default_cf_id = + reinterpret_cast(DefaultColumnFamily())->GetID(); + // TODO(yiwu): In case there are multiple writers the latest sequence would + // not be the actually sequence we are writting. Need to get the sequence + // from write batch after DB write instead. + SequenceNumber current_seq = GetLatestSequenceNumber() + 1; + Status s; + BlobInserter blob_inserter(options, this, default_cf_id, current_seq); + { + // Release write_mutex_ before DB write to avoid race condition with + // flush begin listener, which also require write_mutex_ to sync + // blob files. + MutexLock l(&write_mutex_); + s = updates->Iterate(&blob_inserter); + } if (!s.ok()) { return s; } - - if (blob_inserter.has_put()) { - CloseIf(blob_inserter.last_file()); + s = db_->Write(options, blob_inserter.batch()); + if (!s.ok()) { + return s; } // add deleted key to list of keys that have been deleted for book-keeping @@ -938,19 +796,143 @@ Status BlobDBImpl::Write(const WriteOptions& opts, WriteBatch* updates) { SequenceNumber sequence_; }; - // add deleted key to list of keys that have been deleted for book-keeping - DeleteBookkeeper delete_bookkeeper(this, sequence); - updates->Iterate(&delete_bookkeeper); + if (bdb_options_.enable_garbage_collection) { + // add deleted key to list of keys that have been deleted for book-keeping + DeleteBookkeeper delete_bookkeeper(this, current_seq); + s = updates->Iterate(&delete_bookkeeper); + } + return s; +} + +Status BlobDBImpl::GetLiveFiles(std::vector& ret, + uint64_t* manifest_file_size, + bool flush_memtable) { + // Hold a lock in the beginning to avoid updates to base DB during the call + ReadLock rl(&mutex_); + 
Status s = db_->GetLiveFiles(ret, manifest_file_size, flush_memtable); + if (!s.ok()) { + return s; + } + ret.reserve(ret.size() + blob_files_.size()); + for (auto bfile_pair : blob_files_) { + auto blob_file = bfile_pair.second; + ret.emplace_back(blob_file->PathName()); + } return Status::OK(); } +void BlobDBImpl::GetLiveFilesMetaData(std::vector* metadata) { + // Hold a lock in the beginning to avoid updates to base DB during the call + ReadLock rl(&mutex_); + db_->GetLiveFilesMetaData(metadata); + for (auto bfile_pair : blob_files_) { + auto blob_file = bfile_pair.second; + LiveFileMetaData filemetadata; + filemetadata.size = blob_file->GetFileSize(); + filemetadata.name = blob_file->PathName(); + auto cfh = + reinterpret_cast(DefaultColumnFamily()); + filemetadata.column_family_name = cfh->GetName(); + metadata->emplace_back(filemetadata); + } +} + +Status BlobDBImpl::Put(const WriteOptions& options, const Slice& key, + const Slice& value) { + std::string new_value; + Slice value_slice; + uint64_t expiration = ExtractExpiration(key, value, &value_slice, &new_value); + return PutUntil(options, key, value_slice, expiration); +} + Status BlobDBImpl::PutWithTTL(const WriteOptions& options, - ColumnFamilyHandle* column_family, const Slice& key, const Slice& value, - int32_t ttl) { - return PutUntil(options, column_family, key, value, - static_cast(EpochNow()) + ttl); + uint64_t ttl) { + uint64_t now = EpochNow(); + uint64_t expiration = kNoExpiration - now > ttl ? now + ttl : kNoExpiration; + return PutUntil(options, key, value, expiration); +} + +Status BlobDBImpl::PutUntil(const WriteOptions& options, const Slice& key, + const Slice& value, uint64_t expiration) { + TEST_SYNC_POINT("BlobDBImpl::PutUntil:Start"); + Status s; + WriteBatch batch; + { + // Release write_mutex_ before DB write to avoid race condition with + // flush begin listener, which also require write_mutex_ to sync + // blob files. 
+ MutexLock l(&write_mutex_); + // TODO(yiwu): In case there are multiple writers the latest sequence would + // not be the actually sequence we are writting. Need to get the sequence + // from write batch after DB write instead. + SequenceNumber sequence = GetLatestSequenceNumber() + 1; + s = PutBlobValue(options, key, value, expiration, sequence, &batch); + } + if (s.ok()) { + s = db_->Write(options, &batch); + } + TEST_SYNC_POINT("BlobDBImpl::PutUntil:Finish"); + return s; +} + +Status BlobDBImpl::PutBlobValue(const WriteOptions& options, const Slice& key, + const Slice& value, uint64_t expiration, + SequenceNumber sequence, WriteBatch* batch) { + Status s; + std::string index_entry; + uint32_t column_family_id = + reinterpret_cast(DefaultColumnFamily())->GetID(); + if (value.size() < bdb_options_.min_blob_size) { + if (expiration == kNoExpiration) { + // Put as normal value + s = batch->Put(key, value); + } else { + // Inlined with TTL + BlobIndex::EncodeInlinedTTL(&index_entry, expiration, value); + s = WriteBatchInternal::PutBlobIndex(batch, column_family_id, key, + index_entry); + } + } else { + std::shared_ptr bfile = (expiration != kNoExpiration) + ? 
SelectBlobFileTTL(expiration) + : SelectBlobFile(); + if (!bfile) { + return Status::NotFound("Blob file not found"); + } + + assert(bfile->compression() == bdb_options_.compression); + std::string compression_output; + Slice value_compressed = GetCompressedSlice(value, &compression_output); + + std::string headerbuf; + Writer::ConstructBlobHeader(&headerbuf, key, value_compressed, expiration); + + s = AppendBlob(bfile, headerbuf, key, value_compressed, expiration, + &index_entry); + + if (s.ok()) { + bfile->ExtendSequenceRange(sequence); + if (expiration != kNoExpiration) { + bfile->ExtendExpirationRange(expiration); + } + s = CloseBlobFileIfNeeded(bfile); + if (s.ok()) { + s = WriteBatchInternal::PutBlobIndex(batch, column_family_id, key, + index_entry); + } + } else { + ROCKS_LOG_ERROR(db_options_.info_log, + "Failed to append blob to FILE: %s: KEY: %s VALSZ: %d" + " status: '%s' blob_file: '%s'", + bfile->PathName().c_str(), key.ToString().c_str(), + value.size(), s.ToString().c_str(), + bfile->DumpState().c_str()); + } + } + + return s; } Slice BlobDBImpl::GetCompressedSlice(const Slice& raw, @@ -965,102 +947,83 @@ Slice BlobDBImpl::GetCompressedSlice(const Slice& raw, return *compression_output; } -// TODO(yiwu): We should use uint64_t for expiration. -Status BlobDBImpl::PutUntil(const WriteOptions& options, - ColumnFamilyHandle* column_family, const Slice& key, - const Slice& value_unc, int32_t expiration) { - UpdateWriteOptions(options); - - std::shared_ptr bfile = - (expiration != -1) ? SelectBlobFileTTL(expiration) : SelectBlobFile(); - - if (!bfile) return Status::NotFound("Blob file not found"); - - std::string compression_output; - Slice value = GetCompressedSlice(value_unc, &compression_output); - - std::string headerbuf; - Writer::ConstructBlobHeader(&headerbuf, key, value, expiration, -1); - - // this is another more safer way to do it, where you keep the writeLock - // for the entire write path. 
this will increase latency and reduce - // throughput - // WriteLock lockbfile_w(&bfile->mutex_); - // std::shared_ptr writer = - // CheckOrCreateWriterLocked(bfile); - - if (debug_level_ >= 3) - ROCKS_LOG_DEBUG( - db_options_.info_log, ">Adding KEY FILE: %s: KEY: %s VALSZ: %d", - bfile->PathName().c_str(), key.ToString().c_str(), value.size()); - - std::string index_entry; - Status s = AppendBlob(bfile, headerbuf, key, value, &index_entry); - if (!s.ok()) { - ROCKS_LOG_ERROR(db_options_.info_log, - "Failed to append blob to FILE: %s: KEY: %s VALSZ: %d" - " status: '%s' blob_file: '%s'", - bfile->PathName().c_str(), key.ToString().c_str(), - value.size(), s.ToString().c_str(), - bfile->DumpState().c_str()); - // Fallback just write to the LSM and get going - WriteBatch batch; - batch.Put(column_family, key, value); - return db_->Write(options, &batch); +uint64_t BlobDBImpl::ExtractExpiration(const Slice& key, const Slice& value, + Slice* value_slice, + std::string* new_value) { + uint64_t expiration = kNoExpiration; + bool has_expiration = false; + bool value_changed = false; + if (ttl_extractor_ != nullptr) { + has_expiration = ttl_extractor_->ExtractExpiration( + key, value, EpochNow(), &expiration, new_value, &value_changed); } + *value_slice = value_changed ? Slice(*new_value) : value; + return has_expiration ? expiration : kNoExpiration; +} - WriteBatch batch; - batch.Put(column_family, key, index_entry); - - // this goes to the base db and can be expensive - s = db_->Write(options, &batch); - - // this is the sequence number of the write. 
- SequenceNumber sn = WriteBatchInternal::Sequence(&batch); - - if (debug_level_ >= 3) - ROCKS_LOG_INFO(db_options_.info_log, "PathName().c_str(), key.ToString().c_str(), sn); +std::shared_ptr BlobDBImpl::GetOldestBlobFile() { + std::vector> blob_files; + CopyBlobFiles(&blob_files, [](const std::shared_ptr& f) { + return !f->Obsolete() && f->Immutable(); + }); + blobf_compare_ttl compare; + return *std::min_element(blob_files.begin(), blob_files.end(), compare); +} - s = AppendSN(bfile, sn); - if (!s.ok()) { - ROCKS_LOG_ERROR(db_options_.info_log, - "Failed to append SN to FILE: %s: KEY: %s VALSZ: %d" - " status: '%s' blob_file: '%s'", - bfile->PathName().c_str(), key.ToString().c_str(), - value.size(), s.ToString().c_str(), - bfile->DumpState().c_str()); +bool BlobDBImpl::EvictOldestBlobFile() { + auto oldest_file = GetOldestBlobFile(); + if (oldest_file == nullptr) { + return false; } - if (expiration != -1) extendTTL(&(bfile->ttl_range_), (uint32_t)expiration); - - CloseIf(bfile); + WriteLock wl(&mutex_); + // Double check the file is not obsolete by others + if (oldest_file_evicted_ == false && !oldest_file->Obsolete()) { + auto expiration_range = oldest_file->GetExpirationRange(); + ROCKS_LOG_INFO(db_options_.info_log, + "Evict oldest blob file since DB out of space. Current " + "space used: %" PRIu64 ", blob dir size: %" PRIu64 + ", evicted blob file #%" PRIu64 + " with expiration range (%" PRIu64 ", %" PRIu64 ").", + total_blob_space_.load(), bdb_options_.blob_dir_size, + oldest_file->BlobFileNumber(), expiration_range.first, + expiration_range.second); + oldest_file->MarkObsolete(oldest_file->GetSequenceRange().second); + obsolete_files_.push_back(oldest_file); + oldest_file_evicted_.store(true); + return true; + } - return s; + return false; } -// TODO(yiwu): We should return uint64_t after updating the rest of the code -// to use uint64_t for expiration. 
-int32_t BlobDBImpl::ExtractExpiration(const Slice& key, const Slice& value, - Slice* value_slice, - std::string* new_value) { - uint64_t expiration = kNoExpiration; - bool value_changed = false; - if (ttl_extractor_ != nullptr) { - bool has_ttl = ttl_extractor_->ExtractExpiration( - key, value, EpochNow(), &expiration, new_value, &value_changed); - if (!has_ttl) { - expiration = kNoExpiration; +Status BlobDBImpl::CheckSize(size_t blob_size) { + uint64_t new_space_util = total_blob_space_.load() + blob_size; + if (bdb_options_.blob_dir_size > 0) { + if (!bdb_options_.is_fifo && + (new_space_util > bdb_options_.blob_dir_size)) { + return Status::NoSpace( + "Write failed, as writing it would exceed blob_dir_size limit."); + } + if (bdb_options_.is_fifo && !oldest_file_evicted_.load() && + (new_space_util > + kEvictOldestFileAtSize * bdb_options_.blob_dir_size)) { + EvictOldestBlobFile(); } } - *value_slice = value_changed ? Slice(*new_value) : value; - return (expiration == kNoExpiration) ? 
-1 : static_cast(expiration); + + return Status::OK(); } Status BlobDBImpl::AppendBlob(const std::shared_ptr& bfile, const std::string& headerbuf, const Slice& key, - const Slice& value, std::string* index_entry) { - Status s; + const Slice& value, uint64_t expiration, + std::string* index_entry) { + auto size_put = BlobLogRecord::kHeaderSize + key.size() + value.size(); + Status s = CheckSize(size_put); + if (!s.ok()) { + return s; + } uint64_t blob_offset = 0; uint64_t key_offset = 0; @@ -1083,101 +1046,98 @@ Status BlobDBImpl::AppendBlob(const std::shared_ptr& bfile, // increment blob count bfile->blob_count_++; - auto size_put = BlobLogRecord::kHeaderSize + key.size() + value.size(); bfile->file_size_ += size_put; last_period_write_ += size_put; total_blob_space_ += size_put; - BlobHandle handle; - handle.set_filenumber(bfile->BlobFileNumber()); - handle.set_size(value.size()); - handle.set_offset(blob_offset); - handle.set_compression(bdb_options_.compression); - handle.EncodeTo(index_entry); - - if (debug_level_ >= 3) - ROCKS_LOG_INFO(db_options_.info_log, - ">Adding KEY FILE: %s: BC: %d OFFSET: %d SZ: %d", - bfile->PathName().c_str(), bfile->blob_count_.load(), - blob_offset, value.size()); - - return s; -} - -Status BlobDBImpl::AppendSN(const std::shared_ptr& bfile, - const SequenceNumber& sn) { - Status s; - { - WriteLock lockbfile_w(&bfile->mutex_); - std::shared_ptr writer = CheckOrCreateWriterLocked(bfile); - if (!writer) return Status::IOError("Failed to create blob writer"); - - s = writer->AddRecordFooter(sn); - if (!s.ok()) { - ROCKS_LOG_ERROR(db_options_.info_log, - "Invalid status in AppendSN: %s status: '%s'", - bfile->PathName().c_str(), s.ToString().c_str()); - return s; - } - - if (sn != std::numeric_limits::max()) - extendSN(&(bfile->sn_range_), sn); + if (expiration == kNoExpiration) { + BlobIndex::EncodeBlob(index_entry, bfile->BlobFileNumber(), blob_offset, + value.size(), bdb_options_.compression); + } else { + 
BlobIndex::EncodeBlobTTL(index_entry, expiration, bfile->BlobFileNumber(), + blob_offset, value.size(), + bdb_options_.compression); } - bfile->file_size_ += BlobLogRecord::kFooterSize; - last_period_write_ += BlobLogRecord::kFooterSize; - total_blob_space_ += BlobLogRecord::kFooterSize; return s; } std::vector BlobDBImpl::MultiGet( - const ReadOptions& options, - const std::vector& column_family, + const ReadOptions& read_options, const std::vector& keys, std::vector* values) { - std::vector values_lsm; - values_lsm.resize(keys.size()); - auto statuses = db_->MultiGet(options, column_family, keys, &values_lsm); - - for (size_t i = 0; i < keys.size(); ++i) { - if (!statuses[i].ok()) continue; - - auto cfh = reinterpret_cast(column_family[i]); - auto cfd = cfh->cfd(); - - Status s = CommonGet(cfd, keys[i], values_lsm[i], &((*values)[i])); - statuses[i] = s; + // Get a snapshot to avoid blob file get deleted between we + // fetch and index entry and reading from the file. + ReadOptions ro(read_options); + bool snapshot_created = SetSnapshotIfNeeded(&ro); + + std::vector statuses; + statuses.reserve(keys.size()); + values->clear(); + values->reserve(keys.size()); + PinnableSlice value; + for (size_t i = 0; i < keys.size(); i++) { + statuses.push_back(Get(ro, DefaultColumnFamily(), keys[i], &value)); + values->push_back(value.ToString()); + value.Reset(); + } + if (snapshot_created) { + db_->ReleaseSnapshot(ro.snapshot); } return statuses; } -Status BlobDBImpl::CommonGet(const ColumnFamilyData* cfd, const Slice& key, - const std::string& index_entry, std::string* value, - SequenceNumber* sequence) { - Slice index_entry_slice(index_entry); - BlobHandle handle; - Status s = handle.DecodeFrom(&index_entry_slice); - if (!s.ok()) return s; +bool BlobDBImpl::SetSnapshotIfNeeded(ReadOptions* read_options) { + assert(read_options != nullptr); + if (read_options->snapshot != nullptr) { + return false; + } + read_options->snapshot = db_->GetSnapshot(); + return true; +} + 
+Status BlobDBImpl::GetBlobValue(const Slice& key, const Slice& index_entry, + PinnableSlice* value) { + assert(value != nullptr); + BlobIndex blob_index; + Status s = blob_index.DecodeFrom(index_entry); + if (!s.ok()) { + return s; + } + if (blob_index.HasTTL() && blob_index.expiration() <= EpochNow()) { + return Status::NotFound("Key expired"); + } + if (blob_index.IsInlined()) { + // TODO(yiwu): If index_entry is a PinnableSlice, we can also pin the same + // memory buffer to avoid extra copy. + value->PinSelf(blob_index.value()); + return Status::OK(); + } + if (blob_index.size() == 0) { + value->PinSelf(""); + return Status::OK(); + } // offset has to have certain min, as we will read CRC // later from the Blob Header, which needs to be also a // valid offset. - if (handle.offset() < - (BlobLogHeader::kHeaderSize + BlobLogRecord::kHeaderSize + key.size())) { + if (blob_index.offset() < + (BlobLogHeader::kSize + BlobLogRecord::kHeaderSize + key.size())) { if (debug_level_ >= 2) { - ROCKS_LOG_ERROR( - db_options_.info_log, - "Invalid blob handle file_number: %" PRIu64 " blob_offset: %" PRIu64 - " blob_size: %" PRIu64 " key: %s", - handle.filenumber(), handle.offset(), handle.size(), key.data()); + ROCKS_LOG_ERROR(db_options_.info_log, + "Invalid blob index file_number: %" PRIu64 + " blob_offset: %" PRIu64 " blob_size: %" PRIu64 + " key: %s", + blob_index.file_number(), blob_index.offset(), + blob_index.size(), key.data()); } - return Status::NotFound("Blob Not Found, although found in LSM"); + return Status::NotFound("Invalid blob offset"); } std::shared_ptr bfile; { ReadLock rl(&mutex_); - auto hitr = blob_files_.find(handle.filenumber()); + auto hitr = blob_files_.find(blob_index.file_number()); // file was deleted if (hitr == blob_files_.end()) { @@ -1187,14 +1147,8 @@ Status BlobDBImpl::CommonGet(const ColumnFamilyData* cfd, const Slice& key, bfile = hitr->second; } - if (bfile->Obsolete()) { - return Status::NotFound( - "Blob Not Found as blob file was 
garbage collected"); - } - - // 0 - size - if (!handle.size() && value != nullptr) { - value->clear(); + if (blob_index.size() == 0 && value != nullptr) { + value->PinSelf(""); return Status::OK(); } @@ -1202,126 +1156,110 @@ Status BlobDBImpl::CommonGet(const ColumnFamilyData* cfd, const Slice& key, std::shared_ptr reader = GetOrOpenRandomAccessReader(bfile, env_, env_options_); - if (value != nullptr) { - std::string* valueptr = value; - std::string value_c; - if (bdb_options_.compression != kNoCompression) { - valueptr = &value_c; - } + std::string* valueptr = value->GetSelf(); + std::string value_c; + if (bdb_options_.compression != kNoCompression) { + valueptr = &value_c; + } - // allocate the buffer. This is safe in C++11 - valueptr->resize(handle.size()); - char* buffer = &(*valueptr)[0]; - - Slice blob_value; - s = reader->Read(handle.offset(), handle.size(), &blob_value, buffer); - if (!s.ok() || blob_value.size() != handle.size()) { - if (debug_level_ >= 2) { - ROCKS_LOG_ERROR( - db_options_.info_log, - "Failed to read blob from file: %s blob_offset: %" PRIu64 - " blob_size: %" PRIu64 " read: %d key: %s status: '%s'", - bfile->PathName().c_str(), handle.offset(), handle.size(), - static_cast(blob_value.size()), key.data(), - s.ToString().c_str()); - } - return Status::NotFound("Blob Not Found as couldnt retrieve Blob"); - } + // Allocate the buffer. This is safe in C++11 + // Note that std::string::reserved() does not work, since previous value + // of the buffer can be larger than blob_index.size(). 
+ valueptr->resize(blob_index.size()); + char* buffer = &(*valueptr)[0]; - Slice crc_slice; - uint32_t crc_exp; - std::string crc_str; - crc_str.resize(sizeof(uint32_t)); - char* crc_buffer = &(crc_str[0]); - s = reader->Read(handle.offset() - (key.size() + sizeof(uint32_t)), - sizeof(uint32_t), &crc_slice, crc_buffer); - if (!s.ok() || !GetFixed32(&crc_slice, &crc_exp)) { - if (debug_level_ >= 2) { - ROCKS_LOG_ERROR( - db_options_.info_log, - "Failed to fetch blob crc file: %s blob_offset: %" PRIu64 - " blob_size: %" PRIu64 " key: %s status: '%s'", - bfile->PathName().c_str(), handle.offset(), handle.size(), - key.data(), s.ToString().c_str()); - } - return Status::NotFound("Blob Not Found as couldnt retrieve CRC"); + Slice blob_value; + s = reader->Read(blob_index.offset(), blob_index.size(), &blob_value, buffer); + if (!s.ok() || blob_value.size() != blob_index.size()) { + if (debug_level_ >= 2) { + ROCKS_LOG_ERROR(db_options_.info_log, + "Failed to read blob from file: %s blob_offset: %" PRIu64 + " blob_size: %" PRIu64 " read: %d key: %s status: '%s'", + bfile->PathName().c_str(), blob_index.offset(), + blob_index.size(), static_cast(blob_value.size()), + key.data(), s.ToString().c_str()); } - - uint32_t crc = crc32c::Extend(0, blob_value.data(), blob_value.size()); - crc = crc32c::Mask(crc); // Adjust for storage - if (crc != crc_exp) { - if (debug_level_ >= 2) { - ROCKS_LOG_ERROR(db_options_.info_log, - "Blob crc mismatch file: %s blob_offset: %" PRIu64 - " blob_size: %" PRIu64 " key: %s status: '%s'", - bfile->PathName().c_str(), handle.offset(), - handle.size(), key.data(), s.ToString().c_str()); - } - return Status::Corruption("Corruption. Blob CRC mismatch"); + return Status::NotFound("Blob Not Found as couldnt retrieve Blob"); + } + + // TODO(yiwu): Add an option to skip crc checking. 
+ Slice crc_slice; + uint32_t crc_exp; + std::string crc_str; + crc_str.resize(sizeof(uint32_t)); + char* crc_buffer = &(crc_str[0]); + s = reader->Read(blob_index.offset() - (key.size() + sizeof(uint32_t)), + sizeof(uint32_t), &crc_slice, crc_buffer); + if (!s.ok() || !GetFixed32(&crc_slice, &crc_exp)) { + if (debug_level_ >= 2) { + ROCKS_LOG_ERROR(db_options_.info_log, + "Failed to fetch blob crc file: %s blob_offset: %" PRIu64 + " blob_size: %" PRIu64 " key: %s status: '%s'", + bfile->PathName().c_str(), blob_index.offset(), + blob_index.size(), key.data(), s.ToString().c_str()); } + return Status::NotFound("Blob Not Found as couldnt retrieve CRC"); + } - if (bdb_options_.compression != kNoCompression) { - BlockContents contents; - s = UncompressBlockContentsForCompressionType( - blob_value.data(), blob_value.size(), &contents, - kBlockBasedTableVersionFormat, Slice(), bdb_options_.compression, - *(cfd->ioptions())); - *value = contents.data.ToString(); + uint32_t crc = crc32c::Value(key.data(), key.size()); + crc = crc32c::Extend(crc, blob_value.data(), blob_value.size()); + crc = crc32c::Mask(crc); // Adjust for storage + if (crc != crc_exp) { + if (debug_level_ >= 2) { + ROCKS_LOG_ERROR(db_options_.info_log, + "Blob crc mismatch file: %s blob_offset: %" PRIu64 + " blob_size: %" PRIu64 " key: %s status: '%s'", + bfile->PathName().c_str(), blob_index.offset(), + blob_index.size(), key.data(), s.ToString().c_str()); } + return Status::Corruption("Corruption. 
Blob CRC mismatch"); } - if (sequence != nullptr) { - char buffer[BlobLogRecord::kFooterSize]; - Slice footer_slice; - s = reader->Read(handle.offset() + handle.size(), - BlobLogRecord::kFooterSize, &footer_slice, buffer); - if (!s.ok()) { - return s; - } - BlobLogRecord record; - s = record.DecodeFooterFrom(footer_slice); - if (!s.ok()) { - return s; - } - *sequence = record.GetSN(); + if (bfile->compression() != kNoCompression) { + BlockContents contents; + auto cfh = reinterpret_cast(DefaultColumnFamily()); + s = UncompressBlockContentsForCompressionType( + blob_value.data(), blob_value.size(), &contents, + kBlockBasedTableVersionFormat, Slice(), bfile->compression(), + *(cfh->cfd()->ioptions())); + *(value->GetSelf()) = contents.data.ToString(); } + value->PinSelf(); + return s; } -Status BlobDBImpl::Get(const ReadOptions& options, +Status BlobDBImpl::Get(const ReadOptions& read_options, ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* value) { - auto cfh = reinterpret_cast(column_family); - auto cfd = cfh->cfd(); + if (column_family != DefaultColumnFamily()) { + return Status::NotSupported( + "Blob DB doesn't support non-default column family."); + } + // Get a snapshot to avoid blob file get deleted between we + // fetch and index entry and reading from the file. + // TODO(yiwu): For Get() retry if file not found would be a simpler strategy. 
+ ReadOptions ro(read_options); + bool snapshot_created = SetSnapshotIfNeeded(&ro); Status s; - std::string index_entry; - s = db_->Get(options, column_family, key, &index_entry); - if (!s.ok()) { - if (debug_level_ >= 3) - ROCKS_LOG_WARN(db_options_.info_log, - "Get Failed on LSM KEY: %s status: '%s'", - key.ToString().c_str(), s.ToString().c_str()); - return s; + bool is_blob_index = false; + s = db_impl_->GetImpl(ro, column_family, key, value, + nullptr /*value_found*/, &is_blob_index); + TEST_SYNC_POINT("BlobDBImpl::Get:AfterIndexEntryGet:1"); + TEST_SYNC_POINT("BlobDBImpl::Get:AfterIndexEntryGet:2"); + if (s.ok() && is_blob_index) { + std::string index_entry = value->ToString(); + value->Reset(); + s = GetBlobValue(key, index_entry, value); + } + if (snapshot_created) { + db_->ReleaseSnapshot(ro.snapshot); } - - s = CommonGet(cfd, key, index_entry, value->GetSelf()); - value->PinSelf(); return s; } -Slice BlobDBIterator::value() const { - Slice index_entry = iter_->value(); - - auto cfh = reinterpret_cast(cfh_); - auto cfd = cfh->cfd(); - - Status s = db_impl_->CommonGet(cfd, iter_->key(), index_entry.ToString(false), - &vpart_); - return Slice(vpart_); -} - std::pair BlobDBImpl::SanityCheck(bool aborted) { if (aborted) return std::make_pair(false, -1); @@ -1331,9 +1269,9 @@ std::pair BlobDBImpl::SanityCheck(bool aborted) { blob_files_.size()); ROCKS_LOG_INFO(db_options_.info_log, "Number of open files %" PRIu64, - open_blob_files_.size()); + open_ttl_files_.size()); - for (auto bfile : open_blob_files_) { + for (auto bfile : open_ttl_files_) { assert(!bfile->Immutable()); } @@ -1346,92 +1284,65 @@ std::pair BlobDBImpl::SanityCheck(bool aborted) { "Blob File %s %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64, bfile->PathName().c_str(), bfile->GetFileSize(), bfile->BlobCount(), bfile->deleted_count_, bfile->deleted_size_, - (bfile->ttl_range_.second - epoch_now)); + (bfile->expiration_range_.second - epoch_now)); } // reschedule return 
std::make_pair(true, -1); } -std::pair BlobDBImpl::CloseSeqWrite( - std::shared_ptr bfile, bool aborted) { +Status BlobDBImpl::CloseBlobFile(std::shared_ptr bfile) { + assert(bfile != nullptr); + Status s; + ROCKS_LOG_INFO(db_options_.info_log, "Close blob file %" PRIu64, + bfile->BlobFileNumber()); { WriteLock wl(&mutex_); - // this prevents others from picking up this file - open_blob_files_.erase(bfile); - - auto findit = - std::find(open_simple_files_.begin(), open_simple_files_.end(), bfile); - if (findit != open_simple_files_.end()) open_simple_files_.erase(findit); + if (bfile->HasTTL()) { + size_t erased __attribute__((__unused__)); + erased = open_ttl_files_.erase(bfile); + assert(erased == 1); + } else { + assert(bfile == open_non_ttl_file_); + open_non_ttl_file_ = nullptr; + } } if (!bfile->closed_.load()) { WriteLock lockbfile_w(&bfile->mutex_); - bfile->WriteFooterAndCloseLocked(); + s = bfile->WriteFooterAndCloseLocked(); } - return std::make_pair(false, -1); -} - -void BlobDBImpl::CloseIf(const std::shared_ptr& bfile) { - // atomic read - bool close = bfile->GetFileSize() > bdb_options_.blob_file_size; - if (!close) return; - - if (debug_level_ >= 2) { - ROCKS_LOG_DEBUG(db_options_.info_log, - "Scheduling file for close %s fsize: %" PRIu64 - " limit: %" PRIu64, - bfile->PathName().c_str(), bfile->GetFileSize(), - bdb_options_.blob_file_size); + if (!s.ok()) { + ROCKS_LOG_ERROR(db_options_.info_log, + "Failed to close blob file %" PRIu64 "with error: %s", + bfile->BlobFileNumber(), s.ToString().c_str()); } - { - WriteLock wl(&mutex_); + return s; +} - open_blob_files_.erase(bfile); - auto findit = - std::find(open_simple_files_.begin(), open_simple_files_.end(), bfile); - if (findit != open_simple_files_.end()) { - open_simple_files_.erase(findit); - } else { - ROCKS_LOG_WARN(db_options_.info_log, - "File not found while closing %s fsize: %" PRIu64 - " Multithreaded Writes?", - bfile->PathName().c_str(), bfile->GetFileSize()); - } +Status 
BlobDBImpl::CloseBlobFileIfNeeded(std::shared_ptr& bfile) { + // atomic read + if (bfile->GetFileSize() < bdb_options_.blob_file_size) { + return Status::OK(); } - - tqueue_.add(0, std::bind(&BlobDBImpl::CloseSeqWrite, this, bfile, - std::placeholders::_1)); + return CloseBlobFile(bfile); } -bool BlobDBImpl::FileDeleteOk_SnapshotCheckLocked( +bool BlobDBImpl::VisibleToActiveSnapshot( const std::shared_ptr& bfile) { assert(bfile->Obsolete()); - - SequenceNumber esn = bfile->GetSNRange().first; - - // this is not correct. - // you want to check that there are no snapshots in the - bool notok = db_impl_->HasActiveSnapshotLaterThanSN(esn); - if (notok) { - ROCKS_LOG_INFO(db_options_.info_log, - "Could not delete file due to snapshot failure %s", - bfile->PathName().c_str()); - return false; - } else { - ROCKS_LOG_INFO(db_options_.info_log, - "Will delete file due to snapshot success %s", - bfile->PathName().c_str()); - return true; - } + SequenceNumber first_sequence = bfile->GetSequenceRange().first; + SequenceNumber obsolete_sequence = bfile->GetObsoleteSequence(); + return db_impl_->HasActiveSnapshotInRange(first_sequence, obsolete_sequence); } bool BlobDBImpl::FindFileAndEvictABlob(uint64_t file_number, uint64_t key_size, uint64_t blob_offset, uint64_t blob_size) { + assert(bdb_options_.enable_garbage_collection); (void)blob_offset; std::shared_ptr bfile; { @@ -1449,27 +1360,27 @@ bool BlobDBImpl::FindFileAndEvictABlob(uint64_t file_number, uint64_t key_size, WriteLock lockbfile_w(&bfile->mutex_); bfile->deleted_count_++; - bfile->deleted_size_ += key_size + blob_size + BlobLogRecord::kHeaderSize + - BlobLogRecord::kFooterSize; + bfile->deleted_size_ += key_size + blob_size + BlobLogRecord::kHeaderSize; return true; } -bool BlobDBImpl::MarkBlobDeleted(const Slice& key, const Slice& lsmValue) { - Slice val(lsmValue); - BlobHandle handle; - Status s = handle.DecodeFrom(&val); +bool BlobDBImpl::MarkBlobDeleted(const Slice& key, const Slice& index_entry) { + 
assert(bdb_options_.enable_garbage_collection); + BlobIndex blob_index; + Status s = blob_index.DecodeFrom(index_entry); if (!s.ok()) { ROCKS_LOG_INFO(db_options_.info_log, "Could not parse lsm val in MarkBlobDeleted %s", - lsmValue.ToString().c_str()); + index_entry.ToString().c_str()); return false; } - bool succ = FindFileAndEvictABlob(handle.filenumber(), key.size(), - handle.offset(), handle.size()); + bool succ = FindFileAndEvictABlob(blob_index.file_number(), key.size(), + blob_index.offset(), blob_index.size()); return succ; } std::pair BlobDBImpl::EvictCompacted(bool aborted) { + assert(bdb_options_.enable_garbage_collection); if (aborted) return std::make_pair(false, -1); override_packet_t packet; @@ -1493,6 +1404,7 @@ std::pair BlobDBImpl::EvictCompacted(bool aborted) { } std::pair BlobDBImpl::EvictDeletions(bool aborted) { + assert(bdb_options_.enable_garbage_collection); if (aborted) return std::make_pair(false, -1); ColumnFamilyHandle* last_cfh = nullptr; @@ -1565,17 +1477,19 @@ std::pair BlobDBImpl::CheckSeqFiles(bool aborted) { uint64_t epoch_now = EpochNow(); ReadLock rl(&mutex_); - for (auto bfile : open_blob_files_) { + for (auto bfile : open_ttl_files_) { { ReadLock lockbfile_r(&bfile->mutex_); - if (bfile->ttl_range_.second > epoch_now) continue; + if (bfile->expiration_range_.second > epoch_now) continue; process_files.push_back(bfile); } } } - for (auto bfile : process_files) CloseSeqWrite(bfile, false); + for (auto bfile : process_files) { + CloseBlobFile(bfile); + } return std::make_pair(true, -1); } @@ -1583,17 +1497,19 @@ std::pair BlobDBImpl::CheckSeqFiles(bool aborted) { std::pair BlobDBImpl::FsyncFiles(bool aborted) { if (aborted) return std::make_pair(false, -1); + MutexLock l(&write_mutex_); + std::vector> process_files; { ReadLock rl(&mutex_); - for (auto fitr : open_blob_files_) { + for (auto fitr : open_ttl_files_) { if (fitr->NeedsFsync(true, bdb_options_.bytes_per_sync)) process_files.push_back(fitr); } - for (auto fitr : 
open_simple_files_) { - if (fitr->NeedsFsync(true, bdb_options_.bytes_per_sync)) - process_files.push_back(fitr); + if (open_non_ttl_file_ != nullptr && + open_non_ttl_file_->NeedsFsync(true, bdb_options_.bytes_per_sync)) { + process_files.push_back(open_non_ttl_file_); } } @@ -1610,8 +1526,9 @@ std::pair BlobDBImpl::FsyncFiles(bool aborted) { std::pair BlobDBImpl::ReclaimOpenFiles(bool aborted) { if (aborted) return std::make_pair(false, -1); - if (open_file_count_.load() < bdb_options_.open_files_trigger) + if (open_file_count_.load() < kOpenFilesTrigger) { return std::make_pair(true, -1); + } // in the future, we should sort by last_access_ // instead of closing every file @@ -1627,12 +1544,13 @@ std::pair BlobDBImpl::ReclaimOpenFiles(bool aborted) { return std::make_pair(true, -1); } +// TODO(yiwu): correct the stats and expose it. std::pair BlobDBImpl::WaStats(bool aborted) { if (aborted) return std::make_pair(false, -1); WriteLock wl(&mutex_); - if (all_periods_write_.size() < bdb_options_.wa_num_stats_periods) { + if (all_periods_write_.size() >= kWriteAmplificationStatsPeriods) { total_periods_write_ -= (*all_periods_write_.begin()); total_periods_ampl_ = (*all_periods_ampl_.begin()); @@ -1655,7 +1573,52 @@ std::pair BlobDBImpl::WaStats(bool aborted) { return std::make_pair(true, -1); } -//////////////////////////////////////////////////////////////////////////////// +// Write callback for garbage collection to check if key has been updated +// since last read. Similar to how OptimisticTransaction works. See inline +// comment in GCFileAndUpdateLSM(). 
+class BlobDBImpl::GarbageCollectionWriteCallback : public WriteCallback { + public: + GarbageCollectionWriteCallback(ColumnFamilyData* cfd, const Slice& key, + SequenceNumber upper_bound) + : cfd_(cfd), key_(key), upper_bound_(upper_bound) {} + + virtual Status Callback(DB* db) override { + auto* db_impl = reinterpret_cast(db); + auto* sv = db_impl->GetAndRefSuperVersion(cfd_); + SequenceNumber latest_seq = 0; + bool found_record_for_key = false; + bool is_blob_index = false; + Status s = db_impl->GetLatestSequenceForKey( + sv, key_, false /*cache_only*/, &latest_seq, &found_record_for_key, + &is_blob_index); + db_impl->ReturnAndCleanupSuperVersion(cfd_, sv); + if (!s.ok() && !s.IsNotFound()) { + // Error. + assert(!s.IsBusy()); + return s; + } + if (s.IsNotFound()) { + assert(!found_record_for_key); + return Status::Busy("Key deleted"); + } + assert(found_record_for_key); + assert(is_blob_index); + if (latest_seq > upper_bound_) { + return Status::Busy("Key overwritten"); + } + return s; + } + + virtual bool AllowWriteBatching() override { return false; } + + private: + ColumnFamilyData* cfd_; + // Key to check + Slice key_; + // Upper bound of sequence number to proceed. + SequenceNumber upper_bound_; +}; + // iterate over the blobs sequentially and check if the blob sequence number // is the latest. 
If it is the latest, preserve it, otherwise delete it // if it is TTL based, and the TTL has expired, then @@ -1668,10 +1631,9 @@ std::pair BlobDBImpl::WaStats(bool aborted) { // // if it is not TTL based, then we can blow the key if the key has been // DELETED in the LSM -//////////////////////////////////////////////////////////////////////////////// Status BlobDBImpl::GCFileAndUpdateLSM(const std::shared_ptr& bfptr, - GCStats* gcstats) { - uint64_t tt = EpochNow(); + GCStats* gc_stats) { + uint64_t now = EpochNow(); std::shared_ptr reader = bfptr->OpenSequentialReader(env_, db_options_, env_options_); @@ -1693,101 +1655,150 @@ Status BlobDBImpl::GCFileAndUpdateLSM(const std::shared_ptr& bfptr, bool first_gc = bfptr->gc_once_after_open_; - ColumnFamilyHandle* cfh = bfptr->GetColumnFamily(db_); - auto cfhi = reinterpret_cast(cfh); - auto cfd = cfhi->cfd(); - bool has_ttl = header.HasTTL(); + auto* cfh = + db_impl_->GetColumnFamilyHandleUnlocked(bfptr->column_family_id()); + auto* cfd = reinterpret_cast(cfh)->cfd(); + auto column_family_id = cfd->GetID(); + bool has_ttl = header.has_ttl; // this reads the key but skips the blob - Reader::ReadLevel shallow = Reader::kReadHdrKeyFooter; - - assert(opt_db_); + Reader::ReadLevel shallow = Reader::kReadHeaderKey; - bool no_relocation_ttl = (has_ttl && tt > bfptr->GetTTLRange().second); + bool no_relocation_ttl = + (has_ttl && now >= bfptr->GetExpirationRange().second); bool no_relocation_lsmdel = false; { ReadLock lockbfile_r(&bfptr->mutex_); - no_relocation_lsmdel = (bfptr->GetFileSize() == - (BlobLogHeader::kHeaderSize + bfptr->deleted_size_ + - BlobLogFooter::kFooterSize)); + no_relocation_lsmdel = + (bfptr->GetFileSize() == + (BlobLogHeader::kSize + bfptr->deleted_size_ + BlobLogFooter::kSize)); } bool no_relocation = no_relocation_ttl || no_relocation_lsmdel; if (!no_relocation) { // read the blob because you have to write it back to new file - shallow = Reader::kReadHdrKeyBlobFooter; + shallow = 
Reader::kReadHeaderKeyBlob; } BlobLogRecord record; std::shared_ptr newfile; std::shared_ptr new_writer; + uint64_t blob_offset = 0; - while (reader->ReadRecord(&record, shallow).ok()) { - gcstats->blob_count++; + while (true) { + assert(s.ok()); - bool del_this = false; - // this particular TTL has expired - if (no_relocation_ttl || (has_ttl && tt > record.GetTTL())) { - del_this = true; - } else { - SequenceNumber seq = kMaxSequenceNumber; - bool found_record_for_key = false; - SuperVersion* sv = db_impl_->GetAndRefSuperVersion(cfd); - if (sv == nullptr) { - Status result = - Status::InvalidArgument("Could not access column family 0"); - return result; - } - Status s1 = db_impl_->GetLatestSequenceForKey( - sv, record.Key(), false, &seq, &found_record_for_key); - if (s1.IsNotFound() || (!found_record_for_key || seq != record.GetSN())) { - del_this = true; - } - db_impl_->ReturnAndCleanupSuperVersion(cfd, sv); + // Read the next blob record. + Status read_record_status = + reader->ReadRecord(&record, shallow, &blob_offset); + // Exit if we reach the end of blob file. + // TODO(yiwu): properly handle ReadRecord error. + if (!read_record_status.ok()) { + break; + } + gc_stats->blob_count++; + + // Similar to OptimisticTransaction, we obtain latest_seq from + // base DB, which is guaranteed to be no smaller than the sequence of + // current key. We use a WriteCallback on write to check the key sequence + // on write. If the key sequence is larger than latest_seq, we know + // a new versions is inserted and the old blob can be disgard. + // + // We cannot use OptimisticTransaction because we need to pass + // is_blob_index flag to GetImpl. 
+ SequenceNumber latest_seq = GetLatestSequenceNumber(); + bool is_blob_index = false; + PinnableSlice index_entry; + Status get_status = db_impl_->GetImpl( + ReadOptions(), cfh, record.key, &index_entry, nullptr /*value_found*/, + &is_blob_index); + TEST_SYNC_POINT("BlobDBImpl::GCFileAndUpdateLSM:AfterGetFromBaseDB"); + if (!get_status.ok() && !get_status.IsNotFound()) { + // error + s = get_status; + ROCKS_LOG_ERROR(db_options_.info_log, + "Error while getting index entry: %s", + s.ToString().c_str()); + break; + } + if (get_status.IsNotFound() || !is_blob_index) { + // Either the key is deleted or updated with a newer version whish is + // inlined in LSM. + continue; + } + + BlobIndex blob_index; + s = blob_index.DecodeFrom(index_entry); + if (!s.ok()) { + ROCKS_LOG_ERROR(db_options_.info_log, + "Error while decoding index entry: %s", + s.ToString().c_str()); + break; + } + if (blob_index.file_number() != bfptr->BlobFileNumber() || + blob_index.offset() != blob_offset) { + // Key has been overwritten. Drop the blob record. + continue; } - if (del_this) { - gcstats->num_deletes++; - gcstats->deleted_size += record.GetBlobSize(); - if (first_gc) continue; - - Transaction* txn = static_cast(opt_db_.get()) - ->BeginTransaction(write_options_); - txn->Delete(cfh, record.Key()); - Status s1 = txn->Commit(); - // chances that this DELETE will fail is low. If it fails, it would be - // because a new version of the key came in at this time, which will - // override the current version being iterated on. - if (!s1.IsBusy()) { - // assume that failures happen due to new writes. - gcstats->overrided_while_delete++; + GarbageCollectionWriteCallback callback(cfd, record.key, latest_seq); + + // If key has expired, remove it from base DB. + // TODO(yiwu): Blob indexes will be remove by BlobIndexCompactionFilter. + // We can just drop the blob record. 
+ if (no_relocation_ttl || (has_ttl && now >= record.expiration)) { + gc_stats->num_deletes++; + gc_stats->deleted_size += record.value_size; + TEST_SYNC_POINT("BlobDBImpl::GCFileAndUpdateLSM:BeforeDelete"); + WriteBatch delete_batch; + Status delete_status = delete_batch.Delete(record.key); + if (delete_status.ok()) { + delete_status = db_impl_->WriteWithCallback(WriteOptions(), + &delete_batch, &callback); } - delete txn; + if (delete_status.ok()) { + gc_stats->delete_succeeded++; + } else if (delete_status.IsBusy()) { + // The key is overwritten in the meanwhile. Drop the blob record. + gc_stats->overwritten_while_delete++; + } else { + // We hit an error. + s = delete_status; + ROCKS_LOG_ERROR(db_options_.info_log, + "Error while deleting expired key: %s", + s.ToString().c_str()); + break; + } + // Continue to next blob record or retry. continue; - } else if (first_gc) { + } + + if (first_gc) { + // Do not relocate blob record for initial GC. continue; } + // Relocate the blob record to new file. 
if (!newfile) { // new file std::string reason("GC of "); reason += bfptr->PathName(); newfile = NewBlobFile(reason); - gcstats->newfile = newfile; + gc_stats->newfile = newfile; new_writer = CheckOrCreateWriterLocked(newfile); newfile->header_ = std::move(header); // Can't use header beyond this point newfile->header_valid_ = true; - newfile->file_size_ = BlobLogHeader::kHeaderSize; + newfile->file_size_ = BlobLogHeader::kSize; s = new_writer->WriteHeader(newfile->header_); if (!s.ok()) { ROCKS_LOG_ERROR(db_options_.info_log, "File: %s - header writing failed", newfile->PathName().c_str()); - return s; + break; } WriteLock wl(&mutex_); @@ -1796,67 +1807,84 @@ Status BlobDBImpl::GCFileAndUpdateLSM(const std::shared_ptr& bfptr, blob_files_.insert(std::make_pair(newfile->BlobFileNumber(), newfile)); } - gcstats->num_relocs++; - std::string index_entry; + gc_stats->num_relocate++; + std::string new_index_entry; - uint64_t blob_offset = 0; - uint64_t key_offset = 0; + uint64_t new_blob_offset = 0; + uint64_t new_key_offset = 0; // write the blob to the blob log. 
- s = new_writer->AddRecord(record.Key(), record.Blob(), &key_offset, - &blob_offset, record.GetTTL()); + s = new_writer->AddRecord(record.key, record.value, record.expiration, + &new_key_offset, &new_blob_offset); - BlobHandle handle; - handle.set_filenumber(newfile->BlobFileNumber()); - handle.set_size(record.Blob().size()); - handle.set_offset(blob_offset); - handle.set_compression(bdb_options_.compression); - handle.EncodeTo(&index_entry); + BlobIndex::EncodeBlob(&new_index_entry, newfile->BlobFileNumber(), + new_blob_offset, record.value.size(), + bdb_options_.compression); - new_writer->AddRecordFooter(record.GetSN()); newfile->blob_count_++; - newfile->file_size_ += BlobLogRecord::kHeaderSize + record.Key().size() + - record.Blob().size() + BlobLogRecord::kFooterSize; - - Transaction* txn = static_cast(opt_db_.get()) - ->BeginTransaction(write_options_); - txn->Put(cfh, record.Key(), index_entry); - Status s1 = txn->Commit(); - // chances that this Put will fail is low. If it fails, it would be because - // a new version of the key came in at this time, which will override - // the current version being iterated on. - if (s1.IsBusy()) { - ROCKS_LOG_INFO(db_options_.info_log, - "Optimistic transaction failed: %s put bn: %" PRIu32, - bfptr->PathName().c_str(), gcstats->blob_count); + newfile->file_size_ += + BlobLogRecord::kHeaderSize + record.key.size() + record.value.size(); + + TEST_SYNC_POINT("BlobDBImpl::GCFileAndUpdateLSM:BeforeRelocate"); + WriteBatch rewrite_batch; + Status rewrite_status = WriteBatchInternal::PutBlobIndex( + &rewrite_batch, column_family_id, record.key, new_index_entry); + if (rewrite_status.ok()) { + rewrite_status = db_impl_->WriteWithCallback(WriteOptions(), + &rewrite_batch, &callback); + } + if (rewrite_status.ok()) { + newfile->ExtendSequenceRange( + WriteBatchInternal::Sequence(&rewrite_batch)); + gc_stats->relocate_succeeded++; + } else if (rewrite_status.IsBusy()) { + // The key is overwritten in the meanwhile. 
Drop the blob record. + gc_stats->overwritten_while_relocate++; } else { - gcstats->succ_relocs++; - ROCKS_LOG_DEBUG(db_options_.info_log, - "Successfully added put back into LSM: %s bn: %" PRIu32, - bfptr->PathName().c_str(), gcstats->blob_count); + // We hit an error. + s = rewrite_status; + ROCKS_LOG_ERROR(db_options_.info_log, "Error while relocating key: %s", + s.ToString().c_str()); + break; + } + } // end of ReadRecord loop + + if (s.ok()) { + SequenceNumber obsolete_sequence = + newfile == nullptr ? bfptr->GetSequenceRange().second + 1 + : newfile->GetSequenceRange().second; + bfptr->MarkObsolete(obsolete_sequence); + if (!first_gc) { + WriteLock wl(&mutex_); + obsolete_files_.push_back(bfptr); } - delete txn; } - if (gcstats->newfile) total_blob_space_ += newfile->file_size_; - - ROCKS_LOG_INFO(db_options_.info_log, - "File: %s Num deletes %" PRIu32 " Num relocs: %" PRIu32 - " Succ Deletes: %" PRIu32 " Succ relocs: %" PRIu32, - bfptr->PathName().c_str(), gcstats->num_deletes, - gcstats->num_relocs, gcstats->succ_deletes_lsm, - gcstats->succ_relocs); - + ROCKS_LOG_INFO( + db_options_.info_log, + "%s blob file %" PRIu64 + ". Total blob records: %" PRIu64 ", Deletes: %" PRIu64 "/%" PRIu64 + " succeeded, Relocates: %" PRIu64 "/%" PRIu64 " succeeded.", + s.ok() ? 
"Successfully garbage collected" : "Failed to garbage collect", + bfptr->BlobFileNumber(), gc_stats->blob_count, gc_stats->delete_succeeded, + gc_stats->num_deletes, gc_stats->relocate_succeeded, + gc_stats->num_relocate); + if (newfile != nullptr) { + total_blob_space_ += newfile->file_size_; + ROCKS_LOG_INFO(db_options_.info_log, "New blob file %" PRIu64 ".", + newfile->BlobFileNumber()); + } return s; } // Ideally we should hold the lock during the entire function, // but under the asusmption that this is only called when a // file is Immutable, we can reduce the critical section -bool BlobDBImpl::ShouldGCFile(std::shared_ptr bfile, std::time_t tt, - uint64_t last_id, std::string* reason) { +bool BlobDBImpl::ShouldGCFile(std::shared_ptr bfile, uint64_t now, + bool is_oldest_non_ttl_file, + std::string* reason) { if (bfile->HasTTL()) { - ttlrange_t ttl_range = bfile->GetTTLRange(); - if (tt > ttl_range.second) { + ExpirationRange expiration_range = bfile->GetExpirationRange(); + if (now > expiration_range.second) { *reason = "entire file ttl expired"; return true; } @@ -1872,15 +1900,14 @@ bool BlobDBImpl::ShouldGCFile(std::shared_ptr bfile, std::time_t tt, return true; } - if (bdb_options_.ttl_range_secs < - bdb_options_.partial_expiration_gc_range_secs) { + if (bdb_options_.ttl_range_secs < kPartialExpirationGCRangeSecs) { *reason = "has ttl but partial expiration not turned on"; return false; } ReadLock lockbfile_r(&bfile->mutex_); bool ret = ((bfile->deleted_size_ * 100.0 / bfile->file_size_.load()) > - bdb_options_.partial_expiration_pct); + kPartialExpirationPercentage); if (ret) { *reason = "deleted blobs beyond threshold"; } else { @@ -1898,28 +1925,30 @@ bool BlobDBImpl::ShouldGCFile(std::shared_ptr bfile, std::time_t tt, ReadLock lockbfile_r(&bfile->mutex_); - if ((bfile->deleted_size_ * 100.0 / bfile->file_size_.load()) > - bdb_options_.partial_expiration_pct) { - *reason = "deleted simple blobs beyond threshold"; - return true; + if 
(bdb_options_.enable_garbage_collection) { + if ((bfile->deleted_size_ * 100.0 / bfile->file_size_.load()) > + kPartialExpirationPercentage) { + *reason = "deleted simple blobs beyond threshold"; + return true; + } } // if we haven't reached limits of disk space, don't DELETE - if (total_blob_space_.load() < bdb_options_.blob_dir_size) { + if (bdb_options_.blob_dir_size == 0 || + total_blob_space_.load() < bdb_options_.blob_dir_size) { *reason = "disk space not exceeded"; return false; } - bool ret = bfile->BlobFileNumber() == last_id; - if (ret) { - *reason = "eligible last simple blob file"; - } else { - *reason = "not eligible since not last simple blob file"; + if (is_oldest_non_ttl_file) { + *reason = "out of space and is the oldest simple blob file"; + return true; } - return ret; + *reason = "out of space but is not the oldest simple blob file"; + return false; } -std::pair BlobDBImpl::DeleteObsFiles(bool aborted) { +std::pair BlobDBImpl::DeleteObsoleteFiles(bool aborted) { if (aborted) return std::make_pair(false, -1); { @@ -1938,12 +1967,19 @@ std::pair BlobDBImpl::DeleteObsFiles(bool aborted) { auto bfile = *iter; { ReadLock lockbfile_r(&bfile->mutex_); - if (!FileDeleteOk_SnapshotCheckLocked(bfile)) { + if (VisibleToActiveSnapshot(bfile)) { + ROCKS_LOG_INFO(db_options_.info_log, + "Could not delete file due to snapshot failure %s", + bfile->PathName().c_str()); ++iter; continue; } } + ROCKS_LOG_INFO(db_options_.info_log, + "Will delete file due to snapshot success %s", + bfile->PathName().c_str()); + blob_files_.erase(bfile->BlobFileNumber()); Status s = env_->DeleteFile(bfile->PathName()); if (!s.ok()) { ROCKS_LOG_ERROR(db_options_.info_log, @@ -1963,109 +1999,51 @@ std::pair BlobDBImpl::DeleteObsFiles(bool aborted) { } // directory change. 
Fsync - if (file_deleted) dir_ent_->Fsync(); + if (file_deleted) { + dir_ent_->Fsync(); + + // reset oldest_file_evicted flag + oldest_file_evicted_.store(false); + } // put files back into obsolete if for some reason, delete failed if (!tobsolete.empty()) { WriteLock wl(&mutex_); - for (auto bfile : tobsolete) obsolete_files_.push_front(bfile); + for (auto bfile : tobsolete) { + obsolete_files_.push_front(bfile); + } } return std::make_pair(!aborted, -1); } -bool BlobDBImpl::CallbackEvictsImpl(std::shared_ptr bfile) { - std::shared_ptr reader = - bfile->OpenSequentialReader(env_, db_options_, env_options_); - if (!reader) { - ROCKS_LOG_ERROR( - db_options_.info_log, - "File sequential reader could not be opened for evict callback: %s", - bfile->PathName().c_str()); - return false; - } - - ReadLock lockbfile_r(&bfile->mutex_); - - BlobLogHeader header; - Status s = reader->ReadHeader(&header); - if (!s.ok()) { - ROCKS_LOG_ERROR( - db_options_.info_log, - "Failure to read header for blob-file during evict callback %s", - bfile->PathName().c_str()); - return false; - } - - ColumnFamilyHandle* cfh = bfile->GetColumnFamily(db_); - BlobLogRecord record; - Reader::ReadLevel full = Reader::kReadHdrKeyBlobFooter; - while (reader->ReadRecord(&record, full).ok()) { - bdb_options_.gc_evict_cb_fn(cfh, record.Key(), record.Blob()); - } - - return true; -} - -std::pair BlobDBImpl::RemoveTimerQ(TimerQueue* tq, - bool aborted) { - WriteLock wl(&mutex_); - for (auto itr = cb_threads_.begin(); itr != cb_threads_.end(); ++itr) { - if ((*itr).get() != tq) continue; - - cb_threads_.erase(itr); - break; - } - return std::make_pair(false, -1); -} - -std::pair BlobDBImpl::CallbackEvicts( - TimerQueue* tq, std::shared_ptr bfile, bool aborted) { - if (aborted) return std::make_pair(false, -1); - bool succ = CallbackEvictsImpl(bfile); - if (succ) { - ROCKS_LOG_DEBUG(db_options_.info_log, "Eviction callbacks completed %s", - bfile->PathName().c_str()); - } - - WriteLock wl(&mutex_); - 
bfile->SetCanBeDeleted(); - obsolete_files_.push_front(bfile); - if (tq) { - // all of the callbacks have been processed - tqueue_.add(0, std::bind(&BlobDBImpl::RemoveTimerQ, this, tq, - std::placeholders::_1)); - } - return std::make_pair(false, -1); -} - void BlobDBImpl::CopyBlobFiles( - std::vector>* bfiles_copy, uint64_t* last_id) { + std::vector>* bfiles_copy, + std::function&)> predicate) { ReadLock rl(&mutex_); - // take a copy - bfiles_copy->reserve(blob_files_.size()); - for (auto const& ent : blob_files_) { - bfiles_copy->push_back(ent.second); - - // A. has ttl is immutable, once set, hence no locks required - // B. blob files are sorted based on number(i.e. index of creation ) - // so we will return the last blob file - if (!ent.second->HasTTL()) *last_id = ent.second->BlobFileNumber(); + for (auto const& p : blob_files_) { + bool pred_value = true; + if (predicate) { + pred_value = predicate(p.second); + } + if (pred_value) { + bfiles_copy->push_back(p.second); + } } } void BlobDBImpl::FilterSubsetOfFiles( const std::vector>& blob_files, std::vector>* to_process, uint64_t epoch, - uint64_t last_id, size_t files_to_collect) { + size_t files_to_collect) { // 100.0 / 15.0 = 7 uint64_t next_epoch_increment = static_cast( - std::ceil(100 / static_cast(bdb_options_.gc_file_pct))); - std::chrono::system_clock::time_point now = std::chrono::system_clock::now(); - std::time_t tt = std::chrono::system_clock::to_time_t(now); + std::ceil(100 / static_cast(kGCFilePercentage))); + uint64_t now = EpochNow(); size_t files_processed = 0; + bool non_ttl_file_found = false; for (auto bfile : blob_files) { if (files_processed >= files_to_collect) break; // if this is the first time processing the file @@ -2085,20 +2063,28 @@ void BlobDBImpl::FilterSubsetOfFiles( // then it should not be GC'd if (bfile->Obsolete() || !bfile->Immutable()) continue; + bool is_oldest_non_ttl_file = false; + if (!non_ttl_file_found && !bfile->HasTTL()) { + is_oldest_non_ttl_file = true; + 
non_ttl_file_found = true; + } + std::string reason; - bool shouldgc = ShouldGCFile(bfile, tt, last_id, &reason); + bool shouldgc = ShouldGCFile(bfile, now, is_oldest_non_ttl_file, &reason); if (!shouldgc) { ROCKS_LOG_DEBUG(db_options_.info_log, - "File has been skipped for GC ttl %s %d %d reason='%s'", - bfile->PathName().c_str(), tt, - bfile->GetTTLRange().second, reason.c_str()); + "File has been skipped for GC ttl %s %" PRIu64 " %" PRIu64 + " reason='%s'", + bfile->PathName().c_str(), now, + bfile->GetExpirationRange().second, reason.c_str()); continue; } ROCKS_LOG_INFO(db_options_.info_log, - "File has been chosen for GC ttl %s %d %d reason='%s'", - bfile->PathName().c_str(), tt, bfile->GetTTLRange().second, - reason.c_str()); + "File has been chosen for GC ttl %s %" PRIu64 " %" PRIu64 + " reason='%s'", + bfile->PathName().c_str(), now, + bfile->GetExpirationRange().second, reason.c_str()); to_process->push_back(bfile); } } @@ -2108,73 +2094,54 @@ std::pair BlobDBImpl::RunGC(bool aborted) { current_epoch_++; - // collect the ID of the last regular file, in case we need to GC it. - uint64_t last_id = std::numeric_limits::max(); - std::vector> blob_files; - CopyBlobFiles(&blob_files, &last_id); + CopyBlobFiles(&blob_files); if (!blob_files.size()) return std::make_pair(true, -1); // 15% of files are collected each call to space out the IO and CPU // consumption. 
- size_t files_to_collect = - (bdb_options_.gc_file_pct * blob_files.size()) / 100; + size_t files_to_collect = (kGCFilePercentage * blob_files.size()) / 100; std::vector> to_process; - FilterSubsetOfFiles(blob_files, &to_process, current_epoch_, last_id, + FilterSubsetOfFiles(blob_files, &to_process, current_epoch_, files_to_collect); - // in this collect the set of files, which became obsolete - std::vector> obsoletes; for (auto bfile : to_process) { - GCStats gcstats; - Status s = GCFileAndUpdateLSM(bfile, &gcstats); - if (!s.ok()) continue; + GCStats gc_stats; + Status s = GCFileAndUpdateLSM(bfile, &gc_stats); + if (!s.ok()) { + continue; + } if (bfile->gc_once_after_open_.load()) { WriteLock lockbfile_w(&bfile->mutex_); - bfile->deleted_size_ = gcstats.deleted_size; - bfile->deleted_count_ = gcstats.num_deletes; + bfile->deleted_size_ = gc_stats.deleted_size; + bfile->deleted_count_ = gc_stats.num_deletes; bfile->gc_once_after_open_ = false; - } else { - obsoletes.push_back(bfile); } } - if (!obsoletes.empty()) { - bool evict_cb = (!!bdb_options_.gc_evict_cb_fn); - std::shared_ptr tq; - if (evict_cb) tq = std::make_shared(); - - // if evict callback is present, first schedule the callback thread - WriteLock wl(&mutex_); - for (auto bfile : obsoletes) { - bool last_file = (bfile == obsoletes.back()); - // remove from global list so writers - blob_files_.erase(bfile->BlobFileNumber()); - - if (!evict_cb) { - bfile->SetCanBeDeleted(); - obsolete_files_.push_front(bfile); - } else { - tq->add(0, std::bind(&BlobDBImpl::CallbackEvicts, this, - (last_file) ? 
tq.get() : nullptr, bfile, - std::placeholders::_1)); - } - } - if (evict_cb) cb_threads_.emplace_back(tq); - } - // reschedule return std::make_pair(true, -1); } -Iterator* BlobDBImpl::NewIterator(const ReadOptions& opts, - ColumnFamilyHandle* column_family) { - return new BlobDBIterator(db_->NewIterator(opts, column_family), - column_family, this); +Iterator* BlobDBImpl::NewIterator(const ReadOptions& read_options) { + auto* cfd = + reinterpret_cast(DefaultColumnFamily())->cfd(); + // Get a snapshot to avoid blob file get deleted between we + // fetch and index entry and reading from the file. + ManagedSnapshot* own_snapshot = nullptr; + const Snapshot* snapshot = read_options.snapshot; + if (snapshot == nullptr) { + own_snapshot = new ManagedSnapshot(db_); + snapshot = own_snapshot->snapshot(); + } + auto* iter = db_impl_->NewIteratorImpl( + read_options, cfd, snapshot->GetSequenceNumber(), + true /*allow_blob*/); + return new BlobDBIterator(own_snapshot, iter, this); } Status DestroyBlobDB(const std::string& dbname, const Options& options, @@ -2211,18 +2178,13 @@ Status DestroyBlobDB(const std::string& dbname, const Options& options, } #ifndef NDEBUG -Status BlobDBImpl::TEST_GetSequenceNumber(const Slice& key, - SequenceNumber* sequence) { - std::string index_entry; - Status s = db_->Get(ReadOptions(), key, &index_entry); - if (!s.ok()) { - return s; - } - auto cfh = reinterpret_cast(DefaultColumnFamily()); - return CommonGet(cfh->cfd(), key, index_entry, nullptr, sequence); +Status BlobDBImpl::TEST_GetBlobValue(const Slice& key, const Slice& index_entry, + PinnableSlice* value) { + return GetBlobValue(key, index_entry, value); } std::vector> BlobDBImpl::TEST_GetBlobFiles() const { + ReadLock l(&mutex_); std::vector> blob_files; for (auto& p : blob_files_) { blob_files.emplace_back(p.second); @@ -2230,14 +2192,30 @@ std::vector> BlobDBImpl::TEST_GetBlobFiles() const { return blob_files; } -void BlobDBImpl::TEST_CloseBlobFile(std::shared_ptr& bfile) { - 
CloseSeqWrite(bfile, false /*abort*/); +std::vector> BlobDBImpl::TEST_GetObsoleteFiles() + const { + ReadLock l(&mutex_); + std::vector> obsolete_files; + for (auto& bfile : obsolete_files_) { + obsolete_files.emplace_back(bfile); + } + return obsolete_files; +} + +void BlobDBImpl::TEST_DeleteObsoleteFiles() { + DeleteObsoleteFiles(false /*abort*/); +} + +Status BlobDBImpl::TEST_CloseBlobFile(std::shared_ptr& bfile) { + return CloseBlobFile(bfile); } Status BlobDBImpl::TEST_GCFileAndUpdateLSM(std::shared_ptr& bfile, GCStats* gc_stats) { return GCFileAndUpdateLSM(bfile, gc_stats); } + +void BlobDBImpl::TEST_RunGC() { RunGC(false /*abort*/); } #endif // !NDEBUG } // namespace blob_db diff --git a/utilities/blob_db/blob_db_impl.h b/utilities/blob_db/blob_db_impl.h index 8da5bbf65..9881107d3 100644 --- a/utilities/blob_db/blob_db_impl.h +++ b/utilities/blob_db/blob_db_impl.h @@ -9,27 +9,27 @@ #include #include -#include #include #include #include #include #include #include +#include #include #include +#include "db/db_iter.h" #include "rocksdb/compaction_filter.h" #include "rocksdb/db.h" #include "rocksdb/listener.h" #include "rocksdb/options.h" #include "rocksdb/wal_filter.h" -#include "util/file_reader_writer.h" #include "util/mpsc.h" #include "util/mutexlock.h" #include "util/timer_queue.h" #include "utilities/blob_db/blob_db.h" -#include "utilities/blob_db/blob_db_options_impl.h" +#include "utilities/blob_db/blob_file.h" #include "utilities/blob_db/blob_log_format.h" #include "utilities/blob_db/blob_log_reader.h" #include "utilities/blob_db/blob_log_writer.h" @@ -39,7 +39,6 @@ namespace rocksdb { class DBImpl; class ColumnFamilyHandle; class ColumnFamilyData; -class OptimisticTransactionDBImpl; struct FlushJobInfo; namespace blob_db { @@ -138,10 +137,13 @@ struct GCStats { uint64_t blob_count = 0; uint64_t num_deletes = 0; uint64_t deleted_size = 0; - uint64_t num_relocs = 0; - uint64_t succ_deletes_lsm = 0; - uint64_t overrided_while_delete = 0; - uint64_t 
succ_relocs = 0; + uint64_t retry_delete = 0; + uint64_t delete_succeeded = 0; + uint64_t overwritten_while_delete = 0; + uint64_t num_relocate = 0; + uint64_t retry_relocate = 0; + uint64_t relocate_succeeded = 0; + uint64_t overwritten_while_relocate = 0; std::shared_ptr newfile = nullptr; }; @@ -158,48 +160,98 @@ class BlobDBImpl : public BlobDB { friend class BlobDBIterator; public: - static constexpr uint64_t kNoExpiration = - std::numeric_limits::max(); + // deletions check period + static constexpr uint32_t kDeleteCheckPeriodMillisecs = 2 * 1000; - using rocksdb::StackableDB::Put; - Status Put(const WriteOptions& options, ColumnFamilyHandle* column_family, - const Slice& key, const Slice& value) override; + // gc percentage each check period + static constexpr uint32_t kGCFilePercentage = 100; - using rocksdb::StackableDB::Delete; - Status Delete(const WriteOptions& options, ColumnFamilyHandle* column_family, - const Slice& key) override; + // gc period + static constexpr uint32_t kGCCheckPeriodMillisecs = 60 * 1000; - using rocksdb::StackableDB::SingleDelete; - virtual Status SingleDelete(const WriteOptions& wopts, - ColumnFamilyHandle* column_family, - const Slice& key) override; + // sanity check task + static constexpr uint32_t kSanityCheckPeriodMillisecs = 20 * 60 * 1000; - using rocksdb::StackableDB::Get; - Status Get(const ReadOptions& options, ColumnFamilyHandle* column_family, + // how many random access open files can we tolerate + static constexpr uint32_t kOpenFilesTrigger = 100; + + // how many periods of stats do we keep. + static constexpr uint32_t kWriteAmplificationStatsPeriods = 24; + + // what is the length of any period + static constexpr uint32_t kWriteAmplificationStatsPeriodMillisecs = + 3600 * 1000; + + // we will garbage collect blob files in + // which entire files have expired. However if the + // ttl_range of files is very large say a day, we + // would have to wait for the entire day, before we + // recover most of the space. 
+ static constexpr uint32_t kPartialExpirationGCRangeSecs = 4 * 3600; + + // this should be based on allowed Write Amplification + // if 50% of the space of a blob file has been deleted/expired, + static constexpr uint32_t kPartialExpirationPercentage = 75; + + // how often should we schedule a job to fsync open files + static constexpr uint32_t kFSyncFilesPeriodMillisecs = 10 * 1000; + + // how often to schedule reclaim open files. + static constexpr uint32_t kReclaimOpenFilesPeriodMillisecs = 1 * 1000; + + // how often to schedule delete obs files periods + static constexpr uint32_t kDeleteObsoleteFilesPeriodMillisecs = 10 * 1000; + + // how often to schedule check seq files period + static constexpr uint32_t kCheckSeqFilesPeriodMillisecs = 10 * 1000; + + // when should oldest file be evicted: + // on reaching 90% of blob_dir_size + static constexpr double kEvictOldestFileAtSize = 0.9; + + using BlobDB::Put; + Status Put(const WriteOptions& options, const Slice& key, + const Slice& value) override; + + using BlobDB::Delete; + Status Delete(const WriteOptions& options, const Slice& key) override; + + using BlobDB::Get; + Status Get(const ReadOptions& read_options, ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* value) override; - using rocksdb::StackableDB::NewIterator; - virtual Iterator* NewIterator(const ReadOptions& opts, - ColumnFamilyHandle* column_family) override; + using BlobDB::NewIterator; + virtual Iterator* NewIterator(const ReadOptions& read_options) override; - using rocksdb::StackableDB::MultiGet; + using BlobDB::NewIterators; + virtual Status NewIterators( + const ReadOptions& read_options, + const std::vector& column_families, + std::vector* iterators) override { + return Status::NotSupported("Not implemented"); + } + + using BlobDB::MultiGet; virtual std::vector MultiGet( - const ReadOptions& options, - const std::vector& column_family, + const ReadOptions& read_options, const std::vector& keys, std::vector* values) override; 
virtual Status Write(const WriteOptions& opts, WriteBatch* updates) override; + virtual Status GetLiveFiles(std::vector&, + uint64_t* manifest_file_size, + bool flush_memtable = true) override; + virtual void GetLiveFilesMetaData( + std::vector* ) override; + using BlobDB::PutWithTTL; - Status PutWithTTL(const WriteOptions& options, - ColumnFamilyHandle* column_family, const Slice& key, - const Slice& value, int32_t ttl) override; + Status PutWithTTL(const WriteOptions& options, const Slice& key, + const Slice& value, uint64_t ttl) override; using BlobDB::PutUntil; - Status PutUntil(const WriteOptions& options, - ColumnFamilyHandle* column_family, const Slice& key, - const Slice& value_unc, int32_t expiration) override; + Status PutUntil(const WriteOptions& options, const Slice& key, + const Slice& value, uint64_t expiration) override; Status LinkToBaseDB(DB* db) override; @@ -213,22 +265,35 @@ class BlobDBImpl : public BlobDB { ~BlobDBImpl(); #ifndef NDEBUG - Status TEST_GetSequenceNumber(const Slice& key, SequenceNumber* sequence); + Status TEST_GetBlobValue(const Slice& key, const Slice& index_entry, + PinnableSlice* value); std::vector> TEST_GetBlobFiles() const; - void TEST_CloseBlobFile(std::shared_ptr& bfile); + std::vector> TEST_GetObsoleteFiles() const; + + Status TEST_CloseBlobFile(std::shared_ptr& bfile); Status TEST_GCFileAndUpdateLSM(std::shared_ptr& bfile, GCStats* gc_stats); + + void TEST_RunGC(); + + void TEST_DeleteObsoleteFiles(); #endif // !NDEBUG private: + class GarbageCollectionWriteCallback; + class BlobInserter; + Status OpenPhase1(); - Status CommonGet(const ColumnFamilyData* cfd, const Slice& key, - const std::string& index_entry, std::string* value, - SequenceNumber* sequence = nullptr); + // Create a snapshot if there isn't one in read options. + // Return true if a snapshot is created. 
+ bool SetSnapshotIfNeeded(ReadOptions* read_options); + + Status GetBlobValue(const Slice& key, const Slice& index_entry, + PinnableSlice* value); Slice GetCompressedSlice(const Slice& raw, std::string* compression_output) const; @@ -237,44 +302,42 @@ class BlobDBImpl : public BlobDB { // this handler is called. void OnFlushBeginHandler(DB* db, const FlushJobInfo& info); - // timer queue callback to close a file by appending a footer - // removes file from open files list - std::pair CloseSeqWrite(std::shared_ptr bfile, - bool aborted); - // is this file ready for Garbage collection. if the TTL of the file // has expired or if threshold of the file has been evicted // tt - current time // last_id - the id of the non-TTL file to evict - bool ShouldGCFile(std::shared_ptr bfile, std::time_t tt, - uint64_t last_id, std::string* reason); + bool ShouldGCFile(std::shared_ptr bfile, uint64_t now, + bool is_oldest_non_ttl_file, std::string* reason); // collect all the blob log files from the blob directory Status GetAllLogFiles(std::set>* file_nums); - // appends a task into timer queue to close the file - void CloseIf(const std::shared_ptr& bfile); + // Close a file by appending a footer, and removes file from open files list. 
+ Status CloseBlobFile(std::shared_ptr bfile); + + // Close a file if its size exceeds blob_file_size + Status CloseBlobFileIfNeeded(std::shared_ptr& bfile); - int32_t ExtractExpiration(const Slice& key, const Slice& value, - Slice* value_slice, std::string* new_value); + uint64_t ExtractExpiration(const Slice& key, const Slice& value, + Slice* value_slice, std::string* new_value); + + Status PutBlobValue(const WriteOptions& options, const Slice& key, + const Slice& value, uint64_t expiration, + SequenceNumber sequence, WriteBatch* batch); Status AppendBlob(const std::shared_ptr& bfile, const std::string& headerbuf, const Slice& key, - const Slice& value, std::string* index_entry); - - Status AppendSN(const std::shared_ptr& bfile, - const SequenceNumber& sn); + const Slice& value, uint64_t expiration, + std::string* index_entry); // find an existing blob log file based on the expiration unix epoch // if such a file does not exist, return nullptr - std::shared_ptr SelectBlobFileTTL(uint32_t expiration); + std::shared_ptr SelectBlobFileTTL(uint64_t expiration); // find an existing blob log file to append the value to std::shared_ptr SelectBlobFile(); - std::shared_ptr FindBlobFileLocked(uint32_t expiration) const; - - void UpdateWriteOptions(const WriteOptions& options); + std::shared_ptr FindBlobFileLocked(uint64_t expiration) const; void Shutdown(); @@ -284,7 +347,7 @@ class BlobDBImpl : public BlobDB { // delete files which have been garbage collected and marked // obsolete. 
Check whether any snapshots exist which refer to // the same - std::pair DeleteObsFiles(bool aborted); + std::pair DeleteObsoleteFiles(bool aborted); // Major task to garbage collect expired and deleted blobs std::pair RunGC(bool aborted); @@ -309,14 +372,8 @@ class BlobDBImpl : public BlobDB { std::pair EvictCompacted(bool aborted); - bool CallbackEvictsImpl(std::shared_ptr bfile); - std::pair RemoveTimerQ(TimerQueue* tq, bool aborted); - std::pair CallbackEvicts(TimerQueue* tq, - std::shared_ptr bfile, - bool aborted); - // Adds the background tasks to the timer queue void StartBackgroundTasks(); @@ -352,6 +409,7 @@ class BlobDBImpl : public BlobDB { // checks if there is no snapshot which is referencing the // blobs + bool VisibleToActiveSnapshot(const std::shared_ptr& file); bool FileDeleteOk_SnapshotCheckLocked(const std::shared_ptr& bfile); bool MarkBlobDeleted(const Slice& key, const Slice& lsmValue); @@ -359,31 +417,30 @@ class BlobDBImpl : public BlobDB { bool FindFileAndEvictABlob(uint64_t file_number, uint64_t key_size, uint64_t blob_offset, uint64_t blob_size); - void CopyBlobFiles(std::vector>* bfiles_copy, - uint64_t* last_id); + void CopyBlobFiles( + std::vector>* bfiles_copy, + std::function&)> predicate = {}); void FilterSubsetOfFiles( const std::vector>& blob_files, std::vector>* to_process, uint64_t epoch, - uint64_t last_id, size_t files_to_collect); + size_t files_to_collect); uint64_t EpochNow() { return env_->NowMicros() / 1000000; } + Status CheckSize(size_t blob_size); + + std::shared_ptr GetOldestBlobFile(); + + bool EvictOldestBlobFile(); + // the base DB DBImpl* db_impl_; Env* env_; TTLExtractor* ttl_extractor_; - // Optimistic Transaction DB used during Garbage collection - // for atomicity - std::unique_ptr opt_db_; - - // a boolean to capture whether write_options has been set - std::atomic wo_set_; - WriteOptions write_options_; - // the options that govern the behavior of Blob Storage - BlobDBOptionsImpl bdb_options_; + 
BlobDBOptions bdb_options_; DBOptions db_options_; EnvOptions env_options_; @@ -401,23 +458,26 @@ class BlobDBImpl : public BlobDB { // Read Write Mutex, which protects all the data structures // HEAVILY TRAFFICKED - port::RWMutex mutex_; + mutable port::RWMutex mutex_; + + // Writers has to hold write_mutex_ before writing. + mutable port::Mutex write_mutex_; // counter for blob file number std::atomic next_file_number_; // entire metadata of all the BLOB files memory - std::unordered_map> blob_files_; + std::map> blob_files_; // epoch or version of the open files. std::atomic epoch_of_; - // typically we keep 4 open blob files (simple i.e. no TTL) - std::vector> open_simple_files_; + // opened non-TTL blob file. + std::shared_ptr open_non_ttl_file_; // all the blob files which are currently being appended to based // on variety of incoming TTL's - std::multiset, blobf_compare_ttl> open_blob_files_; + std::multiset, blobf_compare_ttl> open_ttl_files_; // packet of information to put in lockess delete(s) queue struct delete_packet_t { @@ -450,9 +510,6 @@ class BlobDBImpl : public BlobDB { // timer based queue to execute tasks TimerQueue tqueue_; - // timer queues to call eviction callbacks. - std::vector> cb_threads_; - // only accessed in GC thread, hence not atomic. The epoch of the // GC task. Each execution is one epoch. Helps us in allocating // files to one execution @@ -480,208 +537,8 @@ class BlobDBImpl : public BlobDB { bool open_p1_done_; uint32_t debug_level_; -}; - -class BlobFile { - friend class BlobDBImpl; - friend struct blobf_compare_ttl; - - private: - // access to parent - const BlobDBImpl* parent_; - - // path to blob directory - std::string path_to_dir_; - - // the id of the file. 
- // the above 2 are created during file creation and never changed - // after that - uint64_t file_number_; - - // number of blobs in the file - std::atomic blob_count_; - - // the file will be selected for GC in this future epoch - std::atomic gc_epoch_; - - // size of the file - std::atomic file_size_; - - // number of blobs in this particular file which have been evicted - uint64_t deleted_count_; - - // size of deleted blobs (used by heuristic to select file for GC) - uint64_t deleted_size_; - - BlobLogHeader header_; - - // closed_ = true implies the file is no more mutable - // no more blobs will be appended and the footer has been written out - std::atomic closed_; - - // has a pass of garbage collection successfully finished on this file - // can_be_deleted_ still needs to do iterator/snapshot checks - std::atomic can_be_deleted_; - - // should this file been gc'd once to reconcile lost deletes/compactions - std::atomic gc_once_after_open_; - - // et - lt of the blobs - ttlrange_t ttl_range_; - - // et - lt of the timestamp of the KV pairs. - tsrange_t time_range_; - - // ESN - LSN of the blobs - snrange_t sn_range_; - - // Sequential/Append writer for blobs - std::shared_ptr log_writer_; - - // random access file reader for GET calls - std::shared_ptr ra_file_reader_; - - // This Read-Write mutex is per file specific and protects - // all the datastructures - port::RWMutex mutex_; - - // time when the random access reader was last created. - std::atomic last_access_; - - // last time file was fsync'd/fdatasyncd - std::atomic last_fsync_; - bool header_valid_; - - public: - BlobFile(); - - BlobFile(const BlobDBImpl* parent, const std::string& bdir, uint64_t fnum); - - ~BlobFile(); - - ColumnFamilyHandle* GetColumnFamily(DB* db); - - // Returns log file's pathname relative to the main db dir - // Eg. For a live-log-file = blob_dir/000003.blob - std::string PathName() const; - - // Primary identifier for blob file. 
- // once the file is created, this never changes - uint64_t BlobFileNumber() const { return file_number_; } - - // the following functions are atomic, and don't need - // read lock - uint64_t BlobCount() const { - return blob_count_.load(std::memory_order_acquire); - } - - std::string DumpState() const; - - // if the file has gone through GC and blobs have been relocated - bool Obsolete() const { return can_be_deleted_.load(); } - - // if the file is not taking any more appends. - bool Immutable() const { return closed_.load(); } - - // we will assume this is atomic - bool NeedsFsync(bool hard, uint64_t bytes_per_sync) const; - - uint64_t GetFileSize() const { - return file_size_.load(std::memory_order_acquire); - } - - // All Get functions which are not atomic, will need ReadLock on the mutex - tsrange_t GetTimeRange() const { - assert(HasTimestamp()); - return time_range_; - } - - ttlrange_t GetTTLRange() const { return ttl_range_; } - - snrange_t GetSNRange() const { return sn_range_; } - - bool HasTTL() const { - assert(header_valid_); - return header_.HasTTL(); - } - - bool HasTimestamp() const { - assert(header_valid_); - return header_.HasTimestamp(); - } - - std::shared_ptr GetWriter() const { return log_writer_; } - - void Fsync(); - - private: - std::shared_ptr OpenSequentialReader( - Env* env, const DBOptions& db_options, - const EnvOptions& env_options) const; - - Status ReadFooter(BlobLogFooter* footer); - - Status WriteFooterAndCloseLocked(); - - std::shared_ptr GetOrOpenRandomAccessReader( - Env* env, const EnvOptions& env_options, bool* fresh_open); - - void CloseRandomAccessLocked(); - - // this is used, when you are reading only the footer of a - // previously closed file - Status SetFromFooterLocked(const BlobLogFooter& footer); - - void set_time_range(const tsrange_t& tr) { time_range_ = tr; } - - void set_ttl_range(const ttlrange_t& ttl) { ttl_range_ = ttl; } - - void SetSNRange(const snrange_t& snr) { sn_range_ = snr; } - - // The following 
functions are atomic, and don't need locks - void SetFileSize(uint64_t fs) { file_size_ = fs; } - - void SetBlobCount(uint64_t bc) { blob_count_ = bc; } - - void SetCanBeDeleted() { can_be_deleted_ = true; } -}; - -class BlobDBIterator : public Iterator { - public: - explicit BlobDBIterator(Iterator* iter, ColumnFamilyHandle* column_family, - BlobDBImpl* impl) - : iter_(iter), cfh_(column_family), db_impl_(impl) { - assert(iter_); - } - - ~BlobDBIterator() { delete iter_; } - - bool Valid() const override { return iter_->Valid(); } - - void SeekToFirst() override { iter_->SeekToFirst(); } - - void SeekToLast() override { iter_->SeekToLast(); } - - void Seek(const Slice& target) override { iter_->Seek(target); } - - void SeekForPrev(const Slice& target) override { iter_->SeekForPrev(target); } - - void Next() override { iter_->Next(); } - - void Prev() override { iter_->Prev(); } - - Slice key() const override { return iter_->key(); } - - Slice value() const override; - - Status status() const override { return iter_->status(); } - - private: - Iterator* iter_; - ColumnFamilyHandle* cfh_; - BlobDBImpl* db_impl_; - mutable std::string vpart_; + std::atomic oldest_file_evicted_; }; } // namespace blob_db diff --git a/utilities/blob_db/blob_db_iterator.h b/utilities/blob_db/blob_db_iterator.h new file mode 100644 index 000000000..c8aa1ff17 --- /dev/null +++ b/utilities/blob_db/blob_db_iterator.h @@ -0,0 +1,104 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#pragma once +#ifndef ROCKSDB_LITE + +#include "rocksdb/iterator.h" +#include "utilities/blob_db/blob_db_impl.h" + +namespace rocksdb { +namespace blob_db { + +using rocksdb::ManagedSnapshot; + +class BlobDBIterator : public Iterator { + public: + BlobDBIterator(ManagedSnapshot* snapshot, ArenaWrappedDBIter* iter, + BlobDBImpl* blob_db) + : snapshot_(snapshot), iter_(iter), blob_db_(blob_db) {} + + virtual ~BlobDBIterator() = default; + + bool Valid() const override { + if (!iter_->Valid()) { + return false; + } + return status_.ok(); + } + + Status status() const override { + if (!iter_->status().ok()) { + return iter_->status(); + } + return status_; + } + + void SeekToFirst() override { + iter_->SeekToFirst(); + UpdateBlobValue(); + } + + void SeekToLast() override { + iter_->SeekToLast(); + UpdateBlobValue(); + } + + void Seek(const Slice& target) override { + iter_->Seek(target); + UpdateBlobValue(); + } + + void SeekForPrev(const Slice& target) override { + iter_->SeekForPrev(target); + UpdateBlobValue(); + } + + void Next() override { + assert(Valid()); + iter_->Next(); + UpdateBlobValue(); + } + + void Prev() override { + assert(Valid()); + iter_->Prev(); + UpdateBlobValue(); + } + + Slice key() const override { + assert(Valid()); + return iter_->key(); + } + + Slice value() const override { + assert(Valid()); + if (!iter_->IsBlob()) { + return iter_->value(); + } + return value_; + } + + // Iterator::Refresh() not supported. 
+ + private: + void UpdateBlobValue() { + TEST_SYNC_POINT("BlobDBIterator::UpdateBlobValue:Start:1"); + TEST_SYNC_POINT("BlobDBIterator::UpdateBlobValue:Start:2"); + value_.Reset(); + if (iter_->Valid() && iter_->IsBlob()) { + status_ = blob_db_->GetBlobValue(iter_->key(), iter_->value(), &value_); + } + } + + std::unique_ptr snapshot_; + std::unique_ptr iter_; + BlobDBImpl* blob_db_; + Status status_; + PinnableSlice value_; +}; +} // namespace blob_db +} // namespace rocksdb +#endif // !ROCKSDB_LITE diff --git a/utilities/blob_db/blob_db_options_impl.cc b/utilities/blob_db/blob_db_options_impl.cc deleted file mode 100644 index 263213d8e..000000000 --- a/utilities/blob_db/blob_db_options_impl.cc +++ /dev/null @@ -1,69 +0,0 @@ -// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. -// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). -#ifndef ROCKSDB_LITE - -#include "utilities/blob_db/blob_db_options_impl.h" - -namespace rocksdb { - -namespace blob_db { - -BlobDBOptionsImpl::BlobDBOptionsImpl(const BlobDBOptions& in) - : BlobDBOptions(in), - deletion_check_period_millisecs(2 * 1000), - gc_file_pct(20), - gc_check_period_millisecs(60 * 1000), - sanity_check_period_millisecs(20 * 60 * 1000), - open_files_trigger(100), - wa_num_stats_periods(24), - wa_stats_period_millisecs(3600 * 1000), - partial_expiration_gc_range_secs(4 * 3600), - partial_expiration_pct(75), - fsync_files_period_millisecs(10 * 1000), - reclaim_of_period_millisecs(1 * 1000), - delete_obsf_period_millisecs(10 * 1000), - check_seqf_period_millisecs(10 * 1000), - disable_background_tasks(false) {} - -BlobDBOptionsImpl::BlobDBOptionsImpl() - : deletion_check_period_millisecs(2 * 1000), - gc_file_pct(20), - gc_check_period_millisecs(60 * 1000), - sanity_check_period_millisecs(20 * 60 * 1000), - open_files_trigger(100), - wa_num_stats_periods(24), - 
wa_stats_period_millisecs(3600 * 1000), - partial_expiration_gc_range_secs(4 * 3600), - partial_expiration_pct(75), - fsync_files_period_millisecs(10 * 1000), - reclaim_of_period_millisecs(1 * 1000), - delete_obsf_period_millisecs(10 * 1000), - check_seqf_period_millisecs(10 * 1000), - disable_background_tasks(false) {} - -BlobDBOptionsImpl& BlobDBOptionsImpl::operator=(const BlobDBOptionsImpl& in) { - BlobDBOptions::operator=(in); - if (this != &in) { - deletion_check_period_millisecs = in.deletion_check_period_millisecs; - gc_file_pct = in.gc_file_pct; - gc_check_period_millisecs = in.gc_check_period_millisecs; - sanity_check_period_millisecs = in.sanity_check_period_millisecs; - open_files_trigger = in.open_files_trigger; - wa_num_stats_periods = in.wa_num_stats_periods; - wa_stats_period_millisecs = in.wa_stats_period_millisecs; - partial_expiration_gc_range_secs = in.partial_expiration_gc_range_secs; - partial_expiration_pct = in.partial_expiration_pct; - fsync_files_period_millisecs = in.fsync_files_period_millisecs; - reclaim_of_period_millisecs = in.reclaim_of_period_millisecs; - delete_obsf_period_millisecs = in.delete_obsf_period_millisecs; - check_seqf_period_millisecs = in.check_seqf_period_millisecs; - disable_background_tasks = in.disable_background_tasks; - } - return *this; -} - -} // namespace blob_db -} // namespace rocksdb -#endif // ROCKSDB_LITE diff --git a/utilities/blob_db/blob_db_options_impl.h b/utilities/blob_db/blob_db_options_impl.h deleted file mode 100644 index 0ee0aa920..000000000 --- a/utilities/blob_db/blob_db_options_impl.h +++ /dev/null @@ -1,76 +0,0 @@ -// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. -// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). 
- -#pragma once - -#ifndef ROCKSDB_LITE - -#include "utilities/blob_db/blob_db.h" - -namespace rocksdb { - -namespace blob_db { - -struct BlobDBOptionsImpl : public BlobDBOptions { - // deletions check period - uint32_t deletion_check_period_millisecs; - - // gc percentage each check period - uint32_t gc_file_pct; - - // gc period - uint32_t gc_check_period_millisecs; - - // sanity check task - uint32_t sanity_check_period_millisecs; - - // how many random access open files can we tolerate - uint32_t open_files_trigger; - - // how many periods of stats do we keep. - uint32_t wa_num_stats_periods; - - // what is the length of any period - uint32_t wa_stats_period_millisecs; - - // we will garbage collect blob files in - // which entire files have expired. However if the - // ttl_range of files is very large say a day, we - // would have to wait for the entire day, before we - // recover most of the space. - uint32_t partial_expiration_gc_range_secs; - - // this should be based on allowed Write Amplification - // if 50% of the space of a blob file has been deleted/expired, - uint32_t partial_expiration_pct; - - // how often should we schedule a job to fsync open files - uint32_t fsync_files_period_millisecs; - - // how often to schedule reclaim open files. - uint32_t reclaim_of_period_millisecs; - - // how often to schedule delete obs files periods - uint32_t delete_obsf_period_millisecs; - - // how often to schedule check seq files period - uint32_t check_seqf_period_millisecs; - - // Disable all background job. 
- bool disable_background_tasks; - - // default constructor - BlobDBOptionsImpl(); - - explicit BlobDBOptionsImpl(const BlobDBOptions& in); - - BlobDBOptionsImpl& operator=(const BlobDBOptionsImpl& in); -}; - -} // namespace blob_db - -} // namespace rocksdb - -#endif // endif ROCKSDB diff --git a/utilities/blob_db/blob_db_test.cc b/utilities/blob_db/blob_db_test.cc index 6a43f6b77..03396eed3 100644 --- a/utilities/blob_db/blob_db_test.cc +++ b/utilities/blob_db/blob_db_test.cc @@ -5,18 +5,24 @@ #ifndef ROCKSDB_LITE -#include "utilities/blob_db/blob_db.h" +#include #include #include #include #include +#include + #include "db/db_test_util.h" #include "port/port.h" +#include "rocksdb/utilities/debug.h" +#include "util/cast_util.h" #include "util/random.h" #include "util/string_util.h" +#include "util/sync_point.h" #include "util/testharness.h" +#include "utilities/blob_db/blob_db.h" #include "utilities/blob_db/blob_db_impl.h" -#include "utilities/blob_db/blob_db_options_impl.h" +#include "utilities/blob_db/blob_index.h" namespace rocksdb { namespace blob_db { @@ -25,21 +31,15 @@ class BlobDBTest : public testing::Test { public: const int kMaxBlobSize = 1 << 14; - class MockEnv : public EnvWrapper { - public: - MockEnv() : EnvWrapper(Env::Default()) {} - - void set_now_micros(uint64_t now_micros) { now_micros_ = now_micros; } - - uint64_t NowMicros() override { return now_micros_; } - - private: - uint64_t now_micros_ = 0; + struct BlobRecord { + std::string key; + std::string value; + uint64_t expiration = 0; }; BlobDBTest() : dbname_(test::TmpDir() + "/blob_db_test"), - mock_env_(new MockEnv()), + mock_env_(new MockTimeEnv(Env::Default())), blob_db_(nullptr) { Status s = DestroyBlobDB(dbname_, Options(), BlobDBOptions()); assert(s.ok()); @@ -47,10 +47,23 @@ class BlobDBTest : public testing::Test { ~BlobDBTest() { Destroy(); } - void Open(BlobDBOptionsImpl bdb_options = BlobDBOptionsImpl(), - Options options = Options()) { + Status TryOpen(BlobDBOptions bdb_options 
= BlobDBOptions(), + Options options = Options()) { options.create_if_missing = true; - ASSERT_OK(BlobDB::Open(options, bdb_options, dbname_, &blob_db_)); + return BlobDB::Open(options, bdb_options, dbname_, &blob_db_); + } + + void Open(BlobDBOptions bdb_options = BlobDBOptions(), + Options options = Options()) { + ASSERT_OK(TryOpen(bdb_options, options)); + } + + void Reopen(BlobDBOptions bdb_options = BlobDBOptions(), + Options options = Options()) { + assert(blob_db_ != nullptr); + delete blob_db_; + blob_db_ = nullptr; + Open(bdb_options, options); } void Destroy() { @@ -63,7 +76,27 @@ class BlobDBTest : public testing::Test { } } - void PutRandomWithTTL(const std::string &key, int32_t ttl, Random *rnd, + BlobDBImpl *blob_db_impl() { + return reinterpret_cast(blob_db_); + } + + Status Put(const Slice &key, const Slice &value) { + return blob_db_->Put(WriteOptions(), key, value); + } + + void Delete(const std::string &key, + std::map *data = nullptr) { + ASSERT_OK(blob_db_->Delete(WriteOptions(), key)); + if (data != nullptr) { + data->erase(key); + } + } + + Status PutUntil(const Slice &key, const Slice &value, uint64_t expiration) { + return blob_db_->PutUntil(WriteOptions(), key, value, expiration); + } + + void PutRandomWithTTL(const std::string &key, uint64_t ttl, Random *rnd, std::map *data = nullptr) { int len = rnd->Next() % kMaxBlobSize + 1; std::string value = test::RandomHumanReadableString(rnd, len); @@ -74,7 +107,7 @@ class BlobDBTest : public testing::Test { } } - void PutRandomUntil(const std::string &key, int32_t expiration, Random *rnd, + void PutRandomUntil(const std::string &key, uint64_t expiration, Random *rnd, std::map *data = nullptr) { int len = rnd->Next() % kMaxBlobSize + 1; std::string value = test::RandomHumanReadableString(rnd, len); @@ -87,9 +120,14 @@ class BlobDBTest : public testing::Test { void PutRandom(const std::string &key, Random *rnd, std::map *data = nullptr) { + PutRandom(blob_db_, key, rnd, data); + } + + void 
PutRandom(DB *db, const std::string &key, Random *rnd, + std::map *data = nullptr) { int len = rnd->Next() % kMaxBlobSize + 1; std::string value = test::RandomHumanReadableString(rnd, len); - ASSERT_OK(blob_db_->Put(WriteOptions(), Slice(key), Slice(value))); + ASSERT_OK(db->Put(WriteOptions(), Slice(key), Slice(value))); if (data != nullptr) { (*data)[key] = value; } @@ -106,18 +144,25 @@ class BlobDBTest : public testing::Test { } } - void Delete(const std::string &key, - std::map *data = nullptr) { - ASSERT_OK(blob_db_->Delete(WriteOptions(), key)); - if (data != nullptr) { - data->erase(key); - } - } - // Verify blob db contain expected data and nothing more. - // TODO(yiwu): Verify blob files are consistent with data in LSM. void VerifyDB(const std::map &data) { - Iterator *iter = blob_db_->NewIterator(ReadOptions()); + VerifyDB(blob_db_, data); + } + + void VerifyDB(DB *db, const std::map &data) { + // Verify normal Get + auto* cfh = db->DefaultColumnFamily(); + for (auto &p : data) { + PinnableSlice value_slice; + ASSERT_OK(db->Get(ReadOptions(), cfh, p.first, &value_slice)); + ASSERT_EQ(p.second, value_slice.ToString()); + std::string value; + ASSERT_OK(db->Get(ReadOptions(), cfh, p.first, &value)); + ASSERT_EQ(p.second, value); + } + + // Verify iterators + Iterator *iter = db->NewIterator(ReadOptions()); iter->SeekToFirst(); for (auto &p : data) { ASSERT_TRUE(iter->Valid()); @@ -130,13 +175,39 @@ class BlobDBTest : public testing::Test { delete iter; } + void VerifyBaseDB( + const std::map &expected_versions) { + auto *bdb_impl = static_cast(blob_db_); + DB *db = blob_db_->GetRootDB(); + std::vector versions; + GetAllKeyVersions(db, "", "", &versions); + ASSERT_EQ(expected_versions.size(), versions.size()); + size_t i = 0; + for (auto &key_version : expected_versions) { + const KeyVersion &expected_version = key_version.second; + ASSERT_EQ(expected_version.user_key, versions[i].user_key); + ASSERT_EQ(expected_version.sequence, versions[i].sequence); + 
ASSERT_EQ(expected_version.type, versions[i].type); + if (versions[i].type == kTypeValue) { + ASSERT_EQ(expected_version.value, versions[i].value); + } else { + ASSERT_EQ(kTypeBlobIndex, versions[i].type); + PinnableSlice value; + ASSERT_OK(bdb_impl->TEST_GetBlobValue(versions[i].user_key, + versions[i].value, &value)); + ASSERT_EQ(expected_version.value, value.ToString()); + } + i++; + } + } + void InsertBlobs() { WriteOptions wo; std::string value; Random rnd(301); for (size_t i = 0; i < 100000; i++) { - int32_t ttl = rnd.Next() % 86400; + uint64_t ttl = rnd.Next() % 86400; PutRandomWithTTL("key" + ToString(i % 500), ttl, &rnd, nullptr); } @@ -146,14 +217,15 @@ class BlobDBTest : public testing::Test { } const std::string dbname_; - std::unique_ptr mock_env_; + std::unique_ptr mock_env_; std::shared_ptr ttl_extractor_; BlobDB *blob_db_; }; // class BlobDBTest TEST_F(BlobDBTest, Put) { Random rnd(301); - BlobDBOptionsImpl bdb_options; + BlobDBOptions bdb_options; + bdb_options.min_blob_size = 0; bdb_options.disable_background_tasks = true; Open(bdb_options); std::map data; @@ -167,28 +239,29 @@ TEST_F(BlobDBTest, PutWithTTL) { Random rnd(301); Options options; options.env = mock_env_.get(); - BlobDBOptionsImpl bdb_options; + BlobDBOptions bdb_options; bdb_options.ttl_range_secs = 1000; + bdb_options.min_blob_size = 0; bdb_options.blob_file_size = 256 * 1000 * 1000; bdb_options.disable_background_tasks = true; Open(bdb_options, options); std::map data; - mock_env_->set_now_micros(50 * 1000000); + mock_env_->set_current_time(50); for (size_t i = 0; i < 100; i++) { - int32_t ttl = rnd.Next() % 100; + uint64_t ttl = rnd.Next() % 100; PutRandomWithTTL("key" + ToString(i), ttl, &rnd, - (ttl < 50 ? nullptr : &data)); + (ttl <= 50 ? 
nullptr : &data)); } - mock_env_->set_now_micros(100 * 1000000); + mock_env_->set_current_time(100); auto *bdb_impl = static_cast(blob_db_); auto blob_files = bdb_impl->TEST_GetBlobFiles(); ASSERT_EQ(1, blob_files.size()); ASSERT_TRUE(blob_files[0]->HasTTL()); - bdb_impl->TEST_CloseBlobFile(blob_files[0]); + ASSERT_OK(bdb_impl->TEST_CloseBlobFile(blob_files[0])); GCStats gc_stats; ASSERT_OK(bdb_impl->TEST_GCFileAndUpdateLSM(blob_files[0], &gc_stats)); ASSERT_EQ(100 - data.size(), gc_stats.num_deletes); - ASSERT_EQ(data.size(), gc_stats.num_relocs); + ASSERT_EQ(data.size(), gc_stats.num_relocate); VerifyDB(data); } @@ -196,28 +269,29 @@ TEST_F(BlobDBTest, PutUntil) { Random rnd(301); Options options; options.env = mock_env_.get(); - BlobDBOptionsImpl bdb_options; + BlobDBOptions bdb_options; bdb_options.ttl_range_secs = 1000; + bdb_options.min_blob_size = 0; bdb_options.blob_file_size = 256 * 1000 * 1000; bdb_options.disable_background_tasks = true; Open(bdb_options, options); std::map data; - mock_env_->set_now_micros(50 * 1000000); + mock_env_->set_current_time(50); for (size_t i = 0; i < 100; i++) { - int32_t expiration = rnd.Next() % 100 + 50; + uint64_t expiration = rnd.Next() % 100 + 50; PutRandomUntil("key" + ToString(i), expiration, &rnd, - (expiration < 100 ? nullptr : &data)); + (expiration <= 100 ? 
nullptr : &data)); } - mock_env_->set_now_micros(100 * 1000000); + mock_env_->set_current_time(100); auto *bdb_impl = static_cast(blob_db_); auto blob_files = bdb_impl->TEST_GetBlobFiles(); ASSERT_EQ(1, blob_files.size()); ASSERT_TRUE(blob_files[0]->HasTTL()); - bdb_impl->TEST_CloseBlobFile(blob_files[0]); + ASSERT_OK(bdb_impl->TEST_CloseBlobFile(blob_files[0])); GCStats gc_stats; ASSERT_OK(bdb_impl->TEST_GCFileAndUpdateLSM(blob_files[0], &gc_stats)); ASSERT_EQ(100 - data.size(), gc_stats.num_deletes); - ASSERT_EQ(data.size(), gc_stats.num_relocs); + ASSERT_EQ(data.size(), gc_stats.num_relocate); VerifyDB(data); } @@ -227,29 +301,30 @@ TEST_F(BlobDBTest, TTLExtrator_NoTTL) { Random rnd(301); Options options; options.env = mock_env_.get(); - BlobDBOptionsImpl bdb_options; + BlobDBOptions bdb_options; bdb_options.ttl_range_secs = 1000; + bdb_options.min_blob_size = 0; bdb_options.blob_file_size = 256 * 1000 * 1000; - bdb_options.num_concurrent_simple_blobs = 1; bdb_options.ttl_extractor = ttl_extractor_; bdb_options.disable_background_tasks = true; Open(bdb_options, options); std::map data; - mock_env_->set_now_micros(0); + mock_env_->set_current_time(0); for (size_t i = 0; i < 100; i++) { PutRandom("key" + ToString(i), &rnd, &data); } // very far in the future.. 
- mock_env_->set_now_micros(std::numeric_limits::max() - 10); + mock_env_->set_current_time(std::numeric_limits::max() / 1000000 - + 10); auto *bdb_impl = static_cast(blob_db_); auto blob_files = bdb_impl->TEST_GetBlobFiles(); ASSERT_EQ(1, blob_files.size()); ASSERT_FALSE(blob_files[0]->HasTTL()); - bdb_impl->TEST_CloseBlobFile(blob_files[0]); + ASSERT_OK(bdb_impl->TEST_CloseBlobFile(blob_files[0])); GCStats gc_stats; ASSERT_OK(bdb_impl->TEST_GCFileAndUpdateLSM(blob_files[0], &gc_stats)); ASSERT_EQ(0, gc_stats.num_deletes); - ASSERT_EQ(100, gc_stats.num_relocs); + ASSERT_EQ(100, gc_stats.num_relocate); VerifyDB(data); } @@ -263,7 +338,7 @@ TEST_F(BlobDBTest, TTLExtractor_ExtractTTL) { std::string * /*new_value*/, bool * /*value_changed*/) override { *ttl = rnd->Next() % 100; - if (*ttl >= 50) { + if (*ttl > 50) { data[key.ToString()] = value.ToString(); } return true; @@ -275,27 +350,28 @@ TEST_F(BlobDBTest, TTLExtractor_ExtractTTL) { ttl_extractor_.reset(new TestTTLExtractor(&rnd)); Options options; options.env = mock_env_.get(); - BlobDBOptionsImpl bdb_options; + BlobDBOptions bdb_options; bdb_options.ttl_range_secs = 1000; + bdb_options.min_blob_size = 0; bdb_options.blob_file_size = 256 * 1000 * 1000; bdb_options.ttl_extractor = ttl_extractor_; bdb_options.disable_background_tasks = true; Open(bdb_options, options); - mock_env_->set_now_micros(50 * 1000000); + mock_env_->set_current_time(50); for (size_t i = 0; i < 100; i++) { PutRandom("key" + ToString(i), &rnd); } - mock_env_->set_now_micros(100 * 1000000); + mock_env_->set_current_time(100); auto *bdb_impl = static_cast(blob_db_); auto blob_files = bdb_impl->TEST_GetBlobFiles(); ASSERT_EQ(1, blob_files.size()); ASSERT_TRUE(blob_files[0]->HasTTL()); - bdb_impl->TEST_CloseBlobFile(blob_files[0]); + ASSERT_OK(bdb_impl->TEST_CloseBlobFile(blob_files[0])); GCStats gc_stats; ASSERT_OK(bdb_impl->TEST_GCFileAndUpdateLSM(blob_files[0], &gc_stats)); auto &data = static_cast(ttl_extractor_.get())->data; ASSERT_EQ(100 - 
data.size(), gc_stats.num_deletes); - ASSERT_EQ(data.size(), gc_stats.num_relocs); + ASSERT_EQ(data.size(), gc_stats.num_relocate); VerifyDB(data); } @@ -310,7 +386,7 @@ TEST_F(BlobDBTest, TTLExtractor_ExtractExpiration) { std::string * /*new_value*/, bool * /*value_changed*/) override { *expiration = rnd->Next() % 100 + 50; - if (*expiration >= 100) { + if (*expiration > 100) { data[key.ToString()] = value.ToString(); } return true; @@ -322,27 +398,28 @@ TEST_F(BlobDBTest, TTLExtractor_ExtractExpiration) { ttl_extractor_.reset(new TestTTLExtractor(&rnd)); Options options; options.env = mock_env_.get(); - BlobDBOptionsImpl bdb_options; + BlobDBOptions bdb_options; bdb_options.ttl_range_secs = 1000; + bdb_options.min_blob_size = 0; bdb_options.blob_file_size = 256 * 1000 * 1000; bdb_options.ttl_extractor = ttl_extractor_; bdb_options.disable_background_tasks = true; Open(bdb_options, options); - mock_env_->set_now_micros(50 * 1000000); + mock_env_->set_current_time(50); for (size_t i = 0; i < 100; i++) { PutRandom("key" + ToString(i), &rnd); } - mock_env_->set_now_micros(100 * 1000000); + mock_env_->set_current_time(100); auto *bdb_impl = static_cast(blob_db_); auto blob_files = bdb_impl->TEST_GetBlobFiles(); ASSERT_EQ(1, blob_files.size()); ASSERT_TRUE(blob_files[0]->HasTTL()); - bdb_impl->TEST_CloseBlobFile(blob_files[0]); + ASSERT_OK(bdb_impl->TEST_CloseBlobFile(blob_files[0])); GCStats gc_stats; ASSERT_OK(bdb_impl->TEST_GCFileAndUpdateLSM(blob_files[0], &gc_stats)); auto &data = static_cast(ttl_extractor_.get())->data; ASSERT_EQ(100 - data.size(), gc_stats.num_deletes); - ASSERT_EQ(data.size(), gc_stats.num_relocs); + ASSERT_EQ(data.size(), gc_stats.num_relocate); VerifyDB(data); } @@ -369,14 +446,15 @@ TEST_F(BlobDBTest, TTLExtractor_ChangeValue) { Random rnd(301); Options options; options.env = mock_env_.get(); - BlobDBOptionsImpl bdb_options; + BlobDBOptions bdb_options; bdb_options.ttl_range_secs = 1000; + bdb_options.min_blob_size = 0; 
bdb_options.blob_file_size = 256 * 1000 * 1000; bdb_options.ttl_extractor = std::make_shared(); bdb_options.disable_background_tasks = true; Open(bdb_options, options); std::map data; - mock_env_->set_now_micros(50 * 1000000); + mock_env_->set_current_time(50); for (size_t i = 0; i < 100; i++) { int len = rnd.Next() % kMaxBlobSize + 1; std::string key = "key" + ToString(i); @@ -385,26 +463,27 @@ TEST_F(BlobDBTest, TTLExtractor_ChangeValue) { std::string value_ttl = value + "ttl:"; PutFixed64(&value_ttl, ttl); ASSERT_OK(blob_db_->Put(WriteOptions(), Slice(key), Slice(value_ttl))); - if (ttl >= 50) { + if (ttl > 50) { data[key] = value; } } - mock_env_->set_now_micros(100 * 1000000); + mock_env_->set_current_time(100); auto *bdb_impl = static_cast(blob_db_); auto blob_files = bdb_impl->TEST_GetBlobFiles(); ASSERT_EQ(1, blob_files.size()); ASSERT_TRUE(blob_files[0]->HasTTL()); - bdb_impl->TEST_CloseBlobFile(blob_files[0]); + ASSERT_OK(bdb_impl->TEST_CloseBlobFile(blob_files[0])); GCStats gc_stats; ASSERT_OK(bdb_impl->TEST_GCFileAndUpdateLSM(blob_files[0], &gc_stats)); ASSERT_EQ(100 - data.size(), gc_stats.num_deletes); - ASSERT_EQ(data.size(), gc_stats.num_relocs); + ASSERT_EQ(data.size(), gc_stats.num_relocate); VerifyDB(data); } TEST_F(BlobDBTest, StackableDBGet) { Random rnd(301); - BlobDBOptionsImpl bdb_options; + BlobDBOptions bdb_options; + bdb_options.min_blob_size = 0; bdb_options.disable_background_tasks = true; Open(bdb_options); std::map data; @@ -426,7 +505,8 @@ TEST_F(BlobDBTest, StackableDBGet) { TEST_F(BlobDBTest, WriteBatch) { Random rnd(301); - BlobDBOptionsImpl bdb_options; + BlobDBOptions bdb_options; + bdb_options.min_blob_size = 0; bdb_options.disable_background_tasks = true; Open(bdb_options); std::map data; @@ -442,7 +522,8 @@ TEST_F(BlobDBTest, WriteBatch) { TEST_F(BlobDBTest, Delete) { Random rnd(301); - BlobDBOptionsImpl bdb_options; + BlobDBOptions bdb_options; + bdb_options.min_blob_size = 0; bdb_options.disable_background_tasks = true; 
Open(bdb_options); std::map data; @@ -457,7 +538,8 @@ TEST_F(BlobDBTest, Delete) { TEST_F(BlobDBTest, DeleteBatch) { Random rnd(301); - BlobDBOptionsImpl bdb_options; + BlobDBOptions bdb_options; + bdb_options.min_blob_size = 0; bdb_options.disable_background_tasks = true; Open(bdb_options); for (size_t i = 0; i < 100; i++) { @@ -474,7 +556,8 @@ TEST_F(BlobDBTest, DeleteBatch) { TEST_F(BlobDBTest, Override) { Random rnd(301); - BlobDBOptionsImpl bdb_options; + BlobDBOptions bdb_options; + bdb_options.min_blob_size = 0; bdb_options.disable_background_tasks = true; Open(bdb_options); std::map data; @@ -491,7 +574,8 @@ TEST_F(BlobDBTest, Override) { #ifdef SNAPPY TEST_F(BlobDBTest, Compression) { Random rnd(301); - BlobDBOptionsImpl bdb_options; + BlobDBOptions bdb_options; + bdb_options.min_blob_size = 0; bdb_options.disable_background_tasks = true; bdb_options.compression = CompressionType::kSnappyCompression; Open(bdb_options); @@ -509,56 +593,670 @@ TEST_F(BlobDBTest, Compression) { } VerifyDB(data); } + +TEST_F(BlobDBTest, DecompressAfterReopen) { + Random rnd(301); + BlobDBOptions bdb_options; + bdb_options.min_blob_size = 0; + bdb_options.disable_background_tasks = true; + bdb_options.compression = CompressionType::kSnappyCompression; + Open(bdb_options); + std::map data; + for (size_t i = 0; i < 100; i++) { + PutRandom("put-key" + ToString(i), &rnd, &data); + } + VerifyDB(data); + bdb_options.compression = CompressionType::kNoCompression; + Reopen(bdb_options); + VerifyDB(data); +} + #endif -TEST_F(BlobDBTest, DISABLED_MultipleWriters) { - Open(); +TEST_F(BlobDBTest, MultipleWriters) { + Open(BlobDBOptions()); std::vector workers; - for (size_t ii = 0; ii < 10; ii++) - workers.push_back(port::Thread(&BlobDBTest::InsertBlobs, this)); + std::vector> data_set(10); + for (uint32_t i = 0; i < 10; i++) + workers.push_back(port::Thread( + [&](uint32_t id) { + Random rnd(301 + id); + for (int j = 0; j < 100; j++) { + std::string key = "key" + ToString(id) + "_" + 
ToString(j); + if (id < 5) { + PutRandom(key, &rnd, &data_set[id]); + } else { + WriteBatch batch; + PutRandomToWriteBatch(key, &rnd, &batch, &data_set[id]); + blob_db_->Write(WriteOptions(), &batch); + } + } + }, + i)); + std::map data; + for (size_t i = 0; i < 10; i++) { + workers[i].join(); + data.insert(data_set[i].begin(), data_set[i].end()); + } + VerifyDB(data); +} - for (auto& t : workers) { - if (t.joinable()) { - t.join(); +TEST_F(BlobDBTest, GCAfterOverwriteKeys) { + Random rnd(301); + BlobDBOptions bdb_options; + bdb_options.min_blob_size = 0; + bdb_options.disable_background_tasks = true; + Open(bdb_options); + DBImpl *db_impl = static_cast_with_check(blob_db_->GetBaseDB()); + std::map data; + for (int i = 0; i < 200; i++) { + PutRandom("key" + ToString(i), &rnd, &data); + } + auto blob_files = blob_db_impl()->TEST_GetBlobFiles(); + ASSERT_EQ(1, blob_files.size()); + ASSERT_OK(blob_db_impl()->TEST_CloseBlobFile(blob_files[0])); + // Test for data in SST + size_t new_keys = 0; + for (int i = 0; i < 100; i++) { + if (rnd.Next() % 2 == 1) { + new_keys++; + PutRandom("key" + ToString(i), &rnd, &data); + } + } + db_impl->TEST_FlushMemTable(true /*wait*/); + // Test for data in memtable + for (int i = 100; i < 200; i++) { + if (rnd.Next() % 2 == 1) { + new_keys++; + PutRandom("key" + ToString(i), &rnd, &data); } } + GCStats gc_stats; + ASSERT_OK(blob_db_impl()->TEST_GCFileAndUpdateLSM(blob_files[0], &gc_stats)); + ASSERT_EQ(200, gc_stats.blob_count); + ASSERT_EQ(0, gc_stats.num_deletes); + ASSERT_EQ(200 - new_keys, gc_stats.num_relocate); + VerifyDB(data); } -// Test sequence number store in blob file is correct. 
-TEST_F(BlobDBTest, SequenceNumber) { +TEST_F(BlobDBTest, GCRelocateKeyWhileOverwriting) { Random rnd(301); - BlobDBOptionsImpl bdb_options; + BlobDBOptions bdb_options; + bdb_options.min_blob_size = 0; bdb_options.disable_background_tasks = true; Open(bdb_options); - SequenceNumber sequence = blob_db_->GetLatestSequenceNumber(); - BlobDBImpl *blob_db_impl = reinterpret_cast(blob_db_); - for (int i = 0; i < 100; i++) { + ASSERT_OK(blob_db_->Put(WriteOptions(), "foo", "v1")); + auto blob_files = blob_db_impl()->TEST_GetBlobFiles(); + ASSERT_EQ(1, blob_files.size()); + ASSERT_OK(blob_db_impl()->TEST_CloseBlobFile(blob_files[0])); + + SyncPoint::GetInstance()->LoadDependency( + {{"BlobDBImpl::GCFileAndUpdateLSM:AfterGetFromBaseDB", + "BlobDBImpl::PutUntil:Start"}, + {"BlobDBImpl::PutUntil:Finish", + "BlobDBImpl::GCFileAndUpdateLSM:BeforeRelocate"}}); + SyncPoint::GetInstance()->EnableProcessing(); + + auto writer = port::Thread( + [this]() { ASSERT_OK(blob_db_->Put(WriteOptions(), "foo", "v2")); }); + + GCStats gc_stats; + ASSERT_OK(blob_db_impl()->TEST_GCFileAndUpdateLSM(blob_files[0], &gc_stats)); + ASSERT_EQ(1, gc_stats.blob_count); + ASSERT_EQ(0, gc_stats.num_deletes); + ASSERT_EQ(1, gc_stats.num_relocate); + ASSERT_EQ(0, gc_stats.relocate_succeeded); + ASSERT_EQ(1, gc_stats.overwritten_while_relocate); + writer.join(); + VerifyDB({{"foo", "v2"}}); +} + +TEST_F(BlobDBTest, GCExpiredKeyWhileOverwriting) { + Random rnd(301); + Options options; + options.env = mock_env_.get(); + BlobDBOptions bdb_options; + bdb_options.min_blob_size = 0; + bdb_options.disable_background_tasks = true; + Open(bdb_options, options); + mock_env_->set_current_time(100); + ASSERT_OK(blob_db_->PutUntil(WriteOptions(), "foo", "v1", 200)); + auto blob_files = blob_db_impl()->TEST_GetBlobFiles(); + ASSERT_EQ(1, blob_files.size()); + ASSERT_OK(blob_db_impl()->TEST_CloseBlobFile(blob_files[0])); + mock_env_->set_current_time(300); + + SyncPoint::GetInstance()->LoadDependency( + 
{{"BlobDBImpl::GCFileAndUpdateLSM:AfterGetFromBaseDB", + "BlobDBImpl::PutUntil:Start"}, + {"BlobDBImpl::PutUntil:Finish", + "BlobDBImpl::GCFileAndUpdateLSM:BeforeDelete"}}); + SyncPoint::GetInstance()->EnableProcessing(); + + auto writer = port::Thread([this]() { + ASSERT_OK(blob_db_->PutUntil(WriteOptions(), "foo", "v2", 400)); + }); + + GCStats gc_stats; + ASSERT_OK(blob_db_impl()->TEST_GCFileAndUpdateLSM(blob_files[0], &gc_stats)); + ASSERT_EQ(1, gc_stats.blob_count); + ASSERT_EQ(1, gc_stats.num_deletes); + ASSERT_EQ(0, gc_stats.delete_succeeded); + ASSERT_EQ(1, gc_stats.overwritten_while_delete); + ASSERT_EQ(0, gc_stats.num_relocate); + writer.join(); + VerifyDB({{"foo", "v2"}}); +} + +// This test is no longer valid since we now return an error when we go +// over the configured blob_dir_size. +// The test needs to be re-written later in such a way that writes continue +// after a GC happens. +TEST_F(BlobDBTest, DISABLED_GCOldestSimpleBlobFileWhenOutOfSpace) { + // Use mock env to stop wall clock. + Options options; + options.env = mock_env_.get(); + BlobDBOptions bdb_options; + bdb_options.blob_dir_size = 100; + bdb_options.blob_file_size = 100; + bdb_options.min_blob_size = 0; + bdb_options.disable_background_tasks = true; + Open(bdb_options); + std::string value(100, 'v'); + ASSERT_OK(blob_db_->PutWithTTL(WriteOptions(), "key_with_ttl", value, 60)); + for (int i = 0; i < 10; i++) { + ASSERT_OK(blob_db_->Put(WriteOptions(), "key" + ToString(i), value)); + } + auto blob_files = blob_db_impl()->TEST_GetBlobFiles(); + ASSERT_EQ(11, blob_files.size()); + ASSERT_TRUE(blob_files[0]->HasTTL()); + ASSERT_TRUE(blob_files[0]->Immutable()); + for (int i = 1; i <= 10; i++) { + ASSERT_FALSE(blob_files[i]->HasTTL()); + if (i < 10) { + ASSERT_TRUE(blob_files[i]->Immutable()); + } + } + blob_db_impl()->TEST_RunGC(); + // The oldest simple blob file (i.e. blob_files[1]) has been selected for GC. 
+ auto obsolete_files = blob_db_impl()->TEST_GetObsoleteFiles(); + ASSERT_EQ(1, obsolete_files.size()); + ASSERT_EQ(blob_files[1]->BlobFileNumber(), + obsolete_files[0]->BlobFileNumber()); +} + +TEST_F(BlobDBTest, ReadWhileGC) { + // run the same test for Get(), MultiGet() and Iterator each. + for (int i = 0; i < 2; i++) { + BlobDBOptions bdb_options; + bdb_options.min_blob_size = 0; + bdb_options.disable_background_tasks = true; + Open(bdb_options); + blob_db_->Put(WriteOptions(), "foo", "bar"); + auto blob_files = blob_db_impl()->TEST_GetBlobFiles(); + ASSERT_EQ(1, blob_files.size()); + std::shared_ptr bfile = blob_files[0]; + uint64_t bfile_number = bfile->BlobFileNumber(); + ASSERT_OK(blob_db_impl()->TEST_CloseBlobFile(bfile)); + + switch (i) { + case 0: + SyncPoint::GetInstance()->LoadDependency( + {{"BlobDBImpl::Get:AfterIndexEntryGet:1", + "BlobDBTest::ReadWhileGC:1"}, + {"BlobDBTest::ReadWhileGC:2", + "BlobDBImpl::Get:AfterIndexEntryGet:2"}}); + break; + case 1: + SyncPoint::GetInstance()->LoadDependency( + {{"BlobDBIterator::UpdateBlobValue:Start:1", + "BlobDBTest::ReadWhileGC:1"}, + {"BlobDBTest::ReadWhileGC:2", + "BlobDBIterator::UpdateBlobValue:Start:2"}}); + break; + } + SyncPoint::GetInstance()->EnableProcessing(); + + auto reader = port::Thread([this, i]() { + std::string value; + std::vector values; + std::vector statuses; + switch (i) { + case 0: + ASSERT_OK(blob_db_->Get(ReadOptions(), "foo", &value)); + ASSERT_EQ("bar", value); + break; + case 1: + // VerifyDB use iterator to scan the DB. 
+ VerifyDB({{"foo", "bar"}}); + break; + } + }); + + TEST_SYNC_POINT("BlobDBTest::ReadWhileGC:1"); + GCStats gc_stats; + ASSERT_OK(blob_db_impl()->TEST_GCFileAndUpdateLSM(bfile, &gc_stats)); + ASSERT_EQ(1, gc_stats.blob_count); + ASSERT_EQ(1, gc_stats.num_relocate); + ASSERT_EQ(1, gc_stats.relocate_succeeded); + blob_db_impl()->TEST_DeleteObsoleteFiles(); + // The file shouln't be deleted + blob_files = blob_db_impl()->TEST_GetBlobFiles(); + ASSERT_EQ(2, blob_files.size()); + ASSERT_EQ(bfile_number, blob_files[0]->BlobFileNumber()); + auto obsolete_files = blob_db_impl()->TEST_GetObsoleteFiles(); + ASSERT_EQ(1, obsolete_files.size()); + ASSERT_EQ(bfile_number, obsolete_files[0]->BlobFileNumber()); + TEST_SYNC_POINT("BlobDBTest::ReadWhileGC:2"); + reader.join(); + SyncPoint::GetInstance()->DisableProcessing(); + + // The file is deleted this time + blob_db_impl()->TEST_DeleteObsoleteFiles(); + blob_files = blob_db_impl()->TEST_GetBlobFiles(); + ASSERT_EQ(1, blob_files.size()); + ASSERT_NE(bfile_number, blob_files[0]->BlobFileNumber()); + ASSERT_EQ(0, blob_db_impl()->TEST_GetObsoleteFiles().size()); + VerifyDB({{"foo", "bar"}}); + Destroy(); + } +} + +TEST_F(BlobDBTest, SnapshotAndGarbageCollection) { + BlobDBOptions bdb_options; + bdb_options.min_blob_size = 0; + bdb_options.disable_background_tasks = true; + // i = when to take snapshot + for (int i = 0; i < 4; i++) { + for (bool delete_key : {true, false}) { + const Snapshot *snapshot = nullptr; + Destroy(); + Open(bdb_options); + // First file + ASSERT_OK(Put("key1", "value")); + if (i == 0) { + snapshot = blob_db_->GetSnapshot(); + } + auto blob_files = blob_db_impl()->TEST_GetBlobFiles(); + ASSERT_EQ(1, blob_files.size()); + ASSERT_OK(blob_db_impl()->TEST_CloseBlobFile(blob_files[0])); + // Second file + ASSERT_OK(Put("key2", "value")); + if (i == 1) { + snapshot = blob_db_->GetSnapshot(); + } + blob_files = blob_db_impl()->TEST_GetBlobFiles(); + ASSERT_EQ(2, blob_files.size()); + auto bfile = blob_files[1]; + 
ASSERT_FALSE(bfile->Immutable()); + ASSERT_OK(blob_db_impl()->TEST_CloseBlobFile(bfile)); + // Third file + ASSERT_OK(Put("key3", "value")); + if (i == 2) { + snapshot = blob_db_->GetSnapshot(); + } + if (delete_key) { + Delete("key2"); + } + GCStats gc_stats; + ASSERT_OK(blob_db_impl()->TEST_GCFileAndUpdateLSM(bfile, &gc_stats)); + ASSERT_TRUE(bfile->Obsolete()); + ASSERT_EQ(1, gc_stats.blob_count); + if (delete_key) { + ASSERT_EQ(0, gc_stats.num_relocate); + ASSERT_EQ(bfile->GetSequenceRange().second + 1, + bfile->GetObsoleteSequence()); + } else { + ASSERT_EQ(1, gc_stats.num_relocate); + ASSERT_EQ(blob_db_->GetLatestSequenceNumber(), + bfile->GetObsoleteSequence()); + } + if (i == 3) { + snapshot = blob_db_->GetSnapshot(); + } + size_t num_files = delete_key ? 3 : 4; + ASSERT_EQ(num_files, blob_db_impl()->TEST_GetBlobFiles().size()); + blob_db_impl()->TEST_DeleteObsoleteFiles(); + if (i == 0 || i == 3 || (i == 2 && delete_key)) { + // The snapshot shouldn't see data in bfile + ASSERT_EQ(num_files - 1, blob_db_impl()->TEST_GetBlobFiles().size()); + blob_db_->ReleaseSnapshot(snapshot); + } else { + // The snapshot will see data in bfile, so the file shouldn't be deleted + ASSERT_EQ(num_files, blob_db_impl()->TEST_GetBlobFiles().size()); + blob_db_->ReleaseSnapshot(snapshot); + blob_db_impl()->TEST_DeleteObsoleteFiles(); + ASSERT_EQ(num_files - 1, blob_db_impl()->TEST_GetBlobFiles().size()); + } + } + } +} + +TEST_F(BlobDBTest, ColumnFamilyNotSupported) { + Options options; + options.env = mock_env_.get(); + mock_env_->set_current_time(0); + Open(BlobDBOptions(), options); + ColumnFamilyHandle *default_handle = blob_db_->DefaultColumnFamily(); + ColumnFamilyHandle *handle = nullptr; + std::string value; + std::vector values; + // The call simply pass through to base db. It should succeed. 
+ ASSERT_OK( + blob_db_->CreateColumnFamily(ColumnFamilyOptions(), "foo", &handle)); + ASSERT_TRUE(blob_db_->Put(WriteOptions(), handle, "k", "v").IsNotSupported()); + ASSERT_TRUE(blob_db_->PutWithTTL(WriteOptions(), handle, "k", "v", 60) + .IsNotSupported()); + ASSERT_TRUE(blob_db_->PutUntil(WriteOptions(), handle, "k", "v", 100) + .IsNotSupported()); + WriteBatch batch; + batch.Put("k1", "v1"); + batch.Put(handle, "k2", "v2"); + ASSERT_TRUE(blob_db_->Write(WriteOptions(), &batch).IsNotSupported()); + ASSERT_TRUE(blob_db_->Get(ReadOptions(), "k1", &value).IsNotFound()); + ASSERT_TRUE( + blob_db_->Get(ReadOptions(), handle, "k", &value).IsNotSupported()); + auto statuses = blob_db_->MultiGet(ReadOptions(), {default_handle, handle}, + {"k1", "k2"}, &values); + ASSERT_EQ(2, statuses.size()); + ASSERT_TRUE(statuses[0].IsNotSupported()); + ASSERT_TRUE(statuses[1].IsNotSupported()); + ASSERT_EQ(nullptr, blob_db_->NewIterator(ReadOptions(), handle)); + delete handle; +} + +TEST_F(BlobDBTest, GetLiveFilesMetaData) { + Random rnd(301); + BlobDBOptions bdb_options; + bdb_options.min_blob_size = 0; + bdb_options.disable_background_tasks = true; + Open(bdb_options); + std::map data; + for (size_t i = 0; i < 100; i++) { + PutRandom("key" + ToString(i), &rnd, &data); + } + auto *bdb_impl = static_cast(blob_db_); + std::vector metadata; + bdb_impl->GetLiveFilesMetaData(&metadata); + ASSERT_EQ(1U, metadata.size()); + std::string filename = dbname_ + "/blob_dir/000001.blob"; + ASSERT_EQ(filename, metadata[0].name); + ASSERT_EQ("default", metadata[0].column_family_name); + std::vector livefile; + uint64_t mfs; + bdb_impl->GetLiveFiles(livefile, &mfs, false); + ASSERT_EQ(4U, livefile.size()); + ASSERT_EQ(filename, livefile[3]); + VerifyDB(data); +} + +TEST_F(BlobDBTest, MigrateFromPlainRocksDB) { + constexpr size_t kNumKey = 20; + constexpr size_t kNumIteration = 10; + Random rnd(301); + std::map data; + std::vector is_blob(kNumKey, false); + + // Write to plain rocksdb. 
+ Options options; + options.create_if_missing = true; + DB *db = nullptr; + ASSERT_OK(DB::Open(options, dbname_, &db)); + for (size_t i = 0; i < kNumIteration; i++) { + auto key_index = rnd.Next() % kNumKey; + std::string key = "key" + ToString(key_index); + PutRandom(db, key, &rnd, &data); + } + VerifyDB(db, data); + delete db; + db = nullptr; + + // Open as blob db. Verify it can read existing data. + Open(); + VerifyDB(blob_db_, data); + for (size_t i = 0; i < kNumIteration; i++) { + auto key_index = rnd.Next() % kNumKey; + std::string key = "key" + ToString(key_index); + is_blob[key_index] = true; + PutRandom(blob_db_, key, &rnd, &data); + } + VerifyDB(blob_db_, data); + delete blob_db_; + blob_db_ = nullptr; + + // Verify plain db return error for keys written by blob db. + ASSERT_OK(DB::Open(options, dbname_, &db)); + std::string value; + for (size_t i = 0; i < kNumKey; i++) { std::string key = "key" + ToString(i); - PutRandom(key, &rnd); - sequence += 1; - ASSERT_EQ(sequence, blob_db_->GetLatestSequenceNumber()); - SequenceNumber actual_sequence = 0; - ASSERT_OK(blob_db_impl->TEST_GetSequenceNumber(key, &actual_sequence)); - ASSERT_EQ(sequence, actual_sequence); + Status s = db->Get(ReadOptions(), key, &value); + if (data.count(key) == 0) { + ASSERT_TRUE(s.IsNotFound()); + } else if (is_blob[i]) { + ASSERT_TRUE(s.IsNotSupported()); + } else { + ASSERT_OK(s); + ASSERT_EQ(data[key], value); + } } - for (int i = 0; i < 100; i++) { - WriteBatch batch; - size_t batch_size = rnd.Next() % 10 + 1; - for (size_t k = 0; k < batch_size; k++) { - std::string value = test::RandomHumanReadableString(&rnd, 1000); - ASSERT_OK(batch.Put("key" + ToString(i) + "-" + ToString(k), value)); + delete db; +} + +// Test to verify that a NoSpace IOError Status is returned on reaching +// blob_dir_size limit. +TEST_F(BlobDBTest, OutOfSpace) { + // Use mock env to stop wall clock. 
+ Options options; + options.env = mock_env_.get(); + BlobDBOptions bdb_options; + bdb_options.blob_dir_size = 150; + bdb_options.disable_background_tasks = true; + Open(bdb_options); + + // Each stored blob has an overhead of about 42 bytes currently. + // So a small key + a 100 byte blob should take up ~150 bytes in the db. + std::string value(100, 'v'); + ASSERT_OK(blob_db_->PutWithTTL(WriteOptions(), "key1", value, 60)); + + // Putting another blob should fail as ading it would exceed the blob_dir_size + // limit. + Status s = blob_db_->PutWithTTL(WriteOptions(), "key2", value, 60); + ASSERT_TRUE(s.IsIOError()); + ASSERT_TRUE(s.IsNoSpace()); +} + +TEST_F(BlobDBTest, EvictOldestFileWhenCloseToSpaceLimit) { + // Use mock env to stop wall clock. + Options options; + BlobDBOptions bdb_options; + bdb_options.blob_dir_size = 270; + bdb_options.blob_file_size = 100; + bdb_options.disable_background_tasks = true; + bdb_options.is_fifo = true; + Open(bdb_options); + + // Each stored blob has an overhead of 32 bytes currently. + // So a 100 byte blob should take up 132 bytes. + std::string value(100, 'v'); + ASSERT_OK(blob_db_->PutWithTTL(WriteOptions(), "key1", value, 10)); + + auto *bdb_impl = static_cast(blob_db_); + auto blob_files = bdb_impl->TEST_GetBlobFiles(); + ASSERT_EQ(1, blob_files.size()); + + // Adding another 100 byte blob would take the total size to 264 bytes + // (2*132), which is more than 90% of blob_dir_size. So, the oldest file + // should be evicted and put in obsolete files list. 
+ ASSERT_OK(blob_db_->PutWithTTL(WriteOptions(), "key2", value, 60)); + + auto obsolete_files = bdb_impl->TEST_GetObsoleteFiles(); + ASSERT_EQ(1, obsolete_files.size()); + ASSERT_TRUE(obsolete_files[0]->Immutable()); + ASSERT_EQ(blob_files[0]->BlobFileNumber(), + obsolete_files[0]->BlobFileNumber()); + + bdb_impl->TEST_DeleteObsoleteFiles(); + obsolete_files = bdb_impl->TEST_GetObsoleteFiles(); + ASSERT_TRUE(obsolete_files.empty()); +} + +TEST_F(BlobDBTest, InlineSmallValues) { + constexpr uint64_t kMaxExpiration = 1000; + Random rnd(301); + BlobDBOptions bdb_options; + bdb_options.ttl_range_secs = kMaxExpiration; + bdb_options.min_blob_size = 100; + bdb_options.blob_file_size = 256 * 1000 * 1000; + bdb_options.disable_background_tasks = true; + Options options; + options.env = mock_env_.get(); + mock_env_->set_current_time(0); + Open(bdb_options, options); + std::map data; + std::map versions; + SequenceNumber first_non_ttl_seq = kMaxSequenceNumber; + SequenceNumber first_ttl_seq = kMaxSequenceNumber; + SequenceNumber last_non_ttl_seq = 0; + SequenceNumber last_ttl_seq = 0; + for (size_t i = 0; i < 1000; i++) { + bool is_small_value = rnd.Next() % 2; + bool has_ttl = rnd.Next() % 2; + uint64_t expiration = rnd.Next() % kMaxExpiration; + int len = is_small_value ? 
50 : 200; + std::string key = "key" + ToString(i); + std::string value = test::RandomHumanReadableString(&rnd, len); + std::string blob_index; + data[key] = value; + SequenceNumber sequence = blob_db_->GetLatestSequenceNumber() + 1; + if (!has_ttl) { + ASSERT_OK(blob_db_->Put(WriteOptions(), key, value)); + } else { + ASSERT_OK(blob_db_->PutUntil(WriteOptions(), key, value, expiration)); } - ASSERT_OK(blob_db_->Write(WriteOptions(), &batch)); - for (size_t k = 0; k < batch_size; k++) { - std::string key = "key" + ToString(i) + "-" + ToString(k); - sequence++; - SequenceNumber actual_sequence; - ASSERT_OK(blob_db_impl->TEST_GetSequenceNumber(key, &actual_sequence)); - ASSERT_EQ(sequence, actual_sequence); + ASSERT_EQ(blob_db_->GetLatestSequenceNumber(), sequence); + versions[key] = + KeyVersion(key, value, sequence, + (is_small_value && !has_ttl) ? kTypeValue : kTypeBlobIndex); + if (!is_small_value) { + if (!has_ttl) { + first_non_ttl_seq = std::min(first_non_ttl_seq, sequence); + last_non_ttl_seq = std::max(last_non_ttl_seq, sequence); + } else { + first_ttl_seq = std::min(first_ttl_seq, sequence); + last_ttl_seq = std::max(last_ttl_seq, sequence); + } } - ASSERT_EQ(sequence, blob_db_->GetLatestSequenceNumber()); } + VerifyDB(data); + VerifyBaseDB(versions); + auto *bdb_impl = static_cast(blob_db_); + auto blob_files = bdb_impl->TEST_GetBlobFiles(); + ASSERT_EQ(2, blob_files.size()); + std::shared_ptr non_ttl_file; + std::shared_ptr ttl_file; + if (blob_files[0]->HasTTL()) { + ttl_file = blob_files[0]; + non_ttl_file = blob_files[1]; + } else { + non_ttl_file = blob_files[0]; + ttl_file = blob_files[1]; + } + ASSERT_FALSE(non_ttl_file->HasTTL()); + ASSERT_EQ(first_non_ttl_seq, non_ttl_file->GetSequenceRange().first); + ASSERT_EQ(last_non_ttl_seq, non_ttl_file->GetSequenceRange().second); + ASSERT_TRUE(ttl_file->HasTTL()); + ASSERT_EQ(first_ttl_seq, ttl_file->GetSequenceRange().first); + ASSERT_EQ(last_ttl_seq, ttl_file->GetSequenceRange().second); +} + 
+TEST_F(BlobDBTest, CompactionFilterNotSupported) { + class TestCompactionFilter : public CompactionFilter { + virtual const char *Name() const { return "TestCompactionFilter"; } + }; + class TestCompactionFilterFactory : public CompactionFilterFactory { + virtual const char *Name() const { return "TestCompactionFilterFactory"; } + virtual std::unique_ptr CreateCompactionFilter( + const CompactionFilter::Context & /*context*/) { + return std::unique_ptr(new TestCompactionFilter()); + } + }; + for (int i = 0; i < 2; i++) { + Options options; + if (i == 0) { + options.compaction_filter = new TestCompactionFilter(); + } else { + options.compaction_filter_factory.reset( + new TestCompactionFilterFactory()); + } + ASSERT_TRUE(TryOpen(BlobDBOptions(), options).IsNotSupported()); + delete options.compaction_filter; + } +} + +TEST_F(BlobDBTest, FilterExpiredBlobIndex) { + constexpr size_t kNumKeys = 100; + constexpr size_t kNumPuts = 1000; + constexpr uint64_t kMaxExpiration = 1000; + constexpr uint64_t kCompactTime = 500; + constexpr uint64_t kMinBlobSize = 100; + Random rnd(301); + mock_env_->set_current_time(0); + BlobDBOptions bdb_options; + bdb_options.min_blob_size = kMinBlobSize; + bdb_options.disable_background_tasks = true; + Options options; + options.env = mock_env_.get(); + Open(bdb_options, options); + + std::map data; + std::map data_after_compact; + for (size_t i = 0; i < kNumPuts; i++) { + bool is_small_value = rnd.Next() % 2; + bool has_ttl = rnd.Next() % 2; + uint64_t expiration = rnd.Next() % kMaxExpiration; + int len = is_small_value ? 10 : 200; + std::string key = "key" + ToString(rnd.Next() % kNumKeys); + std::string value = test::RandomHumanReadableString(&rnd, len); + if (!has_ttl) { + if (is_small_value) { + std::string blob_entry; + BlobIndex::EncodeInlinedTTL(&blob_entry, expiration, value); + // Fake blob index with TTL. See what it will do. 
+ ASSERT_GT(kMinBlobSize, blob_entry.size()); + value = blob_entry; + } + ASSERT_OK(Put(key, value)); + data_after_compact[key] = value; + } else { + ASSERT_OK(PutUntil(key, value, expiration)); + if (expiration <= kCompactTime) { + data_after_compact.erase(key); + } else { + data_after_compact[key] = value; + } + } + data[key] = value; + } + VerifyDB(data); + + mock_env_->set_current_time(kCompactTime); + // Take a snapshot before compaction. Make sure expired blob indexes is + // filtered regardless of snapshot. + const Snapshot *snapshot = blob_db_->GetSnapshot(); + // Issue manual compaction to trigger compaction filter. + ASSERT_OK(blob_db_->CompactRange(CompactRangeOptions(), + blob_db_->DefaultColumnFamily(), nullptr, + nullptr)); + blob_db_->ReleaseSnapshot(snapshot); + // Verify expired blob index are filtered. + std::vector versions; + GetAllKeyVersions(blob_db_, "", "", &versions); + ASSERT_EQ(data_after_compact.size(), versions.size()); + for (auto &version : versions) { + ASSERT_TRUE(data_after_compact.count(version.user_key) > 0); + } + VerifyDB(data_after_compact); } } // namespace blob_db diff --git a/utilities/blob_db/blob_dump_tool.cc b/utilities/blob_db/blob_dump_tool.cc index f426802c2..b7ae8162d 100644 --- a/utilities/blob_db/blob_dump_tool.cc +++ b/utilities/blob_db/blob_dump_tool.cc @@ -18,7 +18,6 @@ #include "rocksdb/convenience.h" #include "rocksdb/env.h" #include "util/coding.h" -#include "util/crc32c.h" #include "util/string_util.h" namespace rocksdb { @@ -92,7 +91,7 @@ Status BlobDumpTool::Read(uint64_t offset, size_t size, Slice* result) { Status BlobDumpTool::DumpBlobLogHeader(uint64_t* offset) { Slice slice; - Status s = Read(0, BlobLogHeader::kHeaderSize, &slice); + Status s = Read(0, BlobLogHeader::kSize, &slice); if (!s.ok()) { return s; } @@ -102,20 +101,19 @@ Status BlobDumpTool::DumpBlobLogHeader(uint64_t* offset) { return s; } fprintf(stdout, "Blob log header:\n"); - fprintf(stdout, " Magic Number : %u\n", 
header.magic_number()); - fprintf(stdout, " Version : %d\n", header.version()); - CompressionType compression = header.compression(); + fprintf(stdout, " Version : %" PRIu32 "\n", header.version); + fprintf(stdout, " Column Family ID : %" PRIu32 "\n", + header.column_family_id); std::string compression_str; - if (!GetStringFromCompressionType(&compression_str, compression).ok()) { + if (!GetStringFromCompressionType(&compression_str, header.compression) + .ok()) { compression_str = "Unrecongnized compression type (" + - ToString((int)header.compression()) + ")"; + ToString((int)header.compression) + ")"; } - fprintf(stdout, " Compression : %s\n", compression_str.c_str()); - fprintf(stdout, " TTL Range : %s\n", - GetString(header.ttl_range()).c_str()); - fprintf(stdout, " Timestamp Range: %s\n", - GetString(header.ts_range()).c_str()); - *offset = BlobLogHeader::kHeaderSize; + fprintf(stdout, " Compression : %s\n", compression_str.c_str()); + fprintf(stdout, " Expiration range : %s\n", + GetString(header.expiration_range).c_str()); + *offset = BlobLogHeader::kSize; return s; } @@ -126,20 +124,12 @@ Status BlobDumpTool::DumpBlobLogFooter(uint64_t file_size, fprintf(stdout, "No blob log footer.\n"); return Status::OK(); }; - if (file_size < BlobLogHeader::kHeaderSize + BlobLogFooter::kFooterSize) { + if (file_size < BlobLogHeader::kSize + BlobLogFooter::kSize) { return no_footer(); } Slice slice; - Status s = Read(file_size - 4, 4, &slice); - if (!s.ok()) { - return s; - } - uint32_t magic_number = DecodeFixed32(slice.data()); - if (magic_number != kMagicNumber) { - return no_footer(); - } - *footer_offset = file_size - BlobLogFooter::kFooterSize; - s = Read(*footer_offset, BlobLogFooter::kFooterSize, &slice); + *footer_offset = file_size - BlobLogFooter::kSize; + Status s = Read(*footer_offset, BlobLogFooter::kSize, &slice); if (!s.ok()) { return s; } @@ -149,13 +139,11 @@ Status BlobDumpTool::DumpBlobLogFooter(uint64_t file_size, return s; } fprintf(stdout, "Blob 
log footer:\n"); - fprintf(stdout, " Blob count : %" PRIu64 "\n", footer.GetBlobCount()); - fprintf(stdout, " TTL Range : %s\n", - GetString(footer.GetTTLRange()).c_str()); - fprintf(stdout, " Time Range : %s\n", - GetString(footer.GetTimeRange()).c_str()); - fprintf(stdout, " Sequence Range : %s\n", - GetString(footer.GetSNRange()).c_str()); + fprintf(stdout, " Blob count : %" PRIu64 "\n", footer.blob_count); + fprintf(stdout, " Expiration Range : %s\n", + GetString(footer.expiration_range).c_str()); + fprintf(stdout, " Sequence Range : %s\n", + GetString(footer.sequence_range).c_str()); return s; } @@ -173,49 +161,25 @@ Status BlobDumpTool::DumpRecord(DisplayType show_key, DisplayType show_blob, if (!s.ok()) { return s; } - uint32_t key_size = record.GetKeySize(); - uint64_t blob_size = record.GetBlobSize(); - fprintf(stdout, " key size : %d\n", key_size); - fprintf(stdout, " blob size : %" PRIu64 "\n", record.GetBlobSize()); - fprintf(stdout, " TTL : %u\n", record.GetTTL()); - fprintf(stdout, " time : %" PRIu64 "\n", record.GetTimeVal()); - fprintf(stdout, " type : %d, %d\n", record.type(), record.subtype()); - fprintf(stdout, " header CRC : %u\n", record.header_checksum()); - fprintf(stdout, " CRC : %u\n", record.checksum()); - uint32_t header_crc = - crc32c::Extend(0, slice.data(), slice.size() - 2 * sizeof(uint32_t)); + uint64_t key_size = record.key_size; + uint64_t value_size = record.value_size; + fprintf(stdout, " key size : %" PRIu64 "\n", key_size); + fprintf(stdout, " value size : %" PRIu64 "\n", value_size); + fprintf(stdout, " expiration : %" PRIu64 "\n", record.expiration); *offset += BlobLogRecord::kHeaderSize; - s = Read(*offset, key_size + blob_size + BlobLogRecord::kFooterSize, &slice); + s = Read(*offset, key_size + value_size, &slice); if (!s.ok()) { return s; } - header_crc = crc32c::Extend(header_crc, slice.data(), key_size); - header_crc = crc32c::Mask(header_crc); - if (header_crc != record.header_checksum()) { - return 
Status::Corruption("Record header checksum mismatch."); - } - uint32_t blob_crc = crc32c::Extend(0, slice.data() + key_size, blob_size); - blob_crc = crc32c::Mask(blob_crc); - if (blob_crc != record.checksum()) { - return Status::Corruption("Blob checksum mismatch."); - } if (show_key != DisplayType::kNone) { fprintf(stdout, " key : "); DumpSlice(Slice(slice.data(), key_size), show_key); if (show_blob != DisplayType::kNone) { fprintf(stdout, " blob : "); - DumpSlice(Slice(slice.data() + key_size, blob_size), show_blob); + DumpSlice(Slice(slice.data() + key_size, value_size), show_blob); } } - Slice footer_slice(slice.data() + record.GetKeySize() + record.GetBlobSize(), - BlobLogRecord::kFooterSize); - s = record.DecodeFooterFrom(footer_slice); - if (!s.ok()) { - return s; - } - fprintf(stdout, " footer CRC : %u\n", record.footer_checksum()); - fprintf(stdout, " sequence : %" PRIu64 "\n", record.GetSN()); - *offset += key_size + blob_size + BlobLogRecord::kFooterSize; + *offset += key_size + value_size; return s; } diff --git a/utilities/blob_db/blob_file.cc b/utilities/blob_db/blob_file.cc index 51bba2fb8..162f364a2 100644 --- a/utilities/blob_db/blob_file.cc +++ b/utilities/blob_db/blob_file.cc @@ -3,15 +3,24 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
#ifndef ROCKSDB_LITE +#include "utilities/blob_db/blob_file.h" +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS +#endif + +#include #include -#include -#include + +#include #include -#include "utilities/blob_db/blob_db_impl.h" +#include "db/column_family.h" +#include "db/db_impl.h" +#include "db/dbformat.h" #include "util/filename.h" #include "util/logging.h" +#include "utilities/blob_db/blob_db_impl.h" namespace rocksdb { @@ -20,17 +29,18 @@ namespace blob_db { BlobFile::BlobFile() : parent_(nullptr), file_number_(0), + has_ttl_(false), + compression_(kNoCompression), blob_count_(0), gc_epoch_(-1), file_size_(0), deleted_count_(0), deleted_size_(0), closed_(false), - can_be_deleted_(false), + obsolete_(false), gc_once_after_open_(false), - ttl_range_(std::make_pair(0, 0)), - time_range_(std::make_pair(0, 0)), - sn_range_(std::make_pair(0, 0)), + expiration_range_({0, 0}), + sequence_range_({kMaxSequenceNumber, 0}), last_access_(-1), last_fsync_(0), header_valid_(false) {} @@ -39,23 +49,24 @@ BlobFile::BlobFile(const BlobDBImpl* p, const std::string& bdir, uint64_t fn) : parent_(p), path_to_dir_(bdir), file_number_(fn), + has_ttl_(false), + compression_(kNoCompression), blob_count_(0), gc_epoch_(-1), file_size_(0), deleted_count_(0), deleted_size_(0), closed_(false), - can_be_deleted_(false), + obsolete_(false), gc_once_after_open_(false), - ttl_range_(std::make_pair(0, 0)), - time_range_(std::make_pair(0, 0)), - sn_range_(std::make_pair(0, 0)), + expiration_range_({0, 0}), + sequence_range_({kMaxSequenceNumber, 0}), last_access_(-1), last_fsync_(0), header_valid_(false) {} BlobFile::~BlobFile() { - if (can_be_deleted_) { + if (obsolete_) { std::string pn(PathName()); Status s = Env::Default()->DeleteFile(PathName()); if (!s.ok()) { @@ -65,6 +76,13 @@ BlobFile::~BlobFile() { } } +uint32_t BlobFile::column_family_id() const { + // TODO(yiwu): Should return column family id encoded in blob file after + // we add blob db column family support. 
+ return reinterpret_cast(parent_->DefaultColumnFamily()) + ->GetID(); +} + std::string BlobFile::PathName() const { return BlobFileName(path_to_dir_, file_number_); } @@ -94,16 +112,21 @@ std::string BlobFile::DumpState() const { "path: %s fn: %" PRIu64 " blob_count: %" PRIu64 " gc_epoch: %" PRIu64 " file_size: %" PRIu64 " deleted_count: %" PRIu64 " deleted_size: %" PRIu64 - " closed: %d can_be_deleted: %d ttl_range: (%d, %d)" - " sn_range: (%" PRIu64 " %" PRIu64 "), writer: %d reader: %d", + " closed: %d obsolete: %d expiration_range: (%" PRIu64 ", %" PRIu64 + ") sequence_range: (%" PRIu64 " %" PRIu64 "), writer: %d reader: %d", path_to_dir_.c_str(), file_number_, blob_count_.load(), gc_epoch_.load(), file_size_.load(), deleted_count_, deleted_size_, - closed_.load(), can_be_deleted_.load(), ttl_range_.first, - ttl_range_.second, sn_range_.first, sn_range_.second, - (!!log_writer_), (!!ra_file_reader_)); + closed_.load(), obsolete_.load(), expiration_range_.first, + expiration_range_.second, sequence_range_.first, + sequence_range_.second, (!!log_writer_), (!!ra_file_reader_)); return str; } +void BlobFile::MarkObsolete(SequenceNumber sequence) { + obsolete_sequence_ = sequence; + obsolete_.store(true); +} + bool BlobFile::NeedsFsync(bool hard, uint64_t bytes_per_sync) const { assert(last_fsync_ <= file_size_); return (hard) ? file_size_ > last_fsync_ @@ -115,17 +138,18 @@ Status BlobFile::WriteFooterAndCloseLocked() { "File is being closed after footer %s", PathName().c_str()); BlobLogFooter footer; - footer.blob_count_ = blob_count_; - if (HasTTL()) footer.set_ttl_range(ttl_range_); + footer.blob_count = blob_count_; + if (HasTTL()) { + footer.expiration_range = expiration_range_; + } - footer.sn_range_ = sn_range_; - if (HasTimestamp()) footer.set_time_range(time_range_); + footer.sequence_range = sequence_range_; // this will close the file and reset the Writable File Pointer. 
Status s = log_writer_->AppendFooter(footer); if (s.ok()) { closed_ = true; - file_size_ += BlobLogFooter::kFooterSize; + file_size_ += BlobLogFooter::kSize; } else { ROCKS_LOG_ERROR(parent_->db_options_.info_log, "Failure to read Header for blob-file %s", @@ -137,20 +161,20 @@ Status BlobFile::WriteFooterAndCloseLocked() { } Status BlobFile::ReadFooter(BlobLogFooter* bf) { - if (file_size_ < (BlobLogHeader::kHeaderSize + BlobLogFooter::kFooterSize)) { + if (file_size_ < (BlobLogHeader::kSize + BlobLogFooter::kSize)) { return Status::IOError("File does not have footer", PathName()); } - uint64_t footer_offset = file_size_ - BlobLogFooter::kFooterSize; + uint64_t footer_offset = file_size_ - BlobLogFooter::kSize; // assume that ra_file_reader_ is valid before we enter this assert(ra_file_reader_); Slice result; - char scratch[BlobLogFooter::kFooterSize + 10]; - Status s = ra_file_reader_->Read(footer_offset, BlobLogFooter::kFooterSize, - &result, scratch); + char scratch[BlobLogFooter::kSize + 10]; + Status s = ra_file_reader_->Read(footer_offset, BlobLogFooter::kSize, &result, + scratch); if (!s.ok()) return s; - if (result.size() != BlobLogFooter::kFooterSize) { + if (result.size() != BlobLogFooter::kSize) { // should not happen return Status::IOError("EOF reached before footer"); } @@ -160,21 +184,12 @@ Status BlobFile::ReadFooter(BlobLogFooter* bf) { } Status BlobFile::SetFromFooterLocked(const BlobLogFooter& footer) { - if (footer.HasTTL() != header_.HasTTL()) { - return Status::Corruption("has_ttl mismatch"); - } - if (footer.HasTimestamp() != header_.HasTimestamp()) { - return Status::Corruption("has_ts mismatch"); - } - // assume that file has been fully fsync'd last_fsync_.store(file_size_); - blob_count_ = footer.GetBlobCount(); - ttl_range_ = footer.GetTTLRange(); - time_range_ = footer.GetTimeRange(); - sn_range_ = footer.GetSNRange(); + blob_count_ = footer.blob_count; + expiration_range_ = footer.expiration_range; + sequence_range_ = 
footer.sequence_range; closed_ = true; - return Status::OK(); } @@ -193,8 +208,10 @@ void BlobFile::CloseRandomAccessLocked() { std::shared_ptr BlobFile::GetOrOpenRandomAccessReader( Env* env, const EnvOptions& env_options, bool* fresh_open) { *fresh_open = false; - last_access_ = - std::chrono::system_clock::to_time_t(std::chrono::system_clock::now()); + int64_t current_time = 0; + env->GetCurrentTime(¤t_time); + last_access_.store(current_time); + { ReadLock lockbfile_r(&mutex_); if (ra_file_reader_) return ra_file_reader_; @@ -220,10 +237,6 @@ std::shared_ptr BlobFile::GetOrOpenRandomAccessReader( return ra_file_reader_; } -ColumnFamilyHandle* BlobFile::GetColumnFamily(DB* db) { - return db->DefaultColumnFamily(); -} - } // namespace blob_db } // namespace rocksdb #endif // ROCKSDB_LITE diff --git a/utilities/blob_db/blob_file.h b/utilities/blob_db/blob_file.h new file mode 100644 index 000000000..4085cfef0 --- /dev/null +++ b/utilities/blob_db/blob_file.h @@ -0,0 +1,216 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +#pragma once +#ifndef ROCKSDB_LITE + +#include +#include + +#include "port/port.h" +#include "rocksdb/env.h" +#include "rocksdb/options.h" +#include "util/file_reader_writer.h" +#include "utilities/blob_db/blob_log_format.h" +#include "utilities/blob_db/blob_log_reader.h" +#include "utilities/blob_db/blob_log_writer.h" + +namespace rocksdb { +namespace blob_db { + +class BlobDBImpl; + +class BlobFile { + friend class BlobDBImpl; + friend struct blobf_compare_ttl; + + private: + // access to parent + const BlobDBImpl* parent_; + + // path to blob directory + std::string path_to_dir_; + + // the id of the file. 
+ // the above 2 are created during file creation and never changed + // after that + uint64_t file_number_; + + // If true, the keys in this file all has TTL. Otherwise all keys don't + // have TTL. + bool has_ttl_; + + // Compression type of blobs in the file + CompressionType compression_; + + // number of blobs in the file + std::atomic blob_count_; + + // the file will be selected for GC in this future epoch + std::atomic gc_epoch_; + + // size of the file + std::atomic file_size_; + + // number of blobs in this particular file which have been evicted + uint64_t deleted_count_; + + // size of deleted blobs (used by heuristic to select file for GC) + uint64_t deleted_size_; + + BlobLogHeader header_; + + // closed_ = true implies the file is no more mutable + // no more blobs will be appended and the footer has been written out + std::atomic closed_; + + // has a pass of garbage collection successfully finished on this file + // obsolete_ still needs to do iterator/snapshot checks + std::atomic obsolete_; + + // The last sequence number by the time the file marked as obsolete. + // Data in this file is visible to a snapshot taken before the sequence. + SequenceNumber obsolete_sequence_; + + // should this file been gc'd once to reconcile lost deletes/compactions + std::atomic gc_once_after_open_; + + ExpirationRange expiration_range_; + + SequenceRange sequence_range_; + + // Sequential/Append writer for blobs + std::shared_ptr log_writer_; + + // random access file reader for GET calls + std::shared_ptr ra_file_reader_; + + // This Read-Write mutex is per file specific and protects + // all the datastructures + mutable port::RWMutex mutex_; + + // time when the random access reader was last created. 
+ std::atomic last_access_; + + // last time file was fsync'd/fdatasyncd + std::atomic last_fsync_; + + bool header_valid_; + + SequenceNumber garbage_collection_finish_sequence_; + + public: + BlobFile(); + + BlobFile(const BlobDBImpl* parent, const std::string& bdir, uint64_t fnum); + + ~BlobFile(); + + uint32_t column_family_id() const; + + // Returns log file's pathname relative to the main db dir + // Eg. For a live-log-file = blob_dir/000003.blob + std::string PathName() const; + + // Primary identifier for blob file. + // once the file is created, this never changes + uint64_t BlobFileNumber() const { return file_number_; } + + // the following functions are atomic, and don't need + // read lock + uint64_t BlobCount() const { + return blob_count_.load(std::memory_order_acquire); + } + + std::string DumpState() const; + + // if the file has gone through GC and blobs have been relocated + bool Obsolete() const { + assert(Immutable() || !obsolete_.load()); + return obsolete_.load(); + } + + // Mark file as obsolete by garbage collection. The file is not visible to + // snapshots with sequence greater or equal to the given sequence. + void MarkObsolete(SequenceNumber sequence); + + SequenceNumber GetObsoleteSequence() const { + assert(Obsolete()); + return obsolete_sequence_; + } + + // if the file is not taking any more appends. 
+ bool Immutable() const { return closed_.load(); } + + // we will assume this is atomic + bool NeedsFsync(bool hard, uint64_t bytes_per_sync) const; + + void Fsync(); + + uint64_t GetFileSize() const { + return file_size_.load(std::memory_order_acquire); + } + + // All Get functions which are not atomic, will need ReadLock on the mutex + + ExpirationRange GetExpirationRange() const { return expiration_range_; } + + void ExtendExpirationRange(uint64_t expiration) { + expiration_range_.first = std::min(expiration_range_.first, expiration); + expiration_range_.second = std::max(expiration_range_.second, expiration); + } + + SequenceRange GetSequenceRange() const { return sequence_range_; } + + void SetSequenceRange(SequenceRange sequence_range) { + sequence_range_ = sequence_range; + } + + void ExtendSequenceRange(SequenceNumber sequence) { + sequence_range_.first = std::min(sequence_range_.first, sequence); + sequence_range_.second = std::max(sequence_range_.second, sequence); + } + + bool HasTTL() const { return has_ttl_; } + + void SetHasTTL(bool has_ttl) { has_ttl_ = has_ttl; } + + CompressionType compression() const { return compression_; } + + void SetCompression(CompressionType c) { + compression_ = c; + } + + std::shared_ptr GetWriter() const { return log_writer_; } + + private: + std::shared_ptr OpenSequentialReader( + Env* env, const DBOptions& db_options, + const EnvOptions& env_options) const; + + Status ReadFooter(BlobLogFooter* footer); + + Status WriteFooterAndCloseLocked(); + + std::shared_ptr GetOrOpenRandomAccessReader( + Env* env, const EnvOptions& env_options, bool* fresh_open); + + void CloseRandomAccessLocked(); + + // this is used, when you are reading only the footer of a + // previously closed file + Status SetFromFooterLocked(const BlobLogFooter& footer); + + void set_expiration_range(const ExpirationRange& expiration_range) { + expiration_range_ = expiration_range; + } + + // The following functions are atomic, and don't need locks + void 
SetFileSize(uint64_t fs) { file_size_ = fs; } + + void SetBlobCount(uint64_t bc) { blob_count_ = bc; } +}; +} // namespace blob_db +} // namespace rocksdb +#endif // ROCKSDB_LITE diff --git a/utilities/blob_db/blob_index.h b/utilities/blob_db/blob_index.h new file mode 100644 index 000000000..fd91b547a --- /dev/null +++ b/utilities/blob_db/blob_index.h @@ -0,0 +1,161 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +#pragma once +#ifndef ROCKSDB_LITE + +#include "rocksdb/options.h" +#include "util/coding.h" +#include "util/string_util.h" + +namespace rocksdb { +namespace blob_db { + +// BlobIndex is a pointer to the blob and metadata of the blob. The index is +// stored in base DB as ValueType::kTypeBlobIndex. +// There are three types of blob index: +// +// kInlinedTTL: +// +------+------------+---------------+ +// | type | expiration | value | +// +------+------------+---------------+ +// | char | varint64 | variable size | +// +------+------------+---------------+ +// +// kBlob: +// +------+-------------+----------+----------+-------------+ +// | type | file number | offset | size | compression | +// +------+-------------+----------+----------+-------------+ +// | char | varint64 | varint64 | varint64 | char | +// +------+-------------+----------+----------+-------------+ +// +// kBlobTTL: +// +------+------------+-------------+----------+----------+-------------+ +// | type | expiration | file number | offset | size | compression | +// +------+------------+-------------+----------+----------+-------------+ +// | char | varint64 | varint64 | varint64 | varint64 | char | +// +------+------------+-------------+----------+----------+-------------+ +// +// There isn't a kInlined (without TTL) type since we can store it as a plain +// value (i.e. 
ValueType::kTypeValue). +class BlobIndex { + public: + enum class Type : unsigned char { + kInlinedTTL = 0, + kBlob = 1, + kBlobTTL = 2, + kUnknown = 3, + }; + + BlobIndex() : type_(Type::kUnknown) {} + + bool IsInlined() const { return type_ == Type::kInlinedTTL; } + + bool HasTTL() const { + return type_ == Type::kInlinedTTL || type_ == Type::kBlobTTL; + } + + uint64_t expiration() const { + assert(HasTTL()); + return expiration_; + } + + const Slice& value() const { + assert(IsInlined()); + return value_; + } + + uint64_t file_number() const { + assert(!IsInlined()); + return file_number_; + } + + uint64_t offset() const { + assert(!IsInlined()); + return offset_; + } + + uint64_t size() const { + assert(!IsInlined()); + return size_; + } + + Status DecodeFrom(Slice slice) { + static const std::string kErrorMessage = "Error while decoding blob index"; + assert(slice.size() > 0); + type_ = static_cast(*slice.data()); + if (type_ >= Type::kUnknown) { + return Status::Corruption( + kErrorMessage, + "Unknown blob index type: " + ToString(static_cast(type_))); + } + slice = Slice(slice.data() + 1, slice.size() - 1); + if (HasTTL()) { + if (!GetVarint64(&slice, &expiration_)) { + return Status::Corruption(kErrorMessage, "Corrupted expiration"); + } + } + if (IsInlined()) { + value_ = slice; + } else { + if (GetVarint64(&slice, &file_number_) && GetVarint64(&slice, &offset_) && + GetVarint64(&slice, &size_) && slice.size() == 1) { + compression_ = static_cast(*slice.data()); + } else { + return Status::Corruption(kErrorMessage, "Corrupted blob offset"); + } + } + return Status::OK(); + } + + static void EncodeInlinedTTL(std::string* dst, uint64_t expiration, + const Slice& value) { + assert(dst != nullptr); + dst->clear(); + dst->reserve(1 + kMaxVarint64Length + value.size()); + dst->push_back(static_cast(Type::kInlinedTTL)); + PutVarint64(dst, expiration); + dst->append(value.data(), value.size()); + } + + static void EncodeBlob(std::string* dst, uint64_t file_number, 
+ uint64_t offset, uint64_t size, + CompressionType compression) { + assert(dst != nullptr); + dst->clear(); + dst->reserve(kMaxVarint64Length * 3 + 2); + dst->push_back(static_cast(Type::kBlob)); + PutVarint64(dst, file_number); + PutVarint64(dst, offset); + PutVarint64(dst, size); + dst->push_back(static_cast(compression)); + } + + static void EncodeBlobTTL(std::string* dst, uint64_t expiration, + uint64_t file_number, uint64_t offset, + uint64_t size, CompressionType compression) { + assert(dst != nullptr); + dst->clear(); + dst->reserve(kMaxVarint64Length * 4 + 2); + dst->push_back(static_cast(Type::kBlobTTL)); + PutVarint64(dst, expiration); + PutVarint64(dst, file_number); + PutVarint64(dst, offset); + PutVarint64(dst, size); + dst->push_back(static_cast(compression)); + } + + private: + Type type_ = Type::kUnknown; + uint64_t expiration_ = 0; + Slice value_; + uint64_t file_number_ = 0; + uint64_t offset_ = 0; + uint64_t size_ = 0; + CompressionType compression_ = kNoCompression; +}; + +} // namespace blob_db +} // namespace rocksdb +#endif // ROCKSDB_LITE diff --git a/utilities/blob_db/blob_log_format.cc b/utilities/blob_db/blob_log_format.cc index 6917a290f..eb748ac99 100644 --- a/utilities/blob_db/blob_log_format.cc +++ b/utilities/blob_db/blob_log_format.cc @@ -6,308 +6,145 @@ #ifndef ROCKSDB_LITE #include "utilities/blob_db/blob_log_format.h" + #include "util/coding.h" #include "util/crc32c.h" namespace rocksdb { namespace blob_db { -const uint32_t kMagicNumber = 2395959; -const uint32_t kVersion1 = 1; -const size_t kBlockSize = 32768; - -BlobLogHeader::BlobLogHeader() - : magic_number_(kMagicNumber), compression_(kNoCompression) {} - -BlobLogHeader& BlobLogHeader::operator=(BlobLogHeader&& in) noexcept { - if (this != &in) { - magic_number_ = in.magic_number_; - version_ = in.version_; - ttl_guess_ = std::move(in.ttl_guess_); - ts_guess_ = std::move(in.ts_guess_); - compression_ = in.compression_; - } - return *this; +void 
BlobLogHeader::EncodeTo(std::string* dst) { + assert(dst != nullptr); + dst->clear(); + dst->reserve(BlobLogHeader::kSize); + PutFixed32(dst, kMagicNumber); + PutFixed32(dst, version); + PutFixed32(dst, column_family_id); + unsigned char flags = (has_ttl ? 1 : 0); + dst->push_back(flags); + dst->push_back(compression); + PutFixed64(dst, expiration_range.first); + PutFixed64(dst, expiration_range.second); } -BlobLogFooter::BlobLogFooter() : magic_number_(kMagicNumber), blob_count_(0) {} - -Status BlobLogFooter::DecodeFrom(const Slice& input) { - Slice slice(input); - uint32_t val; - if (!GetFixed32(&slice, &val)) { - return Status::Corruption("Invalid Blob Footer: flags"); - } - - bool has_ttl = false; - bool has_ts = false; - val >>= 8; - RecordSubType st = static_cast(val); - switch (st) { - case kRegularType: - break; - case kTTLType: - has_ttl = true; - break; - case kTimestampType: - has_ts = true; - break; - default: - return Status::Corruption("Invalid Blob Footer: flags_val"); - } - - if (!GetFixed64(&slice, &blob_count_)) { - return Status::Corruption("Invalid Blob Footer: blob_count"); - } - - ttlrange_t temp_ttl; - if (!GetFixed32(&slice, &temp_ttl.first) || - !GetFixed32(&slice, &temp_ttl.second)) { - return Status::Corruption("Invalid Blob Footer: ttl_range"); - } - if (has_ttl) { - ttl_range_.reset(new ttlrange_t(temp_ttl)); - } - - if (!GetFixed64(&slice, &sn_range_.first) || - !GetFixed64(&slice, &sn_range_.second)) { - return Status::Corruption("Invalid Blob Footer: sn_range"); - } - - tsrange_t temp_ts; - if (!GetFixed64(&slice, &temp_ts.first) || - !GetFixed64(&slice, &temp_ts.second)) { - return Status::Corruption("Invalid Blob Footer: ts_range"); - } - if (has_ts) { - ts_range_.reset(new tsrange_t(temp_ts)); - } - - if (!GetFixed32(&slice, &magic_number_) || magic_number_ != kMagicNumber) { - return Status::Corruption("Invalid Blob Footer: magic"); +Status BlobLogHeader::DecodeFrom(Slice src) { + static const std::string kErrorMessage = + "Error 
while decoding blob log header"; + if (src.size() != BlobLogHeader::kSize) { + return Status::Corruption(kErrorMessage, + "Unexpected blob file header size"); + } + uint32_t magic_number; + unsigned char flags; + if (!GetFixed32(&src, &magic_number) || !GetFixed32(&src, &version) || + !GetFixed32(&src, &column_family_id)) { + return Status::Corruption( + kErrorMessage, + "Error decoding magic number, version and column family id"); + } + if (magic_number != kMagicNumber) { + return Status::Corruption(kErrorMessage, "Magic number mismatch"); + } + if (version != kVersion1) { + return Status::Corruption(kErrorMessage, "Unknown header version"); + } + flags = src.data()[0]; + compression = static_cast(src.data()[1]); + has_ttl = (flags & 1) == 1; + src.remove_prefix(2); + if (!GetFixed64(&src, &expiration_range.first) || + !GetFixed64(&src, &expiration_range.second)) { + return Status::Corruption(kErrorMessage, "Error decoding expiration range"); } - return Status::OK(); } -void BlobLogFooter::EncodeTo(std::string* dst) const { - dst->reserve(kFooterSize); - - RecordType rt = kFullType; - RecordSubType st = kRegularType; - if (HasTTL()) { - st = kTTLType; - } else if (HasTimestamp()) { - st = kTimestampType; - } - uint32_t val = static_cast(rt) | (static_cast(st) << 8); - PutFixed32(dst, val); - - PutFixed64(dst, blob_count_); - bool has_ttl = HasTTL(); - bool has_ts = HasTimestamp(); - - if (has_ttl) { - PutFixed32(dst, ttl_range_.get()->first); - PutFixed32(dst, ttl_range_.get()->second); - } else { - PutFixed32(dst, 0); - PutFixed32(dst, 0); - } - PutFixed64(dst, sn_range_.first); - PutFixed64(dst, sn_range_.second); - - if (has_ts) { - PutFixed64(dst, ts_range_.get()->first); - PutFixed64(dst, ts_range_.get()->second); - } else { - PutFixed64(dst, 0); - PutFixed64(dst, 0); - } - - PutFixed32(dst, magic_number_); -} - -void BlobLogHeader::EncodeTo(std::string* dst) const { - dst->reserve(kHeaderSize); - - PutFixed32(dst, magic_number_); - - PutFixed32(dst, 
version_); - - RecordSubType st = kRegularType; - bool has_ttl = HasTTL(); - bool has_ts = HasTimestamp(); - - if (has_ttl) { - st = kTTLType; - } else if (has_ts) { - st = kTimestampType; - } - uint32_t val = - static_cast(st) | (static_cast(compression_) << 8); - PutFixed32(dst, val); - - if (has_ttl) { - PutFixed32(dst, ttl_guess_.get()->first); - PutFixed32(dst, ttl_guess_.get()->second); - } else { - PutFixed32(dst, 0); - PutFixed32(dst, 0); - } - - if (has_ts) { - PutFixed64(dst, ts_guess_.get()->first); - PutFixed64(dst, ts_guess_.get()->second); - } else { - PutFixed64(dst, 0); - PutFixed64(dst, 0); - } +void BlobLogFooter::EncodeTo(std::string* dst) { + assert(dst != nullptr); + dst->clear(); + dst->reserve(BlobLogFooter::kSize); + PutFixed32(dst, kMagicNumber); + PutFixed64(dst, blob_count); + PutFixed64(dst, expiration_range.first); + PutFixed64(dst, expiration_range.second); + PutFixed64(dst, sequence_range.first); + PutFixed64(dst, sequence_range.second); + crc = crc32c::Value(dst->c_str(), dst->size()); + crc = crc32c::Mask(crc); + PutFixed32(dst, crc); } -Status BlobLogHeader::DecodeFrom(const Slice& input) { - Slice slice(input); - if (!GetFixed32(&slice, &magic_number_) || magic_number_ != kMagicNumber) { - return Status::Corruption("Invalid Blob Log Header: magic"); - } - - // as of today, we only support 1 version - if (!GetFixed32(&slice, &version_) || version_ != kVersion1) { - return Status::Corruption("Invalid Blob Log Header: version"); - } - - uint32_t val; - if (!GetFixed32(&slice, &val)) { - return Status::Corruption("Invalid Blob Log Header: subtype"); - } - - bool has_ttl = false; - bool has_ts = false; - RecordSubType st = static_cast(val & 0xff); - compression_ = static_cast((val >> 8) & 0xff); - switch (st) { - case kRegularType: - break; - case kTTLType: - has_ttl = true; - break; - case kTimestampType: - has_ts = true; - break; - default: - return Status::Corruption("Invalid Blob Log Header: subtype_2"); - } - - ttlrange_t 
temp_ttl; - if (!GetFixed32(&slice, &temp_ttl.first) || - !GetFixed32(&slice, &temp_ttl.second)) { - return Status::Corruption("Invalid Blob Log Header: ttl"); +Status BlobLogFooter::DecodeFrom(Slice src) { + static const std::string kErrorMessage = + "Error while decoding blob log footer"; + if (src.size() != BlobLogFooter::kSize) { + return Status::Corruption(kErrorMessage, + "Unexpected blob file footer size"); + } + uint32_t src_crc = 0; + src_crc = crc32c::Value(src.data(), BlobLogFooter::kSize - 4); + src_crc = crc32c::Mask(src_crc); + uint32_t magic_number; + if (!GetFixed32(&src, &magic_number) || !GetFixed64(&src, &blob_count) || + !GetFixed64(&src, &expiration_range.first) || + !GetFixed64(&src, &expiration_range.second) || + !GetFixed64(&src, &sequence_range.first) || + !GetFixed64(&src, &sequence_range.second) || !GetFixed32(&src, &crc)) { + return Status::Corruption(kErrorMessage, "Error decoding content"); + } + if (magic_number != kMagicNumber) { + return Status::Corruption(kErrorMessage, "Magic number mismatch"); + } + if (src_crc != crc) { + return Status::Corruption(kErrorMessage, "CRC mismatch"); } - if (has_ttl) set_ttl_guess(temp_ttl); - - tsrange_t temp_ts; - if (!GetFixed64(&slice, &temp_ts.first) || - !GetFixed64(&slice, &temp_ts.second)) { - return Status::Corruption("Invalid Blob Log Header: timestamp"); - } - if (has_ts) set_ts_guess(temp_ts); - return Status::OK(); } -BlobLogRecord::BlobLogRecord() - : checksum_(0), - header_cksum_(0), - key_size_(0), - blob_size_(0), - time_val_(0), - ttl_val_(0), - sn_(0), - type_(0), - subtype_(0) {} - -BlobLogRecord::~BlobLogRecord() {} - -void BlobLogRecord::ResizeKeyBuffer(size_t kbs) { - if (kbs > key_buffer_.size()) { - key_buffer_.resize(kbs); - } +void BlobLogRecord::EncodeHeaderTo(std::string* dst) { + assert(dst != nullptr); + dst->clear(); + dst->reserve(BlobLogRecord::kHeaderSize + key.size() + value.size()); + PutFixed64(dst, key.size()); + PutFixed64(dst, value.size()); + PutFixed64(dst, 
expiration); + header_crc = crc32c::Value(dst->c_str(), dst->size()); + header_crc = crc32c::Mask(header_crc); + PutFixed32(dst, header_crc); + blob_crc = crc32c::Value(key.data(), key.size()); + blob_crc = crc32c::Extend(blob_crc, value.data(), value.size()); + blob_crc = crc32c::Mask(blob_crc); + PutFixed32(dst, blob_crc); } -void BlobLogRecord::ResizeBlobBuffer(size_t bbs) { - if (bbs > blob_buffer_.size()) { - blob_buffer_.resize(bbs); +Status BlobLogRecord::DecodeHeaderFrom(Slice src) { + static const std::string kErrorMessage = "Error while decoding blob record"; + if (src.size() != BlobLogRecord::kHeaderSize) { + return Status::Corruption(kErrorMessage, + "Unexpected blob record header size"); } -} - -void BlobLogRecord::Clear() { - checksum_ = 0; - header_cksum_ = 0; - key_size_ = 0; - blob_size_ = 0; - time_val_ = 0; - ttl_val_ = 0; - sn_ = 0; - type_ = subtype_ = 0; - key_.clear(); - blob_.clear(); -} - -Status BlobLogRecord::DecodeHeaderFrom(const Slice& hdrslice) { - Slice input = hdrslice; - if (input.size() < kHeaderSize) { - return Status::Corruption("Invalid Blob Record Header: size"); - } - - if (!GetFixed32(&input, &key_size_)) { - return Status::Corruption("Invalid Blob Record Header: key_size"); - } - if (!GetFixed64(&input, &blob_size_)) { - return Status::Corruption("Invalid Blob Record Header: blob_size"); - } - if (!GetFixed32(&input, &ttl_val_)) { - return Status::Corruption("Invalid Blob Record Header: ttl_val"); - } - if (!GetFixed64(&input, &time_val_)) { - return Status::Corruption("Invalid Blob Record Header: time_val"); - } - - type_ = *(input.data()); - input.remove_prefix(1); - subtype_ = *(input.data()); - input.remove_prefix(1); - - if (!GetFixed32(&input, &header_cksum_)) { - return Status::Corruption("Invalid Blob Record Header: header_cksum"); + uint32_t src_crc = 0; + src_crc = crc32c::Value(src.data(), BlobLogRecord::kHeaderSize - 8); + src_crc = crc32c::Mask(src_crc); + if (!GetFixed64(&src, &key_size) || !GetFixed64(&src, 
&value_size) || + !GetFixed64(&src, &expiration) || !GetFixed32(&src, &header_crc) || + !GetFixed32(&src, &blob_crc)) { + return Status::Corruption(kErrorMessage, "Error decoding content"); } - if (!GetFixed32(&input, &checksum_)) { - return Status::Corruption("Invalid Blob Record Header: checksum"); + if (src_crc != header_crc) { + return Status::Corruption(kErrorMessage, "Header CRC mismatch"); } - return Status::OK(); } -Status BlobLogRecord::DecodeFooterFrom(const Slice& footerslice) { - Slice input = footerslice; - if (input.size() < kFooterSize) { - return Status::Corruption("Invalid Blob Record Footer: size"); +Status BlobLogRecord::CheckBlobCRC() const { + uint32_t expected_crc = 0; + expected_crc = crc32c::Value(key.data(), key.size()); + expected_crc = crc32c::Extend(expected_crc, value.data(), value.size()); + expected_crc = crc32c::Mask(expected_crc); + if (expected_crc != blob_crc) { + return Status::Corruption("Blob CRC mismatch"); } - - uint32_t f_crc = crc32c::Extend(0, input.data(), 8); - f_crc = crc32c::Mask(f_crc); - - if (!GetFixed64(&input, &sn_)) { - return Status::Corruption("Invalid Blob Record Footer: sn"); - } - - if (!GetFixed32(&input, &footer_cksum_)) { - return Status::Corruption("Invalid Blob Record Footer: cksum"); - } - - if (f_crc != footer_cksum_) { - return Status::Corruption("Record Checksum mismatch: footer_cksum"); - } - return Status::OK(); } diff --git a/utilities/blob_db/blob_log_format.h b/utilities/blob_db/blob_log_format.h index f4e62fe2d..0b5cff547 100644 --- a/utilities/blob_db/blob_log_format.h +++ b/utilities/blob_db/blob_log_format.h @@ -9,250 +9,113 @@ #ifndef ROCKSDB_LITE -#include -#include #include -#include -#include #include #include "rocksdb/options.h" +#include "rocksdb/slice.h" #include "rocksdb/status.h" #include "rocksdb/types.h" namespace rocksdb { - namespace blob_db { -class BlobFile; -class BlobDBImpl; - -enum RecordType : uint8_t { - // Zero is reserved for preallocated files - kFullType = 0, - - // 
For fragments - kFirstType = 1, - kMiddleType = 2, - kLastType = 3, - kMaxRecordType = kLastType -}; - -enum RecordSubType : uint8_t { - kRegularType = 0, - kTTLType = 1, - kTimestampType = 2, -}; - -extern const uint32_t kMagicNumber; - -class Reader; - -typedef std::pair ttlrange_t; -typedef std::pair tsrange_t; -typedef std::pair snrange_t; - -class BlobLogHeader { - friend class BlobFile; - friend class BlobDBImpl; - - private: - uint32_t magic_number_ = 0; - uint32_t version_ = 1; - CompressionType compression_; - std::unique_ptr ttl_guess_; - std::unique_ptr ts_guess_; - - private: - void set_ttl_guess(const ttlrange_t& ttl) { - ttl_guess_.reset(new ttlrange_t(ttl)); - } - - void set_version(uint32_t v) { version_ = v; } - - void set_ts_guess(const tsrange_t& ts) { ts_guess_.reset(new tsrange_t(ts)); } - - public: - // magic number + version + flags + ttl guess + timestamp range = 36 - static const size_t kHeaderSize = 4 + 4 + 4 + 4 * 2 + 8 * 2; - - void EncodeTo(std::string* dst) const; - - Status DecodeFrom(const Slice& input); - - BlobLogHeader(); - uint32_t magic_number() const { return magic_number_; } +constexpr uint32_t kMagicNumber = 2395959; // 0x00248f37 +constexpr uint32_t kVersion1 = 1; +constexpr uint64_t kNoExpiration = std::numeric_limits::max(); - uint32_t version() const { return version_; } +using ExpirationRange = std::pair; +using SequenceRange = std::pair; - CompressionType compression() const { return compression_; } - - ttlrange_t ttl_range() const { - if (!ttl_guess_) { - return {0, 0}; - } - return *ttl_guess_; - } - - tsrange_t ts_range() const { - if (!ts_guess_) { - return {0, 0}; - } - return *ts_guess_; - } +// Format of blob log file header (30 bytes): +// +// +--------------+---------+---------+-------+-------------+-------------------+ +// | magic number | version | cf id | flags | compression | expiration range | +// +--------------+---------+---------+-------+-------------+-------------------+ +// | Fixed32 | Fixed32 | 
Fixed32 | char | char | Fixed64 Fixed64 | +// +--------------+---------+---------+-------+-------------+-------------------+ +// +// List of flags: +// has_ttl: Whether the file contain TTL data. +// +// Expiration range in the header is a rough range based on +// blob_db_options.ttl_range_secs. +struct BlobLogHeader { + static constexpr size_t kSize = 30; - bool HasTTL() const { return !!ttl_guess_; } + uint32_t version = kVersion1; + uint32_t column_family_id = 0; + CompressionType compression = kNoCompression; + bool has_ttl = false; + ExpirationRange expiration_range = std::make_pair(0, 0); - bool HasTimestamp() const { return !!ts_guess_; } + void EncodeTo(std::string* dst); - BlobLogHeader& operator=(BlobLogHeader&& in) noexcept; + Status DecodeFrom(Slice slice); }; -// Footer encapsulates the fixed information stored at the tail -// end of every blob log file. -class BlobLogFooter { - friend class BlobFile; - - public: - // Use this constructor when you plan to write out the footer using - // EncodeTo(). Never use this constructor with DecodeFrom(). 
- BlobLogFooter(); - - uint32_t magic_number() const { return magic_number_; } - - void EncodeTo(std::string* dst) const; - - Status DecodeFrom(const Slice& input); - - // convert this object to a human readable form - std::string ToString() const; - - // footer size = 4 byte magic number - // 8 bytes count - // 4, 4 - ttl range - // 8, 8 - sn range - // 8, 8 - ts range - // = 56 - static const size_t kFooterSize = 4 + 4 + 8 + (4 * 2) + (8 * 2) + (8 * 2); - - bool HasTTL() const { return !!ttl_range_; } - - bool HasTimestamp() const { return !!ts_range_; } - - uint64_t GetBlobCount() const { return blob_count_; } - - ttlrange_t GetTTLRange() const { - if (ttl_range_) { - *ttl_range_; - } - return {0, 0}; - } - - tsrange_t GetTimeRange() const { - if (ts_range_) { - return *ts_range_; - } - return {0, 0}; - } - - const snrange_t& GetSNRange() const { return sn_range_; } +// Format of blob log file footer (48 bytes): +// +// +--------------+------------+-------------------+-------------------+------------+ +// | magic number | blob count | expiration range | sequence range | footer CRC | +// +--------------+------------+-------------------+-------------------+------------+ +// | Fixed32 | Fixed64 | Fixed64 + Fixed64 | Fixed64 + Fixed64 | Fixed32 | +// +--------------+------------+-------------------+-------------------+------------+ +// +// The footer will be presented only when the blob file is properly closed. +// +// Unlike the same field in file header, expiration range in the footer is the +// range of smallest and largest expiration of the data in this file. 
+struct BlobLogFooter { + static constexpr size_t kSize = 48; - private: - uint32_t magic_number_ = 0; - uint64_t blob_count_ = 0; + uint64_t blob_count = 0; + ExpirationRange expiration_range = std::make_pair(0, 0); + SequenceRange sequence_range = std::make_pair(0, 0); + uint32_t crc = 0; - std::unique_ptr ttl_range_; - std::unique_ptr ts_range_; - snrange_t sn_range_; + void EncodeTo(std::string* dst); - private: - void set_ttl_range(const ttlrange_t& ttl) { - ttl_range_.reset(new ttlrange_t(ttl)); - } - void set_time_range(const tsrange_t& ts) { - ts_range_.reset(new tsrange_t(ts)); - } + Status DecodeFrom(Slice slice); }; -extern const size_t kBlockSize; - -class BlobLogRecord { - friend class Reader; - - private: - // this might not be set. - uint32_t checksum_; - uint32_t header_cksum_; - uint32_t key_size_; - uint64_t blob_size_; - uint64_t time_val_; - uint32_t ttl_val_; - SequenceNumber sn_; - uint32_t footer_cksum_; - char type_; - char subtype_; - Slice key_; - Slice blob_; - std::string key_buffer_; - std::string blob_buffer_; - - private: - void Clear(); - - char* GetKeyBuffer() { return &(key_buffer_[0]); } - - char* GetBlobBuffer() { return &(blob_buffer_[0]); } - - void ResizeKeyBuffer(size_t kbs); - - void ResizeBlobBuffer(size_t bbs); - - public: - // Header is - // Key Length ( 4 bytes ), - // Blob Length ( 8 bytes), timestamp/ttl (8 bytes), - // type (1 byte), subtype (1 byte) - // header checksum (4 bytes), blob checksum (4 bytes), - // = 34 - static const size_t kHeaderSize = 4 + 4 + 4 + 8 + 4 + 8 + 1 + 1; - - static const size_t kFooterSize = 8 + 4; - - public: - BlobLogRecord(); - - ~BlobLogRecord(); - - const Slice& Key() const { return key_; } - - const Slice& Blob() const { return blob_; } - - uint32_t GetKeySize() const { return key_size_; } - - uint64_t GetBlobSize() const { return blob_size_; } - - bool HasTTL() const { - return ttl_val_ != std::numeric_limits::max(); - } - - uint32_t GetTTL() const { return ttl_val_; } - - uint64_t 
GetTimeVal() const { return time_val_; } - - char type() const { return type_; } - - char subtype() const { return subtype_; } - - SequenceNumber GetSN() const { return sn_; } - - uint32_t header_checksum() const { return header_cksum_; } - - uint32_t checksum() const { return checksum_; } - - uint32_t footer_checksum() const { return footer_cksum_; } - - Status DecodeHeaderFrom(const Slice& hdrslice); - - Status DecodeFooterFrom(const Slice& footerslice); +// Blob record format (32 bytes header + key + value): +// +// +------------+--------------+------------+------------+----------+---------+-----------+ +// | key length | value length | expiration | header CRC | blob CRC | key | value | +// +------------+--------------+------------+------------+----------+---------+-----------+ +// | Fixed64 | Fixed64 | Fixed64 | Fixed32 | Fixed32 | key len | value len | +// +------------+--------------+------------+------------+----------+---------+-----------+ +// +// If file has has_ttl = false, expiration field is always 0, and the blob +// doesn't has expiration. +// +// Also note that if compression is used, value is compressed value and value +// length is compressed value length. +// +// Header CRC is the checksum of (key_len + val_len + expiration), while +// blob CRC is the checksum of (key + value). +// +// We could use variable length encoding (Varint64) to save more space, but it +// make reader more complicated. 
+struct BlobLogRecord { + // header include fields up to blob CRC + static constexpr size_t kHeaderSize = 32; + + uint64_t key_size = 0; + uint64_t value_size = 0; + uint64_t expiration = 0; + uint32_t header_crc = 0; + uint32_t blob_crc = 0; + Slice key; + Slice value; + std::string key_buf; + std::string value_buf; + + void EncodeHeaderTo(std::string* dst); + + Status DecodeHeaderFrom(Slice src); + + Status CheckBlobCRC() const; }; } // namespace blob_db diff --git a/utilities/blob_db/blob_log_reader.cc b/utilities/blob_db/blob_log_reader.cc index 3931c8669..a2421b930 100644 --- a/utilities/blob_db/blob_log_reader.cc +++ b/utilities/blob_db/blob_log_reader.cc @@ -7,10 +7,8 @@ #include "utilities/blob_db/blob_log_reader.h" -#include -#include "rocksdb/env.h" -#include "util/coding.h" -#include "util/crc32c.h" +#include + #include "util/file_reader_writer.h" namespace rocksdb { @@ -18,146 +16,79 @@ namespace blob_db { Reader::Reader(std::shared_ptr info_log, unique_ptr&& _file) - : info_log_(info_log), file_(std::move(_file)), buffer_(), next_byte_(0) { - backing_store_.resize(kBlockSize); + : info_log_(info_log), file_(std::move(_file)), buffer_(), next_byte_(0) {} + +Status Reader::ReadSlice(uint64_t size, Slice* slice, std::string* buf) { + buf->reserve(size); + Status s = file_->Read(size, slice, &(*buf)[0]); + next_byte_ += size; + if (!s.ok()) { + return s; + } + if (slice->size() != size) { + return Status::Corruption("EOF reached while reading record"); + } + return s; } -Reader::~Reader() {} - Status Reader::ReadHeader(BlobLogHeader* header) { assert(file_.get() != nullptr); assert(next_byte_ == 0); - Status status = - file_->Read(BlobLogHeader::kHeaderSize, &buffer_, GetReadBuffer()); - next_byte_ += buffer_.size(); - if (!status.ok()) return status; + Status s = ReadSlice(BlobLogHeader::kSize, &buffer_, &backing_store_); + if (!s.ok()) { + return s; + } - if (buffer_.size() != BlobLogHeader::kHeaderSize) { - return Status::IOError("EOF reached before 
file header"); + if (buffer_.size() != BlobLogHeader::kSize) { + return Status::Corruption("EOF reached before file header"); } - status = header->DecodeFrom(buffer_); - return status; + return header->DecodeFrom(buffer_); } Status Reader::ReadRecord(BlobLogRecord* record, ReadLevel level, - WALRecoveryMode wal_recovery_mode) { - record->Clear(); - buffer_.clear(); - backing_store_[0] = '\0'; - - Status status = - file_->Read(BlobLogRecord::kHeaderSize, &buffer_, GetReadBuffer()); - next_byte_ += buffer_.size(); - if (!status.ok()) return status; + uint64_t* blob_offset) { + Status s = ReadSlice(BlobLogRecord::kHeaderSize, &buffer_, &backing_store_); + if (!s.ok()) { + return s; + } if (buffer_.size() != BlobLogRecord::kHeaderSize) { - return Status::IOError("EOF reached before record header"); + return Status::Corruption("EOF reached before record header"); } - status = record->DecodeHeaderFrom(buffer_); - if (!status.ok()) { - return status; + s = record->DecodeHeaderFrom(buffer_); + if (!s.ok()) { + return s; } - uint32_t header_crc = 0; - uint32_t blob_crc = 0; - size_t crc_data_size = BlobLogRecord::kHeaderSize - 2 * sizeof(uint32_t); - header_crc = crc32c::Extend(header_crc, buffer_.data(), crc_data_size); + uint64_t kb_size = record->key_size + record->value_size; + if (blob_offset != nullptr) { + *blob_offset = next_byte_ + record->key_size; + } - uint64_t kb_size = record->GetKeySize() + record->GetBlobSize(); switch (level) { - case kReadHdrFooter: - file_->Skip(kb_size); + case kReadHeader: + file_->Skip(record->key_size + record->value_size); next_byte_ += kb_size; - status = - file_->Read(BlobLogRecord::kFooterSize, &buffer_, GetReadBuffer()); - next_byte_ += buffer_.size(); - if (!status.ok()) return status; - if (buffer_.size() != BlobLogRecord::kFooterSize) { - return Status::IOError("EOF reached before record footer"); - } - - status = record->DecodeFooterFrom(buffer_); - return status; - - case kReadHdrKeyFooter: - 
record->ResizeKeyBuffer(record->GetKeySize()); - status = file_->Read(record->GetKeySize(), &record->key_, - record->GetKeyBuffer()); - next_byte_ += record->key_.size(); - if (!status.ok()) return status; - if (record->key_.size() != record->GetKeySize()) { - return Status::IOError("EOF reached before key read"); - } - - header_crc = - crc32c::Extend(header_crc, record->key_.data(), record->GetKeySize()); - header_crc = crc32c::Mask(header_crc); - if (header_crc != record->header_cksum_) { - return Status::Corruption("Record Checksum mismatch: header_cksum"); - } - - file_->Skip(record->GetBlobSize()); - next_byte_ += record->GetBlobSize(); - - status = - file_->Read(BlobLogRecord::kFooterSize, &buffer_, GetReadBuffer()); - next_byte_ += buffer_.size(); - if (!status.ok()) return status; - if (buffer_.size() != BlobLogRecord::kFooterSize) { - return Status::IOError("EOF reached during footer read"); - } - - status = record->DecodeFooterFrom(buffer_); - return status; - - case kReadHdrKeyBlobFooter: - record->ResizeKeyBuffer(record->GetKeySize()); - status = file_->Read(record->GetKeySize(), &record->key_, - record->GetKeyBuffer()); - next_byte_ += record->key_.size(); - if (!status.ok()) return status; - if (record->key_.size() != record->GetKeySize()) { - return Status::IOError("EOF reached before key read"); - } - - header_crc = - crc32c::Extend(header_crc, record->key_.data(), record->GetKeySize()); - header_crc = crc32c::Mask(header_crc); - if (header_crc != record->header_cksum_) { - return Status::Corruption("Record Checksum mismatch: header_cksum"); + break; + + case kReadHeaderKey: + s = ReadSlice(record->key_size, &record->key, &record->key_buf); + file_->Skip(record->value_size); + next_byte_ += record->value_size; + break; + + case kReadHeaderKeyBlob: + s = ReadSlice(record->key_size, &record->key, &record->key_buf); + if (s.ok()) { + s = ReadSlice(record->value_size, &record->value, &record->value_buf); } - - 
record->ResizeBlobBuffer(record->GetBlobSize()); - status = file_->Read(record->GetBlobSize(), &record->blob_, - record->GetBlobBuffer()); - next_byte_ += record->blob_.size(); - if (!status.ok()) return status; - if (record->blob_.size() != record->GetBlobSize()) { - return Status::IOError("EOF reached during blob read"); - } - - blob_crc = - crc32c::Extend(blob_crc, record->blob_.data(), record->blob_.size()); - blob_crc = crc32c::Mask(blob_crc); - if (blob_crc != record->checksum_) { - return Status::Corruption("Blob Checksum mismatch"); + if (s.ok()) { + s = record->CheckBlobCRC(); } - - status = - file_->Read(BlobLogRecord::kFooterSize, &buffer_, GetReadBuffer()); - next_byte_ += buffer_.size(); - if (!status.ok()) return status; - if (buffer_.size() != BlobLogRecord::kFooterSize) { - return Status::IOError("EOF reached during blob footer read"); - } - - status = record->DecodeFooterFrom(buffer_); - return status; - default: - assert(0); - return status; + break; } + return s; } } // namespace blob_db diff --git a/utilities/blob_db/blob_log_reader.h b/utilities/blob_db/blob_log_reader.h index 05f53fe93..9c76b92ae 100644 --- a/utilities/blob_db/blob_log_reader.h +++ b/utilities/blob_db/blob_log_reader.h @@ -7,11 +7,9 @@ #ifndef ROCKSDB_LITE -#include #include #include -#include "rocksdb/options.h" #include "rocksdb/slice.h" #include "rocksdb/status.h" #include "utilities/blob_db/blob_log_format.h" @@ -32,9 +30,9 @@ namespace blob_db { class Reader { public: enum ReadLevel { - kReadHdrFooter, - kReadHdrKeyFooter, - kReadHdrKeyBlobFooter, + kReadHeader, + kReadHeaderKey, + kReadHeaderKeyBlob, }; // Create a reader that will return log records from "*file". 
@@ -51,7 +49,11 @@ class Reader { Reader(std::shared_ptr info_log, std::unique_ptr&& file); - ~Reader(); + ~Reader() = default; + + // No copying allowed + Reader(const Reader&) = delete; + Reader& operator=(const Reader&) = delete; Status ReadHeader(BlobLogHeader* header); @@ -60,9 +62,11 @@ class Reader { // "*scratch" as temporary storage. The contents filled in *record // will only be valid until the next mutating operation on this // reader or the next mutation to *scratch. - Status ReadRecord(BlobLogRecord* record, ReadLevel level = kReadHdrFooter, - WALRecoveryMode wal_recovery_mode = - WALRecoveryMode::kTolerateCorruptedTailRecords); + // If blob_offset is non-null, return offset of the blob through it. + Status ReadRecord(BlobLogRecord* record, ReadLevel level = kReadHeader, + uint64_t* blob_offset = nullptr); + + Status ReadSlice(uint64_t size, Slice* slice, std::string* buf); SequentialFileReader* file() { return file_.get(); } @@ -72,9 +76,6 @@ class Reader { const SequentialFileReader* file_reader() const { return file_.get(); } - private: - char* GetReadBuffer() { return &(backing_store_[0]); } - private: std::shared_ptr info_log_; const std::unique_ptr file_; @@ -84,10 +85,6 @@ class Reader { // which byte to read next. For asserting proper usage uint64_t next_byte_; - - // No copying allowed - Reader(const Reader&) = delete; - Reader& operator=(const Reader&) = delete; }; } // namespace blob_db diff --git a/utilities/blob_db/blob_log_writer.cc b/utilities/blob_db/blob_log_writer.cc index 1ffc74a42..806ca3c95 100644 --- a/utilities/blob_db/blob_log_writer.cc +++ b/utilities/blob_db/blob_log_writer.cc @@ -2,18 +2,16 @@ // This source code is licensed under both the GPLv2 (found in the // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
-// #ifndef ROCKSDB_LITE #include "utilities/blob_db/blob_log_writer.h" #include -#include #include #include "rocksdb/env.h" #include "util/coding.h" -#include "util/crc32c.h" #include "util/file_reader_writer.h" +#include "utilities/blob_db/blob_log_format.h" namespace rocksdb { namespace blob_db { @@ -26,18 +24,11 @@ Writer::Writer(unique_ptr&& dest, uint64_t log_number, bytes_per_sync_(bpsync), next_sync_offset_(0), use_fsync_(use_fs), - last_elem_type_(kEtNone) { - for (int i = 0; i <= kMaxRecordType; i++) { - char t = static_cast(i); - type_crc_[i] = crc32c::Value(&t, 1); - } -} - -Writer::~Writer() {} + last_elem_type_(kEtNone) {} void Writer::Sync() { dest_->Sync(use_fsync_); } -Status Writer::WriteHeader(const BlobLogHeader& header) { +Status Writer::WriteHeader(BlobLogHeader& header) { assert(block_offset_ == 0); assert(last_elem_type_ == kEtNone); std::string str; @@ -52,9 +43,9 @@ Status Writer::WriteHeader(const BlobLogHeader& header) { return s; } -Status Writer::AppendFooter(const BlobLogFooter& footer) { +Status Writer::AppendFooter(BlobLogFooter& footer) { assert(block_offset_ != 0); - assert(last_elem_type_ == kEtFileHdr || last_elem_type_ == kEtFooter); + assert(last_elem_type_ == kEtFileHdr || last_elem_type_ == kEtRecord); std::string str; footer.EncodeTo(&str); @@ -71,13 +62,13 @@ Status Writer::AppendFooter(const BlobLogFooter& footer) { } Status Writer::AddRecord(const Slice& key, const Slice& val, - uint64_t* key_offset, uint64_t* blob_offset, - uint32_t ttl) { + uint64_t expiration, uint64_t* key_offset, + uint64_t* blob_offset) { assert(block_offset_ != 0); - assert(last_elem_type_ == kEtFileHdr || last_elem_type_ == kEtFooter); + assert(last_elem_type_ == kEtFileHdr || last_elem_type_ == kEtRecord); std::string buf; - ConstructBlobHeader(&buf, key, val, ttl, -1); + ConstructBlobHeader(&buf, key, val, expiration); Status s = EmitPhysicalRecord(buf, key, val, key_offset, blob_offset); return s; @@ -86,50 +77,22 @@ Status 
Writer::AddRecord(const Slice& key, const Slice& val, Status Writer::AddRecord(const Slice& key, const Slice& val, uint64_t* key_offset, uint64_t* blob_offset) { assert(block_offset_ != 0); - assert(last_elem_type_ == kEtFileHdr || last_elem_type_ == kEtFooter); + assert(last_elem_type_ == kEtFileHdr || last_elem_type_ == kEtRecord); std::string buf; - ConstructBlobHeader(&buf, key, val, -1, -1); + ConstructBlobHeader(&buf, key, val, 0); Status s = EmitPhysicalRecord(buf, key, val, key_offset, blob_offset); return s; } -void Writer::ConstructBlobHeader(std::string* headerbuf, const Slice& key, - const Slice& val, int32_t ttl, int64_t ts) { - headerbuf->reserve(BlobLogRecord::kHeaderSize); - - uint32_t key_size = static_cast(key.size()); - PutFixed32(headerbuf, key_size); - PutFixed64(headerbuf, val.size()); - - uint32_t ttl_write = (ttl != -1) ? static_cast(ttl) - : std::numeric_limits::max(); - PutFixed32(headerbuf, ttl_write); - - uint64_t ts_write = (ts != -1) ? static_cast(ts) - : std::numeric_limits::max(); - PutFixed64(headerbuf, ts_write); - - RecordType t = kFullType; - headerbuf->push_back(static_cast(t)); - - RecordSubType st = kRegularType; - if (ttl != -1) st = kTTLType; - headerbuf->push_back(static_cast(st)); - - uint32_t header_crc = 0; - header_crc = - crc32c::Extend(header_crc, headerbuf->c_str(), headerbuf->size()); - header_crc = crc32c::Extend(header_crc, key.data(), key.size()); - header_crc = crc32c::Mask(header_crc); - PutFixed32(headerbuf, header_crc); - - uint32_t crc = 0; - // Compute the crc of the record type and the payload. 
- crc = crc32c::Extend(crc, val.data(), val.size()); - crc = crc32c::Mask(crc); // Adjust for storage - PutFixed32(headerbuf, crc); +void Writer::ConstructBlobHeader(std::string* buf, const Slice& key, + const Slice& val, uint64_t expiration) { + BlobLogRecord record; + record.key = key; + record.value = val; + record.expiration = expiration; + record.EncodeHeaderTo(buf); } Status Writer::EmitPhysicalRecord(const std::string& headerbuf, @@ -138,7 +101,12 @@ Status Writer::EmitPhysicalRecord(const std::string& headerbuf, Status s = dest_->Append(Slice(headerbuf)); if (s.ok()) { s = dest_->Append(key); - if (s.ok()) s = dest_->Append(val); + } + if (s.ok()) { + s = dest_->Append(val); + } + if (s.ok()) { + s = dest_->Flush(); } *key_offset = block_offset_ + BlobLogRecord::kHeaderSize; @@ -148,25 +116,6 @@ Status Writer::EmitPhysicalRecord(const std::string& headerbuf, return s; } -Status Writer::AddRecordFooter(const SequenceNumber& seq) { - assert(last_elem_type_ == kEtRecord); - - std::string buf; - PutFixed64(&buf, seq); - - uint32_t footer_crc = crc32c::Extend(0, buf.c_str(), buf.size()); - footer_crc = crc32c::Mask(footer_crc); - PutFixed32(&buf, footer_crc); - - Status s = dest_->Append(Slice(buf)); - block_offset_ += BlobLogRecord::kFooterSize; - - if (s.ok()) dest_->Flush(); - - last_elem_type_ = kEtFooter; - return s; -} - } // namespace blob_db } // namespace rocksdb #endif // ROCKSDB_LITE diff --git a/utilities/blob_db/blob_log_writer.h b/utilities/blob_db/blob_log_writer.h index b6c7a2a99..2a1f05e1b 100644 --- a/utilities/blob_db/blob_log_writer.h +++ b/utilities/blob_db/blob_log_writer.h @@ -2,7 +2,6 @@ // This source code is licensed under both the GPLv2 (found in the // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
-// #pragma once #ifndef ROCKSDB_LITE @@ -38,26 +37,29 @@ class Writer { explicit Writer(std::unique_ptr&& dest, uint64_t log_number, uint64_t bpsync, bool use_fsync, uint64_t boffset = 0); - ~Writer(); - static void ConstructBlobHeader(std::string* headerbuf, const Slice& key, - const Slice& val, int32_t ttl, int64_t ts); + ~Writer() = default; + + // No copying allowed + Writer(const Writer&) = delete; + Writer& operator=(const Writer&) = delete; + + static void ConstructBlobHeader(std::string* buf, const Slice& key, + const Slice& val, uint64_t expiration); Status AddRecord(const Slice& key, const Slice& val, uint64_t* key_offset, uint64_t* blob_offset); - Status AddRecord(const Slice& key, const Slice& val, uint64_t* key_offset, - uint64_t* blob_offset, uint32_t ttl); + Status AddRecord(const Slice& key, const Slice& val, uint64_t expiration, + uint64_t* key_offset, uint64_t* blob_offset); Status EmitPhysicalRecord(const std::string& headerbuf, const Slice& key, const Slice& val, uint64_t* key_offset, uint64_t* blob_offset); - Status AddRecordFooter(const SequenceNumber& sn); + Status AppendFooter(BlobLogFooter& footer); - Status AppendFooter(const BlobLogFooter& footer); - - Status WriteHeader(const BlobLogHeader& header); + Status WriteHeader(BlobLogHeader& header); WritableFileWriter* file() { return dest_.get(); } @@ -79,17 +81,8 @@ class Writer { uint64_t next_sync_offset_; bool use_fsync_; - // crc32c values for all supported record types. These are - // pre-computed to reduce the overhead of computing the crc of the - // record type stored in the header. 
- uint32_t type_crc_[kMaxRecordType + 1]; - - // No copying allowed - Writer(const Writer&) = delete; - Writer& operator=(const Writer&) = delete; - public: - enum ElemType { kEtNone, kEtFileHdr, kEtRecord, kEtFooter, kEtFileFooter }; + enum ElemType { kEtNone, kEtFileHdr, kEtRecord, kEtFileFooter }; ElemType last_elem_type_; }; diff --git a/utilities/cassandra/cassandra_compaction_filter.cc b/utilities/cassandra/cassandra_compaction_filter.cc new file mode 100644 index 000000000..e817972ee --- /dev/null +++ b/utilities/cassandra/cassandra_compaction_filter.cc @@ -0,0 +1,47 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "utilities/cassandra/cassandra_compaction_filter.h" +#include +#include "rocksdb/slice.h" +#include "utilities/cassandra/format.h" + + +namespace rocksdb { +namespace cassandra { + +const char* CassandraCompactionFilter::Name() const { + return "CassandraCompactionFilter"; +} + +CompactionFilter::Decision CassandraCompactionFilter::FilterV2( + int level, + const Slice& key, + ValueType value_type, + const Slice& existing_value, + std::string* new_value, + std::string* skip_until) const { + + bool value_changed = false; + RowValue row_value = RowValue::Deserialize( + existing_value.data(), existing_value.size()); + RowValue compacted = purge_ttl_on_expiration_ ? 
+ row_value.PurgeTtl(&value_changed) : + row_value.ExpireTtl(&value_changed); + + if(compacted.Empty()) { + return Decision::kRemove; + } + + if (value_changed) { + compacted.Serialize(new_value); + return Decision::kChangeValue; + } + + return Decision::kKeep; +} + +} // namespace cassandra +} // namespace rocksdb diff --git a/utilities/cassandra/cassandra_compaction_filter.h b/utilities/cassandra/cassandra_compaction_filter.h new file mode 100644 index 000000000..c09b8e74a --- /dev/null +++ b/utilities/cassandra/cassandra_compaction_filter.h @@ -0,0 +1,39 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once +#include +#include "rocksdb/compaction_filter.h" +#include "rocksdb/slice.h" + +namespace rocksdb { +namespace cassandra { + +/** + * Compaction filter for removing expired Cassandra data with ttl. + * If option `purge_ttl_on_expiration` is set to true, expired data + * will be directly purged. Otherwise expired data will be converted + * tombstones first, then be eventally removed after gc grace period. + * `purge_ttl_on_expiration` should only be on in the case all the + * writes have same ttl setting, otherwise it could bring old data back. 
+ */ +class CassandraCompactionFilter : public CompactionFilter { +public: + explicit CassandraCompactionFilter(bool purge_ttl_on_expiration) + : purge_ttl_on_expiration_(purge_ttl_on_expiration) {} + + const char* Name() const override; + virtual Decision FilterV2(int level, + const Slice& key, + ValueType value_type, + const Slice& existing_value, + std::string* new_value, + std::string* skip_until) const override; + +private: + bool purge_ttl_on_expiration_; +}; +} // namespace cassandra +} // namespace rocksdb diff --git a/utilities/merge_operators/cassandra/cassandra_format_test.cc b/utilities/cassandra/cassandra_format_test.cc similarity index 80% rename from utilities/merge_operators/cassandra/cassandra_format_test.cc rename to utilities/cassandra/cassandra_format_test.cc index 866098a1b..0cf124d0c 100644 --- a/utilities/merge_operators/cassandra/cassandra_format_test.cc +++ b/utilities/cassandra/cassandra_format_test.cc @@ -2,14 +2,13 @@ // This source code is licensed under both the GPLv2 (found in the // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -// This source code is also licensed under the GPLv2 license found in the -// COPYING file in the root directory of this source tree. #include #include #include "util/testharness.h" -#include "utilities/merge_operators/cassandra/format.h" -#include "utilities/merge_operators/cassandra/serialize.h" +#include "utilities/cassandra/format.h" +#include "utilities/cassandra/serialize.h" +#include "utilities/cassandra/test_utils.h" using namespace rocksdb::cassandra; @@ -46,7 +45,7 @@ TEST(ColumnTest, Column) { // Verify the deserialization. 
std::string saved_dest = dest; - std::unique_ptr c1 = Column::Deserialize(saved_dest.c_str(), 0); + std::shared_ptr c1 = Column::Deserialize(saved_dest.c_str(), 0); EXPECT_EQ(c1->Index(), index); EXPECT_EQ(c1->Timestamp(), timestamp); EXPECT_EQ(c1->Size(), 14 + sizeof(data)); @@ -58,7 +57,7 @@ TEST(ColumnTest, Column) { // Verify the ColumnBase::Deserialization. saved_dest = dest; - std::unique_ptr c2 = + std::shared_ptr c2 = ColumnBase::Deserialize(saved_dest.c_str(), c.Size()); c2->Serialize(&dest); EXPECT_EQ(dest.size(), 3 * c.Size()); @@ -101,7 +100,7 @@ TEST(ExpiringColumnTest, ExpiringColumn) { // Verify the deserialization. std::string saved_dest = dest; - std::unique_ptr c1 = + std::shared_ptr c1 = ExpiringColumn::Deserialize(saved_dest.c_str(), 0); EXPECT_EQ(c1->Index(), index); EXPECT_EQ(c1->Timestamp(), timestamp); @@ -114,7 +113,7 @@ TEST(ExpiringColumnTest, ExpiringColumn) { // Verify the ColumnBase::Deserialization. saved_dest = dest; - std::unique_ptr c2 = + std::shared_ptr c2 = ColumnBase::Deserialize(saved_dest.c_str(), c.Size()); c2->Serialize(&dest); EXPECT_EQ(dest.size(), 3 * c.Size()); @@ -151,7 +150,7 @@ TEST(TombstoneTest, Tombstone) { EXPECT_EQ(Deserialize(dest.c_str(), offset), marked_for_delete_at); // Verify the deserialization. - std::unique_ptr c1 = Tombstone::Deserialize(dest.c_str(), 0); + std::shared_ptr c1 = Tombstone::Deserialize(dest.c_str(), 0); EXPECT_EQ(c1->Index(), index); EXPECT_EQ(c1->Timestamp(), marked_for_delete_at); EXPECT_EQ(c1->Size(), 14); @@ -162,7 +161,7 @@ TEST(TombstoneTest, Tombstone) { std::memcmp(dest.c_str(), dest.c_str() + c.Size(), c.Size()) == 0); // Verify the ColumnBase::Deserialization. 
- std::unique_ptr c2 = + std::shared_ptr c2 = ColumnBase::Deserialize(dest.c_str(), c.Size()); c2->Serialize(&dest); EXPECT_EQ(dest.size(), 3 * c.Size()); @@ -204,7 +203,7 @@ TEST(RowValueTest, RowTombstone) { } TEST(RowValueTest, RowWithColumns) { - std::vector> columns; + std::vector> columns; int64_t last_modified_time = 1494022807048; std::size_t columns_data_size = 0; @@ -212,7 +211,7 @@ TEST(RowValueTest, RowWithColumns) { int8_t e_index = 0; int64_t e_timestamp = 1494022807044; int32_t e_ttl = 3600; - columns.push_back(std::unique_ptr( + columns.push_back(std::shared_ptr( new ExpiringColumn(ColumnTypeMask::EXPIRATION_MASK, e_index, e_timestamp, sizeof(e_data), e_data, e_ttl))); columns_data_size += columns[0]->Size(); @@ -220,14 +219,14 @@ TEST(RowValueTest, RowWithColumns) { char c_data[4] = {'d', 'a', 't', 'a'}; int8_t c_index = 1; int64_t c_timestamp = 1494022807048; - columns.push_back(std::unique_ptr( + columns.push_back(std::shared_ptr( new Column(0, c_index, c_timestamp, sizeof(c_data), c_data))); columns_data_size += columns[1]->Size(); int8_t t_index = 2; int32_t t_local_deletion_time = 1494022801; int64_t t_marked_for_delete_at = 1494022807043; - columns.push_back(std::unique_ptr( + columns.push_back(std::shared_ptr( new Tombstone(ColumnTypeMask::DELETION_MASK, t_index, t_local_deletion_time, t_marked_for_delete_at))); columns_data_size += columns[2]->Size(); @@ -301,6 +300,50 @@ TEST(RowValueTest, RowWithColumns) { std::memcmp(dest.c_str(), dest.c_str() + r.Size(), r.Size()) == 0); } +TEST(RowValueTest, PurgeTtlShouldRemvoeAllColumnsExpired) { + int64_t now = time(nullptr); + + auto row_value = CreateTestRowValue({ + std::make_tuple(kColumn, 0, ToMicroSeconds(now)), + std::make_tuple(kExpiringColumn, 1, ToMicroSeconds(now - kTtl - 10)), //expired + std::make_tuple(kExpiringColumn, 2, ToMicroSeconds(now)), // not expired + std::make_tuple(kTombstone, 3, ToMicroSeconds(now)) + }); + + bool changed = false; + auto purged = 
row_value.PurgeTtl(&changed); + EXPECT_TRUE(changed); + EXPECT_EQ(purged.columns_.size(), 3); + VerifyRowValueColumns(purged.columns_, 0, kColumn, 0, ToMicroSeconds(now)); + VerifyRowValueColumns(purged.columns_, 1, kExpiringColumn, 2, ToMicroSeconds(now)); + VerifyRowValueColumns(purged.columns_, 2, kTombstone, 3, ToMicroSeconds(now)); + + purged.PurgeTtl(&changed); + EXPECT_FALSE(changed); +} + +TEST(RowValueTest, ExpireTtlShouldConvertExpiredColumnsToTombstones) { + int64_t now = time(nullptr); + + auto row_value = CreateTestRowValue({ + std::make_tuple(kColumn, 0, ToMicroSeconds(now)), + std::make_tuple(kExpiringColumn, 1, ToMicroSeconds(now - kTtl - 10)), //expired + std::make_tuple(kExpiringColumn, 2, ToMicroSeconds(now)), // not expired + std::make_tuple(kTombstone, 3, ToMicroSeconds(now)) + }); + + bool changed = false; + auto compacted = row_value.ExpireTtl(&changed); + EXPECT_TRUE(changed); + EXPECT_EQ(compacted.columns_.size(), 4); + VerifyRowValueColumns(compacted.columns_, 0, kColumn, 0, ToMicroSeconds(now)); + VerifyRowValueColumns(compacted.columns_, 1, kTombstone, 1, ToMicroSeconds(now - 10)); + VerifyRowValueColumns(compacted.columns_, 2, kExpiringColumn, 2, ToMicroSeconds(now)); + VerifyRowValueColumns(compacted.columns_, 3, kTombstone, 3, ToMicroSeconds(now)); + + compacted.ExpireTtl(&changed); + EXPECT_FALSE(changed); +} } // namespace cassandra } // namespace rocksdb diff --git a/utilities/cassandra/cassandra_functional_test.cc b/utilities/cassandra/cassandra_functional_test.cc new file mode 100644 index 000000000..0c02228a7 --- /dev/null +++ b/utilities/cassandra/cassandra_functional_test.cc @@ -0,0 +1,251 @@ +// Copyright (c) 2017-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include +#include "rocksdb/db.h" +#include "db/db_impl.h" +#include "rocksdb/merge_operator.h" +#include "rocksdb/utilities/db_ttl.h" +#include "util/testharness.h" +#include "util/random.h" +#include "utilities/merge_operators.h" +#include "utilities/cassandra/cassandra_compaction_filter.h" +#include "utilities/cassandra/merge_operator.h" +#include "utilities/cassandra/test_utils.h" + +using namespace rocksdb; + +namespace rocksdb { +namespace cassandra { + +// Path to the database on file system +const std::string kDbName = test::TmpDir() + "/cassandra_functional_test"; + +class CassandraStore { + public: + explicit CassandraStore(std::shared_ptr db) + : db_(db), + merge_option_(), + get_option_() { + assert(db); + } + + bool Append(const std::string& key, const RowValue& val){ + std::string result; + val.Serialize(&result); + Slice valSlice(result.data(), result.size()); + auto s = db_->Merge(merge_option_, key, valSlice); + + if (s.ok()) { + return true; + } else { + std::cerr << "ERROR " << s.ToString() << std::endl; + return false; + } + } + + void Flush() { + dbfull()->TEST_FlushMemTable(); + dbfull()->TEST_WaitForCompact(); + } + + void Compact() { + dbfull()->TEST_CompactRange( + 0, nullptr, nullptr, db_->DefaultColumnFamily()); + } + + std::tuple Get(const std::string& key){ + std::string result; + auto s = db_->Get(get_option_, key, &result); + + if (s.ok()) { + return std::make_tuple(true, + RowValue::Deserialize(result.data(), + result.size())); + } + + if (!s.IsNotFound()) { + std::cerr << "ERROR " << s.ToString() << std::endl; + } + + return std::make_tuple(false, RowValue(0, 0)); + } + + private: + std::shared_ptr db_; + WriteOptions merge_option_; + ReadOptions get_option_; + + DBImpl* dbfull() { return reinterpret_cast(db_.get()); } + +}; + +class TestCompactionFilterFactory : public CompactionFilterFactory { +public: + explicit TestCompactionFilterFactory(bool purge_ttl_on_expiration) + : purge_ttl_on_expiration_(purge_ttl_on_expiration) {} + 
+ virtual std::unique_ptr CreateCompactionFilter( + const CompactionFilter::Context& context) override { + return unique_ptr(new CassandraCompactionFilter(purge_ttl_on_expiration_)); + } + + virtual const char* Name() const override { + return "TestCompactionFilterFactory"; + } + +private: + bool purge_ttl_on_expiration_; +}; + + +// The class for unit-testing +class CassandraFunctionalTest : public testing::Test { +public: + CassandraFunctionalTest() { + DestroyDB(kDbName, Options()); // Start each test with a fresh DB + } + + std::shared_ptr OpenDb() { + DB* db; + Options options; + options.create_if_missing = true; + options.merge_operator.reset(new CassandraValueMergeOperator()); + auto* cf_factory = new TestCompactionFilterFactory(purge_ttl_on_expiration_); + options.compaction_filter_factory.reset(cf_factory); + EXPECT_OK(DB::Open(options, kDbName, &db)); + return std::shared_ptr(db); + } + + bool purge_ttl_on_expiration_ = false; +}; + +// THE TEST CASES BEGIN HERE + +TEST_F(CassandraFunctionalTest, SimpleMergeTest) { + CassandraStore store(OpenDb()); + + store.Append("k1", CreateTestRowValue({ + std::make_tuple(kTombstone, 0, 5), + std::make_tuple(kColumn, 1, 8), + std::make_tuple(kExpiringColumn, 2, 5), + })); + store.Append("k1",CreateTestRowValue({ + std::make_tuple(kColumn, 0, 2), + std::make_tuple(kExpiringColumn, 1, 5), + std::make_tuple(kTombstone, 2, 7), + std::make_tuple(kExpiringColumn, 7, 17), + })); + store.Append("k1", CreateTestRowValue({ + std::make_tuple(kExpiringColumn, 0, 6), + std::make_tuple(kTombstone, 1, 5), + std::make_tuple(kColumn, 2, 4), + std::make_tuple(kTombstone, 11, 11), + })); + + auto ret = store.Get("k1"); + + ASSERT_TRUE(std::get<0>(ret)); + RowValue& merged = std::get<1>(ret); + EXPECT_EQ(merged.columns_.size(), 5); + VerifyRowValueColumns(merged.columns_, 0, kExpiringColumn, 0, 6); + VerifyRowValueColumns(merged.columns_, 1, kColumn, 1, 8); + VerifyRowValueColumns(merged.columns_, 2, kTombstone, 2, 7); + 
VerifyRowValueColumns(merged.columns_, 3, kExpiringColumn, 7, 17); + VerifyRowValueColumns(merged.columns_, 4, kTombstone, 11, 11); +} + +TEST_F(CassandraFunctionalTest, + CompactionShouldConvertExpiredColumnsToTombstone) { + CassandraStore store(OpenDb()); + int64_t now= time(nullptr); + + store.Append("k1", CreateTestRowValue({ + std::make_tuple(kExpiringColumn, 0, ToMicroSeconds(now - kTtl - 20)), //expired + std::make_tuple(kExpiringColumn, 1, ToMicroSeconds(now - kTtl + 10)), // not expired + std::make_tuple(kTombstone, 3, ToMicroSeconds(now)) + })); + + store.Flush(); + + store.Append("k1",CreateTestRowValue({ + std::make_tuple(kExpiringColumn, 0, ToMicroSeconds(now - kTtl - 10)), //expired + std::make_tuple(kColumn, 2, ToMicroSeconds(now)) + })); + + store.Flush(); + store.Compact(); + + auto ret = store.Get("k1"); + ASSERT_TRUE(std::get<0>(ret)); + RowValue& merged = std::get<1>(ret); + EXPECT_EQ(merged.columns_.size(), 4); + VerifyRowValueColumns(merged.columns_, 0, kTombstone, 0, ToMicroSeconds(now - 10)); + VerifyRowValueColumns(merged.columns_, 1, kExpiringColumn, 1, ToMicroSeconds(now - kTtl + 10)); + VerifyRowValueColumns(merged.columns_, 2, kColumn, 2, ToMicroSeconds(now)); + VerifyRowValueColumns(merged.columns_, 3, kTombstone, 3, ToMicroSeconds(now)); +} + + +TEST_F(CassandraFunctionalTest, + CompactionShouldPurgeExpiredColumnsIfPurgeTtlIsOn) { + purge_ttl_on_expiration_ = true; + CassandraStore store(OpenDb()); + int64_t now = time(nullptr); + + store.Append("k1", CreateTestRowValue({ + std::make_tuple(kExpiringColumn, 0, ToMicroSeconds(now - kTtl - 20)), //expired + std::make_tuple(kExpiringColumn, 1, ToMicroSeconds(now)), // not expired + std::make_tuple(kTombstone, 3, ToMicroSeconds(now)) + })); + + store.Flush(); + + store.Append("k1",CreateTestRowValue({ + std::make_tuple(kExpiringColumn, 0, ToMicroSeconds(now - kTtl - 10)), //expired + std::make_tuple(kColumn, 2, ToMicroSeconds(now)) + })); + + store.Flush(); + store.Compact(); + + auto ret 
= store.Get("k1"); + ASSERT_TRUE(std::get<0>(ret)); + RowValue& merged = std::get<1>(ret); + EXPECT_EQ(merged.columns_.size(), 3); + VerifyRowValueColumns(merged.columns_, 0, kExpiringColumn, 1, ToMicroSeconds(now)); + VerifyRowValueColumns(merged.columns_, 1, kColumn, 2, ToMicroSeconds(now)); + VerifyRowValueColumns(merged.columns_, 2, kTombstone, 3, ToMicroSeconds(now)); +} + +TEST_F(CassandraFunctionalTest, + CompactionShouldRemoveRowWhenAllColumnsExpiredIfPurgeTtlIsOn) { + purge_ttl_on_expiration_ = true; + CassandraStore store(OpenDb()); + int64_t now = time(nullptr); + + store.Append("k1", CreateTestRowValue({ + std::make_tuple(kExpiringColumn, 0, ToMicroSeconds(now - kTtl - 20)), + std::make_tuple(kExpiringColumn, 1, ToMicroSeconds(now - kTtl - 20)), + })); + + store.Flush(); + + store.Append("k1",CreateTestRowValue({ + std::make_tuple(kExpiringColumn, 0, ToMicroSeconds(now - kTtl - 10)), + })); + + store.Flush(); + store.Compact(); + ASSERT_FALSE(std::get<0>(store.Get("k1"))); +} + +} // namespace cassandra +} // namespace rocksdb + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/utilities/merge_operators/cassandra/cassandra_row_merge_test.cc b/utilities/cassandra/cassandra_row_merge_test.cc similarity index 92% rename from utilities/merge_operators/cassandra/cassandra_row_merge_test.cc rename to utilities/cassandra/cassandra_row_merge_test.cc index 76d112c7b..78c7d8e57 100644 --- a/utilities/merge_operators/cassandra/cassandra_row_merge_test.cc +++ b/utilities/cassandra/cassandra_row_merge_test.cc @@ -2,13 +2,11 @@ // This source code is licensed under both the GPLv2 (found in the // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -// This source code is also licensed under the GPLv2 license found in the -// COPYING file in the root directory of this source tree. 
#include #include "util/testharness.h" -#include "utilities/merge_operators/cassandra/format.h" -#include "utilities/merge_operators/cassandra/test_utils.h" +#include "utilities/cassandra/format.h" +#include "utilities/cassandra/test_utils.h" namespace rocksdb { namespace cassandra { diff --git a/utilities/merge_operators/cassandra/cassandra_serialize_test.cc b/utilities/cassandra/cassandra_serialize_test.cc similarity index 96% rename from utilities/merge_operators/cassandra/cassandra_serialize_test.cc rename to utilities/cassandra/cassandra_serialize_test.cc index 978878b64..68d2c163d 100644 --- a/utilities/merge_operators/cassandra/cassandra_serialize_test.cc +++ b/utilities/cassandra/cassandra_serialize_test.cc @@ -2,11 +2,9 @@ // This source code is licensed under both the GPLv2 (found in the // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -// This source code is also licensed under the GPLv2 license found in the -// COPYING file in the root directory of this source tree. #include "util/testharness.h" -#include "utilities/merge_operators/cassandra/serialize.h" +#include "utilities/cassandra/serialize.h" using namespace rocksdb::cassandra; diff --git a/utilities/merge_operators/cassandra/format.cc b/utilities/cassandra/format.cc similarity index 75% rename from utilities/merge_operators/cassandra/format.cc rename to utilities/cassandra/format.cc index 01eff67e3..2b096cdbb 100644 --- a/utilities/merge_operators/cassandra/format.cc +++ b/utilities/cassandra/format.cc @@ -2,8 +2,6 @@ // This source code is licensed under both the GPLv2 (found in the // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -// This source code is also licensed under the GPLv2 license found in the -// COPYING file in the root directory of this source tree. 
#include "format.h" @@ -11,7 +9,7 @@ #include #include -#include "utilities/merge_operators/cassandra/serialize.h" +#include "utilities/cassandra/serialize.h" namespace rocksdb { namespace cassandra { @@ -42,7 +40,7 @@ void ColumnBase::Serialize(std::string* dest) const { rocksdb::cassandra::Serialize(index_, dest); } -std::unique_ptr ColumnBase::Deserialize(const char* src, +std::shared_ptr ColumnBase::Deserialize(const char* src, std::size_t offset) { int8_t mask = rocksdb::cassandra::Deserialize(src, offset); if ((mask & ColumnTypeMask::DELETION_MASK) != 0) { @@ -79,7 +77,7 @@ void Column::Serialize(std::string* dest) const { dest->append(value_, value_size_); } -std::unique_ptr Column::Deserialize(const char *src, +std::shared_ptr Column::Deserialize(const char *src, std::size_t offset) { int8_t mask = rocksdb::cassandra::Deserialize(src, offset); offset += sizeof(mask); @@ -89,8 +87,8 @@ std::unique_ptr Column::Deserialize(const char *src, offset += sizeof(timestamp); int32_t value_size = rocksdb::cassandra::Deserialize(src, offset); offset += sizeof(value_size); - return std::unique_ptr( - new Column(mask, index, timestamp, value_size, src + offset)); + return std::make_shared( + mask, index, timestamp, value_size, src + offset); } ExpiringColumn::ExpiringColumn( @@ -112,7 +110,32 @@ void ExpiringColumn::Serialize(std::string* dest) const { rocksdb::cassandra::Serialize(ttl_, dest); } -std::unique_ptr ExpiringColumn::Deserialize( +std::chrono::time_point ExpiringColumn::TimePoint() const { + return std::chrono::time_point(std::chrono::microseconds(Timestamp())); +} + +std::chrono::seconds ExpiringColumn::Ttl() const { + return std::chrono::seconds(ttl_); +} + +bool ExpiringColumn::Expired() const { + return TimePoint() + Ttl() < std::chrono::system_clock::now(); +} + +std::shared_ptr ExpiringColumn::ToTombstone() const { + auto expired_at = (TimePoint() + Ttl()).time_since_epoch(); + int32_t local_deletion_time = static_cast( + 
std::chrono::duration_cast(expired_at).count()); + int64_t marked_for_delete_at = + std::chrono::duration_cast(expired_at).count(); + return std::make_shared( + ColumnTypeMask::DELETION_MASK, + Index(), + local_deletion_time, + marked_for_delete_at); +} + +std::shared_ptr ExpiringColumn::Deserialize( const char *src, std::size_t offset) { int8_t mask = rocksdb::cassandra::Deserialize(src, offset); @@ -126,8 +149,8 @@ std::unique_ptr ExpiringColumn::Deserialize( const char* value = src + offset; offset += value_size; int32_t ttl = rocksdb::cassandra::Deserialize(src, offset); - return std::unique_ptr( - new ExpiringColumn(mask, index, timestamp, value_size, value, ttl)); + return std::make_shared( + mask, index, timestamp, value_size, value, ttl); } Tombstone::Tombstone( @@ -153,7 +176,7 @@ void Tombstone::Serialize(std::string* dest) const { rocksdb::cassandra::Serialize(marked_for_delete_at_, dest); } -std::unique_ptr Tombstone::Deserialize(const char *src, +std::shared_ptr Tombstone::Deserialize(const char *src, std::size_t offset) { int8_t mask = rocksdb::cassandra::Deserialize(src, offset); offset += sizeof(mask); @@ -164,8 +187,8 @@ std::unique_ptr Tombstone::Deserialize(const char *src, offset += sizeof(int32_t); int64_t marked_for_delete_at = rocksdb::cassandra::Deserialize(src, offset); - return std::unique_ptr( - new Tombstone(mask, index, local_deletion_time, marked_for_delete_at)); + return std::make_shared( + mask, index, local_deletion_time, marked_for_delete_at); } RowValue::RowValue(int32_t local_deletion_time, int64_t marked_for_delete_at) @@ -173,7 +196,7 @@ RowValue::RowValue(int32_t local_deletion_time, int64_t marked_for_delete_at) marked_for_delete_at_(marked_for_delete_at), columns_(), last_modified_time_(0) {} -RowValue::RowValue(std::vector> columns, +RowValue::RowValue(Columns columns, int64_t last_modified_time) : local_deletion_time_(kDefaultLocalDeletionTime), marked_for_delete_at_(kDefaultMarkedForDeleteAt), @@ -208,6 +231,49 @@ void 
RowValue::Serialize(std::string* dest) const { } } +RowValue RowValue::PurgeTtl(bool* changed) const { + *changed = false; + Columns new_columns; + for (auto& column : columns_) { + if(column->Mask() == ColumnTypeMask::EXPIRATION_MASK) { + std::shared_ptr expiring_column = + std::static_pointer_cast(column); + + if(expiring_column->Expired()){ + *changed = true; + continue; + } + } + + new_columns.push_back(column); + } + return RowValue(std::move(new_columns), last_modified_time_); +} + +RowValue RowValue::ExpireTtl(bool* changed) const { + *changed = false; + Columns new_columns; + for (auto& column : columns_) { + if(column->Mask() == ColumnTypeMask::EXPIRATION_MASK) { + std::shared_ptr expiring_column = + std::static_pointer_cast(column); + + if(expiring_column->Expired()) { + shared_ptr tombstone = expiring_column->ToTombstone(); + new_columns.push_back(tombstone); + *changed = true; + continue; + } + } + new_columns.push_back(column); + } + return RowValue(std::move(new_columns), last_modified_time_); +} + +bool RowValue::Empty() const { + return columns_.empty(); +} + RowValue RowValue::Deserialize(const char *src, std::size_t size) { std::size_t offset = 0; assert(size >= sizeof(local_deletion_time_) + sizeof(marked_for_delete_at_)); @@ -223,7 +289,7 @@ RowValue RowValue::Deserialize(const char *src, std::size_t size) { assert(local_deletion_time == kDefaultLocalDeletionTime); assert(marked_for_delete_at == kDefaultMarkedForDeleteAt); - std::vector> columns; + Columns columns; int64_t last_modified_time = 0; while (offset < size) { auto c = ColumnBase::Deserialize(src, offset); @@ -254,7 +320,7 @@ RowValue RowValue::Merge(std::vector&& values) { return r1.LastModifiedTime() > r2.LastModifiedTime(); }); - std::map> merged_columns; + std::map> merged_columns; int64_t tombstone_timestamp = 0; for (auto& value : values) { @@ -268,17 +334,17 @@ RowValue RowValue::Merge(std::vector&& values) { for (auto& column : value.columns_) { int8_t index = column->Index(); 
if (merged_columns.find(index) == merged_columns.end()) { - merged_columns[index] = std::move(column); + merged_columns[index] = column; } else { if (column->Timestamp() > merged_columns[index]->Timestamp()) { - merged_columns[index] = std::move(column); + merged_columns[index] = column; } } } } int64_t last_modified_time = 0; - std::vector> columns; + Columns columns; for (auto& pair: merged_columns) { // For some row, its last_modified_time > row tombstone_timestamp, but // it might have rows whose timestamp is ealier than tombstone, so we diff --git a/utilities/merge_operators/cassandra/format.h b/utilities/cassandra/format.h similarity index 80% rename from utilities/merge_operators/cassandra/format.h rename to utilities/cassandra/format.h index 0ffd9a5bb..d8f51df14 100644 --- a/utilities/merge_operators/cassandra/format.h +++ b/utilities/cassandra/format.h @@ -2,8 +2,6 @@ // This source code is licensed under both the GPLv2 (found in the // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -// This source code is also licensed under the GPLv2 license found in the -// COPYING file in the root directory of this source tree. /** * The encoding of Cassandra Row Value. 
@@ -57,6 +55,7 @@ */ #pragma once +#include #include #include #include "rocksdb/merge_operator.h" @@ -72,6 +71,7 @@ enum ColumnTypeMask { EXPIRATION_MASK = 0x02, }; + class ColumnBase { public: ColumnBase(int8_t mask, int8_t index); @@ -82,8 +82,7 @@ class ColumnBase { virtual int8_t Index() const; virtual std::size_t Size() const; virtual void Serialize(std::string* dest) const; - - static std::unique_ptr Deserialize(const char* src, + static std::shared_ptr Deserialize(const char* src, std::size_t offset); private: @@ -99,8 +98,7 @@ class Column : public ColumnBase { virtual int64_t Timestamp() const override; virtual std::size_t Size() const override; virtual void Serialize(std::string* dest) const override; - - static std::unique_ptr Deserialize(const char* src, + static std::shared_ptr Deserialize(const char* src, std::size_t offset); private: @@ -109,44 +107,50 @@ class Column : public ColumnBase { const char* value_; }; -class ExpiringColumn : public Column { +class Tombstone : public ColumnBase { public: - ExpiringColumn(int8_t mask, int8_t index, int64_t timestamp, - int32_t value_size, const char* value, int32_t ttl); + Tombstone(int8_t mask, int8_t index, + int32_t local_deletion_time, int64_t marked_for_delete_at); + virtual int64_t Timestamp() const override; virtual std::size_t Size() const override; virtual void Serialize(std::string* dest) const override; - static std::unique_ptr Deserialize(const char* src, - std::size_t offset); + static std::shared_ptr Deserialize(const char* src, + std::size_t offset); private: - int32_t ttl_; + int32_t local_deletion_time_; + int64_t marked_for_delete_at_; }; -class Tombstone : public ColumnBase { +class ExpiringColumn : public Column { public: - Tombstone(int8_t mask, int8_t index, - int32_t local_deletion_time, int64_t marked_for_delete_at); + ExpiringColumn(int8_t mask, int8_t index, int64_t timestamp, + int32_t value_size, const char* value, int32_t ttl); - virtual int64_t Timestamp() const override; 
virtual std::size_t Size() const override; virtual void Serialize(std::string* dest) const override; + bool Expired() const; + std::shared_ptr ToTombstone() const; - static std::unique_ptr Deserialize(const char* src, - std::size_t offset); + static std::shared_ptr Deserialize(const char* src, + std::size_t offset); private: - int32_t local_deletion_time_; - int64_t marked_for_delete_at_; + int32_t ttl_; + std::chrono::time_point TimePoint() const; + std::chrono::seconds Ttl() const; }; +typedef std::vector> Columns; + class RowValue { public: // Create a Row Tombstone. RowValue(int32_t local_deletion_time, int64_t marked_for_delete_at); // Create a Row containing columns. - RowValue(std::vector> columns, + RowValue(Columns columns, int64_t last_modified_time); RowValue(const RowValue& that) = delete; RowValue(RowValue&& that) noexcept = default; @@ -159,6 +163,9 @@ class RowValue { // otherwise it returns the max timestamp of containing columns. int64_t LastModifiedTime() const; void Serialize(std::string* dest) const; + RowValue PurgeTtl(bool* changed) const; + RowValue ExpireTtl(bool* changed) const; + bool Empty() const; static RowValue Deserialize(const char* src, std::size_t size); // Merge multiple rows according to their timestamp. 
@@ -167,12 +174,20 @@ class RowValue { private: int32_t local_deletion_time_; int64_t marked_for_delete_at_; - std::vector> columns_; + Columns columns_; int64_t last_modified_time_; + FRIEND_TEST(RowValueTest, PurgeTtlShouldRemvoeAllColumnsExpired); + FRIEND_TEST(RowValueTest, ExpireTtlShouldConvertExpiredColumnsToTombstones); FRIEND_TEST(RowValueMergeTest, Merge); FRIEND_TEST(RowValueMergeTest, MergeWithRowTombstone); - FRIEND_TEST(CassandraMergeTest, SimpleTest); + FRIEND_TEST(CassandraFunctionalTest, SimpleMergeTest); + FRIEND_TEST( + CassandraFunctionalTest, CompactionShouldConvertExpiredColumnsToTombstone); + FRIEND_TEST( + CassandraFunctionalTest, CompactionShouldPurgeExpiredColumnsIfPurgeTtlIsOn); + FRIEND_TEST( + CassandraFunctionalTest, CompactionShouldRemoveRowWhenAllColumnExpiredIfPurgeTtlIsOn); }; } // namepsace cassandrda diff --git a/utilities/merge_operators/cassandra/merge_operator.cc b/utilities/cassandra/merge_operator.cc similarity index 65% rename from utilities/merge_operators/cassandra/merge_operator.cc rename to utilities/cassandra/merge_operator.cc index 03b4ec2e3..715ef8586 100644 --- a/utilities/merge_operators/cassandra/merge_operator.cc +++ b/utilities/cassandra/merge_operator.cc @@ -1,9 +1,7 @@ -// Copyright (c) 2017-present, Facebook, Inc. All rights reserved. +// Copyright (c) 2017-present, Facebook, Inc. All rights reserved. // This source code is licensed under both the GPLv2 (found in the // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -// This source code is also licensed under the GPLv2 license found in the -// COPYING file in the root directory of this source tree. 
#include "merge_operator.h" @@ -13,7 +11,7 @@ #include "rocksdb/slice.h" #include "rocksdb/merge_operator.h" #include "utilities/merge_operators.h" -#include "utilities/merge_operators/cassandra/format.h" +#include "utilities/cassandra/format.h" namespace rocksdb { namespace cassandra { @@ -49,27 +47,6 @@ bool CassandraValueMergeOperator::FullMergeV2( return true; } -// Implementation for the merge operation (merges two Cassandra values) -bool CassandraValueMergeOperator::PartialMerge(const Slice& key, - const Slice& left_operand, - const Slice& right_operand, - std::string* new_value, - Logger* logger) const { - // Clear the *new_value for writing. - assert(new_value); - new_value->clear(); - - std::vector row_values; - row_values.push_back(RowValue::Deserialize(left_operand.data(), - left_operand.size())); - row_values.push_back(RowValue::Deserialize(right_operand.data(), - right_operand.size())); - RowValue merged = RowValue::Merge(std::move(row_values)); - new_value->reserve(merged.Size()); - merged.Serialize(new_value); - return true; -} - bool CassandraValueMergeOperator::PartialMergeMulti( const Slice& key, const std::deque& operand_list, diff --git a/utilities/merge_operators/cassandra/merge_operator.h b/utilities/cassandra/merge_operator.h similarity index 68% rename from utilities/merge_operators/cassandra/merge_operator.h rename to utilities/cassandra/merge_operator.h index b46662c26..28066ca05 100644 --- a/utilities/merge_operators/cassandra/merge_operator.h +++ b/utilities/cassandra/merge_operator.h @@ -1,9 +1,7 @@ -// Copyright (c) 2017-present, Facebook, Inc. All rights reserved. +// Copyright (c) 2017-present, Facebook, Inc. All rights reserved. // This source code is licensed under both the GPLv2 (found in the // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
-// This source code is also licensed under the GPLv2 license found in the -// COPYING file in the root directory of this source tree. #pragma once #include "rocksdb/merge_operator.h" @@ -22,18 +20,14 @@ class CassandraValueMergeOperator : public MergeOperator { virtual bool FullMergeV2(const MergeOperationInput& merge_in, MergeOperationOutput* merge_out) const override; - virtual bool PartialMerge(const Slice& key, - const Slice& left_operand, - const Slice& right_operand, - std::string* new_value, - Logger* logger) const override; - virtual bool PartialMergeMulti(const Slice& key, const std::deque& operand_list, std::string* new_value, Logger* logger) const override; virtual const char* Name() const override; + + virtual bool AllowSingleOperand() const override { return true; } }; } // namespace cassandra } // namespace rocksdb diff --git a/utilities/merge_operators/cassandra/serialize.h b/utilities/cassandra/serialize.h similarity index 91% rename from utilities/merge_operators/cassandra/serialize.h rename to utilities/cassandra/serialize.h index 0e35d34af..64ccd4c29 100644 --- a/utilities/merge_operators/cassandra/serialize.h +++ b/utilities/cassandra/serialize.h @@ -1,9 +1,7 @@ -// Copyright (c) 2017-present, Facebook, Inc. All rights reserved. +// Copyright (c) 2017-present, Facebook, Inc. All rights reserved. // This source code is licensed under both the GPLv2 (found in the // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -// This source code is also licensed under the GPLv2 license found in the -// COPYING file in the root directory of this source tree. 
/** * Helper functions which serialize and deserialize integers diff --git a/utilities/merge_operators/cassandra/test_utils.cc b/utilities/cassandra/test_utils.cc similarity index 77% rename from utilities/merge_operators/cassandra/test_utils.cc rename to utilities/cassandra/test_utils.cc index 91b9e6349..61f53b2d3 100644 --- a/utilities/merge_operators/cassandra/test_utils.cc +++ b/utilities/cassandra/test_utils.cc @@ -1,9 +1,7 @@ -// Copyright (c) 2017-present, Facebook, Inc. All rights reserved. +// Copyright (c) 2017-present, Facebook, Inc. All rights reserved. // This source code is licensed under both the GPLv2 (found in the // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -// This source code is also licensed under the GPLv2 license found in the -// COPYING file in the root directory of this source tree. #include "test_utils.h" @@ -12,29 +10,29 @@ namespace cassandra { const char kData[] = {'d', 'a', 't', 'a'}; const char kExpiringData[] = {'e', 'd', 'a', 't', 'a'}; const int32_t kLocalDeletionTime = 1; -const int32_t kTtl = 100; +const int32_t kTtl = 86400; const int8_t kColumn = 0; const int8_t kTombstone = 1; const int8_t kExpiringColumn = 2; -std::unique_ptr CreateTestColumn(int8_t mask, +std::shared_ptr CreateTestColumn(int8_t mask, int8_t index, int64_t timestamp) { if ((mask & ColumnTypeMask::DELETION_MASK) != 0) { - return std::unique_ptr(new Tombstone( + return std::shared_ptr(new Tombstone( mask, index, kLocalDeletionTime, timestamp)); } else if ((mask & ColumnTypeMask::EXPIRATION_MASK) != 0) { - return std::unique_ptr(new ExpiringColumn( + return std::shared_ptr(new ExpiringColumn( mask, index, timestamp, sizeof(kExpiringData), kExpiringData, kTtl)); } else { - return std::unique_ptr( + return std::shared_ptr( new Column(mask, index, timestamp, sizeof(kData), kData)); } } RowValue CreateTestRowValue( std::vector> column_specs) { - std::vector> columns; + std::vector> 
columns; int64_t last_modified_time = 0; for (auto spec: column_specs) { auto c = CreateTestColumn(std::get<0>(spec), std::get<1>(spec), @@ -50,7 +48,7 @@ RowValue CreateRowTombstone(int64_t timestamp) { } void VerifyRowValueColumns( - std::vector> &columns, + std::vector> &columns, std::size_t index_of_vector, int8_t expected_mask, int8_t expected_index, @@ -61,5 +59,9 @@ void VerifyRowValueColumns( EXPECT_EQ(expected_index, columns[index_of_vector]->Index()); } +int64_t ToMicroSeconds(int64_t seconds) { + return seconds * (int64_t)1000000; +} + } } diff --git a/utilities/merge_operators/cassandra/test_utils.h b/utilities/cassandra/test_utils.h similarity index 68% rename from utilities/merge_operators/cassandra/test_utils.h rename to utilities/cassandra/test_utils.h index 4025b2a3f..463b12bf2 100644 --- a/utilities/merge_operators/cassandra/test_utils.h +++ b/utilities/cassandra/test_utils.h @@ -1,15 +1,13 @@ -// Copyright (c) 2017-present, Facebook, Inc. All rights reserved. +// Copyright (c) 2017-present, Facebook, Inc. All rights reserved. // This source code is licensed under both the GPLv2 (found in the // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -// This source code is also licensed under the GPLv2 license found in the -// COPYING file in the root directory of this source tree. 
#pragma once #include #include "util/testharness.h" -#include "utilities/merge_operators/cassandra/format.h" -#include "utilities/merge_operators/cassandra/serialize.h" +#include "utilities/cassandra/format.h" +#include "utilities/cassandra/serialize.h" namespace rocksdb { namespace cassandra { @@ -22,7 +20,7 @@ extern const int8_t kTombstone; extern const int8_t kExpiringColumn; -std::unique_ptr CreateTestColumn(int8_t mask, +std::shared_ptr CreateTestColumn(int8_t mask, int8_t index, int64_t timestamp); @@ -32,12 +30,14 @@ RowValue CreateTestRowValue( RowValue CreateRowTombstone(int64_t timestamp); void VerifyRowValueColumns( - std::vector> &columns, + std::vector> &columns, std::size_t index_of_vector, int8_t expected_mask, int8_t expected_index, int64_t expected_timestamp ); +int64_t ToMicroSeconds(int64_t seconds); + } } diff --git a/utilities/checkpoint/checkpoint_test.cc b/utilities/checkpoint/checkpoint_test.cc index 2872f3a42..56c8c6e05 100644 --- a/utilities/checkpoint/checkpoint_test.cc +++ b/utilities/checkpoint/checkpoint_test.cc @@ -372,7 +372,7 @@ TEST_F(CheckpointTest, CheckpointCFNoFlush) { rocksdb::SyncPoint::GetInstance()->SetCallBack( "DBImpl::BackgroundCallFlush:start", [&](void* arg) { // Flush should never trigger. 
- ASSERT_TRUE(false); + FAIL(); }); rocksdb::SyncPoint::GetInstance()->EnableProcessing(); Checkpoint* checkpoint; diff --git a/utilities/column_aware_encoding_util.cc b/utilities/column_aware_encoding_util.cc index a77d38d1d..c36e42254 100644 --- a/utilities/column_aware_encoding_util.cc +++ b/utilities/column_aware_encoding_util.cc @@ -51,11 +51,9 @@ void ColumnAwareEncodingReader::InitTableReader(const std::string& file_path) { options_.comparator = &internal_comparator_; options_.table_factory = std::make_shared(); - shared_ptr block_table_factory = - std::dynamic_pointer_cast(options_.table_factory); std::unique_ptr table_reader; - block_table_factory->NewTableReader( + options_.table_factory->NewTableReader( TableReaderOptions(ioptions_, soptions_, internal_comparator_, /*skip_filters=*/false), std::move(file_), file_size, &table_reader, /*enable_prefetch=*/false); diff --git a/utilities/date_tiered/date_tiered_db_impl.cc b/utilities/date_tiered/date_tiered_db_impl.cc index b75c077be..c1b1ceb5e 100644 --- a/utilities/date_tiered/date_tiered_db_impl.cc +++ b/utilities/date_tiered/date_tiered_db_impl.cc @@ -378,8 +378,8 @@ Iterator* DateTieredDBImpl::NewIterator(const ReadOptions& opts) { DBImpl* db_impl = reinterpret_cast(db_); auto db_iter = NewArenaWrappedDbIterator( - db_impl->GetEnv(), opts, ioptions_, cf_options_.comparator, - kMaxSequenceNumber, cf_options_.max_sequential_skip_in_iterations, 0); + db_impl->GetEnv(), opts, ioptions_, kMaxSequenceNumber, + cf_options_.max_sequential_skip_in_iterations, 0); auto arena = db_iter->GetArena(); MergeIteratorBuilder builder(cf_options_.comparator, arena); diff --git a/utilities/lua/rocks_lua_test.cc b/utilities/lua/rocks_lua_test.cc index c075e032f..025acaf6d 100644 --- a/utilities/lua/rocks_lua_test.cc +++ b/utilities/lua/rocks_lua_test.cc @@ -26,7 +26,7 @@ class StopOnErrorLogger : public Logger { virtual void Logv(const char* format, va_list ap) override { vfprintf(stderr, format, ap); fprintf(stderr, "\n"); 
- ASSERT_TRUE(false); + FAIL(); } }; diff --git a/utilities/merge_operators/cassandra/cassandra_merge_test.cc b/utilities/merge_operators/cassandra/cassandra_merge_test.cc deleted file mode 100644 index 84886161e..000000000 --- a/utilities/merge_operators/cassandra/cassandra_merge_test.cc +++ /dev/null @@ -1,134 +0,0 @@ -// Copyright (c) 2017-present, Facebook, Inc. All rights reserved. -// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). -// This source code is also licensed under the GPLv2 license found in the -// COPYING file in the root directory of this source tree. - -#include - -#include "rocksdb/db.h" -#include "rocksdb/merge_operator.h" -#include "rocksdb/utilities/db_ttl.h" -#include "util/testharness.h" -#include "util/random.h" -#include "utilities/merge_operators.h" -#include "utilities/merge_operators/cassandra/merge_operator.h" -#include "utilities/merge_operators/cassandra/test_utils.h" - -using namespace rocksdb; - -namespace rocksdb { -namespace cassandra { - -// Path to the database on file system -const std::string kDbName = test::TmpDir() + "/cassandramerge_test"; - -class CassandraStore { - public: - explicit CassandraStore(std::shared_ptr db) - : db_(db), - merge_option_(), - get_option_() { - assert(db); - } - - bool Append(const std::string& key, const RowValue& val){ - std::string result; - val.Serialize(&result); - Slice valSlice(result.data(), result.size()); - auto s = db_->Merge(merge_option_, key, valSlice); - - if (s.ok()) { - return true; - } else { - std::cerr << "ERROR " << s.ToString() << std::endl; - return false; - } - } - - std::tuple Get(const std::string& key){ - std::string result; - auto s = db_->Get(get_option_, key, &result); - - if (s.ok()) { - return std::make_tuple(true, - RowValue::Deserialize(result.data(), - result.size())); - } - - if (!s.IsNotFound()) { - std::cerr << "ERROR 
" << s.ToString() << std::endl; - } - - return std::make_tuple(false, RowValue(0, 0)); - } - - private: - std::shared_ptr db_; - WriteOptions merge_option_; - ReadOptions get_option_; -}; - - -// The class for unit-testing -class CassandraMergeTest : public testing::Test { - public: - CassandraMergeTest() { - DestroyDB(kDbName, Options()); // Start each test with a fresh DB - } - - std::shared_ptr OpenDb() { - DB* db; - Options options; - options.create_if_missing = true; - options.merge_operator.reset(new CassandraValueMergeOperator()); - EXPECT_OK(DB::Open(options, kDbName, &db)); - return std::shared_ptr(db); - } -}; - -// THE TEST CASES BEGIN HERE - -TEST_F(CassandraMergeTest, SimpleTest) { - auto db = OpenDb(); - CassandraStore store(db); - - store.Append("k1", CreateTestRowValue({ - std::make_tuple(kTombstone, 0, 5), - std::make_tuple(kColumn, 1, 8), - std::make_tuple(kExpiringColumn, 2, 5), - })); - store.Append("k1",CreateTestRowValue({ - std::make_tuple(kColumn, 0, 2), - std::make_tuple(kExpiringColumn, 1, 5), - std::make_tuple(kTombstone, 2, 7), - std::make_tuple(kExpiringColumn, 7, 17), - })); - store.Append("k1", CreateTestRowValue({ - std::make_tuple(kExpiringColumn, 0, 6), - std::make_tuple(kTombstone, 1, 5), - std::make_tuple(kColumn, 2, 4), - std::make_tuple(kTombstone, 11, 11), - })); - - auto ret = store.Get("k1"); - - ASSERT_TRUE(std::get<0>(ret)); - RowValue& merged = std::get<1>(ret); - EXPECT_EQ(merged.columns_.size(), 5); - VerifyRowValueColumns(merged.columns_, 0, kExpiringColumn, 0, 6); - VerifyRowValueColumns(merged.columns_, 1, kColumn, 1, 8); - VerifyRowValueColumns(merged.columns_, 2, kTombstone, 2, 7); - VerifyRowValueColumns(merged.columns_, 3, kExpiringColumn, 7, 17); - VerifyRowValueColumns(merged.columns_, 4, kTombstone, 11, 11); -} - - -} // namespace cassandra -} // namespace rocksdb - -int main(int argc, char** argv) { - ::testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} diff --git 
a/utilities/merge_operators/max.cc b/utilities/merge_operators/max.cc index 06e233fe8..5f42e816e 100644 --- a/utilities/merge_operators/max.cc +++ b/utilities/merge_operators/max.cc @@ -25,6 +25,8 @@ class MaxOperator : public MergeOperator { if (merge_in.existing_value) { max = Slice(merge_in.existing_value->data(), merge_in.existing_value->size()); + } else if (max.data() == nullptr) { + max = Slice(); } for (const auto& op : merge_in.operand_list) { diff --git a/utilities/options/options_util_test.cc b/utilities/options/options_util_test.cc index 86b382cfa..2ca8d4767 100644 --- a/utilities/options/options_util_test.cc +++ b/utilities/options/options_util_test.cc @@ -100,28 +100,34 @@ class DummyTableFactory : public TableFactory { DummyTableFactory() {} virtual ~DummyTableFactory() {} - virtual const char* Name() const { return "DummyTableFactory"; } + virtual const char* Name() const override { return "DummyTableFactory"; } - virtual Status NewTableReader(const TableReaderOptions& table_reader_options, - unique_ptr&& file, - uint64_t file_size, - unique_ptr* table_reader, - bool prefetch_index_and_filter_in_cache) const { + virtual Status NewTableReader( + const TableReaderOptions& table_reader_options, + unique_ptr&& file, uint64_t file_size, + unique_ptr* table_reader, + bool prefetch_index_and_filter_in_cache) const override { return Status::NotSupported(); } virtual TableBuilder* NewTableBuilder( const TableBuilderOptions& table_builder_options, - uint32_t column_family_id, WritableFileWriter* file) const { + uint32_t column_family_id, WritableFileWriter* file) const override { return nullptr; } - virtual Status SanitizeOptions(const DBOptions& db_opts, - const ColumnFamilyOptions& cf_opts) const { + virtual Status SanitizeOptions( + const DBOptions& db_opts, + const ColumnFamilyOptions& cf_opts) const override { return Status::NotSupported(); } - virtual std::string GetPrintableTableOptions() const { return ""; } + virtual std::string 
GetPrintableTableOptions() const override { return ""; } + + Status GetOptionString(std::string* opt_string, + const std::string& delimiter) const override { + return Status::OK(); + } }; class DummyMergeOperator : public MergeOperator { diff --git a/utilities/persistent_cache/block_cache_tier.cc b/utilities/persistent_cache/block_cache_tier.cc index e65952cdb..714af2c62 100644 --- a/utilities/persistent_cache/block_cache_tier.cc +++ b/utilities/persistent_cache/block_cache_tier.cc @@ -136,7 +136,7 @@ Status BlockCacheTier::Close() { template void Add(std::map* stats, const std::string& key, const T& t) { - stats->insert({key, static_cast(t)}); + stats->insert({key, static_cast(t)}); } PersistentCache::StatsType BlockCacheTier::Stats() { diff --git a/utilities/persistent_cache/persistent_cache_test.cc b/utilities/persistent_cache/persistent_cache_test.cc index db9cf373f..5affc4085 100644 --- a/utilities/persistent_cache/persistent_cache_test.cc +++ b/utilities/persistent_cache/persistent_cache_test.cc @@ -372,7 +372,7 @@ void PersistentCacheDBTest::RunTest( options.table_factory.reset(NewBlockBasedTableFactory(table_options)); break; default: - ASSERT_TRUE(false); + FAIL(); } std::vector values; @@ -425,7 +425,7 @@ void PersistentCacheDBTest::RunTest( ASSERT_EQ(compressed_block_miss, 0); break; default: - ASSERT_TRUE(false); + FAIL(); } options.create_if_missing = true; diff --git a/utilities/simulator_cache/sim_cache.cc b/utilities/simulator_cache/sim_cache.cc index 335ac9896..e3d801657 100644 --- a/utilities/simulator_cache/sim_cache.cc +++ b/utilities/simulator_cache/sim_cache.cc @@ -7,10 +7,144 @@ #include #include "monitoring/statistics.h" #include "port/port.h" +#include "rocksdb/env.h" +#include "util/file_reader_writer.h" +#include "util/mutexlock.h" +#include "util/string_util.h" namespace rocksdb { namespace { + +class CacheActivityLogger { + public: + CacheActivityLogger() + : activity_logging_enabled_(false), max_logging_size_(0) {} + + 
~CacheActivityLogger() { + MutexLock l(&mutex_); + + StopLoggingInternal(); + } + + Status StartLogging(const std::string& activity_log_file, Env* env, + uint64_t max_logging_size = 0) { + assert(activity_log_file != ""); + assert(env != nullptr); + + Status status; + EnvOptions env_opts; + std::unique_ptr log_file; + + MutexLock l(&mutex_); + + // Stop existing logging if any + StopLoggingInternal(); + + // Open log file + status = env->NewWritableFile(activity_log_file, &log_file, env_opts); + if (!status.ok()) { + return status; + } + file_writer_.reset(new WritableFileWriter(std::move(log_file), env_opts)); + + max_logging_size_ = max_logging_size; + activity_logging_enabled_.store(true); + + return status; + } + + void StopLogging() { + MutexLock l(&mutex_); + + StopLoggingInternal(); + } + + void ReportLookup(const Slice& key) { + if (activity_logging_enabled_.load() == false) { + return; + } + + std::string log_line = "LOOKUP - " + key.ToString(true) + "\n"; + + // line format: "LOOKUP - " + MutexLock l(&mutex_); + Status s = file_writer_->Append(log_line); + if (!s.ok() && bg_status_.ok()) { + bg_status_ = s; + } + if (MaxLoggingSizeReached() || !bg_status_.ok()) { + // Stop logging if we have reached the max file size or + // encountered an error + StopLoggingInternal(); + } + } + + void ReportAdd(const Slice& key, size_t size) { + if (activity_logging_enabled_.load() == false) { + return; + } + + std::string log_line = "ADD - "; + log_line += key.ToString(true); + log_line += " - "; + AppendNumberTo(&log_line, size); + log_line += "\n"; + + // line format: "ADD - - " + MutexLock l(&mutex_); + Status s = file_writer_->Append(log_line); + if (!s.ok() && bg_status_.ok()) { + bg_status_ = s; + } + + if (MaxLoggingSizeReached() || !bg_status_.ok()) { + // Stop logging if we have reached the max file size or + // encountered an error + StopLoggingInternal(); + } + } + + Status& bg_status() { + MutexLock l(&mutex_); + return bg_status_; + } + + private: + bool 
MaxLoggingSizeReached() { + mutex_.AssertHeld(); + + return (max_logging_size_ > 0 && + file_writer_->GetFileSize() >= max_logging_size_); + } + + void StopLoggingInternal() { + mutex_.AssertHeld(); + + if (!activity_logging_enabled_) { + return; + } + + activity_logging_enabled_.store(false); + Status s = file_writer_->Close(); + if (!s.ok() && bg_status_.ok()) { + bg_status_ = s; + } + } + + // Mutex to sync writes to file_writer, and all following + // class data members + port::Mutex mutex_; + // Indicates if logging is currently enabled + // atomic to allow reads without mutex + std::atomic activity_logging_enabled_; + // When reached, we will stop logging and close the file + // Value of 0 means unlimited + uint64_t max_logging_size_; + std::unique_ptr file_writer_; + Status bg_status_; +}; + // SimCacheImpl definition class SimCacheImpl : public SimCache { public: @@ -48,6 +182,9 @@ class SimCacheImpl : public SimCache { } else { key_only_cache_->Release(h); } + + cache_activity_logger_.ReportAdd(key, charge); + return cache_->Insert(key, value, charge, deleter, handle, priority); } @@ -61,6 +198,9 @@ class SimCacheImpl : public SimCache { inc_miss_counter(); RecordTick(stats, SIM_BLOCK_CACHE_MISS); } + + cache_activity_logger_.ReportLookup(key); + return cache_->Lookup(key, stats); } @@ -158,12 +298,29 @@ class SimCacheImpl : public SimCache { return ret; } + virtual Status StartActivityLogging(const std::string& activity_log_file, + Env* env, + uint64_t max_logging_size = 0) override { + return cache_activity_logger_.StartLogging(activity_log_file, env, + max_logging_size); + } + + virtual void StopActivityLogging() override { + cache_activity_logger_.StopLogging(); + } + + virtual Status GetActivityLoggingStatus() override { + return cache_activity_logger_.bg_status(); + } + private: std::shared_ptr cache_; std::shared_ptr key_only_cache_; std::atomic miss_times_; std::atomic hit_times_; Statistics* stats_; + CacheActivityLogger cache_activity_logger_; + 
void inc_miss_counter() { miss_times_.fetch_add(1, std::memory_order_relaxed); } diff --git a/utilities/simulator_cache/sim_cache_test.cc b/utilities/simulator_cache/sim_cache_test.cc index 01b328c78..4c175c947 100644 --- a/utilities/simulator_cache/sim_cache_test.cc +++ b/utilities/simulator_cache/sim_cache_test.cc @@ -138,6 +138,77 @@ TEST_F(SimCacheTest, SimCache) { ASSERT_EQ(6, simCache->get_hit_counter()); } +TEST_F(SimCacheTest, SimCacheLogging) { + auto table_options = GetTableOptions(); + auto options = GetOptions(table_options); + options.disable_auto_compactions = true; + std::shared_ptr sim_cache = + NewSimCache(NewLRUCache(1024 * 1024), 20000, 0); + table_options.block_cache = sim_cache; + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + Reopen(options); + + int num_block_entries = 20; + for (int i = 0; i < num_block_entries; i++) { + Put(Key(i), "val"); + Flush(); + } + + std::string log_file = test::TmpDir(env_) + "/cache_log.txt"; + ASSERT_OK(sim_cache->StartActivityLogging(log_file, env_)); + for (int i = 0; i < num_block_entries; i++) { + ASSERT_EQ(Get(Key(i)), "val"); + } + for (int i = 0; i < num_block_entries; i++) { + ASSERT_EQ(Get(Key(i)), "val"); + } + sim_cache->StopActivityLogging(); + ASSERT_OK(sim_cache->GetActivityLoggingStatus()); + + std::string file_contents = ""; + ReadFileToString(env_, log_file, &file_contents); + + int lookup_num = 0; + int add_num = 0; + std::string::size_type pos; + + // count number of lookups + pos = 0; + while ((pos = file_contents.find("LOOKUP -", pos)) != std::string::npos) { + ++lookup_num; + pos += 1; + } + + // count number of additions + pos = 0; + while ((pos = file_contents.find("ADD -", pos)) != std::string::npos) { + ++add_num; + pos += 1; + } + + // We asked for every block twice + ASSERT_EQ(lookup_num, num_block_entries * 2); + + // We added every block only once, since the cache can hold all blocks + ASSERT_EQ(add_num, num_block_entries); + + // Log things again but stop 
logging automatically after reaching 512 bytes + int max_size = 512; + ASSERT_OK(sim_cache->StartActivityLogging(log_file, env_, max_size)); + for (int it = 0; it < 10; it++) { + for (int i = 0; i < num_block_entries; i++) { + ASSERT_EQ(Get(Key(i)), "val"); + } + } + ASSERT_OK(sim_cache->GetActivityLoggingStatus()); + + uint64_t fsize = 0; + ASSERT_OK(env_->GetFileSize(log_file, &fsize)); + // error margin of 100 bytes + ASSERT_LT(fsize, max_size + 100); + ASSERT_GT(fsize, max_size - 100); +} + } // namespace rocksdb int main(int argc, char** argv) { diff --git a/utilities/transactions/optimistic_transaction_impl.cc b/utilities/transactions/optimistic_transaction.cc similarity index 70% rename from utilities/transactions/optimistic_transaction_impl.cc rename to utilities/transactions/optimistic_transaction.cc index 5652189bc..89d3226d5 100644 --- a/utilities/transactions/optimistic_transaction_impl.cc +++ b/utilities/transactions/optimistic_transaction.cc @@ -5,11 +5,9 @@ #ifndef ROCKSDB_LITE -#include "utilities/transactions/optimistic_transaction_impl.h" +#include "utilities/transactions/optimistic_transaction.h" -#include #include -#include #include "db/column_family.h" #include "db/db_impl.h" @@ -17,6 +15,7 @@ #include "rocksdb/db.h" #include "rocksdb/status.h" #include "rocksdb/utilities/optimistic_transaction_db.h" +#include "util/cast_util.h" #include "util/string_util.h" #include "utilities/transactions/transaction_util.h" @@ -24,51 +23,42 @@ namespace rocksdb { struct WriteOptions; -OptimisticTransactionImpl::OptimisticTransactionImpl( +OptimisticTransaction::OptimisticTransaction( OptimisticTransactionDB* txn_db, const WriteOptions& write_options, const OptimisticTransactionOptions& txn_options) : TransactionBaseImpl(txn_db->GetBaseDB(), write_options), txn_db_(txn_db) { Initialize(txn_options); } -void OptimisticTransactionImpl::Initialize( +void OptimisticTransaction::Initialize( const OptimisticTransactionOptions& txn_options) { if 
(txn_options.set_snapshot) { SetSnapshot(); } } -void OptimisticTransactionImpl::Reinitialize( +void OptimisticTransaction::Reinitialize( OptimisticTransactionDB* txn_db, const WriteOptions& write_options, const OptimisticTransactionOptions& txn_options) { TransactionBaseImpl::Reinitialize(txn_db->GetBaseDB(), write_options); Initialize(txn_options); } -OptimisticTransactionImpl::~OptimisticTransactionImpl() { -} +OptimisticTransaction::~OptimisticTransaction() {} -void OptimisticTransactionImpl::Clear() { - TransactionBaseImpl::Clear(); -} +void OptimisticTransaction::Clear() { TransactionBaseImpl::Clear(); } -Status OptimisticTransactionImpl::Prepare() { +Status OptimisticTransaction::Prepare() { return Status::InvalidArgument( "Two phase commit not supported for optimistic transactions."); } -Status OptimisticTransactionImpl::Commit() { +Status OptimisticTransaction::Commit() { // Set up callback which will call CheckTransactionForConflicts() to // check whether this transaction is safe to be committed. OptimisticTransactionCallback callback(this); - DBImpl* db_impl = dynamic_cast(db_->GetRootDB()); - if (db_impl == nullptr) { - // This should only happen if we support creating transactions from - // a StackableDB and someone overrides GetRootDB(). - return Status::InvalidArgument( - "DB::GetRootDB() returned an unexpected DB class"); - } + DBImpl* db_impl = static_cast_with_check(db_->GetRootDB()); Status s = db_impl->WriteWithCallback( write_options_, GetWriteBatch()->GetWriteBatch(), &callback); @@ -80,7 +70,7 @@ Status OptimisticTransactionImpl::Commit() { return s; } -Status OptimisticTransactionImpl::Rollback() { +Status OptimisticTransaction::Rollback() { Clear(); return Status::OK(); } @@ -88,9 +78,9 @@ Status OptimisticTransactionImpl::Rollback() { // Record this key so that we can check it for conflicts at commit time. // // 'exclusive' is unused for OptimisticTransaction. 
-Status OptimisticTransactionImpl::TryLock(ColumnFamilyHandle* column_family, - const Slice& key, bool read_only, - bool exclusive, bool untracked) { +Status OptimisticTransaction::TryLock(ColumnFamilyHandle* column_family, + const Slice& key, bool read_only, + bool exclusive, bool untracked) { if (untracked) { return Status::OK(); } @@ -119,11 +109,10 @@ Status OptimisticTransactionImpl::TryLock(ColumnFamilyHandle* column_family, // // Should only be called on writer thread in order to avoid any race conditions // in detecting write conflicts. -Status OptimisticTransactionImpl::CheckTransactionForConflicts(DB* db) { +Status OptimisticTransaction::CheckTransactionForConflicts(DB* db) { Status result; - assert(dynamic_cast(db) != nullptr); - auto db_impl = reinterpret_cast(db); + auto db_impl = static_cast_with_check(db); // Since we are on the write thread and do not want to block other writers, // we will do a cache-only conflict check. This can result in TryAgain @@ -133,7 +122,7 @@ Status OptimisticTransactionImpl::CheckTransactionForConflicts(DB* db) { true /* cache_only */); } -Status OptimisticTransactionImpl::SetName(const TransactionName& name) { +Status OptimisticTransaction::SetName(const TransactionName& /* unused */) { return Status::InvalidArgument("Optimistic transactions cannot be named."); } diff --git a/utilities/transactions/optimistic_transaction_impl.h b/utilities/transactions/optimistic_transaction.h similarity index 78% rename from utilities/transactions/optimistic_transaction_impl.h rename to utilities/transactions/optimistic_transaction.h index 6baec6962..5a19489f2 100644 --- a/utilities/transactions/optimistic_transaction_impl.h +++ b/utilities/transactions/optimistic_transaction.h @@ -26,13 +26,13 @@ namespace rocksdb { -class OptimisticTransactionImpl : public TransactionBaseImpl { +class OptimisticTransaction : public TransactionBaseImpl { public: - OptimisticTransactionImpl(OptimisticTransactionDB* db, - const WriteOptions& 
write_options, - const OptimisticTransactionOptions& txn_options); + OptimisticTransaction(OptimisticTransactionDB* db, + const WriteOptions& write_options, + const OptimisticTransactionOptions& txn_options); - virtual ~OptimisticTransactionImpl(); + virtual ~OptimisticTransaction(); void Reinitialize(OptimisticTransactionDB* txn_db, const WriteOptions& write_options, @@ -67,20 +67,20 @@ class OptimisticTransactionImpl : public TransactionBaseImpl { void Clear() override; - void UnlockGetForUpdate(ColumnFamilyHandle* column_family, - const Slice& key) override { + void UnlockGetForUpdate(ColumnFamilyHandle* /* unused */, + const Slice& /* unused */) override { // Nothing to unlock. } // No copying allowed - OptimisticTransactionImpl(const OptimisticTransactionImpl&); - void operator=(const OptimisticTransactionImpl&); + OptimisticTransaction(const OptimisticTransaction&); + void operator=(const OptimisticTransaction&); }; // Used at commit time to trigger transaction validation class OptimisticTransactionCallback : public WriteCallback { public: - explicit OptimisticTransactionCallback(OptimisticTransactionImpl* txn) + explicit OptimisticTransactionCallback(OptimisticTransaction* txn) : txn_(txn) {} Status Callback(DB* db) override { @@ -90,7 +90,7 @@ class OptimisticTransactionCallback : public WriteCallback { bool AllowWriteBatching() override { return false; } private: - OptimisticTransactionImpl* txn_; + OptimisticTransaction* txn_; }; } // namespace rocksdb diff --git a/utilities/transactions/optimistic_transaction_db_impl.cc b/utilities/transactions/optimistic_transaction_db_impl.cc index 001ebefe1..d9db6fde0 100644 --- a/utilities/transactions/optimistic_transaction_db_impl.cc +++ b/utilities/transactions/optimistic_transaction_db_impl.cc @@ -14,7 +14,7 @@ #include "rocksdb/db.h" #include "rocksdb/options.h" #include "rocksdb/utilities/optimistic_transaction_db.h" -#include "utilities/transactions/optimistic_transaction_impl.h" +#include 
"utilities/transactions/optimistic_transaction.h" namespace rocksdb { @@ -25,7 +25,7 @@ Transaction* OptimisticTransactionDBImpl::BeginTransaction( ReinitializeTransaction(old_txn, write_options, txn_options); return old_txn; } else { - return new OptimisticTransactionImpl(this, write_options, txn_options); + return new OptimisticTransaction(this, write_options, txn_options); } } @@ -81,8 +81,8 @@ Status OptimisticTransactionDB::Open( void OptimisticTransactionDBImpl::ReinitializeTransaction( Transaction* txn, const WriteOptions& write_options, const OptimisticTransactionOptions& txn_options) { - assert(dynamic_cast(txn) != nullptr); - auto txn_impl = reinterpret_cast(txn); + assert(dynamic_cast(txn) != nullptr); + auto txn_impl = reinterpret_cast(txn); txn_impl->Reinitialize(this, write_options, txn_options); } diff --git a/utilities/transactions/transaction_impl.cc b/utilities/transactions/pessimistic_transaction.cc similarity index 77% rename from utilities/transactions/transaction_impl.cc rename to utilities/transactions/pessimistic_transaction.cc index 408b15bcd..68b8b4f1a 100644 --- a/utilities/transactions/transaction_impl.cc +++ b/utilities/transactions/pessimistic_transaction.cc @@ -5,7 +5,7 @@ #ifndef ROCKSDB_LITE -#include "utilities/transactions/transaction_impl.h" +#include "utilities/transactions/pessimistic_transaction.h" #include #include @@ -19,41 +19,41 @@ #include "rocksdb/snapshot.h" #include "rocksdb/status.h" #include "rocksdb/utilities/transaction_db.h" +#include "util/cast_util.h" #include "util/string_util.h" #include "util/sync_point.h" -#include "utilities/transactions/transaction_db_impl.h" +#include "utilities/transactions/pessimistic_transaction_db.h" #include "utilities/transactions/transaction_util.h" namespace rocksdb { struct WriteOptions; -std::atomic TransactionImpl::txn_id_counter_(1); +std::atomic PessimisticTransaction::txn_id_counter_(1); -TransactionID TransactionImpl::GenTxnID() { +TransactionID 
PessimisticTransaction::GenTxnID() { return txn_id_counter_.fetch_add(1); } -TransactionImpl::TransactionImpl(TransactionDB* txn_db, - const WriteOptions& write_options, - const TransactionOptions& txn_options) +PessimisticTransaction::PessimisticTransaction( + TransactionDB* txn_db, const WriteOptions& write_options, + const TransactionOptions& txn_options) : TransactionBaseImpl(txn_db->GetRootDB(), write_options), txn_db_impl_(nullptr), + expiration_time_(0), txn_id_(0), waiting_cf_id_(0), waiting_key_(nullptr), - expiration_time_(0), lock_timeout_(0), deadlock_detect_(false), deadlock_detect_depth_(0) { - txn_db_impl_ = dynamic_cast(txn_db); - assert(txn_db_impl_); - db_impl_ = dynamic_cast(txn_db->GetRootDB()); - assert(db_impl_); + txn_db_impl_ = + static_cast_with_check(txn_db); + db_impl_ = static_cast_with_check(db_); Initialize(txn_options); } -void TransactionImpl::Initialize(const TransactionOptions& txn_options) { +void PessimisticTransaction::Initialize(const TransactionOptions& txn_options) { txn_id_ = GenTxnID(); txn_state_ = STARTED; @@ -84,7 +84,7 @@ void TransactionImpl::Initialize(const TransactionOptions& txn_options) { } } -TransactionImpl::~TransactionImpl() { +PessimisticTransaction::~PessimisticTransaction() { txn_db_impl_->UnLock(this, &GetTrackedKeys()); if (expiration_time_ > 0) { txn_db_impl_->RemoveExpirableTransaction(txn_id_); @@ -94,14 +94,14 @@ TransactionImpl::~TransactionImpl() { } } -void TransactionImpl::Clear() { +void PessimisticTransaction::Clear() { txn_db_impl_->UnLock(this, &GetTrackedKeys()); TransactionBaseImpl::Clear(); } -void TransactionImpl::Reinitialize(TransactionDB* txn_db, - const WriteOptions& write_options, - const TransactionOptions& txn_options) { +void PessimisticTransaction::Reinitialize( + TransactionDB* txn_db, const WriteOptions& write_options, + const TransactionOptions& txn_options) { if (!name_.empty() && txn_state_ != COMMITED) { txn_db_impl_->UnregisterTransaction(this); } @@ -109,7 +109,7 @@ void 
TransactionImpl::Reinitialize(TransactionDB* txn_db, Initialize(txn_options); } -bool TransactionImpl::IsExpired() const { +bool PessimisticTransaction::IsExpired() const { if (expiration_time_ > 0) { if (db_->GetEnv()->NowMicros() >= expiration_time_) { // Transaction is expired. @@ -120,7 +120,12 @@ bool TransactionImpl::IsExpired() const { return false; } -Status TransactionImpl::CommitBatch(WriteBatch* batch) { +WriteCommittedTxn::WriteCommittedTxn(TransactionDB* txn_db, + const WriteOptions& write_options, + const TransactionOptions& txn_options) + : PessimisticTransaction(txn_db, write_options, txn_options){}; + +Status WriteCommittedTxn::CommitBatch(WriteBatch* batch) { TransactionKeyMap keys_to_unlock; Status s = LockBatch(batch, &keys_to_unlock); @@ -158,7 +163,7 @@ Status TransactionImpl::CommitBatch(WriteBatch* batch) { return s; } -Status TransactionImpl::Prepare() { +Status PessimisticTransaction::Prepare() { Status s; if (name_.empty()) { @@ -187,12 +192,7 @@ Status TransactionImpl::Prepare() { txn_state_.store(AWAITING_PREPARE); // transaction can't expire after preparation expiration_time_ = 0; - WriteOptions write_options = write_options_; - write_options.disableWAL = false; - WriteBatchInternal::MarkEndPrepare(GetWriteBatch()->GetWriteBatch(), name_); - s = db_impl_->WriteImpl(write_options, GetWriteBatch()->GetWriteBatch(), - /*callback*/ nullptr, &log_number_, /*log ref*/ 0, - /* disable_memtable*/ true); + s = PrepareInternal(); if (s.ok()) { assert(log_number_ != 0); dbimpl_->MarkLogAsContainingPrepSection(log_number_); @@ -213,9 +213,20 @@ Status TransactionImpl::Prepare() { return s; } -Status TransactionImpl::Commit() { +Status WriteCommittedTxn::PrepareInternal() { + WriteOptions write_options = write_options_; + write_options.disableWAL = false; + WriteBatchInternal::MarkEndPrepare(GetWriteBatch()->GetWriteBatch(), name_); + Status s = + db_impl_->WriteImpl(write_options, GetWriteBatch()->GetWriteBatch(), + /*callback*/ nullptr, 
&log_number_, /*log ref*/ 0, + /* disable_memtable*/ true); + return s; +} + +Status PessimisticTransaction::Commit() { Status s; - bool commit_single = false; + bool commit_without_prepare = false; bool commit_prepared = false; if (IsExpired()) { @@ -229,25 +240,28 @@ Status TransactionImpl::Commit() { // our locks stolen. In this case the only valid state is STARTED because // a state of PREPARED would have a cleared expiration_time_. TransactionState expected = STARTED; - commit_single = std::atomic_compare_exchange_strong(&txn_state_, &expected, - AWAITING_COMMIT); + commit_without_prepare = std::atomic_compare_exchange_strong( + &txn_state_, &expected, AWAITING_COMMIT); TEST_SYNC_POINT("TransactionTest::ExpirableTransactionDataRace:1"); } else if (txn_state_ == PREPARED) { // expiration and lock stealing is not a concern commit_prepared = true; } else if (txn_state_ == STARTED) { // expiration and lock stealing is not a concern - commit_single = true; + commit_without_prepare = true; + // TODO(myabandeh): what if the user mistakenly forgets prepare? We should + // add an option so that the user explictly express the intention of + // skipping the prepare phase. } - if (commit_single) { + if (commit_without_prepare) { assert(!commit_prepared); if (WriteBatchInternal::Count(GetCommitTimeWriteBatch()) > 0) { s = Status::InvalidArgument( "Commit-time batch contains values that will not be committed."); } else { txn_state_.store(AWAITING_COMMIT); - s = db_->Write(write_options_, GetWriteBatch()->GetWriteBatch()); + s = CommitWithoutPrepareInternal(); Clear(); if (s.ok()) { txn_state_.store(COMMITED); @@ -256,21 +270,8 @@ Status TransactionImpl::Commit() { } else if (commit_prepared) { txn_state_.store(AWAITING_COMMIT); - // We take the commit-time batch and append the Commit marker. 
- // The Memtable will ignore the Commit marker in non-recovery mode - WriteBatch* working_batch = GetCommitTimeWriteBatch(); - WriteBatchInternal::MarkCommit(working_batch, name_); - - // any operations appended to this working_batch will be ignored from WAL - working_batch->MarkWalTerminationPoint(); + s = CommitInternal(); - // insert prepared batch into Memtable only skipping WAL. - // Memtable will ignore BeginPrepare/EndPrepare markers - // in non recovery mode and simply insert the values - WriteBatchInternal::Append(working_batch, GetWriteBatch()->GetWriteBatch()); - - s = db_impl_->WriteImpl(write_options_, working_batch, nullptr, nullptr, - log_number_); if (!s.ok()) { ROCKS_LOG_WARN(db_impl_->immutable_db_options().info_log, "Commit write failed"); @@ -299,7 +300,31 @@ Status TransactionImpl::Commit() { return s; } -Status TransactionImpl::Rollback() { +Status WriteCommittedTxn::CommitWithoutPrepareInternal() { + Status s = db_->Write(write_options_, GetWriteBatch()->GetWriteBatch()); + return s; +} + +Status WriteCommittedTxn::CommitInternal() { + // We take the commit-time batch and append the Commit marker. + // The Memtable will ignore the Commit marker in non-recovery mode + WriteBatch* working_batch = GetCommitTimeWriteBatch(); + WriteBatchInternal::MarkCommit(working_batch, name_); + + // any operations appended to this working_batch will be ignored from WAL + working_batch->MarkWalTerminationPoint(); + + // insert prepared batch into Memtable only skipping WAL. 
+ // Memtable will ignore BeginPrepare/EndPrepare markers + // in non recovery mode and simply insert the values + WriteBatchInternal::Append(working_batch, GetWriteBatch()->GetWriteBatch()); + + auto s = db_impl_->WriteImpl(write_options_, working_batch, nullptr, nullptr, + log_number_); + return s; +} + +Status WriteCommittedTxn::Rollback() { Status s; if (txn_state_ == PREPARED) { WriteBatch rollback_marker; @@ -326,7 +351,7 @@ Status TransactionImpl::Rollback() { return s; } -Status TransactionImpl::RollbackToSavePoint() { +Status PessimisticTransaction::RollbackToSavePoint() { if (txn_state_ != STARTED) { return Status::InvalidArgument("Transaction is beyond state for rollback."); } @@ -344,8 +369,8 @@ Status TransactionImpl::RollbackToSavePoint() { // Lock all keys in this batch. // On success, caller should unlock keys_to_unlock -Status TransactionImpl::LockBatch(WriteBatch* batch, - TransactionKeyMap* keys_to_unlock) { +Status PessimisticTransaction::LockBatch(WriteBatch* batch, + TransactionKeyMap* keys_to_unlock) { class Handler : public WriteBatch::Handler { public: // Sorted map of column_family_id to sorted set of keys. @@ -367,12 +392,12 @@ Status TransactionImpl::LockBatch(WriteBatch* batch, } virtual Status PutCF(uint32_t column_family_id, const Slice& key, - const Slice& value) override { + const Slice& /* unused */) override { RecordKey(column_family_id, key); return Status::OK(); } virtual Status MergeCF(uint32_t column_family_id, const Slice& key, - const Slice& value) override { + const Slice& /* unused */) override { RecordKey(column_family_id, key); return Status::OK(); } @@ -422,9 +447,9 @@ Status TransactionImpl::LockBatch(WriteBatch* batch, // If check_shapshot is true and this transaction has a snapshot set, // this key will only be locked if there have been no writes to this key since // the snapshot time. 
-Status TransactionImpl::TryLock(ColumnFamilyHandle* column_family, - const Slice& key, bool read_only, - bool exclusive, bool untracked) { +Status PessimisticTransaction::TryLock(ColumnFamilyHandle* column_family, + const Slice& key, bool read_only, + bool exclusive, bool untracked) { uint32_t cfh_id = GetColumnFamilyID(column_family); std::string key_str = key.ToString(); bool previously_locked; @@ -510,10 +535,9 @@ Status TransactionImpl::TryLock(ColumnFamilyHandle* column_family, // Return OK() if this key has not been modified more recently than the // transaction snapshot_. -Status TransactionImpl::ValidateSnapshot(ColumnFamilyHandle* column_family, - const Slice& key, - SequenceNumber prev_seqno, - SequenceNumber* new_seqno) { +Status PessimisticTransaction::ValidateSnapshot( + ColumnFamilyHandle* column_family, const Slice& key, + SequenceNumber prev_seqno, SequenceNumber* new_seqno) { assert(snapshot_); SequenceNumber seq = snapshot_->GetSequenceNumber(); @@ -526,30 +550,27 @@ Status TransactionImpl::ValidateSnapshot(ColumnFamilyHandle* column_family, *new_seqno = seq; - assert(dynamic_cast(db_) != nullptr); - auto db_impl = reinterpret_cast(db_); - ColumnFamilyHandle* cfh = - column_family ? column_family : db_impl->DefaultColumnFamily(); + column_family ? 
column_family : db_impl_->DefaultColumnFamily(); - return TransactionUtil::CheckKeyForConflicts(db_impl, cfh, key.ToString(), + return TransactionUtil::CheckKeyForConflicts(db_impl_, cfh, key.ToString(), snapshot_->GetSequenceNumber(), false /* cache_only */); } -bool TransactionImpl::TryStealingLocks() { +bool PessimisticTransaction::TryStealingLocks() { assert(IsExpired()); TransactionState expected = STARTED; return std::atomic_compare_exchange_strong(&txn_state_, &expected, LOCKS_STOLEN); } -void TransactionImpl::UnlockGetForUpdate(ColumnFamilyHandle* column_family, - const Slice& key) { +void PessimisticTransaction::UnlockGetForUpdate( + ColumnFamilyHandle* column_family, const Slice& key) { txn_db_impl_->UnLock(this, GetColumnFamilyID(column_family), key.ToString()); } -Status TransactionImpl::SetName(const TransactionName& name) { +Status PessimisticTransaction::SetName(const TransactionName& name) { Status s; if (txn_state_ == STARTED) { if (name_.length()) { diff --git a/utilities/transactions/transaction_impl.h b/utilities/transactions/pessimistic_transaction.h similarity index 76% rename from utilities/transactions/transaction_impl.h rename to utilities/transactions/pessimistic_transaction.h index 01f8f4b2a..5c6d4d261 100644 --- a/utilities/transactions/transaction_impl.h +++ b/utilities/transactions/pessimistic_transaction.h @@ -30,14 +30,17 @@ namespace rocksdb { -class TransactionDBImpl; +class PessimisticTransactionDB; -class TransactionImpl : public TransactionBaseImpl { +// A transaction under pessimistic concurrency control. This class implements +// the locking API and interfaces with the lock manager as well as the +// pessimistic transactional db. 
+class PessimisticTransaction : public TransactionBaseImpl { public: - TransactionImpl(TransactionDB* db, const WriteOptions& write_options, - const TransactionOptions& txn_options); + PessimisticTransaction(TransactionDB* db, const WriteOptions& write_options, + const TransactionOptions& txn_options); - virtual ~TransactionImpl(); + virtual ~PessimisticTransaction(); void Reinitialize(TransactionDB* txn_db, const WriteOptions& write_options, const TransactionOptions& txn_options); @@ -46,9 +49,9 @@ class TransactionImpl : public TransactionBaseImpl { Status Commit() override; - Status CommitBatch(WriteBatch* batch); + virtual Status CommitBatch(WriteBatch* batch) = 0; - Status Rollback() override; + Status Rollback() override = 0; Status RollbackToSavePoint() override; @@ -107,14 +110,30 @@ class TransactionImpl : public TransactionBaseImpl { int64_t GetDeadlockDetectDepth() const { return deadlock_detect_depth_; } protected: + virtual Status PrepareInternal() = 0; + + virtual Status CommitWithoutPrepareInternal() = 0; + + virtual Status CommitInternal() = 0; + + void Initialize(const TransactionOptions& txn_options); + + Status LockBatch(WriteBatch* batch, TransactionKeyMap* keys_to_unlock); + Status TryLock(ColumnFamilyHandle* column_family, const Slice& key, bool read_only, bool exclusive, bool untracked = false) override; - private: - TransactionDBImpl* txn_db_impl_; + void Clear() override; + + PessimisticTransactionDB* txn_db_impl_; DBImpl* db_impl_; + // If non-zero, this transaction should not be committed after this time (in + // microseconds according to Env->NowMicros()) + uint64_t expiration_time_; + + private: // Used to create unique ids for transactions. static std::atomic txn_id_counter_; @@ -140,10 +159,6 @@ class TransactionImpl : public TransactionBaseImpl { // Mutex protecting waiting_txn_ids_, waiting_cf_id_ and waiting_key_. 
mutable std::mutex wait_mutex_; - // If non-zero, this transaction should not be committed after this time (in - // microseconds according to Env->NowMicros()) - uint64_t expiration_time_; - // Timeout in microseconds when locking a key or -1 if there is no timeout. int64_t lock_timeout_; @@ -153,34 +168,50 @@ class TransactionImpl : public TransactionBaseImpl { // Whether to perform deadlock detection or not. int64_t deadlock_detect_depth_; - void Clear() override; - - void Initialize(const TransactionOptions& txn_options); - Status ValidateSnapshot(ColumnFamilyHandle* column_family, const Slice& key, SequenceNumber prev_seqno, SequenceNumber* new_seqno); - Status LockBatch(WriteBatch* batch, TransactionKeyMap* keys_to_unlock); + void UnlockGetForUpdate(ColumnFamilyHandle* column_family, + const Slice& key) override; - Status DoCommit(WriteBatch* batch); + // No copying allowed + PessimisticTransaction(const PessimisticTransaction&); + void operator=(const PessimisticTransaction&); +}; - void RollbackLastN(size_t num); +class WriteCommittedTxn : public PessimisticTransaction { + public: + WriteCommittedTxn(TransactionDB* db, const WriteOptions& write_options, + const TransactionOptions& txn_options); - void UnlockGetForUpdate(ColumnFamilyHandle* column_family, - const Slice& key) override; + virtual ~WriteCommittedTxn() {} + + Status CommitBatch(WriteBatch* batch) override; + + Status Rollback() override; + + private: + Status PrepareInternal() override; + + Status CommitWithoutPrepareInternal() override; + + Status CommitInternal() override; + + Status ValidateSnapshot(ColumnFamilyHandle* column_family, const Slice& key, + SequenceNumber prev_seqno, SequenceNumber* new_seqno); // No copying allowed - TransactionImpl(const TransactionImpl&); - void operator=(const TransactionImpl&); + WriteCommittedTxn(const WriteCommittedTxn&); + void operator=(const WriteCommittedTxn&); }; // Used at commit time to check whether transaction is committing before its // expiration 
time. class TransactionCallback : public WriteCallback { public: - explicit TransactionCallback(TransactionImpl* txn) : txn_(txn) {} + explicit TransactionCallback(PessimisticTransaction* txn) : txn_(txn) {} - Status Callback(DB* db) override { + Status Callback(DB* /* unused */) override { if (txn_->IsExpired()) { return Status::Expired(); } else { @@ -191,7 +222,7 @@ class TransactionCallback : public WriteCallback { bool AllowWriteBatching() override { return true; } private: - TransactionImpl* txn_; + PessimisticTransaction* txn_; }; } // namespace rocksdb diff --git a/utilities/transactions/pessimistic_transaction_db.cc b/utilities/transactions/pessimistic_transaction_db.cc new file mode 100644 index 000000000..8fa9575e4 --- /dev/null +++ b/utilities/transactions/pessimistic_transaction_db.cc @@ -0,0 +1,806 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifndef ROCKSDB_LITE + +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS +#endif + +#include "utilities/transactions/pessimistic_transaction_db.h" + +#include +#include +#include +#include + +#include "db/db_impl.h" +#include "rocksdb/db.h" +#include "rocksdb/options.h" +#include "rocksdb/utilities/transaction_db.h" +#include "util/cast_util.h" +#include "util/mutexlock.h" +#include "utilities/transactions/pessimistic_transaction.h" +#include "utilities/transactions/transaction_db_mutex_impl.h" + +namespace rocksdb { + +PessimisticTransactionDB::PessimisticTransactionDB( + DB* db, const TransactionDBOptions& txn_db_options) + : TransactionDB(db), + db_impl_(static_cast_with_check(db)), + txn_db_options_(txn_db_options), + lock_mgr_(this, txn_db_options_.num_stripes, txn_db_options.max_num_locks, + txn_db_options_.max_num_deadlocks, + txn_db_options_.custom_mutex_factory + ? 
txn_db_options_.custom_mutex_factory + : std::shared_ptr( + new TransactionDBMutexFactoryImpl())) { + assert(db_impl_ != nullptr); + info_log_ = db_impl_->GetDBOptions().info_log; +} + +// Support initiliazing PessimisticTransactionDB from a stackable db +// +// PessimisticTransactionDB +// ^ ^ +// | | +// | + +// | StackableDB +// | ^ +// | | +// + + +// DBImpl +// ^ +// |(inherit) +// + +// DB +// +PessimisticTransactionDB::PessimisticTransactionDB( + StackableDB* db, const TransactionDBOptions& txn_db_options) + : TransactionDB(db), + db_impl_(static_cast_with_check(db->GetRootDB())), + txn_db_options_(txn_db_options), + lock_mgr_(this, txn_db_options_.num_stripes, txn_db_options.max_num_locks, + txn_db_options_.max_num_deadlocks, + txn_db_options_.custom_mutex_factory + ? txn_db_options_.custom_mutex_factory + : std::shared_ptr( + new TransactionDBMutexFactoryImpl())) { + assert(db_impl_ != nullptr); +} + +PessimisticTransactionDB::~PessimisticTransactionDB() { + while (!transactions_.empty()) { + delete transactions_.begin()->second; + } +} + +Status PessimisticTransactionDB::Initialize( + const std::vector& compaction_enabled_cf_indices, + const std::vector& handles) { + for (auto cf_ptr : handles) { + AddColumnFamily(cf_ptr); + } + // Re-enable compaction for the column families that initially had + // compaction enabled. 
+ std::vector compaction_enabled_cf_handles; + compaction_enabled_cf_handles.reserve(compaction_enabled_cf_indices.size()); + for (auto index : compaction_enabled_cf_indices) { + compaction_enabled_cf_handles.push_back(handles[index]); + } + + Status s = EnableAutoCompaction(compaction_enabled_cf_handles); + + // create 'real' transactions from recovered shell transactions + auto dbimpl = reinterpret_cast(GetRootDB()); + assert(dbimpl != nullptr); + auto rtrxs = dbimpl->recovered_transactions(); + + for (auto it = rtrxs.begin(); it != rtrxs.end(); it++) { + auto recovered_trx = it->second; + assert(recovered_trx); + assert(recovered_trx->log_number_); + assert(recovered_trx->name_.length()); + + WriteOptions w_options; + w_options.sync = true; + TransactionOptions t_options; + + Transaction* real_trx = BeginTransaction(w_options, t_options, nullptr); + assert(real_trx); + real_trx->SetLogNumber(recovered_trx->log_number_); + + s = real_trx->SetName(recovered_trx->name_); + if (!s.ok()) { + break; + } + + s = real_trx->RebuildFromWriteBatch(recovered_trx->batch_); + real_trx->SetState(Transaction::PREPARED); + if (!s.ok()) { + break; + } + } + if (s.ok()) { + dbimpl->DeleteAllRecoveredTransactions(); + } + return s; +} + +Transaction* WriteCommittedTxnDB::BeginTransaction( + const WriteOptions& write_options, const TransactionOptions& txn_options, + Transaction* old_txn) { + if (old_txn != nullptr) { + ReinitializeTransaction(old_txn, write_options, txn_options); + return old_txn; + } else { + return new WriteCommittedTxn(this, write_options, txn_options); + } +} + +Transaction* WritePreparedTxnDB::BeginTransaction( + const WriteOptions& write_options, const TransactionOptions& txn_options, + Transaction* old_txn) { + if (old_txn != nullptr) { + ReinitializeTransaction(old_txn, write_options, txn_options); + return old_txn; + } else { + return new WritePreparedTxn(this, write_options, txn_options); + } +} + +TransactionDBOptions 
PessimisticTransactionDB::ValidateTxnDBOptions( + const TransactionDBOptions& txn_db_options) { + TransactionDBOptions validated = txn_db_options; + + if (txn_db_options.num_stripes == 0) { + validated.num_stripes = 1; + } + + return validated; +} + +Status TransactionDB::Open(const Options& options, + const TransactionDBOptions& txn_db_options, + const std::string& dbname, TransactionDB** dbptr) { + DBOptions db_options(options); + ColumnFamilyOptions cf_options(options); + std::vector column_families; + column_families.push_back( + ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options)); + std::vector handles; + Status s = TransactionDB::Open(db_options, txn_db_options, dbname, + column_families, &handles, dbptr); + if (s.ok()) { + assert(handles.size() == 1); + // i can delete the handle since DBImpl is always holding a reference to + // default column family + delete handles[0]; + } + + return s; +} + +Status TransactionDB::Open( + const DBOptions& db_options, const TransactionDBOptions& txn_db_options, + const std::string& dbname, + const std::vector& column_families, + std::vector* handles, TransactionDB** dbptr) { + Status s; + DB* db; + + std::vector column_families_copy = column_families; + std::vector compaction_enabled_cf_indices; + DBOptions db_options_2pc = db_options; + PrepareWrap(&db_options_2pc, &column_families_copy, + &compaction_enabled_cf_indices); + s = DB::Open(db_options_2pc, dbname, column_families_copy, handles, &db); + if (s.ok()) { + s = WrapDB(db, txn_db_options, compaction_enabled_cf_indices, *handles, + dbptr); + } + return s; +} + +void TransactionDB::PrepareWrap( + DBOptions* db_options, std::vector* column_families, + std::vector* compaction_enabled_cf_indices) { + compaction_enabled_cf_indices->clear(); + + // Enable MemTable History if not already enabled + for (size_t i = 0; i < column_families->size(); i++) { + ColumnFamilyOptions* cf_options = &(*column_families)[i].options; + + if 
(cf_options->max_write_buffer_number_to_maintain == 0) { + // Setting to -1 will set the History size to max_write_buffer_number. + cf_options->max_write_buffer_number_to_maintain = -1; + } + if (!cf_options->disable_auto_compactions) { + // Disable compactions momentarily to prevent race with DB::Open + cf_options->disable_auto_compactions = true; + compaction_enabled_cf_indices->push_back(i); + } + } + db_options->allow_2pc = true; +} + +Status TransactionDB::WrapDB( + // make sure this db is already opened with memtable history enabled, + // auto compaction distabled and 2 phase commit enabled + DB* db, const TransactionDBOptions& txn_db_options, + const std::vector& compaction_enabled_cf_indices, + const std::vector& handles, TransactionDB** dbptr) { + PessimisticTransactionDB* txn_db; + switch (txn_db_options.write_policy) { + case WRITE_UNPREPARED: + return Status::NotSupported("WRITE_UNPREPARED is not implemented yet"); + case WRITE_PREPARED: + txn_db = new WritePreparedTxnDB( + db, PessimisticTransactionDB::ValidateTxnDBOptions(txn_db_options)); + break; + case WRITE_COMMITTED: + default: + txn_db = new WriteCommittedTxnDB( + db, PessimisticTransactionDB::ValidateTxnDBOptions(txn_db_options)); + } + *dbptr = txn_db; + Status s = txn_db->Initialize(compaction_enabled_cf_indices, handles); + return s; +} + +Status TransactionDB::WrapStackableDB( + // make sure this stackable_db is already opened with memtable history + // enabled, + // auto compaction distabled and 2 phase commit enabled + StackableDB* db, const TransactionDBOptions& txn_db_options, + const std::vector& compaction_enabled_cf_indices, + const std::vector& handles, TransactionDB** dbptr) { + PessimisticTransactionDB* txn_db; + switch (txn_db_options.write_policy) { + case WRITE_UNPREPARED: + return Status::NotSupported("WRITE_UNPREPARED is not implemented yet"); + case WRITE_PREPARED: + txn_db = new WritePreparedTxnDB( + db, PessimisticTransactionDB::ValidateTxnDBOptions(txn_db_options)); + 
break; + case WRITE_COMMITTED: + default: + txn_db = new WriteCommittedTxnDB( + db, PessimisticTransactionDB::ValidateTxnDBOptions(txn_db_options)); + } + *dbptr = txn_db; + Status s = txn_db->Initialize(compaction_enabled_cf_indices, handles); + return s; +} + +// Let TransactionLockMgr know that this column family exists so it can +// allocate a LockMap for it. +void PessimisticTransactionDB::AddColumnFamily( + const ColumnFamilyHandle* handle) { + lock_mgr_.AddColumnFamily(handle->GetID()); +} + +Status PessimisticTransactionDB::CreateColumnFamily( + const ColumnFamilyOptions& options, const std::string& column_family_name, + ColumnFamilyHandle** handle) { + InstrumentedMutexLock l(&column_family_mutex_); + + Status s = db_->CreateColumnFamily(options, column_family_name, handle); + if (s.ok()) { + lock_mgr_.AddColumnFamily((*handle)->GetID()); + } + + return s; +} + +// Let TransactionLockMgr know that it can deallocate the LockMap for this +// column family. +Status PessimisticTransactionDB::DropColumnFamily( + ColumnFamilyHandle* column_family) { + InstrumentedMutexLock l(&column_family_mutex_); + + Status s = db_->DropColumnFamily(column_family); + if (s.ok()) { + lock_mgr_.RemoveColumnFamily(column_family->GetID()); + } + + return s; +} + +Status PessimisticTransactionDB::TryLock(PessimisticTransaction* txn, + uint32_t cfh_id, + const std::string& key, + bool exclusive) { + return lock_mgr_.TryLock(txn, cfh_id, key, GetEnv(), exclusive); +} + +void PessimisticTransactionDB::UnLock(PessimisticTransaction* txn, + const TransactionKeyMap* keys) { + lock_mgr_.UnLock(txn, keys, GetEnv()); +} + +void PessimisticTransactionDB::UnLock(PessimisticTransaction* txn, + uint32_t cfh_id, const std::string& key) { + lock_mgr_.UnLock(txn, cfh_id, key, GetEnv()); +} + +// Used when wrapping DB write operations in a transaction +Transaction* PessimisticTransactionDB::BeginInternalTransaction( + const WriteOptions& options) { + TransactionOptions txn_options; + Transaction* 
txn = BeginTransaction(options, txn_options, nullptr); + + // Use default timeout for non-transactional writes + txn->SetLockTimeout(txn_db_options_.default_lock_timeout); + return txn; +} + +// All user Put, Merge, Delete, and Write requests must be intercepted to make +// sure that they lock all keys that they are writing to avoid causing conflicts +// with any concurrent transactions. The easiest way to do this is to wrap all +// write operations in a transaction. +// +// Put(), Merge(), and Delete() only lock a single key per call. Write() will +// sort its keys before locking them. This guarantees that TransactionDB write +// methods cannot deadlock with eachother (but still could deadlock with a +// Transaction). +Status PessimisticTransactionDB::Put(const WriteOptions& options, + ColumnFamilyHandle* column_family, + const Slice& key, const Slice& val) { + Status s; + + Transaction* txn = BeginInternalTransaction(options); + txn->DisableIndexing(); + + // Since the client didn't create a transaction, they don't care about + // conflict checking for this write. So we just need to do PutUntracked(). + s = txn->PutUntracked(column_family, key, val); + + if (s.ok()) { + s = txn->Commit(); + } + + delete txn; + + return s; +} + +Status PessimisticTransactionDB::Delete(const WriteOptions& wopts, + ColumnFamilyHandle* column_family, + const Slice& key) { + Status s; + + Transaction* txn = BeginInternalTransaction(wopts); + txn->DisableIndexing(); + + // Since the client didn't create a transaction, they don't care about + // conflict checking for this write. So we just need to do + // DeleteUntracked(). 
+ s = txn->DeleteUntracked(column_family, key); + + if (s.ok()) { + s = txn->Commit(); + } + + delete txn; + + return s; +} + +Status PessimisticTransactionDB::Merge(const WriteOptions& options, + ColumnFamilyHandle* column_family, + const Slice& key, const Slice& value) { + Status s; + + Transaction* txn = BeginInternalTransaction(options); + txn->DisableIndexing(); + + // Since the client didn't create a transaction, they don't care about + // conflict checking for this write. So we just need to do + // MergeUntracked(). + s = txn->MergeUntracked(column_family, key, value); + + if (s.ok()) { + s = txn->Commit(); + } + + delete txn; + + return s; +} + +Status PessimisticTransactionDB::Write(const WriteOptions& opts, + WriteBatch* updates) { + // Need to lock all keys in this batch to prevent write conflicts with + // concurrent transactions. + Transaction* txn = BeginInternalTransaction(opts); + txn->DisableIndexing(); + + auto txn_impl = + static_cast_with_check(txn); + + // Since commitBatch sorts the keys before locking, concurrent Write() + // operations will not cause a deadlock. + // In order to avoid a deadlock with a concurrent Transaction, Transactions + // should use a lock timeout. 
+ Status s = txn_impl->CommitBatch(updates); + + delete txn; + + return s; +} + +void PessimisticTransactionDB::InsertExpirableTransaction( + TransactionID tx_id, PessimisticTransaction* tx) { + assert(tx->GetExpirationTime() > 0); + std::lock_guard lock(map_mutex_); + expirable_transactions_map_.insert({tx_id, tx}); +} + +void PessimisticTransactionDB::RemoveExpirableTransaction(TransactionID tx_id) { + std::lock_guard lock(map_mutex_); + expirable_transactions_map_.erase(tx_id); +} + +bool PessimisticTransactionDB::TryStealingExpiredTransactionLocks( + TransactionID tx_id) { + std::lock_guard lock(map_mutex_); + + auto tx_it = expirable_transactions_map_.find(tx_id); + if (tx_it == expirable_transactions_map_.end()) { + return true; + } + PessimisticTransaction& tx = *(tx_it->second); + return tx.TryStealingLocks(); +} + +void PessimisticTransactionDB::ReinitializeTransaction( + Transaction* txn, const WriteOptions& write_options, + const TransactionOptions& txn_options) { + auto txn_impl = + static_cast_with_check(txn); + + txn_impl->Reinitialize(this, write_options, txn_options); +} + +Transaction* PessimisticTransactionDB::GetTransactionByName( + const TransactionName& name) { + std::lock_guard lock(name_map_mutex_); + auto it = transactions_.find(name); + if (it == transactions_.end()) { + return nullptr; + } else { + return it->second; + } +} + +void PessimisticTransactionDB::GetAllPreparedTransactions( + std::vector* transv) { + assert(transv); + transv->clear(); + std::lock_guard lock(name_map_mutex_); + for (auto it = transactions_.begin(); it != transactions_.end(); it++) { + if (it->second->GetState() == Transaction::PREPARED) { + transv->push_back(it->second); + } + } +} + +TransactionLockMgr::LockStatusData +PessimisticTransactionDB::GetLockStatusData() { + return lock_mgr_.GetLockStatusData(); +} + +std::vector PessimisticTransactionDB::GetDeadlockInfoBuffer() { + return lock_mgr_.GetDeadlockInfoBuffer(); +} + +void 
PessimisticTransactionDB::SetDeadlockInfoBufferSize(uint32_t target_size) { + lock_mgr_.Resize(target_size); +} + +void PessimisticTransactionDB::RegisterTransaction(Transaction* txn) { + assert(txn); + assert(txn->GetName().length() > 0); + assert(GetTransactionByName(txn->GetName()) == nullptr); + assert(txn->GetState() == Transaction::STARTED); + std::lock_guard lock(name_map_mutex_); + transactions_[txn->GetName()] = txn; +} + +void PessimisticTransactionDB::UnregisterTransaction(Transaction* txn) { + assert(txn); + std::lock_guard lock(name_map_mutex_); + auto it = transactions_.find(txn->GetName()); + assert(it != transactions_.end()); + transactions_.erase(it); +} + +// Returns true if commit_seq <= snapshot_seq +bool WritePreparedTxnDB::IsInSnapshot(uint64_t prep_seq, + uint64_t snapshot_seq) { + // Here we try to infer the return value without looking into prepare list. + // This would help avoiding synchronization over a shared map. + // TODO(myabandeh): read your own writes + // TODO(myabandeh): optimize this. 
This sequence of checks must be correct but + // not necessary efficient + if (snapshot_seq < prep_seq) { + // snapshot_seq < prep_seq <= commit_seq => snapshot_seq < commit_seq + return false; + } + if (!delayed_prepared_empty_.load(std::memory_order_acquire)) { + // We should not normally reach here + ReadLock rl(&prepared_mutex_); + if (delayed_prepared_.find(prep_seq) != delayed_prepared_.end()) { + // Then it is not committed yet + return false; + } + } + auto indexed_seq = prep_seq % COMMIT_CACHE_SIZE; + CommitEntry cached; + bool exist = GetCommitEntry(indexed_seq, &cached); + if (!exist) { + // It is not committed, so it must be still prepared + return false; + } + if (prep_seq == cached.prep_seq) { + // It is committed and also not evicted from commit cache + return cached.commit_seq <= snapshot_seq; + } + // At this point we dont know if it was committed or it is still prepared + auto max_evicted_seq = max_evicted_seq_.load(std::memory_order_acquire); + if (max_evicted_seq < prep_seq) { + // Not evicted from cache and also not present, so must be still prepared + return false; + } + // When advancing max_evicted_seq_, we move older entires from prepared to + // delayed_prepared_. Also we move evicted entries from commit cache to + // old_commit_map_ if it overlaps with any snapshot. Since prep_seq <= + // max_evicted_seq_, we have three cases: i) in delayed_prepared_, ii) in + // old_commit_map_, iii) committed with no conflict with any snapshot (i) + // delayed_prepared_ is checked above + if (max_evicted_seq < snapshot_seq) { // then (ii) cannot be the case + // only (iii) is the case: committed + // commit_seq <= max_evicted_seq_ < snapshot_seq => commit_seq < + // snapshot_seq + return true; + } + // else (ii) might be the case: check the commit data saved for this snapshot. + // If there was no overlapping commit entry, then it is committed with a + // commit_seq lower than any live snapshot, including snapshot_seq. 
+ if (old_commit_map_empty_.load(std::memory_order_acquire)) { + return true; + } + { + // We should not normally reach here + ReadLock rl(&old_commit_map_mutex_); + auto old_commit_entry = old_commit_map_.find(prep_seq); + if (old_commit_entry == old_commit_map_.end() || + old_commit_entry->second <= snapshot_seq) { + return true; + } + } + // (ii) it the case: it is committed but after the snapshot_seq + return false; +} + +void WritePreparedTxnDB::AddPrepared(uint64_t seq) { + ROCKS_LOG_DEBUG(info_log_, "Txn %" PRIu64 " Prepareing", seq); + WriteLock wl(&prepared_mutex_); + prepared_txns_.push(seq); +} + +void WritePreparedTxnDB::AddCommitted(uint64_t prepare_seq, + uint64_t commit_seq) { + ROCKS_LOG_DEBUG(info_log_, "Txn %" PRIu64 " Committing with %" PRIu64, + prepare_seq, commit_seq); + auto indexed_seq = prepare_seq % COMMIT_CACHE_SIZE; + CommitEntry evicted; + bool to_be_evicted = GetCommitEntry(indexed_seq, &evicted); + if (to_be_evicted) { + auto prev_max = max_evicted_seq_.load(std::memory_order_acquire); + if (prev_max < evicted.commit_seq) { + // TODO(myabandeh) inc max in larger steps to avoid frequent updates + auto max_evicted_seq = evicted.commit_seq; + // When max_evicted_seq_ advances, move older entries from prepared_txns_ + // to delayed_prepared_. This guarantees that if a seq is lower than max, + // then it is not in prepared_txns_ ans save an expensive, synchronized + // lookup from a shared set. delayed_prepared_ is expected to be empty in + // normal cases. 
+ { + WriteLock wl(&prepared_mutex_); + while (!prepared_txns_.empty() && + prepared_txns_.top() <= max_evicted_seq) { + auto to_be_popped = prepared_txns_.top(); + delayed_prepared_.insert(to_be_popped); + prepared_txns_.pop(); + delayed_prepared_empty_.store(false, std::memory_order_release); + } + } + + // With each change to max_evicted_seq_ fetch the live snapshots behind it + SequenceNumber curr_seq; + std::vector all_snapshots; + bool update_snapshots = false; + { + InstrumentedMutex(db_impl_->mutex()); + // We use this to identify how fresh are the snapshot list. Since this + // is done atomically with obtaining the snapshot list, the one with + // the larger seq is more fresh. If the seq is equal the full snapshot + // list could be different since taking snapshots does not increase + // the db seq. However since we only care about snapshots before the + // new max, such recent snapshots would not be included the in the + // list anyway. + curr_seq = db_impl_->GetLatestSequenceNumber(); + if (curr_seq > snapshots_version_) { + // This is to avoid updating the snapshots_ if it already updated + // with a more recent vesion by a concrrent thread + update_snapshots = true; + // We only care about snapshots lower then max + all_snapshots = + db_impl_->snapshots().GetAll(nullptr, max_evicted_seq); + } + } + if (update_snapshots) { + WriteLock wl(&snapshots_mutex_); + snapshots_version_ = curr_seq; + // We update the list concurrently with the readers. + // Both new and old lists are sorted and the new list is subset of the + // previous list plus some new items. Thus if a snapshot repeats in + // both new and old lists, it will appear upper in the new list. So if + // we simply insert the new snapshots in order, if an overwritten item + // is still valid in the new list is either written to the same place in + // the array or it is written in a higher palce before it gets + // overwritten by another item. 
This guarantess a reader that reads the + // list bottom-up will eventaully see a snapshot that repeats in the + // update, either before it gets overwritten by the writer or + // afterwards. + size_t i = 0; + auto it = all_snapshots.begin(); + for (; it != all_snapshots.end() && i < SNAPSHOT_CACHE_SIZE; + it++, i++) { + snapshot_cache_[i].store(*it, std::memory_order_release); + } + snapshots_.clear(); + for (; it != all_snapshots.end(); it++) { + // Insert them to a vector that is less efficient to access + // concurrently + snapshots_.push_back(*it); + } + // Update the size at the end. Otherwise a parallel reader might read + // items that are not set yet. + snapshots_total_.store(all_snapshots.size(), std::memory_order_release); + } + while (prev_max < max_evicted_seq && + !max_evicted_seq_.compare_exchange_weak( + prev_max, max_evicted_seq, std::memory_order_release, + std::memory_order_acquire)) { + }; + } + // After each eviction from commit cache, check if the commit entry should + // be kept around because it overlaps with a live snapshot. + // First check the snapshot cache that is efficient for concurrent access + auto cnt = snapshots_total_.load(std::memory_order_acquire); + // The list might get updated concurrently as we are reading from it. The + // reader should be able to read all the snapshots that are still valid + // after the update. Since the survived snapshots are written in a higher + // place before gets overwritten the reader that reads bottom-up will + // eventully see it. 
+ const bool next_is_larger = true; + SequenceNumber snapshot_seq = kMaxSequenceNumber; + size_t ip1 = std::min(cnt, SNAPSHOT_CACHE_SIZE); + for (; 0 < ip1; ip1--) { + snapshot_seq = snapshot_cache_[ip1 - 1].load(std::memory_order_acquire); + if (!MaybeUpdateOldCommitMap(evicted.prep_seq, evicted.commit_seq, + snapshot_seq, !next_is_larger)) { + break; + } + } + if (UNLIKELY(SNAPSHOT_CACHE_SIZE < cnt && ip1 == SNAPSHOT_CACHE_SIZE && + snapshot_seq < evicted.prep_seq)) { + // Then access the less efficient list of snapshots_ + ReadLock rl(&snapshots_mutex_); + // Items could have moved from the snapshots_ to snapshot_cache_ before + // accquiring the lock. To make sure that we do not miss a valid snapshot, + // read snapshot_cache_ again while holding the lock. + for (size_t i = 0; i < SNAPSHOT_CACHE_SIZE; i++) { + snapshot_seq = snapshot_cache_[i].load(std::memory_order_acquire); + if (!MaybeUpdateOldCommitMap(evicted.prep_seq, evicted.commit_seq, + snapshot_seq, next_is_larger)) { + break; + } + } + for (auto snapshot_seq_2 : snapshots_) { + if (!MaybeUpdateOldCommitMap(evicted.prep_seq, evicted.commit_seq, + snapshot_seq_2, next_is_larger)) { + break; + } + } + } + } + bool succ = + ExchangeCommitEntry(indexed_seq, evicted, {prepare_seq, commit_seq}); + if (!succ) { + // A very rare event, in which the commit entry is updated before we do. + // Here we apply a very simple solution of retrying. 
+ // TODO(myabandeh): do precautions to detect bugs that cause infinite loops + AddCommitted(prepare_seq, commit_seq); + return; + } + { + WriteLock wl(&prepared_mutex_); + prepared_txns_.erase(prepare_seq); + bool was_empty = delayed_prepared_.empty(); + if (!was_empty) { + delayed_prepared_.erase(prepare_seq); + bool is_empty = delayed_prepared_.empty(); + if (was_empty != is_empty) { + delayed_prepared_empty_.store(is_empty, std::memory_order_release); + } + } + } +} + +bool WritePreparedTxnDB::GetCommitEntry(uint64_t indexed_seq, + CommitEntry* entry) { + // TODO(myabandeh): implement lock-free commit_cache_ + ReadLock rl(&commit_cache_mutex_); + *entry = commit_cache_[indexed_seq]; + return (entry->commit_seq != 0); // initialized +} + +bool WritePreparedTxnDB::AddCommitEntry(uint64_t indexed_seq, + CommitEntry& new_entry, + CommitEntry* evicted_entry) { + // TODO(myabandeh): implement lock-free commit_cache_ + WriteLock wl(&commit_cache_mutex_); + *evicted_entry = commit_cache_[indexed_seq]; + commit_cache_[indexed_seq] = new_entry; + return (evicted_entry->commit_seq != 0); // initialized +} + +bool WritePreparedTxnDB::ExchangeCommitEntry(uint64_t indexed_seq, + CommitEntry& expected_entry, + CommitEntry new_entry) { + // TODO(myabandeh): implement lock-free commit_cache_ + WriteLock wl(&commit_cache_mutex_); + auto& evicted_entry = commit_cache_[indexed_seq]; + if (evicted_entry.prep_seq != expected_entry.prep_seq) { + return false; + } + commit_cache_[indexed_seq] = new_entry; + return true; +} + +// 10m entry, 80MB size +size_t WritePreparedTxnDB::DEF_COMMIT_CACHE_SIZE = static_cast(1 << 21); +size_t WritePreparedTxnDB::DEF_SNAPSHOT_CACHE_SIZE = + static_cast(1 << 7); + +bool WritePreparedTxnDB::MaybeUpdateOldCommitMap( + const uint64_t& prep_seq, const uint64_t& commit_seq, + const uint64_t& snapshot_seq, const bool next_is_larger = true) { + // If we do not store an entry in old_commit_map we assume it is committed in + // all snapshots. 
if commit_seq <= snapshot_seq, it is considered already in + // the snapshot so we need not to keep the entry around for this snapshot. + if (commit_seq <= snapshot_seq) { + // continue the search if the next snapshot could be smaller than commit_seq + return !next_is_larger; + } + // then snapshot_seq < commit_seq + if (prep_seq <= snapshot_seq) { // overlapping range + WriteLock wl(&old_commit_map_mutex_); + old_commit_map_empty_.store(false, std::memory_order_release); + old_commit_map_[prep_seq] = commit_seq; + // Storing once is enough. No need to check it for other snapshots. + return false; + } + // continue the search if the next snapshot could be larger than prep_seq + return next_is_larger; +} + +} // namespace rocksdb +#endif // ROCKSDB_LITE diff --git a/utilities/transactions/pessimistic_transaction_db.h b/utilities/transactions/pessimistic_transaction_db.h new file mode 100644 index 000000000..e3eec6b60 --- /dev/null +++ b/utilities/transactions/pessimistic_transaction_db.h @@ -0,0 +1,316 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#pragma once +#ifndef ROCKSDB_LITE + +#include +#include +#include +#include +#include +#include + +#include "rocksdb/db.h" +#include "rocksdb/options.h" +#include "rocksdb/utilities/transaction_db.h" +#include "utilities/transactions/pessimistic_transaction.h" +#include "utilities/transactions/transaction_lock_mgr.h" +#include "utilities/transactions/write_prepared_txn.h" + +namespace rocksdb { + +class PessimisticTransactionDB : public TransactionDB { + public: + explicit PessimisticTransactionDB(DB* db, + const TransactionDBOptions& txn_db_options); + + explicit PessimisticTransactionDB(StackableDB* db, + const TransactionDBOptions& txn_db_options); + + virtual ~PessimisticTransactionDB(); + + Status Initialize(const std::vector& compaction_enabled_cf_indices, + const std::vector& handles); + + Transaction* BeginTransaction(const WriteOptions& write_options, + const TransactionOptions& txn_options, + Transaction* old_txn) override = 0; + + using StackableDB::Put; + virtual Status Put(const WriteOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + const Slice& val) override; + + using StackableDB::Delete; + virtual Status Delete(const WriteOptions& wopts, + ColumnFamilyHandle* column_family, + const Slice& key) override; + + using StackableDB::Merge; + virtual Status Merge(const WriteOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) override; + + using StackableDB::Write; + virtual Status Write(const WriteOptions& opts, WriteBatch* updates) override; + + using StackableDB::CreateColumnFamily; + virtual Status CreateColumnFamily(const ColumnFamilyOptions& options, + const std::string& column_family_name, + ColumnFamilyHandle** handle) override; + + using StackableDB::DropColumnFamily; + virtual Status DropColumnFamily(ColumnFamilyHandle* column_family) override; + + Status TryLock(PessimisticTransaction* txn, uint32_t cfh_id, + const std::string& key, bool exclusive); + + void 
UnLock(PessimisticTransaction* txn, const TransactionKeyMap* keys); + void UnLock(PessimisticTransaction* txn, uint32_t cfh_id, + const std::string& key); + + void AddColumnFamily(const ColumnFamilyHandle* handle); + + static TransactionDBOptions ValidateTxnDBOptions( + const TransactionDBOptions& txn_db_options); + + const TransactionDBOptions& GetTxnDBOptions() const { + return txn_db_options_; + } + + void InsertExpirableTransaction(TransactionID tx_id, + PessimisticTransaction* tx); + void RemoveExpirableTransaction(TransactionID tx_id); + + // If transaction is no longer available, locks can be stolen + // If transaction is available, try stealing locks directly from transaction + // It is the caller's responsibility to ensure that the referred transaction + // is expirable (GetExpirationTime() > 0) and that it is expired. + bool TryStealingExpiredTransactionLocks(TransactionID tx_id); + + Transaction* GetTransactionByName(const TransactionName& name) override; + + void RegisterTransaction(Transaction* txn); + void UnregisterTransaction(Transaction* txn); + + // not thread safe. current use case is during recovery (single thread) + void GetAllPreparedTransactions(std::vector* trans) override; + + TransactionLockMgr::LockStatusData GetLockStatusData() override; + + std::vector GetDeadlockInfoBuffer() override; + void SetDeadlockInfoBufferSize(uint32_t target_size) override; + + struct CommitEntry { + uint64_t prep_seq; + uint64_t commit_seq; + CommitEntry() : prep_seq(0), commit_seq(0) {} + CommitEntry(uint64_t ps, uint64_t cs) : prep_seq(ps), commit_seq(cs) {} + }; + + protected: + void ReinitializeTransaction( + Transaction* txn, const WriteOptions& write_options, + const TransactionOptions& txn_options = TransactionOptions()); + DBImpl* db_impl_; + std::shared_ptr info_log_; + + private: + friend class WritePreparedTxnDB; + const TransactionDBOptions txn_db_options_; + TransactionLockMgr lock_mgr_; + + // Must be held when adding/dropping column families. 
+ InstrumentedMutex column_family_mutex_; + Transaction* BeginInternalTransaction(const WriteOptions& options); + + // Used to ensure that no locks are stolen from an expirable transaction + // that has started a commit. Only transactions with an expiration time + // should be in this map. + std::mutex map_mutex_; + std::unordered_map + expirable_transactions_map_; + + // map from name to two phase transaction instance + std::mutex name_map_mutex_; + std::unordered_map transactions_; +}; + +// A PessimisticTransactionDB that writes the data to the DB after the commit. +// In this way the DB only contains the committed data. +class WriteCommittedTxnDB : public PessimisticTransactionDB { + public: + explicit WriteCommittedTxnDB(DB* db, + const TransactionDBOptions& txn_db_options) + : PessimisticTransactionDB(db, txn_db_options) {} + + explicit WriteCommittedTxnDB(StackableDB* db, + const TransactionDBOptions& txn_db_options) + : PessimisticTransactionDB(db, txn_db_options) {} + + virtual ~WriteCommittedTxnDB() {} + + Transaction* BeginTransaction(const WriteOptions& write_options, + const TransactionOptions& txn_options, + Transaction* old_txn) override; +}; + +// A PessimisticTransactionDB that writes data to DB after prepare phase of 2PC. +// In this way some data in the DB might not be committed. The DB provides +// mechanisms to tell such data apart from committed data. 
+class WritePreparedTxnDB : public PessimisticTransactionDB { + public: + explicit WritePreparedTxnDB(DB* db, + const TransactionDBOptions& txn_db_options) + : PessimisticTransactionDB(db, txn_db_options), + SNAPSHOT_CACHE_SIZE(DEF_SNAPSHOT_CACHE_SIZE), + COMMIT_CACHE_SIZE(DEF_COMMIT_CACHE_SIZE) { + init(txn_db_options); + } + + explicit WritePreparedTxnDB(StackableDB* db, + const TransactionDBOptions& txn_db_options) + : PessimisticTransactionDB(db, txn_db_options), + SNAPSHOT_CACHE_SIZE(DEF_SNAPSHOT_CACHE_SIZE), + COMMIT_CACHE_SIZE(DEF_COMMIT_CACHE_SIZE) { + init(txn_db_options); + } + + virtual ~WritePreparedTxnDB() {} + + Transaction* BeginTransaction(const WriteOptions& write_options, + const TransactionOptions& txn_options, + Transaction* old_txn) override; + + // Check whether the transaction that wrote the value with seqeunce number seq + // is visible to the snapshot with sequence number snapshot_seq + bool IsInSnapshot(uint64_t seq, uint64_t snapshot_seq); + // Add the trasnaction with prepare sequence seq to the prepared list + void AddPrepared(uint64_t seq); + // Add the transaction with prepare sequence prepare_seq and commit sequence + // commit_seq to the commit map + void AddCommitted(uint64_t prepare_seq, uint64_t commit_seq); + + private: + friend class WritePreparedTransactionTest_IsInSnapshotTest_Test; + + void init(const TransactionDBOptions& /* unused */) { + snapshot_cache_ = unique_ptr[]>( + new std::atomic[SNAPSHOT_CACHE_SIZE] {}); + commit_cache_ = + unique_ptr(new CommitEntry[COMMIT_CACHE_SIZE]{}); + } + + // A heap with the amortized O(1) complexity for erase. It uses one extra heap + // to keep track of erased entries that are not yet on top of the main heap. 
+ class PreparedHeap { + std::priority_queue, std::greater> + heap_; + std::priority_queue, std::greater> + erased_heap_; + + public: + bool empty() { return heap_.empty(); } + uint64_t top() { return heap_.top(); } + void push(uint64_t v) { heap_.push(v); } + void pop() { + heap_.pop(); + while (!heap_.empty() && !erased_heap_.empty() && + heap_.top() == erased_heap_.top()) { + heap_.pop(); + erased_heap_.pop(); + } + } + void erase(uint64_t seq) { + if (!heap_.empty()) { + if (seq < heap_.top()) { + // Already popped, ignore it. + } else if (heap_.top() == seq) { + heap_.pop(); + } else { // (heap_.top() > seq) + // Down the heap, remember to pop it later + erased_heap_.push(seq); + } + } + } + }; + + // Get the commit entry with index indexed_seq from the commit table. It + // returns true if such entry exists. + bool GetCommitEntry(uint64_t indexed_seq, CommitEntry* entry); + // Rewrite the entry with the index indexed_seq in the commit table with the + // commit entry . If the rewrite results into eviction, + // sets the evicted_entry and returns true. + bool AddCommitEntry(uint64_t indexed_seq, CommitEntry& new_entry, + CommitEntry* evicted_entry); + // Rewrite the entry with the index indexed_seq in the commit table with the + // commit entry new_entry only if the existing entry matches the + // expected_entry. Returns false otherwise. + bool ExchangeCommitEntry(uint64_t indexed_seq, CommitEntry& expected_entry, + CommitEntry new_entry); + + // Add a new entry to old_commit_map_ if prep_seq <= snapshot_seq < + // commit_seq. Return false if checking the next snapshot(s) is not needed. + // This is the case if the entry already added to old_commit_map_ or none of + // the next snapshots could satisfy the condition. 
next_is_larger: the next + // snapshot will be a larger value + bool MaybeUpdateOldCommitMap(const uint64_t& prep_seq, + const uint64_t& commit_seq, + const uint64_t& snapshot_seq, + const bool next_is_larger); + + // The list of live snapshots at the last time that max_evicted_seq_ advanced. + // The list stored into two data structures: in snapshot_cache_ that is + // efficient for concurrent reads, and in snapshots_ if the data does not fit + // into snapshot_cache_. The total number of snapshots in the two lists + std::atomic snapshots_total_ = {}; + // The list sorted in ascending order. Thread-safety for writes is provided + // with snapshots_mutex_ and concurrent reads are safe due to std::atomic for + // each entry. In x86_64 architecture such reads are compiled to simple read + // instructions. 128 entries + // TODO(myabandeh): avoid non-const static variables + static size_t DEF_SNAPSHOT_CACHE_SIZE; + const size_t SNAPSHOT_CACHE_SIZE; + unique_ptr[]> snapshot_cache_; + // 2nd list for storing snapshots. The list sorted in ascending order. + // Thread-safety is provided with snapshots_mutex_. + std::vector snapshots_; + // The version of the latest list of snapshots. This can be used to avoid + // rewrittiing a list that is concurrently updated with a more recent version. + SequenceNumber snapshots_version_ = 0; + + // A heap of prepared transactions. Thread-safety is provided with + // prepared_mutex_. + PreparedHeap prepared_txns_; + // TODO(myabandeh): avoid non-const static variables + static size_t DEF_COMMIT_CACHE_SIZE; + const size_t COMMIT_CACHE_SIZE; + // commit_cache_ must be initialized to zero to tell apart an empty index from + // a filled one. Thread-safety is provided with commit_cache_mutex_. + unique_ptr commit_cache_; + // The largest evicted *commit* sequence number from the commit_cache_ + std::atomic max_evicted_seq_ = {}; + // A map of the evicted entries from commit_cache_ that has to be kept around + // to service the old snapshots. 
This is expected to be empty normally. + // Thread-safety is provided with old_commit_map_mutex_. + std::map old_commit_map_; + // A set of long-running prepared transactions that are not finished by the + // time max_evicted_seq_ advances their sequence number. This is expected to + // be empty normally. Thread-safety is provided with prepared_mutex_. + std::set delayed_prepared_; + // Update when delayed_prepared_.empty() changes. Expected to be true + // normally. + std::atomic delayed_prepared_empty_ = {true}; + // Update when old_commit_map_.empty() changes. Expected to be true normally. + std::atomic old_commit_map_empty_ = {true}; + port::RWMutex prepared_mutex_; + port::RWMutex old_commit_map_mutex_; + port::RWMutex commit_cache_mutex_; + port::RWMutex snapshots_mutex_; +}; + +} // namespace rocksdb +#endif // ROCKSDB_LITE diff --git a/utilities/transactions/transaction_base.cc b/utilities/transactions/transaction_base.cc index 0357c113f..4612dfa54 100644 --- a/utilities/transactions/transaction_base.cc +++ b/utilities/transactions/transaction_base.cc @@ -181,8 +181,21 @@ Status TransactionBaseImpl::RollbackToSavePoint() { Status TransactionBaseImpl::Get(const ReadOptions& read_options, ColumnFamilyHandle* column_family, const Slice& key, std::string* value) { + assert(value != nullptr); + PinnableSlice pinnable_val(value); + assert(!pinnable_val.IsPinned()); + auto s = Get(read_options, column_family, key, &pinnable_val); + if (s.ok() && pinnable_val.IsPinned()) { + value->assign(pinnable_val.data(), pinnable_val.size()); + } // else value is already assigned + return s; +} + +Status TransactionBaseImpl::Get(const ReadOptions& read_options, + ColumnFamilyHandle* column_family, + const Slice& key, PinnableSlice* pinnable_val) { return write_batch_.GetFromBatchAndDB(db_, read_options, column_family, key, - value); + pinnable_val); } Status TransactionBaseImpl::GetForUpdate(const ReadOptions& read_options, @@ -192,7 +205,26 @@ Status 
TransactionBaseImpl::GetForUpdate(const ReadOptions& read_options, Status s = TryLock(column_family, key, true /* read_only */, exclusive); if (s.ok() && value != nullptr) { - s = Get(read_options, column_family, key, value); + assert(value != nullptr); + PinnableSlice pinnable_val(value); + assert(!pinnable_val.IsPinned()); + s = Get(read_options, column_family, key, &pinnable_val); + if (s.ok() && pinnable_val.IsPinned()) { + value->assign(pinnable_val.data(), pinnable_val.size()); + } // else value is already assigned + } + return s; +} + +Status TransactionBaseImpl::GetForUpdate(const ReadOptions& read_options, + ColumnFamilyHandle* column_family, + const Slice& key, + PinnableSlice* pinnable_val, + bool exclusive) { + Status s = TryLock(column_family, key, true /* read_only */, exclusive); + + if (s.ok() && pinnable_val != nullptr) { + s = Get(read_options, column_family, key, pinnable_val); } return s; } diff --git a/utilities/transactions/transaction_base.h b/utilities/transactions/transaction_base.h index 151483648..c73b329f4 100644 --- a/utilities/transactions/transaction_base.h +++ b/utilities/transactions/transaction_base.h @@ -46,18 +46,27 @@ class TransactionBaseImpl : public Transaction { Status RollbackToSavePoint() override; + using Transaction::Get; Status Get(const ReadOptions& options, ColumnFamilyHandle* column_family, const Slice& key, std::string* value) override; + Status Get(const ReadOptions& options, ColumnFamilyHandle* column_family, + const Slice& key, PinnableSlice* value) override; + Status Get(const ReadOptions& options, const Slice& key, std::string* value) override { return Get(options, db_->DefaultColumnFamily(), key, value); } + using Transaction::GetForUpdate; Status GetForUpdate(const ReadOptions& options, ColumnFamilyHandle* column_family, const Slice& key, std::string* value, bool exclusive) override; + Status GetForUpdate(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* 
pinnable_val, bool exclusive) override; + Status GetForUpdate(const ReadOptions& options, const Slice& key, std::string* value, bool exclusive) override { return GetForUpdate(options, db_->DefaultColumnFamily(), key, value, diff --git a/utilities/transactions/transaction_db_impl.cc b/utilities/transactions/transaction_db_impl.cc deleted file mode 100644 index 2c425dd8d..000000000 --- a/utilities/transactions/transaction_db_impl.cc +++ /dev/null @@ -1,466 +0,0 @@ -// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. -// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). - -#ifndef ROCKSDB_LITE - -#include "utilities/transactions/transaction_db_impl.h" - -#include -#include -#include - -#include "db/db_impl.h" -#include "rocksdb/db.h" -#include "rocksdb/options.h" -#include "rocksdb/utilities/transaction_db.h" -#include "utilities/transactions/transaction_db_mutex_impl.h" -#include "utilities/transactions/transaction_impl.h" - -namespace rocksdb { - -TransactionDBImpl::TransactionDBImpl(DB* db, - const TransactionDBOptions& txn_db_options) - : TransactionDB(db), - db_impl_(dynamic_cast(db)), - txn_db_options_(txn_db_options), - lock_mgr_(this, txn_db_options_.num_stripes, txn_db_options.max_num_locks, - txn_db_options_.custom_mutex_factory - ? 
txn_db_options_.custom_mutex_factory - : std::shared_ptr( - new TransactionDBMutexFactoryImpl())) { - assert(db_impl_ != nullptr); -} - -// Support initiliazing TransactionDBImpl from a stackable db -// -// TransactionDBImpl -// ^ ^ -// | | -// | + -// | StackableDB -// | ^ -// | | -// + + -// DBImpl -// ^ -// |(inherit) -// + -// DB -// -TransactionDBImpl::TransactionDBImpl(StackableDB* db, - const TransactionDBOptions& txn_db_options) - : TransactionDB(db), - db_impl_(dynamic_cast(db->GetRootDB())), - txn_db_options_(txn_db_options), - lock_mgr_(this, txn_db_options_.num_stripes, txn_db_options.max_num_locks, - txn_db_options_.custom_mutex_factory - ? txn_db_options_.custom_mutex_factory - : std::shared_ptr( - new TransactionDBMutexFactoryImpl())) { - assert(db_impl_ != nullptr); -} - -TransactionDBImpl::~TransactionDBImpl() { - while (!transactions_.empty()) { - delete transactions_.begin()->second; - } -} - -Status TransactionDBImpl::Initialize( - const std::vector& compaction_enabled_cf_indices, - const std::vector& handles) { - for (auto cf_ptr : handles) { - AddColumnFamily(cf_ptr); - } - // Re-enable compaction for the column families that initially had - // compaction enabled. 
- std::vector compaction_enabled_cf_handles; - compaction_enabled_cf_handles.reserve(compaction_enabled_cf_indices.size()); - for (auto index : compaction_enabled_cf_indices) { - compaction_enabled_cf_handles.push_back(handles[index]); - } - - Status s = EnableAutoCompaction(compaction_enabled_cf_handles); - - // create 'real' transactions from recovered shell transactions - auto dbimpl = reinterpret_cast(GetRootDB()); - assert(dbimpl != nullptr); - auto rtrxs = dbimpl->recovered_transactions(); - - for (auto it = rtrxs.begin(); it != rtrxs.end(); it++) { - auto recovered_trx = it->second; - assert(recovered_trx); - assert(recovered_trx->log_number_); - assert(recovered_trx->name_.length()); - - WriteOptions w_options; - w_options.sync = true; - TransactionOptions t_options; - - Transaction* real_trx = BeginTransaction(w_options, t_options, nullptr); - assert(real_trx); - real_trx->SetLogNumber(recovered_trx->log_number_); - - s = real_trx->SetName(recovered_trx->name_); - if (!s.ok()) { - break; - } - - s = real_trx->RebuildFromWriteBatch(recovered_trx->batch_); - real_trx->SetState(Transaction::PREPARED); - if (!s.ok()) { - break; - } - } - if (s.ok()) { - dbimpl->DeleteAllRecoveredTransactions(); - } - return s; -} - -Transaction* TransactionDBImpl::BeginTransaction( - const WriteOptions& write_options, const TransactionOptions& txn_options, - Transaction* old_txn) { - if (old_txn != nullptr) { - ReinitializeTransaction(old_txn, write_options, txn_options); - return old_txn; - } else { - return new TransactionImpl(this, write_options, txn_options); - } -} - -TransactionDBOptions TransactionDBImpl::ValidateTxnDBOptions( - const TransactionDBOptions& txn_db_options) { - TransactionDBOptions validated = txn_db_options; - - if (txn_db_options.num_stripes == 0) { - validated.num_stripes = 1; - } - - return validated; -} - -Status TransactionDB::Open(const Options& options, - const TransactionDBOptions& txn_db_options, - const std::string& dbname, TransactionDB** 
dbptr) { - DBOptions db_options(options); - ColumnFamilyOptions cf_options(options); - std::vector column_families; - column_families.push_back( - ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options)); - std::vector handles; - Status s = TransactionDB::Open(db_options, txn_db_options, dbname, - column_families, &handles, dbptr); - if (s.ok()) { - assert(handles.size() == 1); - // i can delete the handle since DBImpl is always holding a reference to - // default column family - delete handles[0]; - } - - return s; -} - -Status TransactionDB::Open( - const DBOptions& db_options, const TransactionDBOptions& txn_db_options, - const std::string& dbname, - const std::vector& column_families, - std::vector* handles, TransactionDB** dbptr) { - Status s; - DB* db; - - std::vector column_families_copy = column_families; - std::vector compaction_enabled_cf_indices; - DBOptions db_options_2pc = db_options; - PrepareWrap(&db_options_2pc, &column_families_copy, - &compaction_enabled_cf_indices); - s = DB::Open(db_options_2pc, dbname, column_families_copy, handles, &db); - if (s.ok()) { - s = WrapDB(db, txn_db_options, compaction_enabled_cf_indices, *handles, - dbptr); - } - return s; -} - -void TransactionDB::PrepareWrap( - DBOptions* db_options, std::vector* column_families, - std::vector* compaction_enabled_cf_indices) { - compaction_enabled_cf_indices->clear(); - - // Enable MemTable History if not already enabled - for (size_t i = 0; i < column_families->size(); i++) { - ColumnFamilyOptions* cf_options = &(*column_families)[i].options; - - if (cf_options->max_write_buffer_number_to_maintain == 0) { - // Setting to -1 will set the History size to max_write_buffer_number. 
- cf_options->max_write_buffer_number_to_maintain = -1; - } - if (!cf_options->disable_auto_compactions) { - // Disable compactions momentarily to prevent race with DB::Open - cf_options->disable_auto_compactions = true; - compaction_enabled_cf_indices->push_back(i); - } - } - db_options->allow_2pc = true; -} - -Status TransactionDB::WrapDB( - // make sure this db is already opened with memtable history enabled, - // auto compaction distabled and 2 phase commit enabled - DB* db, const TransactionDBOptions& txn_db_options, - const std::vector& compaction_enabled_cf_indices, - const std::vector& handles, TransactionDB** dbptr) { - TransactionDBImpl* txn_db = new TransactionDBImpl( - db, TransactionDBImpl::ValidateTxnDBOptions(txn_db_options)); - *dbptr = txn_db; - Status s = txn_db->Initialize(compaction_enabled_cf_indices, handles); - return s; -} - -Status TransactionDB::WrapStackableDB( - // make sure this stackable_db is already opened with memtable history - // enabled, - // auto compaction distabled and 2 phase commit enabled - StackableDB* db, const TransactionDBOptions& txn_db_options, - const std::vector& compaction_enabled_cf_indices, - const std::vector& handles, TransactionDB** dbptr) { - TransactionDBImpl* txn_db = new TransactionDBImpl( - db, TransactionDBImpl::ValidateTxnDBOptions(txn_db_options)); - *dbptr = txn_db; - Status s = txn_db->Initialize(compaction_enabled_cf_indices, handles); - return s; -} - -// Let TransactionLockMgr know that this column family exists so it can -// allocate a LockMap for it. 
-void TransactionDBImpl::AddColumnFamily(const ColumnFamilyHandle* handle) { - lock_mgr_.AddColumnFamily(handle->GetID()); -} - -Status TransactionDBImpl::CreateColumnFamily( - const ColumnFamilyOptions& options, const std::string& column_family_name, - ColumnFamilyHandle** handle) { - InstrumentedMutexLock l(&column_family_mutex_); - - Status s = db_->CreateColumnFamily(options, column_family_name, handle); - if (s.ok()) { - lock_mgr_.AddColumnFamily((*handle)->GetID()); - } - - return s; -} - -// Let TransactionLockMgr know that it can deallocate the LockMap for this -// column family. -Status TransactionDBImpl::DropColumnFamily(ColumnFamilyHandle* column_family) { - InstrumentedMutexLock l(&column_family_mutex_); - - Status s = db_->DropColumnFamily(column_family); - if (s.ok()) { - lock_mgr_.RemoveColumnFamily(column_family->GetID()); - } - - return s; -} - -Status TransactionDBImpl::TryLock(TransactionImpl* txn, uint32_t cfh_id, - const std::string& key, bool exclusive) { - return lock_mgr_.TryLock(txn, cfh_id, key, GetEnv(), exclusive); -} - -void TransactionDBImpl::UnLock(TransactionImpl* txn, - const TransactionKeyMap* keys) { - lock_mgr_.UnLock(txn, keys, GetEnv()); -} - -void TransactionDBImpl::UnLock(TransactionImpl* txn, uint32_t cfh_id, - const std::string& key) { - lock_mgr_.UnLock(txn, cfh_id, key, GetEnv()); -} - -// Used when wrapping DB write operations in a transaction -Transaction* TransactionDBImpl::BeginInternalTransaction( - const WriteOptions& options) { - TransactionOptions txn_options; - Transaction* txn = BeginTransaction(options, txn_options, nullptr); - - // Use default timeout for non-transactional writes - txn->SetLockTimeout(txn_db_options_.default_lock_timeout); - return txn; -} - -// All user Put, Merge, Delete, and Write requests must be intercepted to make -// sure that they lock all keys that they are writing to avoid causing conflicts -// with any concurent transactions. 
The easiest way to do this is to wrap all -// write operations in a transaction. -// -// Put(), Merge(), and Delete() only lock a single key per call. Write() will -// sort its keys before locking them. This guarantees that TransactionDB write -// methods cannot deadlock with eachother (but still could deadlock with a -// Transaction). -Status TransactionDBImpl::Put(const WriteOptions& options, - ColumnFamilyHandle* column_family, - const Slice& key, const Slice& val) { - Status s; - - Transaction* txn = BeginInternalTransaction(options); - txn->DisableIndexing(); - - // Since the client didn't create a transaction, they don't care about - // conflict checking for this write. So we just need to do PutUntracked(). - s = txn->PutUntracked(column_family, key, val); - - if (s.ok()) { - s = txn->Commit(); - } - - delete txn; - - return s; -} - -Status TransactionDBImpl::Delete(const WriteOptions& wopts, - ColumnFamilyHandle* column_family, - const Slice& key) { - Status s; - - Transaction* txn = BeginInternalTransaction(wopts); - txn->DisableIndexing(); - - // Since the client didn't create a transaction, they don't care about - // conflict checking for this write. So we just need to do - // DeleteUntracked(). - s = txn->DeleteUntracked(column_family, key); - - if (s.ok()) { - s = txn->Commit(); - } - - delete txn; - - return s; -} - -Status TransactionDBImpl::Merge(const WriteOptions& options, - ColumnFamilyHandle* column_family, - const Slice& key, const Slice& value) { - Status s; - - Transaction* txn = BeginInternalTransaction(options); - txn->DisableIndexing(); - - // Since the client didn't create a transaction, they don't care about - // conflict checking for this write. So we just need to do - // MergeUntracked(). 
- s = txn->MergeUntracked(column_family, key, value); - - if (s.ok()) { - s = txn->Commit(); - } - - delete txn; - - return s; -} - -Status TransactionDBImpl::Write(const WriteOptions& opts, WriteBatch* updates) { - // Need to lock all keys in this batch to prevent write conflicts with - // concurrent transactions. - Transaction* txn = BeginInternalTransaction(opts); - txn->DisableIndexing(); - - assert(dynamic_cast(txn) != nullptr); - auto txn_impl = reinterpret_cast(txn); - - // Since commitBatch sorts the keys before locking, concurrent Write() - // operations will not cause a deadlock. - // In order to avoid a deadlock with a concurrent Transaction, Transactions - // should use a lock timeout. - Status s = txn_impl->CommitBatch(updates); - - delete txn; - - return s; -} - -void TransactionDBImpl::InsertExpirableTransaction(TransactionID tx_id, - TransactionImpl* tx) { - assert(tx->GetExpirationTime() > 0); - std::lock_guard lock(map_mutex_); - expirable_transactions_map_.insert({tx_id, tx}); -} - -void TransactionDBImpl::RemoveExpirableTransaction(TransactionID tx_id) { - std::lock_guard lock(map_mutex_); - expirable_transactions_map_.erase(tx_id); -} - -bool TransactionDBImpl::TryStealingExpiredTransactionLocks( - TransactionID tx_id) { - std::lock_guard lock(map_mutex_); - - auto tx_it = expirable_transactions_map_.find(tx_id); - if (tx_it == expirable_transactions_map_.end()) { - return true; - } - TransactionImpl& tx = *(tx_it->second); - return tx.TryStealingLocks(); -} - -void TransactionDBImpl::ReinitializeTransaction( - Transaction* txn, const WriteOptions& write_options, - const TransactionOptions& txn_options) { - assert(dynamic_cast(txn) != nullptr); - auto txn_impl = reinterpret_cast(txn); - - txn_impl->Reinitialize(this, write_options, txn_options); -} - -Transaction* TransactionDBImpl::GetTransactionByName( - const TransactionName& name) { - std::lock_guard lock(name_map_mutex_); - auto it = transactions_.find(name); - if (it == 
transactions_.end()) { - return nullptr; - } else { - return it->second; - } -} - -void TransactionDBImpl::GetAllPreparedTransactions( - std::vector* transv) { - assert(transv); - transv->clear(); - std::lock_guard lock(name_map_mutex_); - for (auto it = transactions_.begin(); it != transactions_.end(); it++) { - if (it->second->GetState() == Transaction::PREPARED) { - transv->push_back(it->second); - } - } -} - -TransactionLockMgr::LockStatusData TransactionDBImpl::GetLockStatusData() { - return lock_mgr_.GetLockStatusData(); -} - -void TransactionDBImpl::RegisterTransaction(Transaction* txn) { - assert(txn); - assert(txn->GetName().length() > 0); - assert(GetTransactionByName(txn->GetName()) == nullptr); - assert(txn->GetState() == Transaction::STARTED); - std::lock_guard lock(name_map_mutex_); - transactions_[txn->GetName()] = txn; -} - -void TransactionDBImpl::UnregisterTransaction(Transaction* txn) { - assert(txn); - std::lock_guard lock(name_map_mutex_); - auto it = transactions_.find(txn->GetName()); - assert(it != transactions_.end()); - transactions_.erase(it); -} - -} // namespace rocksdb -#endif // ROCKSDB_LITE diff --git a/utilities/transactions/transaction_db_impl.h b/utilities/transactions/transaction_db_impl.h deleted file mode 100644 index 428512e82..000000000 --- a/utilities/transactions/transaction_db_impl.h +++ /dev/null @@ -1,127 +0,0 @@ -// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. -// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). 
- -#pragma once -#ifndef ROCKSDB_LITE - -#include -#include -#include -#include -#include - -#include "rocksdb/db.h" -#include "rocksdb/options.h" -#include "rocksdb/utilities/transaction_db.h" -#include "utilities/transactions/transaction_impl.h" -#include "utilities/transactions/transaction_lock_mgr.h" - -namespace rocksdb { - -class TransactionDBImpl : public TransactionDB { - public: - explicit TransactionDBImpl(DB* db, - const TransactionDBOptions& txn_db_options); - - explicit TransactionDBImpl(StackableDB* db, - const TransactionDBOptions& txn_db_options); - - ~TransactionDBImpl(); - - Status Initialize(const std::vector& compaction_enabled_cf_indices, - const std::vector& handles); - - Transaction* BeginTransaction(const WriteOptions& write_options, - const TransactionOptions& txn_options, - Transaction* old_txn) override; - - using StackableDB::Put; - virtual Status Put(const WriteOptions& options, - ColumnFamilyHandle* column_family, const Slice& key, - const Slice& val) override; - - using StackableDB::Delete; - virtual Status Delete(const WriteOptions& wopts, - ColumnFamilyHandle* column_family, - const Slice& key) override; - - using StackableDB::Merge; - virtual Status Merge(const WriteOptions& options, - ColumnFamilyHandle* column_family, const Slice& key, - const Slice& value) override; - - using StackableDB::Write; - virtual Status Write(const WriteOptions& opts, WriteBatch* updates) override; - - using StackableDB::CreateColumnFamily; - virtual Status CreateColumnFamily(const ColumnFamilyOptions& options, - const std::string& column_family_name, - ColumnFamilyHandle** handle) override; - - using StackableDB::DropColumnFamily; - virtual Status DropColumnFamily(ColumnFamilyHandle* column_family) override; - - Status TryLock(TransactionImpl* txn, uint32_t cfh_id, const std::string& key, - bool exclusive); - - void UnLock(TransactionImpl* txn, const TransactionKeyMap* keys); - void UnLock(TransactionImpl* txn, uint32_t cfh_id, const std::string& key); 
- - void AddColumnFamily(const ColumnFamilyHandle* handle); - - static TransactionDBOptions ValidateTxnDBOptions( - const TransactionDBOptions& txn_db_options); - - const TransactionDBOptions& GetTxnDBOptions() const { - return txn_db_options_; - } - - void InsertExpirableTransaction(TransactionID tx_id, TransactionImpl* tx); - void RemoveExpirableTransaction(TransactionID tx_id); - - // If transaction is no longer available, locks can be stolen - // If transaction is available, try stealing locks directly from transaction - // It is the caller's responsibility to ensure that the referred transaction - // is expirable (GetExpirationTime() > 0) and that it is expired. - bool TryStealingExpiredTransactionLocks(TransactionID tx_id); - - Transaction* GetTransactionByName(const TransactionName& name) override; - - void RegisterTransaction(Transaction* txn); - void UnregisterTransaction(Transaction* txn); - - // not thread safe. current use case is during recovery (single thread) - void GetAllPreparedTransactions(std::vector* trans) override; - - TransactionLockMgr::LockStatusData GetLockStatusData() override; - - private: - void ReinitializeTransaction( - Transaction* txn, const WriteOptions& write_options, - const TransactionOptions& txn_options = TransactionOptions()); - - DBImpl* db_impl_; - const TransactionDBOptions txn_db_options_; - TransactionLockMgr lock_mgr_; - - // Must be held when adding/dropping column families. - InstrumentedMutex column_family_mutex_; - Transaction* BeginInternalTransaction(const WriteOptions& options); - Status WriteHelper(WriteBatch* updates, TransactionImpl* txn_impl); - - // Used to ensure that no locks are stolen from an expirable transaction - // that has started a commit. Only transactions with an expiration time - // should be in this map. 
- std::mutex map_mutex_; - std::unordered_map - expirable_transactions_map_; - - // map from name to two phase transaction instance - std::mutex name_map_mutex_; - std::unordered_map transactions_; -}; - -} // namespace rocksdb -#endif // ROCKSDB_LITE diff --git a/utilities/transactions/transaction_lock_mgr.cc b/utilities/transactions/transaction_lock_mgr.cc index a10aec17d..a72c2a12f 100644 --- a/utilities/transactions/transaction_lock_mgr.cc +++ b/utilities/transactions/transaction_lock_mgr.cc @@ -22,10 +22,11 @@ #include "rocksdb/slice.h" #include "rocksdb/utilities/transaction_db_mutex.h" +#include "util/cast_util.h" #include "util/murmurhash.h" #include "util/sync_point.h" #include "util/thread_local.h" -#include "utilities/transactions/transaction_db_impl.h" +#include "utilities/transactions/pessimistic_transaction_db.h" namespace rocksdb { @@ -95,6 +96,64 @@ struct LockMap { size_t GetStripe(const std::string& key) const; }; +void DeadlockInfoBuffer::AddNewPath(DeadlockPath path) { + std::lock_guard lock(paths_buffer_mutex_); + + if (paths_buffer_.empty()) { + return; + } + + paths_buffer_[buffer_idx_] = path; + buffer_idx_ = (buffer_idx_ + 1) % paths_buffer_.size(); +} + +void DeadlockInfoBuffer::Resize(uint32_t target_size) { + std::lock_guard lock(paths_buffer_mutex_); + + paths_buffer_ = Normalize(); + + // Drop the deadlocks that will no longer be needed ater the normalize + if (target_size < paths_buffer_.size()) { + paths_buffer_.erase( + paths_buffer_.begin(), + paths_buffer_.begin() + (paths_buffer_.size() - target_size)); + buffer_idx_ = 0; + } + // Resize the buffer to the target size and restore the buffer's idx + else { + auto prev_size = paths_buffer_.size(); + paths_buffer_.resize(target_size); + buffer_idx_ = (uint32_t)prev_size; + } +} + +std::vector DeadlockInfoBuffer::Normalize() { + auto working = paths_buffer_; + + if (working.empty()) { + return working; + } + + // Next write occurs at a nonexistent path's slot + if 
(paths_buffer_[buffer_idx_].empty()) { + working.resize(buffer_idx_); + } else { + std::rotate(working.begin(), working.begin() + buffer_idx_, working.end()); + } + + return working; +} + +std::vector DeadlockInfoBuffer::PrepareBuffer() { + std::lock_guard lock(paths_buffer_mutex_); + + // Reversing the normalized vector returns the latest deadlocks first + auto working = Normalize(); + std::reverse(working.begin(), working.end()); + + return working; +} + namespace { void UnrefLockMapsCache(void* ptr) { // Called when a thread exits or a ThreadLocalPtr gets destroyed. @@ -106,14 +165,17 @@ void UnrefLockMapsCache(void* ptr) { TransactionLockMgr::TransactionLockMgr( TransactionDB* txn_db, size_t default_num_stripes, int64_t max_num_locks, + uint32_t max_num_deadlocks, std::shared_ptr mutex_factory) : txn_db_impl_(nullptr), default_num_stripes_(default_num_stripes), max_num_locks_(max_num_locks), lock_maps_cache_(new ThreadLocalPtr(&UnrefLockMapsCache)), + dlock_buffer_(max_num_deadlocks), mutex_factory_(mutex_factory) { - txn_db_impl_ = dynamic_cast(txn_db); - assert(txn_db_impl_); + assert(txn_db); + txn_db_impl_ = + static_cast_with_check(txn_db); } TransactionLockMgr::~TransactionLockMgr() {} @@ -225,7 +287,7 @@ bool TransactionLockMgr::IsLockExpired(TransactionID txn_id, return expired; } -Status TransactionLockMgr::TryLock(TransactionImpl* txn, +Status TransactionLockMgr::TryLock(PessimisticTransaction* txn, uint32_t column_family_id, const std::string& key, Env* env, bool exclusive) { @@ -254,7 +316,7 @@ Status TransactionLockMgr::TryLock(TransactionImpl* txn, // Helper function for TryLock(). 
Status TransactionLockMgr::AcquireWithTimeout( - TransactionImpl* txn, LockMap* lock_map, LockMapStripe* stripe, + PessimisticTransaction* txn, LockMap* lock_map, LockMapStripe* stripe, uint32_t column_family_id, const std::string& key, Env* env, int64_t timeout, const LockInfo& lock_info) { Status result; @@ -307,7 +369,8 @@ Status TransactionLockMgr::AcquireWithTimeout( // detection. if (wait_ids.size() != 0) { if (txn->IsDeadlockDetect()) { - if (IncrementWaiters(txn, wait_ids)) { + if (IncrementWaiters(txn, wait_ids, key, column_family_id, + lock_info.exclusive)) { result = Status::Busy(Status::SubCode::kDeadlock); stripe->stripe_mutex->UnLock(); return result; @@ -355,13 +418,15 @@ Status TransactionLockMgr::AcquireWithTimeout( } void TransactionLockMgr::DecrementWaiters( - const TransactionImpl* txn, const autovector& wait_ids) { + const PessimisticTransaction* txn, + const autovector& wait_ids) { std::lock_guard lock(wait_txn_map_mutex_); DecrementWaitersImpl(txn, wait_ids); } void TransactionLockMgr::DecrementWaitersImpl( - const TransactionImpl* txn, const autovector& wait_ids) { + const PessimisticTransaction* txn, + const autovector& wait_ids) { auto id = txn->GetID(); assert(wait_txn_map_.Contains(id)); wait_txn_map_.Delete(id); @@ -375,12 +440,16 @@ void TransactionLockMgr::DecrementWaitersImpl( } bool TransactionLockMgr::IncrementWaiters( - const TransactionImpl* txn, const autovector& wait_ids) { + const PessimisticTransaction* txn, + const autovector& wait_ids, const std::string& key, + const uint32_t& cf_id, const bool& exclusive) { auto id = txn->GetID(); - std::vector queue(txn->GetDeadlockDetectDepth()); + std::vector queue_parents(txn->GetDeadlockDetectDepth()); + std::vector queue_values(txn->GetDeadlockDetectDepth()); std::lock_guard lock(wait_txn_map_mutex_); assert(!wait_txn_map_.Contains(id)); - wait_txn_map_.Insert(id, wait_ids); + + wait_txn_map_.Insert(id, {wait_ids, cf_id, key, exclusive}); for (auto wait_id : wait_ids) { if 
(rev_wait_txn_map_.Contains(wait_id)) { @@ -396,13 +465,15 @@ bool TransactionLockMgr::IncrementWaiters( } const auto* next_ids = &wait_ids; + int parent = -1; for (int tail = 0, head = 0; head < txn->GetDeadlockDetectDepth(); head++) { int i = 0; if (next_ids) { for (; i < static_cast(next_ids->size()) && tail + i < txn->GetDeadlockDetectDepth(); i++) { - queue[tail + i] = (*next_ids)[i]; + queue_values[tail + i] = (*next_ids)[i]; + queue_parents[tail + i] = parent; } tail += i; } @@ -412,19 +483,33 @@ bool TransactionLockMgr::IncrementWaiters( return false; } - auto next = queue[head]; + auto next = queue_values[head]; if (next == id) { + std::vector path; + while (head != -1) { + assert(wait_txn_map_.Contains(queue_values[head])); + + auto extracted_info = wait_txn_map_.Get(queue_values[head]); + path.push_back({queue_values[head], extracted_info.m_cf_id, + extracted_info.m_waiting_key, + extracted_info.m_exclusive}); + head = queue_parents[head]; + } + std::reverse(path.begin(), path.end()); + dlock_buffer_.AddNewPath(DeadlockPath(path)); DecrementWaitersImpl(txn, wait_ids); return true; } else if (!wait_txn_map_.Contains(next)) { next_ids = nullptr; continue; } else { - next_ids = &wait_txn_map_.Get(next); + parent = head; + next_ids = &(wait_txn_map_.Get(next).m_neighbors); } } // Wait cycle too big, just assume deadlock. 
+ dlock_buffer_.AddNewPath(DeadlockPath(true)); DecrementWaitersImpl(txn, wait_ids); return true; } @@ -499,7 +584,7 @@ Status TransactionLockMgr::AcquireLocked(LockMap* lock_map, return result; } -void TransactionLockMgr::UnLockKey(const TransactionImpl* txn, +void TransactionLockMgr::UnLockKey(const PessimisticTransaction* txn, const std::string& key, LockMapStripe* stripe, LockMap* lock_map, Env* env) { @@ -535,7 +620,8 @@ void TransactionLockMgr::UnLockKey(const TransactionImpl* txn, } } -void TransactionLockMgr::UnLock(TransactionImpl* txn, uint32_t column_family_id, +void TransactionLockMgr::UnLock(PessimisticTransaction* txn, + uint32_t column_family_id, const std::string& key, Env* env) { std::shared_ptr lock_map_ptr = GetLockMap(column_family_id); LockMap* lock_map = lock_map_ptr.get(); @@ -557,7 +643,7 @@ void TransactionLockMgr::UnLock(TransactionImpl* txn, uint32_t column_family_id, stripe->stripe_cv->NotifyAll(); } -void TransactionLockMgr::UnLock(const TransactionImpl* txn, +void TransactionLockMgr::UnLock(const PessimisticTransaction* txn, const TransactionKeyMap* key_map, Env* env) { for (auto& key_map_iter : *key_map) { uint32_t column_family_id = key_map_iter.first; @@ -644,6 +730,13 @@ TransactionLockMgr::LockStatusData TransactionLockMgr::GetLockStatusData() { return data; } +std::vector TransactionLockMgr::GetDeadlockInfoBuffer() { + return dlock_buffer_.PrepareBuffer(); +} + +void TransactionLockMgr::Resize(uint32_t target_size) { + dlock_buffer_.Resize(target_size); +} } // namespace rocksdb #endif // ROCKSDB_LITE diff --git a/utilities/transactions/transaction_lock_mgr.h b/utilities/transactions/transaction_lock_mgr.h index 6389f8d7d..abf7c5d3d 100644 --- a/utilities/transactions/transaction_lock_mgr.h +++ b/utilities/transactions/transaction_lock_mgr.h @@ -17,7 +17,7 @@ #include "util/autovector.h" #include "util/hash_map.h" #include "util/thread_local.h" -#include "utilities/transactions/transaction_impl.h" +#include 
"utilities/transactions/pessimistic_transaction.h" namespace rocksdb { @@ -26,13 +26,35 @@ struct LockInfo; struct LockMap; struct LockMapStripe; +struct DeadlockInfoBuffer { + private: + std::vector paths_buffer_; + uint32_t buffer_idx_; + std::mutex paths_buffer_mutex_; + std::vector Normalize(); + + public: + explicit DeadlockInfoBuffer(uint32_t n_latest_dlocks) + : paths_buffer_(n_latest_dlocks), buffer_idx_(0) {} + void AddNewPath(DeadlockPath path); + void Resize(uint32_t target_size); + std::vector PrepareBuffer(); +}; + +struct TrackedTrxInfo { + autovector m_neighbors; + uint32_t m_cf_id; + std::string m_waiting_key; + bool m_exclusive; +}; + class Slice; -class TransactionDBImpl; +class PessimisticTransactionDB; class TransactionLockMgr { public: TransactionLockMgr(TransactionDB* txn_db, size_t default_num_stripes, - int64_t max_num_locks, + int64_t max_num_locks, uint32_t max_num_deadlocks, std::shared_ptr factory); ~TransactionLockMgr(); @@ -47,21 +69,23 @@ class TransactionLockMgr { // Attempt to lock key. If OK status is returned, the caller is responsible // for calling UnLock() on this key. - Status TryLock(TransactionImpl* txn, uint32_t column_family_id, + Status TryLock(PessimisticTransaction* txn, uint32_t column_family_id, const std::string& key, Env* env, bool exclusive); // Unlock a key locked by TryLock(). txn must be the same Transaction that // locked this key. 
- void UnLock(const TransactionImpl* txn, const TransactionKeyMap* keys, + void UnLock(const PessimisticTransaction* txn, const TransactionKeyMap* keys, Env* env); - void UnLock(TransactionImpl* txn, uint32_t column_family_id, + void UnLock(PessimisticTransaction* txn, uint32_t column_family_id, const std::string& key, Env* env); using LockStatusData = std::unordered_multimap; LockStatusData GetLockStatusData(); + std::vector GetDeadlockInfoBuffer(); + void Resize(uint32_t); private: - TransactionDBImpl* txn_db_impl_; + PessimisticTransactionDB* txn_db_impl_; // Default number of lock map stripes per column family const size_t default_num_stripes_; @@ -92,7 +116,8 @@ class TransactionLockMgr { // Maps from waitee -> number of waiters. HashMap rev_wait_txn_map_; // Maps from waiter -> waitee. - HashMap> wait_txn_map_; + HashMap wait_txn_map_; + DeadlockInfoBuffer dlock_buffer_; // Used to allocate mutexes/condvars to use when locking keys std::shared_ptr mutex_factory_; @@ -102,7 +127,7 @@ class TransactionLockMgr { std::shared_ptr GetLockMap(uint32_t column_family_id); - Status AcquireWithTimeout(TransactionImpl* txn, LockMap* lock_map, + Status AcquireWithTimeout(PessimisticTransaction* txn, LockMap* lock_map, LockMapStripe* stripe, uint32_t column_family_id, const std::string& key, Env* env, int64_t timeout, const LockInfo& lock_info); @@ -112,14 +137,16 @@ class TransactionLockMgr { const LockInfo& lock_info, uint64_t* wait_time, autovector* txn_ids); - void UnLockKey(const TransactionImpl* txn, const std::string& key, + void UnLockKey(const PessimisticTransaction* txn, const std::string& key, LockMapStripe* stripe, LockMap* lock_map, Env* env); - bool IncrementWaiters(const TransactionImpl* txn, - const autovector& wait_ids); - void DecrementWaiters(const TransactionImpl* txn, + bool IncrementWaiters(const PessimisticTransaction* txn, + const autovector& wait_ids, + const std::string& key, const uint32_t& cf_id, + const bool& exclusive); + void 
DecrementWaiters(const PessimisticTransaction* txn, const autovector& wait_ids); - void DecrementWaitersImpl(const TransactionImpl* txn, + void DecrementWaitersImpl(const PessimisticTransaction* txn, const autovector& wait_ids); // No copying allowed diff --git a/utilities/transactions/transaction_test.cc b/utilities/transactions/transaction_test.cc index 90cec396b..eac8e563d 100644 --- a/utilities/transactions/transaction_test.cc +++ b/utilities/transactions/transaction_test.cc @@ -5,6 +5,11 @@ #ifndef ROCKSDB_LITE +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS +#endif + +#include #include #include #include @@ -17,7 +22,6 @@ #include "rocksdb/utilities/transaction_db.h" #include "table/mock_table.h" #include "util/fault_injection_test_env.h" -#include "util/logging.h" #include "util/random.h" #include "util/string_util.h" #include "util/sync_point.h" @@ -26,6 +30,7 @@ #include "util/transaction_test_util.h" #include "utilities/merge_operators.h" #include "utilities/merge_operators/string_append/stringappend.h" +#include "utilities/transactions/pessimistic_transaction_db.h" #include "port/port.h" @@ -33,8 +38,8 @@ using std::string; namespace rocksdb { -class TransactionTest - : public ::testing::TestWithParam> { +class TransactionTest : public ::testing::TestWithParam< + std::tuple> { public: TransactionDB* db; FaultInjectionTestEnv* env; @@ -57,6 +62,7 @@ class TransactionTest DestroyDB(dbname, options); txn_db_options.transaction_lock_timeout = 0; txn_db_options.default_lock_timeout = 0; + txn_db_options.write_policy = std::get<2>(GetParam()); Status s; if (std::get<0>(GetParam()) == false) { s = TransactionDB::Open(options, txn_db_options, dbname, &db); @@ -123,16 +129,23 @@ class TransactionTest }; class MySQLStyleTransactionTest : public TransactionTest {}; +class WritePreparedTransactionTest : public TransactionTest {}; +static const TxnDBWritePolicy wc = WRITE_COMMITTED; +static const TxnDBWritePolicy wp = WRITE_PREPARED; +// TODO(myabandeh): 
Instantiate the tests with other write policies INSTANTIATE_TEST_CASE_P(DBAsBaseDB, TransactionTest, - ::testing::Values(std::make_tuple(false, false))); + ::testing::Values(std::make_tuple(false, false, wc))); INSTANTIATE_TEST_CASE_P(StackableDBAsBaseDB, TransactionTest, - ::testing::Values(std::make_tuple(true, false))); + ::testing::Values(std::make_tuple(true, false, wc))); INSTANTIATE_TEST_CASE_P(MySQLStyleTransactionTest, MySQLStyleTransactionTest, - ::testing::Values(std::make_tuple(false, false), - std::make_tuple(false, true), - std::make_tuple(true, false), - std::make_tuple(true, true))); + ::testing::Values(std::make_tuple(false, false, wc), + std::make_tuple(false, true, wc), + std::make_tuple(true, false, wc), + std::make_tuple(true, true, wc))); +INSTANTIATE_TEST_CASE_P(WritePreparedTransactionTest, + WritePreparedTransactionTest, + ::testing::Values(std::make_tuple(false, true, wp))); TEST_P(TransactionTest, DoubleEmptyWrite) { WriteOptions write_options; @@ -242,7 +255,7 @@ TEST_P(TransactionTest, WaitingTxn) { // Column family is 1 or 0 (cfa). if (cf_iterator->first != 1 && cf_iterator->first != 0) { - ASSERT_FALSE(true); + FAIL(); } // The locked key is "foo" and is locked by txn1 ASSERT_EQ(cf_iterator->second.key, "foo"); @@ -253,7 +266,7 @@ TEST_P(TransactionTest, WaitingTxn) { // Column family is 0 (default) or 1. if (cf_iterator->first != 1 && cf_iterator->first != 0) { - ASSERT_FALSE(true); + FAIL(); } // The locked key is "foo" and is locked by txn1 ASSERT_EQ(cf_iterator->second.key, "foo"); @@ -462,6 +475,37 @@ TEST_P(TransactionTest, DeadlockCycleShared) { auto s = txns[i]->GetForUpdate(read_options, "0", nullptr, true /* exclusive */); ASSERT_TRUE(s.IsDeadlock()); + + // Calculate next buffer len, plateau at 5 when 5 records are inserted. + const uint32_t curr_dlock_buffer_len_ = + (i - 14 > kInitialMaxDeadlocks) ? 
kInitialMaxDeadlocks : (i - 14); + + auto dlock_buffer = db->GetDeadlockInfoBuffer(); + ASSERT_EQ(dlock_buffer.size(), curr_dlock_buffer_len_); + auto dlock_entry = dlock_buffer[0].path; + ASSERT_EQ(dlock_entry.size(), kInitialMaxDeadlocks); + + int64_t curr_waiting_key = 0; + + // Offset of each txn id from the root of the shared dlock tree's txn id. + int64_t offset_root = dlock_entry[0].m_txn_id - 1; + // Offset of the final entry in the dlock path from the root's txn id. + TransactionID leaf_id = + dlock_entry[dlock_entry.size() - 1].m_txn_id - offset_root; + + for (auto it = dlock_entry.rbegin(); it != dlock_entry.rend(); it++) { + auto dl_node = *it; + ASSERT_EQ(dl_node.m_txn_id, offset_root + leaf_id); + ASSERT_EQ(dl_node.m_cf_id, 0); + ASSERT_EQ(dl_node.m_waiting_key, ToString(curr_waiting_key)); + ASSERT_EQ(dl_node.m_exclusive, true); + + if (curr_waiting_key == 0) { + curr_waiting_key = leaf_id; + } + curr_waiting_key /= 2; + leaf_id /= 2; + } } // Rollback the leaf transaction. @@ -473,6 +517,102 @@ TEST_P(TransactionTest, DeadlockCycleShared) { for (auto& t : threads) { t.join(); } + + // Downsize the buffer and verify the 3 latest deadlocks are preserved. + auto dlock_buffer_before_resize = db->GetDeadlockInfoBuffer(); + db->SetDeadlockInfoBufferSize(3); + auto dlock_buffer_after_resize = db->GetDeadlockInfoBuffer(); + ASSERT_EQ(dlock_buffer_after_resize.size(), 3); + + for (uint32_t i = 0; i < dlock_buffer_after_resize.size(); i++) { + for (uint32_t j = 0; j < dlock_buffer_after_resize[i].path.size(); j++) { + ASSERT_EQ(dlock_buffer_after_resize[i].path[j].m_txn_id, + dlock_buffer_before_resize[i].path[j].m_txn_id); + } + } + + // Upsize the buffer and verify the 3 latest dealocks are preserved. 
+ dlock_buffer_before_resize = db->GetDeadlockInfoBuffer(); + db->SetDeadlockInfoBufferSize(5); + dlock_buffer_after_resize = db->GetDeadlockInfoBuffer(); + ASSERT_EQ(dlock_buffer_after_resize.size(), 3); + + for (uint32_t i = 0; i < dlock_buffer_before_resize.size(); i++) { + for (uint32_t j = 0; j < dlock_buffer_before_resize[i].path.size(); j++) { + ASSERT_EQ(dlock_buffer_after_resize[i].path[j].m_txn_id, + dlock_buffer_before_resize[i].path[j].m_txn_id); + } + } + + // Downsize to 0 and verify the size is consistent. + dlock_buffer_before_resize = db->GetDeadlockInfoBuffer(); + db->SetDeadlockInfoBufferSize(0); + dlock_buffer_after_resize = db->GetDeadlockInfoBuffer(); + ASSERT_EQ(dlock_buffer_after_resize.size(), 0); + + // Upsize from 0 to verify the size is persistent. + dlock_buffer_before_resize = db->GetDeadlockInfoBuffer(); + db->SetDeadlockInfoBufferSize(3); + dlock_buffer_after_resize = db->GetDeadlockInfoBuffer(); + ASSERT_EQ(dlock_buffer_after_resize.size(), 0); + + // Contrived case of shared lock of cycle size 2 to verify that a shared + // lock causing a deadlock is correctly reported as "shared" in the buffer. + std::vector txns_shared(2); + + // Create a cycle of size 2. 
+ for (uint32_t i = 0; i < 2; i++) { + txns_shared[i] = db->BeginTransaction(write_options, txn_options); + ASSERT_TRUE(txns_shared[i]); + auto s = txns_shared[i]->GetForUpdate(read_options, ToString(i), nullptr); + ASSERT_OK(s); + } + + std::atomic checkpoints_shared(0); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "TransactionLockMgr::AcquireWithTimeout:WaitingTxn", + [&](void* arg) { checkpoints_shared.fetch_add(1); }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + std::vector threads_shared; + for (uint32_t i = 0; i < 1; i++) { + std::function blocking_thread = [&, i] { + auto s = + txns_shared[i]->GetForUpdate(read_options, ToString(i + 1), nullptr); + ASSERT_OK(s); + txns_shared[i]->Rollback(); + delete txns_shared[i]; + }; + threads_shared.emplace_back(blocking_thread); + } + + // Wait until all threads are waiting on each other. + while (checkpoints_shared.load() != 1) { + /* sleep override */ + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + } + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); + + // Complete the cycle T2 -> T1 with a shared lock. + auto s = txns_shared[1]->GetForUpdate(read_options, "0", nullptr, false); + ASSERT_TRUE(s.IsDeadlock()); + + auto dlock_buffer = db->GetDeadlockInfoBuffer(); + + // Verify the size of the buffer and the single path. + ASSERT_EQ(dlock_buffer.size(), 1); + ASSERT_EQ(dlock_buffer[0].path.size(), 2); + + // Verify the exclusivity field of the transactions in the deadlock path. 
+ ASSERT_TRUE(dlock_buffer[0].path[0].m_exclusive); + ASSERT_FALSE(dlock_buffer[0].path[1].m_exclusive); + txns_shared[1]->Rollback(); + delete txns_shared[1]; + + for (auto& t : threads_shared) { + t.join(); + } } TEST_P(TransactionTest, DeadlockCycle) { @@ -480,7 +620,8 @@ TEST_P(TransactionTest, DeadlockCycle) { ReadOptions read_options; TransactionOptions txn_options; - const uint32_t kMaxCycleLength = 50; + // offset by 2 from the max depth to test edge case + const uint32_t kMaxCycleLength = 52; txn_options.lock_timeout = 1000000; txn_options.deadlock_detect = true; @@ -489,6 +630,7 @@ TEST_P(TransactionTest, DeadlockCycle) { // Set up a long wait for chain like this: // // T1 -> T2 -> T3 -> ... -> Tlen + std::vector txns(len); for (uint32_t i = 0; i < len; i++) { @@ -509,8 +651,7 @@ TEST_P(TransactionTest, DeadlockCycle) { std::vector threads; for (uint32_t i = 0; i < len - 1; i++) { std::function blocking_thread = [&, i] { - auto s = - txns[i]->GetForUpdate(read_options, ToString(i + 1), nullptr); + auto s = txns[i]->GetForUpdate(read_options, ToString(i + 1), nullptr); ASSERT_OK(s); txns[i]->Rollback(); delete txns[i]; @@ -530,6 +671,39 @@ TEST_P(TransactionTest, DeadlockCycle) { auto s = txns[len - 1]->GetForUpdate(read_options, "0", nullptr); ASSERT_TRUE(s.IsDeadlock()); + const uint32_t dlock_buffer_size_ = (len - 1 > 5) ? 5 : (len - 1); + uint32_t curr_waiting_key = 0; + TransactionID curr_txn_id = txns[0]->GetID(); + + auto dlock_buffer = db->GetDeadlockInfoBuffer(); + ASSERT_EQ(dlock_buffer.size(), dlock_buffer_size_); + uint32_t check_len = len; + bool check_limit_flag = false; + + // Special case for a deadlock path that exceeds the maximum depth. + if (len > 50) { + check_len = 0; + check_limit_flag = true; + } + auto dlock_entry = dlock_buffer[0].path; + ASSERT_EQ(dlock_entry.size(), check_len); + ASSERT_EQ(dlock_buffer[0].limit_exceeded, check_limit_flag); + + // Iterates backwards over path verifying decreasing txn_ids. 
+ for (auto it = dlock_entry.rbegin(); it != dlock_entry.rend(); it++) { + auto dl_node = *it; + ASSERT_EQ(dl_node.m_txn_id, len + curr_txn_id - 1); + ASSERT_EQ(dl_node.m_cf_id, 0); + ASSERT_EQ(dl_node.m_waiting_key, ToString(curr_waiting_key)); + ASSERT_EQ(dl_node.m_exclusive, true); + + curr_txn_id--; + if (curr_waiting_key == 0) { + curr_waiting_key = len; + } + curr_waiting_key--; + } + // Rollback the last transaction. txns[len - 1]->Rollback(); delete txns[len - 1]; @@ -543,7 +717,7 @@ TEST_P(TransactionTest, DeadlockCycle) { TEST_P(TransactionTest, DeadlockStress) { const uint32_t NUM_TXN_THREADS = 10; const uint32_t NUM_KEYS = 100; - const uint32_t NUM_ITERS = 100000; + const uint32_t NUM_ITERS = 10000; WriteOptions write_options; ReadOptions read_options; @@ -1080,7 +1254,7 @@ TEST_P(TransactionTest, DISABLED_TwoPhaseMultiThreadTest) { env->SleepForMicroseconds(10); } } else { - ASSERT_TRUE(false); + FAIL(); } }); @@ -4559,6 +4733,138 @@ TEST_P(TransactionTest, MemoryLimitTest) { delete txn; } +// Test WritePreparedTxnDB's IsInSnapshot against different ordering of +// snapshot, max_committed_seq_, prepared, and commit entries. +TEST_P(WritePreparedTransactionTest, IsInSnapshotTest) { + WriteOptions wo; + // Use small commit cache to trigger lots of eviction and fast advance of + // max_evicted_seq_ + // will take effect after ReOpen + WritePreparedTxnDB::DEF_COMMIT_CACHE_SIZE = 8; + // Same for snapshot cache size + WritePreparedTxnDB::DEF_SNAPSHOT_CACHE_SIZE = 5; + + // Take some preliminary snapshots first. This is to stress the data structure + // that holds the old snapshots as it will be designed to be efficient when + // only a few snapshots are below the max_evicted_seq_. + for (int max_snapshots = 1; max_snapshots < 20; max_snapshots++) { + // Leave some gap between the preliminary snapshots and the final snapshot + // that we check. This should test for also different overlapping scnearios + // between the last snapshot and the commits. 
+ for (int max_gap = 1; max_gap < 10; max_gap++) { + // Since we do not actually write to db, we mock the seq as it would be + // increaased by the db. The only exception is that we need db seq to + // advance for our snapshots. for which we apply a dummy put each time we + // increase our mock of seq. + uint64_t seq = 0; + // At each step we prepare a txn and then we commit it in the next txn. + // This emulates the consecuitive transactions that write to the same key + uint64_t cur_txn = 0; + // Number of snapshots taken so far + int num_snapshots = 0; + std::vector to_be_released; + // Number of gaps applied so far + int gap_cnt = 0; + // The final snapshot that we will inspect + uint64_t snapshot = 0; + bool found_committed = false; + // To stress the data structure that maintain prepared txns, at each cycle + // we add a new prepare txn. These do not mean to be committed for + // snapshot inspection. + std::set prepared; + // We keep the list of txns comitted before we take the last snaphot. + // These should be the only seq numbers that will be found in the snapshot + std::set committed_before; + ReOpen(); // to restart the db + WritePreparedTxnDB* wp_db = dynamic_cast(db); + assert(wp_db); + assert(wp_db->db_impl_); + // We continue until max advances a bit beyond the snapshot. + while (!snapshot || wp_db->max_evicted_seq_ < snapshot + 100) { + // do prepare for a transaction + wp_db->db_impl_->Put(wo, "key", "value"); // dummy put to inc db seq + seq++; + ASSERT_EQ(wp_db->db_impl_->GetLatestSequenceNumber(), seq); + wp_db->AddPrepared(seq); + prepared.insert(seq); + + // If cur_txn is not started, do prepare for it. 
+ if (!cur_txn) { + wp_db->db_impl_->Put(wo, "key", "value"); // dummy put to inc db seq + seq++; + ASSERT_EQ(wp_db->db_impl_->GetLatestSequenceNumber(), seq); + cur_txn = seq; + wp_db->AddPrepared(cur_txn); + } else { // else commit it + wp_db->db_impl_->Put(wo, "key", "value"); // dummy put to inc db seq + seq++; + ASSERT_EQ(wp_db->db_impl_->GetLatestSequenceNumber(), seq); + wp_db->AddCommitted(cur_txn, seq); + if (!snapshot) { + committed_before.insert(cur_txn); + } + cur_txn = 0; + } + + if (num_snapshots < max_snapshots - 1) { + // Take preliminary snapshots + auto tmp_snapshot = db->GetSnapshot(); + to_be_released.push_back(tmp_snapshot); + num_snapshots++; + } else if (gap_cnt < max_gap) { + // Wait for some gap before taking the final snapshot + gap_cnt++; + } else if (!snapshot) { + // Take the final snapshot if it is not already taken + auto tmp_snapshot = db->GetSnapshot(); + to_be_released.push_back(tmp_snapshot); + snapshot = tmp_snapshot->GetSequenceNumber(); + // We increase the db seq artificailly by a dummy Put. Check that this + // technique is effective and db seq is that same as ours. + ASSERT_EQ(snapshot, seq); + num_snapshots++; + } + + // If the snapshot is taken, verify seq numbers visible to it. We redo + // it at each cycle to test that the system is still sound when + // max_evicted_seq_ advances. 
+ if (snapshot) { + for (uint64_t s = 0; s <= seq; s++) { + bool was_committed = + (committed_before.find(s) != committed_before.end()); + bool is_in_snapshot = wp_db->IsInSnapshot(s, snapshot); + if (was_committed != is_in_snapshot) { + printf("max_snapshots %d max_gap %d seq %" PRIu64 " max %" PRIu64 + " snapshot %" PRIu64 + " gap_cnt %d num_snapshots %d s %" PRIu64 "\n", + max_snapshots, max_gap, seq, + wp_db->max_evicted_seq_.load(), snapshot, gap_cnt, + num_snapshots, s); + } + ASSERT_EQ(was_committed, is_in_snapshot); + found_committed = found_committed || is_in_snapshot; + } + } + } + // Safety check to make sure the test actually ran + ASSERT_TRUE(found_committed); + // As an extra check, check if prepared set will be properly empty after + // they are committed. + if (cur_txn) { + wp_db->AddCommitted(cur_txn, seq); + } + for (auto p : prepared) { + wp_db->AddCommitted(p, seq); + } + ASSERT_TRUE(wp_db->delayed_prepared_.empty()); + ASSERT_TRUE(wp_db->prepared_txns_.empty()); + for (auto s : to_be_released) { + db->ReleaseSnapshot(s); + } + } + } +} + } // namespace rocksdb int main(int argc, char** argv) { diff --git a/utilities/transactions/write_prepared_txn.cc b/utilities/transactions/write_prepared_txn.cc new file mode 100644 index 000000000..211e21724 --- /dev/null +++ b/utilities/transactions/write_prepared_txn.cc @@ -0,0 +1,88 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#ifndef ROCKSDB_LITE + +#include "utilities/transactions/write_prepared_txn.h" + +#include + +#include "db/column_family.h" +#include "db/db_impl.h" +#include "rocksdb/db.h" +#include "rocksdb/status.h" +#include "rocksdb/utilities/transaction_db.h" +#include "utilities/transactions/pessimistic_transaction.h" +#include "utilities/transactions/pessimistic_transaction_db.h" + +namespace rocksdb { + +struct WriteOptions; + +WritePreparedTxn::WritePreparedTxn(WritePreparedTxnDB* txn_db, + const WriteOptions& write_options, + const TransactionOptions& txn_options) + : PessimisticTransaction(txn_db, write_options, txn_options), + wpt_db_(txn_db) { + PessimisticTransaction::Initialize(txn_options); +} + +Status WritePreparedTxn::CommitBatch(WriteBatch* /* unused */) { + // TODO(myabandeh) Implement this + throw std::runtime_error("CommitBatch not Implemented"); + return Status::OK(); +} + +Status WritePreparedTxn::PrepareInternal() { + WriteOptions write_options = write_options_; + write_options.disableWAL = false; + WriteBatchInternal::MarkEndPrepare(GetWriteBatch()->GetWriteBatch(), name_); + const bool disable_memtable = true; + uint64_t seq_used; + Status s = + db_impl_->WriteImpl(write_options, GetWriteBatch()->GetWriteBatch(), + /*callback*/ nullptr, &log_number_, /*log ref*/ 0, + !disable_memtable, &seq_used); + prepare_seq_ = seq_used; + wpt_db_->AddPrepared(prepare_seq_); + return s; +} + +Status WritePreparedTxn::CommitWithoutPrepareInternal() { + // TODO(myabandeh) Implement this + throw std::runtime_error("Commit not Implemented"); + return Status::OK(); +} + +Status WritePreparedTxn::CommitInternal() { + // We take the commit-time batch and append the Commit marker. 
+ // The Memtable will ignore the Commit marker in non-recovery mode + WriteBatch* working_batch = GetCommitTimeWriteBatch(); + // TODO(myabandeh): prevent the users from writing to txn after the prepare + // phase + assert(working_batch->Count() == 0); + WriteBatchInternal::MarkCommit(working_batch, name_); + + // any operations appended to this working_batch will be ignored from WAL + working_batch->MarkWalTerminationPoint(); + + const bool disable_memtable = true; + uint64_t seq_used; + auto s = db_impl_->WriteImpl(write_options_, working_batch, nullptr, nullptr, + log_number_, disable_memtable, &seq_used); + uint64_t& commit_seq = seq_used; + wpt_db_->AddCommitted(prepare_seq_, commit_seq); + return s; +} + +Status WritePreparedTxn::Rollback() { + // TODO(myabandeh) Implement this + throw std::runtime_error("Rollback not Implemented"); + return Status::OK(); +} + +} // namespace rocksdb + +#endif // ROCKSDB_LITE diff --git a/utilities/transactions/write_prepared_txn.h b/utilities/transactions/write_prepared_txn.h new file mode 100644 index 000000000..b7cc6ba1b --- /dev/null +++ b/utilities/transactions/write_prepared_txn.h @@ -0,0 +1,76 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#pragma once + +#ifndef ROCKSDB_LITE + +#include +#include +#include +#include +#include +#include +#include + +#include "db/write_callback.h" +#include "rocksdb/db.h" +#include "rocksdb/slice.h" +#include "rocksdb/snapshot.h" +#include "rocksdb/status.h" +#include "rocksdb/types.h" +#include "rocksdb/utilities/transaction.h" +#include "rocksdb/utilities/transaction_db.h" +#include "rocksdb/utilities/write_batch_with_index.h" +#include "util/autovector.h" +#include "utilities/transactions/pessimistic_transaction.h" +#include "utilities/transactions/pessimistic_transaction_db.h" +#include "utilities/transactions/transaction_base.h" +#include "utilities/transactions/transaction_util.h" + +namespace rocksdb { + +class WritePreparedTxnDB; + +// This impl could write to DB also uncomitted data and then later tell apart +// committed data from uncomitted data. Uncommitted data could be after the +// Prepare phase in 2PC (WritePreparedTxn) or before that +// (WriteUnpreparedTxnImpl). +class WritePreparedTxn : public PessimisticTransaction { + public: + WritePreparedTxn(WritePreparedTxnDB* db, const WriteOptions& write_options, + const TransactionOptions& txn_options); + + virtual ~WritePreparedTxn() {} + + Status CommitBatch(WriteBatch* batch) override; + + Status Rollback() override; + + private: + Status PrepareInternal() override; + + Status CommitWithoutPrepareInternal() override; + + Status CommitInternal() override; + + // TODO(myabandeh): verify that the current impl work with values being + // written with prepare sequence number too. 
+ // Status ValidateSnapshot(ColumnFamilyHandle* column_family, const Slice& + // key, + // SequenceNumber prev_seqno, SequenceNumber* + // new_seqno); + + // No copying allowed + WritePreparedTxn(const WritePreparedTxn&); + void operator=(const WritePreparedTxn&); + + WritePreparedTxnDB* wpt_db_; + uint64_t prepare_seq_; +}; + +} // namespace rocksdb + +#endif // ROCKSDB_LITE diff --git a/utilities/ttl/ttl_test.cc b/utilities/ttl/ttl_test.cc index 203008688..586d0ce1f 100644 --- a/utilities/ttl/ttl_test.cc +++ b/utilities/ttl/ttl_test.cc @@ -131,7 +131,7 @@ class TtlTest : public testing::Test { batch.Delete(kv_it_->first); break; default: - ASSERT_TRUE(false); + FAIL(); } } db_ttl_->Write(wopts, &batch); @@ -184,12 +184,12 @@ class TtlTest : public testing::Test { if (ret == false || value_found == false) { fprintf(stderr, "KeyMayExist could not find key=%s in the database but" " should have\n", kv.first.c_str()); - ASSERT_TRUE(false); + FAIL(); } else if (val.compare(kv.second) != 0) { fprintf(stderr, " value for key=%s present in database is %s but" " should be %s\n", kv.first.c_str(), val.c_str(), kv.second.c_str()); - ASSERT_TRUE(false); + FAIL(); } } } @@ -239,18 +239,18 @@ class TtlTest : public testing::Test { } else { fprintf(stderr, "is present in db but was expected to be absent\n"); } - ASSERT_TRUE(false); + FAIL(); } else if (s.ok()) { if (test_compaction_change && v.compare(kNewValue_) != 0) { fprintf(stderr, " value for key=%s present in database is %s but " " should be %s\n", kv_it_->first.c_str(), v.c_str(), kNewValue_.c_str()); - ASSERT_TRUE(false); + FAIL(); } else if (!test_compaction_change && v.compare(kv_it_->second) !=0) { fprintf(stderr, " value for key=%s present in database is %s but " " should be %s\n", kv_it_->first.c_str(), v.c_str(), kv_it_->second.c_str()); - ASSERT_TRUE(false); + FAIL(); } } } diff --git a/utilities/write_batch_with_index/write_batch_with_index.cc b/utilities/write_batch_with_index/write_batch_with_index.cc index 
dc5d0fcf6..b2820109c 100644 --- a/utilities/write_batch_with_index/write_batch_with_index.cc +++ b/utilities/write_batch_with_index/write_batch_with_index.cc @@ -385,8 +385,8 @@ class WBWIIteratorImpl : public WBWIIterator { }; struct WriteBatchWithIndex::Rep { - Rep(const Comparator* index_comparator, size_t reserved_bytes = 0, - size_t max_bytes = 0, bool _overwrite_key = false) + explicit Rep(const Comparator* index_comparator, size_t reserved_bytes = 0, + size_t max_bytes = 0, bool _overwrite_key = false) : write_batch(reserved_bytes, max_bytes), comparator(index_comparator, &write_batch), skip_list(comparator, &arena), @@ -743,8 +743,23 @@ Status WriteBatchWithIndex::GetFromBatchAndDB(DB* db, const ReadOptions& read_options, const Slice& key, std::string* value) { + assert(value != nullptr); + PinnableSlice pinnable_val(value); + assert(!pinnable_val.IsPinned()); + auto s = GetFromBatchAndDB(db, read_options, db->DefaultColumnFamily(), key, + &pinnable_val); + if (s.ok() && pinnable_val.IsPinned()) { + value->assign(pinnable_val.data(), pinnable_val.size()); + } // else value is already assigned + return s; +} + +Status WriteBatchWithIndex::GetFromBatchAndDB(DB* db, + const ReadOptions& read_options, + const Slice& key, + PinnableSlice* pinnable_val) { return GetFromBatchAndDB(db, read_options, db->DefaultColumnFamily(), key, - value); + pinnable_val); } Status WriteBatchWithIndex::GetFromBatchAndDB(DB* db, @@ -752,19 +767,38 @@ Status WriteBatchWithIndex::GetFromBatchAndDB(DB* db, ColumnFamilyHandle* column_family, const Slice& key, std::string* value) { + assert(value != nullptr); + PinnableSlice pinnable_val(value); + assert(!pinnable_val.IsPinned()); + auto s = + GetFromBatchAndDB(db, read_options, column_family, key, &pinnable_val); + if (s.ok() && pinnable_val.IsPinned()) { + value->assign(pinnable_val.data(), pinnable_val.size()); + } // else value is already assigned + return s; +} + +Status WriteBatchWithIndex::GetFromBatchAndDB(DB* db, + const 
ReadOptions& read_options, + ColumnFamilyHandle* column_family, + const Slice& key, + PinnableSlice* pinnable_val) { Status s; MergeContext merge_context; const ImmutableDBOptions& immuable_db_options = reinterpret_cast(db)->immutable_db_options(); - std::string batch_value; + // Since the lifetime of the WriteBatch is the same as that of the transaction + // we cannot pin it as otherwise the returned value will not be available + // after the transaction finishes. + std::string& batch_value = *pinnable_val->GetSelf(); WriteBatchWithIndexInternal::Result result = WriteBatchWithIndexInternal::GetFromBatch( immuable_db_options, this, column_family, key, &merge_context, &rep->comparator, &batch_value, rep->overwrite_key, &s); if (result == WriteBatchWithIndexInternal::Result::kFound) { - value->assign(batch_value.data(), batch_value.size()); + pinnable_val->PinSelf(); return s; } if (result == WriteBatchWithIndexInternal::Result::kDeleted) { @@ -785,7 +819,7 @@ Status WriteBatchWithIndex::GetFromBatchAndDB(DB* db, result == WriteBatchWithIndexInternal::Result::kNotFound); // Did not find key in batch OR could not resolve Merges. Try DB. 
- s = db->Get(read_options, column_family, key, value); + s = db->Get(read_options, column_family, key, pinnable_val); if (s.ok() || s.IsNotFound()) { // DB Get Succeeded if (result == WriteBatchWithIndexInternal::Result::kMergeInProgress) { @@ -797,18 +831,18 @@ Status WriteBatchWithIndex::GetFromBatchAndDB(DB* db, Env* env = immuable_db_options.env; Logger* logger = immuable_db_options.info_log.get(); - Slice db_slice(*value); Slice* merge_data; if (s.ok()) { - merge_data = &db_slice; + merge_data = pinnable_val; } else { // Key not present in db (s.IsNotFound()) merge_data = nullptr; } if (merge_operator) { - s = MergeHelper::TimedFullMerge(merge_operator, key, merge_data, - merge_context.GetOperands(), value, - logger, statistics, env); + s = MergeHelper::TimedFullMerge( + merge_operator, key, merge_data, merge_context.GetOperands(), + pinnable_val->GetSelf(), logger, statistics, env); + pinnable_val->PinSelf(); } else { s = Status::InvalidArgument("Options::merge_operator must be set"); }