rocksdb.ini

# If true WAL is not flushed automatically after each write. Instead it
# relies on manual invocation of FlushWAL to write the WAL buffer to its
# file.
manual_wal_flush=false

# By default RocksDB replays WAL logs and flushes them on DB open, which may
# create very small SST files. If this option is enabled, RocksDB will try
# to avoid (but does not guarantee to avoid) flushing during recovery. Also,
# existing WAL logs will be kept, so that if a crash happens before a flush,
# we still have logs to recover from.
avoid_flush_during_recovery=false

# If true, then print malloc stats together with rocksdb.stats
# when printing to LOG.
dump_malloc_stats=false

# Log level for the information logs.
# Must be one of INFO_LEVEL or DEBUG_LEVEL
info_log_level=INFO_LEVEL

# Specify the file access pattern once a compaction is started.
# It will be applied to all input files of a compaction.
# Must be one of NONE, NORMAL, SEQUENTIAL, WILLNEED
access_hint_on_compaction_start=NORMAL

# The latency in microseconds after which a std::this_thread::yield
# call (sched_yield on Linux) is considered to be a signal that
# other processes or threads would like to use the current core.
# Increasing this makes writer threads more likely to take CPU
# by spinning, which will show up as an increase in the number of
# involuntary context switches.
write_thread_slow_yield_usec=3

# If true, threads synchronizing with the write batch group leader will
# wait for up to write_thread_max_yield_usec before blocking on a mutex.
# This can substantially improve throughput for concurrent workloads,
# regardless of whether allow_concurrent_memtable_write is enabled.
enable_write_thread_adaptive_yield=true

# Setting unordered_write to true trades higher write throughput with
# relaxing the immutability guarantee of snapshots. This violates the
# repeatability one expects from ::Get from a snapshot, as well as
# ::MultiGet and Iterator's consistent-point-in-time view property.
# If the application cannot tolerate the relaxed guarantees, it can implement
# its own mechanisms to work around that and yet benefit from the higher
# throughput. Using TransactionDB with WRITE_PREPARED write policy and
# two_write_queues=true is one way to achieve immutable snapshots despite
# unordered_write.
#
# By default, i.e., when it is false, rocksdb does not advance the sequence
# number for new snapshots unless all the writes with lower sequence numbers
# are already finished. This provides the immutability that we expect from
# snapshots. Moreover, since Iterator and MultiGet internally depend on
# snapshots, the snapshot immutability results in Iterator and MultiGet
# offering consistent-point-in-time view. If set to true, although
# Read-Your-Own-Write property is still provided, the snapshot immutability
# property is relaxed: the writes issued after the snapshot is obtained (with
# larger sequence numbers) will be still not visible to the reads from that
# snapshot, however, there still might be pending writes (with lower sequence
# number) that will change the state visible to the snapshot after they are
# landed to the memtable.
unordered_write=false

# The number of bytes to prefetch when reading the log. This is mostly useful
# for reading a remotely located log, as it can save the number of
# round-trips. If 0, then the prefetching is disabled.
log_readahead_size=0

# By default, a single write thread queue is maintained. The thread that gets
# to the head of the queue becomes the write batch group leader and is
# responsible for writing to WAL and memtable for the batch group.
#
# If enable_pipelined_write is true, separate write thread queues are
# maintained for WAL writes and memtable writes. A write thread first enters
# the WAL writer queue and then the memtable writer queue. A pending thread on
# the WAL writer queue thus only has to wait for previous writers to finish
# their WAL writing but not the memtable writing. Enabling the feature may
# improve write throughput and reduce latency of the prepare phase of
# two-phase commit.
enable_pipelined_write=false

# If true, automatically persist stats to a hidden column family (column
# family name: ___rocksdb_stats_history___) every
# stats_persist_period_sec seconds; otherwise, write to an in-memory
# struct. User can query through `GetStatsHistory` API.
# If user attempts to create a column family with the same name on a DB
# which has previously set persist_stats_to_disk to true, the column family
# creation will fail, but the hidden column family will survive, as well as
# the previously persisted statistics.
# When persisting stats to disk, the stat name will be limited to 100 bytes.
persist_stats_to_disk=false

# Needed to support differential snapshots.
# If set to true then DB will only process deletes with sequence number
# less than what was set by SetPreserveDeletesSequenceNumber(uint64_t ts).
# Clients are responsible to periodically call this method to advance
# the cutoff time. If this method is never called and preserve_deletes
# is set to true NO deletes will ever be processed.
# At the moment this only keeps normal deletes, SingleDeletes will
# not be preserved.
preserve_deletes=false

# if not zero, dump rocksdb.stats to RocksDB every stats_persist_period_sec
stats_persist_period_sec=600

# Same as bytes_per_sync, but applies to WAL files
wal_bytes_per_sync=0

# Allows OS to incrementally sync files to disk while they are being
# written, asynchronously, in the background. This operation can be used
# to smooth out write I/Os over time. Users shouldn't rely on it for
# persistency guarantee.
# Issue one request for every bytes_per_sync written. 0 turns it off.
#
# You may consider using rate_limiter to regulate write rate to device.
# When rate limiter is enabled, it automatically enables bytes_per_sync
# to 1MB.
#
# This option applies to table files
bytes_per_sync=0

# The limited write rate to DB if soft_pending_compaction_bytes_limit or
# level0_slowdown_writes_trigger is triggered, or we are writing to the
# last mem table allowed and we allow more than 3 mem tables. It is
# calculated using size of user write requests before compression.
# RocksDB may decide to slow down more if the compaction still
# gets behind further.
# If the value is 0, we will infer a value from `rate_limiter` value
# if it is not empty, or 16MB if `rate_limiter` is empty. Note that
# if users change the rate in `rate_limiter` after DB is opened,
# `delayed_write_rate` won't be adjusted.
#
# Unit: byte per second.
delayed_write_rate=16777216

# The following two fields affect how archived logs will be deleted.
# 1. If both set to 0, logs will be deleted asap and will not get into
#    the archive.
# 2. If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0,
#    WAL files will be checked every 10 min and if total size is greater
#    than WAL_size_limit_MB, they will be deleted starting with the
#    earliest until size_limit is met. All empty files will be deleted.
# 3. If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then
#    WAL files will be checked every WAL_ttl_seconds / 2 and those that
#    are older than WAL_ttl_seconds will be deleted.
# 4. If both are not 0, WAL files will be checked every 10 min and both
#    checks will be performed with ttl being first.
WAL_ttl_seconds=0
WAL_size_limit_MB=0

# This is the maximum buffer size that is used by WritableFileWriter.
# On Windows, we need to maintain an aligned buffer for writes.
# We allow the buffer to grow until its size hits the limit in buffered
# IO and fix the buffer size when using direct IO to ensure alignment of
# write requests if the logical sector size is unusual.
writable_file_max_buffer_size=1048576

# If true, allow multi-writers to update mem tables in parallel.
# Only some memtable_factory-s support concurrent writes; currently it
# is implemented only for SkipListFactory.  Concurrent memtable writes
# are not compatible with inplace_update_support or filter_deletes.
# It is strongly recommended to set enable_write_thread_adaptive_yield
# if you are going to use this feature.
allow_concurrent_memtable_write=true

# If true, RocksDB will aggressively check consistency of the data.
# Also, if any of the writes to the database fails (Put, Delete, Merge,
# Write), the database will switch to read-only mode and fail all other
# Write operations.
# In most cases you want this to be set to true.
paranoid_checks=true

# If true, RocksDB supports flushing multiple column families and committing
# their results atomically to MANIFEST. Note that it is not
# necessary to set atomic_flush to true if WAL is always enabled since WAL
# allows the database to be restored to the last persistent state in WAL.
# This option is useful when there are column families with writes NOT
# protected by WAL.
# For manual flush, application has to specify which column families to
# flush atomically in DB::Flush.
# For auto-triggered flush, RocksDB atomically flushes ALL column families.
#
# Currently, any WAL-enabled writes after atomic flush may be replayed
# independently if the process crashes later and tries to recover.
atomic_flush=false

# This value represents the maximum number of threads that will
# concurrently perform a compaction job by breaking it into multiple,
# smaller ones that are run simultaneously.
max_subcompactions=1

# Once write-ahead logs exceed this size, we will start forcing the flush of
# column families whose memtables are backed by the oldest live WAL file
# (i.e. the ones that are causing all the space amplification). If set to 0
# (default), we will dynamically choose the WAL size limit to be
# [sum of all write_buffer_size * max_write_buffer_number] * 4
# This option takes effect only when there are more than one column family as
# otherwise the wal size is dictated by the write_buffer_size.
max_total_wal_size=0

# Amount of data to build up in memtables across all column
# families before writing to disk.
#
# This is distinct from write_buffer_size, which enforces a limit
# for a single memtable.
#
# This feature is disabled by default. Specify a non-zero value
# to enable it.
db_write_buffer_size=0

# By default RocksDB will flush all memtables on DB close if there is
# unpersisted data (i.e. with WAL disabled). The flush can be skipped to
# speed up DB close. Unpersisted data WILL BE LOST.
avoid_flush_during_shutdown=false

# Maximal info log files to be kept.
keep_log_file_num=1000

# Number of shards used for table cache.
table_cache_numshardbits=6

# If max_open_files is -1, DB will open all files on DB::Open(). You can
# use this option to increase the number of threads used to open the files.
max_file_opening_threads=16

# NOT SUPPORTED ANYMORE: RocksDB automatically decides this based on the
# value of max_background_jobs. For backwards compatibility we will set
# `max_background_jobs = max_background_compactions + max_background_flushes`
# in the case where user sets at least one of `max_background_compactions` or
# `max_background_flushes` (we replace -1 by 1 in case one option is unset).
#
# Maximum number of concurrent background compaction jobs, submitted to
# the default LOW priority thread pool.
#
# If you're increasing this, also consider increasing number of threads in
# LOW priority thread pool. For more information, see
# Env::SetBackgroundThreads
max_background_compactions=-1

# If enabled it uses two queues for writes, one for the ones with
# disable_memtable and one for the ones that also write to memtable. This
# allows the memtable writes not to lag behind other writes. It can be used
# to optimize MySQL 2PC in which only the commits, which are serial, write to
# memtable.
two_write_queues=false

# Maximum number of concurrent background jobs (compactions and flushes).
max_background_jobs=2

# By default, writes to stable storage use fdatasync (on platforms
# where this function is available). If this option is true,
# fsync is used instead.
#
# fsync and fdatasync are equally safe for our purposes and fdatasync is
# faster, so it is rarely necessary to set this option. It is provided
# as a workaround for kernel/filesystem bugs, such as one that affected
# fdatasync with ext4 in kernel versions prior to 3.7.
use_fsync=false

# This is a maximum buffer size that is used by WinMmapReadableFile in
# unbuffered disk I/O mode. We need to maintain an aligned buffer for
# reads. We allow the buffer to grow until the specified value and then
# for bigger requests allocate one shot buffers. In unbuffered mode we
# always bypass read-ahead buffer at ReadaheadRandomAccessFile
# When read-ahead is required we then make use of compaction_readahead_size
# value and always try to read ahead. With read-ahead we always
# pre-allocate buffer to the size instead of growing it up to a limit.
#
# This option is currently honored only on Windows
random_access_max_buffer_size=1048576

# Number of open files that can be used by the DB.  You may need to
# increase this if your database has a large working set. Value -1 means
# files opened are always kept open. You can estimate number of files based
# on target_file_size_base and target_file_size_multiplier for level-based
# compaction. For universal-style compaction, you can usually set it to -1.
max_open_files=-1

# If true, then DB::Open() will not update the statistics used to optimize
# compaction decision by loading table properties from many files.
# Turning off this feature will improve DBOpen time especially in
# disk environment.
skip_stats_update_on_db_open=false

# If true, an error is raised if the database already exists.
error_if_exists=false

# Number of bytes to preallocate (via fallocate) the manifest
# files.  Default is 4mb, which is reasonable to reduce random IO
# as well as prevent overallocation for mounts that preallocate
# large amounts of data (such as xfs's allocsize option).
manifest_preallocation_size=4194304

# NOT SUPPORTED ANYMORE: RocksDB automatically decides this based on the
# value of max_background_jobs. For backwards compatibility we will set
# `max_background_jobs = max_background_compactions + max_background_flushes`
# in the case where user sets at least one of `max_background_compactions` or
# `max_background_flushes`.
#
# Maximum number of concurrent background memtable flush jobs, submitted by
# default to the HIGH priority thread pool. If the HIGH priority thread pool
# is configured to have zero threads, flush jobs will share the LOW priority
# thread pool with compaction jobs.
#
# It is important to use both thread pools when the same Env is shared by
# multiple db instances. Without a separate pool, long running compaction
# jobs could potentially block memtable flush jobs of other db instances,
# leading to unnecessary Put stalls.
#
# If you're increasing this, also consider increasing number of threads in
# HIGH priority thread pool. For more information, see
# Env::SetBackgroundThreads
max_background_flushes=-1

# Disable child process inherit open files. Default: true
is_fd_close_on_exec=true

# If true, the database will be created if it is missing.
create_if_missing=true

# Use adaptive mutex, which spins in the user space before resorting
# to kernel. This could reduce context switch when the mutex is not
# heavily contended. However, if the mutex is hot, we could end up
# wasting spin time.
use_adaptive_mutex=false

# If true, then the status of the threads involved in this DB will
# be tracked and available via GetThreadList() API.
enable_thread_tracking=false

# if not zero, dump rocksdb.stats to LOG every stats_dump_period_sec
stats_dump_period_sec=600

# Specify the maximal size of the info log file. If the log file
# is larger than `max_log_file_size`, a new info log file will
# be created.
# If max_log_file_size == 0, all logs will be written to one
# log file.
max_log_file_size=0

# If set true, will hint the underlying file system that the file
# access pattern is random, when a sst file is opened.
advise_random_on_open=true

# If true, missing column families will be automatically created.
create_missing_column_families=true

# Time for the info log file to roll (in seconds).
# If specified with non-zero value, log file will be rolled
# if it has been active longer than `log_file_time_to_roll`.
# Default: 0 (disabled)
log_file_time_to_roll=0

# Use O_DIRECT for writes in background flush and compactions.
use_direct_io_for_flush_and_compaction=false

# If non-zero, we perform bigger reads when doing compaction. If you're
# running RocksDB on spinning disks, you should set this to at least 2MB.
# That way RocksDB's compaction is doing sequential instead of random reads.
#
# When non-zero, we also force new_table_reader_for_compaction_inputs to
# true.
compaction_readahead_size=0

# The periodicity when obsolete files get deleted. The default
# value is 6 hours. The files that get out of scope by compaction
# process will still get automatically deleted on every compaction,
# regardless of this setting.
delete_obsolete_files_period_micros=21600000000

# Recycle log files.
# If non-zero, we will reuse previously written log files for new
# logs, overwriting the old data.  The value indicates how many
# such files we will keep around at any point in time for later
# use.  This is more efficient because the blocks are already
# allocated and fdatasync does not need to update the inode after
# each write.
recycle_log_file_num=0

# if not zero, periodically take stats snapshots and store in memory, the
# memory size for stats snapshots is capped at stats_history_buffer_size
stats_history_buffer_size=1048576

# When true, guarantees WAL files have at most `wal_bytes_per_sync`
# bytes submitted for writeback at any given time, and SST files have at most
# `bytes_per_sync` bytes pending writeback at any given time. This can be
# used to handle cases where processing speed exceeds I/O speed during file
# generation, which can lead to a huge sync when the file is finished, even
# with `bytes_per_sync` / `wal_bytes_per_sync` properly configured.
#
#  - If `sync_file_range` is supported it achieves this by waiting for any
#    prior `sync_file_range`s to finish before proceeding. In this way,
#    processing (compression, etc.) can proceed uninhibited in the gap
#    between `sync_file_range`s, and we block only when I/O falls behind.
#  - Otherwise the `WritableFile::Sync` method is used. Note this mechanism
#    always blocks, thus preventing the interleaving of I/O and processing.
#
# Note: Enabling this option does not provide any additional persistence
# guarantees, as it may use `sync_file_range`, which does not write out
# metadata.
strict_bytes_per_sync=false

# Allow the OS to mmap file for reading sst tables. Default: false
allow_mmap_reads=false

# Set this option to true during creation of database if you want
# to be able to ingest behind (call IngestExternalFile() skipping keys
# that already exist, rather than overwriting matching keys).
# Setting this option to true will affect 2 things:
# 1) Disable some internal optimizations around SST file compression
# 2) Reserve bottom-most level for ingested files only.
# Note that num_levels should be >= 3 if this option is turned on.
allow_ingest_behind=false

# If true, then DB::Open / CreateColumnFamily / DropColumnFamily
# / SetOptions will fail if options file is not detected or properly
# persisted.
fail_if_options_file_error=false

# If true, working thread may avoid doing unnecessary and long-latency
# operation (such as deleting obsolete files directly or deleting memtable)
# and will instead schedule a background job to do it.
# Use it if you're latency-sensitive.
avoid_unnecessary_blocking_io=false

# If false, fallocate() calls are bypassed
allow_fallocate=true

# if set to false then recovery will fail when a prepared
# transaction is encountered in the WAL
allow_2pc=false

# manifest file is rolled over on reaching this limit.
# The older manifest file will be deleted.
# The default value is 1GB so that the manifest file can grow, but not
# reach the limit of storage capacity.
max_manifest_file_size=1073741824

# The maximum number of microseconds that a write operation will use
# a yielding spin loop to coordinate with other write threads before
# blocking on a mutex.  (Assuming write_thread_slow_yield_usec is
# set properly) increasing this value is likely to increase RocksDB
# throughput at the expense of increased CPU usage.
write_thread_max_yield_usec=100

# Enable direct I/O mode for read/write
# they may or may not improve performance depending on the use case
#
# Files will be opened in "direct I/O" mode
# which means that data r/w from the disk will not be cached or
# buffered. The hardware buffer of the devices may however still
# be used. Memory mapped files are not impacted by these parameters.

# Use O_DIRECT for user and compaction reads.
# When true, we also force new_table_reader_for_compaction_inputs to true.
use_direct_reads=false

# Recovery mode to control the consistency while replaying WAL.
# Each mode is described below; the mode's name follows its description.
#
# Original levelDB recovery
#
# We tolerate the last record in any log to be incomplete due to a crash
# while writing it. Zeroed bytes from preallocation are also tolerated in the
# trailing data of any log.
#
# Use case: Applications for which updates, once applied, must not be rolled
# back even after a crash-recovery. In this recovery mode, RocksDB guarantees
# this as long as `WritableFile::Append()` writes are durable. In case the
# user needs the guarantee in more situations (e.g., when
# `WritableFile::Append()` writes to page cache, but the user desires this
# guarantee in face of power-loss crash-recovery), RocksDB offers various
# mechanisms to additionally invoke `WritableFile::Sync()` in order to
# strengthen the guarantee.
#
# This differs from `kPointInTimeRecovery` in that, in case a corruption is
# detected during recovery, this mode will refuse to open the DB. Whereas,
# `kPointInTimeRecovery` will stop recovery just before the corruption since
# that is a valid point-in-time to which to recover.
# 
# `kTolerateCorruptedTailRecords`
# Recover from clean shutdown
# We don't expect to find any corruption in the WAL
# Use case : This is ideal for unit tests and rare applications that
# can require high consistency guarantee
# `kAbsoluteConsistency`
# Recover to point-in-time consistency (default)
# We stop the WAL playback on discovering WAL inconsistency
# Use case : Ideal for systems that have disk controller cache like
# hard disk, SSD without super capacitor that store related data
# `kPointInTimeRecovery`
# Recovery after a disaster
# We ignore any corruption in the WAL and try to salvage as much data as
# possible
# Use case : Ideal for last ditch effort to recover data or systems that
# operate with low grade unrelated data
# `kSkipAnyCorruptedRecords`
wal_recovery_mode=kPointInTimeRecovery

# If true, always create a new file descriptor and new table reader
# for compaction inputs. Turning this parameter on may introduce extra
# memory usage in the table reader, if it allocates extra memory
# for indexes. This will allow file descriptor prefetch options
# to be set for compaction input files and not to impact file
# descriptors for the same file used by user queries.
# Suggest to enable BlockBasedTableOptions.cache_index_and_filter_blocks
# for this mode if using block-based table.
new_table_reader_for_compaction_inputs=false

# Allow the OS to mmap file for writing.
# DB::SyncWAL() only works if this is set to false.
allow_mmap_writes=false