-
Notifications
You must be signed in to change notification settings - Fork 26
/
rocksdb.ini
510 lines (445 loc) · 23 KB
/
rocksdb.ini
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
# If true WAL is not flushed automatically after each write. Instead it
# relies on manual invocation of FlushWAL to write the WAL buffer to its
# file.
manual_wal_flush=false
# By default RocksDB replays WAL logs and flushes them on DB open, which may
# create very small SST files. If this option is enabled, RocksDB will try
# to avoid (but does not guarantee to avoid) flushing during recovery. Also,
# existing WAL logs will be kept, so that if a crash happens before the
# flush, we still have logs to recover from.
avoid_flush_during_recovery=false
# If true, then print malloc stats together with rocksdb.stats
# when printing to LOG.
dump_malloc_stats=false
# Log level for the information logs.
# Must be one of INFO_LEVEL or DEBUG_LEVEL
info_log_level=INFO_LEVEL
# Specify the file access pattern once a compaction is started.
# It will be applied to all input files of a compaction.
# Must be one of NONE, NORMAL, SEQUENTIAL, WILLNEED
access_hint_on_compaction_start=NORMAL
# The latency in microseconds after which a std::this_thread::yield
# call (sched_yield on Linux) is considered to be a signal that
# other processes or threads would like to use the current core.
# Increasing this makes writer threads more likely to take CPU
# by spinning, which will show up as an increase in the number of
# involuntary context switches.
write_thread_slow_yield_usec=3
# If true, threads synchronizing with the write batch group leader will
# wait for up to write_thread_max_yield_usec before blocking on a mutex.
# This can substantially improve throughput for concurrent workloads,
# regardless of whether allow_concurrent_memtable_write is enabled.
enable_write_thread_adaptive_yield=true
# Setting unordered_write to true trades higher write throughput with
# relaxing the immutability guarantee of snapshots. This violates the
# repeatability one expects from ::Get from a snapshot, as well as
# ::MultiGet and Iterator's consistent-point-in-time view property.
# If the application cannot tolerate the relaxed guarantees, it can implement
# its own mechanisms to work around that and yet benefit from the higher
# throughput. Using TransactionDB with WRITE_PREPARED write policy and
# two_write_queues=true is one way to achieve immutable snapshots despite
# unordered_write.
#
# By default, i.e., when it is false, rocksdb does not advance the sequence
# number for new snapshots unless all the writes with lower sequence numbers
# are already finished. This provides the immutability that we expect from
# snapshots. Moreover, since Iterator and MultiGet internally depend on
# snapshots, the snapshot immutability results in Iterator and MultiGet
# offering a consistent-point-in-time view. If set to true, although the
# Read-Your-Own-Write property is still provided, the snapshot immutability
# property is relaxed: the writes issued after the snapshot is obtained (with
# larger sequence numbers) will still not be visible to the reads from that
# snapshot; however, there still might be pending writes (with lower sequence
# numbers) that will change the state visible to the snapshot after they
# land in the memtable.
unordered_write=false
# The number of bytes to prefetch when reading the log. This is mostly useful
# for reading a remotely located log, as it can save the number of
# round-trips. If 0, then the prefetching is disabled.
log_readahead_size=0
# By default, a single write thread queue is maintained. The thread that gets
# to the head of the queue becomes the write batch group leader and is
# responsible for writing to the WAL and memtable for the batch group.
#
# If enable_pipelined_write is true, separate write thread queues are
# maintained for WAL writes and memtable writes. A write thread first enters
# the WAL writer queue and then the memtable writer queue. A pending thread on
# the WAL writer queue thus only has to wait for previous writers to finish
# their WAL writing but not the memtable writing. Enabling the feature may
# improve write throughput and reduce latency of the prepare phase of
# two-phase commit.
enable_pipelined_write=false
# If true, automatically persist stats to a hidden column family (column
# family name: ___rocksdb_stats_history___) every
# stats_persist_period_sec seconds; otherwise, write to an in-memory
# struct. User can query through `GetStatsHistory` API.
# If the user attempts to create a column family with the same name on a DB
# which has previously set persist_stats_to_disk to true, the column family
# creation will fail, but the hidden column family will survive, as well as
# the previously persisted statistics.
# When persisting stats to disk, the stat name will be limited to 100 bytes.
persist_stats_to_disk=false
# Needed to support differential snapshots.
# If set to true then DB will only process deletes with sequence number
# less than what was set by SetPreserveDeletesSequenceNumber(uint64_t ts).
# Clients are responsible to periodically call this method to advance
# the cutoff time. If this method is never called and preserve_deletes
# is set to true NO deletes will ever be processed.
# At the moment this only keeps normal deletes, SingleDeletes will
# not be preserved.
preserve_deletes=false
# if not zero, dump rocksdb.stats to RocksDB every stats_persist_period_sec
stats_persist_period_sec=600
# Same as bytes_per_sync, but applies to WAL files
wal_bytes_per_sync=0
# Allows OS to incrementally sync files to disk while they are being
# written, asynchronously, in the background. This operation can be used
# to smooth out write I/Os over time. Users shouldn't rely on it for
# persistency guarantee.
# Issue one request for every bytes_per_sync written. 0 turns it off.
#
# You may consider using rate_limiter to regulate write rate to device.
# When rate limiter is enabled, it automatically enables bytes_per_sync
# to 1MB.
#
# This option applies to table files
bytes_per_sync=0
# The limited write rate to DB if soft_pending_compaction_bytes_limit or
# level0_slowdown_writes_trigger is triggered, or we are writing to the
# last mem table allowed and we allow more than 3 mem tables. It is
# calculated using size of user write requests before compression.
# RocksDB may decide to slow down more if the compaction still
# gets behind further.
# If the value is 0, we will infer a value from the `rate_limiter` value
# if it is not empty, or 16MB if `rate_limiter` is empty. Note that
# if users change the rate in `rate_limiter` after DB is opened,
# `delayed_write_rate` won't be adjusted.
#
# Unit: byte per second.
delayed_write_rate=16777216
# The following two fields affect how archived logs will be deleted.
# 1. If both set to 0, logs will be deleted asap and will not get into
# the archive.
# 2. If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0,
# WAL files will be checked every 10 min and if total size is greater
# than WAL_size_limit_MB, they will be deleted starting with the
# earliest until size_limit is met. All empty files will be deleted.
# 3. If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then
# WAL files will be checked every WAL_ttl_seconds / 2 and those that
# are older than WAL_ttl_seconds will be deleted.
# 4. If both are not 0, WAL files will be checked every 10 min and both
# checks will be performed with ttl being first.
WAL_ttl_seconds=0
WAL_size_limit_MB=0
# This is the maximum buffer size that is used by WritableFileWriter.
# On Windows, we need to maintain an aligned buffer for writes.
# We allow the buffer to grow until its size hits the limit in buffered
# IO and fix the buffer size when using direct IO to ensure alignment of
# write requests if the logical sector size is unusual
writable_file_max_buffer_size=1048576
# If true, allow multi-writers to update mem tables in parallel.
# Only some memtable_factory-s support concurrent writes; currently it
# is implemented only for SkipListFactory. Concurrent memtable writes
# are not compatible with inplace_update_support or filter_deletes.
# It is strongly recommended to set enable_write_thread_adaptive_yield
# if you are going to use this feature.
allow_concurrent_memtable_write=true
# If true, RocksDB will aggressively check consistency of the data.
# Also, if any of the writes to the database fails (Put, Delete, Merge,
# Write), the database will switch to read-only mode and fail all other
# Write operations.
# In most cases you want this to be set to true.
paranoid_checks=true
# If true, RocksDB supports flushing multiple column families and committing
# their results atomically to MANIFEST. Note that it is not
# necessary to set atomic_flush to true if WAL is always enabled since WAL
# allows the database to be restored to the last persistent state in WAL.
# This option is useful when there are column families with writes NOT
# protected by WAL.
# For manual flush, application has to specify which column families to
# flush atomically in DB::Flush.
# For auto-triggered flush, RocksDB atomically flushes ALL column families.
#
# Currently, any WAL-enabled writes after atomic flush may be replayed
# independently if the process crashes later and tries to recover.
atomic_flush=false
# This value represents the maximum number of threads that will
# concurrently perform a compaction job by breaking it into multiple,
# smaller ones that are run simultaneously.
max_subcompactions=1
# Once write-ahead logs exceed this size, we will start forcing the flush of
# column families whose memtables are backed by the oldest live WAL file
# (i.e. the ones that are causing all the space amplification). If set to 0
# (default), we will dynamically choose the WAL size limit to be
# [sum of all write_buffer_size * max_write_buffer_number] * 4
# This option takes effect only when there are more than one column family as
# otherwise the wal size is dictated by the write_buffer_size.
max_total_wal_size=0
# Amount of data to build up in memtables across all column
# families before writing to disk.
#
# This is distinct from write_buffer_size, which enforces a limit
# for a single memtable.
#
# This feature is disabled by default. Specify a non-zero value
# to enable it.
db_write_buffer_size=0
# By default RocksDB will flush all memtables on DB close if there is
# unpersisted data (i.e. with WAL disabled). The flush can be skipped to
# speed up DB close. Unpersisted data WILL BE LOST.
avoid_flush_during_shutdown=false
# Maximal info log files to be kept.
keep_log_file_num=1000
# Number of shards used for table cache.
table_cache_numshardbits=6
# If max_open_files is -1, DB will open all files on DB::Open(). You can
# use this option to increase the number of threads used to open the files.
max_file_opening_threads=16
# NOT SUPPORTED ANYMORE: RocksDB automatically decides this based on the
# value of max_background_jobs. For backwards compatibility we will set
# `max_background_jobs = max_background_compactions + max_background_flushes`
# in the case where user sets at least one of `max_background_compactions` or
# `max_background_flushes` (we replace -1 by 1 in case one option is unset).
#
# Maximum number of concurrent background compaction jobs, submitted to
# the default LOW priority thread pool.
#
# If you're increasing this, also consider increasing number of threads in
# LOW priority thread pool. For more information, see
# Env::SetBackgroundThreads
max_background_compactions=-1
# If enabled it uses two queues for writes, one for the ones with
# disable_memtable and one for the ones that also write to memtable. This
# allows the memtable writes not to lag behind other writes. It can be used
# to optimize MySQL 2PC in which only the commits, which are serial, write to
# memtable.
two_write_queues=false
# Maximum number of concurrent background jobs (compactions and flushes).
max_background_jobs=2
# By default, writes to stable storage use fdatasync (on platforms
# where this function is available). If this option is true,
# fsync is used instead.
#
# fsync and fdatasync are equally safe for our purposes and fdatasync is
# faster, so it is rarely necessary to set this option. It is provided
# as a workaround for kernel/filesystem bugs, such as one that affected
# fdatasync with ext4 in kernel versions prior to 3.7.
use_fsync=false
# This is a maximum buffer size that is used by WinMmapReadableFile in
# unbuffered disk I/O mode. We need to maintain an aligned buffer for
# reads. We allow the buffer to grow until the specified value and then
# for bigger requests allocate one shot buffers. In unbuffered mode we
# always bypass read-ahead buffer at ReadaheadRandomAccessFile
# When read-ahead is required we then make use of compaction_readahead_size
# value and always try to read ahead. With read-ahead we always
# pre-allocate buffer to the size instead of growing it up to a limit.
#
# This option is currently honored only on Windows
random_access_max_buffer_size=1048576
# Number of open files that can be used by the DB. You may need to
# increase this if your database has a large working set. Value -1 means
# files opened are always kept open. You can estimate number of files based
# on target_file_size_base and target_file_size_multiplier for level-based
# compaction. For universal-style compaction, you can usually set it to -1.
max_open_files=-1
# If true, then DB::Open() will not update the statistics used to optimize
# compaction decision by loading table properties from many files.
# Turning off this feature will improve DBOpen time especially in
# disk environment.
skip_stats_update_on_db_open=false
# If true, an error is raised if the database already exists.
error_if_exists=false
# Number of bytes to preallocate (via fallocate) the manifest
# files. Default is 4mb, which is reasonable to reduce random IO
# as well as prevent overallocation for mounts that preallocate
# large amounts of data (such as xfs's allocsize option).
manifest_preallocation_size=4194304
# NOT SUPPORTED ANYMORE: RocksDB automatically decides this based on the
# value of max_background_jobs. For backwards compatibility we will set
# `max_background_jobs = max_background_compactions + max_background_flushes`
# in the case where user sets at least one of `max_background_compactions` or
# `max_background_flushes`.
#
# Maximum number of concurrent background memtable flush jobs, submitted by
# default to the HIGH priority thread pool. If the HIGH priority thread pool
# is configured to have zero threads, flush jobs will share the LOW priority
# thread pool with compaction jobs.
#
# It is important to use both thread pools when the same Env is shared by
# multiple db instances. Without a separate pool, long running compaction
# jobs could potentially block memtable flush jobs of other db instances,
# leading to unnecessary Put stalls.
#
# If you're increasing this, also consider increasing number of threads in
# HIGH priority thread pool. For more information, see
# Env::SetBackgroundThreads
max_background_flushes=-1
# Prevent child processes from inheriting open files. Default: true
is_fd_close_on_exec=true
# If true, the database will be created if it is missing.
create_if_missing=true
# Use adaptive mutex, which spins in the user space before resorting
# to kernel. This could reduce context switch when the mutex is not
# heavily contended. However, if the mutex is hot, we could end up
# wasting spin time.
use_adaptive_mutex=false
# If true, then the status of the threads involved in this DB will
# be tracked and available via GetThreadList() API.
enable_thread_tracking=false
# if not zero, dump rocksdb.stats to LOG every stats_dump_period_sec
stats_dump_period_sec=600
# Specify the maximal size of the info log file. If the log file
# is larger than `max_log_file_size`, a new info log file will
# be created.
# If max_log_file_size == 0, all logs will be written to one
# log file.
max_log_file_size=0
# If set true, will hint the underlying file system that the file
# access pattern is random, when a sst file is opened.
advise_random_on_open=true
# If true, missing column families will be automatically created.
create_missing_column_families=true
# Time for the info log file to roll (in seconds).
# If specified with non-zero value, log file will be rolled
# if it has been active longer than `log_file_time_to_roll`.
# Default: 0 (disabled)
log_file_time_to_roll=0
# Use O_DIRECT for writes in background flush and compactions.
use_direct_io_for_flush_and_compaction=false
# If non-zero, we perform bigger reads when doing compaction. If you're
# running RocksDB on spinning disks, you should set this to at least 2MB.
# That way RocksDB's compaction is doing sequential instead of random reads.
#
# When non-zero, we also force new_table_reader_for_compaction_inputs to
# true.
compaction_readahead_size=0
# The periodicity with which obsolete files get deleted. The default
# value is 6 hours. The files that go out of scope through the compaction
# process will still be automatically deleted on every compaction,
# regardless of this setting.
delete_obsolete_files_period_micros=21600000000
# Recycle log files.
# If non-zero, we will reuse previously written log files for new
# logs, overwriting the old data. The value indicates how many
# such files we will keep around at any point in time for later
# use. This is more efficient because the blocks are already
# allocated and fdatasync does not need to update the inode after
# each write.
recycle_log_file_num=0
# if not zero, periodically take stats snapshots and store in memory, the
# memory size for stats snapshots is capped at stats_history_buffer_size
stats_history_buffer_size=1048576
# When true, guarantees WAL files have at most `wal_bytes_per_sync`
# bytes submitted for writeback at any given time, and SST files have at most
# `bytes_per_sync` bytes pending writeback at any given time. This can be
# used to handle cases where processing speed exceeds I/O speed during file
# generation, which can lead to a huge sync when the file is finished, even
# with `bytes_per_sync` / `wal_bytes_per_sync` properly configured.
#
# - If `sync_file_range` is supported it achieves this by waiting for any
# prior `sync_file_range`s to finish before proceeding. In this way,
# processing (compression, etc.) can proceed uninhibited in the gap
# between `sync_file_range`s, and we block only when I/O falls behind.
# - Otherwise the `WritableFile::Sync` method is used. Note this mechanism
# always blocks, thus preventing the interleaving of I/O and processing.
#
# Note: Enabling this option does not provide any additional persistence
# guarantees, as it may use `sync_file_range`, which does not write out
# metadata.
strict_bytes_per_sync=false
# Allow the OS to mmap file for reading sst tables. Default: false
allow_mmap_reads=false
# Set this option to true during creation of database if you want
# to be able to ingest behind (call IngestExternalFile() skipping keys
# that already exist, rather than overwriting matching keys).
# Setting this option to true has the following implications:
# 1) Disable some internal optimizations around SST file compression
# 2) Reserve bottom-most level for ingested files only.
# 3) Note that num_levels should be >= 3 if this option is turned on.
allow_ingest_behind=false
# If true, then DB::Open / CreateColumnFamily / DropColumnFamily
# / SetOptions will fail if options file is not detected or properly
# persisted.
fail_if_options_file_error=false
# If true, working thread may avoid doing unnecessary and long-latency
# operation (such as deleting obsolete files directly or deleting memtable)
# and will instead schedule a background job to do it.
# Use it if you're latency-sensitive.
avoid_unnecessary_blocking_io=false
# If false, fallocate() calls are bypassed
allow_fallocate=true
# if set to false then recovery will fail when a prepared
# transaction is encountered in the WAL
allow_2pc=false
# The manifest file is rolled over on reaching this limit.
# The older manifest file will be deleted.
# The default value is 1GB so that the manifest file can grow, but not
# reach the limit of storage capacity.
max_manifest_file_size=1073741824
# The maximum number of microseconds that a write operation will use
# a yielding spin loop to coordinate with other write threads before
# blocking on a mutex. (Assuming write_thread_slow_yield_usec is
# set properly) increasing this value is likely to increase RocksDB
# throughput at the expense of increased CPU usage.
write_thread_max_yield_usec=100
# Enable direct I/O mode for reads/writes.
# These options may or may not improve performance depending on the use case.
#
# Files will be opened in "direct I/O" mode
# which means that data r/w from the disk will not be cached or
# buffered. The hardware buffer of the devices may however still
# be used. Memory mapped files are not impacted by these parameters.
# Use O_DIRECT for user and compaction reads.
# When true, we also force new_table_reader_for_compaction_inputs to true.
use_direct_reads=false
# Recovery mode to control the consistency while replaying WAL.
# Each mode's name is listed below, immediately after its description.
# Original levelDB recovery
#
# We tolerate the last record in any log to be incomplete due to a crash
# while writing it. Zeroed bytes from preallocation are also tolerated in the
# trailing data of any log.
#
# Use case: Applications for which updates, once applied, must not be rolled
# back even after a crash-recovery. In this recovery mode, RocksDB guarantees
# this as long as `WritableFile::Append()` writes are durable. In case the
# user needs the guarantee in more situations (e.g., when
# `WritableFile::Append()` writes to page cache, but the user desires this
# guarantee in face of power-loss crash-recovery), RocksDB offers various
# mechanisms to additionally invoke `WritableFile::Sync()` in order to
# strengthen the guarantee.
#
# This differs from `kPointInTimeRecovery` in that, in case a corruption is
# detected during recovery, this mode will refuse to open the DB. Whereas,
# `kPointInTimeRecovery` will stop recovery just before the corruption since
# that is a valid point-in-time to which to recover.
#
# `kTolerateCorruptedTailRecords`
# Recover from clean shutdown
# We don't expect to find any corruption in the WAL
# Use case : This is ideal for unit tests and rare applications that
# can require high consistency guarantee
# `kAbsoluteConsistency`
# Recover to point-in-time consistency (default)
# We stop the WAL playback on discovering WAL inconsistency
# Use case : Ideal for systems that have disk controller cache like
# hard disk, SSD without super capacitor that store related data
# `kPointInTimeRecovery`
# Recovery after a disaster
# We ignore any corruption in the WAL and try to salvage as much data as
# possible
# Use case : Ideal for last ditch effort to recover data or systems that
# operate with low grade unrelated data
# `kSkipAnyCorruptedRecords`
wal_recovery_mode=kPointInTimeRecovery
# If true, always create a new file descriptor and new table reader
# for compaction inputs. Turning this parameter on may introduce extra
# memory usage in the table reader, if it allocates extra memory
# for indexes. This will allow file descriptor prefetch options
# to be set for compaction input files and not to impact file
# descriptors for the same file used by user queries.
# Suggest to enable BlockBasedTableOptions.cache_index_and_filter_blocks
# for this mode if using block-based table.
new_table_reader_for_compaction_inputs=false
# Allow the OS to mmap file for writing.
# DB::SyncWAL() only works if this is set to false.
allow_mmap_writes=false