-
Notifications
You must be signed in to change notification settings - Fork 3.8k
/
engine.go
1809 lines (1706 loc) · 77.9 KB
/
engine.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
// Copyright 2014 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.
package storage
import (
"bytes"
"context"
"time"
"github.com/cockroachdb/cockroach/pkg/keys"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/settings"
"github.com/cockroachdb/cockroach/pkg/settings/cluster"
"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
"github.com/cockroachdb/cockroach/pkg/storage/fs"
"github.com/cockroachdb/cockroach/pkg/util/envutil"
"github.com/cockroachdb/cockroach/pkg/util/hlc"
"github.com/cockroachdb/cockroach/pkg/util/iterutil"
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/cockroachdb/cockroach/pkg/util/log/eventpb"
"github.com/cockroachdb/cockroach/pkg/util/protoutil"
"github.com/cockroachdb/cockroach/pkg/util/uuid"
"github.com/cockroachdb/errors"
"github.com/cockroachdb/pebble"
prometheusgo "github.com/prometheus/client_model/go"
)
// DefaultStorageEngine represents the default storage engine to use. It is
// populated by this file's init function from the COCKROACH_STORAGE_ENGINE
// environment variable, with "pebble" passed as the default value.
var DefaultStorageEngine enginepb.EngineType
func init() {
_ = DefaultStorageEngine.Set(envutil.EnvOrDefaultString("COCKROACH_STORAGE_ENGINE", "pebble"))
}
// SimpleMVCCIterator is an interface for iterating over key/value pairs in an
// engine. SimpleMVCCIterator implementations are thread safe unless otherwise
// noted. SimpleMVCCIterator is a subset of the functionality offered by
// MVCCIterator.
//
// API invariants are asserted via assertSimpleMVCCIteratorInvariants().
//
// The iterator exposes both point keys and range keys. Range keys are only
// emitted when enabled via IterOptions.KeyTypes. Currently, all range keys are
// MVCC range tombstones, and this is enforced during writes.
//
// Range keys and point keys exist separately in Pebble. A specific key position
// can have both a point key and multiple range keys overlapping it. Their
// properties are accessed via:
//
// HasPointAndRange(): Key types present at the current position.
// UnsafeKey(): Current position (point key if any).
// UnsafeValue(): Current point key value (if any).
// RangeBounds(): Start,end bounds of range keys at current position.
// RangeKeys(): All range keys/values overlapping current position.
//
// Consider the following point keys and range keys:
//
// 4: a4 b4
// 3: [-------)
// 2: [-------)
// 1: b1 c1
// a b c
//
// Range keys cover a span between two roachpb.Key bounds (start inclusive, end
// exclusive) and contain timestamp/value pairs. They overlap *all* point key
// versions within their key bounds regardless of timestamp. For example, when
// the iterator is positioned on b@4, it will also expose [a-c)@3 and [a-c)@2.
//
// During iteration with IterKeyTypePointsAndRanges, range keys are emitted at
// their start key and at every overlapping point key. For example, iterating
// across the above span would emit this sequence:
//
// UnsafeKey HasPointAndRange UnsafeValue RangeKeys
// a false,true - [a-c)@3 [a-c)@2
// a@4 true,true a4 [a-c)@3 [a-c)@2
// b@4 true,true b4 [a-c)@3 [a-c)@2
// b@1 true,true b1 [a-c)@3 [a-c)@2
// c@1 true,false c1 -
//
// MVCCIterator reverse iteration yields the above sequence in reverse.
// Notably, bare range keys are still emitted at their start key (not end key),
// so they will be emitted last in this example.
//
// When using SeekGE within range key bounds, the iterator may land on the bare
// range key first, unless seeking exactly to an existing point key. E.g.:
//
// SeekGE UnsafeKey HasPointAndRange UnsafeValue RangeKeys
// b b false,true - [a-c)@3 [a-c)@2
// b@5 b@5 false,true - [a-c)@3 [a-c)@2
// b@4 b@4 true,true b4 [a-c)@3 [a-c)@2
// b@3 b@3 false,true - [a-c)@3 [a-c)@2
//
// Note that intents (with timestamp 0) encode to a bare roachpb.Key, so they
// will be colocated with a range key start bound. For example, if there was an
// intent on a in the above example, then both SeekGE(a) and forward iteration
// would land on a@0 and [a-c)@3,[a-c)@2 simultaneously, instead of the bare
// range keys first.
//
// Range keys do not have a stable, discrete identity, and should be
// considered a continuum: they may be merged or fragmented by other range key
// writes, split and merged along with CRDB ranges, partially removed by GC,
// and truncated by iterator bounds.
//
// Range keys are fragmented by Pebble such that all overlapping range keys
// between two keys form a stack of range key fragments at different timestamps.
// For example, writing [a-e)@1 and [c-g)@2 will yield this fragment structure:
//
// 2: |---|---|
// 1: |---|---|
// a c e g
//
// Fragmentation makes all range key properties local, which avoids incurring
// unnecessary access costs across SSTs and CRDB ranges. It is deterministic
// on the current range key state, and does not depend on write history.
// Stacking allows easy access to all range keys overlapping a point key.
//
// For more information on MVCC range keys, see this tech note:
// https://github.com/cockroachdb/cockroach/blob/master/docs/tech-notes/mvcc-range-tombstones.md
type SimpleMVCCIterator interface {
// Close frees up resources held by the iterator.
Close()
// SeekGE advances the iterator to the first key in the engine which is >= the
// provided key. This may be in the middle of a bare range key straddling the
// seek key.
SeekGE(key MVCCKey)
// Valid must be called after any call to Seek(), Next(), Prev(), or
// similar methods. It returns (true, nil) if the iterator points to
// a valid key (it is undefined to call Key(), Value(), or similar
// methods unless Valid() has returned (true, nil)). It returns
// (false, nil) if the iterator has moved past the end of the valid
// range, or (false, err) if an error has occurred. Valid() will
// never return true with a non-nil error.
Valid() (bool, error)
// Next advances the iterator to the next key in the iteration. After this
// call, Valid() will be true if the iterator was not positioned at the last
// key.
Next()
// NextKey advances the iterator to the next MVCC key. This operation is
// distinct from Next which advances to the next version of the current key
// or the next key if the iterator is currently located at the last version
// for a key. NextKey must not be used to switch iteration direction from
// reverse iteration to forward iteration.
//
// If NextKey() lands on a bare range key, it is possible that there exists a
// versioned point key at the start key too. Calling NextKey() again would
// skip over this point key, since the start key was already emitted. If the
// caller wants to see it, it must call Next() to check for it. Note that
// this is not the case with intents: they don't have a timestamp, so the
// encoded key is identical to the range key's start bound, and they will
// be emitted together at that position.
NextKey()
// UnsafeKey returns the current key position. This may be a point key, or
// the current position inside a range key (typically the start key
// or the seek key when using SeekGE within its bounds).
//
// The memory is invalidated on the next call to {Next,NextKey,Prev,SeekGE,
// SeekLT,Close}. Use Key() if this is undesirable.
UnsafeKey() MVCCKey
// UnsafeValue returns the current point key value as a byte slice.
// This must only be called when it is known that the iterator is positioned
// at a point value, i.e. HasPointAndRange has returned (true, *). If
// possible, use MVCCValueLenAndIsTombstone() instead.
//
// The memory is invalidated on the next call to {Next,NextKey,Prev,SeekGE,SeekLT,Close}.
// Use Value() if that is undesirable.
UnsafeValue() ([]byte, error)
// MVCCValueLenAndIsTombstone should be called only for MVCC (i.e.,
// UnsafeKey().IsValue()) point values, when the actual point value is not
// needed, for example when updating stats and making GC decisions, and it
// is sufficient for the caller to know the length (len(UnsafeValue())), and
// whether the underlying MVCCValue is a tombstone
// (MVCCValue.IsTombstone()). This is an optimization that can allow the
// underlying storage layer to avoid retrieving the value.
//
// The results are, in order: the value length, whether the value is a
// tombstone, and an error.
// REQUIRES: HasPointAndRange() has returned (true, *).
MVCCValueLenAndIsTombstone() (int, bool, error)
// ValueLen can be called for MVCC or non-MVCC values, when only the value
// length is needed. This is an optimization that can allow the underlying
// storage layer to avoid retrieving the value.
// REQUIRES: HasPointAndRange() has returned (true, *).
ValueLen() int
// HasPointAndRange returns whether the current iterator position has a point
// key and/or a range key. The first result reports a point key and the
// second a range key. Must check Valid() first. At least one of these
// will always be true for a valid iterator. For details on range keys, see
// comment on SimpleMVCCIterator.
HasPointAndRange() (bool, bool)
// RangeBounds returns the range bounds for the current range key, or an
// empty span if there are none. The returned keys are valid until the
// range key changes, see RangeKeyChanged().
RangeBounds() roachpb.Span
// RangeKeys returns a stack of all range keys (with different timestamps) at
// the current key position. When at a point key, it will return all range
// keys overlapping that point key. The stack is valid until the range key
// changes, see RangeKeyChanged().
//
// For details on range keys, see SimpleMVCCIterator comment, or tech note:
// https://github.com/cockroachdb/cockroach/blob/master/docs/tech-notes/mvcc-range-tombstones.md
RangeKeys() MVCCRangeKeyStack
// RangeKeyChanged returns true if the previous seek or step moved to a
// different range key (or none at all). This includes an exhausted iterator.
RangeKeyChanged() bool
}
// IteratorStats is returned from {MVCCIterator,EngineIterator}.Stats.
type IteratorStats struct {
// Iteration stats. We directly expose pebble.IteratorStats. Callers
// may want to aggregate and interpret these in the following manner:
// - Aggregate {Forward,Reverse}SeekCount, {Forward,Reverse}StepCount.
// - Interpret the four aggregated stats as follows:
// - {SeekCount,StepCount}[InterfaceCall]: We can refer to these simply as
// {SeekCount,StepCount} in logs/metrics/traces. These represent
// explicit calls by the implementation of MVCCIterator, in response to
// the caller of MVCCIterator. A high count relative to the unique MVCC
// keys returned suggests there are many versions for the same key.
// - {SeekCount,StepCount}[InternalIterCall]: We can refer to these simply
// as {InternalSeekCount,InternalStepCount}. If these are significantly
// larger than the ones in the preceding bullet, it suggests that there
// are lots of uncompacted deletes or stale Pebble-versions (not to be
// confused with MVCC versions) that need to be compacted away. This
// should be very rare, but has been observed.
Stats pebble.IteratorStats
}
// MVCCIterator is an interface for iterating over key/value pairs in an
// engine. It is used for iterating over the key space that can have multiple
// versions, and is often also used (due to historical reasons) for iterating
// over the key space that never has multiple versions (i.e.,
// MVCCKey.Timestamp.IsEmpty()).
//
// MVCCIterator implementations are thread safe unless otherwise noted. API
// invariants are asserted via assertMVCCIteratorInvariants().
//
// For details on range keys and iteration, see comment on SimpleMVCCIterator.
type MVCCIterator interface {
SimpleMVCCIterator
// SeekLT advances the iterator to the first key in the engine which is < the
// provided key. Unlike SeekGE, when calling SeekLT within range key bounds
// this will not land on the seek key, but rather on the closest point key
// overlapping the range key or the range key's start bound.
SeekLT(key MVCCKey)
// Prev moves the iterator backward to the previous key in the iteration.
// After this call, Valid() will be true if the iterator was not positioned at
// the first key.
Prev()
// SeekIntentGE is a specialized version of SeekGE(MVCCKey{Key: key}), when
// the caller expects to find an intent, and additionally has the txnUUID
// for the intent it is looking for. When running with separated intents,
// this can optimize the behavior of the underlying Engine for write heavy
// keys by avoiding the need to iterate over many deleted intents.
SeekIntentGE(key roachpb.Key, txnUUID uuid.UUID)
// Key is like UnsafeKey, but returns memory now owned by the caller.
Key() MVCCKey
// UnsafeRawKey returns the current raw key which could be an encoded
// MVCCKey, or the more general EngineKey (for a lock table key).
// This is a low-level and dangerous method since it will expose the
// raw key of the lock table, i.e., the intentInterleavingIter will not
// hide the difference between interleaved and separated intents.
// Callers should be very careful when using this. This is currently
// only used by callers who are iterating and deleting all data in a
// range.
UnsafeRawKey() []byte
// UnsafeRawMVCCKey returns a serialized MVCCKey. The memory is invalidated
// on the next call to {Next,NextKey,Prev,SeekGE,SeekLT,Close}. If the
// iterator is currently positioned at a separated intent (when
// intentInterleavingIter is used), it makes that intent look like an
// interleaved intent key, i.e., an MVCCKey with an empty timestamp. This is
// currently used by callers who pass around key information as a []byte --
// this seems avoidable, and we should consider cleaning up the callers.
UnsafeRawMVCCKey() []byte
// Value is like UnsafeValue, but returns memory owned by the caller.
Value() ([]byte, error)
// ValueProto unmarshals the value the iterator is currently
// pointing to using a protobuf decoder.
ValueProto(msg protoutil.Message) error
// FindSplitKey finds a key from the given span such that the left side of the
// split is roughly targetSize bytes. It only considers MVCC point keys, not
// range keys. The returned key will never be chosen from the key ranges
// listed in keys.NoSplitSpans and will always sort equal to or after
// minSplitKey.
//
// DO NOT CALL directly (except in wrapper MVCCIterator implementations). Use the
// package-level MVCCFindSplitKey instead. For correct operation, the caller
// must set the upper bound on the iterator before calling this method.
FindSplitKey(start, end, minSplitKey roachpb.Key, targetSize int64) (MVCCKey, error)
// Stats returns statistics about the iterator.
Stats() IteratorStats
// IsPrefix returns true if the MVCCIterator is a prefix iterator, i.e.
// created with IterOptions.Prefix enabled.
IsPrefix() bool
}
// EngineIterator is an iterator over key-value pairs where the key is
// an EngineKey.
type EngineIterator interface {
// Close frees up resources held by the iterator.
Close()
// SeekEngineKeyGE advances the iterator to the first key in the engine
// which is >= the provided key.
SeekEngineKeyGE(key EngineKey) (valid bool, err error)
// SeekEngineKeyLT advances the iterator to the first key in the engine
// which is < the provided key.
SeekEngineKeyLT(key EngineKey) (valid bool, err error)
// NextEngineKey advances the iterator to the next key/value in the
// iteration. After this call, valid will be true if the iterator was not
// originally positioned at the last key. Note that unlike
// MVCCIterator.NextKey, this method does not skip other versions with the
// same EngineKey.Key.
// TODO(sumeer): change MVCCIterator.Next() to match the
// return values, change all its callers, and rename this
// to Next().
NextEngineKey() (valid bool, err error)
// PrevEngineKey moves the iterator backward to the previous key/value in
// the iteration. After this call, valid will be true if the iterator was
// not originally positioned at the first key.
PrevEngineKey() (valid bool, err error)
// HasPointAndRange returns whether the iterator is positioned on a point or
// range key (shared with MVCCIterator interface). The first result reports
// a point key and the second a range key.
HasPointAndRange() (bool, bool)
// EngineRangeBounds returns the current range key bounds.
EngineRangeBounds() (roachpb.Span, error)
// EngineRangeKeys returns the engine range keys at the current position.
EngineRangeKeys() []EngineRangeKeyValue
// RangeKeyChanged returns true if the previous seek or step moved to a
// different range key (or none at all). This includes an exhausted iterator.
RangeKeyChanged() bool
// UnsafeEngineKey returns the same value as EngineKey, but the memory is
// invalidated on the next call to {Next,NextKey,Prev,SeekGE,SeekLT,Close}.
// REQUIRES: latest positioning function returned valid=true.
UnsafeEngineKey() (EngineKey, error)
// EngineKey returns the current key.
// REQUIRES: latest positioning function returned valid=true.
EngineKey() (EngineKey, error)
// UnsafeRawEngineKey returns the current raw (encoded) key corresponding to
// EngineKey. This is a low-level method and callers should avoid using
// it. This is currently only used by intentInterleavingIter to implement
// UnsafeRawKey.
UnsafeRawEngineKey() []byte
// UnsafeValue returns the same value as Value, but the memory is
// invalidated on the next call to {Next,NextKey,Prev,SeekGE,SeekLT,Close}.
// REQUIRES: latest positioning function returned valid=true.
UnsafeValue() ([]byte, error)
// Value returns the current value as a byte slice.
// REQUIRES: latest positioning function returned valid=true.
Value() ([]byte, error)
// GetRawIter is a low-level method only for use in the storage package,
// that returns the underlying pebble Iterator.
GetRawIter() *pebble.Iterator
// SeekEngineKeyGEWithLimit is similar to SeekEngineKeyGE, but takes an
// additional exclusive upper limit parameter. The limit is semantically
// best-effort, and is an optimization to avoid O(n^2) iteration behavior in
// some pathological situations (uncompacted deleted locks).
SeekEngineKeyGEWithLimit(key EngineKey, limit roachpb.Key) (state pebble.IterValidityState, err error)
// SeekEngineKeyLTWithLimit is similar to SeekEngineKeyLT, but takes an
// additional inclusive lower limit parameter. The limit is semantically
// best-effort, and is an optimization to avoid O(n^2) iteration behavior in
// some pathological situations (uncompacted deleted locks).
SeekEngineKeyLTWithLimit(key EngineKey, limit roachpb.Key) (state pebble.IterValidityState, err error)
// NextEngineKeyWithLimit is similar to NextEngineKey, but takes an
// additional exclusive upper limit parameter. The limit is semantically
// best-effort, and is an optimization to avoid O(n^2) iteration behavior in
// some pathological situations (uncompacted deleted locks).
NextEngineKeyWithLimit(limit roachpb.Key) (state pebble.IterValidityState, err error)
// PrevEngineKeyWithLimit is similar to PrevEngineKey, but takes an
// additional inclusive lower limit parameter. The limit is semantically
// best-effort, and is an optimization to avoid O(n^2) iteration behavior in
// some pathological situations (uncompacted deleted locks).
PrevEngineKeyWithLimit(limit roachpb.Key) (state pebble.IterValidityState, err error)
// Stats returns statistics about the iterator.
Stats() IteratorStats
}
// IterOptions contains options used to create an {MVCC,Engine}Iterator.
//
// For performance, every {MVCC,Engine}Iterator must specify either Prefix or
// UpperBound.
type IterOptions struct {
// If Prefix is true, Seek will use the user-key prefix of the supplied
// {MVCC,Engine}Key (the Key field) to restrict which sstables are searched,
// but iteration (using Next) over keys without the same user-key prefix
// will not work correctly (keys may be skipped).
Prefix bool
// LowerBound gives this iterator an inclusive lower bound. Attempts to
// SeekLT or Prev to a key that is strictly less than the bound will
// invalidate the iterator.
LowerBound roachpb.Key
// UpperBound gives this iterator an exclusive upper bound. Attempts to Seek
// or Next to a key that is greater than or equal to the bound will invalidate
// the iterator. UpperBound must be provided unless Prefix is true, in which
// case the end of the prefix will be used as the upper bound.
UpperBound roachpb.Key
// If WithStats is true, the iterator accumulates performance
// counters over its lifetime which can be queried via `Stats()`.
WithStats bool
// MinTimestampHint and MaxTimestampHint, if set, indicate that keys outside
// of the time range formed by [MinTimestampHint, MaxTimestampHint] do not
// need to be presented by the iterator. The underlying iterator may be able
// to efficiently skip over keys outside of the hinted time range, e.g., when
// an SST indicates that it contains no keys within the time range. Intents
// will not be visible to such iterators at all. This is only relevant for
// MVCCIterators.
//
// Note that time bound hints are strictly a performance optimization, and
// iterators with time bounds hints will frequently return keys outside of the
// [start, end] time range. If you must guarantee that you never see a key
// outside of the time bounds, perform your own filtering.
//
// NB: The iterator may surface stale data. Pebble range tombstones do not have
// timestamps and thus may be ignored entirely depending on whether their SST
// happens to satisfy the filter. Furthermore, keys outside the timestamp
// range may be stale and must be ignored -- for example, consider a key foo@5
// written in an SST with timestamp range [3-7], and then a non-MVCC removal
// or update of this key in a different SST with timestamp range [3-5]. Using
// an iterator with range [6-9] would surface the old foo@5 key because it
// would return all keys in the old [3-7] SST but not take into account the
// separate [3-5] SST where foo@5 was removed or updated. See also:
// https://github.com/cockroachdb/pebble/issues/1786
//
// NB: Range keys are not currently subject to timestamp filtering due to
// complications with MVCCIncrementalIterator. See:
// https://github.com/cockroachdb/cockroach/issues/86260
//
// Currently, the only way to correctly use such an iterator is to use it in
// concert with an iterator without timestamp hints, as done by
// MVCCIncrementalIterator.
MinTimestampHint, MaxTimestampHint hlc.Timestamp
// KeyTypes specifies the types of keys to surface: point and/or range keys.
// Use HasPointAndRange() to determine which key type is present at a given
// iterator position, and RangeBounds() and RangeKeys() to access range keys.
// Defaults to IterKeyTypePointsOnly. For more details on range keys, see
// comment on SimpleMVCCIterator.
KeyTypes IterKeyType
// RangeKeyMaskingBelow enables masking (hiding) of point keys by range keys.
// Any range key with a timestamp at or below RangeKeyMaskingBelow
// will mask point keys below it, preventing them from being surfaced.
// Consider the following example:
//
// 4 o---------------o RangeKeyMaskingBelow=4 emits b3
// 3 b3 d3 RangeKeyMaskingBelow=3 emits b3,d3,f2
// 2 o---------------o f2 RangeKeyMaskingBelow=2 emits b3,d3,f2
// 1 a1 b1 o-------o RangeKeyMaskingBelow=1 emits a1,b3,b1,d3,f2
// a b c d e f g
//
// Range keys themselves are not affected by the masking, and will be
// emitted as normal.
RangeKeyMaskingBelow hlc.Timestamp
// useL6Filters allows the caller to opt into reading filter blocks for
// L6 sstables. Only for use with Prefix = true. Helpful if a lot of prefix
// Seeks are expected in quick succession, that are also likely to not
// yield a single key. Filter blocks in L6 can be relatively large, often
// larger than data blocks, so the benefit of loading them in the cache
// is minimized if the probability of the key existing is not low or if
// this is a one-time Seek (where loading the data block directly is better).
useL6Filters bool
}
// IterKeyType configures which types of keys an iterator should surface. It
// is an alias for pebble.IterKeyType.
//
// TODO(erikgrinaker): Combine this with MVCCIterKind somehow.
type IterKeyType = pebble.IterKeyType
const (
// IterKeyTypePointsOnly iterates over point keys only. This is the default
// for IterOptions.KeyTypes.
IterKeyTypePointsOnly = pebble.IterKeyTypePointsOnly
// IterKeyTypePointsAndRanges iterates over both point and range keys.
IterKeyTypePointsAndRanges = pebble.IterKeyTypePointsAndRanges
// IterKeyTypeRangesOnly iterates over only range keys.
IterKeyTypeRangesOnly = pebble.IterKeyTypeRangesOnly
)
// MVCCIterKind is used to inform Reader about the kind of iteration desired
// by the caller. The defined kinds are MVCCKeyAndIntentsIterKind and
// MVCCKeyIterKind.
type MVCCIterKind int
// "Intent" refers to non-inline meta, that can be interleaved or separated.
const (
// MVCCKeyAndIntentsIterKind specifies that intents must be seen, and appear
// interleaved with keys, even if they are in a separated lock table.
// Iterators of this kind are not allowed to span from local to global keys,
// since the physical layout has the separated lock table in-between the
// local and global keys. These iterators do strict error checking and panic
// if the caller seems to be trying to violate this constraint.
// Specifically:
// - If both bounds are set they must not span from local to global.
// - Any bound (lower or upper), constrains the iterator for its lifetime to
// one of local or global keys. The iterator will not tolerate a seek that
// violates this constraint.
// We could, with significant code complexity, not constrain an iterator for
// its lifetime, and allow a seek that specifies a global (local) key to
// change the constraint to global (local). This would allow reuse of the
// same iterator with a large global upper-bound. But a Next call on the
// highest local key (Prev on the lowest global key) would still not be able
// to transparently skip over the intermediate lock table. We deem that
// behavior to be more surprising and bug-prone (for the caller), than being
// strict.
MVCCKeyAndIntentsIterKind MVCCIterKind = iota
// MVCCKeyIterKind specifies that the caller does not need to see intents.
// Any interleaved intents may be seen, but no correctness properties are
// derivable from such partial knowledge of intents. NB: this is a performance
// optimization when iterating over (a) MVCC keys where the caller does
// not need to see intents, (b) a key space that is known to not have multiple
// versions (and therefore will never have intents), like the raft log.
MVCCKeyIterKind
)
// Reader is the read interface to an engine's data. Certain implementations
// of Reader guarantee consistency of the underlying engine state across the
// different iterators created by NewMVCCIterator, NewEngineIterator:
//   - pebbleSnapshot, because it uses an engine snapshot.
//   - pebbleReadOnly, pebbleBatch: when the IterOptions do not specify a
//     timestamp hint (see IterOptions). Note that currently the engine state
//     visible here is not as of the time of the Reader creation. It is the
//     time when the first iterator is created, or earlier if
//     PinEngineStateForIterators is called.
//
// The ConsistentIterators method returns true when this consistency is
// guaranteed by the Reader.
//
// TODO(sumeer): this partial consistency can be a source of bugs if future
// code starts relying on it, but rarely uses a Reader that does not guarantee
// it. Can we enumerate the current cases where KV uses Engine as a Reader?
type Reader interface {
// Close closes the reader, freeing up any outstanding resources. Note that
// various implementations have slightly different behaviors. In particular,
// Distinct() batches release their parent batch for future use while
// Engines, Snapshots and Batches free the associated C++ resources.
//
// NOTE(review): the references to Distinct() batches and C++ resources may
// be stale; neither is visible in this part of the file -- confirm.
Close()
// Closed returns true if the reader has been closed or is not usable.
// Objects backed by this reader (e.g. Iterators) can check this to ensure
// that they are not using a closed engine. Intended for use within package
// engine; exported to enable wrappers to exist in other packages.
//
// NOTE(review): "package engine" may refer to an older package name --
// confirm.
Closed() bool
// MVCCIterate scans from the start key to the end key (exclusive), invoking
// the function f on each key value pair. The inputs are copies, and safe to
// retain beyond the function call. It supports interleaved iteration over
// point and/or range keys, providing any overlapping range keys for each
// point key if requested (presumably selected via keyTypes; see
// IterKeyType). If f returns an error or if the scan itself encounters an
// error, the iteration will stop and return the error.
//
// Note that this method is not expected to take into account the timestamp
// of the end key; all MVCCKeys at end.Key are considered excluded in the
// iteration.
MVCCIterate(start, end roachpb.Key, iterKind MVCCIterKind, keyTypes IterKeyType,
f func(MVCCKeyValue, MVCCRangeKeyStack) error) error
// NewMVCCIterator returns a new instance of an MVCCIterator over this engine.
// The caller must invoke Close() on it when done to free resources.
//
// Write visibility semantics:
//
//  1. An iterator has a consistent view of the reader as of the time of its
//     creation. Subsequent writes are never visible to it.
//
//  2. All iterators on readers with ConsistentIterators=true have a
//     consistent view of the _engine_ (not reader) as of the time of the
//     first iterator creation or PinEngineStateForIterators call: newer
//     engine writes are never visible. The opposite holds for
//     ConsistentIterators=false: new iterators see the most recent engine
//     state at the time of their creation.
//
//  3. Iterators on unindexed batches never see batch writes, but satisfy
//     ConsistentIterators for engine write visibility.
//
//  4. Iterators on indexed batches see all batch writes as of their creation
//     time, but they satisfy ConsistentIterators for engine writes.
NewMVCCIterator(iterKind MVCCIterKind, opts IterOptions) MVCCIterator
// NewEngineIterator returns a new instance of an EngineIterator over this
// engine. The caller must invoke EngineIterator.Close() when finished
// with the iterator to free resources. The caller can change IterOptions
// after this function returns.
NewEngineIterator(opts IterOptions) EngineIterator
// ConsistentIterators returns true if the Reader implementation guarantees
// that the different iterators constructed by this Reader will see the same
// underlying Engine state. This is not true about Batch writes: new iterators
// will see new writes made to the batch, existing iterators won't.
ConsistentIterators() bool
// SupportsRangeKeys returns true if the Reader implementation supports
// range keys.
//
// TODO(erikgrinaker): Remove this after 22.2.
SupportsRangeKeys() bool
// PinEngineStateForIterators ensures that the state seen by iterators
// without timestamp hints (see IterOptions) is pinned and will not see
// future mutations. It can be called multiple times on a Reader, in which
// case the state seen will be either:
//   - As of the first call.
//   - For a Reader returned by Engine.NewSnapshot, the pinned state is as of
//     the time the snapshot was taken.
//
// So the semantics that hold for all Readers are that the pinned state is
// somewhere in the time interval between the creation of the Reader and the
// first call to PinEngineStateForIterators.
//
// REQUIRES: ConsistentIterators returns true.
PinEngineStateForIterators() error
}
// Writer is the write interface to an engine's data. Unless a method's
// comment says otherwise, the Clear* methods physically remove entries from
// the storage engine rather than writing MVCC tombstones.
type Writer interface {
// ApplyBatchRepr atomically applies a set of batched updates. Created by
// calling Repr() on a batch. Using this method is equivalent to constructing
// and committing a batch whose Repr() equals repr. If sync is true, the
// batch is synchronously written to disk. It is an error to specify
// sync=true if the Writer is a Batch.
//
// It is safe to modify the contents of the arguments after ApplyBatchRepr
// returns.
ApplyBatchRepr(repr []byte, sync bool) error
// ClearMVCC removes the point key with the given MVCCKey from the db. It does
// not affect range keys. It requires that the timestamp is non-empty (see
// ClearUnversioned or ClearIntent if the timestamp is empty). Note that clear
// actually removes entries from the storage engine, rather than inserting
// MVCC tombstones.
//
// It is safe to modify the contents of the arguments after it returns.
ClearMVCC(key MVCCKey) error
// ClearUnversioned removes an unversioned item from the db. It is for use
// with inline metadata (not intents) and other unversioned keys (like
// Range-ID local keys). It does not affect range keys.
//
// It is safe to modify the contents of the arguments after it returns.
ClearUnversioned(key roachpb.Key) error
// ClearIntent removes an intent from the db. Unlike ClearMVCC and
// ClearUnversioned, this is a higher-level method that may make changes in
// parts of the key space that are not only a function of the input, and may
// choose to use a single-clear under the covers. txnDidNotUpdateMeta allows
// for performance optimization when set to true, and has semantics defined in
// MVCCMetadata.TxnDidNotUpdateMeta (it can be conservatively set to false).
//
// It is safe to modify the contents of the arguments after it returns.
//
// TODO(sumeer): after the full transition to separated locks, measure the
// cost of a PutIntent implementation, where there is an existing intent,
// that does a <single-clear, put> pair. If there isn't a performance
// decrease, we can stop tracking txnDidNotUpdateMeta and still optimize
// ClearIntent by always doing single-clear.
ClearIntent(key roachpb.Key, txnDidNotUpdateMeta bool, txnUUID uuid.UUID) error
// ClearEngineKey removes the given point key from the engine. It does not
// affect range keys. Note that clear actually removes entries from the
// storage engine. This is a general-purpose and low-level method that should
// be used sparingly, only when the other Clear* methods are not applicable.
//
// It is safe to modify the contents of the arguments after it returns.
ClearEngineKey(key EngineKey) error
// ClearRawRange removes point and/or range keys from start (inclusive) to end
// (exclusive) using Pebble range tombstones. It can be applied to a range
// consisting of MVCCKeys or the more general EngineKeys -- it simply uses the
// roachpb.Key parameters as the Key field of an EngineKey. This implies that
// it does not clear intents unless the intent lock table is targeted
// explicitly.
//
// Similar to the other Clear* methods, this method actually removes entries
// from the storage engine. It is safe to modify the contents of the arguments
// after it returns.
ClearRawRange(start, end roachpb.Key, pointKeys, rangeKeys bool) error
// ClearMVCCRange removes MVCC point and/or range keys (including intents)
// from start (inclusive) to end (exclusive) using Pebble range tombstones.
//
// Similar to the other Clear* methods, this method actually removes entries
// from the storage engine. It is safe to modify the contents of the arguments
// after it returns.
ClearMVCCRange(start, end roachpb.Key, pointKeys, rangeKeys bool) error
// ClearMVCCVersions removes MVCC point key versions from start (inclusive) to
// end (exclusive) using a Pebble range tombstone. It is meant for efficiently
// clearing a subset of versions of a key, since the parameters are MVCCKeys
// and not roachpb.Keys, but it can also be used across multiple keys. It will
// ignore intents and range keys, leaving them in place.
//
// Similar to the other Clear* methods, this method actually removes entries
// from the storage engine. It is safe to modify the contents of the arguments
// after it returns.
ClearMVCCVersions(start, end MVCCKey) error
// ClearMVCCIteratorRange removes all point and/or range keys in the given
// span using an MVCC iterator, by clearing individual keys (including
// intents).
//
// Similar to the other Clear* methods, this method actually removes entries
// from the storage engine. It is safe to modify the contents of the arguments
// after it returns.
//
// TODO(erikgrinaker): This should be a separate function rather than an
// interface method, but we keep it for now to make use of UnsafeRawKey() when
// clearing keys.
ClearMVCCIteratorRange(start, end roachpb.Key, pointKeys, rangeKeys bool) error
// ClearMVCCRangeKey deletes an MVCC range key from start (inclusive) to end
// (exclusive) at the given timestamp. For any range key that straddles the
// start and end boundaries, only the segments within the boundaries will be
// cleared. Range keys at other timestamps are unaffected. Clears are
// idempotent.
//
// This method is primarily intended for MVCC garbage collection and similar
// internal use.
ClearMVCCRangeKey(rangeKey MVCCRangeKey) error
// PutMVCCRangeKey writes an MVCC range key. It will replace any overlapping
// range keys at the given timestamp (even partial overlap). Only MVCC range
// tombstones, i.e. an empty value, are currently allowed (other kinds will
// need additional handling in MVCC APIs and elsewhere, e.g. stats and GC).
//
// Range keys must be accessed using special iterator options and methods,
// see SimpleMVCCIterator.RangeKeys() for details.
//
// For more information on MVCC range keys, see this tech note:
// https://github.com/cockroachdb/cockroach/blob/master/docs/tech-notes/mvcc-range-tombstones.md
PutMVCCRangeKey(MVCCRangeKey, MVCCValue) error
// PutRawMVCCRangeKey is like PutMVCCRangeKey, but accepts an encoded
// MVCCValue. It can be used to avoid decoding and immediately re-encoding an
// MVCCValue, but should generally be avoided due to the lack of type safety.
//
// It is safe to modify the contents of the arguments after PutRawMVCCRangeKey
// returns.
PutRawMVCCRangeKey(MVCCRangeKey, []byte) error
// PutEngineRangeKey sets the given range key to the values provided. This is
// a general-purpose and low-level method that should be used sparingly, only
// when the other Put* methods are not applicable.
//
// It is safe to modify the contents of the arguments after it returns.
PutEngineRangeKey(start, end roachpb.Key, suffix, value []byte) error
// ClearEngineRangeKey clears the given range key. This is a general-purpose
// and low-level method that should be used sparingly, only when the other
// Clear* methods are not applicable.
//
// It is safe to modify the contents of the arguments after it returns.
ClearEngineRangeKey(start, end roachpb.Key, suffix []byte) error
// Merge is a high-performance write operation used for values which are
// accumulated over several writes. Multiple values can be merged
// sequentially into a single key; a subsequent read will return a "merged"
// value which is computed from the original merged values. We only
// support Merge for keys with no version.
//
// Merge currently provides specialized behavior for three data types:
// integers, byte slices, and time series observations. Merged integers are
// summed, acting as a high-performance accumulator. Byte slices are simply
// concatenated in the order they are merged. Time series observations
// (stored as byte slices with a special tag on the roachpb.Value) are
// combined with specialized logic beyond that of simple byte slices.
//
// It is safe to modify the contents of the arguments after Merge returns.
Merge(key MVCCKey, value []byte) error
// PutMVCC sets the given key to the value provided. It requires that the
// timestamp is non-empty (see {PutUnversioned,PutIntent} if the timestamp
// is empty).
//
// It is safe to modify the contents of the arguments after PutMVCC returns.
PutMVCC(key MVCCKey, value MVCCValue) error
// PutRawMVCC is like PutMVCC, but it accepts an encoded MVCCValue. It
// can be used to avoid decoding and immediately re-encoding an MVCCValue,
// but should generally be avoided due to the lack of type safety.
//
// It is safe to modify the contents of the arguments after PutRawMVCC
// returns.
PutRawMVCC(key MVCCKey, value []byte) error
// PutUnversioned sets the given key to the value provided. It is for use
// with inline metadata (not intents) and other unversioned keys (like
// Range-ID local keys).
//
// It is safe to modify the contents of the arguments after PutUnversioned
// returns.
PutUnversioned(key roachpb.Key, value []byte) error
// PutIntent puts an intent at the given key to the value provided. This is
// a higher-level method that may make changes in parts of the key space
// that are not only a function of the input key, and may explicitly clear
// the preceding intent. txnDidNotUpdateMeta defines what happened prior to
// this put, and allows for performance optimization when set to true, and
// has semantics defined in MVCCMetadata.TxnDidNotUpdateMeta (it can be
// conservatively set to false).
//
// NOTE(review): the signature below has no txnDidNotUpdateMeta parameter, so
// the txnDidNotUpdateMeta description above looks stale -- confirm against
// the implementations and either drop the text or restore the parameter.
//
// It is safe to modify the contents of the arguments after PutIntent
// returns.
PutIntent(ctx context.Context, key roachpb.Key, value []byte, txnUUID uuid.UUID) error
// PutEngineKey sets the given key to the value provided. This is a
// general-purpose and low-level method that should be used sparingly,
// only when the other Put* methods are not applicable.
//
// It is safe to modify the contents of the arguments after PutEngineKey
// returns.
PutEngineKey(key EngineKey, value []byte) error
// LogData adds the specified data to the RocksDB WAL. The data is
// uninterpreted by RocksDB (i.e. not added to the memtable or sstables).
//
// NOTE(review): the RocksDB wording may be stale; other comments in this
// interface refer to Pebble -- confirm.
//
// It is safe to modify the contents of the arguments after LogData returns.
LogData(data []byte) error
// LogLogicalOp logs the specified logical mvcc operation with the provided
// details to the writer, if it has logical op logging enabled. For most
// Writer implementations, this is a no-op.
LogLogicalOp(op MVCCLogicalOpType, details MVCCLogicalOpDetails)
// SingleClearEngineKey removes the most recent write to the item from the db
// with the given key. Whether older writes of the item will come back
// to life if not also removed with SingleClear is undefined. See the
// following:
// https://github.com/facebook/rocksdb/wiki/Single-Delete
// for details on the SingleDelete operation that this method invokes. Note
// that clear actually removes entries from the storage engine, rather than
// inserting MVCC tombstones. This is a low-level interface that must not be
// called from outside the storage package. It is part of the interface
// because there are structs that wrap Writer and implement the Writer
// interface, that are not part of the storage package.
//
// It is safe to modify the contents of the arguments after it returns.
SingleClearEngineKey(key EngineKey) error
// ShouldWriteLocalTimestamps is only for internal use in the storage package.
// This method is temporary, to handle the transition from clusters where not
// all nodes understand local timestamps.
ShouldWriteLocalTimestamps(ctx context.Context) bool
}
// ReadWriter is the read/write interface to an engine's data. It is the
// union of the Reader and Writer interfaces and adds no methods of its own.
type ReadWriter interface {
Reader
Writer
}
// DurabilityRequirement is an advanced option. If in doubt, use
// StandardDurability.
//
// GuaranteedDurability maps to pebble.IterOptions.OnlyReadGuaranteedDurable.
// This acknowledges the fact that we do not (without sacrificing correctness)
// sync the WAL for many writes, and there are some advanced cases
// (raftLogTruncator) that need visibility into what is guaranteed durable.
type DurabilityRequirement int8
// Values of DurabilityRequirement.
const (
// StandardDurability is what should normally be used. It is the zero value
// of DurabilityRequirement.
StandardDurability DurabilityRequirement = iota
// GuaranteedDurability is an advanced option (only for raftLogTruncator).
GuaranteedDurability
)
// Engine is the interface that wraps the core operations of a key/value store.
// It embeds ReadWriter for direct reads and writes, and fs.FS for filesystem
// access.
type Engine interface {
ReadWriter
// Attrs returns the engine/store attributes.
Attrs() roachpb.Attributes
// Capacity returns capacity details for the engine's available storage.
Capacity() (roachpb.StoreCapacity, error)
// Properties returns the low-level properties for the engine's underlying storage.
Properties() roachpb.StoreProperties
// Compact forces compaction over the entire database.
Compact() error
// Flush causes the engine to write all in-memory data to disk
// immediately.
Flush() error
// GetMetrics retrieves metrics from the engine.
GetMetrics() Metrics
// GetEncryptionRegistries returns the file and key registries when encryption is enabled
// on the store.
GetEncryptionRegistries() (*EncryptionRegistries, error)
// GetEnvStats retrieves stats about the engine's environment.
// For RocksDB, this includes details of at-rest encryption.
//
// NOTE(review): the RocksDB wording may be stale; other methods on this
// interface reference Pebble types -- confirm.
GetEnvStats() (*EnvStats, error)
// GetAuxiliaryDir returns a path under which files can be stored
// persistently, and from which data can be ingested by the engine.
//
// Not thread safe.
GetAuxiliaryDir() string
// NewBatch returns a new instance of a batched engine which wraps
// this engine. Batched engines accumulate all mutations and apply
// them atomically on a call to Commit().
NewBatch() Batch
// NewReadOnly returns a new instance of a ReadWriter that wraps this
// engine, and with the given durability requirement. This wrapper panics
// when unexpected operations (e.g., write operations) are executed on it
// and caches iterators to avoid the overhead of creating multiple iterators
// for batched reads.
//
// All iterators created from a read-only engine are guaranteed to provide a
// consistent snapshot of the underlying engine. See the comment on the
// Reader interface and the Reader.ConsistentIterators method.
NewReadOnly(durability DurabilityRequirement) ReadWriter
// NewUnindexedBatch returns a new instance of a batched engine which wraps
// this engine. It is unindexed, in that writes to the batch are not
// visible to reads until after it commits. The batch accumulates all
// mutations and applies them atomically on a call to Commit(). Read
// operations return an error, unless writeOnly is set to false.
//
// When writeOnly is false, reads will be satisfied by reading from the
// underlying engine, i.e., the caller does not see its own writes. This
// setting should be used only when the caller is certain that this
// optimization is correct, and beneficial. There are subtleties here -- see
// the discussion on https://github.com/cockroachdb/cockroach/pull/57661 for
// more details.
//
// TODO(sumeer): We should separate the writeOnly=true case into a
// separate method, that returns a WriteBatch interface. Even better would
// be not having an option to pass writeOnly=false, and have the caller
// explicitly work with a separate WriteBatch and Reader.
NewUnindexedBatch(writeOnly bool) Batch
// NewSnapshot returns a new instance of a read-only snapshot
// engine. Snapshots are instantaneous and, as long as they're
// released relatively quickly, inexpensive. Snapshots are released
// by invoking Close(). Note that snapshots must not be used after the
// original engine has been stopped.
NewSnapshot() Reader
// Type returns the engine type.
Type() enginepb.EngineType
// IngestExternalFiles atomically links a slice of files into the RocksDB
// log-structured merge-tree.
//
// NOTE(review): the RocksDB wording may be stale; the WithStats variant
// below returns a Pebble type -- confirm.
IngestExternalFiles(ctx context.Context, paths []string) error
// IngestExternalFilesWithStats is a variant of IngestExternalFiles that
// additionally returns ingestion stats.
IngestExternalFilesWithStats(
ctx context.Context, paths []string) (pebble.IngestOperationStats, error)
// PreIngestDelay offers an engine the chance to backpressure ingestions.
// When called, it may choose to block if the engine determines that it is in
// or approaching a state where further ingestions may risk its health.
PreIngestDelay(ctx context.Context)
// ApproximateDiskBytes returns an approximation of the on-disk size for the given key span.
ApproximateDiskBytes(from, to roachpb.Key) (uint64, error)
// CompactRange ensures that the specified range of key value pairs is
// optimized for space efficiency.
CompactRange(start, end roachpb.Key) error
// RegisterFlushCompletedCallback registers a callback that will be run for
// every successful flush. Only one callback can be registered at a time, so
// registering again replaces the previous callback. The callback must
// return quickly and must not call any methods on the Engine in the context
// of the callback since it could cause a deadlock (since the callback may
// be invoked while holding mutexes).
RegisterFlushCompletedCallback(cb func())
// Filesystem functionality, provided by embedding fs.FS.
fs.FS
// CreateCheckpoint creates a checkpoint of the engine in the given directory,
// which must not exist. The directory should be on the same file system so
// that hard links can be used.
CreateCheckpoint(dir string) error
// SetMinVersion is used to signal to the engine the current minimum
// version that it must maintain compatibility with.
SetMinVersion(version roachpb.Version) error
// MinVersionIsAtLeastTargetVersion returns whether the engine's recorded
// storage min version is at least the target version.
MinVersionIsAtLeastTargetVersion(target roachpb.Version) (bool, error)
// SetCompactionConcurrency is used to set the engine's compaction
// concurrency. It returns the previous compaction concurrency.
SetCompactionConcurrency(n uint64) uint64
}
// Batch is the interface for batch specific operations. It embeds ReadWriter
// and adds methods for committing the batch and inspecting its contents.
type Batch interface {
// ReadWriter provides the read and write operations on the batch.
//
// Iterators created on a batch can see some mutations performed after the
// iterator creation. To guarantee that they see all the mutations, the
// iterator has to be repositioned using a seek operation, after the
// mutations were done.
ReadWriter
// Commit atomically applies any batched updates to the underlying
// engine. This is a noop unless the batch was created via NewBatch(). If
// sync is true, the batch is synchronously committed to disk.
Commit(sync bool) error
// CommitNoSyncWait atomically applies any batched updates to the underlying
// engine and initiates a synchronous disk write, but does not wait for that
// write to complete. The caller must call SyncWait to wait for the fsync to
// complete. The caller must not Close the Batch without first calling
// SyncWait.
CommitNoSyncWait() error
// SyncWait waits for the disk write initiated by a call to CommitNoSyncWait
// to complete.
SyncWait() error
// Empty returns whether the batch has been written to or not.
Empty() bool
// Count returns the number of memtable-modifying operations in the batch.
Count() uint32
// Len returns the size of the underlying representation of the batch.
// Because of the batch header, the size of the batch is never 0 and should
// not be used interchangeably with Empty. The method avoids the memory copy
// that Repr imposes, but it still may require flushing the batch's mutations.
Len() int
// Repr returns the underlying representation of the batch and can be used to
// reconstitute the batch on a remote node using Writer.ApplyBatchRepr().
Repr() []byte
}
// Metrics is a set of Engine metrics. Most are contained in the embedded
// *pebble.Metrics struct, which has its own documentation.
type Metrics struct {
*pebble.Metrics
// WriteStallCount counts the number of times Pebble intentionally delayed
// incoming writes. Currently, the only two reasons for this to happen are:
// - "memtable count limit reached"
// - "L0 file count limit exceeded"
//
// We do not split this metric across these two reasons, but they can be
// distinguished in the pebble logs.
WriteStallCount int64
WriteStallDuration time.Duration
// DiskSlowCount counts the number of times Pebble records disk slowness.