-
Notifications
You must be signed in to change notification settings - Fork 3.8k
/
fetcher.go
1283 lines (1153 loc) · 41 KB
/
fetcher.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
// Copyright 2017 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.
package row
import (
"bytes"
"context"
"fmt"
"strings"
"time"
"github.com/cockroachdb/cockroach/pkg/keys"
"github.com/cockroachdb/cockroach/pkg/kv"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/sql/catalog"
"github.com/cockroachdb/cockroach/pkg/sql/catalog/catpb"
"github.com/cockroachdb/cockroach/pkg/sql/catalog/colinfo"
"github.com/cockroachdb/cockroach/pkg/sql/catalog/descpb"
"github.com/cockroachdb/cockroach/pkg/sql/rowenc"
"github.com/cockroachdb/cockroach/pkg/sql/rowenc/keyside"
"github.com/cockroachdb/cockroach/pkg/sql/rowenc/valueside"
"github.com/cockroachdb/cockroach/pkg/sql/rowinfra"
"github.com/cockroachdb/cockroach/pkg/sql/scrub"
"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
"github.com/cockroachdb/cockroach/pkg/sql/types"
"github.com/cockroachdb/cockroach/pkg/util"
"github.com/cockroachdb/cockroach/pkg/util/encoding"
"github.com/cockroachdb/cockroach/pkg/util/hlc"
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/cockroachdb/cockroach/pkg/util/mon"
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
"github.com/cockroachdb/errors"
)
// DebugRowFetch can be used to turn on some low-level debugging logs. We use
// this to avoid using log.V in the hot path. It is a constant rather than a
// runtime setting, so enabling it requires rebuilding.
const DebugRowFetch = false
// noOutputColumn is a sentinel value to denote that a system column is not
// part of the output. Init uses it as the initial value of the system column
// output ordinals (timestampOutputIdx and oidOutputIdx).
const noOutputColumn = -1
// KVBatchFetcher abstracts the logic of fetching KVs in batches.
type KVBatchFetcher interface {
// nextBatch returns the next batch of rows. The first return value (ok) is
// false if there are no more keys in the scan. The batch may be returned
// either as a slice of KeyValues (kvs) or as an encoded batchResponse,
// depending on the server version - both must be handled by calling code.
nextBatch(ctx context.Context) (ok bool, kvs []roachpb.KeyValue, batchResponse []byte, err error)
// close is called when the fetcher is no longer needed.
// NOTE(review): presumably implementations release their resources here;
// confirm against the implementers.
close(ctx context.Context)
}
// tableInfo holds the per-table state of a Fetcher: the descriptor and index
// being scanned, the columns for which values are produced, and the scratch
// space that is reused while decoding each row.
type tableInfo struct {
// -- Fields initialized once --
// Used to determine whether a key retrieved belongs to the span we
// want to scan.
desc catalog.TableDescriptor
index catalog.Index
// isSecondaryIndex is copied from FetcherTableArgs.IsSecondaryIndex by Init.
isSecondaryIndex bool
// indexColumnDirs holds the directions of the index's full columns; it is
// passed to the key decoder in DecodeIndexKey.
indexColumnDirs []descpb.IndexDescriptor_Direction
// cols are the columns for which we produce values. The fetcher produces rows
// with one value for each of these columns.
cols []catalog.Column
// The set of indexes into the cols array that are required for columns
// in the value part.
neededValueColsByIdx util.FastIntSet
// The number of needed columns from the value part of the row. Once we've
// seen this number of value columns for a particular row, we can stop
// decoding values in that row.
neededValueCols int
// Map used to get the index for columns in cols. It is keyed by column ID.
colIdxMap catalog.TableColMap
// One value per column that is part of the key; each value is a column
// index (into cols); -1 if we don't need the value for that column.
indexColIdx []int
// knownPrefixLength is the number of bytes in the index key prefix this
// Fetcher is configured for. The index key prefix is the table id, index
// id pair at the start of the key.
knownPrefixLength int
// -- Fields updated during a scan --
// keyValTypes and keyVals are the types and the decoded values of the
// index's full columns; extraTypes and extraVals are the same for the key
// suffix columns and are only populated when hasExtraCols reports true.
keyValTypes []*types.T
extraTypes []*types.T
keyVals []rowenc.EncDatum
extraVals []rowenc.EncDatum
// row and decodedRow each have one slot per entry of cols.
row rowenc.EncDatumRow
decodedRow tree.Datums
// The following fields contain MVCC metadata for each row and may be
// returned to users of Fetcher immediately after NextRow returns.
//
// rowLastModified is the timestamp of the last time any family in the row
// was modified in any way.
rowLastModified hlc.Timestamp
// timestampOutputIdx controls at what row ordinal to write the timestamp.
// It is noOutputColumn when the MVCC timestamp column was not requested.
timestampOutputIdx int
// Fields for outputting the tableoid system column. oidOutputIdx is
// noOutputColumn when the tableoid column was not requested.
tableOid tree.Datum
oidOutputIdx int
// rowIsDeleted is true when the row has been deleted. This is only
// meaningful when kv deletion tombstones are returned by the KVBatchFetcher,
// which the one used by `StartScan` (the common case) doesn't. Notably,
// changefeeds use this by providing raw kvs with tombstones unfiltered via
// `StartScanFrom`.
rowIsDeleted bool
}
// FetcherTableArgs are the arguments passed to Fetcher.Init
// for a given table that includes descriptors and row information.
type FetcherTableArgs struct {
// Desc is the descriptor of the table to fetch from.
Desc catalog.TableDescriptor
// Index is the index of Desc that is scanned.
Index catalog.Index
// IsSecondaryIndex, when set, makes Init verify that every requested column
// is a key, stored, or key suffix column of Index.
IsSecondaryIndex bool
// Columns that are being fetched. The resulting datums for each row map
// 1-to-1 to these columns.
Columns []catalog.Column
}
// Fetcher handles fetching kvs and forming table rows for a single table.
// Usage:
// var rf Fetcher
// err := rf.Init(..)
// // Handle err
// err := rf.StartScan(..)
// // Handle err
// for {
// res, err := rf.NextRow()
// // Handle err
// if res.row == nil {
// // Done
// break
// }
// // Process res.row
// }
type Fetcher struct {
// codec is used to encode and decode sql keys.
codec keys.SQLCodec
// table holds the descriptor, the requested columns, and the per-row
// decoding state. It is the only field preserved by Reset.
table tableInfo
// reverse denotes whether or not the spans should be read in reverse
// or not when StartScan is invoked.
reverse bool
// numKeysPerRow is the number of keys per row of the table used to
// calculate the KVBatchFetcher's firstBatchLimit.
numKeysPerRow int
// True if the index key must be decoded. This is only false if there are no
// needed columns.
mustDecodeIndexKey bool
// lockStrength represents the row-level locking mode to use when fetching
// rows.
lockStrength descpb.ScanLockingStrength
// lockWaitPolicy represents the policy to be used for handling conflicting
// locks held by other active transactions.
lockWaitPolicy descpb.ScanLockingWaitPolicy
// lockTimeout specifies the maximum amount of time that the fetcher will
// wait while attempting to acquire a lock on a key or while blocking on an
// existing lock in order to perform a non-locking read on a key.
lockTimeout time.Duration
// traceKV indicates whether or not session tracing is enabled. It is set
// when beginning a new scan.
traceKV bool
// mvccDecodeStrategy controls whether or not MVCC timestamps should
// be decoded from KV's fetched.
mvccDecodeStrategy MVCCDecodingStrategy
// -- Fields updated during a scan --
// kvFetcher is replaced by every call to StartScanFrom; the previous one,
// if any, is closed first.
kvFetcher *KVFetcher
indexKey []byte // the index key of the current row
// prettyValueBuf is allocated lazily, only when traceKV is set.
prettyValueBuf *bytes.Buffer
valueColsFound int // how many needed cols we've found so far in the value
// The current key/value, unless kvEnd is true.
kv roachpb.KeyValue
// keyRemainingBytes is the part of the current key that follows the index
// key columns; it is set by NextKey.
keyRemainingBytes []byte
// kvEnd is set by NextKey once there are no more keys in the scan.
kvEnd bool
// IgnoreUnexpectedNulls allows Fetcher to return null values for non-nullable
// columns and is only used for decoding for error messages or debugging.
IgnoreUnexpectedNulls bool
// Buffered allocation of decoded datums.
alloc *tree.DatumAlloc
// Memory monitor and memory account for the bytes fetched by this fetcher.
// Both are nil unless Init was given a non-nil memory monitor.
mon *mon.BytesMonitor
kvFetcherMemAcc *mon.BoundAccount
}
// Reset returns this Fetcher to its zero state, except for the table field,
// which is carried over. Keeping the tableInfo around preserves the memory
// capacity of the slices stored in it, so that a following Init can reuse
// them instead of reallocating.
func (rf *Fetcher) Reset() {
	preserved := rf.table
	*rf = Fetcher{}
	rf.table = preserved
}
// Close releases resources held by this fetcher: the current KV fetcher, if
// there is one, and, when a memory monitor was set up, the memory account
// followed by the monitor itself.
func (rf *Fetcher) Close(ctx context.Context) {
	if f := rf.kvFetcher; f != nil {
		f.Close(ctx)
	}
	if rf.mon == nil {
		// No memory accounting was configured; nothing else to release.
		return
	}
	rf.kvFetcherMemAcc.Close(ctx)
	rf.mon.Stop(ctx)
}
// Init sets up a Fetcher for a given table and index.
//
// tableArgs.Columns are the columns for which values are produced; every row
// has one value per column, in that order. If tableArgs.IsSecondaryIndex is
// set, every requested column must be a key, stored, or key suffix column of
// the index; otherwise an error is returned.
//
// If memMonitor is non-nil, a child monitor and a bound account are created
// for the bytes fetched by this fetcher; both are released by Close.
//
// Init replaces rf.table but reuses the capacity of the indexColIdx, keyVals
// and extraVals slices left there by a previous Init (see Reset).
func (rf *Fetcher) Init(
	ctx context.Context,
	codec keys.SQLCodec,
	reverse bool,
	lockStrength descpb.ScanLockingStrength,
	lockWaitPolicy descpb.ScanLockingWaitPolicy,
	lockTimeout time.Duration,
	alloc *tree.DatumAlloc,
	memMonitor *mon.BytesMonitor,
	tableArgs FetcherTableArgs,
) error {
	rf.codec = codec
	rf.reverse = reverse
	rf.lockStrength = lockStrength
	rf.lockWaitPolicy = lockWaitPolicy
	rf.lockTimeout = lockTimeout
	rf.alloc = alloc

	if memMonitor != nil {
		rf.mon = mon.NewMonitorInheritWithLimit("fetcher-mem", 0 /* limit */, memMonitor)
		rf.mon.Start(ctx, memMonitor, mon.BoundAccount{})
		memAcc := rf.mon.MakeBoundAccount()
		rf.kvFetcherMemAcc = &memAcc
	}

	// Map each requested column ID to its ordinal in the output row.
	var colIdxMap catalog.TableColMap
	for i, c := range tableArgs.Columns {
		colIdxMap.Set(c.GetID(), i)
	}

	table := &rf.table
	*table = tableInfo{
		desc:             tableArgs.Desc,
		colIdxMap:        colIdxMap,
		index:            tableArgs.Index,
		isSecondaryIndex: tableArgs.IsSecondaryIndex,
		cols:             tableArgs.Columns,
		row:              make(rowenc.EncDatumRow, len(tableArgs.Columns)),
		decodedRow:       make(tree.Datums, len(tableArgs.Columns)),

		// These slice fields might get re-allocated below, so reslice them from
		// the old table here in case they've got enough capacity already.
		indexColIdx:        rf.table.indexColIdx[:0],
		keyVals:            rf.table.keyVals[:0],
		extraVals:          rf.table.extraVals[:0],
		timestampOutputIdx: noOutputColumn,
		oidOutputIdx:       noOutputColumn,
	}

	for idx, col := range table.cols {
		if col.IsSystemColumn() {
			// Set up any system column metadata.
			switch colinfo.GetSystemColumnKindFromColumnID(col.GetID()) {
			case catpb.SystemColumnKind_MVCCTIMESTAMP:
				table.timestampOutputIdx = idx
				rf.mvccDecodeStrategy = MVCCDecodingRequired
			case catpb.SystemColumnKind_TABLEOID:
				table.oidOutputIdx = idx
				table.tableOid = tree.NewDOid(tree.DInt(tableArgs.Desc.GetID()))
			}
		}
	}

	table.knownPrefixLength = len(
		rowenc.MakeIndexKeyPrefix(codec, table.desc.GetID(), table.index.GetID()),
	)

	table.indexColumnDirs = table.desc.IndexFullColumnDirections(table.index)
	fullColumns := table.desc.IndexFullColumns(table.index)

	// Start by assuming every requested column comes from the value; columns
	// found in the index key are removed from the set below.
	if len(table.cols) > 0 {
		table.neededValueColsByIdx.AddRange(0, len(table.cols)-1)
	}

	neededIndexCols := 0
	nIndexCols := len(fullColumns)
	if cap(table.indexColIdx) >= nIndexCols {
		table.indexColIdx = table.indexColIdx[:nIndexCols]
	} else {
		table.indexColIdx = make([]int, nIndexCols)
	}
	for i, col := range fullColumns {
		if col == nil {
			table.indexColIdx[i] = -1
			continue
		}
		id := col.GetID()
		colIdx, ok := table.colIdxMap.Get(id)
		if ok {
			table.indexColIdx[i] = colIdx
			neededIndexCols++
			table.neededValueColsByIdx.Remove(colIdx)
		} else {
			table.indexColIdx[i] = -1
		}
	}

	// If there are needed columns from the index key, we need to read it;
	// otherwise, we can completely avoid decoding the index key.
	rf.mustDecodeIndexKey = neededIndexCols > 0

	// The number of columns we need to read from the value part of the key.
	// It's the total number of needed columns minus the ones we read from the
	// index key, except for composite columns.
	table.neededValueCols = len(table.cols) - neededIndexCols + table.index.NumCompositeColumns()

	if table.isSecondaryIndex {
		colIDs := table.index.CollectKeyColumnIDs()
		colIDs.UnionWith(table.index.CollectSecondaryStoredColumnIDs())
		colIDs.UnionWith(table.index.CollectKeySuffixColumnIDs())
		for i := range table.cols {
			if !colIDs.Contains(table.cols[i].GetID()) {
				return errors.Errorf("requested column %s not in index", table.cols[i].GetName())
			}
		}
	}

	// Prepare our index key vals slice.
	var err error
	table.keyValTypes, err = getColumnTypes(fullColumns, table.keyValTypes)
	if err != nil {
		return err
	}
	if cap(table.keyVals) >= nIndexCols {
		table.keyVals = table.keyVals[:nIndexCols]
	} else {
		table.keyVals = make([]rowenc.EncDatum, nIndexCols)
	}

	if hasExtraCols(table) {
		// Unique secondary indexes have a value that is the
		// primary index key.
		// Primary indexes only contain ascendingly-encoded
		// values. If this ever changes, we'll probably have to
		// figure out the directions here too.
		keySuffixCols := table.desc.IndexKeySuffixColumns(table.index)
		table.extraTypes, err = getColumnTypes(keySuffixCols, table.extraTypes)
		// Bail out before sizing extraVals: there is no point in preparing
		// scratch space for columns whose types could not be resolved.
		if err != nil {
			return err
		}
		nExtraColumns := len(keySuffixCols)
		if cap(table.extraVals) >= nExtraColumns {
			table.extraVals = table.extraVals[:nExtraColumns]
		} else {
			table.extraVals = make([]rowenc.EncDatum, nExtraColumns)
		}
	}

	rf.numKeysPerRow, err = table.desc.KeysPerRow(table.index.GetID())
	return err
}
// getColumnTypes returns the types of the given columns, one per column and
// in the same order. The result is stored in outTypes when its capacity is
// sufficient; otherwise a new slice is allocated. An error (and a nil slice)
// is returned if any column is nil or is not public.
func getColumnTypes(columns []catalog.Column, outTypes []*types.T) ([]*types.T, error) {
	n := len(columns)
	if cap(outTypes) >= n {
		outTypes = outTypes[:n]
	} else {
		outTypes = make([]*types.T, n)
	}
	for i := range columns {
		switch c := columns[i]; {
		case c == nil:
			return nil, fmt.Errorf("column does not exist")
		case !c.Public():
			return nil, fmt.Errorf("column %q (%d) is not public", c.GetName(), c.GetID())
		default:
			outTypes[i] = c.GetType()
		}
	}
	return outTypes, nil
}
// GetTable returns the descriptor of the table that this Fetcher was
// initialized with.
func (rf *Fetcher) GetTable() catalog.Descriptor {
	tbl := &rf.table
	return tbl.desc
}
// StartScan initializes and starts the key-value scan over the given spans
// using txn. It can be called multiple times. At least one span is required.
//
// batchBytesLimit controls whether bytes limits are placed on the batches. If
// set, bytes limits are used to protect against running out of memory (on
// both this client node and on the server).
//
// If batchBytesLimit is set, rowLimitHint can also be set to control the
// number of rows scanned by the first batch. Subsequent batches (if any) get
// progressively higher limits (up to a fixed max). Row limits exist to make
// LIMIT queries efficient: when the caller has an idea of how many rows are
// needed to satisfy the query, the Fetcher uses it. Even if the hint proves
// insufficient, the Fetcher keeps setting row limits (in addition to bytes
// limits), on the argument that some number of rows will eventually satisfy
// the query and `spans` likely need not be scanned fully. The bytes limit, on
// the other hand, is simply intended to protect against OOMs.
func (rf *Fetcher) StartScan(
	ctx context.Context,
	txn *kv.Txn,
	spans roachpb.Spans,
	batchBytesLimit rowinfra.BytesLimit,
	rowLimitHint rowinfra.RowLimit,
	traceKV bool,
	forceProductionKVBatchSize bool,
) error {
	if len(spans) == 0 {
		return errors.AssertionFailedf("no spans")
	}

	sendFn := makeKVBatchFetcherDefaultSendFunc(txn)
	firstBatchKeyLimit := rf.rowLimitToKeyLimit(rowLimitHint)
	f, err := makeKVBatchFetcher(
		ctx,
		sendFn,
		spans,
		rf.reverse,
		batchBytesLimit,
		firstBatchKeyLimit,
		rf.lockStrength,
		rf.lockWaitPolicy,
		rf.lockTimeout,
		rf.kvFetcherMemAcc,
		forceProductionKVBatchSize,
		txn.AdmissionHeader(),
		txn.DB().SQLKVResponseAdmissionQ,
	)
	if err != nil {
		return err
	}
	return rf.StartScanFrom(ctx, &f, traceKV)
}
// TestingInconsistentScanSleep introduces a sleep inside the fetcher after
// every KV batch (for inconsistent scans, currently used only for table
// statistics collection). A zero value (the default) disables the sleep.
// TODO(radu): consolidate with forceProductionKVBatchSize into a
// FetcherTestingKnobs struct.
var TestingInconsistentScanSleep time.Duration
// StartInconsistentScan initializes and starts an inconsistent scan, where each
// KV batch can be read at a different historical timestamp.
//
// The scan uses the initial timestamp, until it becomes older than
// maxTimestampAge; at this time the timestamp is bumped by the amount of time
// that has passed. See the documentation for TableReaderSpec for more
// details.
//
// An error is returned if no spans are given or if initialTimestamp is already
// at least maxTimestampAge old when the scan starts.
//
// Can be used multiple times.
func (rf *Fetcher) StartInconsistentScan(
ctx context.Context,
db *kv.DB,
initialTimestamp hlc.Timestamp,
maxTimestampAge time.Duration,
spans roachpb.Spans,
batchBytesLimit rowinfra.BytesLimit,
rowLimitHint rowinfra.RowLimit,
traceKV bool,
forceProductionKVBatchSize bool,
) error {
if len(spans) == 0 {
return errors.AssertionFailedf("no spans")
}
// txnTimestamp and txnStartTime are captured and updated by sendFn below:
// txnTimestamp is the read timestamp of the current txn and txnStartTime is
// the wall time at which that txn was created.
txnTimestamp := initialTimestamp
txnStartTime := timeutil.Now()
if txnStartTime.Sub(txnTimestamp.GoTime()) >= maxTimestampAge {
return errors.Errorf(
"AS OF SYSTEM TIME: cannot specify timestamp older than %s for this operation",
maxTimestampAge,
)
}
txn := kv.NewTxnWithSteppingEnabled(ctx, db, 0 /* gatewayNodeID */)
if err := txn.SetFixedTimestamp(ctx, txnTimestamp); err != nil {
return err
}
if log.V(1) {
log.Infof(ctx, "starting inconsistent scan at timestamp %v", txnTimestamp)
}
// sendFn sends each batch request through the current txn, first replacing
// the txn with one at a newer fixed timestamp if the current timestamp has
// become too old.
sendFn := func(ctx context.Context, ba roachpb.BatchRequest) (*roachpb.BatchResponse, error) {
if now := timeutil.Now(); now.Sub(txnTimestamp.GoTime()) >= maxTimestampAge {
// Time to bump the transaction. First commit the old one (should be a no-op).
if err := txn.Commit(ctx); err != nil {
return nil, err
}
// Advance the timestamp by the time that passed.
txnTimestamp = txnTimestamp.Add(now.Sub(txnStartTime).Nanoseconds(), 0 /* logical */)
txnStartTime = now
txn = kv.NewTxnWithSteppingEnabled(ctx, db, 0 /* gatewayNodeID */)
if err := txn.SetFixedTimestamp(ctx, txnTimestamp); err != nil {
return nil, err
}
if log.V(1) {
log.Infof(ctx, "bumped inconsistent scan timestamp to %v", txnTimestamp)
}
}
res, err := txn.Send(ctx, ba)
if err != nil {
return nil, err.GoError()
}
// Testing hook: optionally sleep after every successful batch.
if TestingInconsistentScanSleep != 0 {
time.Sleep(TestingInconsistentScanSleep)
}
return res, nil
}
// TODO(radu): we should commit the last txn. Right now the commit is a no-op
// on read transactions, but perhaps one day it will release some resources.
// Note that the admission header and admission queue below are taken from
// the initial txn, before sendFn has had a chance to replace it.
f, err := makeKVBatchFetcher(
ctx,
sendFunc(sendFn),
spans,
rf.reverse,
batchBytesLimit,
rf.rowLimitToKeyLimit(rowLimitHint),
rf.lockStrength,
rf.lockWaitPolicy,
rf.lockTimeout,
rf.kvFetcherMemAcc,
forceProductionKVBatchSize,
txn.AdmissionHeader(),
txn.DB().SQLKVResponseAdmissionQ,
)
if err != nil {
return err
}
return rf.StartScanFrom(ctx, &f, traceKV)
}
// rowLimitToKeyLimit converts a row limit hint into a key limit for the first
// KV batch. A hint of 0 means "no limit" and maps to a key limit of 0.
func (rf *Fetcher) rowLimitToKeyLimit(rowLimitHint rowinfra.RowLimit) rowinfra.KeyLimit {
	if rowLimitHint != 0 {
		// With a limit hint, only the first batch is restricted; subsequent
		// batches get larger to avoid making things too slow (e.g. when a very
		// restrictive filter forces us to retrieve a lot of rows anyway).
		//
		// The hint counts rows, but a row can span several keys, so scale it
		// by the number of keys per row.
		keysForRows := int64(rowLimitHint) * int64(rf.numKeysPerRow)
		// One extra key is requested to make sure the last row is formed.
		return rowinfra.KeyLimit(keysForRows + 1)
	}
	return 0
}
// StartScanFrom initializes and starts a scan that reads from the given
// KVBatchFetcher. It can be called multiple times; a KV fetcher left over from
// a previous scan is closed before the new one is installed.
func (rf *Fetcher) StartScanFrom(ctx context.Context, f KVBatchFetcher, traceKV bool) error {
	rf.traceKV, rf.indexKey = traceKV, nil
	if prev := rf.kvFetcher; prev != nil {
		prev.Close(ctx)
	}
	rf.kvFetcher = newKVFetcher(f)
	// Retrieve the first key; whether it completes a row is irrelevant here.
	if _, err := rf.NextKey(ctx); err != nil {
		return err
	}
	return nil
}
// setNextKV sets the next KV to process (rf.kv) to the input KV. needsCopy,
// if true, causes the key, the raw value bytes and the value timestamp of the
// input kv to be copied into a fresh KeyValue. needsCopy should be set to true
// if the input KV is pointing to the last KV of a batch, so that the batch can
// be garbage collected before fetching the next one.
// gcassert:inline
func (rf *Fetcher) setNextKV(kv roachpb.KeyValue, needsCopy bool) {
if !needsCopy {
// Fast path: keep referencing the memory of the batch.
rf.kv = kv
return
}
// If we've made it to the very last key in the batch, copy out the key
// so that the GC can reclaim the large backing slice before we call
// NextKV() again.
kvCopy := roachpb.KeyValue{}
kvCopy.Key = make(roachpb.Key, len(kv.Key))
copy(kvCopy.Key, kv.Key)
kvCopy.Value.RawBytes = make([]byte, len(kv.Value.RawBytes))
copy(kvCopy.Value.RawBytes, kv.Value.RawBytes)
kvCopy.Value.Timestamp = kv.Value.Timestamp
rf.kv = kvCopy
}
// NextKey retrieves the next key/value and sets kv/kvEnd. Returns whether a row
// has been completed.
//
// As a side effect it sets rf.keyRemainingBytes to the part of the new key
// that follows the index key and, when a row has been completed by a key that
// belongs to a new row, resets rf.indexKey to nil. Errors from the KV fetcher
// are passed through ConvertFetchError.
func (rf *Fetcher) NextKey(ctx context.Context) (rowDone bool, _ error) {
moreKVs, kv, finalReferenceToBatch, err := rf.kvFetcher.NextKV(ctx, rf.mvccDecodeStrategy)
if err != nil {
return false, ConvertFetchError(ctx, rf, err)
}
rf.setNextKV(kv, finalReferenceToBatch)
rf.kvEnd = !moreKVs
if rf.kvEnd {
// No more keys in the scan.
//
// NB: this assumes that the KV layer will never split a range
// between column families, which is a brittle assumption.
// See:
// https://github.com/cockroachdb/cockroach/pull/42056
return true, nil
}
// unchangedPrefix will be set to true if we can skip decoding the index key
// completely, because the last key we saw has identical prefix to the
// current key.
//
// See Init() for a detailed description of when we can get away with not
// reading the index key.
unchangedPrefix := rf.indexKey != nil && bytes.HasPrefix(rf.kv.Key, rf.indexKey)
if unchangedPrefix {
// Skip decoding!
rf.keyRemainingBytes = rf.kv.Key[len(rf.indexKey):]
} else if rf.mustDecodeIndexKey {
var foundNull bool
rf.keyRemainingBytes, foundNull, err = rf.DecodeIndexKey(rf.kv.Key)
if err != nil {
return false, err
}
// For unique secondary indexes, the index-key does not distinguish one row
// from the next if both rows contain identical values along with a NULL.
// Consider the keys:
//
// /test/unique_idx/NULL/0
// /test/unique_idx/NULL/1
//
// The index-key extracted from the above keys is /test/unique_idx/NULL. The
// trailing /0 and /1 are the primary key used to unique-ify the keys when a
// NULL is present. When a null is present in the index key, we cut off more
// of the index key so that the prefix includes the primary key columns.
//
// Note that we do not need to do this for non-unique secondary indexes because
// the extra columns in the primary key will _always_ be there, so we can decode
// them when processing the index. The difference with unique secondary indexes
// is that the extra columns are not always there, and are used to unique-ify
// the index key, rather than provide the primary key column values.
if foundNull && rf.table.isSecondaryIndex && rf.table.index.IsUnique() && len(rf.table.desc.GetFamilies()) != 1 {
for i := 0; i < rf.table.index.NumKeySuffixColumns(); i++ {
var err error
// Slice off an extra encoded column from rf.keyRemainingBytes.
rf.keyRemainingBytes, err = keyside.Skip(rf.keyRemainingBytes)
if err != nil {
return false, err
}
}
}
} else {
// We still need to consume the key until the family
// id, so processKV can know whether we've finished a
// row or not.
prefixLen, err := keys.GetRowPrefixLength(rf.kv.Key)
if err != nil {
return false, err
}
rf.keyRemainingBytes = rf.kv.Key[prefixLen:]
}
// Decide whether the key that was just read starts a new row.
switch {
case len(rf.table.desc.GetFamilies()) == 1:
// If we only have one family, we know that there is only 1 k/v pair per row.
rowDone = true
case !unchangedPrefix:
// If the prefix of the key has changed, current key is from a different
// row than the previous one.
rowDone = true
default:
rowDone = false
}
// A row is only reported as completed if one was in progress, i.e. if
// rf.indexKey was set while processing a previous key.
if rf.indexKey != nil && rowDone {
// The current key belongs to a new row. Output the
// current row.
rf.indexKey = nil
return true, nil
}
return false, nil
}
// prettyEncDatums renders the given values as a string in which every value
// is preceded by a '/'. Each value is decoded using the type at the same
// position in types; a value that fails to decode is rendered as '?'.
func (rf *Fetcher) prettyEncDatums(types []*types.T, vals []rowenc.EncDatum) string {
	var sb strings.Builder
	for i := range vals {
		// Operate on a copy of the element rather than on vals[i] itself.
		datum := vals[i]
		sb.WriteByte('/')
		if err := datum.EnsureDecoded(types[i], rf.alloc); err == nil {
			sb.WriteString(datum.Datum.String())
			continue
		}
		sb.WriteByte('?')
	}
	return sb.String()
}
// DecodeIndexKey decodes the index key columns of key into rf.table.keyVals,
// skipping the known index key prefix. It returns the remaining bytes of the
// key and whether it encountered a null while decoding.
func (rf *Fetcher) DecodeIndexKey(key roachpb.Key) (remaining []byte, foundNull bool, err error) {
	tbl := &rf.table
	afterPrefix := key[tbl.knownPrefixLength:]
	return rowenc.DecodeKeyVals(tbl.keyValTypes, tbl.keyVals, tbl.indexColumnDirs, afterPrefix)
}
// KeyToDesc implements the KeyToDescTranslator interface. The implementation is
// used by ConvertFetchError. It returns the fetcher's table descriptor and true
// if key is at least as long as the known index key prefix and its index key
// columns decode successfully; otherwise it returns nil and false.
func (rf *Fetcher) KeyToDesc(key roachpb.Key) (catalog.TableDescriptor, bool) {
	if len(key) >= rf.table.knownPrefixLength {
		if _, _, err := rf.DecodeIndexKey(key); err == nil {
			return rf.table.desc, true
		}
	}
	return nil, false
}
// processKV processes the given key/value, setting values in the row
// accordingly. If rf.traceKV is true, returns pretty printed key and value
// information in prettyKey/prettyValue (otherwise they are empty strings).
//
// When the kv is the first one of a row (rf.indexKey is nil), the per-row
// state is reset first: rf.indexKey is recorded, the needed value columns are
// unset, the index key columns are copied into the row, and the MVCC metadata
// is reinitialized. Decoding errors are wrapped with a scrub error code.
func (rf *Fetcher) processKV(
ctx context.Context, kv roachpb.KeyValue,
) (prettyKey string, prettyValue string, err error) {
table := &rf.table
if rf.traceKV {
prettyKey = fmt.Sprintf(
"/%s/%s%s",
table.desc.GetName(),
table.index.GetName(),
rf.prettyEncDatums(table.keyValTypes, table.keyVals),
)
}
// Either this is the first key of the fetch or the first key of a new
// row.
if rf.indexKey == nil {
// This is the first key for the row.
// The index key is the part of the key that precedes keyRemainingBytes.
rf.indexKey = []byte(kv.Key[:len(kv.Key)-len(rf.keyRemainingBytes)])
// Reset the row to nil; it will get filled in with the column
// values as we decode the key-value pairs for the row.
// We only need to reset the needed columns in the value component, because
// non-needed columns are never set and key columns are unconditionally set
// below.
for idx, ok := table.neededValueColsByIdx.Next(0); ok; idx, ok = table.neededValueColsByIdx.Next(idx + 1) {
table.row[idx].UnsetDatum()
}
// Fill in the column values that are part of the index key.
for i := range table.keyVals {
if idx := table.indexColIdx[i]; idx != -1 {
table.row[idx] = table.keyVals[i]
}
}
rf.valueColsFound = 0
// Reset the MVCC metadata for the next row.
// set rowLastModified to a sentinel that's before any real timestamp.
// As kvs are iterated for this row, it keeps track of the greatest
// timestamp seen.
table.rowLastModified = hlc.Timestamp{}
// All row encodings (both before and after column families) have a
// sentinel kv (column family 0) that is always present when a row is
// present, even if that row is all NULLs. Thus, a row is deleted if and
// only if the first kv in it is a tombstone (RawBytes is empty).
table.rowIsDeleted = len(kv.Value.RawBytes) == 0
}
// Track the greatest timestamp seen among the kvs of this row.
if table.rowLastModified.Less(kv.Value.Timestamp) {
table.rowLastModified = kv.Value.Timestamp
}
if len(table.cols) == 0 {
// We don't need to decode any values.
if rf.traceKV {
prettyValue = "<undecoded>"
}
return prettyKey, prettyValue, nil
}
// For covering secondary indexes, allow for decoding as a primary key.
if table.index.GetEncodingType() == descpb.PrimaryIndexEncoding &&
len(rf.keyRemainingBytes) > 0 {
// If familyID is 0, kv.Value contains values for composite key columns.
// These columns already have a table.row value assigned above, but that value
// (obtained from the key encoding) might not be correct (e.g. for decimals,
// it might not contain the right number of trailing 0s; for collated
// strings, it is one of potentially many strings with the same collation
// key).
//
// In these cases, the correct value will be present in family 0 and the
// table.row value gets overwritten.
switch kv.Value.GetTag() {
case roachpb.ValueType_TUPLE:
// In this case, we don't need to decode the column family ID, because
// the ValueType_TUPLE encoding includes the column id with every encoded
// column value.
var tupleBytes []byte
tupleBytes, err = kv.Value.GetTuple()
if err != nil {
// Leave the switch; the error is wrapped and returned below.
break
}
prettyKey, prettyValue, err = rf.processValueBytes(ctx, table, kv, tupleBytes, prettyKey)
default:
// The value holds a single column: the column family ID in the key
// suffix identifies which one.
var familyID uint64
_, familyID, err = encoding.DecodeUvarintAscending(rf.keyRemainingBytes)
if err != nil {
return "", "", scrub.WrapError(scrub.IndexKeyDecodingError, err)
}
var family *descpb.ColumnFamilyDescriptor
family, err = table.desc.FindFamilyByID(descpb.FamilyID(familyID))
if err != nil {
return "", "", scrub.WrapError(scrub.IndexKeyDecodingError, err)
}
prettyKey, prettyValue, err = rf.processValueSingle(ctx, table, family, kv, prettyKey)
}
if err != nil {
return "", "", scrub.WrapError(scrub.IndexValueDecodingError, err)
}
} else {
tag := kv.Value.GetTag()
var valueBytes []byte
switch tag {
case roachpb.ValueType_BYTES:
// If we have the ValueType_BYTES on a secondary index, then we know we
// are looking at column family 0. Column family 0 stores the extra primary
// key columns if they are present, so we decode them here.
valueBytes, err = kv.Value.GetBytes()
if err != nil {
return "", "", scrub.WrapError(scrub.IndexValueDecodingError, err)
}
if hasExtraCols(table) {
// This is a unique secondary index; decode the extra
// column values from the value.
var err error
valueBytes, _, err = rowenc.DecodeKeyVals(
table.extraTypes,
table.extraVals,
nil,
valueBytes,
)
if err != nil {
return "", "", scrub.WrapError(scrub.SecondaryIndexKeyExtraValueDecodingError, err)
}
// Copy the decoded key suffix columns that were requested into the row.
for i := 0; i < table.index.NumKeySuffixColumns(); i++ {
id := table.index.GetKeySuffixColumnID(i)
if idx, ok := table.colIdxMap.Get(id); ok {
table.row[idx] = table.extraVals[i]
}
}
if rf.traceKV {
prettyValue = rf.prettyEncDatums(table.extraTypes, table.extraVals)
}
}
case roachpb.ValueType_TUPLE:
valueBytes, err = kv.Value.GetTuple()
if err != nil {
return "", "", scrub.WrapError(scrub.IndexValueDecodingError, err)
}
}
if DebugRowFetch {
if hasExtraCols(table) && tag == roachpb.ValueType_BYTES {
log.Infof(ctx, "Scan %s -> %s", kv.Key, rf.prettyEncDatums(table.extraTypes, table.extraVals))
} else {
log.Infof(ctx, "Scan %s", kv.Key)
}
}
// Whatever is left of the value after the extra columns (if any) holds
// the remaining encoded column values.
if len(valueBytes) > 0 {
prettyKey, prettyValue, err = rf.processValueBytes(
ctx, table, kv, valueBytes, prettyKey,
)
if err != nil {
return "", "", scrub.WrapError(scrub.IndexValueDecodingError, err)
}
}
}
if rf.traceKV && prettyValue == "" {
prettyValue = "<undecoded>"
}
return prettyKey, prettyValue, nil
}
// processValueSingle processes the given value (of column
// family.DefaultColumnID), setting values in table.row accordingly. The key is
// only used for logging.
//
// prettyKeyPrefix is the pretty-printed key built so far. When rf.traceKV is
// set and the column is one of the requested columns, the returned prettyKey
// is that prefix followed by the column name, and prettyValue is the decoded
// value. For the row sentinel (family 0) both returned strings are empty.
//
// An error is returned if the family has no default column ID or if the value
// cannot be unmarshaled.
func (rf *Fetcher) processValueSingle(
	ctx context.Context,
	table *tableInfo,
	family *descpb.ColumnFamilyDescriptor,
	kv roachpb.KeyValue,
	prettyKeyPrefix string,
) (prettyKey string, prettyValue string, err error) {
	prettyKey = prettyKeyPrefix

	// If this is the row sentinel (in the legacy pre-family format),
	// a value is not expected, so we're done.
	if family.ID == 0 {
		return "", "", nil
	}

	colID := family.DefaultColumnID
	if colID == 0 {
		return "", "", errors.Errorf("single entry value with no default column id")
	}

	if idx, ok := table.colIdxMap.Get(colID); ok {
		// idx is an ordinal into table.cols (that is how Init builds
		// colIdxMap), so the column must be looked up there and not in the
		// descriptor's full column list.
		col := table.cols[idx]
		if rf.traceKV {
			prettyKey = fmt.Sprintf("%s/%s", prettyKey, col.GetName())
		}
		if len(kv.Value.RawBytes) == 0 {
			// An empty value: there is nothing to decode for this column.
			return prettyKey, "", nil
		}
		typ := col.GetType()
		// TODO(arjun): The value is a directly marshaled single value, so we
		// unmarshal it eagerly here. This can potentially be optimized out,
		// although that would require changing UnmarshalColumnValue to operate
		// on bytes, and for Encode/DecodeTableValue to operate on marshaled
		// single values.
		value, err := valueside.UnmarshalLegacy(rf.alloc, typ, kv.Value)
		if err != nil {
			return "", "", err
		}
		if rf.traceKV {
			prettyValue = value.String()
		}
		table.row[idx] = rowenc.DatumToEncDatum(typ, value)
		if DebugRowFetch {
			log.Infof(ctx, "Scan %s -> %v", kv.Key, value)
		}
		return prettyKey, prettyValue, nil
	}

	// No need to unmarshal the column value. Either the column was part of
	// the index key or it isn't needed.
	if DebugRowFetch {
		log.Infof(ctx, "Scan %s -> [%d] (skipped)", kv.Key, colID)
	}
	return prettyKey, prettyValue, nil
}
func (rf *Fetcher) processValueBytes(
ctx context.Context,
table *tableInfo,
kv roachpb.KeyValue,
valueBytes []byte,
prettyKeyPrefix string,
) (prettyKey string, prettyValue string, err error) {
prettyKey = prettyKeyPrefix
if rf.traceKV {
if rf.prettyValueBuf == nil {
rf.prettyValueBuf = &bytes.Buffer{}
}
rf.prettyValueBuf.Reset()
}