lightning: support prepared statement and client stmt cache in logical import mode #55482
@@ -2,3 +2,4 @@
 backend = "tidb"
 on-duplicate = "error"
 logical-import-batch-rows = 1
+logical-import-prep-stmt = true
@@ -2,3 +2,4 @@
 backend = "tidb"
 on-duplicate = "ignore"
 logical-import-batch-rows = 1
+logical-import-prep-stmt = true
@@ -2,3 +2,4 @@
 backend = "tidb"
 on-duplicate = "replace"
 logical-import-batch-rows = 1
+logical-import-prep-stmt = true
@@ -21,6 +21,7 @@ import (
     "fmt"
     "strconv"
     "strings"
+    "sync"
     "time"

     gmysql "github.com/go-sql-driver/mysql"
@@ -43,6 +44,8 @@ import (
     "github.com/pingcap/tidb/pkg/table"
     "github.com/pingcap/tidb/pkg/types"
     "github.com/pingcap/tidb/pkg/util/dbutil"
+    "github.com/pingcap/tidb/pkg/util/hack"
+    "github.com/pingcap/tidb/pkg/util/kvcache"
     "github.com/pingcap/tidb/pkg/util/redact"
     "go.uber.org/zap"
     "go.uber.org/zap/zapcore"
@@ -56,12 +59,16 @@ var extraHandleTableColumn = &table.Column{

 const (
     writeRowsMaxRetryTimes = 3
+    // To limit memory usage for prepared statements.
+    prepStmtCacheSize uint = 100
 )

 type tidbRow struct {
-    insertStmt string
-    path string
-    offset int64
+    insertStmt string
+    preparedInsertStmt string
+    values []any
+    path string
+    offset int64
 }

 var emptyTiDBRow = tidbRow{
@@ -91,8 +98,9 @@ type tidbEncoder struct {
     // the there are enough columns.
     columnCnt int
     // data file path
-    path string
-    logger log.Logger
+    path string
+    logger log.Logger
+    prepStmt bool
 }

 type encodingBuilder struct{}
@@ -106,10 +114,11 @@ func NewEncodingBuilder() encode.EncodingBuilder {
 // It implements the `backend.EncodingBuilder` interface.
 func (*encodingBuilder) NewEncoder(_ context.Context, config *encode.EncodingConfig) (encode.Encoder, error) {
     return &tidbEncoder{
-        mode: config.SQLMode,
-        tbl: config.Table,
-        path: config.Path,
-        logger: config.Logger,
+        mode: config.SQLMode,
+        tbl: config.Table,
+        path: config.Path,
+        logger: config.Logger,
+        prepStmt: config.LogicalImportPrepStmt,
     }, nil
 }
@@ -288,6 +297,16 @@ func (*targetInfoGetter) CheckRequirements(ctx context.Context, _ *backend.Check
     return nil
 }

+// stmtKey defines key for stmtCache.
+type stmtKey struct {
+    query string
+}
+
+// Hash implements SimpleLRUCache.Key.
+func (k *stmtKey) Hash() []byte {
+    return hack.Slice(k.query)
+}
+
 type tidbBackend struct {
     db *sql.DB
     conflictCfg config.Conflict
@@ -301,6 +320,9 @@ type tidbBackend struct {
     // affecting the cluster too much.
     maxChunkSize uint64
     maxChunkRows int
+    // implement stmtCache to improve performance
+    stmtCache *kvcache.SimpleLRUCache
+    stmtCacheMutex sync.RWMutex
 }

 var _ backend.Backend = (*tidbBackend)(nil)

Review thread on the new stmtCache field:
- as long as the table is importing, the prepared stmt is needed, no need for such an LRU
- the reason to use an LRU here is that the batch-insert prepared statements may not all be the same, since the insert/replace statement is constrained by …
- a batch size that depends on row size is unstable for prepared statements; in the worst case every prepared stmt is different. We can ignore batch-size in this case and only honor batch-rows, cc @lance6716
- the default …
- not the author of #46607, but we have a customer whose database is unable to handle 96 KB transactions, which requires the limit to be lowered. The …
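To make the concern above concrete, here is a small sketch (hypothetical table and column names, not code from this PR) of why the prepared-statement text varies with batch size: the placeholder SQL contains one `(?,...)` group per row, so batches cut at different row counts yield different statement strings, which is why a bounded cache is used instead of a single statement.

```go
package main

import (
	"fmt"
	"strings"
)

// batchSQL builds the placeholder form of a multi-row INSERT for n rows of
// two columns. The text depends on n, so two batches of different sizes
// cannot share one prepared statement.
func batchSQL(n int) string {
	groups := make([]string, n)
	for i := range groups {
		groups[i] = "(?,?)"
	}
	return "INSERT INTO `db`.`t` (c1,c2) VALUES " + strings.Join(groups, ",")
}

func main() {
	fmt.Println(batchSQL(2)) // INSERT INTO `db`.`t` (c1,c2) VALUES (?,?),(?,?)
	fmt.Println(batchSQL(3)) // INSERT INTO `db`.`t` (c1,c2) VALUES (?,?),(?,?),(?,?)
}
```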
@@ -334,13 +356,23 @@ func NewTiDBBackend(
         log.FromContext(ctx).Warn("unsupported conflict strategy for TiDB backend, overwrite with `error`")
         onDuplicate = config.ErrorOnDup
     }
+    var stmtCache *kvcache.SimpleLRUCache
+    if cfg.TikvImporter.LogicalImportPrepStmt {
+        stmtCache = kvcache.NewSimpleLRUCache(prepStmtCacheSize, 0, 0)
+        stmtCache.SetOnEvict(func(_ kvcache.Key, value kvcache.Value) {
+            stmt := value.(*sql.Stmt)
+            stmt.Close()
+        })
+    }
     return &tidbBackend{
-        db: db,
-        conflictCfg: conflict,
-        onDuplicate: onDuplicate,
-        errorMgr: errorMgr,
-        maxChunkSize: uint64(cfg.TikvImporter.LogicalImportBatchSize),
-        maxChunkRows: cfg.TikvImporter.LogicalImportBatchRows,
+        db: db,
+        conflictCfg: conflict,
+        onDuplicate: onDuplicate,
+        errorMgr: errorMgr,
+        maxChunkSize: uint64(cfg.TikvImporter.LogicalImportBatchSize),
+        maxChunkRows: cfg.TikvImporter.LogicalImportBatchRows,
+        stmtCache: stmtCache,
+        stmtCacheMutex: sync.RWMutex{},
     }
 }
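To illustrate the eviction hook wired up above, here is a minimal, self-contained sketch of a close-on-evict statement cache. It is only a stand-in for kvcache.SimpleLRUCache (names such as stmtLRU are made up), showing the idea that an evicted *sql.Stmt must be closed so server-side prepared statements are not leaked.

```go
package sketch

import (
	"container/list"
	"database/sql"
)

// entry pairs a statement text with its prepared statement.
type entry struct {
	query string
	stmt  *sql.Stmt
}

// stmtLRU is a tiny LRU keyed by statement text; statements that fall out
// of the cache are closed so they are also released on the server side.
type stmtLRU struct {
	capacity int
	order    *list.List               // front = most recently used
	items    map[string]*list.Element // query -> element holding *entry
}

func newStmtLRU(capacity int) *stmtLRU {
	return &stmtLRU{capacity: capacity, order: list.New(), items: make(map[string]*list.Element)}
}

func (c *stmtLRU) get(query string) (*sql.Stmt, bool) {
	if el, ok := c.items[query]; ok {
		c.order.MoveToFront(el)
		return el.Value.(*entry).stmt, true
	}
	return nil, false
}

func (c *stmtLRU) put(query string, stmt *sql.Stmt) {
	if _, ok := c.items[query]; ok {
		return // already cached; the caller should close its duplicate
	}
	c.items[query] = c.order.PushFront(&entry{query: query, stmt: stmt})
	if c.order.Len() > c.capacity {
		oldest := c.order.Back()
		c.order.Remove(oldest)
		evicted := oldest.Value.(*entry)
		delete(c.items, evicted.query)
		evicted.stmt.Close() // the on-evict hook, inlined here
	}
}
```

The PR gets the same effect from kvcache.SimpleLRUCache plus SetOnEvict; the sketch only shows the shape of the idea.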
@@ -556,16 +588,25 @@ func (enc *tidbEncoder) Encode(row []types.Datum, _ int64, columnPermutation []i
         return emptyTiDBRow, errors.Errorf("column count mismatch, at most %d but got %d", len(enc.columnIdx), len(row))
     }

-    var encoded strings.Builder
+    var encoded, preparedInsertStmt strings.Builder
+    var values []any
     encoded.Grow(8 * len(row))
     encoded.WriteByte('(')
+    if enc.prepStmt {
+        preparedInsertStmt.Grow(2 * len(row))
+        preparedInsertStmt.WriteByte('(')
+        values = make([]any, 0, len(row))
+    }
     cnt := 0
     for i, field := range row {
         if enc.columnIdx[i] < 0 {
             continue
         }
         if cnt > 0 {
             encoded.WriteByte(',')
+            if enc.prepStmt {
+                preparedInsertStmt.WriteByte(',')
+            }
         }
         datum := field
         if err := enc.appendSQL(&encoded, &datum, getColumnByIndex(cols, enc.columnIdx[i])); err != nil {
@@ -576,13 +617,23 @@ func (enc *tidbEncoder) Encode(row []types.Datum, _ int64, columnPermutation []i
             )
             return nil, err
         }
+        if enc.prepStmt {
+            preparedInsertStmt.WriteByte('?')
+            values = append(values, datum.GetValue())
+        }
         cnt++
     }
     encoded.WriteByte(')')
+    if enc.prepStmt {
+        preparedInsertStmt.WriteByte(')')
+    }

     return tidbRow{
-        insertStmt: encoded.String(),
-        path: enc.path,
-        offset: offset,
+        insertStmt: encoded.String(),
+        preparedInsertStmt: preparedInsertStmt.String(),
+        values: values,
+        path: enc.path,
+        offset: offset,
     }, nil
 }
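For a concrete picture of what Encode now produces, here is a minimal sketch (simplified: no column permutation, type handling, or SQL-mode escaping, and all names are made up) that renders one row both as a literal tuple and as a placeholder tuple with its argument slice.

```go
package main

import (
	"fmt"
	"strings"
)

// encodeRow mimics the idea in tidbEncoder.Encode: build the literal form
// ('1','a') and, when prepStmt is enabled, the placeholder form (?,?) with
// the raw values collected separately for a prepared statement.
func encodeRow(row []any, prepStmt bool) (literal, placeholders string, args []any) {
	var lit, ph strings.Builder
	lit.WriteByte('(')
	if prepStmt {
		ph.WriteByte('(')
		args = make([]any, 0, len(row))
	}
	for i, v := range row {
		if i > 0 {
			lit.WriteByte(',')
			if prepStmt {
				ph.WriteByte(',')
			}
		}
		fmt.Fprintf(&lit, "'%v'", v) // the real code escapes per column type and SQL mode
		if prepStmt {
			ph.WriteByte('?')
			args = append(args, v)
		}
	}
	lit.WriteByte(')')
	if prepStmt {
		ph.WriteByte(')')
	}
	return lit.String(), ph.String(), args
}

func main() {
	lit, ph, args := encodeRow([]any{1, "a"}, true)
	fmt.Println(lit)  // ('1','a')
	fmt.Println(ph)   // (?,?)
	fmt.Println(args) // [1 a]
}
```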
@@ -665,8 +716,9 @@ rowLoop:
 }

 type stmtTask struct {
-    rows tidbRows
-    stmt string
+    rows tidbRows
+    stmt string
+    values []any
 }

 // WriteBatchRowsToDB write rows in batch mode, which will insert multiple rows like this:
@@ -679,14 +731,23 @@ func (be *tidbBackend) WriteBatchRowsToDB(ctx context.Context, tableName string,
     }
     // Note: we are not going to do interpolation (prepared statements) to avoid
     // complication arise from data length overflow of BIT and BINARY columns
+    var values []any
+    if be.stmtCache != nil && len(rows) > 0 {
+        values = make([]any, 0, len(rows[0].values)*len(rows))
+    }
     stmtTasks := make([]stmtTask, 1)
     for i, row := range rows {
         if i != 0 {
             insertStmt.WriteByte(',')
         }
-        insertStmt.WriteString(row.insertStmt)
+        if be.stmtCache != nil {
+            insertStmt.WriteString(row.preparedInsertStmt)
+            values = append(values, row.values...)
+        } else {
+            insertStmt.WriteString(row.insertStmt)
+        }
     }
-    stmtTasks[0] = stmtTask{rows, insertStmt.String()}
+    stmtTasks[0] = stmtTask{rows, insertStmt.String(), values}
     return be.execStmts(ctx, stmtTasks, tableName, true)
 }
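The batch path above appends one placeholder group per row and flattens every row's values into a single argument slice. Here is a generic database/sql sketch of that flow under assumed table and column names (the PR itself fetches the statement from its cache rather than preparing inline).

```go
package sketch

import (
	"context"
	"database/sql"
	"strings"
)

// insertBatch prepares one multi-row INSERT and executes it with the
// flattened arguments of all rows, mirroring the shape of WriteBatchRowsToDB.
func insertBatch(ctx context.Context, db *sql.DB, rows [][]any) error {
	if len(rows) == 0 {
		return nil
	}
	var sb strings.Builder
	sb.WriteString("INSERT INTO `db`.`t` (c1,c2) VALUES ")
	args := make([]any, 0, len(rows)*len(rows[0]))
	for i, row := range rows {
		if i != 0 {
			sb.WriteByte(',')
		}
		sb.WriteString("(?,?)") // one placeholder group per row
		args = append(args, row...)
	}
	stmt, err := db.PrepareContext(ctx, sb.String())
	if err != nil {
		return err
	}
	defer stmt.Close()
	_, err = stmt.ExecContext(ctx, args...)
	return err
}
```

With two rows (1, "a") and (2, "b") this executes INSERT INTO `db`.`t` (c1,c2) VALUES (?,?),(?,?) with arguments [1 a 2 b]; because the statement text only depends on the row count, repeated batches of the same size can reuse one cached prepared statement.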
@@ -715,8 +776,12 @@ func (be *tidbBackend) WriteRowsToDB(ctx context.Context, tableName string, colu
     for _, row := range rows {
         var finalInsertStmt strings.Builder
         finalInsertStmt.WriteString(is)
-        finalInsertStmt.WriteString(row.insertStmt)
-        stmtTasks = append(stmtTasks, stmtTask{[]tidbRow{row}, finalInsertStmt.String()})
+        if be.stmtCache != nil {
+            finalInsertStmt.WriteString(row.preparedInsertStmt)
+        } else {
+            finalInsertStmt.WriteString(row.insertStmt)
+        }
+        stmtTasks = append(stmtTasks, stmtTask{[]tidbRow{row}, finalInsertStmt.String(), row.values})
     }
     return be.execStmts(ctx, stmtTasks, tableName, false)
 }
@@ -754,8 +819,34 @@ stmtLoop:
         err error
     )
     for i := 0; i < writeRowsMaxRetryTimes; i++ {
-        stmt := stmtTask.stmt
-        result, err = be.db.ExecContext(ctx, stmt)
+        query := stmtTask.stmt
+        if be.stmtCache != nil {
+            var prepStmt *sql.Stmt
+            key := &stmtKey{query: query}
+            be.stmtCacheMutex.RLock()
+            stmt, ok := be.stmtCache.Get(key)
+            be.stmtCacheMutex.RUnlock()
+            if ok {
+                prepStmt = stmt.(*sql.Stmt)
+            } else if stmt, err := be.db.PrepareContext(ctx, query); err == nil {
+                be.stmtCacheMutex.Lock()
+                // check again if the key is already in the cache
+                // to avoid override existing stmt without closing it
+                if cachedStmt, ok := be.stmtCache.Get(key); !ok {
+                    prepStmt = stmt
+                    be.stmtCache.Put(key, stmt)
+                } else {
+                    prepStmt = cachedStmt.(*sql.Stmt)
+                    stmt.Close()
+                }
+                be.stmtCacheMutex.Unlock()
+            } else {
+                return errors.Trace(err)
+            }
+            result, err = prepStmt.ExecContext(ctx, stmtTask.values...)
+        } else {
+            result, err = be.db.ExecContext(ctx, query)
+        }
         if err == nil {
             affected, err2 := result.RowsAffected()
             if err2 != nil {
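The lookup above takes a read lock on the fast path and re-checks the key under the write lock, so two goroutines preparing the same text concurrently cannot overwrite (and leak) each other's statement. Here is a stripped-down sketch of just that pattern, with a plain map standing in for the LRU and all names assumed.

```go
package sketch

import (
	"context"
	"database/sql"
	"sync"
)

// stmtPool caches prepared statements by their SQL text.
type stmtPool struct {
	mu    sync.RWMutex
	stmts map[string]*sql.Stmt
}

func newStmtPool() *stmtPool {
	return &stmtPool{stmts: make(map[string]*sql.Stmt)}
}

// get returns a cached statement for query, preparing it on a miss. The
// second lookup under the write lock avoids overwriting a statement that
// another goroutine prepared in the meantime, which would leak it.
func (p *stmtPool) get(ctx context.Context, db *sql.DB, query string) (*sql.Stmt, error) {
	p.mu.RLock()
	stmt, ok := p.stmts[query]
	p.mu.RUnlock()
	if ok {
		return stmt, nil
	}
	prepared, err := db.PrepareContext(ctx, query)
	if err != nil {
		return nil, err
	}
	p.mu.Lock()
	defer p.mu.Unlock()
	if cached, ok := p.stmts[query]; ok {
		prepared.Close() // lost the race; keep the existing statement
		return cached, nil
	}
	p.stmts[query] = prepared
	return prepared, nil
}
```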
@@ -776,7 +867,7 @@ stmtLoop:

         if !common.IsContextCanceledError(err) {
             log.FromContext(ctx).Error("execute statement failed",
-                zap.Array("rows", stmtTask.rows), zap.String("stmt", redact.Value(stmt)), zap.Error(err))
+                zap.Array("rows", stmtTask.rows), zap.String("stmt", redact.Value(query)), zap.Error(err))
         }
         // It's batch mode, just return the error. Caller will fall back to row-by-row mode.
         if batch {
Review comment on stmtKey.Hash: (I'm not sure why the kvcache.Key interface calls it Hash. Hash usually means a fixed-length digest.)
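For what that interface expects, a tiny sketch of a query-keyed cache key (a generic stand-in, not TiDB's actual kvcache definition): Hash only has to return bytes that identify the entry, so the raw query text suffices and no fixed-length digest is involved.

```go
package sketch

// Key is what an LRU cache of this style typically requires: something that
// can yield stable bytes to index the entry by.
type Key interface {
	Hash() []byte
}

// queryKey identifies a cache entry by the full SQL text.
type queryKey struct {
	query string
}

// Hash returns the query bytes verbatim; despite the name, it acts as an
// identity key rather than a digest (the PR's stmtKey uses hack.Slice to
// avoid copying the string).
func (k queryKey) Hash() []byte {
	return []byte(k.query)
}
```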