Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

lightning: support data files with bom header (#40813) #40834

Merged
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions DEPS.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -3420,6 +3420,14 @@ def go_deps():
sum = "h1:CZ7eSOd3kZoaYDLbXnmzgQI5RlciuXBMA+18HwHRfZQ=",
version = "v1.12.0",
)
go_repository(
name = "com_github_spkg_bom",
build_file_proto_mode = "disable",
importpath = "github.com/spkg/bom",
sum = "h1:S939THe0ukL5WcTGiGqkgtaW5JW+O6ITaIlpJXTYY64=",
version = "v1.0.0",
)

go_repository(
name = "com_github_ssgreg_nlreturn_v2",
build_file_proto_mode = "disable",
Expand Down
1 change: 1 addition & 0 deletions br/pkg/lightning/mydump/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ go_library(
"//util/slice",
"//util/table-filter",
"@com_github_pingcap_errors//:errors",
"@com_github_spkg_bom//:bom",
"@com_github_xitongsys_parquet_go//parquet",
"@com_github_xitongsys_parquet_go//reader",
"@com_github_xitongsys_parquet_go//source",
Expand Down
9 changes: 8 additions & 1 deletion br/pkg/lightning/mydump/parser.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ import (
"github.com/pingcap/tidb/br/pkg/lightning/worker"
"github.com/pingcap/tidb/parser/mysql"
"github.com/pingcap/tidb/types"
"github.com/spkg/bom"
"go.uber.org/zap"
"go.uber.org/zap/zapcore"
)
Expand Down Expand Up @@ -285,7 +286,13 @@ func (parser *blockParser) readBlock() error {
parser.remainBuf.Write(parser.buf)
parser.appendBuf.Reset()
parser.appendBuf.Write(parser.remainBuf.Bytes())
parser.appendBuf.Write(parser.blockBuf[:n])
blockData := parser.blockBuf[:n]
if parser.pos == 0 {
bomCleanedData := bom.Clean(blockData)
parser.pos += int64(n - len(bomCleanedData))
blockData = bomCleanedData
}
parser.appendBuf.Write(blockData)
parser.buf = parser.appendBuf.Bytes()
if parser.metrics != nil {
parser.metrics.ChunkParserReadBlockSecondsHistogram.Observe(time.Since(startTime).Seconds())
Expand Down
3 changes: 2 additions & 1 deletion br/pkg/lightning/mydump/reader.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ import (
"github.com/pingcap/tidb/br/pkg/lightning/log"
"github.com/pingcap/tidb/br/pkg/lightning/worker"
"github.com/pingcap/tidb/br/pkg/storage"
"github.com/spkg/bom"
"go.uber.org/zap"
"golang.org/x/text/encoding/simplifiedchinese"
)
Expand Down Expand Up @@ -83,7 +84,7 @@ func ExportStatement(ctx context.Context, store storage.ExternalStorage, sqlFile
}
defer fd.Close()

br := bufio.NewReader(fd)
br := bufio.NewReader(bom.NewReader(fd))

data := make([]byte, 0, sqlFile.FileMeta.FileSize+1)
buffer := make([]byte, 0, sqlFile.FileMeta.FileSize+1)
Expand Down
2 changes: 2 additions & 0 deletions br/tests/lightning_bom_file/config.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
[mydumper.csv]
header = true
5 changes: 5 additions & 0 deletions br/tests/lightning_bom_file/data/mytest.testtbl-schema.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
CREATE TABLE testtbl (
id INTEGER,
val1 VARCHAR(40) NOT NULL,
INDEX `idx_val1` (`val1`)
);
6 changes: 6 additions & 0 deletions br/tests/lightning_bom_file/data/mytest.testtbl.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
id,val1
1,"aaa01"
2,"aaa01"
3,"aaa02"
4,"aaa02"
5,"aaa05"
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
CREATE TABLE testtbl (
id INTEGER,
val1 VARCHAR(40) NOT NULL,
INDEX `idx_val1` (`val1`)
);
6 changes: 6 additions & 0 deletions br/tests/lightning_bom_file/original_data/mytest.testtbl.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
id,val1
1,"aaa01"
2,"aaa01"
3,"aaa02"
4,"aaa02"
5,"aaa05"
56 changes: 56 additions & 0 deletions br/tests/lightning_bom_file/run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#!/bin/sh
#
# Copyright 2023 PingCAP, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

set -eux

mydir=$(dirname "${BASH_SOURCE[0]}")

original_schema_file="${mydir}/original_data/mytest.testtbl-schema.sql"
original_data_file="${mydir}/original_data/mytest.testtbl.csv"
schema_file="${original_schema_file/original_data/data}"
data_file="${original_data_file/original_data/data}"

# add the BOM header
printf '\xEF\xBB\xBF' | cat - <( sed '1s/^\xEF\xBB\xBF//' "${original_schema_file}" ) > "${schema_file}"
printf '\xEF\xBB\xBF' | cat - <( sed '1s/^\xEF\xBB\xBF//' "${original_data_file}" ) > "${data_file}"

# verify the BOM header
if ! grep -q $'^\xEF\xBB\xBF' "${schema_file}"; then
echo "schema file doesn't contain the BOM header" >&2
exit 1
fi

if ! grep -q $'^\xEF\xBB\xBF' "${data_file}"; then
echo "data file doesn't contain the BOM header" >&2
exit 1
fi

row_count=$( sed '1d' "${data_file}" | wc -l | xargs echo )

run_lightning --backend tidb

# Check that everything is correctly imported
run_sql 'SELECT count(*) FROM mytest.testtbl'
check_contains "count(*): ${row_count}"

check_cluster_version 4 0 0 'local backend' || exit 0
run_sql "DROP TABLE mytest.testtbl"

run_lightning --backend local

# Check that everything is correctly imported
run_sql 'SELECT count(*) FROM mytest.testtbl'
check_contains "count(*): ${row_count}"
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ require (
github.com/soheilhy/cmux v0.1.5
github.com/spf13/cobra v1.6.1
github.com/spf13/pflag v1.0.5
github.com/spkg/bom v1.0.0
github.com/stretchr/testify v1.8.0
github.com/tdakkota/asciicheck v0.1.1
github.com/tiancaiamao/appdash v0.0.0-20181126055449-889f96f722a2
Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -906,6 +906,8 @@ github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA=
github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=
github.com/spf13/viper v1.3.2/go.mod h1:ZiWeW+zYFKm7srdB9IoDzzZXaJaI5eL9QjNiN/DMA2s=
github.com/spf13/viper v1.7.0/go.mod h1:8WkrPz2fc9jxqZNCJI/76HCieCp4Q8HaLFoCha5qpdg=
github.com/spkg/bom v1.0.0 h1:S939THe0ukL5WcTGiGqkgtaW5JW+O6ITaIlpJXTYY64=
github.com/spkg/bom v1.0.0/go.mod h1:lAz2VbTuYNcvs7iaFF8WW0ufXrHShJ7ck1fYFFbVXJs=
github.com/stathat/consistent v1.0.0 h1:ZFJ1QTRn8npNBKW065raSZ8xfOqhpb8vLOkfp4CcL/U=
github.com/stathat/consistent v1.0.0/go.mod h1:uajTPbgSygZBJ+V+0mY7meZ8i0XAcZs7AQ6V121XSxw=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
Expand Down