From 4b7e893b739e696612c4667fd011dd14db380374 Mon Sep 17 00:00:00 2001 From: tangenta Date: Wed, 29 Jun 2022 18:37:00 +0800 Subject: [PATCH 1/6] parser, ddl: support decoding binary literal in set/enum --- .../r/new_character_set_invalid.result | 15 +++++++++++++ .../t/new_character_set_invalid.test | 7 ++++++ ddl/ddl_api.go | 20 +++++++++++++++++ parser/ast/misc.go | 22 ------------------- parser/parser.go | 22 ++++++++++--------- parser/parser.y | 22 ++++++++++--------- parser/types/field_type.go | 21 +++++++++++++++++- 7 files changed, 86 insertions(+), 43 deletions(-) diff --git a/cmd/explaintest/r/new_character_set_invalid.result b/cmd/explaintest/r/new_character_set_invalid.result index 92e43d6c747fa..864ee5cea89e1 100644 --- a/cmd/explaintest/r/new_character_set_invalid.result +++ b/cmd/explaintest/r/new_character_set_invalid.result @@ -21,3 +21,18 @@ a b c ? ? ? 中文?中文 asdf?fdsa 字符集?字符集 @@ @@ @@ +drop table t; +create table t(f set(0xD2BB, 0xC8FD), e enum(0xBAEC,0x6A59)); +show create table t; +Table Create Table +t CREATE TABLE `t` ( + `f` set('һ','??') DEFAULT NULL, + `e` enum('??','jY') DEFAULT NULL +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin +drop table t; +create table t( e enum(0xBAEC,0x6A59)); +show create table t; +Table Create Table +t CREATE TABLE `t` ( + `e` enum('??','jY') DEFAULT NULL +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin diff --git a/cmd/explaintest/t/new_character_set_invalid.test b/cmd/explaintest/t/new_character_set_invalid.test index eaed9ba78c518..aa1a350c1f310 100644 --- a/cmd/explaintest/t/new_character_set_invalid.test +++ b/cmd/explaintest/t/new_character_set_invalid.test @@ -15,3 +15,10 @@ insert into t values ('À', 'ø', '😂'); insert into t values ('中文À中文', 'asdføfdsa', '字符集😂字符集'); insert into t values (0x4040ffff, 0x4040ffff, 0x4040ffff); select * from t; + +drop table t; +create table t(f set(0xD2BB, 0xC8FD), e enum(0xBAEC,0x6A59)); +show create table t; +drop table t; +create table t( e enum(0xBAEC,0x6A59)); +show create table t; diff --git a/ddl/ddl_api.go b/ddl/ddl_api.go index d0b799731a779..16ed5397eff34 100644 --- a/ddl/ddl_api.go +++ b/ddl/ddl_api.go @@ -56,6 +56,7 @@ import ( "github.com/pingcap/tidb/util/collate" "github.com/pingcap/tidb/util/dbterror" "github.com/pingcap/tidb/util/domainutil" + "github.com/pingcap/tidb/util/hack" "github.com/pingcap/tidb/util/logutil" "github.com/pingcap/tidb/util/mathutil" "github.com/pingcap/tidb/util/mock" @@ -820,6 +821,23 @@ func setCharsetCollationFlenDecimal(tp *types.FieldType, colName, colCharset, co return checkTooBigFieldLengthAndTryAutoConvert(tp, colName, sessVars) } +func decodeEnumSetBinaryLiteralToUTF8(tp *types.FieldType, chs string) { + if tp.GetType() != mysql.TypeEnum && tp.GetType() != mysql.TypeSet { + return + } + enc := charset.FindEncoding(chs) + for i, elem := range tp.GetElems() { + if !tp.GetElemIsBinaryLit(i) { + continue + } + s, err := enc.Transform(nil, hack.Slice(elem), charset.OpDecodeReplace) + if err != nil { + logutil.BgLogger().Warn("decode enum binary literal to utf-8 failed", zap.Error(err)) + } + tp.SetElem(i, string(hack.String(s))) + } +} + // buildColumnAndConstraint builds table.Column and ast.Constraint from the parameters. // outPriKeyConstraint is the primary key constraint out of column definition. For example: // `create table t1 (id int , age int, primary key(id));` @@ -852,6 +870,7 @@ func buildColumnAndConstraint( if err := setCharsetCollationFlenDecimal(colDef.Tp, colDef.Name.Name.O, chs, coll, ctx.GetSessionVars()); err != nil { return nil, nil, errors.Trace(err) } + decodeEnumSetBinaryLiteralToUTF8(colDef.Tp, chs) col, cts, err := columnDefToCol(ctx, offset, colDef, outPriKeyConstraint) if err != nil { return nil, nil, errors.Trace(err) @@ -4521,6 +4540,7 @@ func (d *ddl) getModifiableColumnJob(ctx context.Context, sctx sessionctx.Contex if err = setCharsetCollationFlenDecimal(&newCol.FieldType, newCol.Name.O, chs, coll, sctx.GetSessionVars()); err != nil { return nil, errors.Trace(err) } + decodeEnumSetBinaryLiteralToUTF8(&newCol.FieldType, chs) // Check the column with foreign key, waiting for the default flen and decimal. if fkInfo := getColumnForeignKeyInfo(originalColName.L, t.Meta().ForeignKeys); fkInfo != nil { diff --git a/parser/ast/misc.go b/parser/ast/misc.go index 38c465568634a..08e14575c53eb 100644 --- a/parser/ast/misc.go +++ b/parser/ast/misc.go @@ -22,7 +22,6 @@ import ( "github.com/pingcap/errors" "github.com/pingcap/tidb/parser/auth" - "github.com/pingcap/tidb/parser/charset" "github.com/pingcap/tidb/parser/format" "github.com/pingcap/tidb/parser/model" "github.com/pingcap/tidb/parser/mysql" @@ -3619,27 +3618,6 @@ type TextString struct { IsBinaryLiteral bool } -// TransformTextStrings converts a slice of TextString to strings. -// This is only used by enum/set strings. -func TransformTextStrings(ts []*TextString, _ string) []string { - // The UTF-8 encoding rather than other encoding is used - // because parser is not possible to determine the "real" - // charset that a binary literal string should be converted to. - enc := charset.EncodingUTF8Impl - ret := make([]string, 0, len(ts)) - for _, t := range ts { - if !t.IsBinaryLiteral { - ret = append(ret, t.Value) - } else { - // Validate the binary literal string. - // See https://github.com/pingcap/tidb/issues/30740. - r, _ := enc.Transform(nil, charset.HackSlice(t.Value), charset.OpDecodeNoErr) - ret = append(ret, charset.HackString(r)) - } - } - return ret -} - type BinaryLiteral interface { ToString() string } diff --git a/parser/parser.go b/parser/parser.go index 051b5550596c1..9f9112dc76731 100644 --- a/parser/parser.go +++ b/parser/parser.go @@ -19881,12 +19881,13 @@ yynewstate: tp := types.NewFieldType(mysql.TypeEnum) elems := yyS[yypt-2].item.([]*ast.TextString) opt := yyS[yypt-0].item.(*ast.OptBinary) - tp.SetElems(ast.TransformTextStrings(elems, opt.Charset)) + tp.SetElems(make([]string, len(elems))) fieldLen := -1 // enum_flen = max(ele_flen) - for i := range tp.GetElems() { - tp.SetElem(i, strings.TrimRight(tp.GetElem(i), " ")) - if len(tp.GetElem(i)) > fieldLen { - fieldLen = len(tp.GetElem(i)) + for i, e := range elems { + trimmed := strings.TrimRight(e.Value, " ") + tp.SetElemWithIsBinaryLit(i, trimmed, e.IsBinaryLiteral) + if len(trimmed) > fieldLen { + fieldLen = len(trimmed) } } tp.SetFlen(fieldLen) @@ -19901,11 +19902,12 @@ yynewstate: tp := types.NewFieldType(mysql.TypeSet) elems := yyS[yypt-2].item.([]*ast.TextString) opt := yyS[yypt-0].item.(*ast.OptBinary) - tp.SetElems(ast.TransformTextStrings(elems, opt.Charset)) - fieldLen := len(tp.GetElems()) - 1 // set_flen = sum(ele_flen) + number_of_ele - 1 - for i := range tp.GetElems() { - tp.SetElem(i, strings.TrimRight(tp.GetElem(i), " ")) - fieldLen += len(tp.GetElem(i)) + tp.SetElems(make([]string, len(elems))) + fieldLen := len(elems) - 1 // set_flen = sum(ele_flen) + number_of_ele - 1 + for i, e := range elems { + trimmed := strings.TrimRight(e.Value, " ") + tp.SetElemWithIsBinaryLit(i, trimmed, e.IsBinaryLiteral) + fieldLen += len(trimmed) } tp.SetFlen(fieldLen) tp.SetCharset(opt.Charset) diff --git a/parser/parser.y b/parser/parser.y index 131fa55aaaebd..0cda00670a6eb 100644 --- a/parser/parser.y +++ b/parser/parser.y @@ -11896,12 +11896,13 @@ StringType: tp := types.NewFieldType(mysql.TypeEnum) elems := $3.([]*ast.TextString) opt := $5.(*ast.OptBinary) - tp.SetElems(ast.TransformTextStrings(elems, opt.Charset)) + tp.SetElems(make([]string, len(elems))) fieldLen := -1 // enum_flen = max(ele_flen) - for i := range tp.GetElems() { - tp.SetElem(i, strings.TrimRight(tp.GetElem(i), " ")) - if len(tp.GetElem(i)) > fieldLen { - fieldLen = len(tp.GetElem(i)) + for i, e := range elems { + trimmed := strings.TrimRight(e.Value, " ") + tp.SetElemWithIsBinaryLit(i, trimmed, e.IsBinaryLiteral) + if len(trimmed) > fieldLen { + fieldLen = len(trimmed) } } tp.SetFlen(fieldLen) @@ -11916,11 +11917,12 @@ StringType: tp := types.NewFieldType(mysql.TypeSet) elems := $3.([]*ast.TextString) opt := $5.(*ast.OptBinary) - tp.SetElems(ast.TransformTextStrings(elems, opt.Charset)) - fieldLen := len(tp.GetElems()) - 1 // set_flen = sum(ele_flen) + number_of_ele - 1 - for i := range tp.GetElems() { - tp.SetElem(i, strings.TrimRight(tp.GetElem(i), " ")) - fieldLen += len(tp.GetElem(i)) + tp.SetElems(make([]string, len(elems))) + fieldLen := len(elems) - 1 // set_flen = sum(ele_flen) + number_of_ele - 1 + for i, e := range elems { + trimmed := strings.TrimRight(e.Value, " ") + tp.SetElemWithIsBinaryLit(i, trimmed, e.IsBinaryLiteral) + fieldLen += len(trimmed) } tp.SetFlen(fieldLen) tp.SetCharset(opt.Charset) diff --git a/parser/types/field_type.go b/parser/types/field_type.go index aa984e2a945b6..1e9986339339e 100644 --- a/parser/types/field_type.go +++ b/parser/types/field_type.go @@ -53,7 +53,8 @@ type FieldType struct { // collate represent collate rules of the charset collate string // elems is the element list for enum and set type. - elems []string + elems []string + elemsIsBinaryLit []bool } // NewFieldType returns a FieldType, @@ -180,10 +181,28 @@ func (ft *FieldType) SetElem(idx int, element string) { ft.elems[idx] = element } +func (ft *FieldType) SetElemWithIsBinaryLit(idx int, element string, isBinaryLit bool) { + ft.elems[idx] = element + if isBinaryLit { + // Create the binary literal flags lazily. + if ft.elemsIsBinaryLit == nil { + ft.elemsIsBinaryLit = make([]bool, len(ft.elems)) + } + ft.elemsIsBinaryLit[idx] = true + } +} + func (ft *FieldType) GetElem(idx int) string { return ft.elems[idx] } +func (ft *FieldType) GetElemIsBinaryLit(idx int) bool { + if len(ft.elemsIsBinaryLit) == 0 { + return false + } + return ft.elemsIsBinaryLit[idx] +} + // Clone returns a copy of itself. func (ft *FieldType) Clone() *FieldType { ret := *ft From 2ef864a493d0144c1c4b6241813d792ba87940ea Mon Sep 17 00:00:00 2001 From: tangenta Date: Wed, 29 Jun 2022 18:53:06 +0800 Subject: [PATCH 2/6] test: specifying gbk_bin as table's collation --- .../r/new_character_set_invalid.result | 15 +++++++++++++++ cmd/explaintest/t/new_character_set_invalid.test | 6 ++++++ 2 files changed, 21 insertions(+) diff --git a/cmd/explaintest/r/new_character_set_invalid.result b/cmd/explaintest/r/new_character_set_invalid.result index 864ee5cea89e1..4134b1d6f2dce 100644 --- a/cmd/explaintest/r/new_character_set_invalid.result +++ b/cmd/explaintest/r/new_character_set_invalid.result @@ -36,3 +36,18 @@ Table Create Table t CREATE TABLE `t` ( `e` enum('??','jY') DEFAULT NULL ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin +drop table t; +create table t(f set(0xD2BB, 0xC8FD), e enum(0xBAEC,0x6A59)) collate gbk_bin; +show create table t; +Table Create Table +t CREATE TABLE `t` ( + `f` set('一','三') COLLATE gbk_bin DEFAULT NULL, + `e` enum('红','jY') COLLATE gbk_bin DEFAULT NULL +) ENGINE=InnoDB DEFAULT CHARSET=gbk COLLATE=gbk_bin +drop table t; +create table t( e enum(0xBAEC,0x6A59)) collate gbk_bin; +show create table t; +Table Create Table +t CREATE TABLE `t` ( + `e` enum('红','jY') COLLATE gbk_bin DEFAULT NULL +) ENGINE=InnoDB DEFAULT CHARSET=gbk COLLATE=gbk_bin diff --git a/cmd/explaintest/t/new_character_set_invalid.test b/cmd/explaintest/t/new_character_set_invalid.test index aa1a350c1f310..e6bb3ce88c75d 100644 --- a/cmd/explaintest/t/new_character_set_invalid.test +++ b/cmd/explaintest/t/new_character_set_invalid.test @@ -22,3 +22,9 @@ show create table t; drop table t; create table t( e enum(0xBAEC,0x6A59)); show create table t; +drop table t; +create table t(f set(0xD2BB, 0xC8FD), e enum(0xBAEC,0x6A59)) collate gbk_bin; +show create table t; +drop table t; +create table t( e enum(0xBAEC,0x6A59)) collate gbk_bin; +show create table t; From 87fcbbf81a73fa850919f84dc8d95e7269397049 Mon Sep 17 00:00:00 2001 From: tangenta Date: Wed, 29 Jun 2022 20:38:31 +0800 Subject: [PATCH 3/6] fix integration tests --- ddl/ddl_api.go | 1 + ddl/integration_test.go | 14 +++++++------- parser/ast/util_test.go | 2 ++ parser/parser_test.go | 2 ++ parser/types/field_type.go | 6 ++++++ 5 files changed, 18 insertions(+), 7 deletions(-) diff --git a/ddl/ddl_api.go b/ddl/ddl_api.go index 16ed5397eff34..c56fce26b203c 100644 --- a/ddl/ddl_api.go +++ b/ddl/ddl_api.go @@ -836,6 +836,7 @@ func decodeEnumSetBinaryLiteralToUTF8(tp *types.FieldType, chs string) { } tp.SetElem(i, string(hack.String(s))) } + tp.CleanElemIsBinaryLit() } // buildColumnAndConstraint builds table.Column and ast.Constraint from the parameters. diff --git a/ddl/integration_test.go b/ddl/integration_test.go index 34edd8dfe34d6..99ba587322f88 100644 --- a/ddl/integration_test.go +++ b/ddl/integration_test.go @@ -68,19 +68,19 @@ func TestDefaultValueInEnum(t *testing.T) { tk.MustExec("use test;") // The value 0x91 should not cause panic. tk.MustExec("create table t(a enum('a', 0x91) charset gbk);") - tk.MustExec("insert into t values (1), (2);") // Use 1-base index to locate the value. - tk.MustQuery("select a from t;").Check(testkit.Rows("a", "")) // 0x91 is truncate. + tk.MustExec("insert into t values (1), (2);") // Use 1-base index to locate the value. + tk.MustQuery("select a from t;").Check(testkit.Rows("a", "?")) // 0x91 is replaced to '?'. tk.MustExec("drop table t;") tk.MustExec("create table t (a enum('a', 0x91)) charset gbk;") // Test for table charset. tk.MustExec("insert into t values (1), (2);") - tk.MustQuery("select a from t;").Check(testkit.Rows("a", "")) + tk.MustQuery("select a from t;").Check(testkit.Rows("a", "?")) tk.MustExec("drop table t;") - tk.MustGetErrMsg("create table t(a set('a', 0x91, '') charset gbk);", - "[types:1291]Column 'a' has duplicated value '' in SET") - // Test valid utf-8 string value in enum. Note that the binary literal only can be decoded to utf-8. + tk.MustGetErrMsg("create table t(a set('a', 0x91, '?') charset gbk);", + "[types:1291]Column 'a' has duplicated value '?' in SET") + // Test valid utf-8 string value in enum. tk.MustExec("create table t (a enum('a', 0xE4BDA0E5A5BD) charset gbk);") tk.MustExec("insert into t values (1), (2);") - tk.MustQuery("select a from t;").Check(testkit.Rows("a", "你好")) + tk.MustQuery("select a from t;").Check(testkit.Rows("a", "浣犲ソ")) } func TestDDLStatementsBackFill(t *testing.T) { diff --git a/parser/ast/util_test.go b/parser/ast/util_test.go index 015f5dc5cc4eb..b43fac39ae481 100644 --- a/parser/ast/util_test.go +++ b/parser/ast/util_test.go @@ -147,6 +147,8 @@ func (checker *nodeTextCleaner) Enter(in Node) (out Node, skipChildren bool) { } case *Join: node.ExplicitParens = false + case *ColumnDef: + node.Tp.CleanElemIsBinaryLit() } return in, false } diff --git a/parser/parser_test.go b/parser/parser_test.go index 3ab44e1a232c0..f97ca11feada5 100644 --- a/parser/parser_test.go +++ b/parser/parser_test.go @@ -6192,6 +6192,8 @@ func (checker *nodeTextCleaner) Enter(in ast.Node) (out ast.Node, skipChildren b node.Specs = specs case *ast.Join: node.ExplicitParens = false + case *ast.ColumnDef: + node.Tp.CleanElemIsBinaryLit() } return in, false } diff --git a/parser/types/field_type.go b/parser/types/field_type.go index 1e9986339339e..9e644f6575dba 100644 --- a/parser/types/field_type.go +++ b/parser/types/field_type.go @@ -203,6 +203,12 @@ func (ft *FieldType) GetElemIsBinaryLit(idx int) bool { return ft.elemsIsBinaryLit[idx] } +func (ft *FieldType) CleanElemIsBinaryLit() { + if ft != nil && ft.elemsIsBinaryLit != nil { + ft.elemsIsBinaryLit = nil + } +} + // Clone returns a copy of itself. func (ft *FieldType) Clone() *FieldType { ret := *ft From d47996a8c0e53d12dca4729c9633c198fb79dc07 Mon Sep 17 00:00:00 2001 From: tangenta Date: Thu, 30 Jun 2022 11:21:24 +0800 Subject: [PATCH 4/6] explain_test: set sql mode to default before test --- cmd/explaintest/r/new_character_set_invalid.result | 1 + cmd/explaintest/t/new_character_set_invalid.test | 1 + 2 files changed, 2 insertions(+) diff --git a/cmd/explaintest/r/new_character_set_invalid.result b/cmd/explaintest/r/new_character_set_invalid.result index 4134b1d6f2dce..c6e29d24eabf1 100644 --- a/cmd/explaintest/r/new_character_set_invalid.result +++ b/cmd/explaintest/r/new_character_set_invalid.result @@ -21,6 +21,7 @@ a b c ? ? ? 中文?中文 asdf?fdsa 字符集?字符集 @@ @@ @@ +set @@sql_mode = default; drop table t; create table t(f set(0xD2BB, 0xC8FD), e enum(0xBAEC,0x6A59)); show create table t; diff --git a/cmd/explaintest/t/new_character_set_invalid.test b/cmd/explaintest/t/new_character_set_invalid.test index e6bb3ce88c75d..ee7546f253a36 100644 --- a/cmd/explaintest/t/new_character_set_invalid.test +++ b/cmd/explaintest/t/new_character_set_invalid.test @@ -16,6 +16,7 @@ insert into t values ('中文À中文', 'asdføfdsa', '字符集😂字符集'); insert into t values (0x4040ffff, 0x4040ffff, 0x4040ffff); select * from t; +set @@sql_mode = default; drop table t; create table t(f set(0xD2BB, 0xC8FD), e enum(0xBAEC,0x6A59)); show create table t; From 01ab3b8cb0c95538eb1afea219b967718a0db193 Mon Sep 17 00:00:00 2001 From: tangenta Date: Thu, 30 Jun 2022 11:29:02 +0800 Subject: [PATCH 5/6] update jsonFieldType --- parser/types/field_type.go | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/parser/types/field_type.go b/parser/types/field_type.go index 9e644f6575dba..429896d6d790e 100644 --- a/parser/types/field_type.go +++ b/parser/types/field_type.go @@ -55,6 +55,7 @@ type FieldType struct { // elems is the element list for enum and set type. elems []string elemsIsBinaryLit []bool + // Please keep in mind that jsonFieldType should be updated if you add a new field here. } // NewFieldType returns a FieldType, @@ -531,13 +532,14 @@ func HasCharset(ft *FieldType) bool { // for json type jsonFieldType struct { - Tp byte - Flag uint - Flen int - Decimal int - Charset string - Collate string - Elems []string + Tp byte + Flag uint + Flen int + Decimal int + Charset string + Collate string + Elems []string + ElemsIsBinaryLit []bool } func (ft *FieldType) UnmarshalJSON(data []byte) error { @@ -551,6 +553,7 @@ func (ft *FieldType) UnmarshalJSON(data []byte) error { ft.charset = r.Charset ft.collate = r.Collate ft.elems = r.Elems + ft.elemsIsBinaryLit = r.ElemsIsBinaryLit } return err } @@ -564,5 +567,6 @@ func (ft *FieldType) MarshalJSON() ([]byte, error) { r.Charset = ft.charset r.Collate = ft.collate r.Elems = ft.elems + r.ElemsIsBinaryLit = ft.elemsIsBinaryLit return json.Marshal(r) } From 934ea38a805889efdde93af546fa7e098d396fa3 Mon Sep 17 00:00:00 2001 From: tangenta Date: Thu, 30 Jun 2022 12:44:46 +0800 Subject: [PATCH 6/6] fix explain test --- cmd/explaintest/r/new_character_set_invalid.result | 1 + cmd/explaintest/t/new_character_set_invalid.test | 1 + 2 files changed, 2 insertions(+) diff --git a/cmd/explaintest/r/new_character_set_invalid.result b/cmd/explaintest/r/new_character_set_invalid.result index c6e29d24eabf1..bb9ae4aa6db2b 100644 --- a/cmd/explaintest/r/new_character_set_invalid.result +++ b/cmd/explaintest/r/new_character_set_invalid.result @@ -52,3 +52,4 @@ Table Create Table t CREATE TABLE `t` ( `e` enum('红','jY') COLLATE gbk_bin DEFAULT NULL ) ENGINE=InnoDB DEFAULT CHARSET=gbk COLLATE=gbk_bin +set @@sql_mode = ''; diff --git a/cmd/explaintest/t/new_character_set_invalid.test b/cmd/explaintest/t/new_character_set_invalid.test index ee7546f253a36..369f72362cfc7 100644 --- a/cmd/explaintest/t/new_character_set_invalid.test +++ b/cmd/explaintest/t/new_character_set_invalid.test @@ -29,3 +29,4 @@ show create table t; drop table t; create table t( e enum(0xBAEC,0x6A59)) collate gbk_bin; show create table t; +set @@sql_mode = '';