diff --git a/ddl/serial_test.go b/ddl/serial_test.go index c54db5cf44dfa..698dbdff179d6 100644 --- a/ddl/serial_test.go +++ b/ddl/serial_test.go @@ -1285,6 +1285,7 @@ func (s *testSerialSuite) TestModifyingColumn4NewCollations(c *C) { tk.MustExec("alter table t collate utf8mb4_general_ci") tk.MustExec("alter table t charset utf8mb4 collate utf8mb4_bin") tk.MustExec("alter table t charset utf8mb4 collate utf8mb4_unicode_ci") + tk.MustExec("alter table t charset utf8mb4 collate utf8mb4_zh_pinyin_tidb_as_cs") // Change the default collation of database is allowed. tk.MustExec("alter database dct charset utf8mb4 collate utf8mb4_general_ci") } diff --git a/executor/seqtest/seq_executor_test.go b/executor/seqtest/seq_executor_test.go index 80a106173d4fa..e5dee5a93620f 100644 --- a/executor/seqtest/seq_executor_test.go +++ b/executor/seqtest/seq_executor_test.go @@ -1240,6 +1240,7 @@ func (s *seqTestSuite) TestShowForNewCollations(c *C) { "utf8mb4_bin utf8mb4 46 Yes Yes 1", "utf8mb4_general_ci utf8mb4 45 Yes 1", "utf8mb4_unicode_ci utf8mb4 224 Yes 1", + "utf8mb4_zh_pinyin_tidb_as_cs utf8mb4 2048 Yes 1", ) tk.MustQuery("show collation").Check(expectRows) tk.MustQuery("select * from information_schema.COLLATIONS").Check(expectRows) diff --git a/expression/distsql_builtin_test.go b/expression/distsql_builtin_test.go index a438ea80a65d7..20ff67e5792f5 100644 --- a/expression/distsql_builtin_test.go +++ b/expression/distsql_builtin_test.go @@ -56,6 +56,7 @@ func (s *testEvalSerialSuite) TestPBToExprWithNewCollation(c *C) { {"some_error_collation", "utf8mb4_bin", 46, 46}, {"utf8_unicode_ci", "utf8_unicode_ci", 192, 192}, {"utf8mb4_unicode_ci", "utf8mb4_unicode_ci", 224, 224}, + {"utf8mb4_zh_pinyin_tidb_as_cs", "utf8mb4_zh_pinyin_tidb_as_cs", 2048, 2048}, } for _, cs := range cases { diff --git a/expression/expr_to_pb.go b/expression/expr_to_pb.go index b932bb04f24d2..223f71df311e8 100644 --- a/expression/expr_to_pb.go +++ b/expression/expr_to_pb.go @@ -17,6 +17,7 @@ import ( 
"github.com/gogo/protobuf/proto" "github.com/pingcap/errors" "github.com/pingcap/failpoint" + "github.com/pingcap/parser/charset" "github.com/pingcap/parser/mysql" "github.com/pingcap/tidb/kv" "github.com/pingcap/tidb/sessionctx/stmtctx" @@ -174,8 +175,8 @@ func FieldTypeFromPB(ft *tipb.FieldType) *types.FieldType { } func collationToProto(c string) int32 { - if v, ok := mysql.CollationNames[c]; ok { - return collate.RewriteNewCollationIDIfNeeded(int32(v)) + if coll, err := charset.GetCollationByName(c); err == nil { + return collate.RewriteNewCollationIDIfNeeded(int32(coll.ID)) } v := collate.RewriteNewCollationIDIfNeeded(int32(mysql.DefaultCollationID)) logutil.BgLogger().Warn( @@ -188,9 +189,9 @@ func collationToProto(c string) int32 { } func protoToCollation(c int32) string { - v, ok := mysql.Collations[uint8(collate.RestoreCollationIDIfNeeded(c))] - if ok { - return v + coll, err := charset.GetCollationByID(int(collate.RestoreCollationIDIfNeeded(c))) + if err == nil { + return coll.Name } logutil.BgLogger().Warn( "Unable to get collation name from ID, use name of the default collation instead", diff --git a/expression/expr_to_pb_test.go b/expression/expr_to_pb_test.go index 4daeb88e0939a..bbfe7ee6853d6 100644 --- a/expression/expr_to_pb_test.go +++ b/expression/expr_to_pb_test.go @@ -789,6 +789,7 @@ func (s *testEvaluatorSerialSuites) TestNewCollationsEnabled(c *C) { colExprs = append(colExprs, columnCollation(dg.genColumn(mysql.TypeString, 4), "utf8mb4_0900_ai_ci")) colExprs = append(colExprs, columnCollation(dg.genColumn(mysql.TypeVarchar, 5), "utf8_bin")) colExprs = append(colExprs, columnCollation(dg.genColumn(mysql.TypeVarchar, 6), "utf8_unicode_ci")) + colExprs = append(colExprs, columnCollation(dg.genColumn(mysql.TypeVarchar, 7), "utf8mb4_zh_pinyin_tidb_as_cs")) pushed, _ := PushDownExprs(sc, colExprs, client, kv.UnSpecified) c.Assert(len(pushed), Equals, len(colExprs)) pbExprs, err := ExpressionsToPBList(sc, colExprs, client) @@ -800,6 +801,7 @@ func 
(s *testEvaluatorSerialSuites) TestNewCollationsEnabled(c *C) { "{\"tp\":201,\"val\":\"gAAAAAAAAAQ=\",\"sig\":0,\"field_type\":{\"tp\":254,\"flag\":0,\"flen\":-1,\"decimal\":-1,\"collate\":-255,\"charset\":\"\"}}", "{\"tp\":201,\"val\":\"gAAAAAAAAAU=\",\"sig\":0,\"field_type\":{\"tp\":15,\"flag\":0,\"flen\":-1,\"decimal\":-1,\"collate\":-83,\"charset\":\"\"}}", "{\"tp\":201,\"val\":\"gAAAAAAAAAY=\",\"sig\":0,\"field_type\":{\"tp\":15,\"flag\":0,\"flen\":-1,\"decimal\":-1,\"collate\":-192,\"charset\":\"\"}}", + "{\"tp\":201,\"val\":\"gAAAAAAAAAc=\",\"sig\":0,\"field_type\":{\"tp\":15,\"flag\":0,\"flen\":-1,\"decimal\":-1,\"collate\":-2048,\"charset\":\"\"}}", } for i, pbExpr := range pbExprs { c.Assert(pbExprs, NotNil) diff --git a/go.mod b/go.mod index cccb22660fdce..44d669a1aa456 100644 --- a/go.mod +++ b/go.mod @@ -1,7 +1,6 @@ module github.com/pingcap/tidb require ( - cloud.google.com/go v0.51.0 // indirect github.com/BurntSushi/toml v0.3.1 github.com/HdrHistogram/hdrhistogram-go v0.9.0 // indirect github.com/Jeffail/gabs/v2 v2.5.1 diff --git a/go.sum b/go.sum index d359eea653113..cf2cb1bc3386f 100644 --- a/go.sum +++ b/go.sum @@ -6,8 +6,6 @@ cloud.google.com/go v0.44.2/go.mod h1:60680Gw3Yr4ikxnPRS/oxxkBccT6SA1yMk63TGekxK cloud.google.com/go v0.45.1/go.mod h1:RpBamKRgapWJb87xiFSdk4g1CME7QZg3uwTez+TSTjc= cloud.google.com/go v0.46.3/go.mod h1:a6bKKbmY7er1mI7TEI4lsAkts/mkhTSZK8w33B4RAg0= cloud.google.com/go v0.50.0/go.mod h1:r9sluTvynVuxRIOHXQEHMFffphuXHOMZMycpNR5e6To= -cloud.google.com/go v0.51.0 h1:PvKAVQWCtlGUSlZkGW3QLelKaWq7KYv/MW1EboG8bfM= -cloud.google.com/go v0.51.0/go.mod h1:hWtGJ6gnXH+KgDv+V0zFGDvpi07n3z8ZNj3T1RW0Gcw= cloud.google.com/go/bigquery v1.0.1/go.mod h1:i/xbL2UlR5RvWAURpBYZTtm/cXjCha9lbfbpx4poX+o= cloud.google.com/go/bigquery v1.3.0 h1:sAbMqjY1PEQKZBWfbu6Y6bsupJ9c4QdHnzg/VvYTLcE= cloud.google.com/go/bigquery v1.3.0/go.mod h1:PjpwJnslEMmckchkHFfq+HTD2DmtT67aNFKH1/VBDHE= @@ -217,7 +215,6 @@ github.com/google/martian v2.1.0+incompatible 
h1:/CP5g8u/VJHijgedC/Legn3BAbAaWPg github.com/google/martian v2.1.0+incompatible/go.mod h1:9I4somxYTbIHy5NJKHRl3wXiIaQGbYVAs8BPL6v8lEs= github.com/google/pprof v0.0.0-20181206194817-3ea8567a2e57/go.mod h1:zfwlbNMJ+OItoe0UupaVj+oy1omPYYDuagoSzA8v9mc= github.com/google/pprof v0.0.0-20190515194954-54271f7e092f/go.mod h1:zfwlbNMJ+OItoe0UupaVj+oy1omPYYDuagoSzA8v9mc= -github.com/google/pprof v0.0.0-20191218002539-d4f498aebedc/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM= github.com/google/pprof v0.0.0-20200407044318-7d83b28da2e9 h1:K+lX49/3eURCE1IjlaZN//u6c+9nfDAMnyQ9E2dsJbY= github.com/google/pprof v0.0.0-20200407044318-7d83b28da2e9/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM= github.com/google/renameio v0.1.0/go.mod h1:KWCgfxg9yswjAJkECMjeO8J8rahYeXnNhOm40UhjYkI= diff --git a/store/mockstore/mocktikv/cop_handler_dag.go b/store/mockstore/mocktikv/cop_handler_dag.go index 8c8874fab2094..d020d058467ee 100644 --- a/store/mockstore/mocktikv/cop_handler_dag.go +++ b/store/mockstore/mocktikv/cop_handler_dag.go @@ -925,6 +925,6 @@ func fieldTypeFromPBColumn(col *tipb.ColumnInfo) *types.FieldType { Flen: int(col.GetColumnLen()), Decimal: int(col.GetDecimal()), Elems: col.Elems, - Collate: mysql.Collations[uint8(collate.RestoreCollationIDIfNeeded(col.GetCollation()))], + Collate: collate.CollationID2Name(collate.RestoreCollationIDIfNeeded(col.GetCollation())), } } diff --git a/store/mockstore/unistore/cophandler/cop_handler.go b/store/mockstore/unistore/cophandler/cop_handler.go index 89f169d6008ce..f9fc3045d3c00 100644 --- a/store/mockstore/unistore/cophandler/cop_handler.go +++ b/store/mockstore/unistore/cophandler/cop_handler.go @@ -419,7 +419,7 @@ func fieldTypeFromPBColumn(col *tipb.ColumnInfo) *types.FieldType { Flen: int(col.GetColumnLen()), Decimal: int(col.GetDecimal()), Elems: col.Elems, - Collate: mysql.Collations[uint8(collate.RestoreCollationIDIfNeeded(col.GetCollation()))], + Collate: 
collate.CollationID2Name(collate.RestoreCollationIDIfNeeded(col.GetCollation())), } } diff --git a/util/collate/collate.go b/util/collate/collate.go index 54d8f5f44e953..1cdf88529e147 100644 --- a/util/collate/collate.go +++ b/util/collate/collate.go @@ -44,9 +44,17 @@ var ( ErrIllegalMix3Collation = dbterror.ClassExpression.NewStd(mysql.ErrCantAggregate3collations) ) -// DefaultLen is set for datum if the string datum don't know its length. const ( + // DefaultLen is set for datum if the string datum don't know its length. DefaultLen = 0 + // first byte of a 2-byte encoding starts 110 and carries 5 bits of data + b2Mask = 0x1F // 0001 1111 + // first byte of a 3-byte encoding starts 1110 and carries 4 bits of data + b3Mask = 0x0F // 0000 1111 + // first byte of a 4-byte encoding starts 11110 and carries 3 bits of data + b4Mask = 0x07 // 0000 0111 + // non-first bytes start 10 and carry 6 bits of data + mbMask = 0x3F // 0011 1111 ) // Collator provides functionality for comparing strings for a given @@ -164,8 +172,8 @@ func GetCollatorByID(id int) Collator { // CollationID2Name return the collation name by the given id. // If the id is not found in the map, the default collation is returned. func CollationID2Name(id int32) string { - name, ok := mysql.Collations[uint8(id)] - if !ok { + collation, err := charset.GetCollationByID(int(id)) + if err != nil { // TODO(bb7133): fix repeating logs when the following code is uncommented. //logutil.BgLogger().Warn( // "Unable to get collation name from ID, use default collation instead.", @@ -173,7 +181,16 @@ func CollationID2Name(id int32) string { // zap.Stack("stack")) return mysql.DefaultCollationName } - return name + return collation.Name +} + +// CollationName2ID return the collation id by the given name. 
+// If the name is not found, the default collation ID is returned. +func CollationName2ID(name string) int { + if coll, err := charset.GetCollationByName(name); err == nil { + return coll.ID + } + return mysql.DefaultCollationID } // GetCollationByName wraps charset.GetCollationByName, it checks the collation. @@ -221,6 +238,40 @@ func truncateTailingSpace(str string) string { return str } +func sign(i int) int { + if i < 0 { + return -1 + } else if i > 0 { + return 1 + } + return 0 +} + +// decodeRune decodes by hand the UTF-8 sequence starting at s[si] and returns the rune and the index just past it; the input is neither validated nor bounds-checked. +func decodeRune(s string, si int) (r rune, newIndex int) { + switch b := s[si]; { + case b < 0x80: + r = rune(b) + newIndex = si + 1 + case b < 0xE0: + r = rune(b&b2Mask)<<6 | + rune(s[1+si]&mbMask) + newIndex = si + 2 + case b < 0xF0: + r = rune(b&b3Mask)<<12 | + rune(s[si+1]&mbMask)<<6 | + rune(s[si+2]&mbMask) + newIndex = si + 3 + default: + r = rune(b&b4Mask)<<18 | + rune(s[si+1]&mbMask)<<12 | + rune(s[si+2]&mbMask)<<6 | + rune(s[si+3]&mbMask) + newIndex = si + 4 + } + return +} + // IsCICollation returns if the collation is case-sensitive func IsCICollation(collate string) bool { return collate == "utf8_general_ci" || collate == "utf8mb4_general_ci" || @@ -232,21 +283,23 @@ func init() { newCollatorIDMap = make(map[int]Collator) newCollatorMap["binary"] = &binCollator{} - newCollatorIDMap[int(mysql.CollationNames["binary"])] = &binCollator{} + newCollatorIDMap[CollationName2ID("binary")] = &binCollator{} newCollatorMap["ascii_bin"] = &binPaddingCollator{} - newCollatorIDMap[int(mysql.CollationNames["ascii_bin"])] = &binPaddingCollator{} + newCollatorIDMap[CollationName2ID("ascii_bin")] = &binPaddingCollator{} newCollatorMap["latin1_bin"] = &binPaddingCollator{} - newCollatorIDMap[int(mysql.CollationNames["latin1_bin"])] = &binPaddingCollator{} + newCollatorIDMap[CollationName2ID("latin1_bin")] = &binPaddingCollator{} newCollatorMap["utf8mb4_bin"] = &binPaddingCollator{} - newCollatorIDMap[int(mysql.CollationNames["utf8mb4_bin"])] = 
&binPaddingCollator{} + newCollatorIDMap[CollationName2ID("utf8mb4_bin")] = &binPaddingCollator{} newCollatorMap["utf8_bin"] = &binPaddingCollator{} - newCollatorIDMap[int(mysql.CollationNames["utf8_bin"])] = &binPaddingCollator{} + newCollatorIDMap[CollationName2ID("utf8_bin")] = &binPaddingCollator{} newCollatorMap["utf8mb4_general_ci"] = &generalCICollator{} - newCollatorIDMap[int(mysql.CollationNames["utf8mb4_general_ci"])] = &generalCICollator{} + newCollatorIDMap[CollationName2ID("utf8mb4_general_ci")] = &generalCICollator{} newCollatorMap["utf8_general_ci"] = &generalCICollator{} - newCollatorIDMap[int(mysql.CollationNames["utf8_general_ci"])] = &generalCICollator{} + newCollatorIDMap[CollationName2ID("utf8_general_ci")] = &generalCICollator{} newCollatorMap["utf8mb4_unicode_ci"] = &unicodeCICollator{} - newCollatorIDMap[int(mysql.CollationNames["utf8mb4_unicode_ci"])] = &unicodeCICollator{} + newCollatorIDMap[CollationName2ID("utf8mb4_unicode_ci")] = &unicodeCICollator{} newCollatorMap["utf8_unicode_ci"] = &unicodeCICollator{} - newCollatorIDMap[int(mysql.CollationNames["utf8_unicode_ci"])] = &unicodeCICollator{} + newCollatorIDMap[CollationName2ID("utf8_unicode_ci")] = &unicodeCICollator{} + newCollatorMap["utf8mb4_zh_pinyin_tidb_as_cs"] = &zhPinyinTiDBASCSCollator{} + newCollatorIDMap[CollationName2ID("utf8mb4_zh_pinyin_tidb_as_cs")] = &zhPinyinTiDBASCSCollator{} } diff --git a/util/collate/collate_test.go b/util/collate/collate_test.go index 07981c4bfa3d6..1da50af005ce8 100644 --- a/util/collate/collate_test.go +++ b/util/collate/collate_test.go @@ -198,6 +198,7 @@ func (s *testCollateSuite) TestGetCollator(c *C) { c.Assert(GetCollator("utf8_general_ci"), FitsTypeOf, &generalCICollator{}) c.Assert(GetCollator("utf8mb4_unicode_ci"), FitsTypeOf, &unicodeCICollator{}) c.Assert(GetCollator("utf8_unicode_ci"), FitsTypeOf, &unicodeCICollator{}) + c.Assert(GetCollator("utf8mb4_zh_pinyin_tidb_as_cs"), FitsTypeOf, &zhPinyinTiDBASCSCollator{}) 
c.Assert(GetCollator("default_test"), FitsTypeOf, &binPaddingCollator{}) c.Assert(GetCollatorByID(63), FitsTypeOf, &binCollator{}) c.Assert(GetCollatorByID(46), FitsTypeOf, &binPaddingCollator{}) @@ -206,6 +207,7 @@ func (s *testCollateSuite) TestGetCollator(c *C) { c.Assert(GetCollatorByID(33), FitsTypeOf, &generalCICollator{}) c.Assert(GetCollatorByID(224), FitsTypeOf, &unicodeCICollator{}) c.Assert(GetCollatorByID(192), FitsTypeOf, &unicodeCICollator{}) + c.Assert(GetCollatorByID(2048), FitsTypeOf, &zhPinyinTiDBASCSCollator{}) c.Assert(GetCollatorByID(9999), FitsTypeOf, &binPaddingCollator{}) SetNewCollationEnabledForTest(false) @@ -216,6 +218,7 @@ func (s *testCollateSuite) TestGetCollator(c *C) { c.Assert(GetCollator("utf8_general_ci"), FitsTypeOf, &binCollator{}) c.Assert(GetCollator("utf8mb4_unicode_ci"), FitsTypeOf, &binCollator{}) c.Assert(GetCollator("utf8_unicode_ci"), FitsTypeOf, &binCollator{}) + c.Assert(GetCollator("utf8mb4_zh_pinyin_tidb_as_cs"), FitsTypeOf, &binCollator{}) c.Assert(GetCollator("default_test"), FitsTypeOf, &binCollator{}) c.Assert(GetCollatorByID(63), FitsTypeOf, &binCollator{}) c.Assert(GetCollatorByID(46), FitsTypeOf, &binCollator{}) @@ -224,5 +227,6 @@ func (s *testCollateSuite) TestGetCollator(c *C) { c.Assert(GetCollatorByID(33), FitsTypeOf, &binCollator{}) c.Assert(GetCollatorByID(224), FitsTypeOf, &binCollator{}) c.Assert(GetCollatorByID(192), FitsTypeOf, &binCollator{}) + c.Assert(GetCollatorByID(2048), FitsTypeOf, &binCollator{}) c.Assert(GetCollatorByID(9999), FitsTypeOf, &binCollator{}) } diff --git a/util/collate/general_ci.go b/util/collate/general_ci.go index da60ef5f8b2fc..77f7204c5bba3 100644 --- a/util/collate/general_ci.go +++ b/util/collate/general_ci.go @@ -20,15 +20,6 @@ import ( type generalCICollator struct { } -func sign(i int) int { - if i < 0 { - return -1 - } else if i > 0 { - return 1 - } - return 0 -} - // compilePatternGeneralCI handles escapes and wild cards, generate pattern weights and types. 
// This function is modified from stringutil.CompilePattern. func compilePatternGeneralCI(pattern string, escape byte) (patWeights []uint16, patTypes []byte) { diff --git a/util/collate/pinyin_tidb_as_cs.go b/util/collate/pinyin_tidb_as_cs.go new file mode 100644 index 0000000000000..565680e2cff56 --- /dev/null +++ b/util/collate/pinyin_tidb_as_cs.go @@ -0,0 +1,33 @@ +// Copyright 2020 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// See the License for the specific language governing permissions and +// limitations under the License. + +package collate + +// zhPinyinTiDBASCSCollator is the collator for utf8mb4_zh_pinyin_tidb_as_cs. +type zhPinyinTiDBASCSCollator struct { +} + +// Compare implements the Collator interface. It is not implemented yet and panics. +func (py *zhPinyinTiDBASCSCollator) Compare(a, b string) int { + panic("implement me") +} + +// Key implements the Collator interface. It is not implemented yet and panics. +func (py *zhPinyinTiDBASCSCollator) Key(str string) []byte { + panic("implement me") +} + +// Pattern implements the Collator interface. It is not implemented yet and panics. 
+func (py *zhPinyinTiDBASCSCollator) Pattern() WildcardPattern { + panic("implement me") +} diff --git a/util/collate/unicode_ci.go b/util/collate/unicode_ci.go index fd3d57799d9ee..d03b169fee9e9 100644 --- a/util/collate/unicode_ci.go +++ b/util/collate/unicode_ci.go @@ -20,44 +20,8 @@ import ( const ( // magic number indicate weight has 2 uint64, should get from `longRuneMap` longRune uint64 = 0xFFFD - // first byte of a 2-byte encoding starts 110 and carries 5 bits of data - b2Mask = 0x1F // 0001 1111 - - // first byte of a 3-byte encoding starts 1110 and carries 4 bits of data - b3Mask = 0x0F // 0000 1111 - - // first byte of a 4-byte encoding starts 11110 and carries 3 bits of data - b4Mask = 0x07 // 0000 0111 - - // non-first bytes start 10 and carry 6 bits of data - mbMask = 0x3F // 0011 1111 ) -// decode rune by hand -func decodeRune(s string, si int) (r rune, newIndex int) { - switch b := s[si]; { - case b < 0x80: - r = rune(b) - newIndex = si + 1 - case b < 0xE0: - r = rune(b&b2Mask)<<6 | - rune(s[1+si]&mbMask) - newIndex = si + 2 - case b < 0xF0: - r = rune(b&b3Mask)<<12 | - rune(s[si+1]&mbMask)<<6 | - rune(s[si+2]&mbMask) - newIndex = si + 3 - default: - r = rune(b&b4Mask)<<18 | - rune(s[si+1]&mbMask)<<12 | - rune(s[si+2]&mbMask)<<6 | - rune(s[si+3]&mbMask) - newIndex = si + 4 - } - return -} - // unicodeCICollator implements UCA. see http://unicode.org/reports/tr10/ type unicodeCICollator struct { }