Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

icuregex: Update to ICU 73 #13912

Merged
merged 2 commits into from
Sep 4, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions go/mysql/icuregex/compiler.go
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ func (c *compiler) nextChar(ch *reChar) {
//
// We are in free-spacing and comments mode.
// Scan through any white space and comments, until we
// reach a significant character or the end of inut.
// reach a significant character or the end of input.
for {
if ch.char == -1 {
break // End of Input
Expand Down Expand Up @@ -2049,7 +2049,7 @@ func (c *compiler) matchStartType() {
currentLen = safeIncrement(currentLen, 1)
atStart = false

case urxBackslashX, // Grahpeme Cluster. Minimum is 1, max unbounded.
case urxBackslashX, // Grapheme Cluster. Minimum is 1, max unbounded.
urxDotanyAll, // . matches one or two.
urxDotany,
urxDotanyUnix:
Expand Down Expand Up @@ -2893,7 +2893,7 @@ func (c *compiler) minMatchLength(start, end int) int32 {
urxBackslashR,
urxBackslashV,
urcOnecharI,
urxBackslashX, // Grahpeme Cluster. Minimum is 1, max unbounded.
urxBackslashX, // Grapheme Cluster. Minimum is 1, max unbounded.
urxDotanyAll, // . matches one or two.
urxDotany,
urxDotanyUnix:
Expand Down Expand Up @@ -2983,7 +2983,7 @@ func (c *compiler) minMatchLength(start, end int) int32 {
loc++
op = c.out.compiledPat[loc]
if op.typ() == urxLaStart {
// The boilerplate for look-ahead includes two LA_END insturctions,
// The boilerplate for look-ahead includes two LA_END instructions,
// Depth will be decremented by each one when it is seen.
depth += 2
}
Expand Down Expand Up @@ -3086,7 +3086,7 @@ func (c *compiler) maxMatchLength(start, end int) int32 {
// Call the max length unbounded, and stop further checking.
case urxBackref, // BackRef. Must assume that it might be a zero length match
urxBackrefI,
urxBackslashX: // Grahpeme Cluster. Minimum is 1, max unbounded.
urxBackslashX: // Grapheme Cluster. Minimum is 1, max unbounded.
currentLen = math.MaxInt32

// Ops that match a max of one character (possibly two 16 bit code units.)
Expand Down
7 changes: 5 additions & 2 deletions go/mysql/icuregex/error.go
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,8 @@ func (e *MatchError) Error() string {
out.WriteString("Stack overflow")
case TimeOut:
out.WriteString("Timeout")
case InternalMatchError:
out.WriteString("Internal error")
}

input := e.Input
Expand Down Expand Up @@ -144,6 +146,7 @@ const (
type MatchErrorCode int32

const (
StackOverflow MatchErrorCode = iota /**< Regular expression backtrack stack overflow. */
TimeOut /**< Maximum allowed match time exceeded */
StackOverflow MatchErrorCode = iota /**< Regular expression backtrack stack overflow. */
TimeOut /**< Maximum allowed match time exceeded */
InternalMatchError /**< Internal error (bug) was detected. */
)
17 changes: 5 additions & 12 deletions go/mysql/icuregex/icu_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -181,9 +181,7 @@ func (tp *TestPattern) parseMatch(orig string) error {

func ParseTestFile(t testing.TB, filename string) []TestPattern {
f, err := os.Open(filename)
if err != nil {
t.Fatalf("failed to open test data: %v", err)
}
require.NoError(t, err)

defer f.Close()
scanner := bufio.NewScanner(f)
Expand Down Expand Up @@ -229,9 +227,8 @@ func ParseTestFile(t testing.TB, filename string) []TestPattern {
patterns = append(patterns, tp)
}

if err := scanner.Err(); err != nil {
t.Fatal(err)
}
err = scanner.Err()
require.NoError(t, err)
return patterns
}

Expand Down Expand Up @@ -394,9 +391,7 @@ func TestCornerCases(t *testing.T) {
for _, tc := range cases {
t.Run(tc.Pattern, func(t *testing.T) {
_, err := icuregex.CompileString(tc.Pattern, tc.Flags)
if err != nil {
t.Fatal(err)
}
require.NoError(t, err)
})
}
}
Expand All @@ -407,9 +402,7 @@ func TestOne(t *testing.T) {
const Flags = 0

re, err := icuregex.CompileString(Pattern, Flags)
if err != nil {
t.Fatalf("compilation failed: %v", err)
}
require.NoError(t, err)

re.Dump(os.Stderr)

Expand Down
5 changes: 5 additions & 0 deletions go/mysql/icuregex/internal/icudata/embed.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,11 @@ var UBidi []byte
//go:embed ucase.icu
var UCase []byte

// UEmoji holds the raw contents of uemoji.icu, the ICU data file for the
// Emoji properties. The file is embedded into the binary at build time.
//
//go:embed uemoji.icu
var UEmoji []byte

// ULayout is used for property checks against the InPC, InSC
// and VO properties.
//
Expand Down
Binary file modified go/mysql/icuregex/internal/icudata/nfc.nrm
Binary file not shown.
Binary file modified go/mysql/icuregex/internal/icudata/nfkc.nrm
Binary file not shown.
Binary file modified go/mysql/icuregex/internal/icudata/nfkc_cf.nrm
Binary file not shown.
Binary file modified go/mysql/icuregex/internal/icudata/pnames.icu
Binary file not shown.
Binary file modified go/mysql/icuregex/internal/icudata/ubidi.icu
Binary file not shown.
Binary file modified go/mysql/icuregex/internal/icudata/ucase.icu
Binary file not shown.
Binary file added go/mysql/icuregex/internal/icudata/uemoji.icu
Binary file not shown.
Binary file modified go/mysql/icuregex/internal/icudata/ulayout.icu
Binary file not shown.
Binary file modified go/mysql/icuregex/internal/icudata/unames.icu
Binary file not shown.
Binary file modified go/mysql/icuregex/internal/icudata/uprops.icu
Binary file not shown.
9 changes: 0 additions & 9 deletions go/mysql/icuregex/internal/ucase/loader.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,6 @@ var ucaseOnce sync.Once
var ucase struct {
trie *utrie.UTrie2
exceptions []uint16
unfold []uint16
}

func trie() *utrie.UTrie2 {
Expand All @@ -47,11 +46,6 @@ func exceptions() []uint16 {
return ucase.exceptions
}

func unfold() []uint16 {
loadUCase()
return ucase.unfold
}

func loadUCase() {
ucaseOnce.Do(func() {
b := udata.NewBytes(icudata.UCase)
Expand Down Expand Up @@ -102,9 +96,6 @@ func readData(bytes *udata.Bytes) error {
if n := indexes[ixExcLength]; n > 0 {
ucase.exceptions = bytes.Uint16Slice(n)
}
if n := indexes[ixUnfoldLength]; n > 0 {
ucase.unfold = bytes.Uint16Slice(n)
}

return nil
}
69 changes: 69 additions & 0 deletions go/mysql/icuregex/internal/uemoji/loader.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
/*
© 2016 and later: Unicode, Inc. and others.
Copyright (C) 2004-2015, International Business Machines Corporation and others.
Copyright 2023 The Vitess Authors.

This file contains code derived from the Unicode Project's ICU library.
License & terms of use for the original code: http://www.unicode.org/copyright.html

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package uemoji
dbussink marked this conversation as resolved.
Show resolved Hide resolved

import (
"sync"

"vitess.io/vitess/go/mysql/icuregex/internal/icudata"
"vitess.io/vitess/go/mysql/icuregex/internal/udata"
"vitess.io/vitess/go/mysql/icuregex/internal/utrie"
)

// Lazily initialised emoji property data, parsed from the embedded
// icudata.UEmoji blob on first use.
var (
	// uemojiOnce guards the one-time load performed by loadUEmoji.
	uemojiOnce sync.Once

	// uemoji holds the parsed data; trie is populated by readData.
	uemoji struct {
		trie *utrie.UcpTrie
	}
)

// loadUEmoji parses the embedded icudata.UEmoji blob into the package-level
// uemoji state. The parse runs at most once, guarded by uemojiOnce. A parse
// failure panics.
func loadUEmoji() {
	uemojiOnce.Do(func() {
		err := readData(udata.NewBytes(icudata.UEmoji))
		if err != nil {
			panic(err)
		}
	})
}

// trie returns the emoji property code point trie, triggering the one-time
// load of the embedded data first if it has not happened yet.
func trie() *utrie.UcpTrie {
	loadUEmoji()
	loaded := uemoji.trie
	return loaded
}

// readData parses the emoji data held in bytes and stores the resulting
// code point trie in uemoji.trie.
//
// The header is accepted only when the data format is the four bytes "Emoj"
// and the first format version component is 1. Any error from reading the
// header or from decoding the trie is returned unchanged.
func readData(bytes *udata.Bytes) error {
	acceptable := func(info *udata.DataInfo) bool {
		return info.DataFormat[0] == 'E' &&
			info.DataFormat[1] == 'm' &&
			info.DataFormat[2] == 'o' &&
			info.DataFormat[3] == 'j' &&
			info.FormatVersion[0] == 1
	}
	err := bytes.ReadHeader(acceptable)
	if err != nil {
		return err
	}

	// The first int32 after the header tells us how far to skip to reach the
	// trie; 4 is subtracted for the int32 that was just read.
	// NOTE(review): presumably this value is the trie's byte offset measured
	// from the start of the index table — confirm against the ICU emoji
	// data format.
	trieOffset := bytes.Int32()
	bytes.Skip(trieOffset - 4)

	uemoji.trie, err = utrie.UcpTrieFromBytes(bytes)
	return err
}
82 changes: 82 additions & 0 deletions go/mysql/icuregex/internal/uemoji/uemoji.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
/*
© 2016 and later: Unicode, Inc. and others.
Copyright (C) 2004-2015, International Business Machines Corporation and others.
Copyright 2023 The Vitess Authors.

This file contains code derived from the Unicode Project's ICU library.
License & terms of use for the original code: http://www.unicode.org/copyright.html

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package uemoji

import (
"vitess.io/vitess/go/mysql/icuregex/internal/utrie"
)

// propertySet is the minimal set-building interface that AddPropertyStarts
// writes into.
type propertySet interface {
	// AddRune adds the single code point ch.
	AddRune(ch rune)
	// AddRuneRange adds the range of code points delimited by from and to.
	AddRuneRange(from, to rune)
}

// AddPropertyStarts adds to sa the first code point of every run of
// consecutive code points that share the same value in the emoji trie.
func AddPropertyStarts(sa propertySet) {
	t := trie()

	// Walk the trie range by range, beginning at code point 0. GetRange
	// reports the last code point of the range that contains next; a
	// negative result ends the walk.
	next := rune(0)
	for {
		last, _ := t.GetRange(next, utrie.UcpMapRangeNormal, 0, nil)
		if last < 0 {
			return
		}
		sa.AddRune(next)
		next = last + 1
	}
}

// Bit positions of the individual emoji properties inside the value that the
// emoji trie stores for each code point. The values run from 0 to 6 in
// declaration order, so the order of this list is significant.
const (
	bitEmoji = iota
	bitEmojiPresentation
	bitEmojiModifier
	bitEmojiModifierBase
	bitEmojiComponent
	bitExtendedPictographic
	bitBasicEmoji
)

// bitFlags maps the property index passed to HasBinaryProperty to the bit
// position of that property in the trie value, or to -1 when the property is
// not answered from the trie.
//
// NOTE(review): the indices look like offsets from the first emoji binary
// property, which would line index 7 up with UCharExtendedPictographic (64)
// and index 14 with UCharRgiEmoji (71) — confirm against the caller.
//
// Note: REGIONAL_INDICATOR is a single, hardcoded range implemented elsewhere.
var bitFlags = []int8{
	0:  bitEmoji,
	1:  bitEmojiPresentation,
	2:  bitEmojiModifier,
	3:  bitEmojiModifierBase,
	4:  bitEmojiComponent,
	5:  -1,
	6:  -1,
	7:  bitExtendedPictographic,
	8:  bitBasicEmoji,
	9:  -1,
	10: -1,
	11: -1,
	12: -1,
	13: -1,
	14: bitBasicEmoji,
}

// HasBinaryProperty reports whether code point c has the emoji binary
// property selected by which.
//
// which is an index into bitFlags, not a raw property constant. It returns
// false when which lies outside the table, or when the table marks the
// property as unsupported (-1). Otherwise the answer is the corresponding
// bit of the trie value stored for c.
func HasBinaryProperty(c rune, which int) bool {
	// Reject indices outside the table instead of panicking on the lookup.
	if which < 0 || which >= len(bitFlags) {
		return false
	}
	bit := bitFlags[which]
	if bit < 0 {
		return false // not a property that we support in this function
	}
	bits := trie().Get(c)
	return (bits>>bit)&1 != 0
}
53 changes: 52 additions & 1 deletion go/mysql/icuregex/internal/uprops/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -315,6 +315,56 @@ const (
*/
UCharExtendedPictographic Property = 64

/**
* Binary property of strings Basic_Emoji.
* See https://www.unicode.org/reports/tr51/#Emoji_Sets
*
* @stable ICU 70
*/
UCharBasicEmoji Property = 65
/**
* Binary property of strings Emoji_Keycap_Sequence.
* See https://www.unicode.org/reports/tr51/#Emoji_Sets
*
* @stable ICU 70
*/
UCharEmojiKeycapSequence Property = 66
/**
* Binary property of strings RGI_Emoji_Modifier_Sequence.
* See https://www.unicode.org/reports/tr51/#Emoji_Sets
*
* @stable ICU 70
*/
UCharRgiEmojiModifierSequence Property = 67
/**
* Binary property of strings RGI_Emoji_Flag_Sequence.
* See https://www.unicode.org/reports/tr51/#Emoji_Sets
*
* @stable ICU 70
*/
UCharRgiEmojiFlagSequence Property = 68
/**
* Binary property of strings RGI_Emoji_Tag_Sequence.
* See https://www.unicode.org/reports/tr51/#Emoji_Sets
*
* @stable ICU 70
*/
UCharRgiEmojiTagSequence Property = 69
/**
* Binary property of strings RGI_Emoji_ZWJ_Sequence.
* See https://www.unicode.org/reports/tr51/#Emoji_Sets
*
* @stable ICU 70
*/
UCharRgiEmojiZwjSequence Property = 70
/**
* Binary property of strings RGI_Emoji.
* See https://www.unicode.org/reports/tr51/#Emoji_Sets
*
* @stable ICU 70
*/
UCharRgiEmoji Property = 71

/** Enumerated property Bidi_Class.
Same as u_charDirection, returns UCharDirection values. @stable ICU 2.2 */
UCharBidiClass Property = 0x1000
Expand Down Expand Up @@ -492,7 +542,7 @@ const (
)

const (
uCharBinaryLimit = 65
uCharBinaryLimit = 72
uCharIntLimit = 0x1019
uCharMaskLimit = 0x2001
uCharStringLimit = 0x400E
Expand Down Expand Up @@ -595,6 +645,7 @@ const (
srcInpc
srcInsc
srcVo
srcEmoji
)

const (
Expand Down
3 changes: 3 additions & 0 deletions go/mysql/icuregex/internal/uprops/properties.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ import (
"vitess.io/vitess/go/mysql/icuregex/internal/ubidi"
"vitess.io/vitess/go/mysql/icuregex/internal/ucase"
"vitess.io/vitess/go/mysql/icuregex/internal/uchar"
"vitess.io/vitess/go/mysql/icuregex/internal/uemoji"
"vitess.io/vitess/go/mysql/icuregex/internal/ulayout"
"vitess.io/vitess/go/mysql/icuregex/internal/unames"
"vitess.io/vitess/go/mysql/icuregex/internal/uset"
Expand Down Expand Up @@ -74,6 +75,8 @@ func getInclusionsForSource(src propertySource) (*uset.UnicodeSet, error) {
ubidi.AddPropertyStarts(u)
case srcInpc, srcInsc, srcVo:
AddULayoutPropertyStarts(src, u)
case srcEmoji:
uemoji.AddPropertyStarts(u)
default:
return nil, errors.ErrUnsupported
}
Expand Down
Loading