From 9cfa004f7fc193c183bf80a33cbf02a23e5e0a12 Mon Sep 17 00:00:00 2001 From: Gusted Date: Fri, 28 Jan 2022 02:14:39 +0100 Subject: [PATCH 1/2] Fix non-ASCII search on database - Use `ToASCIIUpper` for issue search on SQLite databases, because SQLite's `UPPER(x)` only transforms ASCII letters. - Resolves #18429 --- models/issue.go | 9 ++++- modules/strings/strings.go | 62 +++++++++++++++++++++++++++++++++ modules/strings/strings_test.go | 47 +++++++++++++++++++++++++ 3 files changed, 117 insertions(+), 1 deletion(-) create mode 100644 modules/strings/strings.go create mode 100644 modules/strings/strings_test.go diff --git a/models/issue.go b/models/issue.go index 3a61b085dc3b2..274edb2d50e11 100644 --- a/models/issue.go +++ b/models/issue.go @@ -23,6 +23,8 @@ import ( "code.gitea.io/gitea/modules/git" "code.gitea.io/gitea/modules/log" "code.gitea.io/gitea/modules/references" + "code.gitea.io/gitea/modules/setting" + mod_strings "code.gitea.io/gitea/modules/strings" api "code.gitea.io/gitea/modules/structs" "code.gitea.io/gitea/modules/timeutil" "code.gitea.io/gitea/modules/util" @@ -1862,7 +1864,12 @@ func GetRepoIssueStats(repoID, uid int64, filterMode int, isPull bool) (numOpen, func SearchIssueIDsByKeyword(ctx context.Context, kw string, repoIDs []int64, limit, start int) (int64, []int64, error) { repoCond := builder.In("repo_id", repoIDs) subQuery := builder.Select("id").From("issue").Where(repoCond) - kw = strings.ToUpper(kw) + // SQLite's UPPER function only transforms ASCII letters. + if setting.Database.UseSQLite3 { + kw = mod_strings.ToASCIIUpper(kw) + } else { + kw = strings.ToUpper(kw) + } cond := builder.And( repoCond, builder.Or( diff --git a/modules/strings/strings.go b/modules/strings/strings.go new file mode 100644 index 0000000000000..3d87630c96ad3 --- /dev/null +++ b/modules/strings/strings.go @@ -0,0 +1,62 @@ +// Copyright 2022 The Gitea Authors. All rights reserved.
+// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. + +package strings + +import ( + "strings" + "unicode/utf8" +) + +// ToASCIIUpper returns s with all ASCII letters mapped to their upper case. +func ToASCIIUpper(s string) string { + isASCII, hasLower := true, false + for i := 0; i < len(s); i++ { + c := s[i] + if c >= utf8.RuneSelf { + isASCII = false + break + } + hasLower = hasLower || ('a' <= c && c <= 'z') + } + + // Optimize for ASCII-only strings. + if isASCII { + if !hasLower { + return s + } + var b strings.Builder + b.Grow(len(s)) + for i := 0; i < len(s); i++ { + c := s[i] + if 'a' <= c && c <= 'z' { + c -= 'a' - 'A' + } + b.WriteByte(c) + } + return b.String() + } + + sBytes := []byte(s) + var b strings.Builder + b.Grow(len(s)) + + for i := 0; i < len(sBytes); { + // Use ut8 because it includes non-ASCII letters. + r, width := utf8.DecodeRune(sBytes[i:]) + i += width + + if r == utf8.RuneError { + // Might change to RUNE_ERROR, which can be tracked down + // via the SQL Logs what's possibly going on. + return s + } + // Only uppercase ASCII. + if 'a' <= r && r <= 'z' { + r -= 'a' - 'A' + } + b.WriteRune(r) + } + return b.String() +} diff --git a/modules/strings/strings_test.go b/modules/strings/strings_test.go new file mode 100644 index 0000000000000..22829b406501e --- /dev/null +++ b/modules/strings/strings_test.go @@ -0,0 +1,47 @@ +// Copyright 2022 The Gitea Authors. All rights reserved. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. + +package strings + +import "testing" + +// Test case for any function which accepts and returns a single string. 
+type StringTest struct { + in, out string +} + +var upperTests = []StringTest{ + {"", ""}, + {"ONLYUPPER", "ONLYUPPER"}, + {"abc", "ABC"}, + {"AbC123", "ABC123"}, + {"azAZ09_", "AZAZ09_"}, + {"longStrinGwitHmixofsmaLLandcAps", "LONGSTRINGWITHMIXOFSMALLANDCAPS"}, + {"long\u0250string\u0250with\u0250nonascii\u2C6Fchars", "LONG\u0250STRING\u0250WITH\u0250NONASCII\u2C6FCHARS"}, + {"\u0250\u0250\u0250\u0250\u0250", "\u0250\u0250\u0250\u0250\u0250"}, + {"a\u0080\U0010FFFF", "A\u0080\U0010FFFF"}, + {"lél", "LéL"}, +} + +func TestToASCIIUpper(t *testing.T) { + for _, tc := range upperTests { + actual := ToASCIIUpper(tc.in) + if actual != tc.out { + t.Errorf("ToASCIIUpper(%q) = %q; want %q", tc.in, actual, tc.out) + } + } +} + +func BenchmarkToUpper(b *testing.B) { + for _, tc := range upperTests { + b.Run(tc.in, func(b *testing.B) { + for i := 0; i < b.N; i++ { + actual := ToASCIIUpper(tc.in) + if actual != tc.out { + b.Errorf("ToUpper(%q) = %q; want %q", tc.in, actual, tc.out) + } + } + }) + } +} From 478e1d3261ce219f6f0ee012055b4e1d3a08f4ef Mon Sep 17 00:00:00 2001 From: Gusted Date: Sat, 29 Jan 2022 18:48:16 +0100 Subject: [PATCH 2/2] Per wxiaoguang --- models/issue.go | 3 +- modules/strings/strings.go | 62 --------------------------------- modules/strings/strings_test.go | 47 ------------------------- modules/util/util.go | 11 ++++++ modules/util/util_test.go | 34 ++++++++++++++++++ 5 files changed, 46 insertions(+), 111 deletions(-) delete mode 100644 modules/strings/strings.go delete mode 100644 modules/strings/strings_test.go diff --git a/models/issue.go b/models/issue.go index 274edb2d50e11..8eb61f20507ec 100644 --- a/models/issue.go +++ b/models/issue.go @@ -24,7 +24,6 @@ import ( "code.gitea.io/gitea/modules/log" "code.gitea.io/gitea/modules/references" "code.gitea.io/gitea/modules/setting" - mod_strings "code.gitea.io/gitea/modules/strings" api "code.gitea.io/gitea/modules/structs" "code.gitea.io/gitea/modules/timeutil" "code.gitea.io/gitea/modules/util" @@ 
-1866,7 +1865,7 @@ func SearchIssueIDsByKeyword(ctx context.Context, kw string, repoIDs []int64, li subQuery := builder.Select("id").From("issue").Where(repoCond) // SQLite's UPPER function only transforms ASCII letters. if setting.Database.UseSQLite3 { - kw = mod_strings.ToASCIIUpper(kw) + kw = util.ToUpperASCII(kw) } else { kw = strings.ToUpper(kw) } diff --git a/modules/strings/strings.go b/modules/strings/strings.go deleted file mode 100644 index 3d87630c96ad3..0000000000000 --- a/modules/strings/strings.go +++ /dev/null @@ -1,62 +0,0 @@ -// Copyright 2022 The Gitea Authors. All rights reserved. -// Use of this source code is governed by a MIT-style -// license that can be found in the LICENSE file. - -package strings - -import ( - "strings" - "unicode/utf8" -) - -// ToASCIIUpper returns s with all ASCII letters mapped to their upper case. -func ToASCIIUpper(s string) string { - isASCII, hasLower := true, false - for i := 0; i < len(s); i++ { - c := s[i] - if c >= utf8.RuneSelf { - isASCII = false - break - } - hasLower = hasLower || ('a' <= c && c <= 'z') - } - - // Optimize for ASCII-only strings. - if isASCII { - if !hasLower { - return s - } - var b strings.Builder - b.Grow(len(s)) - for i := 0; i < len(s); i++ { - c := s[i] - if 'a' <= c && c <= 'z' { - c -= 'a' - 'A' - } - b.WriteByte(c) - } - return b.String() - } - - sBytes := []byte(s) - var b strings.Builder - b.Grow(len(s)) - - for i := 0; i < len(sBytes); { - // Use ut8 because it includes non-ASCII letters. - r, width := utf8.DecodeRune(sBytes[i:]) - i += width - - if r == utf8.RuneError { - // Might change to RUNE_ERROR, which can be tracked down - // via the SQL Logs what's possibly going on. - return s - } - // Only uppercase ASCII. 
- if 'a' <= r && r <= 'z' { - r -= 'a' - 'A' - } - b.WriteRune(r) - } - return b.String() -} diff --git a/modules/strings/strings_test.go b/modules/strings/strings_test.go deleted file mode 100644 index 22829b406501e..0000000000000 --- a/modules/strings/strings_test.go +++ /dev/null @@ -1,47 +0,0 @@ -// Copyright 2022 The Gitea Authors. All rights reserved. -// Use of this source code is governed by a MIT-style -// license that can be found in the LICENSE file. - -package strings - -import "testing" - -// Test case for any function which accepts and returns a single string. -type StringTest struct { - in, out string -} - -var upperTests = []StringTest{ - {"", ""}, - {"ONLYUPPER", "ONLYUPPER"}, - {"abc", "ABC"}, - {"AbC123", "ABC123"}, - {"azAZ09_", "AZAZ09_"}, - {"longStrinGwitHmixofsmaLLandcAps", "LONGSTRINGWITHMIXOFSMALLANDCAPS"}, - {"long\u0250string\u0250with\u0250nonascii\u2C6Fchars", "LONG\u0250STRING\u0250WITH\u0250NONASCII\u2C6FCHARS"}, - {"\u0250\u0250\u0250\u0250\u0250", "\u0250\u0250\u0250\u0250\u0250"}, - {"a\u0080\U0010FFFF", "A\u0080\U0010FFFF"}, - {"lél", "LéL"}, -} - -func TestToASCIIUpper(t *testing.T) { - for _, tc := range upperTests { - actual := ToASCIIUpper(tc.in) - if actual != tc.out { - t.Errorf("ToASCIIUpper(%q) = %q; want %q", tc.in, actual, tc.out) - } - } -} - -func BenchmarkToUpper(b *testing.B) { - for _, tc := range upperTests { - b.Run(tc.in, func(b *testing.B) { - for i := 0; i < b.N; i++ { - actual := ToASCIIUpper(tc.in) - if actual != tc.out { - b.Errorf("ToUpper(%q) = %q; want %q", tc.in, actual, tc.out) - } - } - }) - } -} diff --git a/modules/util/util.go b/modules/util/util.go index 90d0eca15c1e8..af6581f7cdbe3 100644 --- a/modules/util/util.go +++ b/modules/util/util.go @@ -170,3 +170,14 @@ func CryptoRandomBytes(length int64) ([]byte, error) { _, err := rand.Read(buf) return buf, err } + +// ToUpperASCII returns s with all ASCII letters mapped to their upper case. 
+func ToUpperASCII(s string) string { + b := []byte(s) + for i, c := range b { + if 'a' <= c && c <= 'z' { + b[i] -= 'a' - 'A' + } + } + return string(b) +} diff --git a/modules/util/util_test.go b/modules/util/util_test.go index b32cec23d9bcc..0c2792a9cbf3d 100644 --- a/modules/util/util_test.go +++ b/modules/util/util_test.go @@ -186,3 +186,37 @@ func Test_OptionalBool(t *testing.T) { assert.Equal(t, OptionalBoolTrue, OptionalBoolParse("t")) assert.Equal(t, OptionalBoolTrue, OptionalBoolParse("True")) } + +// Test case for any function which accepts and returns a single string. +type StringTest struct { + in, out string +} + +var upperTests = []StringTest{ + {"", ""}, + {"ONLYUPPER", "ONLYUPPER"}, + {"abc", "ABC"}, + {"AbC123", "ABC123"}, + {"azAZ09_", "AZAZ09_"}, + {"longStrinGwitHmixofsmaLLandcAps", "LONGSTRINGWITHMIXOFSMALLANDCAPS"}, + {"long\u0250string\u0250with\u0250nonascii\u2C6Fchars", "LONG\u0250STRING\u0250WITH\u0250NONASCII\u2C6FCHARS"}, + {"\u0250\u0250\u0250\u0250\u0250", "\u0250\u0250\u0250\u0250\u0250"}, + {"a\u0080\U0010FFFF", "A\u0080\U0010FFFF"}, + {"lél", "LéL"}, +} + +func TestToUpperASCII(t *testing.T) { + for _, tc := range upperTests { + assert.Equal(t, ToUpperASCII(tc.in), tc.out) + } +} + +func BenchmarkToUpper(b *testing.B) { + for _, tc := range upperTests { + b.Run(tc.in, func(b *testing.B) { + for i := 0; i < b.N; i++ { + ToUpperASCII(tc.in) + } + }) + } +}