-
-
Notifications
You must be signed in to change notification settings - Fork 5.5k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Switch Unicode Escaping to a VSCode-like system (#19990)
This PR rewrites the invisible Unicode detection algorithm to more closely match that of the Monaco editor on the system. It provides a technique for detecting ambiguous characters and relaxes the detection of combining marks. In addition, control characters are detected as invisible in this implementation, whereas they are not in Monaco; this difference is related to font issues. Close #19913 Signed-off-by: Andrew Thornton <art27@cantab.net>
- Loading branch information
Showing
29 changed files
with
2,106 additions
and
370 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
// This file is generated by modules/charset/ambiguous/generate.go DO NOT EDIT | ||
// Copyright 2022 The Gitea Authors. All rights reserved. | ||
// Use of this source code is governed by a MIT-style | ||
// license that can be found in the LICENSE file. | ||
|
||
package charset | ||
|
||
import ( | ||
"sort" | ||
"strings" | ||
"unicode" | ||
|
||
"code.gitea.io/gitea/modules/translation" | ||
) | ||
|
||
// AmbiguousTablesForLocale provides the table of ambiguous characters for this locale. | ||
func AmbiguousTablesForLocale(locale translation.Locale) []*AmbiguousTable { | ||
key := locale.Language() | ||
var table *AmbiguousTable | ||
var ok bool | ||
for len(key) > 0 { | ||
if table, ok = AmbiguousCharacters[key]; ok { | ||
break | ||
} | ||
idx := strings.LastIndexAny(key, "-_") | ||
if idx < 0 { | ||
key = "" | ||
} else { | ||
key = key[:idx] | ||
} | ||
} | ||
if table == nil { | ||
table = AmbiguousCharacters["_default"] | ||
} | ||
|
||
return []*AmbiguousTable{ | ||
table, | ||
AmbiguousCharacters["_common"], | ||
} | ||
} | ||
|
||
func isAmbiguous(r rune, confusableTo *rune, tables ...*AmbiguousTable) bool { | ||
for _, table := range tables { | ||
if !unicode.Is(table.RangeTable, r) { | ||
continue | ||
} | ||
i := sort.Search(len(table.Confusable), func(i int) bool { | ||
return table.Confusable[i] >= r | ||
}) | ||
(*confusableTo) = table.With[i] | ||
return true | ||
} | ||
return false | ||
} |
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,178 @@ | ||
// Copyright 2022 The Gitea Authors. All rights reserved. | ||
// Use of this source code is governed by a MIT-style | ||
// license that can be found in the LICENSE file. | ||
|
||
package main | ||
|
||
import ( | ||
"bytes" | ||
"flag" | ||
"fmt" | ||
"go/format" | ||
"os" | ||
"sort" | ||
"text/template" | ||
"unicode" | ||
|
||
"code.gitea.io/gitea/modules/json" | ||
|
||
"golang.org/x/text/unicode/rangetable" | ||
) | ||
|
||
// ambiguous.json provides a one to one mapping of ambiguous characters to other characters | ||
// See https://github.com/hediet/vscode-unicode-data/blob/main/out/ambiguous.json | ||
|
||
// AmbiguousTable is the generator-side model of one locale's table of ambiguous characters. | ||
// It has the same fields as the struct declared inside generatorTemplate. | ||
type AmbiguousTable struct { | ||
// Confusable holds the ambiguous runes; main appends them in ascending order. | ||
Confusable []rune | ||
// With holds, at the same index, the rune that the Confusable entry is paired with. | ||
With []rune | ||
// Locale is the key this table had in the decoded JSON input. | ||
Locale string | ||
// RangeTable is built from Confusable with rangetable.New. | ||
RangeTable *unicode.RangeTable | ||
} | ||
|
||
// RunePair couples one ambiguous rune with the rune it is paired with, so that both can be sorted together. | ||
type RunePair struct { | ||
// Confusable is the ambiguous rune; pairs are sorted by this field. | ||
Confusable rune | ||
// With is the rune stored alongside Confusable in the input. | ||
With rune | ||
} | ||
|
||
// verbose is set by the -v flag and gates the output of verbosef. | ||
var verbose bool | ||
|
||
func main() { | ||
flag.Usage = func() { | ||
fmt.Fprintf(os.Stderr, `%s: Generate AmbiguousCharacter | ||
Usage: %[1]s [-v] [-o output.go] ambiguous.json | ||
`, os.Args[0]) | ||
flag.PrintDefaults() | ||
} | ||
|
||
output := "" | ||
flag.BoolVar(&verbose, "v", false, "verbose output") | ||
flag.StringVar(&output, "o", "ambiguous_gen.go", "file to output to") | ||
flag.Parse() | ||
input := flag.Arg(0) | ||
if input == "" { | ||
input = "ambiguous.json" | ||
} | ||
|
||
bs, err := os.ReadFile(input) | ||
if err != nil { | ||
fatalf("Unable to read: %s Err: %v", input, err) | ||
} | ||
|
||
var unwrapped string | ||
if err := json.Unmarshal(bs, &unwrapped); err != nil { | ||
fatalf("Unable to unwrap content in: %s Err: %v", input, err) | ||
} | ||
|
||
fromJSON := map[string][]uint32{} | ||
if err := json.Unmarshal([]byte(unwrapped), &fromJSON); err != nil { | ||
fatalf("Unable to unmarshal content in: %s Err: %v", input, err) | ||
} | ||
|
||
tables := make([]*AmbiguousTable, 0, len(fromJSON)) | ||
for locale, chars := range fromJSON { | ||
table := &AmbiguousTable{Locale: locale} | ||
table.Confusable = make([]rune, 0, len(chars)/2) | ||
table.With = make([]rune, 0, len(chars)/2) | ||
pairs := make([]RunePair, len(chars)/2) | ||
for i := 0; i < len(chars); i += 2 { | ||
pairs[i/2].Confusable, pairs[i/2].With = rune(chars[i]), rune(chars[i+1]) | ||
} | ||
sort.Slice(pairs, func(i, j int) bool { | ||
return pairs[i].Confusable < pairs[j].Confusable | ||
}) | ||
for _, pair := range pairs { | ||
table.Confusable = append(table.Confusable, pair.Confusable) | ||
table.With = append(table.With, pair.With) | ||
} | ||
table.RangeTable = rangetable.New(table.Confusable...) | ||
tables = append(tables, table) | ||
} | ||
sort.Slice(tables, func(i, j int) bool { | ||
return tables[i].Locale < tables[j].Locale | ||
}) | ||
data := map[string]interface{}{ | ||
"Tables": tables, | ||
} | ||
|
||
if err := runTemplate(generatorTemplate, output, &data); err != nil { | ||
fatalf("Unable to run template: %v", err) | ||
} | ||
} | ||
|
||
func runTemplate(t *template.Template, filename string, data interface{}) error { | ||
buf := bytes.NewBuffer(nil) | ||
if err := t.Execute(buf, data); err != nil { | ||
return fmt.Errorf("unable to execute template: %w", err) | ||
} | ||
bs, err := format.Source(buf.Bytes()) | ||
if err != nil { | ||
verbosef("Bad source:\n%s", buf.String()) | ||
return fmt.Errorf("unable to format source: %w", err) | ||
} | ||
file, err := os.Create(filename) | ||
if err != nil { | ||
return fmt.Errorf("failed to create file %s because %w", filename, err) | ||
} | ||
defer file.Close() | ||
_, err = file.Write(bs) | ||
if err != nil { | ||
return fmt.Errorf("unable to write generated source: %w", err) | ||
} | ||
return nil | ||
} | ||
|
||
// generatorTemplate is the text/template that main renders into the generated source file. | ||
// It emits the AmbiguousTable type declaration and the AmbiguousCharacters map, with one | ||
// map entry per element of .Tables. template.Must panics if the template text fails to parse. | ||
var generatorTemplate = template.Must(template.New("ambiguousTemplate").Parse(`// This file is generated by modules/charset/ambiguous/generate.go DO NOT EDIT | ||
// Copyright 2022 The Gitea Authors. All rights reserved. | ||
// Use of this source code is governed by a MIT-style | ||
// license that can be found in the LICENSE file. | ||
package charset | ||
import "unicode" | ||
// This file is generated from https://github.com/hediet/vscode-unicode-data/blob/main/out/ambiguous.json | ||
// AmbiguousTable matches a confusable rune with its partner for the Locale | ||
type AmbiguousTable struct { | ||
Confusable []rune | ||
With []rune | ||
Locale string | ||
RangeTable *unicode.RangeTable | ||
} | ||
// AmbiguousCharacters provides a map by locale name to the confusable characters in that locale | ||
var AmbiguousCharacters = map[string]*AmbiguousTable{ | ||
{{range .Tables}}{{printf "%q:" .Locale}} { | ||
Confusable: []rune{ {{range .Confusable}}{{.}},{{end}} }, | ||
With: []rune{ {{range .With}}{{.}},{{end}} }, | ||
Locale: {{printf "%q" .Locale}}, | ||
RangeTable: &unicode.RangeTable{ | ||
R16: []unicode.Range16{ | ||
{{range .RangeTable.R16 }} {Lo:{{.Lo}}, Hi:{{.Hi}}, Stride: {{.Stride}}}, | ||
{{end}} }, | ||
R32: []unicode.Range32{ | ||
{{range .RangeTable.R32}} {Lo:{{.Lo}}, Hi:{{.Hi}}, Stride: {{.Stride}}}, | ||
{{end}} }, | ||
LatinOffset: {{.RangeTable.LatinOffset}}, | ||
}, | ||
}, | ||
{{end}} | ||
} | ||
`)) | ||
|
||
func logf(format string, args ...interface{}) { | ||
fmt.Fprintf(os.Stderr, format+"\n", args...) | ||
} | ||
|
||
func verbosef(format string, args ...interface{}) { | ||
if verbose { | ||
logf(format, args...) | ||
} | ||
} | ||
|
||
func fatalf(format string, args ...interface{}) { | ||
logf("fatal: "+format+"\n", args...) | ||
os.Exit(1) | ||
} |
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
// Copyright 2022 The Gitea Authors. All rights reserved. | ||
// Use of this source code is governed by a MIT-style | ||
// license that can be found in the LICENSE file. | ||
|
||
package charset | ||
|
||
import ( | ||
"sort" | ||
"testing" | ||
"unicode" | ||
|
||
"github.com/stretchr/testify/assert" | ||
) | ||
|
||
func TestAmbiguousCharacters(t *testing.T) { | ||
for locale, ambiguous := range AmbiguousCharacters { | ||
assert.Equal(t, locale, ambiguous.Locale) | ||
assert.Equal(t, len(ambiguous.Confusable), len(ambiguous.With)) | ||
assert.True(t, sort.SliceIsSorted(ambiguous.Confusable, func(i, j int) bool { | ||
return ambiguous.Confusable[i] < ambiguous.Confusable[j] | ||
})) | ||
|
||
for _, confusable := range ambiguous.Confusable { | ||
assert.True(t, unicode.Is(ambiguous.RangeTable, confusable)) | ||
i := sort.Search(len(ambiguous.Confusable), func(j int) bool { | ||
return ambiguous.Confusable[j] >= confusable | ||
}) | ||
found := i < len(ambiguous.Confusable) && ambiguous.Confusable[i] == confusable | ||
assert.True(t, found, "%c is not in %d", confusable, i) | ||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
// Copyright 2022 The Gitea Authors. All rights reserved. | ||
// Use of this source code is governed by a MIT-style | ||
// license that can be found in the LICENSE file. | ||
|
||
package charset | ||
|
||
import ( | ||
"bytes" | ||
"io" | ||
) | ||
|
||
// BreakWriter wraps an io.Writer to always write '\n' as '<br>' | ||
type BreakWriter struct { | ||
io.Writer | ||
} | ||
|
||
// Write writes the provided byte slice transparently replacing '\n' with '<br>' | ||
func (b *BreakWriter) Write(bs []byte) (n int, err error) { | ||
pos := 0 | ||
for pos < len(bs) { | ||
idx := bytes.IndexByte(bs[pos:], '\n') | ||
if idx < 0 { | ||
wn, err := b.Writer.Write(bs[pos:]) | ||
return n + wn, err | ||
} | ||
|
||
if idx > 0 { | ||
wn, err := b.Writer.Write(bs[pos : pos+idx]) | ||
n += wn | ||
if err != nil { | ||
return n, err | ||
} | ||
} | ||
|
||
if _, err = b.Writer.Write([]byte("<br>")); err != nil { | ||
return n, err | ||
} | ||
pos += idx + 1 | ||
|
||
n++ | ||
} | ||
|
||
return n, err | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
// Copyright 2022 The Gitea Authors. All rights reserved. | ||
// Use of this source code is governed by a MIT-style | ||
// license that can be found in the LICENSE file. | ||
|
||
package charset | ||
|
||
import ( | ||
"strings" | ||
"testing" | ||
) | ||
|
||
func TestBreakWriter_Write(t *testing.T) { | ||
tests := []struct { | ||
name string | ||
kase string | ||
expect string | ||
wantErr bool | ||
}{ | ||
{ | ||
name: "noline", | ||
kase: "abcdefghijklmnopqrstuvwxyz", | ||
expect: "abcdefghijklmnopqrstuvwxyz", | ||
}, | ||
{ | ||
name: "endline", | ||
kase: "abcdefghijklmnopqrstuvwxyz\n", | ||
expect: "abcdefghijklmnopqrstuvwxyz<br>", | ||
}, | ||
{ | ||
name: "startline", | ||
kase: "\nabcdefghijklmnopqrstuvwxyz", | ||
expect: "<br>abcdefghijklmnopqrstuvwxyz", | ||
}, | ||
{ | ||
name: "onlyline", | ||
kase: "\n\n\n", | ||
expect: "<br><br><br>", | ||
}, | ||
{ | ||
name: "empty", | ||
kase: "", | ||
expect: "", | ||
}, | ||
{ | ||
name: "midline", | ||
kase: "\nabc\ndefghijkl\nmnopqrstuvwxy\nz", | ||
expect: "<br>abc<br>defghijkl<br>mnopqrstuvwxy<br>z", | ||
}, | ||
} | ||
for _, tt := range tests { | ||
t.Run(tt.name, func(t *testing.T) { | ||
buf := &strings.Builder{} | ||
b := &BreakWriter{ | ||
Writer: buf, | ||
} | ||
n, err := b.Write([]byte(tt.kase)) | ||
if (err != nil) != tt.wantErr { | ||
t.Errorf("BreakWriter.Write() error = %v, wantErr %v", err, tt.wantErr) | ||
return | ||
} | ||
if n != len(tt.kase) { | ||
t.Errorf("BreakWriter.Write() = %v, want %v", n, len(tt.kase)) | ||
} | ||
if buf.String() != tt.expect { | ||
t.Errorf("BreakWriter.Write() wrote %q, want %v", buf.String(), tt.expect) | ||
} | ||
}) | ||
} | ||
} |
Oops, something went wrong.