Skip to content

Commit

Permalink
Switch Unicode Escaping to a VSCode-like system (go-gitea#19990)
Browse files Browse the repository at this point in the history
This PR rewrites the invisible unicode detection algorithm to more
closely match that of the Monaco editor on the system. It provides a
technique for detecting ambiguous characters and relaxes the detection
of combining marks.

Control characters are additionally detected as invisible in this
implementation, whereas Monaco does not flag them; this difference is
related to font issues.

Close go-gitea#19913

Signed-off-by: Andrew Thornton <art27@cantab.net>
  • Loading branch information
zeripath authored and Sysoev, Vladimir committed Aug 28, 2022
1 parent a087e3e commit fb549db
Show file tree
Hide file tree
Showing 29 changed files with 2,106 additions and 370 deletions.
54 changes: 54 additions & 0 deletions modules/charset/ambiguous.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
// This file is hand-written; the tables it reads are generated into ambiguous_gen.go by modules/charset/ambiguous/generate.go
// Copyright 2022 The Gitea Authors. All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.

package charset

import (
"sort"
"strings"
"unicode"

"code.gitea.io/gitea/modules/translation"
)

// AmbiguousTablesForLocale returns the ambiguous character tables to use for
// the given locale: first the locale-specific table, then the "_common" table.
//
// The locale-specific table is looked up in AmbiguousCharacters under the
// locale's language tag. When there is no entry, the tag is shortened at its
// last '-' or '_' and the lookup is retried, until the tag is exhausted. If
// no table is found this way the "_default" entry is used instead.
func AmbiguousTablesForLocale(locale translation.Locale) []*AmbiguousTable {
	var localeTable *AmbiguousTable
	for lang := locale.Language(); lang != ""; {
		if found, present := AmbiguousCharacters[lang]; present {
			localeTable = found
			break
		}
		// Drop the most specific component of the tag and try again.
		cut := strings.LastIndexAny(lang, "-_")
		if cut < 0 {
			break
		}
		lang = lang[:cut]
	}
	if localeTable == nil {
		localeTable = AmbiguousCharacters["_default"]
	}

	return []*AmbiguousTable{localeTable, AmbiguousCharacters["_common"]}
}

// isAmbiguous reports whether r is listed as a confusable rune in any of the
// given tables. Tables are consulted in order and the first match wins; on a
// match the rune that r can be confused with is stored in *confusableTo
// (when confusableTo is non-nil).
//
// Nil tables are skipped. A table whose RangeTable claims r but whose
// Confusable/With slices do not actually contain an entry for it is treated
// as not matching, rather than indexing out of range or reporting the partner
// of a different rune.
func isAmbiguous(r rune, confusableTo *rune, tables ...*AmbiguousTable) bool {
	for _, table := range tables {
		if table == nil || table.RangeTable == nil || !unicode.Is(table.RangeTable, r) {
			continue
		}
		// Confusable is sorted, so binary search gives the position of r.
		i := sort.Search(len(table.Confusable), func(i int) bool {
			return table.Confusable[i] >= r
		})
		// sort.Search returns the insertion point, not proof of presence:
		// verify the hit before using it as an index into With.
		if i >= len(table.Confusable) || table.Confusable[i] != r || i >= len(table.With) {
			continue
		}
		if confusableTo != nil {
			*confusableTo = table.With[i]
		}
		return true
	}
	return false
}
1 change: 1 addition & 0 deletions modules/charset/ambiguous/ambiguous.json

Large diffs are not rendered by default.

178 changes: 178 additions & 0 deletions modules/charset/ambiguous/generate.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,178 @@
// Copyright 2022 The Gitea Authors. All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.

package main

import (
"bytes"
"flag"
"fmt"
"go/format"
"os"
"sort"
"text/template"
"unicode"

"code.gitea.io/gitea/modules/json"

"golang.org/x/text/unicode/rangetable"
)

// ambiguous.json provides a one to one mapping of ambiguous characters to other characters
// See https://github.com/hediet/vscode-unicode-data/blob/main/out/ambiguous.json

// AmbiguousTable is the generator's in-memory form of one locale entry from
// ambiguous.json, and the value the template renders for each table.
// Confusable and With are parallel slices: main fills them pair by pair after
// sorting the pairs by their Confusable rune, so With[i] is the partner of
// Confusable[i]. Locale is the JSON key the entry was read from, and
// RangeTable is built from the Confusable runes.
type AmbiguousTable struct {
Confusable []rune
With []rune
Locale string
RangeTable *unicode.RangeTable
}

// RunePair is one (confusable, with) mapping taken from two consecutive
// values of a locale's list in ambiguous.json. main uses it as a temporary so
// the pairs can be sorted by Confusable before being split into the parallel
// slices of an AmbiguousTable.
type RunePair struct {
Confusable rune
With rune
}

// verbose is bound to the -v command line flag in main; verbosef only logs
// when it is true.
var verbose bool

func main() {
flag.Usage = func() {
fmt.Fprintf(os.Stderr, `%s: Generate AmbiguousCharacter
Usage: %[1]s [-v] [-o output.go] ambiguous.json
`, os.Args[0])
flag.PrintDefaults()
}

output := ""
flag.BoolVar(&verbose, "v", false, "verbose output")
flag.StringVar(&output, "o", "ambiguous_gen.go", "file to output to")
flag.Parse()
input := flag.Arg(0)
if input == "" {
input = "ambiguous.json"
}

bs, err := os.ReadFile(input)
if err != nil {
fatalf("Unable to read: %s Err: %v", input, err)
}

var unwrapped string
if err := json.Unmarshal(bs, &unwrapped); err != nil {
fatalf("Unable to unwrap content in: %s Err: %v", input, err)
}

fromJSON := map[string][]uint32{}
if err := json.Unmarshal([]byte(unwrapped), &fromJSON); err != nil {
fatalf("Unable to unmarshal content in: %s Err: %v", input, err)
}

tables := make([]*AmbiguousTable, 0, len(fromJSON))
for locale, chars := range fromJSON {
table := &AmbiguousTable{Locale: locale}
table.Confusable = make([]rune, 0, len(chars)/2)
table.With = make([]rune, 0, len(chars)/2)
pairs := make([]RunePair, len(chars)/2)
for i := 0; i < len(chars); i += 2 {
pairs[i/2].Confusable, pairs[i/2].With = rune(chars[i]), rune(chars[i+1])
}
sort.Slice(pairs, func(i, j int) bool {
return pairs[i].Confusable < pairs[j].Confusable
})
for _, pair := range pairs {
table.Confusable = append(table.Confusable, pair.Confusable)
table.With = append(table.With, pair.With)
}
table.RangeTable = rangetable.New(table.Confusable...)
tables = append(tables, table)
}
sort.Slice(tables, func(i, j int) bool {
return tables[i].Locale < tables[j].Locale
})
data := map[string]interface{}{
"Tables": tables,
}

if err := runTemplate(generatorTemplate, output, &data); err != nil {
fatalf("Unable to run template: %v", err)
}
}

// runTemplate executes t with data, formats the result as Go source and
// writes it to filename, creating the file or truncating an existing one.
//
// If the rendered text is not valid Go the unformatted text is logged through
// verbosef and an error is returned without touching filename. Errors from
// writing and from closing the file are both reported, since a failed close
// can mean the generated source never reached the disk.
func runTemplate(t *template.Template, filename string, data interface{}) error {
	buf := bytes.NewBuffer(nil)
	if err := t.Execute(buf, data); err != nil {
		return fmt.Errorf("unable to execute template: %w", err)
	}
	bs, err := format.Source(buf.Bytes())
	if err != nil {
		verbosef("Bad source:\n%s", buf.String())
		return fmt.Errorf("unable to format source: %w", err)
	}
	file, err := os.Create(filename)
	if err != nil {
		return fmt.Errorf("failed to create file %s because %w", filename, err)
	}
	if _, err := file.Write(bs); err != nil {
		// The write error is the one worth reporting; a close error here
		// would only be a consequence of it.
		_ = file.Close()
		return fmt.Errorf("unable to write generated source: %w", err)
	}
	if err := file.Close(); err != nil {
		return fmt.Errorf("unable to close file %s because %w", filename, err)
	}
	return nil
}

// generatorTemplate renders the generated Go file for package charset: the
// AmbiguousTable type declaration followed by the AmbiguousCharacters map,
// with one entry per element of the "Tables" value passed in by main. Each
// entry spells out the Confusable and With runes and the R16/R32 ranges and
// LatinOffset of the table's RangeTable as literals. The template output is
// not formatted; runTemplate passes it through format.Source before writing.
var generatorTemplate = template.Must(template.New("ambiguousTemplate").Parse(`// This file is generated by modules/charset/ambiguous/generate.go DO NOT EDIT
// Copyright 2022 The Gitea Authors. All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package charset
import "unicode"
// This file is generated from https://github.com/hediet/vscode-unicode-data/blob/main/out/ambiguous.json
// AmbiguousTable matches a confusable rune with its partner for the Locale
type AmbiguousTable struct {
Confusable []rune
With []rune
Locale string
RangeTable *unicode.RangeTable
}
// AmbiguousCharacters provides a map by locale name to the confusable characters in that locale
var AmbiguousCharacters = map[string]*AmbiguousTable{
{{range .Tables}}{{printf "%q:" .Locale}} {
Confusable: []rune{ {{range .Confusable}}{{.}},{{end}} },
With: []rune{ {{range .With}}{{.}},{{end}} },
Locale: {{printf "%q" .Locale}},
RangeTable: &unicode.RangeTable{
R16: []unicode.Range16{
{{range .RangeTable.R16 }} {Lo:{{.Lo}}, Hi:{{.Hi}}, Stride: {{.Stride}}},
{{end}} },
R32: []unicode.Range32{
{{range .RangeTable.R32}} {Lo:{{.Lo}}, Hi:{{.Hi}}, Stride: {{.Stride}}},
{{end}} },
LatinOffset: {{.RangeTable.LatinOffset}},
},
},
{{end}}
}
`))

// logf formats a message printf-style, appends a newline and writes it to
// standard error. Write errors are ignored.
func logf(format string, args ...interface{}) {
	msg := fmt.Sprintf(format+"\n", args...)
	_, _ = os.Stderr.WriteString(msg)
}

// verbosef behaves like logf but stays silent unless verbose output was
// requested.
func verbosef(format string, args ...interface{}) {
	if !verbose {
		return
	}
	logf(format, args...)
}

// fatalf logs the formatted message prefixed with "fatal: " to standard
// error and terminates the program with exit status 1.
func fatalf(format string, args ...interface{}) {
	// logf already terminates the line, so no newline is added here.
	logf("fatal: "+format, args...)
	os.Exit(1)
}
837 changes: 837 additions & 0 deletions modules/charset/ambiguous_gen.go

Large diffs are not rendered by default.

32 changes: 32 additions & 0 deletions modules/charset/ambiguous_gen_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
// Copyright 2022 The Gitea Authors. All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.

package charset

import (
"sort"
"testing"
"unicode"

"github.com/stretchr/testify/assert"
)

// TestAmbiguousCharacters checks the invariants of every generated table:
// the map key equals the table's Locale, Confusable and With have the same
// length, Confusable is sorted, and every confusable rune is both matched by
// the RangeTable and found again by a binary search over Confusable.
func TestAmbiguousCharacters(t *testing.T) {
	for locale, table := range AmbiguousCharacters {
		confusables := table.Confusable

		assert.Equal(t, locale, table.Locale)
		assert.Equal(t, len(confusables), len(table.With))

		sorted := sort.SliceIsSorted(confusables, func(a, b int) bool {
			return confusables[a] < confusables[b]
		})
		assert.True(t, sorted)

		for _, r := range confusables {
			assert.True(t, unicode.Is(table.RangeTable, r))

			pos := sort.Search(len(confusables), func(k int) bool {
				return confusables[k] >= r
			})
			assert.True(t, pos < len(confusables) && confusables[pos] == r, "%c is not in %d", r, pos)
		}
	}
}
44 changes: 44 additions & 0 deletions modules/charset/breakwriter.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
// Copyright 2022 The Gitea Authors. All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.

package charset

import (
"bytes"
"io"
)

// BreakWriter is an io.Writer wrapper that emits "<br>" in place of every
// '\n' byte written through it; all other bytes are passed on unchanged.
type BreakWriter struct {
	io.Writer
}

// Write forwards bs to the wrapped writer, substituting "<br>" for each '\n'.
//
// The returned count is expressed in bytes of bs, not in bytes sent to the
// wrapped writer: every replaced newline counts as one byte. On error the
// count covers what had been consumed before the failing write, plus whatever
// the wrapped writer reported for that write when it was a data segment.
func (b *BreakWriter) Write(bs []byte) (n int, err error) {
	rest := bs
	for len(rest) > 0 {
		nl := bytes.IndexByte(rest, '\n')
		if nl < 0 {
			// No further newline: the tail goes out in one piece.
			written, werr := b.Writer.Write(rest)
			return n + written, werr
		}

		// Text in front of the newline, if there is any.
		if nl > 0 {
			written, werr := b.Writer.Write(rest[:nl])
			n += written
			if werr != nil {
				return n, werr
			}
		}

		// The newline itself is replaced by the break tag.
		if _, err = b.Writer.Write([]byte("<br>")); err != nil {
			return n, err
		}
		n++

		rest = rest[nl+1:]
	}

	return n, nil
}
69 changes: 69 additions & 0 deletions modules/charset/breakwriter_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
// Copyright 2022 The Gitea Authors. All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.

package charset

import (
"strings"
"testing"
)

// TestBreakWriter_Write feeds a set of inputs with newlines in different
// positions through a BreakWriter and checks the error state, the reported
// byte count (which must equal the input length) and the text that reached
// the underlying writer.
func TestBreakWriter_Write(t *testing.T) {
	type testCase struct {
		name    string
		kase    string
		expect  string
		wantErr bool
	}

	cases := []testCase{
		{name: "noline", kase: "abcdefghijklmnopqrstuvwxyz", expect: "abcdefghijklmnopqrstuvwxyz"},
		{name: "endline", kase: "abcdefghijklmnopqrstuvwxyz\n", expect: "abcdefghijklmnopqrstuvwxyz<br>"},
		{name: "startline", kase: "\nabcdefghijklmnopqrstuvwxyz", expect: "<br>abcdefghijklmnopqrstuvwxyz"},
		{name: "onlyline", kase: "\n\n\n", expect: "<br><br><br>"},
		{name: "empty", kase: "", expect: ""},
		{name: "midline", kase: "\nabc\ndefghijkl\nmnopqrstuvwxy\nz", expect: "<br>abc<br>defghijkl<br>mnopqrstuvwxy<br>z"},
	}

	for i := range cases {
		tc := cases[i]
		t.Run(tc.name, func(t *testing.T) {
			sink := &strings.Builder{}
			w := &BreakWriter{Writer: sink}

			written, err := w.Write([]byte(tc.kase))
			if gotErr := err != nil; gotErr != tc.wantErr {
				t.Errorf("BreakWriter.Write() error = %v, wantErr %v", err, tc.wantErr)
				return
			}
			if want := len(tc.kase); written != want {
				t.Errorf("BreakWriter.Write() = %v, want %v", written, want)
			}
			if got := sink.String(); got != tc.expect {
				t.Errorf("BreakWriter.Write() wrote %q, want %v", got, tc.expect)
			}
		})
	}
}
Loading

0 comments on commit fb549db

Please sign in to comment.