Skip to content

Commit

Permalink
d2parser: Autodetect UTF-16 based on BOM
Browse files Browse the repository at this point in the history
Turns out I was wrong; this is safe.
  • Loading branch information
nhooyr committed Aug 2, 2023
1 parent b81da1e commit 6be3db3
Show file tree
Hide file tree
Showing 10 changed files with 49 additions and 86 deletions.
10 changes: 5 additions & 5 deletions d2compiler/compile.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,27 +21,27 @@ import (
)

type CompileOptions struct {
UTF16 bool
UTF16Pos bool
// FS is the file system used for resolving imports in the d2 text.
// It should correspond to the root path.
FS fs.FS
}

func Compile(p string, r io.RuneReader, opts *CompileOptions) (*d2graph.Graph, *d2target.Config, error) {
func Compile(p string, r io.Reader, opts *CompileOptions) (*d2graph.Graph, *d2target.Config, error) {
if opts == nil {
opts = &CompileOptions{}
}

ast, err := d2parser.Parse(p, r, &d2parser.ParseOptions{
UTF16: opts.UTF16,
UTF16Pos: opts.UTF16Pos,
})
if err != nil {
return nil, nil, err
}

ir, err := d2ir.Compile(ast, &d2ir.CompileOptions{
UTF16: opts.UTF16,
FS: opts.FS,
UTF16Pos: opts.UTF16Pos,
FS: opts.FS,
})
if err != nil {
return nil, nil, err
Expand Down
2 changes: 1 addition & 1 deletion d2exporter/export_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,7 @@ func run(t *testing.T, tc testCase) {
ctx = log.Leveled(ctx, slog.LevelDebug)

g, config, err := d2compiler.Compile("", strings.NewReader(tc.dsl), &d2compiler.CompileOptions{
UTF16: true,
UTF16Pos: true,
})
if err != nil {
t.Fatal(err)
Expand Down
6 changes: 3 additions & 3 deletions d2ir/compile.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,13 @@ type compiler struct {
importStack []string
// importCache enables reuse of files imported multiple times.
importCache map[string]*Map
utf16 bool
utf16Pos bool

globStack []bool
}

type CompileOptions struct {
UTF16 bool
UTF16Pos bool
// Pass nil to disable imports.
FS fs.FS
}
Expand All @@ -45,7 +45,7 @@ func Compile(ast *d2ast.Map, opts *CompileOptions) (*Map, error) {
fs: opts.FS,

importCache: make(map[string]*Map),
utf16: opts.UTF16,
utf16Pos: opts.UTF16Pos,
}
m := &Map{}
m.initRoot()
Expand Down
2 changes: 1 addition & 1 deletion d2ir/import.go
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ func (c *compiler) __import(imp *d2ast.Import) (*Map, bool) {
defer f.Close()

ast, err := d2parser.Parse(impPath, f, &d2parser.ParseOptions{
UTF16: c.utf16,
UTF16Pos: c.utf16Pos,
ParseError: c.err,
})
if err != nil {
Expand Down
6 changes: 3 additions & 3 deletions d2lib/d2.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ import (
)

type CompileOptions struct {
UTF16 bool
UTF16Pos bool
FS fs.FS
MeasuredTexts []*d2target.MText
Ruler *textmeasure.Ruler
Expand All @@ -50,8 +50,8 @@ func Compile(ctx context.Context, input string, compileOpts *CompileOptions, ren
}

g, config, err := d2compiler.Compile(compileOpts.InputPath, strings.NewReader(input), &d2compiler.CompileOptions{
UTF16: compileOpts.UTF16,
FS: compileOpts.FS,
UTF16Pos: compileOpts.UTF16Pos,
FS: compileOpts.FS,
})
if err != nil {
return nil, nil, err
Expand Down
53 changes: 31 additions & 22 deletions d2parser/parse.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package d2parser

import (
"bufio"
"bytes"
"fmt"
"io"
"math/big"
Expand All @@ -23,9 +24,6 @@ type ParseOptions struct {
// So you want to read UTF-8 still but adjust the indexes to pretend the input is utf16.
UTF16Pos bool

// UTF16Input makes the parser read the input as UTF16 and also sets UTF16Pos.
UTF16Input bool

ParseError *ParseError
}

Expand All @@ -44,25 +42,29 @@ type ParseOptions struct {
func Parse(path string, r io.Reader, opts *ParseOptions) (*d2ast.Map, error) {
if opts == nil {
opts = &ParseOptions{
UTF16Pos: false,
UTF16Input: false,
UTF16Pos: false,
}
}

p := &parser{
path: path,

utf16Input: opts.UTF16Input,
utf16Pos: opts.UTF16Pos,
err: opts.ParseError,
utf16Pos: opts.UTF16Pos,
err: opts.ParseError,
}
if p.utf16Input {
p.utf16Pos = true
tr := transform.NewReader(r, tunicode.UTF16(tunicode.LittleEndian, tunicode.UseBOM).NewDecoder())
p.reader = bufio.NewReader(tr)
} else {
p.reader = bufio.NewReader(r)
br := bufio.NewReader(r)
p.reader = br

bom, err := br.Peek(2)
if err == nil {
// 0xFFFE is invalid UTF-8 so this is safe.
// Also a different BOM is used for UTF-8.
// See https://unicode.org/faq/utf_bom.html#bom4
if bom[0] == 0xFF && bom[1] == 0xFE {
p.utf16Input(br, r)
}
}

if p.err == nil {
p.err = &ParseError{}
}
Expand All @@ -74,6 +76,17 @@ func Parse(path string, r io.Reader, opts *ParseOptions) (*d2ast.Map, error) {
return m, nil
}

// utf16Input switches the parser over to reading its input as UTF-16.
//
// br is the buffered reader that was used to peek at the start of the input
// and r is the reader underneath it. Bytes that br has already pulled out of r
// are drained and replayed ahead of the remainder of r, so the decoder sees
// the stream from its first byte, BOM included. utf16Pos is forced on as well.
func (p *parser) utf16Input(br *bufio.Reader, r io.Reader) {
	p.utf16Pos = true

	// Everything br.Buffered() reports is already in memory, so this read is
	// not expected to fail. Still, keep only the bytes actually read so that a
	// short read can never inject zero padding into the decoded stream.
	buf := make([]byte, br.Buffered())
	n, _ := io.ReadFull(br, buf)
	buf = buf[:n]

	// Little-endian is the default byte order; UseBOM lets a leading BOM
	// override it.
	mr := io.MultiReader(bytes.NewReader(buf), r)
	tr := transform.NewReader(mr, tunicode.UTF16(tunicode.LittleEndian, tunicode.UseBOM).NewDecoder())
	p.reader = bufio.NewReader(tr)
}

func ParseKey(key string) (*d2ast.KeyPath, error) {
p := &parser{
reader: strings.NewReader(key),
Expand Down Expand Up @@ -131,10 +144,9 @@ func ParseValue(value string) (d2ast.Value, error) {
//
// TODO: ast struct that combines map & errors and pass that around
type parser struct {
path string
pos d2ast.Position
utf16Pos bool
utf16Input bool
path string
pos d2ast.Position
utf16Pos bool

reader io.RuneReader
readerPos d2ast.Position
Expand Down Expand Up @@ -212,10 +224,7 @@ func (p *parser) _readRune() (r rune, eof bool) {

p.readerPos = p.lookaheadPos

r, n, err := p.reader.ReadRune()
if p.utf16Input && n > 0 {
// TODO:
}
r, _, err := p.reader.ReadRune()
if err != nil {
p.ioerr = true
if err != io.EOF {
Expand Down
13 changes: 1 addition & 12 deletions d2parser/parse_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@ import (
type testCase struct {
name string
text string
utf16 bool
assert func(t testing.TB, ast *d2ast.Map, err error)
}

Expand Down Expand Up @@ -395,20 +394,13 @@ c-
},
{
name: "utf16-input",
utf16: true,
text: "\xff\xfex\x00 \x00-\x00>\x00 \x00y\x00\r\x00\n\x00",
assert: func(t testing.TB, ast *d2ast.Map, err error) {
assert.Success(t, err)
t.Logf("%q", d2format.Format(ast))
assert.Equal(t, "x -> y\n", d2format.Format(ast))
},
},
{
name: "errors/utf16-input",
text: "\xff\xfex\x00 \x00-\x00>\x00 \x00y\x00\r\x00\n\x00",
assert: func(t testing.TB, ast *d2ast.Map, err error) {
assert.ErrorString(t, err, `d2/testdata/d2parser/TestParse/errors/utf16-input.d2:1:13: invalid text beginning unquoted key`)
},
},
}

t.Run("import", testImport)
Expand Down Expand Up @@ -510,9 +502,6 @@ func runa(t *testing.T, tca []testCase) {

d2Path := fmt.Sprintf("d2/testdata/d2parser/%v.d2", t.Name())
opts := &d2parser.ParseOptions{}
if tc.utf16 {
opts.UTF16Input = true
}
ast, err := d2parser.Parse(d2Path, strings.NewReader(tc.text), opts)

if tc.assert != nil {
Expand Down
3 changes: 3 additions & 0 deletions d2parser/utf16_gen.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (
"io"
"log"
"os"
"unicode/utf8"

"golang.org/x/text/encoding/unicode"
"golang.org/x/text/transform"
Expand All @@ -27,6 +28,8 @@ func main() {
}

fmt.Printf("%q\n", b.String())
fmt.Println("\xFF\xFE")
fmt.Println(utf8.ValidString("\xFF\xFE"))

err = os.WriteFile("./utf16.d2", b.Bytes(), 0644)
if err != nil {
Expand Down
2 changes: 1 addition & 1 deletion e2etests/e2e_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ func serde(t *testing.T, tc testCase, ruler *textmeasure.Ruler) {
ctx := context.Background()
ctx = log.WithTB(ctx, t, nil)
g, _, err := d2compiler.Compile("", strings.NewReader(tc.script), &d2compiler.CompileOptions{
UTF16: false,
UTF16Pos: false,
})
trequire.Nil(t, err)
if len(g.Objects) > 0 {
Expand Down
38 changes: 0 additions & 38 deletions testdata/d2parser/TestParse/errors/utf16-input.exp.json

This file was deleted.

0 comments on commit 6be3db3

Please sign in to comment.