diff --git a/_testdata/bom.proto b/_testdata/bom.proto new file mode 100644 index 0000000..c5c9d1d --- /dev/null +++ b/_testdata/bom.proto @@ -0,0 +1 @@ +syntax = "proto3"; diff --git a/lexer/scanner/token.go b/lexer/scanner/token.go index ced2abc..63e4758 100644 --- a/lexer/scanner/token.go +++ b/lexer/scanner/token.go @@ -37,6 +37,7 @@ const ( TCOMMA // , TDOT // . TMINUS // - + TBOM // Byte Order Mark // Keywords TSYNTAX @@ -64,22 +65,23 @@ const ( func asMiscToken(ch rune) Token { m := map[rune]Token{ - ';': TSEMICOLON, - ':': TCOLON, - '=': TEQUALS, - '"': TQUOTE, - '\'': TQUOTE, - '(': TLEFTPAREN, - ')': TRIGHTPAREN, - '{': TLEFTCURLY, - '}': TRIGHTCURLY, - '[': TLEFTSQUARE, - ']': TRIGHTSQUARE, - '<': TLESS, - '>': TGREATER, - ',': TCOMMA, - '.': TDOT, - '-': TMINUS, + ';': TSEMICOLON, + ':': TCOLON, + '=': TEQUALS, + '"': TQUOTE, + '\'': TQUOTE, + '(': TLEFTPAREN, + ')': TRIGHTPAREN, + '{': TLEFTCURLY, + '}': TRIGHTCURLY, + '[': TLEFTSQUARE, + ']': TRIGHTSQUARE, + '<': TLESS, + '>': TGREATER, + ',': TCOMMA, + '.': TDOT, + '-': TMINUS, + '\uFEFF': TBOM, } if t, ok := m[ch]; ok { return t diff --git a/parser/proto.go b/parser/proto.go index 9f83ad0..4dcd3db 100644 --- a/parser/proto.go +++ b/parser/proto.go @@ -33,6 +33,8 @@ func (p *Proto) Accept(v Visitor) { // // See https://developers.google.com/protocol-buffers/docs/reference/proto3-spec#proto_file func (p *Parser) ParseProto() (*Proto, error) { + p.parseBOM() + syntaxComments := p.ParseComments() syntax, err := p.ParseSyntax() if err != nil { @@ -55,6 +57,15 @@ func (p *Parser) ParseProto() (*Proto, error) { }, nil } +// See https://protobuf.com/docs/language-spec#source-code-representation +func (p *Parser) parseBOM() { + p.lex.Next() + if p.lex.Token == scanner.TBOM { + return + } + defer p.lex.UnNext() +} + // protoBody = { import | package | option | topLevelDef | emptyStatement } // topLevelDef = message | enum | service | extend // See https://developers.google.com/protocol-buffers/docs/reference/proto3-spec#proto_file diff --git a/parser/proto_test.go b/parser/proto_test.go index bce4a75..26c6405 100644 --- a/parser/proto_test.go +++ b/parser/proto_test.go @@ -2242,6 +2242,33 @@ message foo { }, }, }, + { + name: "parsing a UTF-8-BOM file", + input: string([]byte{ + 0xEF, 0xBB, 0xBF, + }) + ` +syntax = "proto3"; +`, + wantProto: &parser.Proto{ + Syntax: &parser.Syntax{ + ProtobufVersion: "proto3", + ProtobufVersionQuote: `"proto3"`, + Meta: meta.Meta{ + Pos: meta.Position{ + Offset: 4, + Line: 2, + Column: 1, + }, + LastPos: meta.Position{ + Offset: 21, + Line: 2, + Column: 18, + }, + }, + }, + Meta: &parser.ProtoMeta{}, + }, + }, } for _, test := range tests {