Skip to content

Commit

Permalink
Initial refactor of lexing.
Browse files Browse the repository at this point in the history
Extended lexer.Definition to support directly lexing strings and []byte
slices. Remove ebnf and regex lexers.

An adapter has been added for v0 lexers.
  • Loading branch information
alecthomas committed Nov 26, 2020
1 parent 2403858 commit 362b266
Show file tree
Hide file tree
Showing 35 changed files with 202 additions and 1,875 deletions.
25 changes: 10 additions & 15 deletions _examples/basic/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,30 +8,25 @@ import (

"github.com/alecthomas/participle"
"github.com/alecthomas/participle/lexer"
"github.com/alecthomas/participle/lexer/ebnf"
"github.com/alecthomas/participle/lexer/stateful"
)

var (
basicLexer = lexer.Must(ebnf.New(`
Comment = ("REM" | "rem" ) { "\u0000"…"\uffff"-"\n"-"\r" } .
Ident = (alpha | "_") { "_" | alpha | digit } .
String = "\"" { "\u0000"…"\uffff"-"\""-"\\" | "\\" any } "\"" .
Number = [ "-" | "+" ] ("." | digit) { "." | digit } .
Punct = "!"…"/" | ":"…"@" | "["…` + "\"`\"" + ` | "{"…"~" .
EOL = ( "\n" | "\r" ) { "\n" | "\r" }.
Whitespace = ( " " | "\t" ) { " " | "\t" } .
alpha = "a"…"z" | "A"…"Z" .
digit = "0"…"9" .
any = "\u0000"…"\uffff" .
`))
basicLexer = lexer.Must(stateful.NewSimple([]stateful.Rule{
{"Comment", `(?i)rem[^\n]*`, nil},
{"String", `"(\\"|[^"])*"`, nil},
{"Number", `[-+]?(\d*\.)?\d+`, nil},
{"Ident", `[a-zA-Z_]\w*`, nil},
{"Punct", `[-[!@#$%^&*()+_={}\|:;"'<,>.?/]|]`, nil},
{"EOL", `[\n\r]+`, nil},
{"whitespace", `[ \t]+`, nil},
}))

basicParser = participle.MustBuild(&Program{},
participle.Lexer(basicLexer),
participle.CaseInsensitive("Ident"),
participle.Unquote("String"),
participle.UseLookahead(2),
participle.Elide("Whitespace"),
)

cli struct {
Expand Down
2 changes: 1 addition & 1 deletion _examples/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ go 1.14

require (
github.com/alecthomas/go-thrift v0.0.0-20170109061633-7914173639b2
github.com/alecthomas/kong v0.2.8
github.com/alecthomas/kong v0.2.11
github.com/alecthomas/participle v0.4.1
github.com/alecthomas/repr v0.0.0-20200325044227-4184120f674c
github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751 // indirect
Expand Down
2 changes: 2 additions & 0 deletions _examples/go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ github.com/alecthomas/go-thrift v0.0.0-20170109061633-7914173639b2/go.mod h1:CxC
github.com/alecthomas/kong v0.2.1/go.mod h1:+inYUSluD+p4L8KdviBSgzcqEjUQOfC5fQDRFuc36lI=
github.com/alecthomas/kong v0.2.8 h1:VSWWkD1TZij2967FcfVwgRwlp3khCA0liZIkUI9hTdU=
github.com/alecthomas/kong v0.2.8/go.mod h1:kQOmtJgV+Lb4aj+I2LEn40cbtawdWJ9Y8QLq+lElKxE=
github.com/alecthomas/kong v0.2.11 h1:RKeJXXWfg9N47RYfMm0+igkxBCTF4bzbneAxaqid0c4=
github.com/alecthomas/kong v0.2.11/go.mod h1:kQOmtJgV+Lb4aj+I2LEn40cbtawdWJ9Y8QLq+lElKxE=
github.com/alecthomas/participle v0.4.1 h1:P2PJWzwrSpuCWXKnzqvw0b0phSfH1kJo4p2HvLynVsI=
github.com/alecthomas/participle v0.4.1/go.mod h1:T8u4bQOSMwrkTWOSyt8/jSFPEnRtd0FKFMjVfYBlqPs=
github.com/alecthomas/repr v0.0.0-20181024024818-d37bc2a10ba1/go.mod h1:xTS7Pm1pD1mvyM075QCDSRqH6qRLXylzS24ZTpRiSzQ=
Expand Down
20 changes: 8 additions & 12 deletions _examples/graphql/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ import (

"github.com/alecthomas/participle"
"github.com/alecthomas/participle/lexer"
"github.com/alecthomas/participle/lexer/ebnf"
"github.com/alecthomas/participle/lexer/stateful"
)

type File struct {
Expand Down Expand Up @@ -62,17 +62,13 @@ type Value struct {
}

var (
graphQLLexer = lexer.Must(ebnf.New(`
Comment = ("#" | "//") { "\u0000"…"\uffff"-"\n" } .
Ident = (alpha | "_") { "_" | alpha | digit } .
Number = ("." | digit) {"." | digit} .
Whitespace = " " | "\t" | "\n" | "\r" .
Punct = "!"…"/" | ":"…"@" | "["…` + "\"`\"" + ` | "{"…"~" .
alpha = "a"…"z" | "A"…"Z" .
digit = "0"…"9" .
`))

graphQLLexer = lexer.Must(stateful.NewSimple([]stateful.Rule{
{"Comment", `(?:#|//)[^\n]*\n?`, nil},
{"Ident", `[a-zA-Z]\w*`, nil},
{"Number", `(?:\d*\.)?\d+`, nil},
{"Punct", `[-[!@#$%^&*()+_={}\|:;"'<,>.?/]|]`, nil},
{"Whitespace", `[ \t\n\r]+`, nil},
}))
parser = participle.MustBuild(&File{},
participle.Lexer(graphQLLexer),
participle.Elide("Comment", "Whitespace"),
Expand Down
19 changes: 10 additions & 9 deletions _examples/ini/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,20 +5,21 @@ import (

"github.com/alecthomas/participle"
"github.com/alecthomas/participle/lexer"
"github.com/alecthomas/participle/lexer/stateful"

"github.com/alecthomas/repr"
)

// A custom lexer for INI files. This illustrates a rule-based lexer built with
// stateful.NewSimple, as well as use of the Unquote filter, which unquotes string tokens.
var iniLexer = lexer.Must(lexer.Regexp(
`(?m)` +
`(\s+)` +
`|(^[#;].*$)` +
`|(?P<Ident>[a-zA-Z][a-zA-Z_\d]*)` +
`|(?P<String>"(?:\\.|[^"])*")` +
`|(?P<Float>\d+(?:\.\d+)?)` +
`|(?P<Punct>[][=])`,
))
var iniLexer = lexer.Must(stateful.NewSimple([]stateful.Rule{
{`Ident`, `[a-zA-Z][a-zA-Z_\d]*`, nil},
{`String`, `"(?:\\.|[^"])*"`, nil},
{`Float`, `\d+(?:\.\d+)?`, nil},
{`Punct`, `[][=]`, nil},
{"comment", `[#;][^\n]*`, nil},
{"whitespace", `\s+`, nil},
}))

type INI struct {
Properties []*Property `@@*`
Expand Down
18 changes: 11 additions & 7 deletions _examples/sql/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,11 @@ package main

import (
"github.com/alecthomas/kong"

"github.com/alecthomas/participle"
"github.com/alecthomas/participle/lexer"
"github.com/alecthomas/participle/lexer/stateful"

"github.com/alecthomas/repr"
)

Expand Down Expand Up @@ -156,13 +159,14 @@ var (
SQL string `arg:"" required:"" help:"SQL to parse."`
}

sqlLexer = lexer.Must(lexer.Regexp(`(\s+)` +
`|(?P<Keyword>(?i)SELECT|FROM|TOP|DISTINCT|ALL|WHERE|GROUP|BY|HAVING|UNION|MINUS|EXCEPT|INTERSECT|ORDER|LIMIT|OFFSET|TRUE|FALSE|NULL|IS|NOT|ANY|SOME|BETWEEN|AND|OR|LIKE|AS|IN)` +
`|(?P<Ident>[a-zA-Z_][a-zA-Z0-9_]*)` +
`|(?P<Number>[-+]?\d*\.?\d+([eE][-+]?\d+)?)` +
`|(?P<String>'[^']*'|"[^"]*")` +
`|(?P<Operators><>|!=|<=|>=|[-+*/%,.()=<>])`,
))
sqlLexer = lexer.Must(stateful.NewSimple([]stateful.Rule{
{`Keyword`, `(?i)SELECT|FROM|TOP|DISTINCT|ALL|WHERE|GROUP|BY|HAVING|UNION|MINUS|EXCEPT|INTERSECT|ORDER|LIMIT|OFFSET|TRUE|FALSE|NULL|IS|NOT|ANY|SOME|BETWEEN|AND|OR|LIKE|AS|IN`, nil},
{`Ident`, `[a-zA-Z_][a-zA-Z0-9_]*`, nil},
{`Number`, `[-+]?\d*\.?\d+([eE][-+]?\d+)?`, nil},
{`String`, `'[^']*'|"[^"]*"`, nil},
{`Operators`, `<>|!=|<=|>=|[-+*/%,.()=<>]`, nil},
{"whitespace", `\s+`, nil},
}))
sqlParser = participle.MustBuild(
&Select{},
participle.Lexer(sqlLexer),
Expand Down
41 changes: 18 additions & 23 deletions _examples/toml/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,11 @@ import (
"os"

"github.com/alecthomas/kong"

"github.com/alecthomas/participle"
"github.com/alecthomas/participle/lexer"
"github.com/alecthomas/participle/lexer/ebnf"
"github.com/alecthomas/participle/lexer/stateful"

"github.com/alecthomas/repr"
)

Expand All @@ -32,8 +34,7 @@ type Value struct {
Date *string `| @Date`
Time *string `| @Time`
Bool *bool `| (@"true" | "false")`
Integer *int64 `| @Int`
Float *float64 `| @Float`
Number *float64 `| @Number`
List []*Value `| "[" [ @@ { "," @@ } ] "]"`
}

Expand All @@ -43,28 +44,22 @@ type Section struct {
}

var (
tomlLexer = lexer.Must(ebnf.New(`
Comment = "#" { "\u0000"…"\uffff"-"\n" } .
DateTime = date "T" time [ "-" digit digit ":" digit digit ].
Date = date .
Time = time .
Ident = (alpha | "_") { "_" | alpha | digit } .
String = "\"" { "\u0000"…"\uffff"-"\""-"\\" | "\\" any } "\"" .
Int = [ "-" | "+" ] digit { digit } .
Float = ("." | digit) {"." | digit} .
Punct = "!"…"/" | ":"…"@" | "["…` + "\"`\"" + ` | "{"…"~" .
Whitespace = " " | "\t" | "\n" | "\r" .
alpha = "a"…"z" | "A"…"Z" .
digit = "0"…"9" .
any = "\u0000"…"\uffff" .
date = digit digit digit digit "-" digit digit "-" digit digit .
time = digit digit ":" digit digit ":" digit digit [ "." { digit } ] .
`))
// tomlLexer tokenises TOML source using a flat list of regex rules.
tomlLexer = lexer.Must(stateful.NewSimple([]stateful.Rule{
	{"DateTime", `\d\d\d\d-\d\d-\d\dT\d\d:\d\d:\d\d(\.\d+)?(-\d\d:\d\d)?`, nil},
	{"Date", `\d\d\d\d-\d\d-\d\d`, nil},
	{"Time", `\d\d:\d\d:\d\d(\.\d+)?`, nil},
	{"Ident", `[a-zA-Z_][a-zA-Z_0-9]*`, nil},
	// Accept backslash escapes (including \") inside strings so that the
	// Unquote("String") parser option receives the complete quoted literal.
	{"String", `"(?:\\.|[^"])*"`, nil},
	{"Number", `[-+]?[.0-9]+\b`, nil},
	{"Punct", `\[|]|[-!()+/*=,]`, nil},
	// A comment may be empty (a bare "#" at end of line or end of input),
	// so the body is matched with * rather than +.
	{"comment", `#[^\n]*`, nil},
	{"whitespace", `\s+`, nil},
}))
tomlParser = participle.MustBuild(&TOML{},
participle.Lexer(tomlLexer),
participle.Lexer(
tomlLexer,
),
participle.Unquote("String"),
participle.Elide("Whitespace", "Comment"),
)

cli struct {
Expand Down
2 changes: 1 addition & 1 deletion ebnf_test.go
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package participle
package participle_test

import (
"strings"
Expand Down
6 changes: 4 additions & 2 deletions error_test.go
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
package participle
package participle_test

import (
"testing"

"github.com/stretchr/testify/assert"

"github.com/alecthomas/participle"
)

func TestErrorReporting(t *testing.T) {
Expand All @@ -22,7 +24,7 @@ func TestErrorReporting(t *testing.T) {
type grammar struct {
Decls []*decl `( @@ ";" )*`
}
p := mustTestParser(t, &grammar{}, UseLookahead(5))
p := mustTestParser(t, &grammar{}, participle.UseLookahead(5))

var err error
ast := &grammar{}
Expand Down
46 changes: 46 additions & 0 deletions lexer/adapters.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
package lexer

import (
"bytes"
"io"
"strings"
)

// legacy wraps a v0-style lexer definition, whose only entry point is
// Lex(io.Reader), and exposes reader, string and byte-slice entry points on
// top of it.
type legacy struct {
	legacy interface {
		Lex(io.Reader) (Lexer, error)
		Symbols() map[string]rune
	}
}

// LexReader hands r straight to the wrapped definition's Lex method.
func (l legacy) LexReader(r io.Reader) (Lexer, error) {
	return l.legacy.Lex(r)
}

// LexString lexes s by presenting it to the wrapped definition as a
// strings.Reader.
func (l legacy) LexString(s string) (Lexer, error) {
	src := strings.NewReader(s)
	return l.legacy.Lex(src)
}

// LexBytes lexes b by presenting it to the wrapped definition as a
// bytes.Reader.
func (l legacy) LexBytes(b []byte) (Lexer, error) {
	src := bytes.NewReader(b)
	return l.legacy.Lex(src)
}

// Symbols returns the symbol table reported by the wrapped definition.
func (l legacy) Symbols() map[string]rune {
	return l.legacy.Symbols()
}

// Legacy adapts a Participle v0 lexer definition to Definition.
//
// The supplied value only needs Lex(io.Reader) and Symbols(); the returned
// adapter forwards every call to it.
func Legacy(def interface {
	Lex(io.Reader) (Lexer, error)
	Symbols() map[string]rune
}) Definition {
	adapter := legacy{legacy: def}
	return adapter
}

// Simple turns a lexer definition that can only lex from an io.Reader into a
// Definition. String and byte-slice inputs are served by wrapping them with
// strings.NewReader() / bytes.NewReader() and delegating to LexReader().
func Simple(def interface {
	Symbols() map[string]rune
	LexReader(io.Reader) (Lexer, error)
}) Definition {
	return simple{simplei: def}
}

// simplei is the reader-only method set that simple builds upon.
type simplei interface {
	Symbols() map[string]rune
	LexReader(io.Reader) (Lexer, error)
}

// simple embeds a reader-only definition, inheriting Symbols and LexReader,
// and adds the string and byte-slice entry points.
type simple struct{ simplei }

// LexString lexes str by reading it through a strings.Reader.
func (s simple) LexString(str string) (Lexer, error) {
	src := strings.NewReader(str)
	return s.simplei.LexReader(src)
}

// LexBytes lexes b by reading it through a bytes.Reader.
func (s simple) LexBytes(b []byte) (Lexer, error) {
	src := bytes.NewReader(b)
	return s.simplei.LexReader(src)
}
18 changes: 2 additions & 16 deletions lexer/doc.go
Original file line number Diff line number Diff line change
@@ -1,19 +1,5 @@
// Package lexer defines interfaces and implementations used by Participle to perform lexing.
//
// The primary interfaces are Definition and Lexer. There are three implementations of these
// interfaces:
//
// TextScannerLexer is based on text/scanner. This is the fastest, but least flexible, in that
// tokens are restricted to those supported by that package. It can scan about 5M tokens/second on a
// late 2013 15" MacBook Pro.
//
// The second lexer is constructed via the Regexp() function, mapping regexp capture groups
// to tokens. The complete input source is read into memory, so it is unsuitable for large inputs.
//
// The final lexer provided accepts a lexical grammar in EBNF. Each capitalised production is a
// lexical token supported by the resulting Lexer. This is very flexible, but a bit slower, scanning
// around 730K tokens/second on the same machine, though it is currently completely unoptimised.
// This could/should be converted to a table-based lexer.
//
// Lexer implementations must use Panic/Panicf to report errors.
// The primary interfaces are Definition and Lexer. There is one concrete implementation included,
// the stateful lexer.
package lexer
Loading

0 comments on commit 362b266

Please sign in to comment.