Skip to content

Commit

Permalink
HTML: support parsing simple templates
Browse files Browse the repository at this point in the history
  • Loading branch information
tdewolff committed Oct 30, 2023
1 parent dead295 commit f501855
Show file tree
Hide file tree
Showing 2 changed files with 150 additions and 21 deletions.
97 changes: 86 additions & 11 deletions html/lex.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ const (
TextToken
SvgToken
MathToken
TemplateToken
)

// String returns the string representation of a TokenType.
Expand Down Expand Up @@ -50,22 +51,34 @@ func (tt TokenType) String() string {
return "Svg"
case MathToken:
return "Math"
case TemplateToken:
return "Template"
}
return "Invalid(" + strconv.Itoa(int(tt)) + ")"
}

////////////////////////////////////////////////////////////////

// Delimiter pairs for common template languages. Each value holds the opening
// delimiter at index 0 and the closing delimiter at index 1, in the form
// expected by the tmpl argument of NewTemplateLexer.
var (
	// GoTemplate, HandlebarsTemplate and MustacheTemplate all use double curly braces.
	GoTemplate         = [2]string{"{{", "}}"}
	HandlebarsTemplate = [2]string{"{{", "}}"}
	MustacheTemplate   = [2]string{"{{", "}}"}

	// EJSTemplate and ASPTemplate use angle bracket and percent sign delimiters.
	EJSTemplate = [2]string{"<%", "%>"}
	ASPTemplate = [2]string{"<%", "%>"}

	// PHPTemplate uses angle bracket and question mark delimiters.
	PHPTemplate = [2]string{"<?", "?>"}
)

// Lexer is the state for the lexer.
type Lexer struct {
	// r is the input that is being lexed.
	r *parse.Input
	// tmplBegin and tmplEnd are the opening and closing template delimiters as
	// set by NewTemplateLexer. Template detection is only attempted when
	// tmplBegin is non-empty.
	tmplBegin []byte
	tmplEnd   []byte
	// err is consulted first by Err.
	err error

	// NOTE(review): rawTag presumably identifies the raw-content tag currently
	// being lexed and inTag whether the lexer is inside a start tag; their use
	// is not visible in this part of the file, confirm against Next.
	rawTag Hash
	inTag  bool

	// text is returned by Text and AttrKey, attrVal by AttrVal.
	text    []byte
	attrVal []byte
	// attrTmpl is returned by AttrHasTemplate; it is reset for every attribute
	// and set when a template is found in the attribute value.
	attrTmpl bool
}

// NewLexer returns a new Lexer for a given parse.Input.
Expand All @@ -75,6 +88,14 @@ func NewLexer(r *parse.Input) *Lexer {
}
}

// NewTemplateLexer returns a new Lexer for a given parse.Input that also
// recognises templates. tmpl holds the opening delimiter at index 0 and the
// closing delimiter at index 1, e.g. GoTemplate.
func NewTemplateLexer(r *parse.Input, tmpl [2]string) *Lexer {
	lexer := &Lexer{r: r}
	lexer.tmplBegin = []byte(tmpl[0])
	lexer.tmplEnd = []byte(tmpl[1])
	return lexer
}

// Err returns the error encountered during lexing, this is often io.EOF but also other errors can be returned.
func (l *Lexer) Err() error {
if l.err != nil {
Expand All @@ -88,11 +109,21 @@ func (l *Lexer) Text() []byte {
return l.text
}

// AttrKey returns the attribute key when an AttributeToken was returned from Next.
// It returns the same underlying field (l.text) as Text.
func (l *Lexer) AttrKey() []byte {
return l.text
}

// AttrVal returns the attribute value when an AttributeToken was returned from Next.
// The value is returned as it was lexed; the tests in this package expect
// surrounding quotes to be included (e.g. `"val"`).
func (l *Lexer) AttrVal() []byte {
return l.attrVal
}

// AttrHasTemplate returns true if the attribute value contains a template.
// It is meaningful when an AttributeToken was returned from Next: the flag is
// reset for every attribute and set when the template begin delimiter is found
// in the attribute value.
func (l *Lexer) AttrHasTemplate() bool {
return l.attrTmpl
}

// Next returns the next Token. It returns ErrorToken when an error was encountered. Using Err() one can retrieve the error message.
func (l *Lexer) Next() (TokenType, []byte) {
l.text = nil
Expand Down Expand Up @@ -135,12 +166,12 @@ func (l *Lexer) Next() (TokenType, []byte) {
if c == '<' {
c = l.r.Peek(1)
isEndTag := c == '/' && l.r.Peek(2) != '>' && (l.r.Peek(2) != 0 || l.r.PeekErr(2) == nil)
if l.r.Pos() > 0 {
if isEndTag || 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || c == '!' || c == '?' {
// return currently buffered texttoken so that we can return tag next iteration
l.text = l.r.Shift()
return TextToken, l.text
}
if !isEndTag && (c < 'a' || 'z' < c) && (c < 'A' || 'Z' < c) && c != '!' && c != '?' {
// not a tag
} else if 0 < l.r.Pos() {
// return currently buffered texttoken so that we can return tag next iteration
l.text = l.r.Shift()
return TextToken, l.text
} else if isEndTag {
l.r.Move(2)
// only endtags that are not followed by > or EOF arrive here
Expand All @@ -159,6 +190,16 @@ func (l *Lexer) Next() (TokenType, []byte) {
l.r.Move(1)
return CommentToken, l.shiftBogusComment()
}
} else if 0 < len(l.tmplBegin) && l.at(l.tmplBegin...) {
if 0 < l.r.Pos() {
// return currently buffered texttoken so that we can return tag next iteration
l.text = l.r.Shift()
return TextToken, l.text
} else {
l.r.Move(len(l.tmplBegin))
l.moveTemplate()
return TemplateToken, l.r.Shift()
}
} else if c == 0 && l.r.Err() != nil {
if l.r.Pos() > 0 {
l.text = l.r.Shift()
Expand Down Expand Up @@ -360,6 +401,7 @@ func (l *Lexer) shiftAttribute() []byte {
}
break
}
l.attrTmpl = false
if c == '=' {
l.r.Move(1)
for { // before attribute value state
Expand All @@ -378,11 +420,20 @@ func (l *Lexer) shiftAttribute() []byte {
if c == delim {
l.r.Move(1)
break
} else if 0 < len(l.tmplBegin) && l.at(l.tmplBegin...) {
l.r.Move(len(l.tmplBegin))
l.moveTemplate()
l.attrTmpl = true
} else if c == 0 && l.r.Err() != nil {
break
} else {
l.r.Move(1)
}
l.r.Move(1)
}
} else if 0 < len(l.tmplBegin) && l.at(l.tmplBegin...) {
l.r.Move(len(l.tmplBegin))
l.moveTemplate()
l.attrTmpl = true
} else { // attribute value unquoted state
for {
if c := l.r.Peek(0); c == ' ' || c == '>' || c == '\t' || c == '\n' || c == '\r' || c == '\f' || c == 0 && l.r.Err() != nil {
Expand Down Expand Up @@ -473,6 +524,30 @@ func (l *Lexer) shiftXML(rawTag Hash) []byte {
return l.r.Shift()
}

// moveTemplate advances the input past the remainder of a template, up to and
// including the closing delimiter l.tmplEnd. The caller must already have moved
// past the opening delimiter. If the input ends before the closing delimiter is
// found, the position is left at the end of the input.
//
// Single- and double-quoted strings inside the template are skipped as a whole,
// honouring backslash escapes, so that a closing delimiter inside a string does
// not end the template.
func (l *Lexer) moveTemplate() {
	for {
		c := l.r.Peek(0)
		if c == 0 && l.r.Err() != nil {
			// end of input (or read error) before the closing delimiter
			return
		} else if l.at(l.tmplEnd...) {
			l.r.Move(len(l.tmplEnd))
			return
		} else if c == '"' || c == '\'' {
			// Consume the opening quote, then scan on the current byte so that the
			// end-of-input check (Peek(0) with Err) refers to the same position.
			l.r.Move(1)
			escaped := false
			for {
				c2 := l.r.Peek(0)
				if c2 == 0 && l.r.Err() != nil {
					// unterminated string
					return
				}
				if c2 == c && !escaped {
					// Consume the closing quote here, otherwise the outer loop
					// would mistake it for the start of a new string.
					l.r.Move(1)
					break
				}
				// a backslash escapes the next byte unless it is itself escaped
				escaped = c2 == '\\' && !escaped
				l.r.Move(1)
			}
		} else {
			l.r.Move(1)
		}
	}
}

////////////////////////////////////////////////////////////////

func (l *Lexer) at(b ...byte) bool {
Expand Down
74 changes: 64 additions & 10 deletions html/lex_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ import (
type TTs []TokenType

func TestTokens(t *testing.T) {
var tokenTests = []struct {
var tests = []struct {
html string
expected []TokenType
}{
Expand Down Expand Up @@ -79,7 +79,7 @@ func TestTokens(t *testing.T) {
// go-fuzz
{"</>", TTs{TextToken}},
}
for _, tt := range tokenTests {
for _, tt := range tests {
t.Run(tt.html, func(t *testing.T) {
l := NewLexer(parse.NewInputString(tt.html))
i := 0
Expand All @@ -106,7 +106,7 @@ func TestTokens(t *testing.T) {
}

func TestTags(t *testing.T) {
var tagTests = []struct {
var tests = []struct {
html string
expected string
}{
Expand All @@ -118,7 +118,7 @@ func TestTags(t *testing.T) {
// early endings
{"<foo ", "foo"},
}
for _, tt := range tagTests {
for _, tt := range tests {
t.Run(tt.html, func(t *testing.T) {
l := NewLexer(parse.NewInputString(tt.html))
for {
Expand All @@ -137,7 +137,7 @@ func TestTags(t *testing.T) {
}

func TestAttributes(t *testing.T) {
var attributeTests = []struct {
var tests = []struct {
attr string
expected []string
}{
Expand All @@ -157,7 +157,7 @@ func TestAttributes(t *testing.T) {
{"<foo \x00=\x00>", []string{"\x00", "\x00"}},
{"<foo \x00='\x00'>", []string{"\x00", "'\x00'"}},
}
for _, tt := range attributeTests {
for _, tt := range tests {
t.Run(tt.attr, func(t *testing.T) {
l := NewLexer(parse.NewInputString(tt.attr))
i := 0
Expand All @@ -170,7 +170,7 @@ func TestAttributes(t *testing.T) {
} else if token == AttributeToken {
test.That(t, i+1 < len(tt.expected), "index", i+1, "must not exceed expected attributes size", len(tt.expected))
if i+1 < len(tt.expected) {
test.String(t, string(l.Text()), tt.expected[i], "attribute keys must match")
test.String(t, string(l.AttrKey()), tt.expected[i], "attribute keys must match")
test.String(t, string(l.AttrVal()), tt.expected[i+1], "attribute keys must match")
i += 2
}
Expand All @@ -180,15 +180,69 @@ func TestAttributes(t *testing.T) {
}
}

// TestTemplates checks the sequence of token types produced by a lexer created
// with NewTemplateLexer and the GoTemplate delimiters, for templates in text
// content and in quoted and unquoted attribute values.
func TestTemplates(t *testing.T) {
	var tests = []struct {
		html     string
		expected []TokenType
	}{
		{"<p>{{.}}</p>", TTs{StartTagToken, StartTagCloseToken, TemplateToken, EndTagToken}},
		{"<p> {{.}} </p>", TTs{StartTagToken, StartTagCloseToken, TextToken, TemplateToken, TextToken, EndTagToken}},
		{"<input type='{{.}}'/>", TTs{StartTagToken, AttributeToken, StartTagVoidToken}},
		{"<input type={{.}} />", TTs{StartTagToken, AttributeToken, StartTagVoidToken}},
	}
	for _, tt := range tests {
		t.Run(tt.html, func(t *testing.T) {
			l := NewTemplateLexer(parse.NewInputString(tt.html), GoTemplate)
			tokens := []TokenType{}
			for {
				token, _ := l.Next()
				if token == ErrorToken {
					// lexing must stop because the input ended, not on another error
					test.T(t, l.Err(), io.EOF)
					break
				}
				tokens = append(tokens, token)
			}
			test.T(t, tokens, tt.expected, "token types must match")
		})
	}
}

// TestTemplateAttributess checks that AttrHasTemplate reports whether the value
// of each lexed attribute contains a template, using the GoTemplate delimiters.
func TestTemplateAttributess(t *testing.T) {
	cases := []struct {
		html    string
		hasTmpl bool
	}{
		{"<input type='{value}'/>", false},
		{"<input type='{{.}}'/>", true},
		{"<input type={{.}} />", true},
	}
	for _, tc := range cases {
		t.Run(tc.html, func(t *testing.T) {
			lexer := NewTemplateLexer(parse.NewInputString(tc.html), GoTemplate)
			for {
				switch tokenType, _ := lexer.Next(); tokenType {
				case ErrorToken:
					// nothing follows the loop, so returning ends the subtest
					test.T(t, lexer.Err(), io.EOF)
					return
				case AttributeToken:
					test.T(t, lexer.AttrHasTemplate(), tc.hasTmpl)
				}
			}
		})
	}
}

func TestErrors(t *testing.T) {
var errorTests = []struct {
var tests = []struct {
html string
col int
}{
{"<svg>\x00</svg>", 6},
{"<svg></svg\x00>", 11},
}
for _, tt := range errorTests {
for _, tt := range tests {
t.Run(tt.html, func(t *testing.T) {
l := NewLexer(parse.NewInputString(tt.html))
for {
Expand Down Expand Up @@ -218,7 +272,7 @@ func TestTextAndAttrVal(t *testing.T) {

_, data = l.Next()
test.Bytes(t, data, []byte(` attr="val"`))
test.Bytes(t, l.Text(), []byte("attr"))
test.Bytes(t, l.AttrKey(), []byte("attr"))
test.Bytes(t, l.AttrVal(), []byte(`"val"`))

_, data = l.Next()
Expand Down

0 comments on commit f501855

Please sign in to comment.