Reduce scope of keywords during lexing (#404)
This means keywords in one section (e.g. "name") may be used as ids in other sections.
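
In outline: the lexer now tracks which ~~ section ~~ it is in and checks identifiers against a per-section keyword list instead of one global set. A minimal, self-contained Go sketch of that idea (simplified to two sections; the names here are illustrative, not gactar's actual API):

package main

import (
	"fmt"
	"slices" // in the standard library as of Go 1.21
)

type sectionType int

const (
	sectionModel sectionType = iota
	sectionConfig
)

// Each section gets its own keyword list.
var (
	keywordsModel  = []string{"authors", "description", "examples", "name"}
	keywordsConfig = []string{"chunks", "gactar", "modules"}
)

// isKeyword reports whether id is a keyword in the given section.
func isKeyword(section sectionType, id string) bool {
	switch section {
	case sectionModel:
		return slices.Contains(keywordsModel, id)
	case sectionConfig:
		return slices.Contains(keywordsConfig, id)
	}
	return false
}

func main() {
	fmt.Println(isKeyword(sectionModel, "name"))  // true: keyword in the model section
	fmt.Println(isKeyword(sectionConfig, "name")) // false: plain identifier in config
}

slices.Contains is a linear scan, but the lists are tiny and it keeps them declarative; the keyword map the old code built at startup is no longer needed.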
asmaloney authored Jan 20, 2024
1 parent 411d148 commit 51b4659
Showing 4 changed files with 129 additions and 47 deletions.
13 changes: 13 additions & 0 deletions amod/amod_config_test.go
@@ -457,3 +457,16 @@ func Example_proceduralFieldUnrecognized() {
// Output:
// ERROR: unrecognized option "foo" in procedural config (line 6, col 15)
}

// Tests that we can use a keyword from one section as an id in another
func Example_keywordInDifferentSection() {
generateToStdout(`
~~ model ~~
name: Test
~~ config ~~
chunks { [name: first last] }
~~ init ~~
~~ productions ~~`)

// Output:
}
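
As a reminder of how Go example tests work: the test passes when the function's printed output matches its // Output: comment, so the empty // Output: above asserts that this model produces no error output at all. Assuming the repository's usual layout, something like this runs just that test:

go test ./amod -run Example_keywordInDifferentSection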
116 changes: 88 additions & 28 deletions amod/lex.go
@@ -8,6 +8,7 @@ package amod
import (
"fmt"
"io"
"slices"
"strings"
"unicode"
"unicode/utf8"
@@ -99,18 +100,31 @@ type lexeme struct {
pos int // position within the line
}

// sectionType is used to keep track of what section we are lexing
// We use this to limit the scope of keywords.
type sectionType int

const (
sectionModel sectionType = iota
sectionConfig
sectionInit
sectionProduction
)

// lexer_amod tracks our lexing and provides a channel to emit lexemes
type lexer_amod struct {
name string // used only for error reports
input string // the string being scanned.
line int // the line number
lastNewlinePos int
start int // start position of this lexeme (offset from beginning of file)
pos int // current position in the input (offset from beginning of file)
width int // width of last rune read from input
lexemes chan lexeme // channel of scanned lexemes
keywords map[string]bool // used to lookup identifier to see if they are keywords
inPattern bool // state: a pattern - delimited by [] is lexed specially
start int // start position of this lexeme (offset from beginning of file)
pos int // current position in the input (offset from beginning of file)
width int // width of last rune read from input
lexemes chan lexeme // channel of scanned lexemes

inSectionHeader bool // state: switch currentSection based on ~~ section headers
currentSection sectionType // which section are we lexing? used to switch out keywords
inPattern bool // state: a pattern - delimited by [] is lexed specially
}

// stateFn is used to move through the lexing states
@@ -122,26 +136,40 @@ const (
commentDelim = "//"
)

var keywords []string = []string{
// keywordsModel are only keywords for the model section
var keywordsModel []string = []string{
"authors",
"description",
"examples",
"name",
}

// keywordsConfig are only keywords for the config section
var keywordsConfig []string = []string{
"chunks",
"gactar",
"modules",
}

// keywordsInit are only keywords for the init section
var keywordsInit []string = []string{
"similar",
}

// keywordsProductions are only keywords for the productions section
var keywordsProductions []string = []string{
"and",
"any",
"authors",
"buffer_state",
"chunks",
"clear",
"description",
"do",
"examples",
"gactar",
"match",
"module_state",
"modules",
"name",
"nil",
"print",
"recall",
"set",
"similar",
"stop",
"to",
"when",
@@ -185,17 +213,14 @@ func lex(filename string, data string) *lexer_amod {
cleanData(&data)

l := &lexer_amod{
name: filename,
input: data,
line: 1,
lastNewlinePos: 1, // start @ 1 so first line gets 0 (see emit())
lexemes: make(chan lexeme),
keywords: make(map[string]bool),
inPattern: false,
}

for _, v := range keywords {
l.keywords[v] = true
name: filename,
input: data,
line: 1,
lastNewlinePos: 1, // start @ 1 so first line gets 0 (see emit())
lexemes: make(chan lexeme),
currentSection: sectionModel,
inSectionHeader: false,
inPattern: false,
}

go l.run()
@@ -252,9 +277,20 @@ func (l *lexer_amod) next() rune {
return r
}

// lookupKeyword looks up "id" to see if it is a keyword based on which section we are lexing
func (l *lexer_amod) lookupKeyword(id string) bool {
v, ok := l.keywords[id]
return v && ok
switch l.currentSection {
case sectionModel:
return slices.Contains(keywordsModel, id)
case sectionConfig:
return slices.Contains(keywordsConfig, id)
case sectionInit:
return slices.Contains(keywordsInit, id)
case sectionProduction:
return slices.Contains(keywordsProductions, id)
}

return false
}

// skip over the pending input before this point
@@ -428,6 +464,7 @@ func lexStart(l *lexer_amod) stateFn {
if l.nextIs('~') {
l.next()
l.emit(lexemeSectionDelim)
l.inSectionHeader = !l.inSectionHeader
} else {
l.emit(lexemeChar)
}
@@ -495,9 +532,32 @@ func lexIdentifier(l *lexer_amod) stateFn {
l.next()
}

id := l.input[l.start:l.pos]
isKeyword := false

// If we are in a section header, then change our current section
if l.inSectionHeader {
switch id {
case "model":
l.currentSection = sectionModel
case "config":
l.currentSection = sectionConfig
case "init":
l.currentSection = sectionInit
case "productions":
l.currentSection = sectionProduction
default:
return l.errorf("unrecognized section")
}

// these are keywords in this context
isKeyword = true
} else {
isKeyword = l.lookupKeyword(id)
}

// Perhaps not the best way to do this.
// I'm sure there's a char-by-char way we could implement which would be faster.
isKeyword := l.lookupKeyword(l.input[l.start:l.pos])
switch {
case isKeyword:
l.emit(lexemeKeyword)
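
The two new state fields cooperate: lexStart toggles inSectionHeader whenever it emits a ~~ delimiter, and lexIdentifier, on seeing the identifier between the delimiters, switches currentSection (or errors on an unknown section name). A rough standalone sketch of that flow, using a toy token stream instead of a real lexer:

package main

import "fmt"

type sectionType int

const (
	sectionModel sectionType = iota
	sectionConfig
	sectionInit
	sectionProduction
)

var sectionNames = []string{"model", "config", "init", "productions"}

func main() {
	current := sectionModel
	inHeader := false

	// "~~" toggles the header state; an identifier seen while inside a
	// header selects the current section, mirroring lexStart/lexIdentifier.
	for _, tok := range []string{"~~", "config", "~~", "chunks"} {
		if tok == "~~" {
			inHeader = !inHeader
			continue
		}
		if inHeader {
			switch tok {
			case "model":
				current = sectionModel
			case "config":
				current = sectionConfig
			case "init":
				current = sectionInit
			case "productions":
				current = sectionProduction
			}
			continue
		}
		fmt.Printf("identifier %q lexed in section %q\n", tok, sectionNames[current])
	}
	// Output: identifier "chunks" lexed in section "config"
}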
9 changes: 9 additions & 0 deletions amod/lex_test.go
@@ -45,6 +45,15 @@ func TestInvalidSection(t *testing.T) {
if token.Type != lexer.TokenType(lexemeSectionDelim) {
t.Errorf("expected to lex '%s' as section delimiter (%d) - got type %d", token.Value, lexemeSectionDelim, token.Type)
}

expected := "ERROR on line 1 at position 9: unrecognized section"

token, err = l.Next()
if err == nil {
t.Errorf("expected error: %q", expected)
} else if err.Error() != expected {
t.Errorf("expected error: %q but got %q", expected, err.Error())
}
}

func TestUnterminatedQuote(t *testing.T) {
38 changes: 19 additions & 19 deletions amod/parse.go
@@ -25,26 +25,26 @@ import (
// paste in the generated EBNF above, click "Convert" and then click "View Diagram"

type amodFile struct {
ModelHeader string `parser:"'~~':SectionDelim 'model' '~~':SectionDelim"`
ModelHeader string `parser:"'~~':SectionDelim 'model':Keyword '~~':SectionDelim"`
Model *modelSection `parser:"@@"`

ConfigHeader string `parser:"'~~':SectionDelim 'config' '~~':SectionDelim"`
ConfigHeader string `parser:"'~~':SectionDelim 'config':Keyword '~~':SectionDelim"`
Config *configSection `parser:"(@@)?"`

InitHeader string `parser:"'~~':SectionDelim 'init' '~~':SectionDelim"`
InitHeader string `parser:"'~~':SectionDelim 'init':Keyword '~~':SectionDelim"`
Init *initSection `parser:"(@@)?"`

ProductionsHeader string `parser:"'~~':SectionDelim 'productions' '~~':SectionDelim"`
ProductionsHeader string `parser:"'~~':SectionDelim 'productions':Keyword '~~':SectionDelim"`
Productions *productionSection `parser:"(@@)?"`

Tokens []lexer.Token
}

type modelSection struct {
Name string `parser:"'name' ':' (@String|@Ident)"`
Description string `parser:"('description' ':' @String)?"`
Authors []string `parser:"('authors' '{' @String* '}')?"`
Examples []*pattern `parser:"('examples' '{' @@* '}')?"`
Name string `parser:"'name':Keyword ':' (@String|@Ident)"`
Description string `parser:"('description':Keyword ':' @String)?"`
Authors []string `parser:"('authors':Keyword '{' @String* '}')?"`
Examples []*pattern `parser:"('examples':Keyword '{' @@* '}')?"`

Tokens []lexer.Token
}
@@ -165,7 +165,7 @@ type field struct {
}

type gactarConfig struct {
GactarFields []*field `parser:"'gactar' '{' @@* '}'"`
GactarFields []*field `parser:"'gactar':Keyword '{' @@* '}'"`

Tokens []lexer.Token
}
@@ -178,7 +178,7 @@ type module struct {
}

type moduleConfig struct {
Modules []*module `parser:"'modules' '{' @@* '}'"`
Modules []*module `parser:"'modules':Keyword '{' @@* '}'"`

Tokens []lexer.Token
}
@@ -193,7 +193,7 @@ type chunkDecl struct {
}

type chunkConfig struct {
ChunkDecls []*chunkDecl `parser:"'chunks' '{' @@* '}'"`
ChunkDecls []*chunkDecl `parser:"'chunks':Keyword '{' @@* '}'"`

Tokens []lexer.Token
}
@@ -347,19 +347,19 @@ type matchItem struct {
}

type match struct {
Items []*matchItem `parser:"'match' '{' @@+ '}'"`
Items []*matchItem `parser:"'match':Keyword '{' @@+ '}'"`

Tokens []lexer.Token
}

type clearStatement struct {
BufferNames []string `parser:"'clear' ( @Ident ','? )+"`
BufferNames []string `parser:"'clear':Keyword ( @Ident ','? )+"`

Tokens []lexer.Token
}

type printStatement struct {
Args []*printArg `parser:"'print' ( @@ ','? )*"`
Args []*printArg `parser:"'print':Keyword ( @@ ','? )*"`

Tokens []lexer.Token
}
@@ -381,17 +381,17 @@ type withClause struct {
}

type recallStatement struct {
Pattern *pattern `parser:"'recall' @@"`
Pattern *pattern `parser:"'recall':Keyword @@"`
With *withClause `parser:"@@?"`

Tokens []lexer.Token
}

type setStatement struct {
Set string `parser:"'set'"` // not used, but must be visible for parse to work
Set string `parser:"'set':Keyword"` // not used, but must be visible for parse to work
BufferRef bufferRef `parser:"@@"`

To string `parser:"'to'"` // not used, but must be visible for parse to work
To string `parser:"'to':Keyword"` // not used, but must be visible for parse to work
Value *setArg `parser:"( @@"`
Pattern *pattern `parser:"| @@)"`

@@ -415,15 +415,15 @@ type statement struct {
}

type do struct {
Do string `parser:"'do'"` // not used, but must be visible for parse to work
Do string `parser:"'do':Keyword"` // not used, but must be visible for parse to work
Statements *[]*statement `parser:"'{' @@+ '}'"`

Tokens []lexer.Token
}

type production struct {
Name string `parser:"@Ident '{'"`
Description *string `parser:"('description' ':' @String)?"`
Description *string `parser:"('description':Keyword ':' @String)?"`
Match *match `parser:"@@"`
Do *do `parser:"@@"`
End string `parser:"'}'"` // not used, but must be visible for parse to work
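
For context on the :Keyword annotations above: in participle's struct-tag grammar, 'name':Keyword matches the literal name only when the lexer emitted it as a Keyword token, not merely when the token's text is "name". Since keywords are now section-scoped, pinning each literal to the Keyword type keeps the grammar and the lexer in agreement. A hedged, self-contained sketch of the tag mechanics (the toy lexer and type names are mine, not gactar's):

package main

import (
	"fmt"

	"github.com/alecthomas/participle/v2"
	"github.com/alecthomas/participle/v2/lexer"
)

// Toy lexer: "name" lexes as a Keyword; other words lex as Ident.
var toyLexer = lexer.MustSimple([]lexer.SimpleRule{
	{Name: "Keyword", Pattern: `name\b`},
	{Name: "Ident", Pattern: `[A-Za-z_]\w*`},
	{Name: "Colon", Pattern: `:`},
	{Name: "Whitespace", Pattern: `\s+`},
})

// 'name':Keyword requires both the text "name" and the token type Keyword.
type modelName struct {
	Value string `parser:"'name':Keyword ':' @Ident"`
}

var parser = participle.MustBuild[modelName](
	participle.Lexer(toyLexer),
	participle.Elide("Whitespace"),
)

func main() {
	m, err := parser.ParseString("", "name: Test")
	if err != nil {
		panic(err)
	}
	fmt.Println(m.Value) // Test
}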
