scan: add enumerated list

matthewdargan · Feb 4, 2024 · 0efedca · 0efedca
1 parent f999fcd
commit 0efedca
Show file tree

Hide file tree

Showing 4 changed files with 709 additions and 28 deletions.
diff --git a/scan/enum.go b/scan/enum.go
@@ -0,0 +1,171 @@
+// Copyright 2023 Matthew P. Dargan. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package scan
+
+import (
+	"regexp"
+	"strconv"
+	"strings"
+	"unicode"
+)
+
+// lexEnum scans the marker of an enumerated list item. It advances rune by
+// rune until the next rune is a newline, other whitespace, or the end of
+// input, and then produces an Enum token.
+func lexEnum(l *Scanner) stateFn {
+	for {
+		r := l.peek()
+		if r == '\n' {
+			// The marker runs to the end of its line; lexEndOfLine takes over.
+			return lexEndOfLine(l, Enum)
+		}
+		if r == eof || unicode.IsSpace(r) {
+			return l.emit(Enum)
+		}
+		l.next()
+	}
+}
+
+// isEnum reports whether the scanner is on an enumeration.
+//
+// r is the rune supplied by the caller (presumably the rune just consumed,
+// since enum reads the enumerator text starting at l.pos-1 — confirm against
+// lexAny). The method scans ahead through the current line and into the next
+// one; l.pos and l.lastWidth are restored before it returns.
+//
+// When the current line holds a valid enumerator, it is stored in l.lastEnum
+// and the first rune of the following line is inspected: if that line also
+// starts like an enumerator (a digit or letter, optionally after '('), it must
+// parse as a valid enumerator that is consistent with the one just stored.
+//
+// NOTE(review): l.lastEnum is assigned before the next-line check and is not
+// reset when that check fails, so a false result can still leave l.lastEnum
+// changed — confirm this is intended.
+func (l *Scanner) isEnum(r rune) bool {
+	// Never treat the text as an enumeration when the recorded types are a
+	// blank line followed by a paragraph.
+	if l.types[0] == BlankLine && l.types[1] == Paragraph {
+		return false
+	}
+	// Everything below is look-ahead only: put the read position back on exit.
+	pos, lastWidth := l.pos, l.lastWidth
+	defer func() {
+		l.pos, l.lastWidth = pos, lastWidth
+	}()
+	// Skip an opening parenthesis, as in "(1)".
+	if r == '(' {
+		r = l.next()
+	}
+	i, ok := l.enumSuffix()
+	if !ok {
+		return false
+	}
+	e, ok := l.enum(r, i)
+	if !ok {
+		return false
+	}
+	// Record the enumerator so the next-line check below compares against it.
+	l.lastEnum = e
+	// Consume the rest of the current line.
+	for r != eof && r != '\n' {
+		r = l.next()
+	}
+	if r == eof {
+		return true
+	}
+	// Read the first rune of the following line, again skipping a leading '('.
+	r = l.next()
+	if r == '(' {
+		r = l.next()
+	}
+	// A following line that does not begin with a digit or letter cannot
+	// contradict the enumeration.
+	if !unicode.IsDigit(r) && !unicode.IsLetter(r) {
+		return true
+	}
+	// Otherwise the following line must itself be a valid enumerator.
+	i, ok = l.enumSuffix()
+	if !ok {
+		return false
+	}
+	_, ok = l.enum(r, i)
+	return ok
+}
+
+// Character sets used to recognize enumerators.
+const (
+	enumSuffixes   = ".)"           // characters enumSuffix searches for as the end of an enumerator
+	roman          = "Ii"           // letters isRoman always accepts as roman numerals
+	ambiguousRoman = "VXLCDMvxlcdm" // letters isRoman accepts only when the last enumerator was roman
+)
+
+// enumSuffix locates the first enumeration suffix character ('.' or ')') in
+// the input at or after l.pos. It returns the offset of that character
+// relative to l.pos and whether it is a usable suffix: the character must
+// exist and must be followed by whitespace or by the end of the input.
+func (l *Scanner) enumSuffix() (int, bool) {
+	rest := l.input[l.pos:]
+	idx := strings.IndexAny(rest, enumSuffixes)
+	if idx < 0 {
+		return idx, false
+	}
+	// A suffix glued to further text (for example "1.5") does not end an enumerator.
+	if after := idx + 1; after < len(rest) && !unicode.IsSpace(rune(rest[after])) {
+		return idx, false
+	}
+	return idx, true
+}
+
+// enumType identifies the numbering style of an enumerator.
+type enumType int
+
+// Numbering styles assigned by (*Scanner).enum.
+const (
+	none enumType = iota // zero value: no enumerator type
+	arabic               // enumerator starting with a digit
+	upperAlpha           // single letter that is not lower case
+	lowerAlpha           // single lower-case letter
+	upperRoman           // roman numeral whose first letter is not lower case
+	lowerRoman           // roman numeral whose first letter is lower case
+)
+
+// enum describes an enumerator recognized by the scanner.
+type enum struct {
+	typ  enumType // numbering style of the enumerator
+	val  int      // numeric value; compared with the previous enumerator's value when the types match
+	auto bool     // true when the enumerator came from the auto-numbering marker '#'
+}
+
+// enum interprets the enumerator that begins with r and ends just before the
+// suffix found at offset i from l.pos, and reports whether it is valid.
+//
+// r is expected to be the rune immediately before l.pos: the enumerator text
+// is read from l.input[l.pos-1 : l.pos+i]. The recognized forms are:
+//   - an arabic number (r is a digit); text that is not a valid integer is
+//     rejected,
+//   - a roman numeral (r satisfies isRoman),
+//   - a single letter (i must be 0), valued by its position in the alphabet
+//     so that 'A' and 'a' are 1,
+//   - the auto-enumerator '#' (i must be 0), which takes the type of
+//     l.lastEnum and the value after it.
+//
+// The enumerator is rejected when l.lastEnum was an auto-enumerator and r is
+// not '#', or when it has the same type as l.lastEnum but its value is not
+// exactly one greater.
+func (l *Scanner) enum(r rune, i int) (enum, bool) {
+	var e enum
+	if l.lastEnum.auto && r != '#' {
+		return e, false
+	}
+	switch {
+	case unicode.IsDigit(r):
+		// Reject text such as "2nd." or an out-of-range number instead of
+		// silently using the value Atoi returns alongside its error.
+		n, err := strconv.Atoi(l.input[l.pos-1 : l.pos+i])
+		if err != nil {
+			return e, false
+		}
+		e = enum{typ: arabic, val: n}
+	case unicode.IsLetter(r):
+		switch {
+		case l.isRoman(r):
+			n, ok := parseRoman(l.input[l.pos-1 : l.pos+i])
+			if !ok {
+				return e, false
+			}
+			e = enum{typ: upperRoman, val: n}
+			if unicode.IsLower(r) {
+				e.typ = lowerRoman
+			}
+		case i > 0:
+			// Alphabetic enumerators are exactly one letter long.
+			return e, false
+		default:
+			// Ordinal within the alphabet, measured from 'A' or 'a'.
+			e = enum{typ: upperAlpha, val: int(r-'A') + 1}
+			if unicode.IsLower(r) {
+				e = enum{typ: lowerAlpha, val: int(r-'a') + 1}
+			}
+		}
+	case r == '#' && i == 0:
+		e = enum{typ: l.lastEnum.typ, val: l.lastEnum.val + 1, auto: true}
+	default:
+		return e, false
+	}
+	// Items of the same type must be numbered consecutively.
+	if e.typ == l.lastEnum.typ && e.val-l.lastEnum.val != 1 {
+		return e, false
+	}
+	return e, true
+}
+
+// isRoman reports whether r should be read as a roman numeral digit. 'I' and
+// 'i' always qualify; the remaining roman letters qualify only when the most
+// recent enumeration was itself roman.
+func (l *Scanner) isRoman(r rune) bool {
+	if strings.ContainsRune(roman, r) {
+		return true
+	}
+	if !strings.ContainsRune(ambiguousRoman, r) {
+		return false
+	}
+	prev := l.lastEnum.typ
+	return prev == upperRoman || prev == lowerRoman
+}
+
+// nums maps each upper-case roman digit to its value. numPattern matches a
+// well-formed upper-case roman numeral (up to MMMMCMXCIX); because every part
+// of the pattern is optional it also matches the empty string.
+var (
+	nums       = map[rune]int{'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000}
+	numPattern = regexp.MustCompile("^M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$")
+)
+
+// parseRoman converts a roman numeral to an integer. The input may be upper
+// or lower case. It returns false when s is empty or is not a well-formed
+// roman numeral.
+func parseRoman(s string) (int, bool) {
+	s = strings.ToUpper(s)
+	if s == "" || !numPattern.MatchString(s) {
+		return 0, false
+	}
+	// numPattern guarantees s is ASCII here, so indexing by byte is safe.
+	var sum int
+	for i := 0; i < len(s); i++ {
+		v := nums[rune(s[i])]
+		// A digit placed before a larger one is subtracted, as in IV or CM.
+		if i+1 < len(s) && v < nums[rune(s[i+1])] {
+			sum -= v
+			continue
+		}
+		sum += v
+	}
+	return sum, true
+}
diff --git a/scan/scan.go b/scan/scan.go
@@ -35,6 +35,7 @@ const (
 	Transition                       // Transition separate other body elements
 	Paragraph                        // Paragraph is left-aligned text with no markup
 	Bullet                           // Bullet starts a bullet list
+	Enum                             // Enum starts an enumerated list
 	Comment                          // Comment starts a comment
 	HyperlinkStart                   // HyperlinkStart starts a hyperlink target
 	HyperlinkPrefix                  // HyperlinkPrefix prefixes a hyperlink target name
@@ -78,6 +79,7 @@ type Scanner struct {
 	start     int           // start position of this item
 	token     Token         // token to return to parser
 	types     [2]Type       // most recent scanned types
+	lastEnum  enum          // most recent enumeration
 }
 
 // loadLine reads the next line of input and stores it in (appends it to) the input.
@@ -136,6 +138,7 @@ func (l *Scanner) peek() rune {
 func (l *Scanner) emit(t Type) stateFn {
 	if t == BlankLine {
 		l.line++
+		l.lastEnum = enum{typ: none, val: 0}
 	}
 	text := l.input[l.start:l.pos]
 	l.token = Token{t, l.line, text}
@@ -182,11 +185,13 @@ func (l *Scanner) Next() Token {
 }
 
 const (
-	hyperlinkStart      = ".. _"
-	anonHyperlinkStart  = "__ "
-	anonHyperlinkPrefix = "__:"
-	bullets             = "*+-•‣⁃"
-	adornments          = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"
+	comment                   = ".."
+	hyperlinkStart            = ".. _"
+	anonHyperlinkStart        = "__ "
+	anonHyperlinkPrefix       = "__:"
+	bullets                   = "*+-•‣⁃"
+	adornments                = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"
+	minSection, minTransition = 2, 4
 )
 
 // lexAny scans non-space items.
@@ -200,7 +205,7 @@ func lexAny(l *Scanner) stateFn {
 		return lexSpace
 	case l.isBullet(r):
 		return lexEndOfLine(l, Bullet)
-	case l.isComment(r):
+	case l.isComment():
 		return lexComment
 	case l.isTransition(r):
 		return lexUntilTerminator(l, Transition)
@@ -224,6 +229,8 @@ func lexAny(l *Scanner) stateFn {
 		return lexInlineReferenceClose
 	case l.isTitle():
 		return lexUntilTerminator(l, Title)
+	case l.isEnum(r):
+		return lexEnum
 	default:
 		return lexUntilTerminator(l, Paragraph)
 	}
@@ -345,15 +352,15 @@ func (l *Scanner) isBullet(r rune) bool {
 }
 
 // isComment reports whether the scanner is on a comment.
-func (l *Scanner) isComment(r rune) bool {
-	if r != '.' || l.types[1] == Title {
+func (l *Scanner) isComment() bool {
+	if l.types[1] == Title {
 		return false
 	}
 	s := l.input[l.start:]
 	if strings.HasPrefix(s, hyperlinkStart) && len(s) > len(hyperlinkStart) {
 		return false
 	}
-	return !strings.HasPrefix(s, "...")
+	return strings.HasPrefix(s, comment+" ") || strings.HasPrefix(s, comment+"\n")
 }
 
 // isHyperlinkStart reports whether the scanner is on a hyperlink start.
@@ -436,7 +443,7 @@ func (l *Scanner) isInlineReferenceClose() bool {
 // isTitle reports whether the scanner is on a title.
 func (l *Scanner) isTitle() bool {
 	pos, lastWidth := l.pos, l.lastWidth
-	r := l.next()
+	var r rune
 	for r != eof && r != '\n' {
 		r = l.next()
 	}
@@ -454,8 +461,6 @@ func notSpace(c rune) bool {
 	return !unicode.IsSpace(c)
 }
 
-const minSection = 2
-
 // isSection reports whether the scanner is on a section.
 func (l *Scanner) isSection(r rune) bool {
 	if !strings.ContainsRune(adornments, r) {
@@ -485,8 +490,6 @@ func (l *Scanner) isSectionAdornment(r rune) bool {
 	return r != '\n'
 }
 
-const minTransition = 4
-
 // isTransition reports whether the scanner is on a transition.
 func (l *Scanner) isTransition(r rune) bool {
 	switch l.types[1] {
@@ -502,7 +505,7 @@ func (l *Scanner) isTransition(r rune) bool {
 	if !strings.ContainsRune(adornments, r) {
 		return false
 	}
-	s := strings.TrimSuffix(l.input[l.pos-1:], "\n")
+	s := strings.TrimSuffix(l.input[l.start:], "\n")
 	if len(s) < minTransition || s != strings.Repeat(string(r), len(s)) {
 		return false
 	}