Skip to content

Commit

Permalink
scan: add enumerated list
Browse files Browse the repository at this point in the history
  • Loading branch information
matthewdargan committed Feb 4, 2024
1 parent f999fcd commit 0efedca
Show file tree
Hide file tree
Showing 4 changed files with 709 additions and 28 deletions.
171 changes: 171 additions & 0 deletions scan/enum.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
// Copyright 2023 Matthew P. Dargan. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package scan

import (
"regexp"
"strconv"
"strings"
"unicode"
)

// lexEnum consumes runes up to the end of an enumerator. A newline hands
// control to lexEndOfLine with the Enum type; end of input or any other
// white space emits the Enum token directly.
func lexEnum(l *Scanner) stateFn {
	for {
		r := l.peek()
		if r == '\n' {
			return lexEndOfLine(l, Enum)
		}
		if r == eof || unicode.IsSpace(r) {
			return l.emit(Enum)
		}
		// Still inside the enumerator: consume the rune and keep going.
		l.next()
	}
}

// isEnum reports whether the scanner is on an enumeration.
//
// r is the rune the caller has already read; an opening '(' is skipped before
// the enumerator itself is examined. The scanner position (l.pos and
// l.lastWidth) is restored before returning, so the check does not consume
// input. On success of the first-line check the parsed enumerator is stored
// in l.lastEnum.
func (l *Scanner) isEnum(r rune) bool {
// Never report an enumeration when l.types holds BlankLine followed by Paragraph.
if l.types[0] == BlankLine && l.types[1] == Paragraph {
return false
}
// Remember where we are so the lookahead below can be undone on return.
pos, lastWidth := l.pos, l.lastWidth
defer func() {
l.pos, l.lastWidth = pos, lastWidth
}()
// Skip the opening parenthesis of a "(1)"-style enumerator.
if r == '(' {
r = l.next()
}
// The enumerator must end in a suffix character accepted by enumSuffix.
i, ok := l.enumSuffix()
if !ok {
return false
}
// The text before the suffix must parse as an enumerator.
e, ok := l.enum(r, i)
if !ok {
return false
}
// NOTE(review): l.lastEnum is assigned before the following line is checked
// and is not restored by the deferred function, so it stays set even when
// isEnum returns false below — confirm that this is intended.
l.lastEnum = e
// Advance to the end of the current line (or the end of the input).
for r != eof && r != '\n' {
r = l.next()
}
if r == eof {
return true
}
// Look at the start of the following line, again skipping an opening '('.
r = l.next()
if r == '(' {
r = l.next()
}
// A following line that does not start with a digit or a letter is not
// checked any further.
if !unicode.IsDigit(r) && !unicode.IsLetter(r) {
return true
}
// Otherwise the following line must itself be an enumerator that is
// accepted by enum relative to l.lastEnum.
i, ok = l.enumSuffix()
if !ok {
return false
}
_, ok = l.enum(r, i)
return ok
}

// Character sets used to recognize enumerators.
const (
// enumSuffixes holds the characters that may terminate an enumerator;
// enumSuffix searches for any one of them.
enumSuffixes = ".)"
// roman holds the letters that isRoman always treats as roman numerals.
roman = "Ii"
// ambiguousRoman holds the letters that isRoman treats as roman numerals
// only when the most recent enumeration was upperRoman or lowerRoman.
ambiguousRoman = "VXLCDMvxlcdm"
)

// enumSuffix locates the first enumeration suffix character at or after the
// scanner's current position. The returned index is relative to l.pos and is
// negative when no suffix exists. The boolean is true only when a suffix was
// found and it is either the last byte of the input or is followed by white
// space.
func (l *Scanner) enumSuffix() (int, bool) {
	rest := l.input[l.pos:]
	idx := strings.IndexAny(rest, enumSuffixes)
	switch {
	case idx < 0:
		return idx, false
	case idx+1 >= len(rest):
		// The suffix is the final byte of the input.
		return idx, true
	}
	return idx, unicode.IsSpace(rune(rest[idx+1]))
}

// enumType identifies the numbering style of an enumerator.
type enumType int

// Enumeration types.
const (
none enumType = iota // zero value: no enumeration type
arabic // enumerator starting with a digit, parsed as a decimal number
upperAlpha // single upper-case letter
lowerAlpha // single lower-case letter
upperRoman // upper-case roman numeral
lowerRoman // lower-case roman numeral
)

// enum describes one parsed enumerator.
type enum struct {
typ enumType // numbering style
val int // numeric value; for alphabetic enumerators this is the letter's offset from '0'
auto bool // set when the enumerator was written as '#'
}

// enum interprets the enumerator that starts with r and ends just before the
// suffix found i bytes past the scanner's current position. The caller must
// already have consumed r, so the enumerator text is l.input[l.pos-1 : l.pos+i];
// the slice assumes r occupies a single byte.
//
// It returns the parsed enumerator and true when the text is an arabic number,
// a roman numeral, a single letter or the auto-enumerator '#', and, if it has
// the same type as l.lastEnum, its value is exactly one greater. Once an
// auto-enumerator has been seen, only '#' is accepted. In every other case it
// returns false.
func (l *Scanner) enum(r rune, i int) (enum, bool) {
	var e enum
	if l.lastEnum.auto && r != '#' {
		return e, false
	}
	switch {
	case unicode.IsDigit(r):
		// Text that merely begins with a digit ("2023 was a good year.") is
		// not an enumerator, so a parse failure must reject it instead of
		// silently producing the value 0.
		n, err := strconv.Atoi(l.input[l.pos-1 : l.pos+i])
		if err != nil {
			return e, false
		}
		e = enum{typ: arabic, val: n}
	case unicode.IsLetter(r):
		switch {
		case l.isRoman(r):
			n, ok := parseRoman(l.input[l.pos-1 : l.pos+i])
			if !ok {
				return e, false
			}
			e = enum{typ: upperRoman, val: n}
			if unicode.IsLower(r) {
				e.typ = lowerRoman
			}
		case i > 0:
			// More than one character that is not a roman numeral.
			return e, false
		default:
			// Single letter. The value is an offset from '0', so only the
			// difference between two values of the same type is meaningful.
			e = enum{typ: upperAlpha, val: int(r - '0')}
			if unicode.IsLower(r) {
				e.typ = lowerAlpha
			}
		}
	case r == '#' && i == 0:
		// The auto-enumerator continues whatever sequence came before it.
		e = enum{typ: l.lastEnum.typ, val: l.lastEnum.val + 1, auto: true}
	default:
		return e, false
	}
	// Enumerators of the same type must be consecutive.
	if e.typ == l.lastEnum.typ && e.val-l.lastEnum.val != 1 {
		return e, false
	}
	return e, true
}

// isRoman reports whether r should be read as a roman numeral. The letters in
// roman always qualify; the letters in ambiguousRoman qualify only while the
// most recent enumeration is itself roman (upper or lower case).
func (l *Scanner) isRoman(r rune) bool {
	if strings.ContainsRune(roman, r) {
		return true
	}
	if !strings.ContainsRune(ambiguousRoman, r) {
		return false
	}
	switch l.lastEnum.typ {
	case upperRoman, lowerRoman:
		return true
	}
	return false
}

var (
	// nums maps each upper-case roman numeral symbol to its value.
	nums = map[rune]int{'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000}
	// numPattern matches well-formed upper-case roman numerals up to MMMMCMXCIX.
	numPattern = regexp.MustCompile("^M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$")
)

// parseRoman converts a roman numeral to an integer. The input is matched
// case-insensitively. It returns false when s is empty or is not a
// well-formed roman numeral.
func parseRoman(s string) (int, bool) {
	s = strings.ToUpper(s)
	// numPattern also matches the empty string, which is not a numeral.
	if s == "" || !numPattern.MatchString(s) {
		return 0, false
	}
	// After a successful match s contains only single-byte symbols, so it is
	// safe to index it by byte.
	var sum int
	for i := 0; i < len(s); i++ {
		v := nums[rune(s[i])]
		// Subtractive notation: a symbol placed before a larger one (IV, IX,
		// XL, XC, CD, CM) is subtracted rather than added.
		if i+1 < len(s) && v < nums[rune(s[i+1])] {
			sum -= v
			continue
		}
		sum += v
	}
	return sum, true
}
33 changes: 18 additions & 15 deletions scan/scan.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ const (
Transition // Transition separates other body elements
Paragraph // Paragraph is left-aligned text with no markup
Bullet // Bullet starts a bullet list
Enum // Enum starts an enumerated list
Comment // Comment starts a comment
HyperlinkStart // HyperlinkStart starts a hyperlink target
HyperlinkPrefix // HyperlinkPrefix prefixes a hyperlink target name
Expand Down Expand Up @@ -78,6 +79,7 @@ type Scanner struct {
start int // start position of this item
token Token // token to return to parser
types [2]Type // most recent scanned types
lastEnum enum // most recent enumeration
}

// loadLine reads the next line of input and stores it in (appends it to) the input.
Expand Down Expand Up @@ -136,6 +138,7 @@ func (l *Scanner) peek() rune {
func (l *Scanner) emit(t Type) stateFn {
if t == BlankLine {
l.line++
l.lastEnum = enum{typ: none, val: 0}
}
text := l.input[l.start:l.pos]
l.token = Token{t, l.line, text}
Expand Down Expand Up @@ -182,11 +185,13 @@ func (l *Scanner) Next() Token {
}

const (
hyperlinkStart = ".. _"
anonHyperlinkStart = "__ "
anonHyperlinkPrefix = "__:"
bullets = "*+-•‣⁃"
adornments = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"
comment = ".."
hyperlinkStart = ".. _"
anonHyperlinkStart = "__ "
anonHyperlinkPrefix = "__:"
bullets = "*+-•‣⁃"
adornments = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"
minSection, minTransition = 2, 4
)

// lexAny scans non-space items.
Expand All @@ -200,7 +205,7 @@ func lexAny(l *Scanner) stateFn {
return lexSpace
case l.isBullet(r):
return lexEndOfLine(l, Bullet)
case l.isComment(r):
case l.isComment():
return lexComment
case l.isTransition(r):
return lexUntilTerminator(l, Transition)
Expand All @@ -224,6 +229,8 @@ func lexAny(l *Scanner) stateFn {
return lexInlineReferenceClose
case l.isTitle():
return lexUntilTerminator(l, Title)
case l.isEnum(r):
return lexEnum
default:
return lexUntilTerminator(l, Paragraph)
}
Expand Down Expand Up @@ -345,15 +352,15 @@ func (l *Scanner) isBullet(r rune) bool {
}

// isComment reports whether the scanner is on a comment.
func (l *Scanner) isComment(r rune) bool {
if r != '.' || l.types[1] == Title {
func (l *Scanner) isComment() bool {
if l.types[1] == Title {
return false
}
s := l.input[l.start:]
if strings.HasPrefix(s, hyperlinkStart) && len(s) > len(hyperlinkStart) {
return false
}
return !strings.HasPrefix(s, "...")
return strings.HasPrefix(s, comment+" ") || strings.HasPrefix(s, comment+"\n")
}

// isHyperlinkStart reports whether the scanner is on a hyperlink start.
Expand Down Expand Up @@ -436,7 +443,7 @@ func (l *Scanner) isInlineReferenceClose() bool {
// isTitle reports whether the scanner is on a title.
func (l *Scanner) isTitle() bool {
pos, lastWidth := l.pos, l.lastWidth
r := l.next()
var r rune
for r != eof && r != '\n' {
r = l.next()
}
Expand All @@ -454,8 +461,6 @@ func notSpace(c rune) bool {
return !unicode.IsSpace(c)
}

const minSection = 2

// isSection reports whether the scanner is on a section.
func (l *Scanner) isSection(r rune) bool {
if !strings.ContainsRune(adornments, r) {
Expand Down Expand Up @@ -485,8 +490,6 @@ func (l *Scanner) isSectionAdornment(r rune) bool {
return r != '\n'
}

const minTransition = 4

// isTransition reports whether the scanner is on a transition.
func (l *Scanner) isTransition(r rune) bool {
switch l.types[1] {
Expand All @@ -502,7 +505,7 @@ func (l *Scanner) isTransition(r rune) bool {
if !strings.ContainsRune(adornments, r) {
return false
}
s := strings.TrimSuffix(l.input[l.pos-1:], "\n")
s := strings.TrimSuffix(l.input[l.start:], "\n")
if len(s) < minTransition || s != strings.Repeat(string(r), len(s)) {
return false
}
Expand Down
Loading

0 comments on commit 0efedca

Please sign in to comment.