Skip to content

Commit

Permalink
minor changes and bugfix
Browse files Browse the repository at this point in the history
  • Loading branch information
rhaeguard committed Sep 20, 2023
1 parent 988be5a commit aed1d8b
Show file tree
Hide file tree
Showing 3 changed files with 97 additions and 90 deletions.
11 changes: 8 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ a very simple regex engine written in go.
- [x] `[^ ]` bracket negation notation
- [x] better handling of the bracket expressions: e.g., `[ab-exy12]`
- [x] special characters in the bracket
- [ ] support escape character
- [x] support escape character
- [x] quantifiers
- [x] `*` zero or more times
- [x] `+` one or more times
Expand All @@ -24,10 +24,15 @@ a very simple regex engine written in go.
- [x] `( )` capturing group or subexpression
- [x] `\n` backreference, where `n` is in `[0, 9]`, e.g., `(dog)\1`
- [x] `\k<name>` named backreference, e.g., `(?<animal>dog)\k<animal>`
- [ ] `\` escape character
- [ ] support special characters - context dependant
- [x] `\` escape character
- [x] support special characters (context-dependent)
- [ ] better error handling in the API
- [ ] ability to work on multi-line strings
- [ ] `.` should not match the newline - `\n`
- [ ] `$` should match the newline - `\n`
- [ ] multiple full matches

## notes

- `\` turns the character that follows it into a literal; special escape sequences such as `\d` for digits, `\b` for backspace, etc. are not supported
- numeric backreferences `\n` only support single-digit references, so `\10` is interpreted as a reference to the first capture group followed by a literal `0`
4 changes: 2 additions & 2 deletions lib_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,7 @@ func TestCheck(t *testing.T) {
// escape chars
{`\\\^\$\.\|\?\*\+\(\)\{\}-hello`, `\^$.|?*+(){}-hello`, true},
{`[[\]-]+`, `]-[]-[]-[[]]--[]`, true},
{`[[\]-]+$`, `]-[]-[]-[[]]--[]\`, false},
}

for _, test := range data {
Expand All @@ -142,8 +143,7 @@ func TestCheckForDev(t *testing.T) {
regexString, input string
expected bool
}{
//{`[[\]-]+$`, `]-[]-[]-[[]]--[]\`, true},
{`[[\]-]+`, `]-[]-[]-[[]]--[]`, false},
{`[[\]-]+$`, `]-[]-[]-[[]]--[]`, true},
}

for _, test := range data {
Expand Down
172 changes: 87 additions & 85 deletions parse.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,12 +62,11 @@ func (p *parsingContext) push(token regexToken) {
p.tokens = append(p.tokens, token)
}

func (p *parsingContext) getLast(count int) []regexToken {
return p.tokens[len(p.tokens)-count:]
}

func (p *parsingContext) removeLast(count int) {
// removeLast detaches the trailing count tokens from the context and returns
// them. The tokens that stay behind are copied into a freshly allocated slice,
// so the returned elements do not share storage with p.tokens afterwards.
// It panics if count exceeds the number of stored tokens.
func (p *parsingContext) removeLast(count int) []regexToken {
	cut := len(p.tokens) - count
	popped := p.tokens[cut:]

	kept := make([]regexToken, cut)
	copy(kept, p.tokens[:cut])
	p.tokens = kept

	return popped
}

func isAlphabetUppercase(ch uint8) bool {
Expand All @@ -83,23 +82,39 @@ func isNumeric(ch uint8) bool {
}

var specialChars = map[uint8]bool{
'&': true,
'*': true,
' ': true,
//'{': true,
//'}': true,
'[': true,
']': true,
'(': true,
')': true,
',': true,
'=': true,
'-': true,
//'.': true,
'+': true,
';': true,
//'\'': true,
'/': true,
'&': true,
'*': true,
' ': true,
'{': true,
'}': true,
'[': true,
']': true,
'(': true,
')': true,
',': true,
'=': true,
'-': true,
'.': true,
'+': true,
';': true,
'\\': true,
'/': true,
}

// mustBeEscapedCharacters is a set (every member maps to true) of the
// characters that, going by its name, need a preceding backslash to be used
// as literals.
// NOTE(review): the code that consults this set is not visible here — confirm
// the exact usage against parseBackslash.
var mustBeEscapedCharacters = func() map[uint8]bool {
	// One byte per member; a raw string keeps the backslash readable.
	const members = `[\^$.|?*+(){}`

	set := make(map[uint8]bool, len(members))
	for i := 0; i < len(members); i++ {
		set[members[i]] = true
	}
	return set
}()

func isSpecialChar(ch uint8) bool {
Expand All @@ -114,20 +129,20 @@ func isLiteral(ch uint8) bool {
isSpecialChar(ch)
}

func isDot(ch uint8) bool {
// isWildcard reports whether ch is the '.' character.
func isWildcard(ch uint8) bool {
	switch ch {
	case '.':
		return true
	default:
		return false
	}
}

// QuantifierInfinity is the sentinel stored as a quantifier's upper bound when
// the repetition has no maximum (as for `*`, `+` and `{n,}`).
const QuantifierInfinity = -1

var quantifiers = map[uint8][]int{
// quantifiersWithBounds maps each single-character quantifier to its
// {min, max} repetition bounds. QuantifierInfinity marks an open upper bound.
var quantifiersWithBounds = map[uint8][]int{
	'?': []int{0, 1},                  // optional
	'+': []int{1, QuantifierInfinity}, // at least once
	'*': []int{0, QuantifierInfinity}, // any number of times
}

func isQuantifier(ch uint8) bool {
_, ok := quantifiers[ch]
_, ok := quantifiersWithBounds[ch]
return ok
}

Expand All @@ -146,11 +161,12 @@ func parseBracket(regexString string, memory *parsingContext) {
ch := regexString[memory.loc()]

if ch == '-' {
nextChar := regexString[memory.adv()] // TODO: this might fail if we are at the end of the string
nextChar := regexString[memory.loc()+1] // TODO: this might fail if we are at the end of the string
// if - is the first character OR is the last character, it's a literal
if len(pieces) == 0 || nextChar == ']' {
pieces = append(pieces, fmt.Sprintf("%c", ch))
} else {
memory.adv() // to process the nextChar's position
piece := pieces[len(pieces)-1]
if len(piece) == 1 {
prevChar := piece[0]
Expand Down Expand Up @@ -280,16 +296,15 @@ func parseGroupUncaptured(regexString string, memory *parsingContext) {
}

func parseQuantifier(ch uint8, memory *parsingContext) {
bounds := quantifiers[ch]
bounds := quantifiersWithBounds[ch]
token := regexToken{
tokenType: Quantifier,
value: quantifier{
min: bounds[0],
max: bounds[1],
value: memory.getLast(1),
value: memory.removeLast(1),
},
}
memory.removeLast(1)
memory.push(token)
}

Expand All @@ -311,72 +326,39 @@ func processChar(regexString string, memory *parsingContext, ch uint8) {
} else if isQuantifier(ch) {
parseQuantifier(ch, memory)
} else if ch == '{' {
startPos := memory.adv()
var endPos = memory.loc()
for regexString[endPos] != '}' {
endPos++
}
memory.advTo(endPos)
expr := regexString[startPos:endPos]
pieces := strings.Split(expr, ",")

var start int
var end int

if len(pieces) == 1 {
start, _ = strconv.Atoi(pieces[0])
end = start
} else if len(pieces) == 2 {
start, _ = strconv.Atoi(pieces[0])
if pieces[1] == "" {
end = QuantifierInfinity
} else {
end, _ = strconv.Atoi(pieces[1])
}
}

token := regexToken{
tokenType: Quantifier,
value: quantifier{
min: start,
max: end,
value: memory.getLast(1),
},
}
memory.removeLast(1)
memory.push(token)
parseBoundedQuantifier(regexString, memory)
} else if ch == '\\' { // escaped backslash
parseBackslash(regexString, memory)
} else if isLiteral(ch) {
parseLiteral(ch, memory)
} else if isDot(ch) {
} else if isWildcard(ch) {
token := regexToken{
tokenType: Wildcard,
value: ch,
}
memory.push(token)
} else if isLiteral(ch) {
parseLiteral(ch, memory)
} else if ch == '|' {
// everything to the left of the pipe in this specific "parsingContext"
// is considered as the left side of the OR
left := regexToken{
tokenType: GroupUncaptured,
value: memory.getLast(len(memory.tokens)),
value: memory.removeLast(len(memory.tokens)),
}

memory.adv() // to not get stuck in the pipe char
parseGroupUncaptured(regexString, memory)
right := memory.getLast(1)[0] // TODO: better error handling?
right := memory.removeLast(1)[0] // TODO: better error handling?

// clear the memory as we do not need
// any of these tokens anymore
memory.removeLast(len(memory.tokens))
//memory.removeLast(len(memory.tokens))

token := regexToken{
tokenType: Or,
value: []regexToken{left, right},
}
memory.push(token)
} else if ch == '^' || ch == '$' {
} else if ch == '^' || ch == '$' { // anchors
var tokenType = regexTokenType(TextBeginning)

if ch == '$' {
Expand All @@ -391,20 +373,40 @@ func processChar(regexString string, memory *parsingContext, ch uint8) {
}
}

var mustBeEscapedCharacters = map[uint8]bool{
'[': true,
'\\': true,
'^': true,
'$': true,
'.': true,
'|': true,
'?': true,
'*': true,
'+': true,
'(': true,
')': true,
'{': true,
'}': true,
// parseBoundedQuantifier parses a brace quantifier (`{n}`, `{n,}` or `{n,m}`)
// and replaces the most recently pushed token with a Quantifier token that
// wraps it together with the parsed bounds.
//
// regexString is the full pattern; memory is the parsing context whose cursor
// is advanced to the closing '}' and whose last token becomes the quantified
// value.
// NOTE(review): the visible call site invokes this when the current character
// is '{'; it is assumed memory.adv() then yields the index of the first
// character inside the braces — confirm against parsingContext.
//
// It panics if the pattern contains no closing '}' after the cursor.
func parseBoundedQuantifier(regexString string, memory *parsingContext) {
	startPos := memory.adv()
	scanFrom := memory.loc()

	// Locate the closing brace explicitly. Without this check an unterminated
	// '{' would walk past the end of the pattern and fail with an opaque
	// index-out-of-range panic.
	offset := strings.IndexByte(regexString[scanFrom:], '}')
	if offset < 0 {
		panic("unterminated bounded quantifier: missing '}'")
	}
	endPos := scanFrom + offset
	memory.advTo(endPos)

	expr := regexString[startPos:endPos]
	pieces := strings.Split(expr, ",")

	var start int
	var end int

	// Atoi errors are ignored on purpose: strconv.Atoi yields 0 for a bound it
	// cannot parse, which is what makes an omitted lower bound (`{,m}`) read as
	// "0 to m". Expressions with more than one comma leave both bounds at 0.
	switch len(pieces) {
	case 1: // {n}: exactly n times
		start, _ = strconv.Atoi(pieces[0])
		end = start
	case 2: // {n,} or {n,m}
		start, _ = strconv.Atoi(pieces[0])
		if pieces[1] == "" {
			end = QuantifierInfinity
		} else {
			end, _ = strconv.Atoi(pieces[1])
		}
	}

	token := regexToken{
		tokenType: Quantifier,
		value: quantifier{
			min:   start,
			max:   end,
			value: memory.removeLast(1), // the token being repeated
		},
	}
	memory.push(token)
}

func parseBackslash(regexString string, memory *parsingContext) {
Expand Down Expand Up @@ -441,7 +443,7 @@ func parseBackslash(regexString string, memory *parsingContext) {
memory.push(token)
memory.adv()
} else {
panic("")
panic("unimplemented")
}
}

Expand Down

0 comments on commit aed1d8b

Please sign in to comment.