Skip to content

Commit

Permalink
fix minor bugs, add a full file text search test
Browse files Browse the repository at this point in the history
  • Loading branch information
rhaeguard committed Sep 22, 2023
1 parent 416b554 commit 668d7ac
Show file tree
Hide file tree
Showing 7 changed files with 3,410 additions and 19 deletions.
13 changes: 8 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,15 @@ a very simple regex engine written in go.
- [x] `\` escape character
- [x] support special characters - context dependent
- [x] better error handling in the API
- [ ] ability to work on multi-line strings
- [ ] `.` should not match the newline - `\n`
- [ ] `$` should match the newline - `\n`
- [ ] multiple full matches
- [x] ability to work on multi-line strings (tested on [Alice in Wonderland](./lib_testdata) text corpus)
- [x] `.` should not match the newline - `\n`
- [x] `$` should match the newline - `\n`
- [x] multiple full matches

## notes

- `\` escape turns the next character into a literal; special combinations such as `\d` for digits, `\b` for backspace, etc. are not supported
- numeric groups `\n` only support single digit references, so `\10` will be interpreted as the first capture group followed by a literal `0`
- numeric groups `\n` only support single digit references, so `\10` will be interpreted as the first capture group followed by a literal `0`

## credits
- [Alice in Wonderland, Lewis Carroll, Project Gutenberg](https://www.gutenberg.org/ebooks/11)
20 changes: 15 additions & 5 deletions check.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,17 +50,27 @@ func (s *State) check(inputString string, pos int, started bool, ctx *regexCheck
// set the end to the current position
// a group can have 2 different names: numeric (\1) and user-set (\k<animal>)
for _, groupName := range capturedGroup.names {
ctx.groups[groupName].end = pos
if ctx.groups[groupName].end < pos {
// only update if the new position is greater
ctx.groups[groupName].end = pos
}
}
}
}
}

currentChar := getChar(inputString, pos)

// if it needs to be the end of the text, and it isn't
// or if it needs to be the start of the text and it isn't
if (s.endOfText && currentChar != endOfText) || (s.startOfText && currentChar != startOfText) {
// the current character must be either the end of the text or
// a newline to be valid, otherwise the check fails
if s.endOfText && (currentChar != endOfText && currentChar != newline) {
return false
}

previousChar := getChar(inputString, pos-1)
// either the current character must be the start of the text or
// the previous character must be a newline, otherwise the check fails
if s.startOfText && (currentChar != startOfText && previousChar != newline) {
return false
}

Expand Down Expand Up @@ -96,7 +106,7 @@ func (s *State) check(inputString string, pos int, started bool, ctx *regexCheck
nextState := s.nextStateWith(currentChar)
// if there are no transitions for the current char as is
// then see if there's a transition for any char, i.e. dot (.) sign
if nextState == nil && currentChar != endOfText {
if nextState == nil && (currentChar != endOfText && currentChar != newline) {
nextState = s.nextStateWith(anyChar)
}

Expand Down
1 change: 0 additions & 1 deletion error.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ type ParseErrorCode string

// Error code constants.
// NOTE(review): only SyntaxError is declared with the ParseErrorCode type; the
// other names have no explicit type, so they are untyped string constants —
// confirm whether they should be typed as ParseErrorCode as well.
const (
SyntaxError ParseErrorCode = "SyntaxError"
Unimplemented = "Unimplemented"
CompilationError = "CompilationError"
)

Expand Down
46 changes: 42 additions & 4 deletions lib_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package rgx

import (
"fmt"
"os"
"testing"
)

Expand Down Expand Up @@ -150,36 +151,73 @@ func TestFindMatches(t *testing.T) {
{"0": "123-678-99-32"},
{"0": "239-987-63-21"},
}},
// multiline extracts
{`[0-9]{3}-[0-9]{3}-[0-9]{2}-[0-9]{2}$`, "hi 123-678-99-32\n is my number, so is 239-987-63-21", []map[string]string{
{"0": "123-678-99-32"},
{"0": "239-987-63-21"},
}},
}

for _, test := range data {
testName := fmt.Sprintf("%s-%s-%v", test.regexString, test.input, test.expected)
t.Run(testName, func(t *testing.T) {
pattern, err := Compile(test.regexString)
if err != nil {
t.Errorf(err.Error())
t.Fatalf(err.Error())
}
results := pattern.FindMatches(test.input)
if len(results) != len(test.expected) {
t.Fail()
t.Fatalf("must have expected number of results: expected %d got %d", len(test.expected), len(results))
}
for i, expected := range test.expected {
for k, v := range expected {
if results[i].groups[k] != v {
t.Fail()
t.Fatalf("expected '%s' got: '%s'", v, results[i].groups[k])
}
}
}
})
}
}

// TestFindMatchesInTextFile reads the text stored in "lib_testdata" and, for
// each pattern in the table, checks that FindMatches reports the expected
// number of matches over the whole file content.
func TestFindMatchesInTextFile(t *testing.T) {
	raw, err := os.ReadFile("lib_testdata")
	if err != nil {
		t.Fatalf("could not open the file: %v", err)
	}
	content := string(raw)

	data := []struct {
		regexString             string
		expectedOccurrenceCount int
	}{
		{`door`, 33},
		{`door `, 14},
		{`[a-z]+-[a-z]+`, 121},
	}

	for _, test := range data {
		testName := fmt.Sprintf("alice in wonderland: regex='%s'", test.regexString)
		t.Run(testName, func(t *testing.T) {
			// Compile inside the subtest so that a pattern that fails to
			// compile only fails its own case instead of aborting the
			// remaining table entries.
			pattern, err := Compile(test.regexString)
			if err != nil {
				// Use an explicit format string: the error text may contain
				// '%' and must not be interpreted as formatting verbs.
				t.Fatalf("could not compile %q: %v", test.regexString, err)
			}

			results := pattern.FindMatches(content)
			if len(results) != test.expectedOccurrenceCount {
				t.Fatalf("expected %d, got: %d", test.expectedOccurrenceCount, len(results))
			}
		})
	}
}

func TestCheckForDev(t *testing.T) {
var data = []struct {
regexString, input string
expected bool
}{
{`[[\]-]+$`, `]-[]-[]-[[]]--[]`, true},
{`\h+`, ``, false},
}

for _, test := range data {
Expand Down
Loading

0 comments on commit 668d7ac

Please sign in to comment.