Skip to content

Commit

Permalink
feat: Add utf8 support to Pattern Lexer to support utf8 chars (#13085)
Browse files Browse the repository at this point in the history
  • Loading branch information
benclive authored May 31, 2024
1 parent 21dd4af commit f6f8bab
Show file tree
Hide file tree
Showing 7 changed files with 252 additions and 174 deletions.
5 changes: 4 additions & 1 deletion pkg/logql/log/pattern/lexer.go
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
package pattern

import "unicode/utf8"

type lexer struct {
data []byte
p, pe, cs int
Expand Down Expand Up @@ -57,6 +59,7 @@ func (lex *lexer) identifier(out *exprSymType) (int, error) {

// literal emits a LITERAL token for the text the scanner just matched and
// stores the decoded rune in out.literal. The bytes lex.data[lex.ts:lex.te]
// are decoded as one UTF-8 sequence, so a multi-byte character becomes a
// single rune instead of being truncated to its first byte.
// nolint
func (lex *lexer) literal(out *exprSymType) (int, error) {
	// The decoded width is discarded; only the rune itself is stored.
	// NOTE(review): DecodeRune yields utf8.RuneError for malformed or empty
	// input. Presumably the grammar only hands well-formed sequences to this
	// action — confirm against lexer.rl.
	r, _ := utf8.DecodeRune(lex.data[lex.ts:lex.te])
	out.literal = r
	return LITERAL, nil
}
16 changes: 15 additions & 1 deletion pkg/logql/log/pattern/lexer.rl
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,25 @@ package pattern
}
}%%

%%{
# utf8 matches exactly one well-formed UTF-8 encoded code point (1 to 4 bytes).
# Each alternative is a lead-byte range followed by the ranges permitted for
# its continuation bytes.
utf8 = (
# 1 byte: ASCII.
0x00..0x7F |
# 2 bytes. Lead bytes 0xC0 and 0xC1 are left out: they could only form overlong encodings.
0xC2..0xDF 0x80..0xBF |
# 3 bytes. After 0xE0 the second byte starts at 0xA0, which rules out overlong forms.
0xE0 0xA0..0xBF 0x80..0xBF |
0xE1..0xEC 0x80..0xBF 0x80..0xBF |
# After 0xED the second byte stops at 0x9F, which rules out the surrogates U+D800..U+DFFF.
0xED 0x80..0x9F 0x80..0xBF |
0xEE..0xEF 0x80..0xBF 0x80..0xBF |
# 4 bytes. After 0xF0 the second byte starts at 0x90, which rules out overlong forms.
0xF0 0x90..0xBF 0x80..0xBF 0x80..0xBF |
0xF1..0xF3 0x80..0xBF 0x80..0xBF 0x80..0xBF |
# After 0xF4 the second byte stops at 0x8F, capping code points at U+10FFFF.
0xF4 0x80..0x8F 0x80..0xBF 0x80..0xBF
);
}%%

// LEXER_ERROR has the value 0.
// NOTE(review): presumably the token Lex returns when scanning fails; its use
// is not visible here — confirm against Lex and the parser.
const LEXER_ERROR = 0

%%{
# identifier: a name wrapped in angle brackets. It starts with a letter or '_'
# and continues with letters, digits, or '_'.
identifier = '<' (alpha| '_') (alnum | '_' )* '>';
# literal: one UTF-8 encoded code point, so a multi-byte character is scanned
# as a single token rather than byte by byte. Defined once only: the earlier
# `literal = any;` definition is dropped because a machine name may not be
# defined twice.
literal = utf8;
}%%

func (lex *lexer) Lex(out *exprSymType) int {
Expand Down
Loading

0 comments on commit f6f8bab

Please sign in to comment.