Skip to content

Commit

Permalink
LogQL: Pattern Parser (#3837)
Browse files Browse the repository at this point in the history
* The beginning of a fun story.

Signed-off-by: Cyril Tovena <cyril.tovena@gmail.com>

* Working on adding ragel.

Signed-off-by: Cyril Tovena <cyril.tovena@gmail.com>

* Adding AST parsing with Yacc and Ragel.

Signed-off-by: Cyril Tovena <cyril.tovena@gmail.com>

* Got a pattern parser working.

Reworking ast to work with bytes and not runes.

Signed-off-by: Cyril Tovena <cyril.tovena@gmail.com>

* Setup tests and the matches algorithm.

Signed-off-by: Cyril Tovena <cyril.tovena@gmail.com>

* moar tests case.

Signed-off-by: Cyril Tovena <cyril.tovena@gmail.com>

* Add some validation for the pattern expression.

Signed-off-by: Cyril Tovena <cyril.tovena@gmail.com>

* Hooking to LogQL + performance boost.

Signed-off-by: Cyril Tovena <cyril.tovena@gmail.com>

* Adds documentation

Signed-off-by: Cyril Tovena <cyril.tovena@gmail.com>

* Improve bound check.

Signed-off-by: Cyril Tovena <cyril.tovena@gmail.com>

* Removes generated files from being linted.

Signed-off-by: Cyril Tovena <cyril.tovena@gmail.com>

* Update docs/sources/logql/_index.md

Co-authored-by: Danny Kopping <dannykopping@gmail.com>

* Update docs/sources/logql/_index.md

Co-authored-by: Danny Kopping <dannykopping@gmail.com>

* Review feedback

Signed-off-by: Cyril Tovena <cyril.tovena@gmail.com>

* Update docs/sources/logql/_index.md

Co-authored-by: Danny Kopping <dannykopping@gmail.com>

* Update docs/sources/logql/_index.md

Co-authored-by: Karen Miller <84039272+KMiller-Grafana@users.noreply.github.com>

* Update docs/sources/logql/_index.md

Co-authored-by: Karen Miller <84039272+KMiller-Grafana@users.noreply.github.com>

* Update docs/sources/logql/_index.md

Co-authored-by: Karen Miller <84039272+KMiller-Grafana@users.noreply.github.com>

* Update docs/sources/logql/_index.md

Co-authored-by: Karen Miller <84039272+KMiller-Grafana@users.noreply.github.com>

* Update docs/sources/logql/_index.md

Co-authored-by: Karen Miller <84039272+KMiller-Grafana@users.noreply.github.com>

* Update docs/sources/logql/_index.md

Co-authored-by: Karen Miller <84039272+KMiller-Grafana@users.noreply.github.com>

* Update docs/sources/logql/_index.md

Co-authored-by: Karen Miller <84039272+KMiller-Grafana@users.noreply.github.com>

* Update docs/sources/logql/_index.md

Co-authored-by: Karen Miller <84039272+KMiller-Grafana@users.noreply.github.com>

* Update docs/sources/logql/_index.md

Co-authored-by: Karen Miller <84039272+KMiller-Grafana@users.noreply.github.com>

* Update docs/sources/logql/_index.md

Co-authored-by: Karen Miller <84039272+KMiller-Grafana@users.noreply.github.com>

* Docs suggestions

Signed-off-by: Cyril Tovena <cyril.tovena@gmail.com>

Co-authored-by: Danny Kopping <dannykopping@gmail.com>
Co-authored-by: Karen Miller <84039272+KMiller-Grafana@users.noreply.github.com>
  • Loading branch information
3 people authored Jun 15, 2021
1 parent 6d026d2 commit 59bb6d3
Show file tree
Hide file tree
Showing 25 changed files with 1,879 additions and 476 deletions.
8 changes: 4 additions & 4 deletions .drone/drone.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,28 +12,28 @@ workspace:

steps:
- name: test
image: grafana/loki-build-image:0.14.0
image: grafana/loki-build-image:0.15.0
commands:
- make BUILD_IN_CONTAINER=false test
depends_on:
- clone

- name: lint
image: grafana/loki-build-image:0.14.0
image: grafana/loki-build-image:0.15.0
commands:
- make BUILD_IN_CONTAINER=false lint
depends_on:
- clone

- name: check-generated-files
image: grafana/loki-build-image:0.14.0
image: grafana/loki-build-image:0.15.0
commands:
- make BUILD_IN_CONTAINER=false check-generated-files
depends_on:
- clone

- name: check-mod
image: grafana/loki-build-image:0.14.0
image: grafana/loki-build-image:0.15.0
commands:
- make BUILD_IN_CONTAINER=false check-mod
depends_on:
Expand Down
3 changes: 3 additions & 0 deletions .golangci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,9 @@ run:
# no need to include all autogenerated files, we confidently recognize
# autogenerated files. If it's not please let us know.
skip-files:
- .*.pb.go
- .*.y.go
- .*.rl.go
# output configuration options
output:
# colored-line-number|line-number|json|tab|checkstyle, default is "colored-line-number"
Expand Down
43 changes: 33 additions & 10 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
.PHONY: push-images push-latest save-images load-images promtail-image loki-image build-image
.PHONY: bigtable-backup, push-bigtable-backup
.PHONY: benchmark-store, drone, check-mod
.PHONY: migrate migrate-image lint-markdown
.PHONY: migrate migrate-image lint-markdown ragel

SHELL = /usr/bin/env bash

Expand Down Expand Up @@ -38,7 +38,7 @@ DOCKER_IMAGE_DIRS := $(patsubst %/Dockerfile,%,$(DOCKERFILES))
# make BUILD_IN_CONTAINER=false target
# or you can override this with an environment variable
BUILD_IN_CONTAINER ?= true
BUILD_IMAGE_VERSION := 0.14.0
BUILD_IMAGE_VERSION := 0.15.0

# Docker image info
IMAGE_PREFIX ?= grafana
Expand Down Expand Up @@ -87,6 +87,10 @@ PROTO_GOS := $(patsubst %.proto,%.pb.go,$(PROTO_DEFS))
YACC_DEFS := $(shell find . $(DONT_FIND) -type f -name *.y -print)
YACC_GOS := $(patsubst %.y,%.y.go,$(YACC_DEFS))

# Ragel Files
RAGEL_DEFS := $(shell find . $(DONT_FIND) -type f -name *.rl -print)
RAGEL_GOS := $(patsubst %.rl,%.rl.go,$(RAGEL_DEFS))

# Promtail UI files
PROMTAIL_GENERATED_FILE := clients/pkg/promtail/server/ui/assets_vfsdata.go
PROMTAIL_UI_FILES := $(shell find ./clients/pkg/promtail/server/ui -type f -name assets_vfsdata.go -prune -o -print)
Expand Down Expand Up @@ -126,8 +130,8 @@ binfmt:
all: promtail logcli loki loki-canary check-generated-files

# This is really a check for the CI to make sure generated files are built and checked in manually
check-generated-files: touch-protobuf-sources yacc protos clients/pkg/promtail/server/ui/assets_vfsdata.go
@if ! (git diff --exit-code $(YACC_GOS) $(PROTO_GOS) $(PROMTAIL_GENERATED_FILE)); then \
check-generated-files: touch-protobuf-sources yacc ragel protos clients/pkg/promtail/server/ui/assets_vfsdata.go
@if ! (git diff --exit-code $(YACC_GOS) $(RAGEL_GOS) $(PROTO_GOS) $(PROMTAIL_GENERATED_FILE)); then \
echo "\nChanges found in generated files"; \
echo "Run 'make check-generated-files' and commit the changes to fix this error."; \
echo "If you are actively developing these files you can ignore this error"; \
Expand All @@ -147,7 +151,7 @@ touch-protobuf-sources:
# Logcli #
##########

logcli: yacc cmd/logcli/logcli
logcli: yacc ragel cmd/logcli/logcli

logcli-image:
$(SUDO) docker build -t $(IMAGE_PREFIX)/logcli:$(IMAGE_TAG) -f cmd/logcli/Dockerfile .
Expand All @@ -160,8 +164,8 @@ cmd/logcli/logcli: $(APP_GO_FILES) cmd/logcli/main.go
# Loki #
########

loki: protos yacc cmd/loki/loki
loki-debug: protos yacc cmd/loki/loki-debug
loki: protos yacc ragel cmd/loki/loki
loki-debug: protos yacc ragel cmd/loki/loki-debug

cmd/loki/loki: $(APP_GO_FILES) cmd/loki/main.go
CGO_ENABLED=0 go build $(GO_FLAGS) -o $@ ./$(@D)
Expand All @@ -175,7 +179,7 @@ cmd/loki/loki-debug: $(APP_GO_FILES) cmd/loki/main.go
# Loki-Canary #
###############

loki-canary: protos yacc cmd/loki-canary/loki-canary
loki-canary: protos yacc ragel cmd/loki-canary/loki-canary

cmd/loki-canary/loki-canary: $(APP_GO_FILES) cmd/loki-canary/main.go
CGO_ENABLED=0 go build $(GO_FLAGS) -o $@ ./$(@D)
Expand Down Expand Up @@ -206,8 +210,8 @@ PROMTAIL_DEBUG_GO_FLAGS = $(DYN_DEBUG_GO_FLAGS)
endif
endif

promtail: yacc clients/cmd/promtail/promtail
promtail-debug: yacc clients/cmd/promtail/promtail-debug
promtail: yacc ragel clients/cmd/promtail/promtail
promtail-debug: yacc ragel clients/cmd/promtail/promtail-debug

promtail-clean-assets:
rm -rf clients/pkg/promtail/server/ui/assets_vfsdata.go
Expand Down Expand Up @@ -308,6 +312,25 @@ else
rm ${@}.back
endif

#########
# Ragels #
#########

# ragel is the aggregate target: it depends on every generated file listed in
# $(RAGEL_GOS), i.e. one <name>.rl.go per <name>.rl definition found.
ragel: $(RAGEL_GOS)

# Pattern rule producing <name>.rl.go from its <name>.rl source.
%.rl.go: %.rl
ifeq ($(BUILD_IN_CONTAINER),true)
# Container build: make sure the local .pkg and .cache directories exist, then
# run the loki-build-image with them and the checkout mounted, passing the
# target name ($@) as the container argument.
# NOTE(review): presumably the image entrypoint runs make on that target with
# BUILD_IN_CONTAINER=false -- confirm against loki-build-image.
@mkdir -p $(shell pwd)/.pkg
@mkdir -p $(shell pwd)/.cache
$(SUDO) docker run $(RM) $(TTY) -i \
-v $(shell pwd)/.cache:/go/cache$(MOUNT_FLAGS) \
-v $(shell pwd)/.pkg:/go/pkg$(MOUNT_FLAGS) \
-v $(shell pwd):/src/loki$(MOUNT_FLAGS) \
$(IMAGE_PREFIX)/loki-build-image:$(BUILD_IMAGE_VERSION) $@;
else
# Local build: invoke ragel directly on the .rl source ($<) and write the
# result to the target ($@). -Z should be ragel's Go host-language switch;
# verify against the installed ragel version.
ragel -Z $< -o $@
endif

#############
# Protobufs #
#############
Expand Down
59 changes: 57 additions & 2 deletions docs/sources/logql/_index.md
Original file line number Diff line number Diff line change
Expand Up @@ -154,9 +154,10 @@ In case of errors, for instance if the line is not in the expected format, the l

If an extracted label key name already exists in the original log stream, the extracted label key will be suffixed with the `_extracted` keyword to make the distinction between the two labels. You can forcefully override the original label using a [label formatter expression](#labels-format-expression). However, if an extracted key appears twice, only the latest label value will be kept.

We support currently support [json](#json), [logfmt](#logfmt), [regexp](#regexp) and [unpack](#unpack) parsers.
Loki supports [JSON](#json), [logfmt](#logfmt), [pattern](#pattern), [regexp](#regexp) and [unpack](#unpack) parsers.

It's easier to use the predefined parsers like `json` and `logfmt` when you can, falling back to `regexp` when the log lines have unusual structure. Multiple parsers can be used during the same log pipeline which is useful when you want to parse complex logs. ([see examples](#multiple-parsers))
It's easier to use the predefined parsers `json` and `logfmt` when you can. If you can't, the `pattern` and `regexp` parsers can be used for log lines with an unusual structure. The `pattern` parser is easier and faster to write; it also outperforms the `regexp` parser.
Multiple parsers can be used by a single log pipeline. This is useful for parsing complex logs. There are examples in [Multiple parsers](#multiple-parsers).

##### Json

Expand Down Expand Up @@ -277,6 +278,60 @@ will get those labels extracted:
"status" => "200"
```

##### Pattern

The pattern parser allows the explicit extraction of fields from log lines by defining a pattern expression. The expression matches the structure of a log line.

Consider this NGINX log line.

```log
0.191.12.2 - - [10/Jun/2021:09:14:29 +0000] "GET /api/plugins/versioncheck HTTP/1.1" 200 2 "-" "Go-http-client/2.0" "13.76.247.102, 34.120.177.193" "TLSv1.2" "US" ""
```

This log line can be parsed with the expression

`<ip> - - <_> "<method> <uri> <_>" <status> <size> <_> "<agent>" <_>`

to extract these fields:

```kv
"ip" => "0.191.12.2"
"method" => "GET"
"uri" => "/api/plugins/versioncheck"
"status" => "200"
"size" => "2"
"agent" => "Go-http-client/2.0"
```

A pattern expression is composed of captures and literals.

A capture is a field name delimited by the `<` and `>` characters. `<example>` defines the field name `example`.
An unnamed capture appears as `<_>`. The unnamed capture skips matched content.

Captures are matched from the line beginning or the previous set of literals, to the line end or the next set of literals.
If a capture is not matched, the pattern parser will stop.

Literals can be any sequence of UTF-8 characters, including whitespace characters.

By default, a pattern expression is anchored at the start of the log line. If the expression starts with literals, then the log line must also start with the same set of literals. Use `<_>` at the beginning of the expression if you don't want to anchor the expression at the start.

Consider the log line

```log
level=debug ts=2021-06-10T09:24:13.472094048Z caller=logging.go:66 traceID=0568b66ad2d9294c msg="POST /loki/api/v1/push (204) 16.652862ms"
```

To match `msg="`, use the expression:

```pattern
<_> msg="<method> <path> (<status>) <latency>"
```

A pattern expression is invalid if

- It does not contain any named capture.
- It contains two consecutive captures not separated by whitespace characters.

##### regexp

Unlike the `logfmt` and `json` parsers, which implicitly extract all values and take no parameters, the **regexp** parser takes a single parameter `| regexp "<re>"`, which is a regular expression using the [Golang](https://golang.org/) [RE2 syntax](https://github.com/google/re2/wiki/Syntax).
Expand Down
2 changes: 1 addition & 1 deletion loki-build-image/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ RUN GO111MODULE=on go get github.com/fatih/faillint@v1.5.0
FROM golang:1.16.2-buster
RUN apt-get update && \
apt-get install -qy \
musl gnupg \
musl gnupg ragel \
file zip unzip jq gettext\
protobuf-compiler libprotobuf-dev \
libsystemd-dev && \
Expand Down
11 changes: 7 additions & 4 deletions pkg/logql/ast.go
Original file line number Diff line number Diff line change
Expand Up @@ -330,6 +330,8 @@ func (e *labelParserExpr) Stage() (log.Stage, error) {
return log.NewRegexpParser(e.param)
case OpParserTypeUnpack:
return log.NewUnpackParser(), nil
case OpParserTypePattern:
return log.NewPatternParser(e.param)
default:
return nil, fmt.Errorf("unknown parser operator: %s", e.op)
}
Expand Down Expand Up @@ -601,10 +603,11 @@ const (
OpTypeLTE = "<="

// parsers
OpParserTypeJSON = "json"
OpParserTypeLogfmt = "logfmt"
OpParserTypeRegexp = "regexp"
OpParserTypeUnpack = "unpack"
OpParserTypeJSON = "json"
OpParserTypeLogfmt = "logfmt"
OpParserTypeRegexp = "regexp"
OpParserTypeUnpack = "unpack"
OpParserTypePattern = "pattern"

OpFmtLine = "line_format"
OpFmtLabel = "label_format"
Expand Down
12 changes: 12 additions & 0 deletions pkg/logql/ast_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ func Test_logSelectorExpr_String(t *testing.T) {
{`{foo="bar", bar!="baz"} != "bip" !~ ".+bop" | json`, true},
{`{foo="bar"} |= "baz" |~ "blip" != "flip" !~ "flap" | logfmt`, true},
{`{foo="bar"} |= "baz" |~ "blip" != "flip" !~ "flap" | unpack | foo>5`, true},
{`{foo="bar"} |= "baz" |~ "blip" != "flip" !~ "flap" | pattern "<foo> bar <buzz>" | foo>5`, true},
{`{foo="bar"} |= "baz" |~ "blip" != "flip" !~ "flap" | logfmt | b>=10GB`, true},
{`{foo="bar"} |= "baz" |~ "blip" != "flip" !~ "flap" | regexp "(?P<foo>foo|bar)"`, true},
{`{foo="bar"} |= "baz" |~ "blip" != "flip" !~ "flap" | regexp "(?P<foo>foo|bar)" | ( ( foo<5.01 , bar>20ms ) or foo="bar" ) | line_format "blip{{.boop}}bap" | label_format foo=bar,bar="blip{{.blop}}"`, true},
Expand Down Expand Up @@ -69,6 +70,7 @@ func Test_SampleExpr_String(t *testing.T) {
`sum(count_over_time({job="mysql"} | json [5m] offset 10m))`,
`sum(count_over_time({job="mysql"} | logfmt [5m]))`,
`sum(count_over_time({job="mysql"} | logfmt [5m] offset 10m))`,
`sum(count_over_time({job="mysql"} | pattern "<foo> bar <buzz>" | json [5m]))`,
`sum(count_over_time({job="mysql"} | unpack | json [5m]))`,
`sum(count_over_time({job="mysql"} | regexp "(?P<foo>foo|bar)" [5m]))`,
`sum(count_over_time({job="mysql"} | regexp "(?P<foo>foo|bar)" [5m] offset 10m))`,
Expand Down Expand Up @@ -358,6 +360,8 @@ func Test_parserExpr_Parser(t *testing.T) {
{"json", OpParserTypeJSON, "", log.NewJSONParser(), false},
{"unpack", OpParserTypeUnpack, "", log.NewUnpackParser(), false},
{"logfmt", OpParserTypeLogfmt, "", log.NewLogfmtParser(), false},
{"pattern", OpParserTypePattern, "<foo> bar <buzz>", mustNewPatternParser("<foo> bar <buzz>"), false},
{"pattern err", OpParserTypePattern, "bar", nil, true},
{"regexp", OpParserTypeRegexp, "(?P<foo>foo)", mustNewRegexParser("(?P<foo>foo)"), false},
{"regexp err ", OpParserTypeRegexp, "foo", nil, true},
}
Expand Down Expand Up @@ -389,6 +393,14 @@ func mustNewRegexParser(re string) log.Stage {
return r
}

// mustNewPatternParser is a test helper that builds a pattern parser stage
// from the pattern expression p. It panics with the error returned by
// log.NewPatternParser when the expression is rejected, so it can be used
// inline in test tables.
func mustNewPatternParser(p string) log.Stage {
	stage, err := log.NewPatternParser(p)
	if err == nil {
		return stage
	}
	// A bad pattern in a test fixture is a programmer error: fail loudly.
	panic(err)
}

func Test_canInjectVectorGrouping(t *testing.T) {
tests := []struct {
vecOp string
Expand Down
3 changes: 2 additions & 1 deletion pkg/logql/expr.y
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ import (
OPEN_PARENTHESIS CLOSE_PARENTHESIS BY WITHOUT COUNT_OVER_TIME RATE SUM AVG MAX MIN COUNT STDDEV STDVAR BOTTOMK TOPK
BYTES_OVER_TIME BYTES_RATE BOOL JSON REGEXP LOGFMT PIPE LINE_FMT LABEL_FMT UNWRAP AVG_OVER_TIME SUM_OVER_TIME MIN_OVER_TIME
MAX_OVER_TIME STDVAR_OVER_TIME STDDEV_OVER_TIME QUANTILE_OVER_TIME BYTES_CONV DURATION_CONV DURATION_SECONDS_CONV
FIRST_OVER_TIME LAST_OVER_TIME ABSENT_OVER_TIME LABEL_REPLACE UNPACK OFFSET
FIRST_OVER_TIME LAST_OVER_TIME ABSENT_OVER_TIME LABEL_REPLACE UNPACK OFFSET PATTERN

// Operators are listed with increasing precedence.
%left <binOp> OR
Expand Down Expand Up @@ -246,6 +246,7 @@ labelParser:
| LOGFMT { $$ = newLabelParserExpr(OpParserTypeLogfmt, "") }
| REGEXP STRING { $$ = newLabelParserExpr(OpParserTypeRegexp, $2) }
| UNPACK { $$ = newLabelParserExpr(OpParserTypeUnpack, "") }
| PATTERN STRING { $$ = newLabelParserExpr(OpParserTypePattern, $2) }
;

jsonExpressionParser:
Expand Down
Loading

0 comments on commit 59bb6d3

Please sign in to comment.