diff --git a/docs/language/functions/README.md b/docs/language/functions/README.md index 5c0dcc61b2..aa6752a10b 100644 --- a/docs/language/functions/README.md +++ b/docs/language/functions/README.md @@ -28,6 +28,7 @@ Zed's [primitive types](../../formats/zed.md#1-primitive-types), e.g., * [flatten](flatten.md) - transform a record into a flattened map * [floor](floor.md) - floor of a number * [grep](grep.md) - search strings inside of values +* [grok](grok.md) - parse a string into a structured record * [has](has.md) - test existence of values * [hex](hex.md) - encode/decode hexadecimal strings * [has_error](has_error.md) - test if a value has an error diff --git a/docs/language/functions/grok.md b/docs/language/functions/grok.md new file mode 100644 index 0000000000..5a68885621 --- /dev/null +++ b/docs/language/functions/grok.md @@ -0,0 +1,44 @@ +### Function + +  **grok** — parse a string using a grok pattern + +### Synopsis + +``` +grok(p: string, s: string) -> any +grok(p: string, s: string, definitions: string) -> any +``` + +### Description + +The _grok_ function parses a string `s` using grok pattern `p` and returns +a record containing the parsed fields. The syntax for pattern `p` +is `%{pattern:field_name}` where _pattern_ is the name of the pattern +to match in `s` and _field_name_ is the resultant field name of the capture +value. + +When provided with three arguments, `definitions` is a string +of named patterns in the format `PATTERN_NAME PATTERN` each separated by newlines. +The named patterns can then be referenced in argument `p`. + +#### Included Patterns + +The _grok_ function by default includes a set of builtin named patterns +that can be referenced in any pattern. The included named patterns can be seen +[here](https://raw.githubusercontent.com/brimdata/zed/main/pkg/grok/base.go). 
+ +### Examples + +Parsing a simple log line using the builtin named patterns: +```mdtest-command +echo '"2020-09-16T04:20:42.45+01:00 DEBUG This is a sample debug log message"' | + zq -Z 'yield grok("%{TIMESTAMP_ISO8601:timestamp} %{LOGLEVEL:level} %{GREEDYDATA:message}", this)' - +``` +=> +```mdtest-output +{ + timestamp: "2020-09-16T04:20:42.45+01:00", + level: "DEBUG", + message: "This is a sample debug log message" +} +``` diff --git a/pkg/grok/base.go b/pkg/grok/base.go new file mode 100644 index 0000000000..1b0cfe9056 --- /dev/null +++ b/pkg/grok/base.go @@ -0,0 +1,129 @@ +// Adapted from https://github.com/logrusorgru/grokky/blob/f28bfe018565ac1e90d93502eae1170006dd1f48/base.go + +package grok + +func must(err error) { + if err != nil { + panic(err) + } +} + +// Must is like Add but panics if the expression can't be parsed or +// the name is empty. +func (h Host) Must(name, expr string) { + must(h.Add(name, expr)) +} + +// NewBase creates new Host that filled up with base patterns. +// To see all base patterns open 'base.go' file. 
+func NewBase() Host { + h := make(Host) + // + h.Must("USERNAME", `[a-zA-Z0-9._-]+`) + h.Must("USER", `%{USERNAME}`) + h.Must("EMAILLOCALPART", `[a-zA-Z][a-zA-Z0-9_.+-=:]+`) + h.Must("HOSTNAME", `\b[0-9A-Za-z][0-9A-Za-z-]{0,62}(?:\.[0-9A-Za-z][0-9A-Za-z-]{0,62})*(\.?|\b)`) + h.Must("EMAILADDRESS", `%{EMAILLOCALPART}@%{HOSTNAME}`) + h.Must("HTTPDUSER", `%{EMAILADDRESS}|%{USER}`) + h.Must("INT", `[+-]?(?:[0-9]+)`) + h.Must("BASE10NUM", `[+-]?(?:(?:[0-9]+(?:\.[0-9]+)?)|(?:\.[0-9]+))`) + h.Must("NUMBER", `%{BASE10NUM}`) + h.Must("BASE16NUM", `[+-]?(?:0x)?(?:[0-9A-Fa-f]+)`) + h.Must("BASE16FLOAT", `\b[+-]?(?:0x)?(?:(?:[0-9A-Fa-f]+(?:\.[0-9A-Fa-f]*)?)|(?:\.[0-9A-Fa-f]+))\b`) + // + h.Must("POSINT", `\b[1-9][0-9]*\b`) + h.Must("NONNEGINT", `\b[0-9]+\b`) + h.Must("WORD", `\b\w+\b`) + h.Must("NOTSPACE", `\S+`) + h.Must("SPACE", `\s*`) + h.Must("DATA", `.*?`) + h.Must("GREEDYDATA", `.*`) + h.Must("QUOTEDSTRING", `("(\\.|[^\\"]+)+")|""|('(\\.|[^\\']+)+')|''|`+ + "(`(\\\\.|[^\\\\`]+)+`)|``") + h.Must("UUID", `[A-Fa-f0-9]{8}-(?:[A-Fa-f0-9]{4}-){3}[A-Fa-f0-9]{12}`) + // Networking + h.Must("CISCOMAC", `(?:[A-Fa-f0-9]{4}\.){2}[A-Fa-f0-9]{4}`) + h.Must("WINDOWSMAC", `(?:[A-Fa-f0-9]{2}-){5}[A-Fa-f0-9]{2}`) + h.Must("COMMONMAC", `(?:[A-Fa-f0-9]{2}:){5}[A-Fa-f0-9]{2}`) + h.Must("MAC", `%{CISCOMAC}|%{WINDOWSMAC}|%{COMMONMAC}`) + h.Must("IPV6", 
`((([0-9A-Fa-f]{1,4}:){7}([0-9A-Fa-f]{1,4}|:))|(([0-9A-Fa-f]{1,4}:){6}(:[0-9A-Fa-f]{1,4}|((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3})|:))|(([0-9A-Fa-f]{1,4}:){5}(((:[0-9A-Fa-f]{1,4}){1,2})|:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3})|:))|(([0-9A-Fa-f]{1,4}:){4}(((:[0-9A-Fa-f]{1,4}){1,3})|((:[0-9A-Fa-f]{1,4})?:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){3}(((:[0-9A-Fa-f]{1,4}){1,4})|((:[0-9A-Fa-f]{1,4}){0,2}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){2}(((:[0-9A-Fa-f]{1,4}){1,5})|((:[0-9A-Fa-f]{1,4}){0,3}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){1}(((:[0-9A-Fa-f]{1,4}){1,6})|((:[0-9A-Fa-f]{1,4}){0,4}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(:(((:[0-9A-Fa-f]{1,4}){1,7})|((:[0-9A-Fa-f]{1,4}){0,5}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:)))(%.+)?`) + h.Must("IPV4", `(?:(?:[0-1]?[0-9]{1,2}|2[0-4][0-9]|25[0-5])[.](?:[0-1]?[0-9]{1,2}|2[0-4][0-9]|25[0-5])[.](?:[0-1]?[0-9]{1,2}|2[0-4][0-9]|25[0-5])[.](?:[0-1]?[0-9]{1,2}|2[0-4][0-9]|25[0-5]))`) + h.Must("IP", `%{IPV6}|%{IPV4}`) + h.Must("IPORHOST", `%{IP}|%{HOSTNAME}`) + h.Must("HOSTPORT", `%{IPORHOST}:%{POSINT}`) + + // paths + h.Must("UNIXPATH", `(/([\w_%!$@:.,~-]+|\\.)*)+`) + h.Must("TTY", `/dev/(pts|tty([pq])?)(\w+)?/?(?:[0-9]+)`) + h.Must("WINPATH", `(?:[A-Za-z]+:|\\)(?:\\[^\\?*]*)+`) + h.Must("PATH", `%{UNIXPATH}|%{WINPATH}`) + h.Must("URIPROTO", `[A-Za-z]+(\+[A-Za-z+]+)?`) + h.Must("URIHOST", `%{IPORHOST}(?::%{POSINT:port})?`) + // uripath comes loosely from RFC1738, but mostly from what Firefox + // doesn't turn into %XX + h.Must("URIPATH", `(?:/[A-Za-z0-9$.+!*'(){},~:;=@#%_\-]*)+`) + h.Must("URIPARAM", `\?[A-Za-z0-9$.+!*'|(){},~@#%&/=:;_?\-\[\]<>]*`) + h.Must("URIPATHPARAM", 
`%{URIPATH}(?:%{URIPARAM})?`) + h.Must("URI", `%{URIPROTO}://(?:%{USER}(?::[^@]*)?@)?(?:%{URIHOST})?(?:%{URIPATHPARAM})?`) + // Months: January, Feb, 3, 03, 12, December + h.Must("MONTH", `\bJan(?:uary|uar)?|Feb(?:ruary|ruar)?|M(?:a|รค)?r(?:ch|z)?|Apr(?:il)?|Ma(?:y|i)?|Jun(?:e|i)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|O(?:c|k)?t(?:ober)?|Nov(?:ember)?|De(?:c|z)(?:ember)?\b`) + h.Must("MONTHNUM", `0?[1-9]|1[0-2]`) + h.Must("MONTHNUM2", `0[1-9]|1[0-2]`) + h.Must("MONTHDAY", `(?:0[1-9])|(?:[12][0-9])|(?:3[01])|[1-9]`) + // Days: Monday, Tue, Thu, etc... + h.Must("DAY", `Mon(?:day)?|Tue(?:sday)?|Wed(?:nesday)?|Thu(?:rsday)?|Fri(?:day)?|Sat(?:urday)?|Sun(?:day)?`) + // Years? + h.Must("YEAR", `(?:\d\d){1,2}`) + h.Must("HOUR", `2[0123]|[01]?[0-9]`) + h.Must("MINUTE", `[0-5][0-9]`) + // '60' is a leap second in most time standards and thus is valid. + h.Must("SECOND", `(?:[0-5]?[0-9]|60)(?:[:.,][0-9]+)?`) + h.Must("TIME", `%{HOUR}:%{MINUTE}:%{SECOND}`) + // datestamp is YYYY/MM/DD-HH:MM:SS.UUUU (or something like it) + h.Must("DATE_US", `%{MONTHNUM}[/-]%{MONTHDAY}[/-]%{YEAR}`) + h.Must("DATE_EU", `%{MONTHDAY}[./-]%{MONTHNUM}[./-]%{YEAR}`) + // I really don't know how it's called + h.Must("DATE_X", `%{YEAR}/%{MONTHNUM2}/%{MONTHDAY}`) + h.Must("ISO8601_TIMEZONE", `Z|[+-]%{HOUR}(?::?%{MINUTE})`) + h.Must("ISO8601_SECOND", `%{SECOND}|60`) + h.Must("TIMESTAMP_ISO8601", `%{YEAR}-%{MONTHNUM}-%{MONTHDAY}[T ]%{HOUR}:?%{MINUTE}(?::?%{SECOND})?%{ISO8601_TIMEZONE}?`) + h.Must("DATE", `%{DATE_US}|%{DATE_EU}|%{DATE_X}`) + h.Must("DATESTAMP", `%{DATE}[- ]%{TIME}`) + h.Must("TZ", `[A-Z]{3}`) + h.Must("NUMTZ", `[+-]\d{4}`) + h.Must("DATESTAMP_RFC822", `%{DAY} %{MONTH} %{MONTHDAY} %{YEAR} %{TIME} %{TZ}`) + h.Must("DATESTAMP_RFC2822", `%{DAY}, %{MONTHDAY} %{MONTH} %{YEAR} %{TIME} %{ISO8601_TIMEZONE}`) + h.Must("DATESTAMP_OTHER", `%{DAY} %{MONTH} %{MONTHDAY} %{TIME} %{TZ} %{YEAR}`) + h.Must("DATESTAMP_EVENTLOG", `%{YEAR}%{MONTHNUM2}%{MONTHDAY}%{HOUR}%{MINUTE}%{SECOND}`) + 
h.Must("HTTPDERROR_DATE", `%{DAY} %{MONTH} %{MONTHDAY} %{TIME} %{YEAR}`) + // golang time patterns + h.Must("ANSIC", `%{DAY} %{MONTH} [_123]\d %{TIME} %{YEAR}"`) + h.Must("UNIXDATE", `%{DAY} %{MONTH} [_123]\d %{TIME} %{TZ} %{YEAR}`) + h.Must("RUBYDATE", `%{DAY} %{MONTH} [0-3]\d %{TIME} %{NUMTZ} %{YEAR}`) + h.Must("RFC822Z", `[0-3]\d %{MONTH} %{YEAR} %{TIME} %{NUMTZ}`) + h.Must("RFC850", `%{DAY}, [0-3]\d-%{MONTH}-%{YEAR} %{TIME} %{TZ}`) + h.Must("RFC1123", `%{DAY}, [0-3]\d %{MONTH} %{YEAR} %{TIME} %{TZ}`) + h.Must("RFC1123Z", `%{DAY}, [0-3]\d %{MONTH} %{YEAR} %{TIME} %{NUMTZ}`) + h.Must("RFC3339", `%{YEAR}-[01]\d-[0-3]\dT%{TIME}%{ISO8601_TIMEZONE}`) + h.Must("RFC3339NANO", `%{YEAR}-[01]\d-[0-3]\dT%{TIME}\.\d{9}%{ISO8601_TIMEZONE}`) + h.Must("KITCHEN", `\d{1,2}:\d{2}(AM|PM|am|pm)`) + // Syslog Dates: Month Day HH:MM:SS + h.Must("SYSLOGTIMESTAMP", `%{MONTH} +%{MONTHDAY} %{TIME}`) + h.Must("PROG", `[\x21-\x5a\x5c\x5e-\x7e]+`) + h.Must("SYSLOGPROG", `%{PROG:program}(?:\[%{POSINT:pid}\])?`) + h.Must("SYSLOGHOST", `%{IPORHOST}`) + h.Must("SYSLOGFACILITY", `<%{NONNEGINT:facility}.%{NONNEGINT:priority}>`) + h.Must("HTTPDATE", `%{MONTHDAY}/%{MONTH}/%{YEAR}:%{TIME} %{INT}`) + // Shortcuts + h.Must("QS", `%{QUOTEDSTRING}`) + // Log Levels + h.Must("LOGLEVEL", `[Aa]lert|ALERT|[Tt]race|TRACE|[Dd]ebug|DEBUG|[Nn]otice|NOTICE|[Ii]nfo|INFO|[Ww]arn?(?:ing)?|WARN?(?:ING)?|[Ee]rr?(?:or)?|ERR?(?:OR)?|[Cc]rit?(?:ical)?|CRIT?(?:ICAL)?|[Ff]atal|FATAL|[Ss]evere|SEVERE|EMERG(?:ENCY)?|[Ee]merg(?:ency)?`) + // Log formats + h.Must("SYSLOGBASE", `%{SYSLOGTIMESTAMP:timestamp} (?:%{SYSLOGFACILITY} )?%{SYSLOGHOST:logsource} %{SYSLOGPROG}:`) + h.Must("COMMONAPACHELOG", `%{IPORHOST:clientip} %{HTTPDUSER:ident} %{USER:auth} \[%{HTTPDATE:timestamp}\] "(?:%{WORD:verb} %{NOTSPACE:request}(?: HTTP/%{NUMBER:httpversion})?|%{DATA:rawrequest})" %{NUMBER:response} (?:%{NUMBER:bytes}|-)`) + h.Must("COMBINEDAPACHELOG", `%{COMMONAPACHELOG} %{QS:referrer} %{QS:agent}`) + h.Must("HTTPD20_ERRORLOG", 
`\[%{HTTPDERROR_DATE:timestamp}\] \[%{LOGLEVEL:loglevel}\] (?:\[client %{IPORHOST:clientip}\] ){0,1}%{GREEDYDATA:errormsg}`) + h.Must("HTTPD24_ERRORLOG", `\[%{HTTPDERROR_DATE:timestamp}\] \[%{WORD:module}:%{LOGLEVEL:loglevel}\] \[pid %{POSINT:pid}:tid %{NUMBER:tid}\]( \(%{POSINT:proxy_errorcode}\)%{DATA:proxy_errormessage}:)?( \[client %{IPORHOST:client}:%{POSINT:clientport}\])? %{DATA:errorcode}: %{GREEDYDATA:message}`) + h.Must("HTTPD_ERRORLOG", `%{HTTPD20_ERRORLOG}|%{HTTPD24_ERRORLOG}`) + return h +} diff --git a/pkg/grok/grok.go b/pkg/grok/grok.go new file mode 100644 index 0000000000..b53f57ee52 --- /dev/null +++ b/pkg/grok/grok.go @@ -0,0 +1,231 @@ +// Adapted from https://github.com/logrusorgru/grokky/blob/f28bfe018565ac1e90d93502eae1170006dd1f48/grok.go + +package grok + +import ( + "bufio" + "errors" + "fmt" + "io" + "regexp" + "sort" + "strings" +) + +var ( + // ErrEmptyName arises when pattern name is an empty string + ErrEmptyName = errors.New("an empty name") + // ErrEmptyExpression arises when expression is an empty string + ErrEmptyExpression = errors.New("an empty expression") + // ErrAlreadyExist arises when pattern with given name alrady exists + ErrAlreadyExist = errors.New("the pattern already exist") + // ErrNotExist arises when pattern with given name doesn't exists + ErrNotExist = errors.New("pattern doesn't exist") +) + +// Host is a patterns collection. Host does not need to be kept around +// after all need patterns are generated +type Host map[string]string + +// New returns new empty host +func New() Host { return make(Host) } + +// Add a new pattern to the Host. If a pattern name +// already exists the ErrAlreadyExists will be returned. 
+func (h Host) Add(name, expr string) error { + if name == "" { + return ErrEmptyName + } + if expr == "" { + return ErrEmptyExpression + } + if _, ok := h[name]; ok { + return ErrAlreadyExist + } + if _, err := h.compileExternal(expr); err != nil { + return err + } + h[name] = expr + return nil +} + +func (h Host) compile(name string) (*Pattern, error) { + expr, ok := h[name] + if !ok { + return nil, ErrNotExist + } + return h.compileExternal(expr) +} + +var patternRegexp = regexp.MustCompile(`\%\{(\w+)(\:([\w\[\]\.]+)(\:(\w+))?)?}`) + +func (h Host) compileExternal(expr string) (*Pattern, error) { + subs := patternRegexp.FindAllString(expr, -1) + ts := make(map[string]struct{}) + for _, s := range subs { + name, sem := split(s) + if _, ok := h[name]; !ok { + return nil, fmt.Errorf("the '%s' pattern doesn't exist", name) + } + ts[sem] = struct{}{} + } + if len(subs) == 0 { + r, err := regexp.Compile(expr) + if err != nil { + return nil, err + } + p := &Pattern{Regexp: r} + return p, nil + } + spl := patternRegexp.Split(expr, -1) + msi := make(map[string]int) + order := 1 // semantic order + var res string + for i := 0; i < len(spl)-1; i++ { + splPart := spl[i] + order += capCount(splPart) + sub := subs[i] + subName, subSem := split(sub) + p, err := h.compile(subName) + if err != nil { + return nil, err + } + sub = p.String() + subNumSubexp := p.NumSubexp() + subNumSubexp++ + sub = wrap(sub) + if subSem != "" { + msi[subSem] = order + } + res += splPart + sub + // add sub semantics to this semantics + for k, v := range p.s { + if _, ok := ts[k]; !ok { + msi[k] = order + v + } + } + order += subNumSubexp + } + res += spl[len(spl)-1] + r, err := regexp.Compile(res) + if err != nil { + return nil, err + } + p := &Pattern{Regexp: r} + p.s = msi + p.order = make(map[int]string) + for k, v := range msi { + p.order[v] = k + } + return p, nil +} + +func split(s string) (name, sem string) { + ss := patternRegexp.FindStringSubmatch(s) + if len(ss) >= 2 { + name = ss[1] + } + 
if len(ss) >= 4 { + sem = ss[3] + } + return +} + +func wrap(s string) string { return "(" + s + ")" } + +var ( + nonCapLeftRxp = regexp.MustCompile(`\(\?[imsU\-]*\:`) + nonCapFlagsRxp = regexp.MustCompile(`\(?[imsU\-]+\)`) +) + +func capCount(in string) int { + leftParens := strings.Count(in, "(") + nonCapLeft := len(nonCapLeftRxp.FindAllString(in, -1)) + nonCapBoth := len(nonCapFlagsRxp.FindAllString(in, -1)) + escapedLeftParens := strings.Count(in, `\(`) + return leftParens - nonCapLeft - nonCapBoth - escapedLeftParens +} + +// Get pattern by name from the Host. +func (h Host) Get(name string) (*Pattern, error) { + return h.compile(name) +} + +// Compile and get pattern without name (and without adding it to this Host) +func (h Host) Compile(expr string) (*Pattern, error) { + if expr == "" { + return nil, ErrEmptyExpression + } + return h.compileExternal(expr) +} + +type Pattern struct { + *regexp.Regexp + s map[string]int + order map[int]string + cache []string +} + +// Parse returns a map of matches on the input. The map can be empty. +func (p *Pattern) Parse(input string) map[string]string { + ss := p.FindStringSubmatch(input) + r := make(map[string]string) + if len(ss) <= 1 { + return r + } + for sem, order := range p.s { + r[sem] = ss[order] + } + return r +} + +func (p *Pattern) ParseValues(input string) []string { + a := p.FindStringSubmatchIndex(input) + if a == nil { + return nil + } + p.cache = p.cache[:0] + for i := 0; len(p.cache) < len(p.s); i++ { + if _, ok := p.order[i]; !ok { + continue + } + p.cache = append(p.cache, input[a[i*2]:a[i*2+1]]) + } + return p.cache +} + +// Names returns all names that this pattern has in order. +func (p *Pattern) Names() (ss []string) { + ss = make([]string, 0, len(p.s)) + for k := range p.s { + ss = append(ss, k) + } + sort.Slice(ss, func(i, j int) bool { + return p.s[ss[i]] < p.s[ss[j]] + }) + return +} + +// AddFromReader appends all patterns from the reader to this Host. 
+func (h Host) AddFromReader(r io.Reader) error { + scanner := bufio.NewScanner(r) + for scanner.Scan() { + if err := h.addFromLine(scanner.Text()); err != nil { + return err + } + } + if err := scanner.Err(); err != nil { + return err + } + return nil +} + +var lineRegexp = regexp.MustCompile(`^(\w+)\s+(.+)$`) + +func (h Host) addFromLine(line string) error { + sub := lineRegexp.FindStringSubmatch(line) + if len(sub) == 0 { // no match + return nil + } + return h.Add(sub[1], sub[2]) +} diff --git a/pkg/grok/host_test.go b/pkg/grok/host_test.go new file mode 100644 index 0000000000..abf51ab3e7 --- /dev/null +++ b/pkg/grok/host_test.go @@ -0,0 +1,102 @@ +// Adapted from https://github.com/logrusorgru/grokky/blob/f28bfe018565ac1e90d93502eae1170006dd1f48/host_test.go + +package grok + +import ( + "strings" + "testing" + + "github.com/stretchr/testify/require" +) + +func TestNew(t *testing.T) { + h := New() + require.Len(t, h, 0) + require.NotNil(t, h) +} + +func TestHost_Add(t *testing.T) { + h := New() + require.ErrorIs(t, h.Add("", "expr"), ErrEmptyName) + require.Len(t, h, 0) + require.ErrorIs(t, h.Add("name", ""), ErrEmptyExpression) + require.Len(t, h, 0) + require.NoError(t, h.Add("DIGIT", `\d`)) + require.Len(t, h, 1) + require.ErrorIs(t, h.Add("DIGIT", `[+-](0x)?\d`), ErrAlreadyExist) + require.Len(t, h, 1) + require.Error(t, h.Add("BAD", `(?![0-5])`)) + require.Len(t, h, 1) + require.NoError(t, h.Add("TWODIG", `%{DIGIT}-%{DIGIT}`)) + require.Len(t, h, 2) + require.Error(t, h.Add("THREE", `%{NOT}-%{EXIST}`)) + require.Len(t, h, 2) + require.NoError(t, h.Add("FOUR", `%{DIGIT:one}-%{DIGIT:two}`)) + require.Len(t, h, 3) + require.Error(t, h.Add("FIVE", `(?!\d)%{DIGIT}(?!\d)`)) + require.Len(t, h, 3) + require.NoError(t, h.Add("SIX", `%{FOUR:four}-%{DIGIT:six}`)) + require.Len(t, h, 4) +} + +func TestHost_Compile(t *testing.T) { + h := New() + _, err := h.Compile("") + require.ErrorIs(t, err, ErrEmptyExpression) + require.Len(t, h, 0) + p, err := h.Compile(`\d+`) 
+ require.NoError(t, err) + require.NotNil(t, p) + require.Len(t, h, 0) +} + +func TestHost_Get(t *testing.T) { + h := New() + require.NoError(t, h.Add("DIG", `\d`)) + p, err := h.Get("DIG") + require.NoError(t, err) + require.NotNil(t, p) + p, err = h.Get("SEVEN") + require.ErrorIs(t, err, ErrNotExist) + require.Nil(t, p) +} + +func TestHost_AddFromReader(t *testing.T) { + s := `# +# for testing +# +ONE \d +TWO %{ONE:two} +THREE %{ONE:one}-%{TWO}-%{ONE:three} + +# +# enough +#` + h := New() + require.NoError(t, h.AddFromReader(strings.NewReader(s))) + require.Len(t, h, 3) + _, err := h.Get("ONE") + require.NoError(t, err) + _, err = h.Get("TWO") + require.NoError(t, err) + _, err = h.Get("THREE") + require.NoError(t, err) +} + +func TestHost_AddFromReader_malformedPatterns(t *testing.T) { + s := ` +ONE \d +TWO %{THREE:two}` + require.Error(t, New().AddFromReader(strings.NewReader(s))) +} + +func TestHost_inject(t *testing.T) { + h := New() + h["TWO"] = `(?!\d)` + require.Error(t, h.Add("ONE", `%{TWO:one}`)) +} + +func TestHost_addFromLine(t *testing.T) { + h := New() + require.Error(t, h.addFromLine("ONE (?!\\d)")) +} diff --git a/pkg/grok/pattern_test.go b/pkg/grok/pattern_test.go new file mode 100644 index 0000000000..e3fdeec271 --- /dev/null +++ b/pkg/grok/pattern_test.go @@ -0,0 +1,78 @@ +// Adapted from https://github.com/logrusorgru/grokky/blob/f28bfe018565ac1e90d93502eae1170006dd1f48/pattern_test.go + +package grok + +import ( + "testing" + + "github.com/stretchr/testify/require" +) + +func TestPattern_Parse(t *testing.T) { + h := New() + require.NoError(t, h.Add("ONE", `\d`)) + require.NoError(t, h.Add("TWO", `%{ONE:one}-%{ONE:two}`)) + require.NoError(t, h.Add("THREE", `%{ONE:zero}-%{TWO:three}`)) + p, err := h.Get("ONE") + require.NoError(t, err) + require.NotNil(t, p.Parse("1")) + p, err = h.Get("TWO") + require.NoError(t, err) + require.Equal(t, map[string]string{"one": "1", "two": "2"}, p.Parse("1-2")) + p, err = h.Get("THREE") + require.NoError(t, 
err) + require.Equal(t, map[string]string{ + "one": "1", + "two": "2", + "zero": "0", + "three": "1-2", + }, p.Parse("0-1-2")) + require.NoError(t, h.Add("FOUR", `%{TWO:two}`)) + p, err = h.Get("FOUR") + require.NoError(t, err) + require.Equal(t, map[string]string{"one": "1", "two": "1-2"}, p.Parse("1-2")) +} + +func TestPattern_nestedGroups(t *testing.T) { + h := New() + require.NoError(t, h.Add("ONE", `\d`)) + require.NoError(t, h.Add("TWO", `(?:%{ONE:one})-(?:%{ONE:two})?`)) + p, err := h.Get("TWO") + require.NoError(t, err) + require.Equal(t, map[string]string{"one": "1", "two": "2"}, p.Parse("1-2")) + require.Equal(t, map[string]string{"one": "1", "two": ""}, p.Parse("1-")) +} + +func TestPattern_Names(t *testing.T) { + h := New() + require.NoError(t, h.Add("ONE", `\d`)) + require.NoError(t, h.Add("TWO", `%{ONE:one}-%{ONE:two}`)) + require.NoError(t, h.Add("THREE", `%{ONE:zero}-%{TWO:three}`)) + p, err := h.Get("THREE") + require.NoError(t, err) + require.Equal(t, []string{"zero", "three", "one", "two"}, p.Names()) +} + +func TestPattern_ParseValues(t *testing.T) { + h := NewBase() + p, err := h.Compile("%{TIMESTAMP_ISO8601:event_time} %{LOGLEVEL:log_level} %{GREEDYDATA:log_message}") + require.NoError(t, err) + ss := p.ParseValues("2020-09-16T04:20:42.45+01:00 DEBUG This is a sample debug log message") + require.Equal(t, []string{"2020-09-16T04:20:42.45+01:00", "DEBUG", "This is a sample debug log message"}, ss) +} + +func TestPattern_NamesIgnoreTypeCast(t *testing.T) { + h := New() + require.NoError(t, h.Add("ONE", `\d`)) + p, err := h.Compile("%{ONE:one:int}") + require.NoError(t, err) + require.Equal(t, []string{"one"}, p.Names()) +} + +func TestPattern_NamesNested(t *testing.T) { + h := New() + require.NoError(t, h.Add("ONE", `\d`)) + p, err := h.Compile("%{ONE:num.one}-%{ONE:[num][two]}") + require.NoError(t, err) + require.Equal(t, []string{"num.one", "[num][two]"}, p.Names()) +} diff --git a/runtime/expr/function/function.go 
b/runtime/expr/function/function.go index eb95255bd0..8a4cc93b36 100644 --- a/runtime/expr/function/function.go +++ b/runtime/expr/function/function.go @@ -31,6 +31,9 @@ func New(zctx *zed.Context, name string, narg int) (expr.Function, field.Path, e case "grep": argmax = 2 f = &Grep{zctx: zctx} + case "grok": + argmin, argmax = 2, 3 + f = newGrok(zctx) case "len": f = &LenFn{zctx: zctx} case "abs": diff --git a/runtime/expr/function/grok.go b/runtime/expr/function/grok.go new file mode 100644 index 0000000000..23f85ca059 --- /dev/null +++ b/runtime/expr/function/grok.go @@ -0,0 +1,106 @@ +package function + +import ( + "fmt" + "strings" + + "github.com/brimdata/zed" + "github.com/brimdata/zed/pkg/grok" + "github.com/brimdata/zed/zcode" +) + +type Grok struct { + zctx *zed.Context + builder zcode.Builder + hosts map[string]*host +} + +func newGrok(zctx *zed.Context) *Grok { + return &Grok{ + zctx: zctx, + hosts: make(map[string]*host), + } +} + +func (g *Grok) Call(ectx zed.Allocator, vals []zed.Value) *zed.Value { + patternArg, inputArg, defArg := vals[0], vals[1], zed.NullString + if len(vals) == 3 { + defArg = &vals[2] + } + switch { + case zed.TypeUnder(defArg.Type) != zed.TypeString: + return g.error(ectx, "definitions argument must be a string", defArg) + case zed.TypeUnder(patternArg.Type) != zed.TypeString: + return g.error(ectx, "pattern argument must be a string", &patternArg) + case zed.TypeUnder(inputArg.Type) != zed.TypeString: + return g.error(ectx, "input argument must be a string", &inputArg) + } + h, err := g.getHost(defArg.AsString()) + if err != nil { + return g.error(ectx, err.Error(), defArg) + } + p, err := h.getPattern(g.zctx, patternArg.AsString()) + if err != nil { + return g.error(ectx, err.Error(), &patternArg) + } + ss := p.ParseValues(inputArg.AsString()) + if ss == nil { + return g.error(ectx, "value does not match pattern", &inputArg) + } + g.builder.Reset() + for _, s := range ss { + g.builder.Append([]byte(s)) + } + return 
ectx.NewValue(p.typ, g.builder.Bytes()) +} + +func (g *Grok) error(ectx zed.Allocator, err string, val *zed.Value) *zed.Value { + err = fmt.Sprintf("grok(): %s", err) + if val == nil { + return ectx.CopyValue(*g.zctx.NewErrorf(err)) + } + return ectx.CopyValue(*g.zctx.WrapError(err, val)) +} + +func (g *Grok) getHost(defs string) (*host, error) { + h, ok := g.hosts[defs] + if !ok { + h = &host{Host: grok.NewBase(), patterns: make(map[string]*pattern)} + if err := h.AddFromReader(strings.NewReader(defs)); err != nil { + return nil, err + } + g.hosts[defs] = h + } + return h, nil +} + +type host struct { + grok.Host + patterns map[string]*pattern +} + +func (h *host) getPattern(zctx *zed.Context, patternArg string) (*pattern, error) { + p, ok := h.patterns[patternArg] + if !ok { + pat, err := h.Host.Compile(patternArg) + if err != nil { + return nil, err + } + var fields []zed.Field + for _, name := range pat.Names() { + fields = append(fields, zed.NewField(name, zed.TypeString)) + } + typ, err := zctx.LookupTypeRecord(fields) + if err != nil { + return nil, err + } + p = &pattern{Pattern: pat, typ: typ} + h.patterns[patternArg] = p + } + return p, nil +} + +type pattern struct { + *grok.Pattern + typ zed.Type +} diff --git a/runtime/expr/function/ztests/grok.yaml b/runtime/expr/function/ztests/grok.yaml new file mode 100644 index 0000000000..4b088a6ce3 --- /dev/null +++ b/runtime/expr/function/ztests/grok.yaml @@ -0,0 +1,38 @@ +script: | + zq -z 'grok(pattern, field)' simple.zson + echo "// ===" + echo '"0-1-2"' | zq -z -I patterns.zed - + echo "// ===" + # Ignores type annotation. + echo '"0"' | zq -z 'grok("%{INT:int:int64}", this)' - + echo "// ===" + # Check to see that duplicate fields are squashed. This is not great but + # this is what grokconstructor.appspot.com does. 
+ zq -z 'grok("%{INT:one} %{INT:one}", "1 2")' + echo "// ===" + echo '"string value"' | zq -z 'grok("%{INT:int}", this)' - + +inputs: + - name: simple.zson + data: | + { + field: "2020-09-16T04:20:42.45+01:00 DEBUG This is a sample debug log message", + pattern: "%{TIMESTAMP_ISO8601:event_time} %{LOGLEVEL:log_level} %{GREEDYDATA:log_message}" + } + - name: patterns.zed + data: | + const pattern = "ONE \\d\n" + "TWO %{ONE:one}-%{ONE:two}" + yield grok("%{ONE:zero}-%{TWO:three}", this, pattern) + +outputs: + - name: stdout + data: | + {event_time:"2020-09-16T04:20:42.45+01:00",log_level:"DEBUG",log_message:"This is a sample debug log message"} + // === + {zero:"0",three:"1-2",one:"1",two:"2"} + // === + {int:"0"} + // === + {one:"2"} + // === + error({message:"grok(): value does not match pattern",on:"string value"})