Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

LIBBEAT: Enhancement Convert dissected values from String to other basic data types and IP #18683

Merged
merged 31 commits into from
Jul 13, 2020
Merged
Show file tree
Hide file tree
Changes from 28 commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
b045e56
Dissect processor - Convert strings to data types specified in tokenizer
premendrasingh May 12, 2020
2fac969
Add benchmark test cases
premendrasingh May 14, 2020
335c214
Merge remote-tracking branch 'beats_upstream/master' into dissect_typ…
premendrasingh May 14, 2020
2e6230c
Update benchmark tests
premendrasingh May 16, 2020
0260ec7
Updated benchmark tests to one value and multiple value conversions
premendrasingh May 16, 2020
1f5ee5e
convert to .string, output error
premendrasingh May 16, 2020
5e89636
Add benchmark to demonstrate degradation
premendrasingh May 21, 2020
f974066
Add documentation for data conversion
premendrasingh May 21, 2020
43dd28c
Merge remote-tracking branch 'beats_upstream/master' into dissect_typ…
premendrasingh May 21, 2020
ac44544
Update example to include data conversion
premendrasingh May 21, 2020
d6ce235
Update test case
premendrasingh May 21, 2020
44c6241
Update change log to include this PR 18683
premendrasingh May 21, 2020
7fc0caf
formatting issue fixed
premendrasingh May 21, 2020
f4db1ea
Merge remote-tracking branch 'beats_upstream/master' into dissect_typ…
premendrasingh Jun 1, 2020
b9254da
Merge remote-tracking branch 'beats_upstream/master' into dissect_typ…
premendrasingh Jun 3, 2020
7903328
Remove line from CHANGELOG.next.asciidoc, update config test for inva…
premendrasingh Jun 8, 2020
c9c0831
Move parsing to regexp, move data type after greedy indicator, use co…
premendrasingh Jun 8, 2020
a22150a
Refactor code, check | suffix, to panic for missing data type
premendrasingh Jun 10, 2020
79a443c
Remove [unset], add a new line
premendrasingh Jun 10, 2020
acb1b60
Convert data for indirectField use case, add test case, update existi…
premendrasingh Jun 11, 2020
3f09b10
Remove failure cases from benchmark, since behavior has changed to pa…
premendrasingh Jun 11, 2020
d315233
Move change log to bottom of the list
premendrasingh Jun 12, 2020
b69a672
Revert changes to mage/check.go, change dissect and processor to …
premendrasingh Jun 22, 2020
a54ebac
Merge remote-tracking branch 'beats_upstream/master' into dissect_typ…
premendrasingh Jun 23, 2020
49f15eb
Move change log to bottom of the group
premendrasingh Jun 23, 2020
03e916c
Merge remote-tracking branch 'beats_upstream/master' into dissect_typ…
premendrasingh Jun 24, 2020
da92256
Merge remote-tracking branch 'beats_upstream/master' into dissect_typ…
premendrasingh Jun 26, 2020
d5fcefb
Merge remote-tracking branch 'beats_upstream/master' into dissect_typ…
premendrasingh Jun 30, 2020
d8ffa85
Merge remote-tracking branch 'beats_upstream/master' into dissect_typ…
premendrasingh Jul 3, 2020
af75de8
Move change log to bottom of the list
premendrasingh Jul 3, 2020
e0baf48
Merge remote-tracking branch 'beats_upstream/master' into dissect_typ…
premendrasingh Jul 6, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.next.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -311,6 +311,7 @@ https://github.com/elastic/beats/compare/v7.0.0-alpha2...master[Check the HEAD d
- Add TLS support to Kerberos authentication in Elasticsearch. {pull}18607[18607]
- Upgrade k8s.io/client-go and k8s keystore tests. {pull}18817[18817]
- Add support for multiple sets of hints on autodiscover {pull}18883[18883]
- Add data type conversion in `dissect` processor for converting string values to other basic data types. {pull}18683[18683]

*Auditbeat*

Expand Down
48 changes: 48 additions & 0 deletions libbeat/processors/dissect/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -101,3 +101,51 @@ func TestConfig(t *testing.T) {
}
})
}

func TestConfigForDataType(t *testing.T) {
t.Run("valid data type", func(t *testing.T) {
c, err := common.NewConfigFrom(map[string]interface{}{
"tokenizer": "%{value1|integer} %{value2|float} %{value3|boolean} %{value4|long} %{value5|double}",
"field": "message",
})
if !assert.NoError(t, err) {
return
}

cfg := config{}
err = c.Unpack(&cfg)
if !assert.NoError(t, err) {
return
}
})
t.Run("invalid data type", func(t *testing.T) {
c, err := common.NewConfigFrom(map[string]interface{}{
"tokenizer": "%{value1|int} %{value2|short} %{value3|char} %{value4|void} %{value5|unsigned} id=%{id|xyz} status=%{status|abc} msg=\"%{message}\"",
"field": "message",
})
if !assert.NoError(t, err) {
jsoriano marked this conversation as resolved.
Show resolved Hide resolved
return
}

cfg := config{}
err = c.Unpack(&cfg)
if !assert.Error(t, err) {
return
}
})
t.Run("missing data type", func(t *testing.T) {
c, err := common.NewConfigFrom(map[string]interface{}{
"tokenizer": "%{value1|} %{value2|}",
"field": "message",
})
if !assert.NoError(t, err) {
jsoriano marked this conversation as resolved.
Show resolved Hide resolved
return
}

cfg := config{}
err = c.Unpack(&cfg)
if !assert.Error(t, err) {
return
}
})
}
8 changes: 7 additions & 1 deletion libbeat/processors/dissect/const.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,14 +38,18 @@ var (
indirectAppendPrefix = "&+"
greedySuffix = "->"
pointerFieldPrefix = "*"
dataTypeIndicator = "|"
dataTypeSeparator = "\\|" // Needed for regexp

numberRE = "\\d{1,2}"
alphaRE = "[[:alpha:]]*"

delimiterRE = regexp.MustCompile("(?s)(.*?)%\\{([^}]*?)}")
suffixRE = regexp.MustCompile("(.+?)" + // group 1 for key name
"(" + ordinalIndicator + "(" + numberRE + ")" + ")?" + // group 2, 3 for ordinal
"(" + fixedLengthIndicator + "(" + numberRE + ")" + ")?" + // group 4, 5 for fixed length
"(" + greedySuffix + ")?$") // group 6 for greedy
"(" + greedySuffix + ")?" + // group 6 for greedy
"(" + dataTypeSeparator + "(" + alphaRE + ")?" + ")?$") // group 7,8 for data type separator and data type

defaultJoinString = " "

Expand All @@ -55,4 +59,6 @@ var (
errMixedPrefixIndirectAppend = errors.New("mixed prefix `&+`")
errMixedPrefixAppendIndirect = errors.New("mixed prefix `&+`")
errEmptyKey = errors.New("empty key")
errInvalidDatatype = errors.New("invalid data type")
errMissingDatatype = errors.New("missing data type")
)
106 changes: 105 additions & 1 deletion libbeat/processors/dissect/dissect.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,20 @@

package dissect

import "fmt"
import (
"fmt"
"net"
"strconv"
"strings"

"github.com/elastic/beats/v7/libbeat/common"

"github.com/pkg/errors"
)

// Map represents the keys and their values extracted with the defined tokenizer.
type Map = map[string]string
type MapConverted = map[string]interface{}

// positions represents the start and end position of the keys found in the string.
type positions []position
Expand Down Expand Up @@ -61,6 +71,23 @@ func (d *Dissector) Dissect(s string) (Map, error) {
return d.resolve(s, positions), nil
}

// DissectConvert tokenizes s and returns the extracted keys with their
// values converted to the data types declared in the tokenizer.
// It returns errEmpty for an empty input and errParsingFailure when the
// tokenizer matches nothing.
func (d *Dissector) DissectConvert(s string) (MapConverted, error) {
	if s == "" {
		return nil, errEmpty
	}

	pos, err := d.extract(s)
	if err != nil {
		return nil, err
	}
	if len(pos) == 0 {
		return nil, errParsingFailure
	}

	return d.resolveConvert(s, pos), nil
}

// Raw returns the raw tokenizer used to generate the actual parser.
func (d *Dissector) Raw() string {
return d.raw
Expand Down Expand Up @@ -161,6 +188,35 @@ func (d *Dissector) resolve(s string, p positions) Map {
return m
}

// resolveConvert maps the extracted positions back onto s and builds the
// result map, converting each saveable field's string value to the data
// type declared in the tokenizer (when one was declared). Pointer fields
// are resolved through the lookup table, and reference fields are removed
// from the final result.
func (d *Dissector) resolveConvert(s string, p positions) MapConverted {
	lookup := make(common.MapStr, len(p))
	m := make(Map, len(p))
	mc := make(MapConverted, len(p))
	for _, f := range d.parser.fields {
		pos := p[f.ID()]
		f.Apply(s[pos.start:pos.end], m) // using map[string]string to avoid another set of apply methods
		if !f.IsSaveable() {
			lookup[f.Key()] = s[pos.start:pos.end]
		} else {
			key := f.Key()
			// Indirect fields take their key from a previously captured value.
			if k, ok := lookup[f.Key()]; ok {
				key = k.(string)
			}
			v := m[key] // map access yields the zero value when absent; no blank identifier needed
			if f.DataType() != "" {
				mc[key] = convertData(f.DataType(), v)
			} else {
				mc[key] = v
			}
		}
	}

	for _, f := range d.parser.referenceFields {
		delete(mc, f.Key())
	}
	return mc
}

// New creates a new Dissector from a tokenized string.
func New(tokenizer string) (*Dissector, error) {
p, err := newParser(tokenizer)
Expand All @@ -174,3 +230,51 @@ func New(tokenizer string) (*Dissector, error) {

return &Dissector{parser: p, raw: tokenizer}, nil
}

// strToInt is a helper to interpret a string as either base 10 or base 16.
// strToInt is a helper to interpret a string as either base 10 or base 16.
func strToInt(s string, bitSize int) (int64, error) {
	if strings.HasPrefix(s, "0x") || strings.HasPrefix(s, "0X") {
		// ParseInt only honors the 0x/0X prefix when the base is 0.
		return strconv.ParseInt(s, 0, bitSize)
	}
	return strconv.ParseInt(s, 10, bitSize)
}

// transformType converts value (trailing spaces trimmed) to the concrete Go
// type identified by typ. Unrecognized types fall through and return the
// string unchanged. For IP the string itself is returned after validation.
func transformType(typ dataType, value string) (interface{}, error) {
	value = strings.TrimRight(value, " ")
	switch typ {
	case String:
		return value, nil
	case Long:
		return strToInt(value, 64)
	case Integer:
		i, err := strToInt(value, 32)
		return int32(i), err
	case Float:
		f, err := strconv.ParseFloat(value, 32)
		return float32(f), err
	case Double:
		// ParseFloat with bitSize 64 already yields a float64; the previous
		// float64(d) conversion was redundant.
		return strconv.ParseFloat(value, 64)
	case Boolean:
		return strconv.ParseBool(value)
	case IP:
		if net.ParseIP(value) != nil {
			return value, nil
		}
		return "", errors.New("value is not a valid IP address")
	default:
		return value, nil
	}
}

// convertData converts b to the data type named by typ. When the type name
// is unknown or the conversion fails, the original string is returned.
func convertData(typ string, b string) interface{} {
	dt, ok := dataTypeNames[typ]
	if !ok {
		return b
	}
	converted, err := transformType(dt, b)
	if err != nil {
		return b
	}
	return converted
}
Loading