Skip to content

Commit

Permalink
feat: improve placeholder replacement of byte sizes (#13508)
Browse files Browse the repository at this point in the history
  • Loading branch information
na-- committed Jul 12, 2024
1 parent 07c3c76 commit ac284ca
Show file tree
Hide file tree
Showing 2 changed files with 53 additions and 2 deletions.
41 changes: 41 additions & 0 deletions pkg/pattern/tokenization/replacer.go
Original file line number Diff line number Diff line change
Expand Up @@ -314,6 +314,10 @@ restore: // should be faster than a defer
return false
}

// byteSizes is a lookup table, indexed by byte value, of the magnitude
// prefixes (kilo through peta, in either case) that may start a byte size
// unit.
//
// The plain byte letters 'b' and 'B' are deliberately left out: given how the
// unit check below works, including them would make suffixes such as 'Bb' or
// 'bb' count as valid byte sizes. In addition, a size expressed in plain bytes
// is only accepted when the number is an integer, so that form is handled
// through dedicated special cases rather than through this table.
var byteSizes = [256]bool{
	'k': true, 'K': true,
	'm': true, 'M': true,
	'g': true, 'G': true,
	't': true, 'T': true,
	'p': true, 'P': true,
}

// Only moves the head forward if it successfully matches a duration
Expand All @@ -339,6 +343,22 @@ func (r *replacer) advanceBytesize(c1 byte) (matched bool) {
return false
}

// advanceSpacedBytesize tries to match a byte size unit that is separated
// from its number by a single space (e.g. the "GiB" in "3 GiB"). Callers
// invoke it once they have seen that space.
//
// canBeBytes controls whether a lone 'b' or 'B' followed by a boundary is
// accepted as a unit; callers pass false for decimal numbers.
//
// It reports whether a unit was matched. When no unit is matched after a
// character was successfully read, r.backtrack() is called before returning.
func (r *replacer) advanceSpacedBytesize(canBeBytes bool) (matched bool) {
	// Read the first character following the space; bail out if there is none.
	unitStart, ok := r.advance()
	if !ok {
		return false
	}

	switch {
	// A bare "b"/"B" is only a valid unit when the caller allows plain bytes
	// and the letter is immediately followed by a boundary.
	case canBeBytes && (unitStart == 'b' || unitStart == 'B') && r.peekNextIsBoundary():
		return true

	// Otherwise defer to the regular unit matcher.
	case r.advanceBytesize(unitStart):
		return true
	}

	// Nothing matched: undo the advance() performed above.
	r.backtrack()
	return false
}

func (r *replacer) advance() (c byte, advanced bool) {
if r.head >= len(r.source) {
return 0, false
Expand Down Expand Up @@ -394,6 +414,14 @@ func (r *replacer) handleHexOrUnit(hasMinusPrefix bool, n1, l1 uint, c1 byte) (e
c1 = r.peekFirstNonInt()
}

// Special case, this might be a byte size
if (c1 == 'b' || c1 == 'B') && r.peekNextIsBoundary() {
// We do not subsume a minus sign - byte sizes are unlikely to be
// negative, it's more likely this is a dash as a part of a range
r.emit(hasMinusPrefix, placeholderBytesize)
return true
}

// Maybe we are at the start of a hex string, either something like
// "[0-9]+[a-f]", "[0-9]+[A-F]", or "0x". We support both lower and upper
// case letters, but to avoid false positives, we want hex replacements to
Expand Down Expand Up @@ -489,6 +517,14 @@ func (r *replacer) handleNumberWithDecimal(hasMinusPrefix bool, n1 uint, l1 uint
return r.handlePotentialUnitWithDecimal(hasMinusPrefix, b2)
}

// This can be a byte size with a space, e.g. "3.14 GiB"
if b2 == ' ' && r.advanceSpacedBytesize(false) {
// We do not subsume a minus sign - byte sizes are unlikely to be
// negative, it's more likely this is a dash as a part of a range
r.emit(hasMinusPrefix, placeholderBytesize)
return true
}

// We have a decimal number followed by a non-dot boundary, so this is not
// an IP or a version number or anything like that.
if b2 != '.' {
Expand Down Expand Up @@ -633,6 +669,11 @@ func (r *replacer) handleNumberStart(hasMinusPrefix bool) (endsWithBoundary bool
case n1 <= maxYear && l1 <= 4 && (b1 == '-' || b1 == '/'):
return r.handleSaneTimestamp(hasMinusPrefix, n1, b1)

// This might be a byte size with a space, e.g. "2 b", "3 GiB"
case b1 == ' ' && r.advanceSpacedBytesize(true):
r.emit(hasMinusPrefix, placeholderBytesize)
return true

// Weird RFC822 dates like "02 Jan 06 15:04 MST"
case n1 <= 31 && l1 <= 2 && b1 == ' ':
if r.advanceMonthName() && r.advanceChar(' ') && r.advanceYear() && r.advanceChar(' ') && r.advanceTime(true) && r.advanceStringOrNumericTimeZone(false) {
Expand Down
14 changes: 12 additions & 2 deletions pkg/pattern/tokenization/tokenization_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -145,8 +145,11 @@ var tokenizationCornerTestCases = []tokenizationTestCase{
[]string{"<NUM>.<DURATION>", "3h121m3.<DURATION>", "1h0.<DURATION>", "100usa", "0.12msa"},
},
{
"2Mib 0.12KB-5GB 3.12kb 123Gbps 124mbit:512Tbit",
[]string{"<BYTESIZE>", "<BYTESIZE>-<BYTESIZE>", "<BYTESIZE>", "<BYTESIZE>", "<BYTESIZE>:<BYTESIZE>"},
// We only consider integers to be valid bytesizes in bytes (0.2B doesn't make sense)
"2Mib 0.12KB-5GB 3.12kb 123Gbps 124mbit:512Tbit 5 B;124.1 KB/3b - 2b or 2 BeNot 13.37 b 3 b",
[]string{
"<BYTESIZE>", "<BYTESIZE>-<BYTESIZE>", "<BYTESIZE>", "<BYTESIZE>", "<BYTESIZE>:<BYTESIZE>",
"<BYTESIZE>;<BYTESIZE>/<BYTESIZE>", "-", "<BYTESIZE>", "or", "<NUM>", "BeNot", "<NUM>", "b", "<BYTESIZE>"},
},
{
`status=123 status_code:500 status 200 status="-1" status_code:"404" httpStatus=200`,
Expand Down Expand Up @@ -175,6 +178,13 @@ var tokenizationRealisticTestCases = []tokenizationTestCase{
"level=debug", "ts=<TIMESTAMP>", "caller=shard_resolver.go:<NUM>", "bytes=<BYTESIZE>", "chunks=<NUM>", "streams=<NUM>", "entries=<NUM>", `msg="queried index"`, "type=single", `matchers="{stream=\"stdout\", pod=\"loki-canary-v75j4\"}"`, "duration=<DURATION>", "from=<TIMESTAMP>", "through=<TIMESTAMP>", "length=<DURATION>",
},
},
// tricky loki distributor message:
{
`level=debug ts=2024-07-12T12:25:06.175464934Z caller=push.go:146 org_id=29 traceID=7af4f918eab1c80f msg="push request parsed" path=/loki/api/v1/push contentType=application/x-protobuf contentEncoding= bodySize="8.8 kB" streams=11 entries=43 streamLabelsSize="3.4 kB" entriesSize="19 kB" structuredMetadataSize="71 B" totalSize="22 kB" mostRecentLagMs=167 adaptiveLogsDroppedLines=10 adaptiveLogsDroppedSize=4965 adaptiveLogsMatchedLines=37`,
[]string{
"level=debug", "ts=<TIMESTAMP>", "caller=push.go:<NUM>", "org_id=<NUM>", "traceID=<HEX>", `msg="push request parsed"`, "path=/loki/api/v1/push", "contentType=application/x-protobuf", "contentEncoding=", `bodySize="<BYTESIZE>"`, "streams=<NUM>", "entries=<NUM>", `streamLabelsSize="<BYTESIZE>"`, `entriesSize="<BYTESIZE>"`, `structuredMetadataSize="<BYTESIZE>"`, `totalSize="<BYTESIZE>"`, "mostRecentLagMs=<NUM>", "adaptiveLogsDroppedLines=<NUM>", "adaptiveLogsDroppedSize=<NUM>", "adaptiveLogsMatchedLines=<NUM>",
},
},
// random JSON logs
{
`{"timestamp": "2022-12-23T12:34:56Z", "level": "debug", "message": "Server starting", "server_id": "abcdefghij", "start_time": "2022-12-23T12:30:00Z"}`,
Expand Down

0 comments on commit ac284ca

Please sign in to comment.