diff --git a/pkg/pattern/tokenization/replacer.go b/pkg/pattern/tokenization/replacer.go
new file mode 100644
index 0000000000000..35d9259236be2
--- /dev/null
+++ b/pkg/pattern/tokenization/replacer.go
@@ -0,0 +1,859 @@
+package tokenization
+
+import (
+	"bytes"
+	"slices"
+)
+
+var (
+	placeholderNumber    = []byte("<NUM>")
+	placeholderHex       = []byte("<HEX>")
+	placeholderUUID      = []byte("<UUID>")
+	placeholderTimestamp = []byte("<TIMESTAMP>")
+	placeholderDuration  = []byte("<DURATION>")
+	placeholderBytesize  = []byte("<BYTESIZE>")
+	placeholderIP        = []byte("<IP>")
+)
+
+// Integer numbers after these words won't be replaced by `<NUM>`.
+var numPreservingKeys = [][]byte{[]byte("status"), []byte("status_code"), []byte("httpStatus")}
+
+const (
+	maxYear   = 2100 // anything above that is unlikely to be a timestamp...
+	minHexLen = 12   // 48 bits min for free-standing hex strings (e.g. "0123456789ab") or 40 bits for "0xABCDEF1234" strings
+)
+
+var boundaryChars = [256]bool{}
+
+func init() {
+	for i := 0; i < 128; i++ { // for now, we don't consider non-ASCII chars as boundary
+		if i < '0' || (i > '9' && i < 'A') || (i > 'Z' && i < 'a') || i > 'z' {
+			boundaryChars[i] = true
+		}
+	}
+
+	// We need these keys sorted by ascending length to efficiently compare them.
+	slices.SortStableFunc(numPreservingKeys, func(a, b []byte) int {
+		return len(a) - len(b)
+	})
+}
+
+type replacer struct {
+	source, dest []byte
+
+	// This is the last position that was copied to the destination buffer.
+	tail int
+	// This is the current position we are working with
+	cur int
+	// This is essentially used for lookahead operations
+	head int
+
+	// Here's some ASCII art to visualize that (the long string of dashes
+	// visualizes the log line):
+	//
+	// 0                                          `tail`                                      `cur`                `head`                          `len(source)`
+	// |---------------------------------------------|------------------------------------------|---------------------|------------------------------------------|
+
+	// A somewhat hacky solution that allows us to preserve specific numeric
+	// values we care about, like status and status_code.
+	preserveNextNumbers bool
+}
+
+// commit advances the current position marker to the lookahead position, i.e.
+// we are committing to consume everything we've looked ahead so far.
+func (r *replacer) commit() {
+	r.cur = r.head
+}
+
+func (r *replacer) resetHead() {
+	r.head = r.cur
+}
+
+func (r *replacer) resetHeadExpr() bool {
+	r.head = r.cur
+	return true // useful when included in boolean expressions with advance()
+}
+
+func (r *replacer) backtrack() {
+	r.head--
+}
+
+func (r *replacer) consumeUpToCurrent() {
+	r.tail = r.cur
+}
+
+// advanceUintRet returns the actual value of the number we read, as well as its
+// length. The value might overflow for large numbers, so it's also important to
+// check the length!
+func (r *replacer) advanceUintRet() (val uint, length uint) {
+	var c byte
+	for ; r.head < len(r.source); r.head++ {
+		c = r.source[r.head]
+		if c < '0' || '9' < c {
+			break
+		}
+		length++
+		val = val*10 + uint(c-'0') // TODO: use bitwise trick?
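+		// (The bitwise trick hinted at above would be something like
+		// (val<<3)+(val<<1)+uint(c&0x0F), since ASCII digits are 0x30-0x39;
+		// modern compilers usually apply this strength reduction on their
+		// own, so any gain here is speculative.)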
+ } + + return val, length +} + +func (r *replacer) advanceUintOrRangeRet(lc, hc byte) (length uint) { + var c byte + for ; r.head < len(r.source); r.head++ { + c = r.source[r.head] + if !(('0' <= c && c <= '9') || (lc <= c && c <= hc)) { + break + } + length++ + } + + return length +} + +func (r *replacer) advanceUint() bool { + _, l := r.advanceUintRet() + return l > 0 +} + +func (r *replacer) advanceUintUpTo(val uint, length uint) bool { + foundVal, foundLength := r.advanceUintRet() + return foundLength > 0 && foundVal <= val && foundLength <= length +} + +func (r *replacer) advanceYear() bool { + return r.advanceUintUpTo(maxYear, 4) +} + +func (r *replacer) advanceChar(c byte) bool { + if r.head < len(r.source) && r.source[r.head] == c { + r.head++ + return true + } + return false +} + +func (r *replacer) peekNextIsBoundary() bool { + if r.head >= len(r.source) { + return true // we are at the end of the line + } + return boundaryChars[r.source[r.head]] +} + +func (r *replacer) peekFirstNonInt() (c byte) { + for i := r.head; i < len(r.source); i++ { + c = r.source[i] + if c < '0' || '9' < c { + return c + } + } + return 0 // we can return the 0 value here! +} + +func (r *replacer) peekNext4() (result [4]byte) { + overhead := len(r.source) - r.head + switch { + case overhead > 3: + result[3] = r.source[r.head+3] + fallthrough + case overhead > 2: + result[2] = r.source[r.head+2] + fallthrough + case overhead > 1: + result[1] = r.source[r.head+1] + fallthrough + case overhead > 0: + result[0] = r.source[r.head+0] + } + return result +} + +func (r *replacer) advanceTimeZoneLetters() bool { + UCLetters := 0 + for { + nc, ok := r.advance() + if !ok { + break + } + if nc < 'A' || nc > 'Z' { + r.backtrack() + break + } + UCLetters++ + } + + return UCLetters >= 2 && UCLetters <= 5 +} + +func (r *replacer) advanceNumericTimeZone() bool { + // See https://en.wikipedia.org/wiki/List_of_tz_database_time_zones + return r.advanceOneOf('+', '-') && r.advanceUintUpTo(14000, 5) +} + +// helper for handleWeirdTimestamp() +func (r *replacer) advanceStringOrNumericTimeZone(isOptional bool) bool { + curHead := r.head + if r.advanceChar(' ') && r.advanceNumericTimeZone() { + return true + } + r.head = curHead + if r.advanceChar(' ') && r.advanceTimeZoneLetters() { + return true + } + r.head = curHead + return isOptional +} + +func (r *replacer) advanceOneOf(chars ...byte) bool { + if r.head >= len(r.source) { + return false + } + c := r.source[r.head] + + for _, ec := range chars { + if c == ec { + r.head++ + return true + } + } + return false +} + +func (r *replacer) advanceTime(secondsOptional bool) bool { + return r.advanceUintUpTo(24, 2) && r.advanceChar(':') && r.advanceUintUpTo(60, 2) && (secondsOptional || (r.advanceChar(':') && r.advanceUintUpTo(60, 2))) +} + +// Mon|Tue|Wed|Thu|Fri|Sat|Sun +func (r *replacer) advanceDayOfTheWeek() bool { + c1, ok1 := r.advance() + c2, ok2 := r.advance() + c3, ok3 := r.advance() + if !ok1 || !ok2 || !ok3 { + return false + } + return (c1 == 'S' && ((c2 == 'a' && c3 == 't') || (c2 == 'u' && c3 == 'n'))) || // Sat, Sun + (c1 == 'M' && c2 == 'o' && c3 == 'n') || + (c1 == 'T' && c2 == 'u' && c3 == 'e') || + (c1 == 'W' && c2 == 'e' && c3 == 'd') || + (c1 == 'T' && c2 == 'h' && c3 == 'u') || + (c1 == 'F' && c2 == 'r' && c3 == 'i') +} + +// Jan|Feb|Mar|Apr|May|Jul|Jun|Aug|Sep|Oct|Nov|Dec +func (r *replacer) advanceMonthName() bool { + c1, ok1 := r.advance() + c2, ok2 := r.advance() + c3, ok3 := r.advance() + if !ok1 || !ok2 || !ok3 { + return false + } + + return (c1 == 
'J' && ((c2 == 'u' && (c3 == 'n' || c3 == 'l')) || // Jun, Jul
+		(c2 == 'a' && c3 == 'n'))) || // Jan
+		(c1 == 'M' && c2 == 'a' && (c3 == 'r' || c3 == 'y')) || // Mar, May
+		(c2 == 'e' && ((c1 == 'F' && c3 == 'b') || (c1 == 'S' && c3 == 'p') || (c1 == 'D' && c3 == 'c'))) || // Feb, Sep, Dec
+		(c1 == 'A' && ((c2 == 'p' && c3 == 'r') || // Apr
+			(c2 == 'u' && c3 == 'g'))) || // Aug
+		(c1 == 'O' && c2 == 'c' && c3 == 't') || // Oct
+		(c1 == 'N' && c2 == 'o' && c3 == 'v') // Nov
+}
+
+// Check if we are in the middle of a UUID, exactly after the first 8 characters
+// and the dash after them, e.g. after "abcd0123-":
+func (r *replacer) advanceUUIDAfterFirstDash(lc, hc byte) (endsWithBoundary bool) {
+	return (r.advanceUintOrRangeRet(lc, hc) == 4) && r.advanceChar('-') &&
+		(r.advanceUintOrRangeRet(lc, hc) == 4) && r.advanceChar('-') &&
+		(r.advanceUintOrRangeRet(lc, hc) == 4) && r.advanceChar('-') &&
+		(r.advanceUintOrRangeRet(lc, hc) == 12) && r.peekNextIsBoundary()
+}
+
+// Only moves the head forward if it successfully matches a duration
+func (r *replacer) advanceDuration() (matched bool) {
+	curHead := r.head
+
+	var secsLen int
+	n := r.peekNext4()
+	if n[0] == 'h' {
+		r.head++
+		if boundaryChars[n[1]] {
+			return true // e.g. "1h", "123h"
+		}
+		if !r.advanceUintUpTo(60, 2) {
+			goto restore
+		}
+		n = r.peekNext4()
+	}
+	if n[0] == 'm' && (boundaryChars[n[1]] || n[1] != 's') { // we don't want to match 'ms' here
+		r.head++
+		if boundaryChars[n[1]] {
+			return true // e.g. "2h21m", "121m"
+		}
+		if !(r.advanceUintUpTo(60, 2) && ((r.advanceChar('.') && r.advanceUint()) || true)) {
+			goto restore
+		}
+		n = r.peekNext4()
+	}
+
+	if n[0] == 's' && boundaryChars[n[1]] {
+		secsLen = 1
+	} else if n[1] == 's' && (n[0] == 'm' || n[0] == 'n' || n[0] == 'u') && boundaryChars[n[2]] {
+		secsLen = 2
+	} else if n[2] == 's' && ((n[0] == 0xC2 && n[1] == 0xB5) || (n[0] == 0xCE && n[1] == 0xBC)) && boundaryChars[n[3]] {
+		// This checks for the unicode "µs" (U+00B5 = micro symbol) and "μs" (U+03BC = Greek letter mu)
+		secsLen = 3
+	} else {
+		goto restore
+	}
+
+	r.head += secsLen
+	return true
+
+restore: // should be faster than a defer
+	r.head = curHead
+	return false
+}
+
+var byteSizes = [256]bool{'k': true, 'K': true, 'm': true, 'M': true, 'g': true, 'G': true, 't': true, 'T': true, 'p': true, 'P': true}
+
+// Only moves the head forward if it successfully matches a byte size
+func (r *replacer) advanceBytesize(c1 byte) (matched bool) {
+	if !byteSizes[c1] {
+		return false
+	}
+	n := r.peekNext4()
+
+	var unitLen int // not counting the first character c1, which is already advanced to
+	if (n[0] == 'b' || n[0] == 'B') && boundaryChars[n[1]] {
+		unitLen = 1
+	} else if n[0] == 'i' && (n[1] == 'b' || n[1] == 'B') && boundaryChars[n[2]] {
+		unitLen = 2
+	} else if ((n[0] == 'b' && n[1] == 'p' && n[2] == 's') || (n[0] == 'b' && n[1] == 'i' && n[2] == 't')) && boundaryChars[n[3]] {
+		unitLen = 3
+	}
+
+	if unitLen > 0 {
+		r.head += unitLen
+		return true
+	}
+	return false
+}
+
+func (r *replacer) advance() (c byte, advanced bool) {
+	if r.head >= len(r.source) {
+		return 0, false
+	}
+
+	c = r.source[r.head]
+	r.head++
+	return c, true
+}
+
+func (r *replacer) emitNumber() {
+	r.commit()
+	r.dest = append(r.dest, placeholderNumber...)
+	r.consumeUpToCurrent()
+}
+
+func (r *replacer) emitNumberOrCopyText(hasMinusPrefix bool) {
+	r.commit()
+	if !r.preserveNextNumbers {
+		r.dest = append(r.dest, placeholderNumber...)
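+		// (note: any minus sign in front of the number was already skipped by
+		// the caller and is deliberately not re-emitted in this branch, i.e.
+		// it is subsumed by the placeholder)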
+ r.consumeUpToCurrent() + } else { + r.maybeEmitDash(hasMinusPrefix) + r.copyUpToCurrent() + } +} + +func (r *replacer) emit(hasMinusPrefix bool, placeholder []byte) { + r.commit() + r.maybeEmitDash(hasMinusPrefix) + r.dest = append(r.dest, placeholder...) + r.consumeUpToCurrent() +} + +func (r *replacer) maybeEmitDash(hasMinusPrefix bool) { + // This minus was actually a dash, so we just copy it to the result + if hasMinusPrefix { + r.dest = append(r.dest, '-') + } +} + +func (r *replacer) copyUpToCurrent() { + r.dest = append(r.dest, r.source[r.tail:r.cur]...) + r.consumeUpToCurrent() +} + +func (r *replacer) handleHexOrUnit(hasMinusPrefix bool, n1, l1 uint, c1 byte) (endsWithBoundary bool) { + // Special case that is likely a hex string of the format "0x", but we don't + // know whether the rest is upper case or lower case yet. + zeroHex := false + if n1 == 0 && l1 == 1 && c1 == 'x' { + zeroHex = true // these cannot be the start of an UUID + c1 = r.peekFirstNonInt() + } + + // Maybe we are at the start of a hex string, either something like + // "[0-9]+[a-f]", "[0-9]+[A-F]", or "0x". We support both lower and upper + // case letters, but to avoid false positives, we want hex replacements to + // happen only if the strings are fully lower case or fully upper case. + if 'a' <= c1 && c1 <= 'f' { + return r.handleHex(hasMinusPrefix, l1+1, 'a', 'f', !zeroHex) + } else if 'A' <= c1 && c1 <= 'F' { + return r.handleHex(hasMinusPrefix, l1+1, 'A', 'F', !zeroHex) + } else if zeroHex { + // Well, it probably wasn't a zero-hex after all, or it contained only + // digits, so try to handle that or emit what we absorbed + _, l2 := r.advanceUintRet() + if l2+2 >= minHexLen { // We consider "0x" to be part of the hex string when comparing with minHexLen + r.emit(hasMinusPrefix, placeholderHex) + } else { + r.maybeEmitDash(hasMinusPrefix) + r.commit() + r.copyUpToCurrent() + } + return r.peekNextIsBoundary() + } + + return r.handlePotentialUnitWithDecimal(hasMinusPrefix, c1) +} + +func (r *replacer) handleHex(hasMinusPrefix bool, l1 uint, lc, hc byte, canBeUUID bool) (endsWithBoundary bool) { + totalLen := l1 + r.advanceUintOrRangeRet(lc, hc) + r.commit() + + if totalLen == 8 && canBeUUID { + // We might be at the first part of a UUID, right before the first dash + if r.advanceChar('-') && r.advanceUUIDAfterFirstDash(lc, hc) { + r.emit(hasMinusPrefix, placeholderUUID) + return true + } + r.resetHead() + } + + if totalLen >= minHexLen && r.peekNextIsBoundary() { + r.emit(hasMinusPrefix, placeholderHex) + return true + } + + r.copyUpToCurrent() + return r.peekNextIsBoundary() +} + +func (r *replacer) handlePotentialUnitWithDecimal(hasMinusPrefix bool, c1 byte) (endsWithBoundary bool) { + if r.advanceBytesize(c1) { + // We do not subsume a minus sign - byte sizes are unlikely to be + // negative, it's more likely this is a dash as a part of a range + r.emit(hasMinusPrefix, placeholderBytesize) + return true + } + + r.backtrack() + if r.advanceDuration() { + // We subsume hasMinusPrefix, since durations can be negative + r.emit(false, placeholderDuration) + return true + } + + // We couldn't match anything, so just copy what existed. + r.maybeEmitDash(hasMinusPrefix) + r.copyUpToCurrent() + return false +} + +func (r *replacer) handleNumberWithDecimal(hasMinusPrefix bool, n1 uint, l1 uint) (endsWithBoundary bool) { + n2, l2 := r.advanceUintRet() + if l2 == 0 { + // The dot wasn't followed by another number, so emit everything before it. 
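+		// (e.g. for "12. " we emit the number placeholder, or the literal
+		// digits after a number-preserving key, and the dot itself is left
+		// for the following passes to copy verbatim)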
+		r.backtrack()
+		r.emitNumberOrCopyText(hasMinusPrefix)
+		return false
+	}
+
+	// See if the number after the decimal is also followed by a boundary
+	b2, hasNext := r.advance()
+
+	// We are at the end of the string, which is a boundary, so replace
+	// everything up to now with a number
+	if !hasNext {
+		r.emitNumber() // this also subsumes any minus sign we had
+		return true
+	}
+
+	// The decimal number isn't followed by a boundary char (boundary chars
+	// include things like '.', ':', '/', etc.), so the part after the decimal
+	// is either not a real number, or it's some sort of a unit that can
+	// support decimals, like a size (e.g. 12KB, 3MiB) or a duration (e.g.
+	// 3.5124s), etc.
+	if !boundaryChars[b2] {
+		return r.handlePotentialUnitWithDecimal(hasMinusPrefix, b2)
+	}
+
+	// We have a decimal number followed by a non-dot boundary, so this is not
+	// an IP or a version number or anything like that.
+	if b2 != '.' {
+		r.backtrack()
+		r.emitNumber()
+		return true
+	}
+
+	n3, l3 := r.advanceUintRet()
+	if l3 == 0 || !r.peekNextIsBoundary() {
+		// The second dot wasn't followed by another number and a boundary, so
+		// emit just the first number.
+		r.resetHead()
+		r.emitNumber()
+		return true
+	}
+
+	// We have "<NUM>.<NUM>.<NUM>" at this point, so we either have something
+	// like a version number, or an IP address, but certainly not a simple
+	// decimal number we can just emit.
+	r.commit()
+
+	// Check if this is an IP address...
+	if n1 <= 255 && l1 <= 3 && n2 <= 255 && l2 <= 3 && n3 <= 255 && l3 <= 3 && r.advanceChar('.') && r.advanceUintUpTo(255, 3) && r.peekNextIsBoundary() {
+		r.emit(hasMinusPrefix, placeholderIP)
+		return true
+	}
+
+	// This wasn't an IP after all, so just emit "<NUM>.<NUM>.<NUM>". We don't
+	// want to assume this is a simple decimal number because of the second
+	// dot, e.g. this might be something like a version number.
+	r.resetHead()
+	r.maybeEmitDash(hasMinusPrefix) // we preserve the dashes here, since it's likely not a minus
+	r.dest = append(r.dest, placeholderNumber...)
+	r.dest = append(r.dest, '.')
+	r.dest = append(r.dest, placeholderNumber...)
+	r.dest = append(r.dest, '.')
+	r.dest = append(r.dest, placeholderNumber...)
+	r.consumeUpToCurrent()
+	return true
+}
+
+func (r *replacer) handleSaneTimestamp(hasMinusPrefix bool, n1 uint, delim byte) (endsWithBoundary bool) {
+	if r.advanceUintUpTo(12, 2) && r.advanceChar(delim) && r.advanceUintUpTo(31, 2) && r.advanceOneOf('T', ' ') && r.advanceTime(false) {
+		r.commit()
+		// continue down to parsing sub-second and timezone values
+	} else if r.resetHeadExpr() && n1 <= 31 && r.advanceChar(delim) && r.advanceMonthName() && r.advanceChar(delim) && r.advanceYear() && r.advanceChar(':') && r.advanceTime(false) && r.advanceChar(' ') && r.advanceNumericTimeZone() && r.peekNextIsBoundary() {
+		// We might not have a sane timestamp, but apparently a not-so-sane
+		// timestamp format first, e.g. "27/Mar/2024:14:34:37 +0000"...
+		r.emit(hasMinusPrefix, placeholderTimestamp)
+		return true
+	} else {
+		// Apparently that wasn't it either, we probably just have a dash or
+		// forward slash after a number, so we backtrack and emit the number.
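+		// (e.g. inputs like a range "20-39" or a fraction "31/365" land here)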
+		r.resetHead()
+		r.emitNumberOrCopyText(hasMinusPrefix)
+		return true
+	}
+
+	// We have a date that looks like `YY[YY]-MM-DD hh:mm:ss` or
+	// `YY[YY]/MM/DDZhh:mm:ss`
+	r.commit() // we want to keep this
+
+	// Now we need to also check for sub-second precision and time zones:
+	c, canAdvance := r.advance()
+	if !canAdvance {
+		// End of the string
+		r.emit(hasMinusPrefix, placeholderTimestamp)
+		return true
+	}
+	if c == '.' {
+		_, intl := r.advanceUintRet()
+		if intl == 0 {
+			// No sub-second precision, the dot was not part of the timestamp
+			r.backtrack()
+			r.emit(hasMinusPrefix, placeholderTimestamp)
+			return true
+		}
+
+		// We had sub-second precision, capture that too
+		r.commit()
+
+		// Get the next char to see if we have a time zone
+		c, canAdvance = r.advance()
+		if !canAdvance {
+			// We are at the end of the string after we captured the
+			// sub-second value.
+			r.emit(hasMinusPrefix, placeholderTimestamp)
+			return true
+		}
+	}
+
+	if c == 'Z' || c == 'z' {
+		// UTC string, nothing to do after that
+		r.emit(hasMinusPrefix, placeholderTimestamp)
+		return true
+	}
+	// See https://en.wikipedia.org/wiki/List_of_tz_database_time_zones
+	if (c == '+' || c == '-') && r.advanceUintUpTo(14, 2) && r.advanceChar(':') && r.advanceUintUpTo(60, 2) {
+		// e.g. "2020-09-30T00:00:59.9999+03:00"
+		r.commit()
+	} else if r.resetHeadExpr() && r.advanceChar(' ') && r.advanceNumericTimeZone() && r.advanceChar(' ') && r.advanceTimeZoneLetters() && r.peekNextIsBoundary() {
+		// e.g. "2023-09-05 23:20:28.030285153 +0000 UTC"
+		r.commit()
+	} else {
+		r.resetHead()
+	}
+	r.emit(hasMinusPrefix, placeholderTimestamp)
+	return true
+}
+
+func (r *replacer) handleNumberStart(hasMinusPrefix bool) (endsWithBoundary bool) {
+	r.resetHead() // keep the head pos in sync with the current pos
+	// We know we were at a digit due to how handleNumberStart() is called
+	n1, l1 := r.advanceUintRet()
+	r.commit() // we will consume this one way or another for sure
+
+	// See if the number is followed by a boundary
+	b1, hasNext := r.advance()
+
+	switch {
+	// We are at the end of the string, which is a boundary, so replace
+	// everything up to now with a number
+	case !hasNext:
+		r.emitNumberOrCopyText(hasMinusPrefix) // this also may subsume any minus sign we had
+		return true
+
+	// The number isn't followed by a boundary char (boundary chars include
+	// things like '.', ' ', '/', etc.), so it's either not a real number or
+	// date, or it's some sort of a unit like a duration (e.g. 3h5m2s), size
+	// (e.g. 12KB, 3MiB), etc.
+	case !boundaryChars[b1]:
+		return r.handleHexOrUnit(hasMinusPrefix, n1, l1, b1)
+
+	// We have a decimal point, so this can either be a plain number, a unit
+	// that can have a float value, or an IP address.
+	case b1 == '.':
+		return r.handleNumberWithDecimal(hasMinusPrefix, n1, l1)
+
+	// This might be a timestamp that looks like '2024-04-01...' or
+	// `2024/03/27...`; timestamps in the far future are not supported :)
+	case n1 <= maxYear && l1 <= 4 && (b1 == '-' || b1 == '/'):
+		return r.handleSaneTimestamp(hasMinusPrefix, n1, b1)
+
+	// Weird RFC822 dates like "02 Jan 06 15:04 MST"
+	case n1 <= 31 && l1 <= 2 && b1 == ' ':
+		if r.advanceMonthName() && r.advanceChar(' ') && r.advanceYear() && r.advanceChar(' ') && r.advanceTime(true) && r.advanceStringOrNumericTimeZone(false) {
+			r.commit()
+			r.emit(hasMinusPrefix, placeholderTimestamp)
+			return true
+		}
+		// if not, go to default handler after switch statement
+
+	// It could be a UUID that starts with 8 digits
+	case l1 == 8 && b1 == '-':
+		if r.advanceUUIDAfterFirstDash('a', 'f') || (r.resetHeadExpr() && r.advanceChar('-') && r.advanceUUIDAfterFirstDash('A', 'F')) {
+			r.emit(hasMinusPrefix, placeholderUUID)
+			return true
+		}
+		// if not, go to default handler after switch statement
+	}
+
+	// Number with an unknown boundary - emit the number and leave the boundary
+	// for the following passes.
+	r.resetHead()
+	r.emitNumberOrCopyText(hasMinusPrefix)
+	return true
+}
+
+var longDayNames = [...][]byte{
+	[]byte("Sunday"),
+	[]byte("Monday"),
+	[]byte("Tuesday"),
+	[]byte("Wednesday"),
+	[]byte("Thursday"),
+	[]byte("Friday"),
+	[]byte("Saturday"),
+}
+
+func (r *replacer) handleWeirdTimestamp() (endsWithBoundary bool) {
+	r.resetHead()
+
+	if r.advanceDayOfTheWeek() {
+		r.commit() // we will always consume this
+
+		// RFC1123 and RFC1123Z, e.g.:
+		//  - "Mon, 02 Jan 2006 15:04:05 MST"
+		//  - "Mon, 02 Jan 2006 15:04:05 -0700"
+		if r.advanceChar(',') && r.advanceChar(' ') && r.advanceUintUpTo(31, 2) && r.advanceChar(' ') && r.advanceMonthName() && r.advanceChar(' ') && r.advanceYear() && r.advanceChar(' ') && r.advanceTime(false) && r.advanceStringOrNumericTimeZone(false) {
+			r.emit(false, placeholderTimestamp)
+			return true
+		}
+		r.resetHead()
+
+		// ANSIC, UnixDate, RubyDate, e.g.:
+		//  - "Mon Jan 2 15:04:05 2006"
+		//  - "Mon Jan 2 15:04:05 MST 2006"
+		//  - "Mon Jan 02 15:04:05 -0700 2006"
+		if r.advanceChar(' ') && r.advanceMonthName() && r.advanceChar(' ') && (r.advanceChar(' ') || true) && r.advanceUintUpTo(31, 2) && r.advanceChar(' ') && r.advanceTime(false) && r.advanceStringOrNumericTimeZone(true) && r.advanceChar(' ') && r.advanceYear() {
+			r.emit(false, placeholderTimestamp)
+			return true
+		}
+		r.resetHead()
+
+		// Linux, e.g.:
+		//  - "Mon 2 Jan 15:04:05 MST 2006"
+		//  - "Tue 23 Jan 15:04:05 -0700 2023"
+		if r.advanceChar(' ') && (r.advanceChar(' ') || true) && r.advanceUintUpTo(31, 2) && r.advanceChar(' ') && r.advanceMonthName() && r.advanceChar(' ') && r.advanceTime(false) && r.advanceStringOrNumericTimeZone(false) && r.advanceChar(' ') && r.advanceYear() {
+			r.emit(false, placeholderTimestamp)
+			return true
+		}
+		r.resetHead()
+
+		// RFC850, e.g.
+ // - "Monday, 02-Jan-06 15:04:05 MST" + backtrackedSlice := r.source[r.head-3:] + var matchedDay []byte + for _, dw := range longDayNames { + if bytes.HasPrefix(backtrackedSlice, dw) { + matchedDay = dw + break + } + } + if matchedDay != nil { + r.head += len(matchedDay) - 3 + if r.advanceChar(',') && r.advanceChar(' ') && r.advanceUintUpTo(31, 2) && r.advanceChar('-') && r.advanceMonthName() && r.advanceChar('-') && r.advanceUintUpTo(99, 2) && r.advanceChar(' ') && r.advanceTime(false) && r.advanceStringOrNumericTimeZone(true) { + r.emit(false, placeholderTimestamp) + return true + } + } + + r.cur -= 3 // unconsume + r.resetHead() + return false + } + r.resetHead() + + if r.advanceMonthName() { + r.commit() // provisionally consume this + + // Linux journald logs and others similar like this: + // - Feb 29 23:00:14 + // - Apr-10 23:43:46.807 + // - Jul 1 00:21:28 + if (r.advanceChar('-') || (r.advanceChar(' ') && (r.advanceChar(' ') || true))) && r.advanceUintUpTo(31, 2) && r.advanceChar(' ') && r.advanceTime(false) { + r.commit() + // This is already a timestamp, but let's try to match subseconds as well + if r.advanceChar('.') && r.advanceUint() { + r.commit() + } else { + r.resetHead() + } + r.emit(false, placeholderTimestamp) + return true + } + r.cur -= 3 // unconsume + r.resetHead() + return false + } + r.resetHead() + + return false +} + +func (r *replacer) wasNumPreservingKeyword() bool { + for _, key := range numPreservingKeys { + pPos := r.cur - 1 - len(key) + if pPos < -1 { + return false // all subsequent keys are longer + } + if pPos != -1 && !boundaryChars[r.source[pPos]] { + continue + } + if bytes.HasPrefix(r.source[pPos+1:], key) { + return true + } + } + return false +} + +func (r *replacer) replaceWithPlaceholders() { + lineLen := len(r.source) + + var c byte + onBoundary := true + for r.cur = 0; r.cur < lineLen; r.cur++ { + c = r.source[r.cur] + switch { + // If we're currently not at a boundary, the only thing we need to check + // is whether the current char would create a boundary in the next iteration. + case !onBoundary: + onBoundary = boundaryChars[c] + + // We weren't at a boundary and now we are, so check if we match one + // of the special keywords that will preserve numbers + if onBoundary { + r.preserveNextNumbers = r.wasNumPreservingKeyword() + } + + // If we've reached this far, it means we're currently at a boundary! + + // A lot of very complex logic if we encounter a number at a boundary, + // so we move that to a helper function. + case '0' <= c && c <= '9': + r.copyUpToCurrent() + onBoundary = r.handleNumberStart(false) + + // Handle negative numbers, potentially + case c == '-': + next := r.cur + 1 + // This might be a number, a date, an IP address, etc. So we don't + // know if this is a minus sign to replace or a dash to copy yet. + if next < lineLen && '0' <= r.source[next] && r.source[next] <= '9' { + // Copy everything before the dash, but mark it as consumed. + r.copyUpToCurrent() + r.cur++ + r.consumeUpToCurrent() + onBoundary = r.handleNumberStart(true) + } else { + onBoundary = true + } + + // Try to match weird timestamps. They require a lot of remaining + // length, generally start with a capitalized day of the week or month + // name (1 upper case letter followed by 2 lower case letters). 
+		//
+		// We are basically looking for something that may match this here:
+		// Mon|Tue|Wed|Thu|Fri|Sat|Sun|Jan|Feb|Mar|Apr|May|Jul|Jun|Aug|Sep|Oct|Nov|Dec
+		//
+		// The detailed check would be performed by the actual handler:
+		case 'A' <= c && c <= 'W' && lineLen-r.cur >= 14 &&
+			'a' <= r.source[r.cur+1] && r.source[r.cur+1] <= 'u' &&
+			'b' <= r.source[r.cur+2] && r.source[r.cur+2] <= 'y':
+			r.copyUpToCurrent()
+			onBoundary = r.handleWeirdTimestamp()
+
+		// This could be the start of a lower case hex string:
+		case 'a' <= c && c <= 'f':
+			r.copyUpToCurrent()
+			r.resetHead()
+			onBoundary = r.handleHex(false, 0, 'a', 'f', true)
+
+		// This could be the start of an upper case hex string:
+		case 'A' <= c && c <= 'F':
+			r.copyUpToCurrent()
+			r.resetHead()
+			onBoundary = r.handleHex(false, 0, 'A', 'F', true)
+
+		// If we haven't actually matched anything, update whether we're still
+		// on a boundary character and continue on to the next one.
+		default:
+			onBoundary = boundaryChars[c]
+		}
+	}
+
+	if r.cur > r.tail {
+		r.dest = append(r.dest, r.source[r.tail:]...)
+		r.consumeUpToCurrent()
+	}
+}
+
+func Preprocess(content []byte) []byte {
+	// Preallocate ~120% of the input length, to allow for some expansion from
+	// the replacements, hopefully without needing to allocate more memory.
+	r := replacer{source: content, dest: make([]byte, 0, len(content)*120/100)}
+	r.replaceWithPlaceholders()
+	return r.dest
+}
diff --git a/pkg/pattern/tokenization/tokenization.go b/pkg/pattern/tokenization/tokenization.go
new file mode 100644
index 0000000000000..d4918ccb8e048
--- /dev/null
+++ b/pkg/pattern/tokenization/tokenization.go
@@ -0,0 +1,201 @@
+package tokenization
+
+import (
+	"bytes"
+	"unsafe"
+)
+
+const placeholderEndOfLine = "<...>"
+
+// Outside of quoted strings, these are the delimiters between tokens. However,
+// they are not going to be a part of the tokens themselves and will be replaced
+// by spaces in the actual log template.
+var delimiters = [256]bool{0: true, '\t': true, '\n': true, '\v': true, '\f': true, '\r': true, ' ': true}
+
+type tokenizer struct {
+	// Input
+	rawLine   []byte
+	maxTokens int
+
+	// State and position iterators
+	buf        []byte
+	tpos       int
+	tokenCount int
+	maybeJSON  bool
+
+	// Result values; the values in the `tokens` slice reference `line` and
+	// shouldn't allocate new memory.
+	line   string
+	tokens []string
+}
+
+func (t *tokenizer) countOrSaveToken(endTokenPos, skip int) {
+	if t.tokens != nil {
+		// Intentionally written like this and not with append(), so this can
+		// panic if we ever exceed the preallocated slice size, since that means
+		// we have a nasty bug in handleNextToken() below.
+		t.tokens[t.tokenCount] = t.line[t.tpos:endTokenPos]
+	}
+	t.tokenCount++
+	t.tpos = endTokenPos + skip
+}
+
+func (t *tokenizer) handleNextToken() bool {
+	escaped := false
+	var c, curQuoteChar byte
+	curQuotePos := -1
+
+	lineLen := len(t.line)
+	for p := t.tpos; p < lineLen; p++ {
+		c = t.line[p]
+		switch {
+
+		// If the previous character was a backslash, we ignore the next
+		// character, unless it was a non-token delimiter (space, tab, etc.)
+		// outside of a quoted string.
+		case escaped:
+			if curQuotePos < 0 && delimiters[c] {
+				t.countOrSaveToken(p, 1)
+				return true
+			} else {
+				escaped = false
+			}
+
+		// If we weren't already escaped and we encounter a backslash, set
+		// the escaped flag and ignore the current byte.
+		case c == '\\':
+			escaped = true
+
+		// Non-ASCII / UTF8 / invalid character, consider it a part of the
+		// current token, for lack of a better efficient option...
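+		// (every byte of a multi-byte UTF-8 sequence, including continuation
+		// bytes, is > 127, so a multi-byte rune is consumed byte by byte here
+		// and can never split a token)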
+		case c > 127:
+			// Intentionally blank, continues to the next char
+
+		// If we are currently in a quoted part of the string, the current
+		// character is also part of the current token. The only special case
+		// here is if the current character is a matching quote, in which case
+		// we'll no longer be quoted.
+		case curQuotePos >= 0:
+			if c == curQuoteChar { // end quote
+				curQuotePos = -1
+			}
+
+		// If we encounter a quote character and we were not already in a
+		// quoted part of the line, mark that we are now in a quote of that
+		// type.
+		case c == '"' || c == '\'' || c == '`':
+			curQuoteChar = c
+			curQuotePos = p
+
+		// If we encounter a delimiter outside of a quote, count or save the
+		// token and skip the delimiter.
+		case delimiters[c]:
+			t.countOrSaveToken(p, 1)
+			return true
+
+		// Handle likely JSON object keys that have been serialized without
+		// spaces. For example, something like this:
+		//   `{"context":{"taskId":1},"message":"starting",...}`
+		//
+		// If the line starts or ends with curly braces, we consider it might
+		// be a JSON log and try to detect the `":` part of the message that
+		// isn't followed by a delimiter character. If we find that pattern, we
+		// consider everything up to and including the `:` character as a
+		// separate token.
+		//
+		// Similarly, we try to detect the `,"` pattern and also split a token
+		// before the comma value. The `p > t.tpos` check is crucial here,
+		// because it ensures that we're not at the start of a token, i.e.
+		// there wasn't a delimiter right before the comma.
+		case t.maybeJSON && p > t.tpos && (c == ':' || c == ',') && p+1 < lineLen:
+			if c == ':' && t.line[p-1] == '"' && !delimiters[t.line[p+1]] {
+				t.countOrSaveToken(p+1, 0)
+				return true
+			}
+			if c == ',' && t.line[p+1] == '"' {
+				t.countOrSaveToken(p, 0)
+				return true
+			}
+		}
+
+		// By default we do nothing, simply advance one character forward
+		// because the current character was a part of the current token.
+	}
+
+	// We have an unterminated quote at position `curQuotePos`. To handle this
+	// edge case somewhat gracefully, we can emit everything up to that
+	// unterminated quote and the quote itself as a single token, and continue
+	// fairly normally from there.
+	if curQuotePos > 0 {
+		t.countOrSaveToken(curQuotePos+1, 0)
+		return true
+	}
+
+	if t.tpos < len(t.line) {
+		t.countOrSaveToken(len(t.line), 0)
+		return true
+	}
+
+	return false
+}
+
+// This function is called twice! The first time it counts the tokens but
+// doesn't save them. Afterwards we allocate the tokens return slice with
+// exactly the correct capacity and we call it again, this time to save them.
+func (t *tokenizer) process() {
+	// We want to handle the end of the string as a single token, so we start
+	// the loop from 1.
+	for i := 1; i < t.maxTokens; i++ {
+		if !t.handleNextToken() {
+			break
+		}
+	}
+
+	if t.tpos >= len(t.line) {
+		return
+	}
+
+	// We have a token count greater than or equal to maxTokens, so add one
+	// last token containing placeholderEndOfLine to signal that.
+	if t.tokens != nil {
+		t.tokens[t.tokenCount] = placeholderEndOfLine
+	}
+	t.tokenCount++
+	t.tpos += len(placeholderEndOfLine)
+}
+
+func (t *tokenizer) tokenize() []string {
+	t.buf = Preprocess(t.rawLine)
+
+	// We use unsafe to convert buf to a string without any new allocations.
+	// This is safe because t.buf won't be used or referenced anywhere else
+	// besides here from now on.
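+	// (unsafe.String builds the string header in place over the existing
+	// backing array; if t.buf were mutated later, the mutation would be
+	// visible through t.line and would break Go's string immutability
+	// assumptions, hence the constraint above)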
+ t.line = unsafe.String(unsafe.SliceData(t.buf), len(t.buf)) + + if len(t.line) >= 2 && (t.line[0] == '{' || t.line[len(t.line)-1] == '}') { + t.maybeJSON = true + } + + t.process() + + // If we have super long lines (more than twice the size we need to get the + // maxTokens we want), copy just the part we need so the tokens don't hold a + // reference to the original huge []byte slice. + if t.tokenCount == t.maxTokens && t.tpos*2 < len(t.line) { + t.line = string(t.buf[0 : t.tpos+1]) + } + + t.tokens = make([]string, t.tokenCount) // intentionally like this, see comment in countOrSaveToken() + t.tokenCount = 0 + t.tpos = 0 + t.process() + + return t.tokens +} + +func PreprocessAndTokenize(content []byte) []string { + content = bytes.TrimSpace(content) + + t := tokenizer{rawLine: content, maxTokens: 100} // TODO: parametrize maxTokens + + return t.tokenize() +} diff --git a/pkg/pattern/tokenization/tokenization_benchmark_test.go b/pkg/pattern/tokenization/tokenization_benchmark_test.go new file mode 100644 index 0000000000000..c614f57576ed9 --- /dev/null +++ b/pkg/pattern/tokenization/tokenization_benchmark_test.go @@ -0,0 +1,26 @@ +package tokenization + +import ( + "fmt" + "testing" +) + +var sampleLogBenchCount = 0 + +func BenchmarkTokenizationTestCases(b *testing.B) { + for i, tc := range tokenizationRealisticTestCases { + line := []byte(tc.line) + b.Run(fmt.Sprintf("test-case-%d", i), func(b *testing.B) { + for i := 0; i < b.N; i++ { + PreprocessAndTokenize(line) + } + }) + } +} + +func BenchmarkTokenizationPlayground(b *testing.B) { + line := []byte(`level=info ts=2023-09-06T00:59:59.982171323Z caller=metrics.go:160 component=frontend org_id=29 traceID=4b93729ff3efabd0 latency=fast query="{stream=\"stdout\",pod=\"loki-canary-nl54q\"} " query_hash=1280418884 query_type=limited range_type=range length=20s start_delta=2h54m30.690801022s end_delta=2h54m10.690801238s step=1s duration=13.926955ms status=200 limit=1000 returned_lines=0 throughput=16MB total_bytes=219kB total_bytes_non_indexed_labels=2.1kB lines_per_second=14935 total_lines=208 post_filter_lines=208 total_entries=41 store_chunks_download_time=1.592805ms queue_time=127µs splits=0 shards=0 chunk_refs_fetch_time=3.599883ms cache_chunk_req=1 cache_chunk_hit=1 cache_chunk_bytes_stored=0 cache_chunk_bytes_fetched=480079 cache_chunk_download_time=1.307396ms cache_index_req=0 cache_index_hit=0 cache_index_download_time=0s cache_stats_results_req=1 cache_stats_results_hit=1 cache_stats_results_download_time=361.913µs cache_result_req=0 cache_result_hit=0 cache_result_download_time=0s token_id=gcom-1234`) + for i := 0; i < b.N; i++ { + PreprocessAndTokenize(line) + } +} diff --git a/pkg/pattern/tokenization/tokenization_test.go b/pkg/pattern/tokenization/tokenization_test.go new file mode 100644 index 0000000000000..9e9e4c75fb02c --- /dev/null +++ b/pkg/pattern/tokenization/tokenization_test.go @@ -0,0 +1,376 @@ +package tokenization + +import ( + "fmt" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +type tokenizationTestCase struct { + line string + expResult []string +} + +var tokenizationCornerTestCases = []tokenizationTestCase{ + { + "", + []string{}, + }, + { + " foo ", + []string{"foo"}, + }, + { + "foo bar baz", + []string{"foo", "bar", "baz"}, + }, + { + "\nfoo\t bar baz\r\n", + // TOFIX: remove empty tokens? 
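+		// (countOrSaveToken ends a token at every delimiter, so each run of
+		// consecutive delimiters currently produces empty tokens in between)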
+ []string{"foo", "", "", "bar", "baz"}, + }, + { + "ends single char C", + []string{"ends", "single", "char", "C"}, + }, + { + "0 1 2 3 4 5 6 7 8 9 a b c d e f g h i j k l m n o p q r s t u v w x y z A B C D E F G H I J K L M N O P Q R S T U V W X Y Z Z Y X W V U T S R Q P O N M L K J I H G F E D C B A z y x w v u t s r q p o n m l k j i h g f e d c b a 9 8 7 6 5 4 3 2 1 0", + // The tail end of strings longer than maxTokens is returned as a single token + []string{"", "", "", "", "", "", "", "", "", "", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z", "Z", "Y", "X", "W", "V", "U", "T", "S", "R", "Q", "P", "O", "N", "M", "L", "K", "J", "I", "H", "G", "F", "E", "D", "C", "B", "A", "z", "y", "x", "w", "v", "u", "t", "s", "r", "q", "p", "<...>"}, + }, + { + `a "quoted string"`, + []string{"a", `"quoted string"`}, + }, + { + `a "quoted string with \" escapes!"`, + []string{"a", `"quoted string with \" escapes!"`}, + }, + { + `a 'singly quoted string"'`, + []string{"a", `'singly quoted string"'`}, + }, + { + `a 'singly quoted string" \''`, + []string{"a", `'singly quoted string" \''`}, + }, + { + `a 'singly quoted string" \\\''`, + []string{"a", `'singly quoted string" \\\''`}, + }, + { + `a'twisted excappe\\' "with an unterminated quote" 'at_the_end`, + []string{`a'twisted excappe\\'`, `"with an unterminated quote"`, `'`, "at_the_end"}, + }, + { + `a "quoted string 'inception'"!` + "`woot`'much`wow'", + []string{"a", `"quoted string 'inception'"!` + "`woot`'much`wow'"}, + }, + { + `unterminated"quote`, + []string{`unterminated"`, `quote`}, + }, + { + "`mix`" + ` "and" 'match'`, + []string{"`mix`", `"and"`, `'match'`}, + }, + { + "`mix`" + ` "and" 'match'`, + []string{"`mix`", `"and"`, `'match'`}, + }, + { + `{"json":"uninterrupted \"logline\"","foo":"bar"}`, + []string{`{"json":`, `"uninterrupted \"logline\""`, `,"foo":`, `"bar"}`}, + }, + { + `{"weirdo1":`, + []string{`{"weirdo1":`}, + }, + { + // This and the corner case below are not optimal, though they're + // unlikely to be encountered in the real world, so it's hopefully a + // nice tradeoff for the performance of not needing a full JSON parser. + `{"weirdo2":}`, + []string{`{"weirdo2":`, `}`}, + }, + { + `,"weirdo3":}`, + []string{`,"weirdo3":`, `}`}, + }, + { + // We deliberately do treat "escaped" whitespaces outside of quotes as + // delimiters, i.e. whitespaces outside of strings cannot be escaped. + `weird\ escape`, + []string{`weird\`, `escape`}, + }, + { + "-3.14-foo 0.0.0.0/24-0.0.0.1-255.255.255.255-256.255.255.255 1337-ber 0.12-ber n0tnumb3er 12faux -123.0.1.123 -123 -1231.11 333. 123.456. 
123.45-", + []string{"-foo", "/---...", "-ber", "-ber", "n0tnumb3er", "12faux", "-", "", "", ".", "..", "-"}, + }, + { + "2022-12-31 12:12:31 3022-12-31 12:12:31-Jul 1 00:21:28", + []string{"", "--", "::-"}, + }, + { + "2022/12/01 12:12:31 - 2022/13/32 12:12:31", + []string{"", "-", "//", "::"}, + }, + { + "UUIDS: 123e4567-e89b-12d3-a456-426614174000, 550E8400-E29B-41D4-A716-446655440000, -00000000-0000-0000-0000-000000000000, 12345678-dead-beef-1337-000000000000 {c6ad1a63-10b5-460e-ab2c-05c13604539d} ''''", + []string{"UUIDS:", ",", ",", "-,", "", "{}", "''<>''"}, + }, + // Mixed case UUID and hex strings are ignored, to limit false positives + { + "Not UUIDS: 123e4567-E89B-12d3-a456-426614174000, 1234567-dead-beef-1337-00000000000a", + []string{"Not", "UUIDS:", "123e4567-E89B-12d3-a456-,", "-dead-beef--"}, + }, + { + "Hexes: 0x0123456789 0xabcdef0123 deadbeef1337-ABCDEF0123456?0123456789ab:FFFFAAAAFFFF Curses: 0x012345678 dEaDbeef1337 abcdefabcde ABCDEFABCDE 0xASDFASDFASDF abcdef0123456NOT", + []string{"Hexes:", "", "", "-?:", "Curses:", "0x012345678", "dEaDbeef1337", "abcdefabcde", "ABCDEFABCDE", "0xASDFASDFASDF", "abcdef0123456NOT"}, + }, + { + "30546354_3313121680 0_123_456_789 foo_123", + []string{"_", "___", "foo_"}, + }, + { + `3.31ms/1h2m|-12h2m6.1s 31m "165m2.1s(6h0m12.05us)" -451325.31µs 6m23μs 123h21m3.4124561s/0s/-0.0123ms`, + []string{"/|", "", `"()"`, "", "", "//"}, + }, + { + // Invalid duration values + "3.31.1ms 3h121m3.4124561s 1h0.12s 100usa 0.12msa", + []string{".", "3h121m3.", "1h0.", "100usa", "0.12msa"}, + }, + { + "2Mib 0.12KB-5GB 3.12kb 123Gbps 124mbit:512Tbit", + []string{"", "-", "", "", ":"}, + }, + { + `status=123 status_code:500 status 200 status="-1" status_code:"404" httpStatus=200`, + []string{"status=123", "status_code:500", "status", "200", `status="-1"`, `status_code:"404"`, "httpStatus=200"}, + }, + { + `status_code_foo=123 status_code:500.1 status 2023-09-06T00:59:59.98 status:"404KiB"`, + []string{"status_code_foo=", "status_code:", "status", "", `status:""`}, + }, +} + +// TODO: add these to the benchmark tests +var tokenizationRealisticTestCases = []tokenizationTestCase{ + // logfmt from loki, metrics.go with a lot of number values of various types + { + `level=info ts=2023-09-06T00:59:59.982171323Z caller=metrics.go:160 component=frontend org_id=29 traceID=4b93729ff3efabd0 latency=fast query="{stream=\"stdout\",pod=\"loki-canary-nl54q\"} " query_hash=1280418884 query_type=limited range_type=range length=20s start_delta=2h54m30.690801022s end_delta=2h54m10.690801238s step=1s duration=13.926955ms status=200 limit=1000 returned_lines=0 throughput=16MB total_bytes=219kB total_bytes_non_indexed_labels=2.1kB lines_per_second=14935 total_lines=208 post_filter_lines=208 total_entries=41 store_chunks_download_time=1.592805ms queue_time=127µs splits=0 shards=0 chunk_refs_fetch_time=3.599883ms cache_chunk_req=1 cache_chunk_hit=1 cache_chunk_bytes_stored=0 cache_chunk_bytes_fetched=480079 cache_chunk_download_time=1.307396ms cache_index_req=0 cache_index_hit=0 cache_index_download_time=0s cache_stats_results_req=1 cache_stats_results_hit=1 cache_stats_results_download_time=361.913µs cache_result_req=0 cache_result_hit=0 cache_result_download_time=0s token_id=gcom-1234`, + + []string{ + "level=info", "ts=", "caller=metrics.go:", "component=frontend", "org_id=", "traceID=", "latency=fast", `query="{stream=\"stdout\",pod=\"loki-canary-nl54q\"} "`, "query_hash=", "query_type=limited", "range_type=range", "length=", "start_delta=", "end_delta=", "step=", 
"duration=", "status=200", "limit=", "returned_lines=", "throughput=", "total_bytes=", "total_bytes_non_indexed_labels=", "lines_per_second=", "total_lines=", "post_filter_lines=", "total_entries=", "store_chunks_download_time=", "queue_time=", "splits=", "shards=", "chunk_refs_fetch_time=", "cache_chunk_req=", "cache_chunk_hit=", "cache_chunk_bytes_stored=", "cache_chunk_bytes_fetched=", "cache_chunk_download_time=", "cache_index_req=", "cache_index_hit=", "cache_index_download_time=", "cache_stats_results_req=", "cache_stats_results_hit=", "cache_stats_results_download_time=", "cache_result_req=", "cache_result_hit=", "cache_result_download_time=", "token_id=gcom-", + }, + }, + // logfmt from loki, with string multi-word messages + { + `level=debug ts=2023-09-06T00:59:59.98214402Z caller=shard_resolver.go:114 bytes=205kB chunks=2 streams=2 entries=200 msg="queried index" type=single matchers="{stream=\"stdout\", pod=\"loki-canary-v75j4\"}" duration=9.498885ms from=2023-09-06T00:48:53.138Z through=2023-09-06T00:49:43.138Z length=50s`, + []string{ + "level=debug", "ts=", "caller=shard_resolver.go:", "bytes=", "chunks=", "streams=", "entries=", `msg="queried index"`, "type=single", `matchers="{stream=\"stdout\", pod=\"loki-canary-v75j4\"}"`, "duration=", "from=", "through=", "length=", + }, + }, + // random JSON logs + { + `{"timestamp": "2022-12-23T12:34:56Z", "level": "debug", "message": "Server starting", "server_id": "abcdefghij", "start_time": "2022-12-23T12:30:00Z"}`, + []string{`{"timestamp":`, `"",`, `"level":`, `"debug",`, `"message":`, `"Server starting",`, `"server_id":`, `"abcdefghij",`, `"start_time":`, `""}`}, + }, + { + // JSON logs without spaces between elements, like how JavaScript's JSON.Stringify() produces: + `{"context":{"taskId":1},"message":"starting task ID 1","sequence":0,"time":1506776210000,"version":"1.0.0"}`, + []string{`{"context":`, `{"taskId":`, `}`, `,"message":`, `"starting task ID "`, `,"sequence":`, ``, `,"time":`, ``, `,"version":`, `".."}`}, + }, + // Android logs from https://github.com/logpai/logparser/blob/main/data/loghub_2k/Android/Android_2k.log + { + // This test case has an unterminated double quote + // + // TOFIX: + // - timestamp is not correctly detected + // - empty "" token? 
+ `03-17 16:13:40.345 1702 14638 D PowerManagerService: release:lock=166121161, flg=0x0, tag="RILJ_ACK_WL", name=com.android.phone", ws=null, uid=1001, pid=2626`, + []string{"-", "::", "", "", "", "D", "PowerManagerService:", "release:lock=,", "flg=0x0,", `tag="RILJ_ACK_WL",`, `name=com.android.phone"`, ",", "ws=null,", "uid=,", "pid="}, + }, + { + // TOFIX: + // - timestamp is not correctly detected + // - empty "" tokens + `03-17 16:13:47.518 1702 8671 D ActivityManager: Skipping, withExcluded: false, tr.intent:Intent { act=android.intent.action.VIEW dat=file:///storage/emulated/0/Tencent/QQfile_recv/b.apk typ=application/vnd.android.package-archive flg=0x10800000 cmp=com.android.packageinstaller/.PackageInstallerActivity (has extras) }`, + []string{ + "-", "::", "", "", "", "", "D", "ActivityManager:", "Skipping,", "withExcluded:", "false,", "tr.intent:Intent", "{", "act=android.intent.action.VIEW", "dat=file:///storage/emulated//Tencent/QQfile_recv/b.apk", "typ=application/vnd.android.package-archive", "flg=0x10800000", "cmp=com.android.packageinstaller/.PackageInstallerActivity", "(has", "extras)", "}", + }, + }, + // Apache logs from https://github.com/logpai/logparser/blob/main/data/loghub_2k/Apache/Apache_2k.log + { + `[Mon Dec 05 13:16:27 2005] [notice] jk2_init() Found child 5877 in scoreboard slot 9`, + []string{"[]", "[notice]", "jk2_init()", "Found", "child", "", "in", "scoreboard", "slot", ""}, + }, + { + `[Mon Dec 05 19:14:11 2005] [notice] workerEnv.init() ok /etc/httpd/conf/workers2.properties`, + []string{"[]", "[notice]", "workerEnv.init()", "ok", "/etc/httpd/conf/workers2.properties"}, + }, + // nginx logs by running `docker run -p 80:80 -v $(pwd):/usr/share/nginx/html nginx` locally + { + `2024/03/27 14:31:42 [error] 29#29: *1 directory index of "/usr/share/nginx/html/" is forbidden, client: 172.17.0.1, server: localhost, request: "GET / HTTP/1.1", host: "127.0.0.1"`, + []string{"", "[error]", "#:", "*", "directory", "index", "of", "\"/usr/share/nginx/html/\"", "is", "forbidden,", "client:", ",", "server:", "localhost,", "request:", `"GET / HTTP/",`, "host:", "\"\""}, + }, + { + // TOFIX: + // - probably not all numbers should be replaced with , e.g. 
for "*1", "(2:", "HTTP/1.1" it's definitely a worse UX + `2024/03/27 14:34:37 [error] 29#29: *1 open() "/usr/share/nginx/html/test url with spaces" failed (2: No such file or directory), client: 172.17.0.1, server: localhost, request: "GET /test%20url%20with%20spaces HTTP/1.1", host: "127.0.0.1" + 172.17.0.1 - - [31/Mar/2024:14:34:37 +0000] "GET /test%20url%20with%20spaces HTTP/1.1" 404 153 "-" "Mozilla/5.0 (X11; Linux x86_64; rv:123.0) Gecko/20100101 Firefox/123.0" "-"`, + []string{"", "[error]", "#:", "*", "open()", `"/usr/share/nginx/html/test url with spaces"`, "failed", "(:", "No", "such", "file", "or", "directory),", "client:", ",", "server:", "localhost,", "request:", `"GET /test%20url%20with%20spaces HTTP/",`, "host:", "\"\"", "", "", "", "-", "-", "[]", `"GET /test%20url%20with%20spaces HTTP/"`, "", "", "\"-\"", `"Mozilla/ (X11; Linux x86_; rv:) Gecko/ Firefox/"`, `"-"`}, + }, + // Linux systemd (journalctl) logs + { + `Mar 27 11:52:21 hostname systemd-networkd[2043]: enp6s0: LLDP Rx: Invoking callback for 'refreshed' event.`, + []string{"", "hostname", "systemd-networkd[]:", "enp6s0:", "LLDP", "Rx:", "Invoking", "callback", "for", "'refreshed'", "event."}, + }, + { + `Feb 29 23:00:14 nixos dbus-daemon[11432]: [system] Activating via systemd: service name='org.opensuse.Snapper' unit='snapperd.service' requested by ':1.324' (uid=0 pid=22089 comm="/nix/store/7rgimysvkczzyiaq4fkfymyjad4vbd9c-snappe" label="kernel")`, + []string{"", "nixos", "dbus-daemon[]:", "[system]", "Activating", "via", "systemd:", "service", "name='org.opensuse.Snapper'", "unit='snapperd.service'", "requested", "by", "':'", "(uid=", "pid=", "comm=\"/nix/store/7rgimysvkczzyiaq4fkfymyjad4vbd9c-snappe\"", "label=\"kernel\")"}, + }, + // Random slack logs: + { + `Apr-10 23:43:46.807 [API-Q] (T02S4RCS0) c37dfd20-1712781826.804 conversations.suggestions is ACTIVE`, + []string{"", "[API-Q]", "(T02S4RCS0)", "c37dfd20-", "conversations.suggestions", "is", "ACTIVE"}, + }, + { + `Apr-11 00:01:57.743 [DEVICE-PERMISSIONS-MA] Permissions saved to local storage: {"permissions":{"microphone":"granted","camera":"prompt","screen":"prompt"},"timestamp":1712782917742}`, + []string{"", "[DEVICE-PERMISSIONS-MA]", "Permissions", "saved", "to", "local", "storage:", `{"permissions":`, `{"microphone":`, `"granted"`, `,"camera":`, `"prompt"`, `,"screen":`, `"prompt"}`, `,"timestamp":`, `}`}, + }, + // Another weird log from loki: + { + `ts=2023-09-06T00:59:59.900879737Z caller=spanlogger.go:86 user=29 level=debug msg="querying ingester" params="selector={stream=\"stdout\", pod=\"loki-canary-t98wq\"}, direction=BACKWARD, start=2023-09-05 23:20:28.030285153 +0000 UTC, end=2023-09-05 23:20:48.030285153 +0000 UTC, limit=1000, shards="`, + []string{"ts=", "caller=spanlogger.go:", "user=", "level=debug", `msg="querying ingester"`, `params="selector={stream=\"stdout\", pod=\"loki-canary-t98wq\"}, direction=BACKWARD, start=, end=, limit=, shards="`}, + }, + // {``, []string{}}, +} + +func getTimestampTests(t *testing.T) []tokenizationTestCase { + timeStamps := []string{ + "2023-01-03T15:04:05.999999999Z", + "2024-02-29T11:12:03.33+02:00", + "2022-03-02T23:59:59Z", + "2024-04-01T01:33:59+03:00", + "2021-05-07T00:00:00-04:30", + "1999-06-12T13:37:31.123+03:00", + "2005-07-17T12:00:00Z", + "1988-08-03T00:00:00Z", + "2020-09-30T00:00:59.9999+03:00", + "2034-10-31T12:34:56.7890+11:23", + "2034-11-01T01:23:45.67890Z", + "2025-12-31T01:23:45.67890Z", + } + + timestampConst := string(placeholderTimestamp) + layouts := []struct { + layout string + 
expResult []string + }{ + // TOFIX: all of the commented out lines. + + //{time.Layout, []string{timestampConst}}, + {time.ANSIC, []string{timestampConst}}, + {time.UnixDate, []string{timestampConst}}, + {"Mon _2 Jan 15:04:05 MST 2006", []string{timestampConst}}, // linux `date` + {time.RubyDate, []string{timestampConst}}, + {time.RFC822, []string{timestampConst}}, + {time.RFC822Z, []string{timestampConst}}, + {time.RFC850, []string{timestampConst}}, + {time.RFC1123, []string{timestampConst}}, + {time.RFC1123Z, []string{timestampConst}}, + {time.RFC3339, []string{timestampConst}}, + {time.RFC3339Nano, []string{timestampConst}}, + //{time.Stamp, []string{timestampConst}}, + //{time.StampMilli, []string{timestampConst}}, + //{time.StampMicro, []string{timestampConst}}, + //{time.StampNano, []string{timestampConst}}, + {time.DateTime, []string{timestampConst}}, + + // TOFIX: maybe consider these timestamps? + {time.DateOnly, []string{"--"}}, + {time.TimeOnly, []string{"::"}}, + } + + result := make([]tokenizationTestCase, 0, len(timeStamps)*len(layouts)) + for _, tss := range timeStamps { + ts, err := time.Parse(time.RFC3339Nano, tss) + require.NoError(t, err) + for _, layout := range layouts { + result = append(result, tokenizationTestCase{ + line: ts.Format(layout.layout), + expResult: layout.expResult, + }) + } + } + return result +} + +type tokenizationTestCasePack struct { + name string + cases []tokenizationTestCase +} + +func TestTokenization(t *testing.T) { + packs := []tokenizationTestCasePack{ + {"cornerTestCases", tokenizationCornerTestCases}, + {"timestamps", getTimestampTests(t)}, + {"realisticTestCases", tokenizationRealisticTestCases}, + } + + for i, pack := range packs { + pack := pack + t.Run(fmt.Sprintf("test-pack-%d-%s", i, pack.name), func(t *testing.T) { + for j, tc := range pack.cases { + tc := tc + t.Run(fmt.Sprintf("case-%d", j), func(t *testing.T) { + result := PreprocessAndTokenize([]byte(tc.line)) + assert.Equal( + t, tc.expResult, result, + fmt.Sprintf("Log line: %q\nActual result slice: %#v", tc.line, result), + ) + }) + } + }) + } +} + +func TestTokenizationMemcpy(t *testing.T) { + buf := make([]byte, 0, 1000) + for i := 0; i < 300; i++ { + buf = append(buf, 'a', ' ') + } + tokenizer := tokenizer{rawLine: buf, maxTokens: 10} + tokenizer.tokenize() + require.Less(t, len(tokenizer.line), 100) +} + +// Useful for running single test cases in isolation +func TestTokenizationPlayground(t *testing.T) { + tc := tokenizationTestCase{ + "foo 121113.21231 bar 123.0.1.123 -123 -1231.11", + []string{"foo", "", "bar", "", "", ""}, + } + result := PreprocessAndTokenize([]byte(tc.line)) + assert.Equal( + t, tc.expResult, result, + fmt.Sprintf("Log line: %q\nActual result slice: %#v", tc.line, result), + ) +} + +func TestReplacementPlayground(t *testing.T) { + line := "Jul 1 00:21:28" + exp := "" + r := replacer{source: []byte(line)} + r.replaceWithPlaceholders() + + assert.Equal( + t, exp, string(r.dest), + fmt.Sprintf("Log line: %q", line), + ) +}
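A minimal usage sketch of the API this PR adds, assuming the placeholder constants defined in replacer.go and an import path matching the package's in-tree location (the sample line and the predicted output are illustrative, not taken from the test suite):

```go
package main

import (
	"fmt"

	"github.com/grafana/loki/pkg/pattern/tokenization"
)

func main() {
	line := []byte(`level=info status=200 duration=13.9ms peer=10.0.0.1 trace=4b93729ff3efabd0`)

	// Preprocess replaces volatile values (numbers, durations, IPs, hex IDs,
	// timestamps, etc.) with placeholders; PreprocessAndTokenize additionally
	// splits the result into whitespace-delimited tokens.
	tokens := tokenization.PreprocessAndTokenize(line)
	fmt.Printf("%q\n", tokens)
	// Roughly: ["level=info" "status=200" "duration=<DURATION>" "peer=<IP>" "trace=<HEX>"]
	// ("status=200" survives because "status" is a number-preserving key.)
}
```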