Skip to content

Commit

Permalink
[ML] Minor improvements to categorization Grok pattern creation (#33353)
Browse files Browse the repository at this point in the history
1. The TOMCAT_DATESTAMP format needs to be checked before
   TIMESTAMP_ISO8601, otherwise TIMESTAMP_ISO8601 will
   match the start of the Tomcat datestamp.
2. Exclude more characters before and after numbers.  For
   example, in 1.2.3 we don't want to match 1.2 as a float.
  • Loading branch information
droberts195 committed Sep 4, 2018
1 parent 4a62711 commit 10a2ffb
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 9 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -25,14 +25,15 @@
*/
public final class GrokPatternCreator {

private static String PREFACE = "preface";
private static String EPILOGUE = "epilogue";
private static final String PREFACE = "preface";
private static final String EPILOGUE = "epilogue";

/**
* The first match in this list will be chosen, so it needs to be ordered
* such that more generic patterns come after more specific patterns.
*/
private static final List<GrokPatternCandidate> ORDERED_CANDIDATE_GROK_PATTERNS = Arrays.asList(
new GrokPatternCandidate("TOMCAT_DATESTAMP", "timestamp"),
new GrokPatternCandidate("TIMESTAMP_ISO8601", "timestamp"),
new GrokPatternCandidate("DATESTAMP_RFC822", "timestamp"),
new GrokPatternCandidate("DATESTAMP_RFC2822", "timestamp"),
Expand All @@ -41,7 +42,6 @@ public final class GrokPatternCreator {
new GrokPatternCandidate("SYSLOGTIMESTAMP", "timestamp"),
new GrokPatternCandidate("HTTPDATE", "timestamp"),
new GrokPatternCandidate("CATALINA_DATESTAMP", "timestamp"),
new GrokPatternCandidate("TOMCAT_DATESTAMP", "timestamp"),
new GrokPatternCandidate("CISCOTIMESTAMP", "timestamp"),
new GrokPatternCandidate("DATE", "date"),
new GrokPatternCandidate("TIME", "time"),
Expand All @@ -56,12 +56,10 @@ public final class GrokPatternCreator {
new GrokPatternCandidate("IP", "ipaddress"),
// This already includes pre/post break conditions
new GrokPatternCandidate("QUOTEDSTRING", "field", "", ""),
// Can't use \b as the break before, because it doesn't work for negative numbers (the
// minus sign is not a "word" character)
new GrokPatternCandidate("NUMBER", "field", "(?<!\\w)"),
// Disallow +, - and . before hex numbers, otherwise this pattern will pick up base 10
// numbers that NUMBER rejected due to preceding characters
new GrokPatternCandidate("BASE16NUM", "field", "(?<![\\w.+-])")
// Disallow +, - and . before numbers, as well as "word" characters, otherwise we'll pick
// up numeric suffixes too eagerly
new GrokPatternCandidate("NUMBER", "field", "(?<![\\w.+-])", "(?![\\w+-]|\\.\\d)"),
new GrokPatternCandidate("BASE16NUM", "field", "(?<![\\w.+-])", "(?![\\w+-]|\\.\\w)")
// TODO: also unfortunately can't have USERNAME in the list as it matches too broadly
// Fixing these problems with overly broad matches would require some extra intelligence
// to be added to remove inappropriate matches. One idea would be to use a dictionary,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,40 @@ public void testAppendBestGrokMatchForStringsGivenTimestampsAndLogLevels() {
assertEquals(".+?%{TIMESTAMP_ISO8601:timestamp}.+?%{LOGLEVEL:loglevel}.+?", overallGrokPatternBuilder.toString());
}

public void testAppendBestGrokMatchForStringsGivenTomcatDatestamps() {

    // Every sample begins with a Tomcat-style datestamp whose leading portion
    // also looks like an ISO8601 timestamp, so the expected pattern below is
    // only produced when TOMCAT_DATESTAMP is preferred over TIMESTAMP_ISO8601
    final Collection<String> samples = Arrays.asList(
        "2018-09-03 17:03:28,269 +0100 | ERROR | ",
        "2018-09-03 17:04:27,279 +0100 | DEBUG | ",
        "2018-09-03 17:05:26,289 +0100 | ERROR | ");

    final StringBuilder grokPattern = new StringBuilder();
    final Map<String, Integer> fieldNameCounts = new HashMap<>();

    GrokPatternCreator.appendBestGrokMatchForStrings(fieldNameCounts, grokPattern, false, false, samples);

    final String expectedPattern = ".*?%{TOMCAT_DATESTAMP:timestamp}.+?%{LOGLEVEL:loglevel}.+?";
    assertEquals(expectedPattern, grokPattern.toString());
}

public void testAppendBestGrokMatchForStringsGivenTrappyFloatCandidates() {

    // A careless number match would grab the leading part of each sample as a
    // float (1.2 out of 1.2.3, for instance); the dot and digit that follow make
    // that inappropriate, so the expected result contains no extracted field
    final Collection<String> samples = Arrays.asList(
        "1.2.3",
        "-2.3.4",
        "4.5.6.7",
        "-9.8.7.6.5");

    final StringBuilder grokPattern = new StringBuilder();
    final Map<String, Integer> fieldNameCounts = new HashMap<>();

    GrokPatternCreator.appendBestGrokMatchForStrings(fieldNameCounts, grokPattern, false, false, samples);

    final String expectedPattern = ".+?";
    assertEquals(expectedPattern, grokPattern.toString());
}

public void testAppendBestGrokMatchForStringsGivenNumbersInBrackets() {

Collection<String> mustMatchStrings = Arrays.asList("(-2)",
Expand Down

0 comments on commit 10a2ffb

Please sign in to comment.