-
Notifications
You must be signed in to change notification settings - Fork 24.9k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[ML] Refactor delimited file structure detection (#33233)
1. Use the term "delimited" rather than "separated values" 2. Use a single factory class with arguments to specify the delimiter and identification constraints This change makes it easier to add support for other delimiter characters.
- Loading branch information
1 parent
6e1354c
commit e5eddc2
Showing
24 changed files
with
278 additions
and
430 deletions.
There are no files selected for viewing
35 changes: 0 additions & 35 deletions
35
...main/java/org/elasticsearch/xpack/ml/logstructurefinder/CsvLogStructureFinderFactory.java
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
57 changes: 57 additions & 0 deletions
57
...ava/org/elasticsearch/xpack/ml/logstructurefinder/DelimitedLogStructureFinderFactory.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
/* | ||
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one | ||
* or more contributor license agreements. Licensed under the Elastic License; | ||
* you may not use this file except in compliance with the Elastic License. | ||
*/ | ||
package org.elasticsearch.xpack.ml.logstructurefinder; | ||
|
||
import org.supercsv.prefs.CsvPreference; | ||
|
||
import java.io.IOException; | ||
import java.util.List; | ||
import java.util.Locale; | ||
|
||
public class DelimitedLogStructureFinderFactory implements LogStructureFinderFactory { | ||
|
||
private final CsvPreference csvPreference; | ||
private final int minFieldsPerRow; | ||
private final boolean trimFields; | ||
|
||
DelimitedLogStructureFinderFactory(char delimiter, int minFieldsPerRow, boolean trimFields) { | ||
csvPreference = new CsvPreference.Builder('"', delimiter, "\n").build(); | ||
this.minFieldsPerRow = minFieldsPerRow; | ||
this.trimFields = trimFields; | ||
} | ||
|
||
/** | ||
* Rules are: | ||
* - It must contain at least two complete records | ||
* - There must be a minimum number of fields per record (otherwise files with no commas could be treated as CSV!) | ||
* - Every record except the last must have the same number of fields | ||
* The reason the last record is allowed to have fewer fields than the others is that | ||
* it could have been truncated when the file was sampled. | ||
*/ | ||
@Override | ||
public boolean canCreateFromSample(List<String> explanation, String sample) { | ||
String formatName; | ||
switch ((char) csvPreference.getDelimiterChar()) { | ||
case ',': | ||
formatName = "CSV"; | ||
break; | ||
case '\t': | ||
formatName = "TSV"; | ||
break; | ||
default: | ||
formatName = Character.getName(csvPreference.getDelimiterChar()).toLowerCase(Locale.ROOT) + " delimited values"; | ||
break; | ||
} | ||
return DelimitedLogStructureFinder.canCreateFromSample(explanation, sample, minFieldsPerRow, csvPreference, formatName); | ||
} | ||
|
||
@Override | ||
public LogStructureFinder createFromSample(List<String> explanation, String sample, String charsetName, Boolean hasByteOrderMarker) | ||
throws IOException { | ||
return DelimitedLogStructureFinder.makeDelimitedLogStructureFinder(explanation, sample, charsetName, hasByteOrderMarker, | ||
csvPreference, trimFields); | ||
} | ||
} |
Oops, something went wrong.