diff --git a/x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/LogStructureFinderManager.java b/x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/LogStructureFinderManager.java index 7f18445e505e3..a8fd9d7eb895b 100644 --- a/x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/LogStructureFinderManager.java +++ b/x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/LogStructureFinderManager.java @@ -163,9 +163,15 @@ CharsetMatch findCharset(List explanation, InputStream inputStream) thro // deduction algorithms on binary files is very slow as the binary files generally appear to // have very long lines. boolean spaceEncodingContainsZeroByte = false; - byte[] spaceBytes = " ".getBytes(name); - for (int i = 0; i < spaceBytes.length && spaceEncodingContainsZeroByte == false; ++i) { - spaceEncodingContainsZeroByte = (spaceBytes[i] == 0); + Charset charset = Charset.forName(name); + // Some character sets cannot be encoded. These are extremely rare so it's likely that + // they've been chosen based on incorrectly provided binary data. Therefore, err on + // the side of rejecting binary data. + if (charset.canEncode()) { + byte[] spaceBytes = " ".getBytes(charset); + for (int i = 0; i < spaceBytes.length && spaceEncodingContainsZeroByte == false; ++i) { + spaceEncodingContainsZeroByte = (spaceBytes[i] == 0); + } } if (containsZeroBytes && spaceEncodingContainsZeroByte == false) { explanation.add("Character encoding [" + name + "] matched the input with [" + charsetMatch.getConfidence() +