From cb88fc111d33730eec6d76db43dd4c7953e67ff8 Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Thu, 12 Oct 2017 15:13:32 +0200 Subject: [PATCH] Allow Uid#decodeId to decode from a byte array slice Today we only allow to decode byte arrays where the data has a 0 offset and the same length as the array. Allowing to decode stuff from a slice will make decoding IDs cheaper if the ID is for instance coming from a term dictionary or BytesRef. Relates to #26931 --- .../org/elasticsearch/index/mapper/Uid.java | 96 ++++++++++--------- .../elasticsearch/index/mapper/UidTests.java | 23 ++++- 2 files changed, 72 insertions(+), 47 deletions(-) diff --git a/core/src/main/java/org/elasticsearch/index/mapper/Uid.java b/core/src/main/java/org/elasticsearch/index/mapper/Uid.java index 4b5ed5fd3cd92..1d5293259317f 100644 --- a/core/src/main/java/org/elasticsearch/index/mapper/Uid.java +++ b/core/src/main/java/org/elasticsearch/index/mapper/Uid.java @@ -135,36 +135,36 @@ static boolean isURLBase64WithoutPadding(String id) { // 'xxx=' and 'xxx' could be considered the same id final int length = id.length(); switch (length & 0x03) { - case 0: - break; - case 1: - return false; - case 2: - // the last 2 symbols (12 bits) are encoding 1 byte (8 bits) - // so the last symbol only actually uses 8-6=2 bits and can only take 4 values - char last = id.charAt(length - 1); - if (last != 'A' && last != 'Q' && last != 'g' && last != 'w') { + case 0: + break; + case 1: return false; - } - break; - case 3: - // The last 3 symbols (18 bits) are encoding 2 bytes (16 bits) - // so the last symbol only actually uses 16-12=4 bits and can only take 16 values - last = id.charAt(length - 1); - if (last != 'A' && last != 'E' && last != 'I' && last != 'M' && last != 'Q'&& last != 'U'&& last != 'Y' + case 2: + // the last 2 symbols (12 bits) are encoding 1 byte (8 bits) + // so the last symbol only actually uses 8-6=2 bits and can only take 4 values + char last = id.charAt(length - 1); + if (last != 
'A' && last != 'Q' && last != 'g' && last != 'w') { + return false; + } + break; + case 3: + // The last 3 symbols (18 bits) are encoding 2 bytes (16 bits) + // so the last symbol only actually uses 16-12=4 bits and can only take 16 values + last = id.charAt(length - 1); + if (last != 'A' && last != 'E' && last != 'I' && last != 'M' && last != 'Q'&& last != 'U'&& last != 'Y' && last != 'c'&& last != 'g'&& last != 'k' && last != 'o' && last != 's' && last != 'w' && last != '0' && last != '4' && last != '8') { - return false; - } - break; - default: - // number & 0x03 is always in [0,3] - throw new AssertionError("Impossible case"); + return false; + } + break; + default: + // number & 0x03 is always in [0,3] + throw new AssertionError("Impossible case"); } for (int i = 0; i < length; ++i) { final char c = id.charAt(i); final boolean allowed = - (c >= '0' && c <= '9') || + (c >= '0' && c <= '9') || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || c == '-' || c == '_'; @@ -244,16 +244,16 @@ public static BytesRef encodeId(String id) { } } - private static String decodeNumericId(byte[] idBytes) { - assert Byte.toUnsignedInt(idBytes[0]) == NUMERIC; - int length = (idBytes.length - 1) * 2; + private static String decodeNumericId(byte[] idBytes, int offset, int len) { + assert Byte.toUnsignedInt(idBytes[offset]) == NUMERIC; + int length = (len - 1) * 2; char[] chars = new char[length]; - for (int i = 1; i < idBytes.length; ++i) { - final int b = Byte.toUnsignedInt(idBytes[i]); + for (int i = 1; i < len; ++i) { + final int b = Byte.toUnsignedInt(idBytes[offset + i]); final int b1 = (b >>> 4); final int b2 = b & 0x0f; chars[(i - 1) * 2] = (char) (b1 + '0'); - if (i == idBytes.length - 1 && b2 == 0x0f) { + if (i == len - 1 && b2 == 0x0f) { length--; break; } @@ -262,15 +262,17 @@ private static String decodeNumericId(byte[] idBytes) { return new String(chars, 0, length); } - private static String decodeUtf8Id(byte[] idBytes) { - assert Byte.toUnsignedInt(idBytes[0]) == 
UTF8; - return new BytesRef(idBytes, 1, idBytes.length - 1).utf8ToString(); + private static String decodeUtf8Id(byte[] idBytes, int offset, int length) { + assert Byte.toUnsignedInt(idBytes[offset]) == UTF8; + return new BytesRef(idBytes, offset + 1, length - 1).utf8ToString(); } - private static String decodeBase64Id(byte[] idBytes) { - assert Byte.toUnsignedInt(idBytes[0]) <= BASE64_ESCAPE; - if (Byte.toUnsignedInt(idBytes[0]) == BASE64_ESCAPE) { - idBytes = Arrays.copyOfRange(idBytes, 1, idBytes.length); + private static String decodeBase64Id(byte[] idBytes, int offset, int length) { + assert Byte.toUnsignedInt(idBytes[offset]) <= BASE64_ESCAPE; + if (Byte.toUnsignedInt(idBytes[offset]) == BASE64_ESCAPE) { + idBytes = Arrays.copyOfRange(idBytes, offset + 1, offset + length); + } else if ((idBytes.length == length && offset == 0) == false) { // no need to copy if it's not a slice + idBytes = Arrays.copyOfRange(idBytes, offset, offset + length); } return Base64.getUrlEncoder().withoutPadding().encodeToString(idBytes); } @@ -278,17 +280,23 @@ private static String decodeBase64Id(byte[] idBytes) { /** Decode an indexed id back to its original form. * @see #encodeId */ public static String decodeId(byte[] idBytes) { - if (idBytes.length == 0) { + return decodeId(idBytes, 0, idBytes.length); + } + + /** Decode an indexed id back to its original form. 
+ * @see #encodeId */ + public static String decodeId(byte[] idBytes, int offset, int length) { + if (length == 0) { throw new IllegalArgumentException("Ids can't be empty"); } - final int magicChar = Byte.toUnsignedInt(idBytes[0]); + final int magicChar = Byte.toUnsignedInt(idBytes[offset]); switch (magicChar) { - case NUMERIC: - return decodeNumericId(idBytes); - case UTF8: - return decodeUtf8Id(idBytes); - default: - return decodeBase64Id(idBytes); + case NUMERIC: + return decodeNumericId(idBytes, offset, length); + case UTF8: + return decodeUtf8Id(idBytes, offset, length); + default: + return decodeBase64Id(idBytes, offset, length); } } } diff --git a/core/src/test/java/org/elasticsearch/index/mapper/UidTests.java b/core/src/test/java/org/elasticsearch/index/mapper/UidTests.java index 10b475e57ff87..c4fb94abd3846 100644 --- a/core/src/test/java/org/elasticsearch/index/mapper/UidTests.java +++ b/core/src/test/java/org/elasticsearch/index/mapper/UidTests.java @@ -79,7 +79,7 @@ public void testEncodeUTF8Ids() { for (int iter = 0; iter < iters; ++iter) { final String id = TestUtil.randomRealisticUnicodeString(random(), 1, 10); BytesRef encoded = Uid.encodeId(id); - assertEquals(id, Uid.decodeId(Arrays.copyOfRange(encoded.bytes, encoded.offset, encoded.offset + encoded.length))); + assertEquals(id, doDecodeId(encoded)); assertTrue(encoded.length <= 1 + new BytesRef(id).length); } } @@ -93,7 +93,7 @@ public void testEncodeNumericIds() { id = "0" + id; } BytesRef encoded = Uid.encodeId(id); - assertEquals(id, Uid.decodeId(Arrays.copyOfRange(encoded.bytes, encoded.offset, encoded.offset + encoded.length))); + assertEquals(id, doDecodeId(encoded)); assertEquals(1 + (id.length() + 1) / 2, encoded.length); } } @@ -105,9 +105,26 @@ public void testEncodeBase64Ids() { random().nextBytes(binaryId); final String id = Base64.getUrlEncoder().withoutPadding().encodeToString(binaryId); BytesRef encoded = Uid.encodeId(id); - assertEquals(id, 
Uid.decodeId(Arrays.copyOfRange(encoded.bytes, encoded.offset, encoded.offset + encoded.length))); + assertEquals(id, doDecodeId(encoded)); assertTrue(encoded.length <= 1 + binaryId.length); } } + private static String doDecodeId(BytesRef encoded) { + + if (randomBoolean()) { + return Uid.decodeId(Arrays.copyOfRange(encoded.bytes, encoded.offset, encoded.offset + encoded.length)); + } else { + if (randomBoolean()) { + BytesRef slicedCopy = new BytesRef(randomIntBetween(encoded.length + 1, encoded.length + 100)); + slicedCopy.offset = randomIntBetween(1, slicedCopy.bytes.length - encoded.length); + slicedCopy.length = encoded.length; + System.arraycopy(encoded.bytes, encoded.offset, slicedCopy.bytes, slicedCopy.offset, encoded.length); + assertArrayEquals(Arrays.copyOfRange(encoded.bytes, encoded.offset, encoded.offset + encoded.length), + Arrays.copyOfRange(slicedCopy.bytes, slicedCopy.offset, slicedCopy.offset + slicedCopy.length)); + encoded = slicedCopy; + } + return Uid.decodeId(encoded.bytes, encoded.offset, encoded.length); + } + } }