Skip to content

Commit

Permalink
Ensure s3 chunk read accounts for utf8 length (#296)
Browse files Browse the repository at this point in the history
The s3 read is chunked by a fixed number of bytes. Since utf8 characters have variable byte lengths, a chunk boundary can fall in the middle of a character, leaving a chunk that begins or ends with an incomplete utf8 sequence. The issue is resolved by always splitting each s3 chunk into a valid part and an incomplete trailing part, and then merging the trailing part into the next chunk.
  • Loading branch information
czgu authored Oct 26, 2020
1 parent 1ea2737 commit c043438
Show file tree
Hide file tree
Showing 3 changed files with 138 additions and 1 deletion.
7 changes: 6 additions & 1 deletion datahub/server/clients/s3_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
import botocore

from env import DataHubSettings
from lib.utils.utf8 import split_by_last_invalid_utf8_char

from .common import ChunkReader, FileDoesNotExist


Expand Down Expand Up @@ -100,6 +102,7 @@ def __init__(
):
self._bucket_name = bucket_name
self._key = key
self._left_over_bytes = b""

super(S3FileReader, self).__init__(read_size, max_read_size)

Expand All @@ -117,4 +120,6 @@ def __init__(
raise e

def read(self):
    """Read the next chunk from the S3 object body and decode it as utf8.

    S3 returns a fixed number of bytes per read, which can split a
    multi-byte utf8 character across chunk boundaries.  Any incomplete
    trailing character is held back in ``self._left_over_bytes`` and
    prepended to the next chunk, so every decode only ever sees whole
    characters.

    Returns:
        str: the decoded chunk (empty string at end of stream).
    """
    raw = self._left_over_bytes + self._body.read(self._read_size)
    # Split off any truncated final character so decode cannot fail.
    valid_raw, self._left_over_bytes = split_by_last_invalid_utf8_char(raw)
    return valid_raw.decode("utf-8")
82 changes: 82 additions & 0 deletions datahub/server/lib/utils/utf8.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
from typing import Tuple


def is_start_byte(b: int) -> bool:
    """Return True when byte *b* can begin a utf8-encoded character.

    In utf8, continuation bytes always have the form ``10xxxxxx``; any
    other high-bit pattern marks the first byte of a 1-4 byte character.
    See https://en.wikipedia.org/wiki/UTF-8
    for encoding details

    Args:
        b (int): a utf8 byte
    Returns:
        bool: whether or not b is a valid starting byte
    """
    # Continuation (non-start) bytes have their top two bits set to 10.
    return (b & 0b11000000) != 0b10000000


CHAR_POS_TO_STR_LENGTH = {7: 1, 5: 2, 4: 3, 3: 4}


def is_bytes_valid_utf8_char(bs: bytes) -> bool:
    """Check if bs encodes exactly one utf8 character.

    Validates the structural encoding only: the leading-byte pattern and
    the matching number of continuation bytes.  It does not reject
    overlong encodings or surrogate code points.

    Args:
        bs (bytes): The bytes string
    Returns:
        bool: Whether or not bs represents 1 utf8 char
    """
    # Guard: the empty byte string encodes no character at all
    # (previously this raised IndexError on bs[0]).
    if not bs:
        return False

    if not is_start_byte(bs[0]):
        return False

    first_zero_char_pos = None
    # Four possible start-byte variations:
    # 0xxxxxxx, 110xxxxx, 1110xxxx, 11110xxx — find the first zero bit
    # to determine the declared character length.
    for i in range(7, 2, -1):
        if (bs[0] & (1 << i)) == 0:
            first_zero_char_pos = i
            break

    # No zero bit in positions 7..3 (e.g. 11111xxx) is invalid -> length 0.
    valid_len = CHAR_POS_TO_STR_LENGTH.get(first_zero_char_pos, 0)
    if len(bs) != valid_len:
        return False

    # Every byte after the first must be a continuation (10xxxxxx) byte.
    if any(is_start_byte(bs[i]) for i in range(1, len(bs))):
        return False

    return True


def split_by_last_invalid_utf8_char(binary_s: bytes) -> Tuple[bytes, bytes]:
    """Split *binary_s* so its first part is safe to decode as utf8.

    Given a utf8 byte string whose final character may have been
    truncated (e.g. by a fixed-size chunked read), break it into a
    decodable prefix and the (possibly empty) bytes of the incomplete
    trailing character, so the tail can be carried over to the next
    chunk.

    Args:
        binary_s (bytes): utf8 encoded bytes string which the last char
            might not be valid
    Returns:
        Tuple[bytes, bytes]: the valid prefix and the invalid suffix.
        NOTE(review): the annotation says Tuple but the implementation
        returns a 2-element list; both unpack identically — confirm
        before tightening the annotation.
    """
    # Scan backwards for the start byte of the final character.
    idx = len(binary_s) - 1
    while idx >= 0 and not is_start_byte(binary_s[idx]):
        idx -= 1

    if idx < 0:
        # No start byte anywhere: the whole string is continuation bytes.
        return [b"", binary_s]

    tail = binary_s[idx:]
    if is_bytes_valid_utf8_char(tail):
        # The final character is complete, so the whole string is valid.
        return [binary_s, b""]

    return [binary_s[:idx], tail]
50 changes: 50 additions & 0 deletions datahub/tests/test_lib/test_utils/test_utf8.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# import pytest
from lib.utils.utf8 import (
is_start_byte,
is_bytes_valid_utf8_char,
split_by_last_invalid_utf8_char,
)


def test_is_start_byte():
    # "哦" and "한" encode to 3 bytes each, "¢" to 2; only the first
    # byte of each character is a start byte.
    expected = [True, False, False, True, False, False, True, False]
    assert [is_start_byte(b) for b in "哦한¢".encode("utf-8")] == expected


def test_is_bytes_valid_utf8_char():
    # Single characters covering 3-, 1- and 4-byte utf8 encodings.
    for char in ["哦", "한", "a", "𐍈"]:
        assert is_bytes_valid_utf8_char(char.encode("utf-8"))

    # Two characters is not a single char.
    assert not is_bytes_valid_utf8_char("哦한".encode("utf-8"))
    # Wrong continuation-byte count: too few, then too many.
    assert not is_bytes_valid_utf8_char(b"\xe5\x93")
    assert not is_bytes_valid_utf8_char(b"\xe5\x93\xa6\xa6")


def test_split_by_last_invalid_utf8_char():
    # Pure ascii is already valid: nothing is carried over.
    assert split_by_last_invalid_utf8_char(b"hellow") == [b"hellow", b""]

    # "你好" encodes to b"\xe4\xbd\xa0\xe5\xa5\xbd"
    full = "你好".encode("utf-8")
    assert split_by_last_invalid_utf8_char(full) == [full, b""]
    # Dropping 1 byte leaves a 2-byte incomplete suffix.
    assert split_by_last_invalid_utf8_char(full[:-1]) == [
        b"\xe4\xbd\xa0",
        b"\xe5\xa5",
    ]
    # Dropping 2 bytes leaves a 1-byte incomplete suffix.
    assert split_by_last_invalid_utf8_char(full[:-2]) == [
        b"\xe4\xbd\xa0",
        b"\xe5",
    ]

0 comments on commit c043438

Please sign in to comment.