Skip to content

Commit

Permalink
Ensure s3 chunk read accounts for utf8 length (#296)
Browse files Browse the repository at this point in the history
The s3 read is chunked by a fixed number of bytes. Since utf8 characters have variable byte lengths, a chunk boundary can fall in the middle of a character, leaving a chunk that begins or ends with an incomplete utf8 sequence. The issue is resolved by always splitting each s3 chunk into a valid part and an incomplete trailing part, and then merging the trailing part into the next chunk.
  • Loading branch information
czgu authored Oct 26, 2020
1 parent 1ea2737 commit c043438
Show file tree
Hide file tree
Showing 3 changed files with 138 additions and 1 deletion.
7 changes: 6 additions & 1 deletion datahub/server/clients/s3_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
import botocore

from env import DataHubSettings
from lib.utils.utf8 import split_by_last_invalid_utf8_char

from .common import ChunkReader, FileDoesNotExist


Expand Down Expand Up @@ -100,6 +102,7 @@ def __init__(
):
self._bucket_name = bucket_name
self._key = key
self._left_over_bytes = b""

super(S3FileReader, self).__init__(read_size, max_read_size)

Expand All @@ -117,4 +120,6 @@ def __init__(
raise e

def read(self):
    """Read the next chunk from the S3 object body and decode it as utf8.

    S3 returns a fixed number of bytes per read, which can split a
    multi-byte utf8 character across chunk boundaries.  Any incomplete
    trailing character is held back in ``self._left_over_bytes`` and
    prepended to the next chunk, so every decode only ever sees whole
    characters.

    Returns:
        str: the decoded chunk (empty string at end of stream).
    """
    raw = self._left_over_bytes + self._body.read(self._read_size)
    # Split off any truncated final character so decode cannot fail.
    valid_raw, self._left_over_bytes = split_by_last_invalid_utf8_char(raw)
    return valid_raw.decode("utf-8")
82 changes: 82 additions & 0 deletions datahub/server/lib/utils/utf8.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
from typing import Tuple


def is_start_byte(b: int) -> bool:
    """Return True when byte *b* can begin a utf8-encoded character.

    In utf8, continuation bytes always have the form ``10xxxxxx``; any
    other high-bit pattern marks the first byte of a 1-4 byte character.
    See https://en.wikipedia.org/wiki/UTF-8
    for encoding details

    Args:
        b (int): a utf8 byte
    Returns:
        bool: whether or not b is a valid starting byte
    """
    # Continuation (non-start) bytes have their top two bits set to 10.
    return (b & 0b11000000) != 0b10000000


CHAR_POS_TO_STR_LENGTH = {7: 1, 5: 2, 4: 3, 3: 4}


def is_bytes_valid_utf8_char(bs: bytes) -> bool:
    """Check if bs encodes exactly one utf8 character.

    Validates the structural encoding only: the leading-byte pattern and
    the matching number of continuation bytes.  It does not reject
    overlong encodings or surrogate code points.

    Args:
        bs (bytes): The bytes string
    Returns:
        bool: Whether or not bs represents 1 utf8 char
    """
    # Guard: the empty byte string encodes no character at all
    # (previously this raised IndexError on bs[0]).
    if not bs:
        return False

    if not is_start_byte(bs[0]):
        return False

    first_zero_char_pos = None
    # Four possible start-byte variations:
    # 0xxxxxxx, 110xxxxx, 1110xxxx, 11110xxx — find the first zero bit
    # to determine the declared character length.
    for i in range(7, 2, -1):
        if (bs[0] & (1 << i)) == 0:
            first_zero_char_pos = i
            break

    # No zero bit in positions 7..3 (e.g. 11111xxx) is invalid -> length 0.
    valid_len = CHAR_POS_TO_STR_LENGTH.get(first_zero_char_pos, 0)
    if len(bs) != valid_len:
        return False

    # Every byte after the first must be a continuation (10xxxxxx) byte.
    if any(is_start_byte(bs[i]) for i in range(1, len(bs))):
        return False

    return True


def split_by_last_invalid_utf8_char(binary_s: bytes) -> Tuple[bytes, bytes]:
    """Split *binary_s* so its first part is safe to decode as utf8.

    Given a utf8 byte string whose final character may have been
    truncated (e.g. by a fixed-size chunked read), break it into a
    decodable prefix and the (possibly empty) bytes of the incomplete
    trailing character, so the tail can be carried over to the next
    chunk.

    Args:
        binary_s (bytes): utf8 encoded bytes string which the last char
            might not be valid
    Returns:
        Tuple[bytes, bytes]: the valid prefix and the invalid suffix.
        NOTE(review): the annotation says Tuple but the implementation
        returns a 2-element list; both unpack identically — confirm
        before tightening the annotation.
    """
    # Scan backwards for the start byte of the final character.
    idx = len(binary_s) - 1
    while idx >= 0 and not is_start_byte(binary_s[idx]):
        idx -= 1

    if idx < 0:
        # No start byte anywhere: the whole string is continuation bytes.
        return [b"", binary_s]

    tail = binary_s[idx:]
    if is_bytes_valid_utf8_char(tail):
        # The final character is complete, so the whole string is valid.
        return [binary_s, b""]

    return [binary_s[:idx], tail]
50 changes: 50 additions & 0 deletions datahub/tests/test_lib/test_utils/test_utf8.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# import pytest
from lib.utils.utf8 import (
is_start_byte,
is_bytes_valid_utf8_char,
split_by_last_invalid_utf8_char,
)


def test_is_start_byte():
    # "哦" and "한" encode to 3 bytes each, "¢" to 2; only the first
    # byte of each character is a start byte.
    expected = [True, False, False, True, False, False, True, False]
    assert [is_start_byte(b) for b in "哦한¢".encode("utf-8")] == expected


def test_is_bytes_valid_utf8_char():
    # Single characters covering 3-, 1- and 4-byte utf8 encodings.
    for char in ["哦", "한", "a", "𐍈"]:
        assert is_bytes_valid_utf8_char(char.encode("utf-8"))

    # Two characters is not a single char.
    assert not is_bytes_valid_utf8_char("哦한".encode("utf-8"))
    # Wrong continuation-byte count: too few, then too many.
    assert not is_bytes_valid_utf8_char(b"\xe5\x93")
    assert not is_bytes_valid_utf8_char(b"\xe5\x93\xa6\xa6")


def test_split_by_last_invalid_utf8_char():
    # Pure ascii is already valid: nothing is carried over.
    assert split_by_last_invalid_utf8_char(b"hellow") == [b"hellow", b""]

    # "你好" encodes to b"\xe4\xbd\xa0\xe5\xa5\xbd"
    full = "你好".encode("utf-8")
    assert split_by_last_invalid_utf8_char(full) == [full, b""]
    # Dropping 1 byte leaves a 2-byte incomplete suffix.
    assert split_by_last_invalid_utf8_char(full[:-1]) == [
        b"\xe4\xbd\xa0",
        b"\xe5\xa5",
    ]
    # Dropping 2 bytes leaves a 1-byte incomplete suffix.
    assert split_by_last_invalid_utf8_char(full[:-2]) == [
        b"\xe4\xbd\xa0",
        b"\xe5",
    ]

0 comments on commit c043438

Please sign in to comment.