Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ensure s3 chunk read accounts for utf8 length #296

Merged
merged 2 commits into from
Oct 26, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion datahub/server/clients/s3_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
import botocore

from env import DataHubSettings
from lib.utils.utf8 import split_by_last_invalid_utf8_char

from .common import ChunkReader, FileDoesNotExist


Expand Down Expand Up @@ -100,6 +102,7 @@ def __init__(
):
self._bucket_name = bucket_name
self._key = key
self._left_over_bytes = b""

super(S3FileReader, self).__init__(read_size, max_read_size)

Expand All @@ -117,4 +120,6 @@ def __init__(
raise e

def read(self):
    """Read the next chunk of the S3 object body and decode it as utf8.

    A fixed-size byte read can stop in the middle of a multi-byte utf8
    character. The bytes of such a cut-off character are held back in
    ``self._left_over_bytes`` and prepended to the next read, so every
    call decodes only complete characters.

    Returns:
        str: The decoded text for this chunk
    """
    raw = self._left_over_bytes + self._body.read(self._read_size)
    valid_raw, self._left_over_bytes = split_by_last_invalid_utf8_char(raw)
    # NOTE(review): if the body ends in the middle of a character, those
    # bytes stay in self._left_over_bytes and are never returned - confirm
    # that silently dropping them is the intended behaviour
    return valid_raw.decode("utf-8")
82 changes: 82 additions & 0 deletions datahub/server/lib/utils/utf8.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
from typing import Tuple


def is_start_byte(b: int) -> bool:
    """Tell whether b can begin a utf8 character

    Continuation bytes have the bit pattern 10xxxxxx, which is the value
    range 0x80-0xBF; every other value is reported as a start byte.
    See https://en.wikipedia.org/wiki/UTF-8 for encoding details

    Args:
        b (int): a utf8 byte

    Returns:
        bool: False if b is a continuation byte, True otherwise
    """
    is_continuation = 0x80 <= b <= 0xBF
    return not is_continuation


# Maps the position of the first 0 bit in a start byte (bit 7 is the most
# significant) to the total number of bytes in the character:
# 0xxxxxxx -> 1, 110xxxxx -> 2, 1110xxxx -> 3, 11110xxx -> 4
CHAR_POS_TO_STR_LENGTH = {7: 1, 5: 2, 4: 3, 3: 4}


def is_bytes_valid_utf8_char(bs: bytes) -> bool:
    """Check if bs has the byte layout of exactly one utf8 character

    Only the structure is checked: the first byte must be a start byte
    announcing a length of 1 to 4 bytes, len(bs) must equal that length,
    and every following byte must be a continuation byte (10xxxxxx).

    Args:
        bs (bytes): The bytes string

    Returns:
        bool: Whether or not bs represents 1 utf8 char. Empty input is
            reported as False.
    """
    # Nothing to inspect; bail out before indexing the first byte
    if not bs:
        return False

    lead_byte = bs[0]
    if not is_start_byte(lead_byte):
        return False

    # Four possible variations of the start byte prefix
    # 0, 110, 1110, 11110
    # Scan from the highest bit down for the first cleared bit; None means
    # bits 7..3 are all set, which is not one of the four variations
    first_zero_char_pos = next(
        (pos for pos in range(7, 2, -1) if (lead_byte & (1 << pos)) == 0),
        None,
    )

    # Unknown prefixes map to length 0, which a non-empty bs never matches
    valid_len = CHAR_POS_TO_STR_LENGTH.get(first_zero_char_pos, 0)
    if len(bs) != valid_len:
        return False

    # Every byte after the first must be a continuation byte
    return not any(is_start_byte(b) for b in bs[1:])


def split_by_last_invalid_utf8_char(binary_s: bytes) -> Tuple[bytes, bytes]:
    """Cut binary_s so that the first part ends on a complete utf8 character

    When a utf8 byte string is consumed in fixed-size pieces, a piece may end
    in the middle of a multi-byte character. This splits such a piece into a
    prefix that ends on a character boundary and the trailing bytes of the
    cut-off character, which can be prepended to the next piece.

    Args:
        binary_s (bytes): utf8 encoded bytes string which the last char might
            not be valid

    Returns:
        Tuple[bytes, bytes]: The first bytes are valid prefix, the second
            bytes is the invalid suffix. Note that the pair is returned as a
            two-element list.
    """
    # Walk backwards to the start byte of the last character
    for pos in range(len(binary_s) - 1, -1, -1):
        if not is_start_byte(binary_s[pos]):
            continue

        trailing_char = binary_s[pos:]
        if is_bytes_valid_utf8_char(trailing_char):
            # The last character is complete, so the whole string is valid
            return [binary_s, b""]
        # The last character is cut off; hold it back as the suffix
        return [binary_s[:pos], trailing_char]

    # We went thru the entire string and found no valid start byte
    return [b"", binary_s]
50 changes: 50 additions & 0 deletions datahub/tests/test_lib/test_utils/test_utf8.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# import pytest
from lib.utils.utf8 import (
is_start_byte,
is_bytes_valid_utf8_char,
split_by_last_invalid_utf8_char,
)


def test_is_start_byte():
    # The first two characters encode to 3 bytes each and the last one to
    # 2 bytes; only the first byte of each character is a start byte
    expected_flags = [True, False, False] * 2 + [True, False]
    encoded = "哦한¢".encode("utf-8")
    assert [is_start_byte(byte) for byte in encoded] == expected_flags


def test_is_bytes_valid_utf8_char():
    # Each of these is exactly one character once encoded
    for single_char in ("哦", "한", "a", "𐍈"):
        assert is_bytes_valid_utf8_char(single_char.encode("utf-8"))

    not_single_chars = (
        "哦한".encode("utf-8"),  # 2 chars
        b"\xe5\x93",  # 3-byte char with its last byte missing
        b"\xe5\x93\xa6\xa6",  # full char followed by an extra continuation byte
    )
    for raw in not_single_chars:
        assert not is_bytes_valid_utf8_char(raw)


def test_split_by_last_invalid_utf8_char():
    # Pure ascii input is returned untouched with nothing held back
    ascii_bytes = "hellow".encode("utf-8")
    assert split_by_last_invalid_utf8_char(ascii_bytes) == [ascii_bytes, b""]

    # "你好" b'\xe4\xbd\xa0\xe5\xa5\xbd'
    first_char = b"\xe4\xbd\xa0"
    second_char = b"\xe5\xa5\xbd"
    both_chars = first_char + second_char
    assert split_by_last_invalid_utf8_char(both_chars) == [both_chars, b""]

    # with the last 1 byte removed
    assert split_by_last_invalid_utf8_char(both_chars[:-1]) == [
        first_char,
        second_char[:2],
    ]
    # with the last 2 bytes removed
    assert split_by_last_invalid_utf8_char(both_chars[:-2]) == [
        first_char,
        second_char[:1],
    ]