Skip to content

Commit

Permalink
apacheGH-35627: [Format][Integration] Add string-view to arrow format
Browse files Browse the repository at this point in the history
  • Loading branch information
bkietz committed Sep 1, 2023
1 parent faa7cf6 commit aa75fa8
Show file tree
Hide file tree
Showing 4 changed files with 175 additions and 3 deletions.
103 changes: 103 additions & 0 deletions dev/archery/archery/integration/datagen.py
Original file line number Diff line number Diff line change
Expand Up @@ -664,6 +664,26 @@ def _get_type(self):
return OrderedDict([('name', 'largeutf8')])


class BinaryViewField(BinaryField):
    """Binary field variant whose type descriptor is named ``binaryview``."""

    def _get_type(self):
        """Return the ordered type descriptor for this field."""
        descriptor = OrderedDict()
        descriptor['name'] = 'binaryview'
        return descriptor

    @property
    def column_class(self):
        """Column class paired with this field type."""
        return BinaryViewColumn


class StringViewField(StringField):
    """String field variant whose type descriptor is named ``utf8view``."""

    def _get_type(self):
        """Return the ordered type descriptor for this field."""
        descriptor = OrderedDict()
        descriptor['name'] = 'utf8view'
        return descriptor

    @property
    def column_class(self):
        """Column class paired with this field type."""
        return StringViewColumn


class Schema(object):

def __init__(self, fields, metadata=None):
Expand Down Expand Up @@ -743,6 +763,72 @@ class LargeStringColumn(_BaseStringColumn, _LargeOffsetsMixin):
pass


class BinaryViewColumn(PrimitiveColumn):
    """Column of ``bytes`` values serialized as view structures.

    Values of at most 12 bytes are emitted inline (null-padded to 12
    bytes); longer values are placed in character buffers and referenced
    by buffer index and offset, together with a 4-byte prefix.
    """

    def _encode_value(self, x):
        """Return ``x`` hex-encoded in upper case, passed through frombytes."""
        hex_digits = binascii.hexlify(x)
        return frombytes(hex_digits.upper())

    def _get_buffers(self):
        """Build the VALIDITY, DATA and VARIADIC_BUFFERS entries.

        Returns a list of ``(name, value)`` tuples. Slots marked invalid
        in ``self.is_valid`` are treated as empty byte strings.
        """
        BUFFER_CAPACITY = 32
        MAX_INLINE = 12

        views = []
        buffers = []

        for idx, raw in enumerate(self.values):
            value = raw if self.is_valid[idx] else b''
            assert isinstance(value, bytes)
            size = len(value)

            if size <= MAX_INLINE:
                # Short value: the bytes travel inside the view itself,
                # padded with NULs up to the inline width.
                padded = value.ljust(MAX_INLINE, b'\0')
                inlined = self._encode_value(padded)
                views.append(OrderedDict([
                    ('SIZE', size),
                    ('INLINED', inlined),
                ]))
                continue

            # Long value: it opens a fresh buffer when it is oversized on
            # its own, when no buffer exists yet, or when the current
            # buffer would grow past its capacity.
            needs_new_buffer = (
                size > BUFFER_CAPACITY
                or not buffers
                or len(buffers[-1]) + size > BUFFER_CAPACITY
            )
            if needs_new_buffer:
                offset = 0
                buffers.append(value)
            else:
                offset = len(buffers[-1])
                buffers[-1] += value
                assert len(buffers[-1]) <= BUFFER_CAPACITY

            # The prefix is always exactly 4 bytes, so it may not be
            # valid utf-8 even when the whole value is; hence it is
            # always hex-encoded rather than routed via _encode_value.
            prefix_bytes = value[:4].ljust(4, b'\0')
            prefix = frombytes(binascii.hexlify(prefix_bytes).upper())

            views.append(OrderedDict([
                ('SIZE', size),
                ('PREFIX', prefix),
                ('BUFFER_INDEX', len(buffers) - 1),
                ('OFFSET', offset),
            ]))

        validity = [int(flag) for flag in self.is_valid]
        encoded_buffers = [
            frombytes(binascii.hexlify(chunk).upper())
            for chunk in buffers
        ]
        return [
            ('VALIDITY', validity),
            ('DATA', views),
            ('VARIADIC_BUFFERS', encoded_buffers),
        ]


class StringViewColumn(BinaryViewColumn):
    """View column whose inline values are not hex-encoded."""

    def _encode_value(self, x):
        """Return ``frombytes(x)`` directly, skipping the hex encoding."""
        # NOTE(review): frombytes presumably decodes bytes to str - confirm.
        text = frombytes(x)
        return text


class FixedSizeBinaryColumn(PrimitiveColumn):

def _encode_value(self, x):
Expand Down Expand Up @@ -1541,6 +1627,15 @@ def generate_run_end_encoded_case():
return _generate_file("run_end_encoded", fields, batch_sizes)


def generate_view_case():
    """Generate the "view" file with one binary-view and one string-view field."""
    view_fields = [BinaryViewField('bv'), StringViewField('sv')]
    sizes = [0, 7, 256]
    return _generate_file("view", view_fields, sizes)


def generate_nested_large_offsets_case():
fields = [
LargeListField('large_list_nullable', get_field('item', 'int32')),
Expand Down Expand Up @@ -1740,6 +1835,14 @@ def _temp_path():
.skip_category('JS')
.skip_category('Rust'),

generate_view_case()
.skip_category('C++')
.skip_category('C#')
.skip_category('Go')
.skip_category('Java')
.skip_category('JS')
.skip_category('Rust'),

generate_extension_case()
.skip_category('C#')
.skip_category('JS'),
Expand Down
55 changes: 52 additions & 3 deletions docs/source/format/Columnar.rst
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
Arrow Columnar Format
*********************

*Version: 1.3*
*Version: 1.4*

The "Arrow Columnar Format" includes a language-agnostic in-memory
data structure specification, metadata serialization, and a protocol
Expand Down Expand Up @@ -106,8 +106,10 @@ the different physical layouts defined by Arrow:
* **Primitive (fixed-size)**: a sequence of values each having the
same byte or bit width
* **Variable-size Binary**: a sequence of values each having a variable
byte length. Two variants of this layout are supported using 32-bit
and 64-bit length encoding.
byte length. Three variants of this layout are supported using
* 32-bit offset encoding
* 64-bit offset encoding
* 128-bit view-or-inline encoding
* **Fixed-size List**: a nested layout where each value has the same
number of elements taken from a child data type.
* **Variable-size List**: a nested layout where each value is a
Expand Down Expand Up @@ -350,6 +352,51 @@ will be represented as follows: ::
|----------------|-----------------------|
| joemark | unspecified (padding) |

Variable-size Binary View Layout
--------------------------------

Each value in this layout consists of 0 or more bytes. These characters'
locations are indicated using a **views** buffer, which may point to one
of potentially several **data** buffers or may contain the characters
inline.

The views buffer contains `length` view structures with the following layout:

::

* Short strings, length <= 12
| Bytes 0-3 | Bytes 4-15 |
|------------|---------------------------------------|
| length | data (padded with 0) |

* Long strings, length > 12
| Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-15 |
|------------|------------|------------|-------------|
| length | prefix | buf. index | offset |

In both the long and short string cases, the first four bytes encode the
length of the string and can be used to determine how the rest of the view
should be interpreted.

In the short string case the string's bytes are inlined, i.e. stored inside
the view itself, in the twelve bytes which follow the length.

In the long string case, a buffer index indicates which character buffer
stores the characters and an offset indicates where in that buffer the
characters begin. Buffer index 0 refers to the first character buffer, i.e.
the first buffer **after** the validity buffer and the views buffer.
The half-open range ``[offset, offset + length)`` must be entirely contained
within the indicated buffer. A copy of the first four bytes of the string is
stored inline in the prefix, after the length. This prefix enables a
profitable fast path for string comparisons, which are frequently determined
within the first four bytes.

Views must be aligned to an 8-byte boundary. This restriction enables more
efficient interoperation with systems where the index and offset are replaced
by a raw pointer. All integers (length, buffer index, and offset) are unsigned
for compatibility with engines which already implement these views.
This layout is adapted from TU Munich's `UmbraDB`_.

.. _variable-size-list-layout:

Variable-size List Layout
Expand Down Expand Up @@ -885,6 +932,7 @@ of memory buffers for each layout.

"Primitive",validity,data,
"Variable Binary",validity,offsets,data
"Variable Binary - View",validity,views,*multiple_data_buffers
"List",validity,offsets,
"Fixed-size List",validity,,
"Struct",validity,,
Expand Down Expand Up @@ -1346,3 +1394,4 @@ the Arrow spec.
.. _Endianness: https://en.wikipedia.org/wiki/Endianness
.. _SIMD: https://software.intel.com/en-us/cpp-compiler-developer-guide-and-reference-introduction-to-the-simd-data-layout-templates
.. _Parquet: https://parquet.apache.org/docs/
.. _UmbraDB: https://db.in.tum.de/~freitag/papers/p29-neumann-cidr20.pdf
6 changes: 6 additions & 0 deletions format/Message.fbs
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,12 @@ table RecordBatch {

/// Optional compression of the message body
compression: BodyCompression;

/// Some types such as Utf8View are represented using a variable number of buffers.
/// For each such Field in the pre-ordered flattened logical schema, there will be
/// an entry in variadicCounts to indicate the number of extra buffers which belong
/// to that Field.
variadicCounts: [long];
}

/// For sending dictionary encoding information. Any Field can be
Expand Down
14 changes: 14 additions & 0 deletions format/Schema.fbs
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
/// Version 1.1 - Add Decimal256.
/// Version 1.2 - Add Interval MONTH_DAY_NANO.
/// Version 1.3 - Add Run-End Encoded.
/// Version 1.4 - Add BinaryView and Utf8View.

namespace org.apache.arrow.flatbuf;

Expand Down Expand Up @@ -171,6 +172,17 @@ table LargeUtf8 {
table LargeBinary {
}

/// Same as Utf8, but string characters are delimited with a packed
/// length/pointer instead of offsets.
table Utf8View {
}

/// Same as Binary, but string characters are delimited with a packed
/// length/pointer instead of offsets.
table BinaryView {
}


table FixedSizeBinary {
/// Number of bytes per value
byteWidth: int;
Expand Down Expand Up @@ -427,6 +439,8 @@ union Type {
LargeUtf8,
LargeList,
RunEndEncoded,
BinaryView,
Utf8View,
}

/// ----------------------------------------------------------------------
Expand Down

0 comments on commit aa75fa8

Please sign in to comment.