diff --git a/dev/archery/archery/integration/datagen.py b/dev/archery/archery/integration/datagen.py index 99aeda52488db..ecf4804c67635 100644 --- a/dev/archery/archery/integration/datagen.py +++ b/dev/archery/archery/integration/datagen.py @@ -664,6 +664,26 @@ def _get_type(self): return OrderedDict([('name', 'largeutf8')]) +class BinaryViewField(BinaryField): + + @property + def column_class(self): + return BinaryViewColumn + + def _get_type(self): + return OrderedDict([('name', 'binaryview')]) + + +class StringViewField(StringField): + + @property + def column_class(self): + return StringViewColumn + + def _get_type(self): + return OrderedDict([('name', 'utf8view')]) + + class Schema(object): def __init__(self, fields, metadata=None): @@ -743,6 +763,72 @@ class LargeStringColumn(_BaseStringColumn, _LargeOffsetsMixin): pass +class BinaryViewColumn(PrimitiveColumn): + + def _encode_value(self, x): + return frombytes(binascii.hexlify(x).upper()) + + def _get_buffers(self): + char_buffers = [] + DEFAULT_BUFFER_SIZE = 32 # ¯\_(ツ)_/¯ + INLINE_SIZE = 12 + + data = [] + for i, v in enumerate(self.values): + if not self.is_valid[i]: + v = b'' + assert isinstance(v, bytes) + + if len(v) > INLINE_SIZE: + offset = 0 + if len(v) > DEFAULT_BUFFER_SIZE: + char_buffers.append(v) + else: + if len(char_buffers) == 0: + char_buffers.append(v) + elif len(char_buffers[-1]) + len(v) > DEFAULT_BUFFER_SIZE: + char_buffers.append(v) + else: + offset = len(char_buffers[-1]) + char_buffers[-1] += v + assert len(char_buffers[-1]) <= DEFAULT_BUFFER_SIZE + + buffer_index = len(char_buffers) - 1 + + # the prefix is always 4 bytes so it may not be utf-8 + # even if the whole string view is + prefix = v[:4].ljust(4, b'\0') + prefix = frombytes(binascii.hexlify(prefix).upper()) + + data.append(OrderedDict([ + ('SIZE', len(v)), + ('PREFIX', prefix), + ('BUFFER_INDEX', buffer_index), + ('OFFSET', offset), + ])) + else: + inlined = self._encode_value(v.ljust(12, b'\0')) + data.append(OrderedDict([ 
+ ('SIZE', len(v)), + ('INLINED', inlined), + ])) + + return [ + ('VALIDITY', [int(x) for x in self.is_valid]), + ('DATA', data), + ('VARIADIC_BUFFERS', [ + frombytes(binascii.hexlify(buffer).upper()) + for buffer in char_buffers + ]), + ] + + +class StringViewColumn(BinaryViewColumn): + + def _encode_value(self, x): + return frombytes(x) + + class FixedSizeBinaryColumn(PrimitiveColumn): def _encode_value(self, x): @@ -1541,6 +1627,15 @@ def generate_run_end_encoded_case(): return _generate_file("run_end_encoded", fields, batch_sizes) +def generate_view_case(): + fields = [ + BinaryViewField('bv'), + StringViewField('sv'), + ] + batch_sizes = [0, 7, 256] + return _generate_file("view", fields, batch_sizes) + + def generate_nested_large_offsets_case(): fields = [ LargeListField('large_list_nullable', get_field('item', 'int32')), @@ -1740,6 +1835,14 @@ def _temp_path(): .skip_category('JS') .skip_category('Rust'), + generate_view_case() + .skip_category('C++') + .skip_category('C#') + .skip_category('Go') + .skip_category('Java') + .skip_category('JS') + .skip_category('Rust'), + generate_extension_case() .skip_category('C#') .skip_category('JS'), diff --git a/docs/source/format/Columnar.rst b/docs/source/format/Columnar.rst index 3390f1b7b5f2c..abd0fe5bf8ead 100644 --- a/docs/source/format/Columnar.rst +++ b/docs/source/format/Columnar.rst @@ -21,7 +21,7 @@ Arrow Columnar Format ********************* -*Version: 1.3* +*Version: 1.4* The "Arrow Columnar Format" includes a language-agnostic in-memory data structure specification, metadata serialization, and a protocol @@ -106,8 +106,10 @@ the different physical layouts defined by Arrow: * **Primitive (fixed-size)**: a sequence of values each having the same byte or bit width * **Variable-size Binary**: a sequence of values each having a variable - byte length. Two variants of this layout are supported using 32-bit - and 64-bit length encoding. + byte length. 
Three variants of this layout are supported using + * 32-bit offset encoding + * 64-bit offset encoding + * 128-bit view-or-inline encoding * **Fixed-size List**: a nested layout where each value has the same number of elements taken from a child data type. * **Variable-size List**: a nested layout where each value is a @@ -350,6 +352,51 @@ will be represented as follows: :: |----------------|-----------------------| | joemark | unspecified (padding) | +Variable-size Binary View Layout +-------------------------------- + +Each value in this layout consists of 0 or more bytes. These characters' +locations are indicated using a **views** buffer, which may point to one +of potentially several **data** buffers or may contain the characters +inline. + +The views buffer contains `length` view structures with the following layout: + +:: + + * Short strings, length <= 12 + | Bytes 0-3 | Bytes 4-15 | + |------------|---------------------------------------| + | length | data (padded with 0) | + + * Long strings, length > 12 + | Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-15 | + |------------|------------|------------|-------------| + | length | prefix | buf. index | offset | + +In both the long and short string cases, the first four bytes encode the +length of the string and can be used to determine how the rest of the view +should be interpreted. + +In the short string case the string's bytes are inlined, i.e. stored inside the +view itself, in the twelve bytes which follow the length. + +In the long string case, a buffer index indicates which character buffer +stores the characters and an offset indicates where in that buffer the +characters begin. Buffer index 0 refers to the first character buffer, i.e. +the first buffer **after** the validity buffer and the views buffer. +The half-open range ``[offset, offset + length)`` must be entirely contained +within the indicated buffer. A copy of the first four bytes of the string is +stored inline in the prefix, after the length. 
This prefix enables a +profitable fast path for string comparisons, which are frequently determined +within the first four bytes. + +Views must be aligned to an 8-byte boundary. This restriction enables more +efficient interoperation with systems where the index and offset are replaced +by a raw pointer. All integers (length, buffer index, and offset) are unsigned +for compatibility with engines which already implement these views. +This layout is adapted from TU Munich's `UmbraDB`_. + .. _variable-size-list-layout: Variable-size List Layout @@ -885,6 +932,7 @@ of memory buffers for each layout. "Primitive",validity,data, "Variable Binary",validity,offsets,data + "Variable Binary - View",validity,views,*multiple_data_buffers "List",validity,offsets, "Fixed-size List",validity,, "Struct",validity,, @@ -1346,3 +1394,4 @@ the Arrow spec. .. _Endianness: https://en.wikipedia.org/wiki/Endianness .. _SIMD: https://software.intel.com/en-us/cpp-compiler-developer-guide-and-reference-introduction-to-the-simd-data-layout-templates .. _Parquet: https://parquet.apache.org/docs/ +.. _UmbraDB: https://db.in.tum.de/~freitag/papers/p29-neumann-cidr20.pdf diff --git a/format/Message.fbs b/format/Message.fbs index 170ea8fbced89..2633f4dd97b0e 100644 --- a/format/Message.fbs +++ b/format/Message.fbs @@ -99,6 +99,12 @@ table RecordBatch { /// Optional compression of the message body compression: BodyCompression; + + /// Some types such as Utf8View are represented using a variable number of buffers. + /// For each such Field in the pre-ordered flattened logical schema, there will be + /// an entry in variadicCounts to indicate the number of extra buffers which belong + /// to that Field. + variadicCounts: [long]; } /// For sending dictionary encoding information. Any Field can be diff --git a/format/Schema.fbs b/format/Schema.fbs index ce29c25b7d1c8..e3a51fd9a112c 100644 --- a/format/Schema.fbs +++ b/format/Schema.fbs @@ -22,6 +22,7 @@ /// Version 1.1 - Add Decimal256. 
/// Version 1.2 - Add Interval MONTH_DAY_NANO. /// Version 1.3 - Add Run-End Encoded. +/// Version 1.4 - Add BinaryView and Utf8View. namespace org.apache.arrow.flatbuf; @@ -171,6 +172,17 @@ table LargeUtf8 { table LargeBinary { } +/// Same as Utf8, but string characters are delimited with a packed +/// length/pointer instead of offsets. +table Utf8View { +} + +/// Same as Binary, but string characters are delimited with a packed +/// length/pointer instead of offsets. +table BinaryView { +} + + table FixedSizeBinary { /// Number of bytes per value byteWidth: int; @@ -427,6 +439,8 @@ union Type { LargeUtf8, LargeList, RunEndEncoded, + BinaryView, + Utf8View, } /// ----------------------------------------------------------------------