diff --git a/dev/archery/archery/integration/datagen.py b/dev/archery/archery/integration/datagen.py index 53f7ba58bff99..5ac32da56a8de 100644 --- a/dev/archery/archery/integration/datagen.py +++ b/dev/archery/archery/integration/datagen.py @@ -665,6 +665,26 @@ def _get_type(self): return OrderedDict([('name', 'largeutf8')]) +class BinaryViewField(BinaryField): + + @property + def column_class(self): + return BinaryViewColumn + + def _get_type(self): + return OrderedDict([('name', 'binaryview')]) + + +class StringViewField(StringField): + + @property + def column_class(self): + return StringViewColumn + + def _get_type(self): + return OrderedDict([('name', 'utf8view')]) + + class Schema(object): def __init__(self, fields, metadata=None): @@ -744,6 +764,74 @@ class LargeStringColumn(_BaseStringColumn, _LargeOffsetsMixin): pass +class BinaryViewColumn(PrimitiveColumn): + + def _encode_value(self, x): + return frombytes(binascii.hexlify(x).upper()) + + def _get_buffers(self): + views = [] + data_buffers = [] + # a small default data buffer size is used so we can exercise + # arrays with multiple data buffers with small data sets + DEFAULT_BUFFER_SIZE = 32 + INLINE_SIZE = 12 + + for i, v in enumerate(self.values): + if not self.is_valid[i]: + v = b'' + assert isinstance(v, bytes) + + if len(v) <= INLINE_SIZE: + # Append an inline view, skip data buffer management. + views.append(OrderedDict([ + ('SIZE', len(v)), + ('INLINED', self._encode_value(v)), + ])) + continue + + if len(data_buffers) == 0: + # No data buffers have been added yet; + # add this string whole (we may append to it later). + offset = 0 + data_buffers.append(v) + elif len(data_buffers[-1]) + len(v) > DEFAULT_BUFFER_SIZE: + # Appending this string to the current active data buffer + # would overflow the default buffer size; add it whole. + offset = 0 + data_buffers.append(v) + else: + # Append this string to the current active data buffer. + offset = len(data_buffers[-1]) + data_buffers[-1] += v + + # the prefix is always 4 bytes so it may not be utf-8 + # even if the whole string view is + prefix = frombytes(binascii.hexlify(v[:4]).upper()) + + views.append(OrderedDict([ + ('SIZE', len(v)), + ('PREFIX_HEX', prefix), + ('BUFFER_INDEX', len(data_buffers) - 1), + ('OFFSET', offset), + ])) + + return [ + ('VALIDITY', [int(x) for x in self.is_valid]), + ('VIEWS', views), + ('VARIADIC_DATA_BUFFERS', [ + frombytes(binascii.hexlify(b).upper()) + for b in data_buffers + ]), + ] + + +class StringViewColumn(BinaryViewColumn): + + def _encode_value(self, x): + return frombytes(x) + + class FixedSizeBinaryColumn(PrimitiveColumn): def _encode_value(self, x): @@ -1568,6 +1656,15 @@ def generate_run_end_encoded_case(): return _generate_file("run_end_encoded", fields, batch_sizes) +def generate_binary_view_case(): + fields = [ + BinaryViewField('bv'), + StringViewField('sv'), + ] + batch_sizes = [0, 7, 256] + return _generate_file("binary_view", fields, batch_sizes) + + def generate_nested_large_offsets_case(): fields = [ LargeListField('large_list_nullable', get_field('item', 'int32')), @@ -1763,6 +1860,14 @@ def _temp_path(): .skip_tester('JS') .skip_tester('Rust'), + generate_binary_view_case() + .skip_tester('C++') + .skip_tester('C#') + .skip_tester('Go') + .skip_tester('Java') + .skip_tester('JS') + .skip_tester('Rust'), + generate_extension_case() .skip_tester('C#') # TODO: ensure the extension is registered in the C++ entrypoint diff --git a/docs/source/format/Columnar.rst b/docs/source/format/Columnar.rst index 3390f1b7b5f2c..afbe2a08ee28c 100644 --- a/docs/source/format/Columnar.rst +++ b/docs/source/format/Columnar.rst @@ -21,7 +21,7 @@ Arrow Columnar Format ********************* -*Version: 1.3* +*Version: 1.4* The "Arrow Columnar Format" includes a language-agnostic in-memory data structure specification, metadata serialization, and a protocol @@ -108,6 +108,10 @@ the different physical layouts defined by Arrow: * **Variable-size Binary**: a sequence of values each having a variable byte length. Two variants of this layout are supported using 32-bit and 64-bit length encoding. +* **Views of Variable-size Binary**: a sequence of values each having a + variable byte length. In contrast to Variable-size Binary, the values + of this layout are distributed across potentially multiple buffers + instead of densely and sequentially packed in a single buffer. * **Fixed-size List**: a nested layout where each value has the same number of elements taken from a child data type. * **Variable-size List**: a nested layout where each value is a @@ -350,6 +354,51 @@ will be represented as follows: :: |----------------|-----------------------| | joemark | unspecified (padding) | +Variable-size Binary View Layout +-------------------------------- + +.. versionadded:: Arrow Columnar Format 1.4 + +Each value in this layout consists of 0 or more bytes. These bytes' +locations are indicated using a **views** buffer, which may point to one +of potentially several **data** buffers or may contain the characters +inline. + +The views buffer contains `length` view structures with the following layout: + +:: + + * Short strings, length <= 12 + | Bytes 0-3 | Bytes 4-15 | + |------------|---------------------------------------| + | length | data (padded with 0) | + + * Long strings, length > 12 + | Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-15 | + |------------|------------|------------|-------------| + | length | prefix | buf. index | offset | + +In both the long and short string cases, the first four bytes encode the +length of the string and can be used to determine how the rest of the view +should be interpreted. + +In the short string case the string's bytes are inlined- stored inside the +view itself, in the twelve bytes which follow the length. + +In the long string case, a buffer index indicates which data buffer +stores the data bytes and an offset indicates where in that buffer the +data bytes begin. Buffer index 0 refers to the first data buffer, IE +the first buffer **after** the validity buffer and the views buffer. +The half-open range ``[offset, offset + length)`` must be entirely contained +within the indicated buffer. A copy of the first four bytes of the string is +stored inline in the prefix, after the length. This prefix enables a +profitable fast path for string comparisons, which are frequently determined +within the first four bytes. + +All integers (length, buffer index, and offset) are signed. + +This layout is adapted from TU Munich's `UmbraDB`_. + .. _variable-size-list-layout: Variable-size List Layout @@ -880,19 +929,20 @@ For the avoidance of ambiguity, we provide listing the order and type of memory buffers for each layout. .. csv-table:: Buffer Layouts - :header: "Layout Type", "Buffer 0", "Buffer 1", "Buffer 2" - :widths: 30, 20, 20, 20 - - "Primitive",validity,data, - "Variable Binary",validity,offsets,data - "List",validity,offsets, - "Fixed-size List",validity,, - "Struct",validity,, - "Sparse Union",type ids,, - "Dense Union",type ids,offsets, - "Null",,, - "Dictionary-encoded",validity,data (indices), - "Run-end encoded",,, + :header: "Layout Type", "Buffer 0", "Buffer 1", "Buffer 2", "Variadic Buffers" + :widths: 30, 20, 20, 20, 20 + + "Primitive",validity,data,, + "Variable Binary",validity,offsets,data, + "Variable Binary View",validity,views,,data + "List",validity,offsets,, + "Fixed-size List",validity,,, + "Struct",validity,,, + "Sparse Union",type ids,,, + "Dense Union",type ids,offsets,, + "Null",,,, + "Dictionary-encoded",validity,data (indices),, + "Run-end encoded",,,, Logical Types ============= @@ -1071,6 +1121,39 @@ bytes. Since this metadata can be used to communicate in-memory pointer addresses between libraries, it is recommended to set ``size`` to the actual memory size rather than the padded size. +Variadic buffers +^^^^^^^^^^^^^^^^ + +Some types such as Utf8View are represented using a variable number of buffers. +For each such Field in the pre-ordered flattened logical schema, there will be +an entry in ``variadicBufferCounts`` to indicate the number of variadic buffers +which belong to that Field in the current RecordBatch. + +For example, consider the schema :: + + col1: Struct + col2: Utf8View + +This has two fields with variadic buffers, so ``variadicBufferCounts`` will +have two entries in each RecordBatch. For a RecordBatch of this schema with +``variadicBufferCounts = [3, 2]``, the flattened buffers would be:: + + buffer 0: col1 validity + buffer 1: col1.a validity + buffer 2: col1.a values + buffer 3: col1.b validity + buffer 4: col1.b views + buffer 5: col1.b data + buffer 6: col1.b data + buffer 7: col1.b data + buffer 8: col1.c validity + buffer 9: col1.c values + buffer 10: col2 validity + buffer 11: col2 views + buffer 12: col2 data + buffer 13: col2 data + + Byte Order (`Endianness`_) --------------------------- @@ -1346,3 +1429,4 @@ the Arrow spec. .. _Endianness: https://en.wikipedia.org/wiki/Endianness .. _SIMD: https://software.intel.com/en-us/cpp-compiler-developer-guide-and-reference-introduction-to-the-simd-data-layout-templates .. _Parquet: https://parquet.apache.org/docs/ +.. _UmbraDB: https://db.in.tum.de/~freitag/papers/p29-neumann-cidr20.pdf diff --git a/format/Message.fbs b/format/Message.fbs index 170ea8fbced89..92a629f3f9d95 100644 --- a/format/Message.fbs +++ b/format/Message.fbs @@ -99,6 +99,22 @@ table RecordBatch { /// Optional compression of the message body compression: BodyCompression; + + /// Some types such as Utf8View are represented using a variable number of buffers. + /// For each such Field in the pre-ordered flattened logical schema, there will be + /// an entry in variadicBufferCounts to indicate the number of number of variadic + /// buffers which belong to that Field in the current RecordBatch. + /// + /// For example, the schema + /// col1: Struct + /// col2: Utf8View + /// contains two Fields with variadic buffers so variadicBufferCounts will have + /// two entries, the first counting the variadic buffers of `col1.b` and the + /// second counting `col2`'s. + /// + /// This field may be omitted if and only if the schema contains no Fields with + /// a variable number of buffers, such as BinaryView and Utf8View. + variadicBufferCounts: [long]; } /// For sending dictionary encoding information. Any Field can be diff --git a/format/Schema.fbs b/format/Schema.fbs index ce29c25b7d1c8..fdaf623931760 100644 --- a/format/Schema.fbs +++ b/format/Schema.fbs @@ -22,6 +22,7 @@ /// Version 1.1 - Add Decimal256. /// Version 1.2 - Add Interval MONTH_DAY_NANO. /// Version 1.3 - Add Run-End Encoded. +/// Version 1.4 - Add BinaryView, Utf8View, and variadicBufferCounts. namespace org.apache.arrow.flatbuf; @@ -171,6 +172,27 @@ table LargeUtf8 { table LargeBinary { } +/// Logically the same as Utf8, but the internal representation uses a view +/// struct that contains the string length and either the string's entire data +/// inline (for small strings) or an inlined prefix, an index of another buffer, +/// and an offset pointing to a slice in that buffer (for non-small strings). +/// +/// Since it uses a variable number of data buffers, each Field with this type +/// must have a corresponding entry in `variadicBufferCounts`. +table Utf8View { +} + +/// Logically the same as Binary, but the internal representation uses a header +/// struct that contains the string length and either the string's entire data +/// inline (for small strings) or an inlined prefix, an index of another buffer, +/// and an offset pointing to a slice in that buffer (for non-small strings). +/// +/// Since it uses a variable number of data buffers, each Field with this type +/// must have a corresponding entry in `variadicBufferCounts`. +table BinaryView { +} + + table FixedSizeBinary { /// Number of bytes per value byteWidth: int; @@ -427,6 +449,8 @@ union Type { LargeUtf8, LargeList, RunEndEncoded, + BinaryView, + Utf8View, } /// ----------------------------------------------------------------------