Skip to content

Commit

Permalink
Avoid PyArrow type optimization if it fails (#3234)
Browse files Browse the repository at this point in the history
* Add option to disable type optimization

* Add a test

* Add DISABLE prefix

* Style

* Revert changes

* Remove col in TypedSequence

* Add fallback in case of range error

* Add test

* Fix

* Log info message
  • Loading branch information
mariosasko authored Nov 10, 2021
1 parent ec37b34 commit 807341d
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 0 deletions.
10 changes: 10 additions & 0 deletions src/datasets/arrow_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@ def __arrow_array__(self, type=None):
trying_type = True
else:
type = self.type
trying_int_optimization = False
try:
if isinstance(type, _ArrayXDExtensionType):
if isinstance(self.data, np.ndarray):
Expand Down Expand Up @@ -130,6 +131,7 @@ def __arrow_array__(self, type=None):
"Specified try_type alters data. Please check that the type/feature that you provided match the type/features of the data."
)
if self.optimized_int_type and self.type is None and self.try_type is None:
trying_int_optimization = True
if pa.types.is_int64(out.type):
out = out.cast(self.optimized_int_type)
elif pa.types.is_list(out.type):
Expand All @@ -154,6 +156,10 @@ def __arrow_array__(self, type=None):
type_(self.data), e
)
) from None
elif trying_int_optimization and "not in range" in str(e):
optimized_int_type_str = np.dtype(self.optimized_int_type.to_pandas_dtype()).name
logger.info(f"Failed to cast a sequence to {optimized_int_type_str}. Falling back to int64.")
return out
else:
raise
elif "overflow" in str(e):
Expand All @@ -162,6 +168,10 @@ def __arrow_array__(self, type=None):
type_(self.data), e
)
) from None
elif trying_int_optimization and "not in range" in str(e):
optimized_int_type_str = np.dtype(self.optimized_int_type.to_pandas_dtype()).name
logger.info(f"Failed to cast a sequence to {optimized_int_type_str}. Falling back to int64.")
return out
else:
raise

Expand Down
19 changes: 19 additions & 0 deletions tests/test_arrow_writer.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import copy
import os
import tempfile
from unittest import TestCase

import numpy as np
import pyarrow as pa
import pytest

Expand Down Expand Up @@ -211,6 +213,13 @@ def get_base_dtype(arr_type):
return arr_type


def change_first_primitive_element_in_list(lst, value):
    """Replace the first primitive (non-list) element of a possibly nested list with ``value``, in place.

    Walks down the chain of first elements until one is not a list, then
    overwrites it. Mirrors the nesting used by the parametrized test sequences.
    """
    target = lst
    # Descend along the leading elements until we hit a non-list leaf.
    while isinstance(target[0], list):
        target = target[0]
    target[0] = value


@pytest.mark.parametrize("optimized_int_type, expected_dtype", [(None, pa.int64()), (pa.int32(), pa.int32())])
@pytest.mark.parametrize("sequence", [[1, 2, 3], [[1, 2, 3]], [[[1, 2, 3]]]])
def test_optimized_int_type_for_typed_sequence(sequence, optimized_int_type, expected_dtype):
Expand All @@ -230,9 +239,19 @@ def test_optimized_int_type_for_typed_sequence(sequence, optimized_int_type, exp
)
@pytest.mark.parametrize("sequence", [[1, 2, 3], [[1, 2, 3]], [[[1, 2, 3]]]])
def test_optimized_typed_sequence(sequence, col, expected_dtype):
    """Check that OptimizedTypedSequence downcasts in-range values and falls back to int64 otherwise."""
    # Values fit the optimized type: the downcast should take effect.
    in_range_arr = pa.array(OptimizedTypedSequence(sequence, col=col))
    assert get_base_dtype(in_range_arr.type) == expected_dtype

    # Columns without an optimized int type have no out-of-range case to test.
    if col == "other":
        return

    # Deep-copy so the shared parametrized sequence is not mutated in place.
    out_of_range = copy.deepcopy(sequence)
    too_big = np.iinfo(expected_dtype.to_pandas_dtype()).max + 1
    change_first_primitive_element_in_list(out_of_range, too_big)
    fallback_arr = pa.array(OptimizedTypedSequence(out_of_range, col=col))
    assert get_base_dtype(fallback_arr.type) == pa.int64()


@pytest.mark.parametrize("raise_exception", [False, True])
def test_arrow_writer_closes_stream(raise_exception, tmp_path):
Expand Down

1 comment on commit 807341d

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Show benchmarks

PyArrow==3.0.0

Show updated benchmarks!

Benchmark: benchmark_array_xd.json

metric read_batch_formatted_as_numpy after write_array2d read_batch_formatted_as_numpy after write_flattened_sequence read_batch_formatted_as_numpy after write_nested_sequence read_batch_unformated after write_array2d read_batch_unformated after write_flattened_sequence read_batch_unformated after write_nested_sequence read_col_formatted_as_numpy after write_array2d read_col_formatted_as_numpy after write_flattened_sequence read_col_formatted_as_numpy after write_nested_sequence read_col_unformated after write_array2d read_col_unformated after write_flattened_sequence read_col_unformated after write_nested_sequence read_formatted_as_numpy after write_array2d read_formatted_as_numpy after write_flattened_sequence read_formatted_as_numpy after write_nested_sequence read_unformated after write_array2d read_unformated after write_flattened_sequence read_unformated after write_nested_sequence write_array2d write_flattened_sequence write_nested_sequence
new / old (diff) 0.070565 / 0.011353 (0.059212) 0.004222 / 0.011008 (-0.006786) 0.031360 / 0.038508 (-0.007148) 0.036070 / 0.023109 (0.012960) 0.334885 / 0.275898 (0.058987) 0.366230 / 0.323480 (0.042750) 0.084525 / 0.007986 (0.076539) 0.005220 / 0.004328 (0.000891) 0.009285 / 0.004250 (0.005035) 0.042954 / 0.037052 (0.005902) 0.334573 / 0.258489 (0.076084) 0.366421 / 0.293841 (0.072580) 0.085349 / 0.128546 (-0.043198) 0.009087 / 0.075646 (-0.066559) 0.252446 / 0.419271 (-0.166825) 0.046128 / 0.043533 (0.002595) 0.333806 / 0.255139 (0.078667) 0.360199 / 0.283200 (0.076999) 0.084323 / 0.141683 (-0.057360) 1.717922 / 1.452155 (0.265768) 1.734924 / 1.492716 (0.242208)

Benchmark: benchmark_getitem_100B.json

metric get_batch_of_1024_random_rows get_batch_of_1024_rows get_first_row get_last_row
new / old (diff) 0.326154 / 0.018006 (0.308147) 0.562520 / 0.000490 (0.562030) 0.003714 / 0.000200 (0.003514) 0.000123 / 0.000054 (0.000068)

Benchmark: benchmark_indices_mapping.json

metric select shard shuffle sort train_test_split
new / old (diff) 0.037533 / 0.037411 (0.000121) 0.021863 / 0.014526 (0.007337) 0.028250 / 0.176557 (-0.148307) 0.197289 / 0.737135 (-0.539846) 0.028950 / 0.296338 (-0.267389)

Benchmark: benchmark_iterating.json

metric read 5000 read 50000 read_batch 50000 10 read_batch 50000 100 read_batch 50000 1000 read_formatted numpy 5000 read_formatted pandas 5000 read_formatted tensorflow 5000 read_formatted torch 5000 read_formatted_batch numpy 5000 10 read_formatted_batch numpy 5000 1000 shuffled read 5000 shuffled read 50000 shuffled read_batch 50000 10 shuffled read_batch 50000 100 shuffled read_batch 50000 1000 shuffled read_formatted numpy 5000 shuffled read_formatted_batch numpy 5000 10 shuffled read_formatted_batch numpy 5000 1000
new / old (diff) 0.421013 / 0.215209 (0.205804) 4.194139 / 2.077655 (2.116484) 1.772151 / 1.504120 (0.268031) 1.556143 / 1.541195 (0.014949) 1.639803 / 1.468490 (0.171313) 0.419274 / 4.584777 (-4.165503) 4.641297 / 3.745712 (0.895585) 2.184463 / 5.269862 (-3.085399) 0.864159 / 4.565676 (-3.701517) 0.050394 / 0.424275 (-0.373881) 0.010830 / 0.007607 (0.003223) 0.524463 / 0.226044 (0.298418) 5.262250 / 2.268929 (2.993321) 2.254825 / 55.444624 (-53.189800) 1.884926 / 6.876477 (-4.991551) 2.050485 / 2.142072 (-0.091588) 0.539522 / 4.805227 (-4.265705) 0.113308 / 6.500664 (-6.387356) 0.056454 / 0.075469 (-0.019015)

Benchmark: benchmark_map_filter.json

metric filter map fast-tokenizer batched map identity map identity batched map no-op batched map no-op batched numpy map no-op batched pandas map no-op batched pytorch map no-op batched tensorflow
new / old (diff) 1.522269 / 1.841788 (-0.319518) 12.289042 / 8.074308 (4.214734) 26.952208 / 10.191392 (16.760816) 0.677184 / 0.680424 (-0.003239) 0.502048 / 0.534201 (-0.032153) 0.369315 / 0.579283 (-0.209969) 0.499788 / 0.434364 (0.065424) 0.256747 / 0.540337 (-0.283591) 0.266148 / 1.386936 (-1.120788)
PyArrow==latest
Show updated benchmarks!

Benchmark: benchmark_array_xd.json

metric read_batch_formatted_as_numpy after write_array2d read_batch_formatted_as_numpy after write_flattened_sequence read_batch_formatted_as_numpy after write_nested_sequence read_batch_unformated after write_array2d read_batch_unformated after write_flattened_sequence read_batch_unformated after write_nested_sequence read_col_formatted_as_numpy after write_array2d read_col_formatted_as_numpy after write_flattened_sequence read_col_formatted_as_numpy after write_nested_sequence read_col_unformated after write_array2d read_col_unformated after write_flattened_sequence read_col_unformated after write_nested_sequence read_formatted_as_numpy after write_array2d read_formatted_as_numpy after write_flattened_sequence read_formatted_as_numpy after write_nested_sequence read_unformated after write_array2d read_unformated after write_flattened_sequence read_unformated after write_nested_sequence write_array2d write_flattened_sequence write_nested_sequence
new / old (diff) 0.068624 / 0.011353 (0.057271) 0.004112 / 0.011008 (-0.006896) 0.029699 / 0.038508 (-0.008809) 0.033696 / 0.023109 (0.010587) 0.298379 / 0.275898 (0.022481) 0.329870 / 0.323480 (0.006390) 0.088594 / 0.007986 (0.080608) 0.004365 / 0.004328 (0.000036) 0.007613 / 0.004250 (0.003363) 0.038300 / 0.037052 (0.001248) 0.300713 / 0.258489 (0.042224) 0.339834 / 0.293841 (0.045993) 0.084313 / 0.128546 (-0.044234) 0.008995 / 0.075646 (-0.066651) 0.251252 / 0.419271 (-0.168019) 0.045451 / 0.043533 (0.001918) 0.302640 / 0.255139 (0.047501) 0.319974 / 0.283200 (0.036774) 0.084675 / 0.141683 (-0.057008) 1.673895 / 1.452155 (0.221741) 1.740900 / 1.492716 (0.248183)

Benchmark: benchmark_getitem_100B.json

metric get_batch_of_1024_random_rows get_batch_of_1024_rows get_first_row get_last_row
new / old (diff) 0.307621 / 0.018006 (0.289614) 0.573892 / 0.000490 (0.573402) 0.002232 / 0.000200 (0.002033) 0.000090 / 0.000054 (0.000035)

Benchmark: benchmark_indices_mapping.json

metric select shard shuffle sort train_test_split
new / old (diff) 0.033202 / 0.037411 (-0.004209) 0.021772 / 0.014526 (0.007247) 0.030534 / 0.176557 (-0.146023) 0.201474 / 0.737135 (-0.535662) 0.032969 / 0.296338 (-0.263370)

Benchmark: benchmark_iterating.json

metric read 5000 read 50000 read_batch 50000 10 read_batch 50000 100 read_batch 50000 1000 read_formatted numpy 5000 read_formatted pandas 5000 read_formatted tensorflow 5000 read_formatted torch 5000 read_formatted_batch numpy 5000 10 read_formatted_batch numpy 5000 1000 shuffled read 5000 shuffled read 50000 shuffled read_batch 50000 10 shuffled read_batch 50000 100 shuffled read_batch 50000 1000 shuffled read_formatted numpy 5000 shuffled read_formatted_batch numpy 5000 10 shuffled read_formatted_batch numpy 5000 1000
new / old (diff) 0.427302 / 0.215209 (0.212093) 4.283054 / 2.077655 (2.205399) 1.829680 / 1.504120 (0.325560) 1.615021 / 1.541195 (0.073826) 1.697338 / 1.468490 (0.228848) 0.420858 / 4.584777 (-4.163919) 4.697026 / 3.745712 (0.951314) 2.138258 / 5.269862 (-3.131603) 0.898892 / 4.565676 (-3.666785) 0.050825 / 0.424275 (-0.373450) 0.011083 / 0.007607 (0.003475) 0.960678 / 0.226044 (0.734633) 11.423592 / 2.268929 (9.154663) 2.358918 / 55.444624 (-53.085706) 1.941956 / 6.876477 (-4.934521) 2.067300 / 2.142072 (-0.074773) 0.539886 / 4.805227 (-4.265341) 0.114267 / 6.500664 (-6.386397) 0.056103 / 0.075469 (-0.019366)

Benchmark: benchmark_map_filter.json

metric filter map fast-tokenizer batched map identity map identity batched map no-op batched map no-op batched numpy map no-op batched pandas map no-op batched pytorch map no-op batched tensorflow
new / old (diff) 1.570715 / 1.841788 (-0.271073) 18.542675 / 8.074308 (10.468367) 27.742867 / 10.191392 (17.551475) 0.850269 / 0.680424 (0.169845) 0.527612 / 0.534201 (-0.006589) 0.375970 / 0.579283 (-0.203314) 0.509188 / 0.434364 (0.074825) 0.264834 / 0.540337 (-0.275504) 0.280697 / 1.386936 (-1.106239)

CML watermark

Please sign in to comment.