don't pickle table index (#2264)
lhoestq authored Apr 26, 2021
1 parent 3db67f5 commit 22c5928
Showing 1 changed file with 18 additions and 23 deletions.
41 changes: 18 additions & 23 deletions src/datasets/table.py
@@ -145,31 +145,30 @@ def __deepcopy__(self, memo: dict):
         return _deepcopy(self, memo)

     def __getstate__(self):
-        state = self.__dict__.copy()
         # We can't pickle objects that are bigger than 4GiB, or it causes OverflowError
         # So we write the table on disk instead
         if self.table.nbytes >= config.MAX_TABLE_NBYTES_FOR_PICKLING:
-            table = state.pop("table")
+            table = self.table
             with tempfile.NamedTemporaryFile("wb", delete=False, suffix=".arrow") as tmp_file:
                 filename = tmp_file.name
                 logger.debug(
                     f"Attempting to pickle a table bigger than 4GiB. Writing it on the disk instead at {filename}"
                 )
                 _write_table_to_file(table=table, filename=filename)
-                state["path"] = filename
-                return state
+                return {"path": filename}
         else:
-            return state
+            return {"table": self.table}

     def __setstate__(self, state):
-        state = state.copy()
         if "path" in state:
-            filename = state.pop("path")
+            filename = state["path"]
             logger.debug(f"Unpickling a big table from the disk at {filename}")
-            state["table"] = _in_memory_arrow_table_from_file(filename)
+            table = _in_memory_arrow_table_from_file(filename)
             logger.debug(f"Removing temporary table file at {filename}")
             os.remove(filename)
-        self.__dict__ = state
+        else:
+            table = state["table"]
+        Table.__init__(self, table)

     @inject_arrow_table_documentation(pa.Table.validate)
     def validate(self, *args, **kwargs):
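This first hunk stops Table from pickling its whole __dict__ (which dragged along derived attributes such as the row index built in __init__): __getstate__ now returns only the wrapped Arrow table, or a temp-file path for tables above the 4GiB pickle limit, and __setstate__ rebuilds everything else by re-running Table.__init__. A minimal sketch of the resulting round-trip, assuming a datasets version that includes this commit and that InMemoryTable inherits this pickling logic unchanged:

import pickle

import pyarrow as pa
from datasets.table import InMemoryTable  # assumed to inherit Table's pickling

# Small table: __getstate__ returns only {"table": ...}; derived state
# (like the row index) never enters the pickle.
t = InMemoryTable(pa.table({"col": [1, 2, 3]}))
restored = pickle.loads(pickle.dumps(t))

# __setstate__ re-ran Table.__init__, so derived attributes were recomputed.
assert restored.table.equals(t.table)

# Tables over config.MAX_TABLE_NBYTES_FOR_PICKLING (4GiB) are instead
# written to a temporary .arrow file and only {"path": ...} is pickled.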
@@ -434,16 +433,14 @@ def from_file(cls, filename: str, replays=None):
         return cls(table, filename, replays)

     def __getstate__(self):
-        state = self.__dict__.copy()
-        state.pop("table")
-        return state
+        return {"path": self.path, "replays": self.replays}

     def __setstate__(self, state):
-        state = state.copy()
-        table = _memory_mapped_arrow_table_from_file(state["path"])
-        table = self._apply_replays(table, state["replays"])
-        state["table"] = table
-        self.__dict__ = state
+        path = state["path"]
+        replays = state["replays"]
+        table = _memory_mapped_arrow_table_from_file(path)
+        table = self._apply_replays(table, replays)
+        MemoryMappedTable.__init__(self, table, path=path, replays=replays)

     @staticmethod
     def _apply_replays(table: pa.Table, replays: Optional[List[Replay]] = None) -> pa.Table:
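For MemoryMappedTable the same idea makes the pickle payload constant-size: only the file path and the replay log are serialized, never the mapped buffers or the index. A hedged sketch, where "data.arrow" is a placeholder path to an existing Arrow IPC file:

import pickle

from datasets.table import MemoryMappedTable

t = MemoryMappedTable.from_file("data.arrow")  # placeholder path

# The pickle now contains just {"path": ..., "replays": ...}, so it stays
# tiny regardless of how large the underlying file is.
payload = pickle.dumps(t)

# Unpickling re-memory-maps the file, re-applies the recorded replays, and
# MemoryMappedTable.__init__ rebuilds the index from the resulting table.
restored = pickle.loads(payload)
assert restored.path == t.path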
@@ -569,14 +566,12 @@ def __init__(self, table: pa.Table, blocks: List[List[TableBlock]]):
         )

     def __getstate__(self):
-        state = self.__dict__.copy()
-        state.pop("table")
-        return state
+        return {"blocks": self.blocks}

     def __setstate__(self, state):
-        state = state.copy()
-        state["table"] = self._concat_blocks_horizontally_and_vertically(state["blocks"])
-        self.__dict__ = state
+        blocks = state["blocks"]
+        table = self._concat_blocks_horizontally_and_vertically(blocks)
+        ConcatenationTable.__init__(self, table, blocks=blocks)

     @staticmethod
     def _concat_blocks(blocks: List[Union[TableBlock, pa.Table]], axis: int = 0) -> pa.Table:
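All three classes now follow the same recipe: __getstate__ returns only the minimal inputs, and __setstate__ feeds them back through __init__, so derived state (the table index the commit title refers to) is recomputed on unpickle instead of being serialized. A toy illustration of that pattern, not code from the datasets repository:

class Wrapped:
    def __init__(self, data):
        self.data = data
        # Derived lookup structure, analogous to the table index:
        # cheap to rebuild, wasteful to pickle.
        self.index = {value: pos for pos, value in enumerate(data)}

    def __getstate__(self):
        # Serialize only the minimal state; leave self.index out.
        return {"data": self.data}

    def __setstate__(self, state):
        # Re-run __init__ so self.index is recomputed.
        Wrapped.__init__(self, state["data"])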

1 comment on commit 22c5928

@github-actions


PyArrow==1.0.0


Benchmark: benchmark_array_xd.json

metric | new / old (diff)
read_batch_formatted_as_numpy after write_array2d | 0.021976 / 0.011353 (0.010623)
read_batch_formatted_as_numpy after write_flattened_sequence | 0.015245 / 0.011008 (0.004237)
read_batch_formatted_as_numpy after write_nested_sequence | 0.051859 / 0.038508 (0.013350)
read_batch_unformated after write_array2d | 0.042318 / 0.023109 (0.019208)
read_batch_unformated after write_flattened_sequence | 0.363137 / 0.275898 (0.087239)
read_batch_unformated after write_nested_sequence | 0.387825 / 0.323480 (0.064345)
read_col_formatted_as_numpy after write_array2d | 0.012179 / 0.007986 (0.004193)
read_col_formatted_as_numpy after write_flattened_sequence | 0.004848 / 0.004328 (0.000519)
read_col_formatted_as_numpy after write_nested_sequence | 0.012831 / 0.004250 (0.008580)
read_col_unformated after write_array2d | 0.055233 / 0.037052 (0.018181)
read_col_unformated after write_flattened_sequence | 0.360258 / 0.258489 (0.101769)
read_col_unformated after write_nested_sequence | 0.415825 / 0.293841 (0.121984)
read_formatted_as_numpy after write_array2d | 0.174870 / 0.128546 (0.046323)
read_formatted_as_numpy after write_flattened_sequence | 0.122913 / 0.075646 (0.047267)
read_formatted_as_numpy after write_nested_sequence | 0.480362 / 0.419271 (0.061091)
read_unformated after write_array2d | 0.473649 / 0.043533 (0.430116)
read_unformated after write_flattened_sequence | 0.358654 / 0.255139 (0.103515)
read_unformated after write_nested_sequence | 0.405395 / 0.283200 (0.122196)
write_array2d | 1.718122 / 0.141683 (1.576439)
write_flattened_sequence | 1.965494 / 1.452155 (0.513339)
write_nested_sequence | 1.950600 / 1.492716 (0.457883)

Benchmark: benchmark_getitem_100B.json

metric | new / old (diff)
get_batch_of_1024_random_rows | 0.017505 / 0.018006 (-0.000502)
get_batch_of_1024_rows | 0.000476 / 0.000490 (-0.000014)
get_first_row | 0.000193 / 0.000200 (-0.000007)
get_last_row | 0.000051 / 0.000054 (-0.000004)

Benchmark: benchmark_indices_mapping.json

metric | new / old (diff)
select | 0.047070 / 0.037411 (0.009659)
shard | 0.022661 / 0.014526 (0.008135)
shuffle | 0.030387 / 0.176557 (-0.146169)
sort | 0.064870 / 0.737135 (-0.672266)
train_test_split | 0.030007 / 0.296338 (-0.266331)

Benchmark: benchmark_iterating.json

metric | new / old (diff)
read 5000 | 0.408569 / 0.215209 (0.193360)
read 50000 | 4.165705 / 2.077655 (2.088050)
read_batch 50000 10 | 2.149507 / 1.504120 (0.645387)
read_batch 50000 100 | 1.929638 / 1.541195 (0.388443)
read_batch 50000 1000 | 1.969953 / 1.468490 (0.501462)
read_formatted numpy 5000 | 6.698927 / 4.584777 (2.114150)
read_formatted pandas 5000 | 6.057253 / 3.745712 (2.311541)
read_formatted tensorflow 5000 | 8.572314 / 5.269862 (3.302452)
read_formatted torch 5000 | 7.402587 / 4.565676 (2.836910)
read_formatted_batch numpy 5000 10 | 0.650394 / 0.424275 (0.226118)
read_formatted_batch numpy 5000 1000 | 0.010379 / 0.007607 (0.002772)
shuffled read 5000 | 0.526479 / 0.226044 (0.300435)
shuffled read 50000 | 5.210233 / 2.268929 (2.941305)
shuffled read_batch 50000 10 | 3.087974 / 55.444624 (-52.356651)
shuffled read_batch 50000 100 | 2.771327 / 6.876477 (-4.105150)
shuffled read_batch 50000 1000 | 2.796243 / 2.142072 (0.654171)
shuffled read_formatted numpy 5000 | 7.011894 / 4.805227 (2.206667)
shuffled read_formatted_batch numpy 5000 10 | 4.968260 / 6.500664 (-1.532404)
shuffled read_formatted_batch numpy 5000 1000 | 6.783388 / 0.075469 (6.707919)

Benchmark: benchmark_map_filter.json

metric | new / old (diff)
filter | 10.448530 / 1.841788 (8.606742)
map fast-tokenizer batched | 14.193357 / 8.074308 (6.119049)
map identity | 30.723333 / 10.191392 (20.531941)
map identity batched | 0.834136 / 0.680424 (0.153712)
map no-op batched | 0.658410 / 0.534201 (0.124209)
map no-op batched numpy | 0.837323 / 0.579283 (0.258040)
map no-op batched pandas | 0.635172 / 0.434364 (0.200808)
map no-op batched pytorch | 0.690120 / 0.540337 (0.149783)
map no-op batched tensorflow | 1.509284 / 1.386936 (0.122348)
PyArrow==latest

Benchmark: benchmark_array_xd.json

metric | new / old (diff)
read_batch_formatted_as_numpy after write_array2d | 0.023170 / 0.011353 (0.011817)
read_batch_formatted_as_numpy after write_flattened_sequence | 0.015545 / 0.011008 (0.004537)
read_batch_formatted_as_numpy after write_nested_sequence | 0.054314 / 0.038508 (0.015806)
read_batch_unformated after write_array2d | 0.045311 / 0.023109 (0.022202)
read_batch_unformated after write_flattened_sequence | 0.356304 / 0.275898 (0.080406)
read_batch_unformated after write_nested_sequence | 0.388218 / 0.323480 (0.064738)
read_col_formatted_as_numpy after write_array2d | 0.011522 / 0.007986 (0.003537)
read_col_formatted_as_numpy after write_flattened_sequence | 0.004583 / 0.004328 (0.000255)
read_col_formatted_as_numpy after write_nested_sequence | 0.011341 / 0.004250 (0.007090)
read_col_unformated after write_array2d | 0.064321 / 0.037052 (0.027269)
read_col_unformated after write_flattened_sequence | 0.369290 / 0.258489 (0.110801)
read_col_unformated after write_nested_sequence | 0.443918 / 0.293841 (0.150077)
read_formatted_as_numpy after write_array2d | 0.155951 / 0.128546 (0.027404)
read_formatted_as_numpy after write_flattened_sequence | 0.126657 / 0.075646 (0.051011)
read_formatted_as_numpy after write_nested_sequence | 0.492374 / 0.419271 (0.073103)
read_unformated after write_array2d | 0.453804 / 0.043533 (0.410271)
read_unformated after write_flattened_sequence | 0.354268 / 0.255139 (0.099129)
read_unformated after write_nested_sequence | 0.469179 / 0.283200 (0.185980)
write_array2d | 2.051445 / 0.141683 (1.909762)
write_flattened_sequence | 1.869129 / 1.452155 (0.416975)
write_nested_sequence | 2.302526 / 1.492716 (0.809810)

Benchmark: benchmark_getitem_100B.json

metric | new / old (diff)
get_batch_of_1024_random_rows | 0.017363 / 0.018006 (-0.000643)
get_batch_of_1024_rows | 0.000460 / 0.000490 (-0.000030)
get_first_row | 0.000179 / 0.000200 (-0.000021)
get_last_row | 0.000044 / 0.000054 (-0.000010)

Benchmark: benchmark_indices_mapping.json

metric | new / old (diff)
select | 0.043950 / 0.037411 (0.006538)
shard | 0.028870 / 0.014526 (0.014344)
shuffle | 0.030215 / 0.176557 (-0.146342)
sort | 0.049977 / 0.737135 (-0.687158)
train_test_split | 0.031497 / 0.296338 (-0.264841)

Benchmark: benchmark_iterating.json

metric | new / old (diff)
read 5000 | 0.405726 / 0.215209 (0.190517)
read 50000 | 4.034754 / 2.077655 (1.957099)
read_batch 50000 10 | 2.049010 / 1.504120 (0.544890)
read_batch 50000 100 | 1.807890 / 1.541195 (0.266695)
read_batch 50000 1000 | 1.824433 / 1.468490 (0.355942)
read_formatted numpy 5000 | 6.610548 / 4.584777 (2.025771)
read_formatted pandas 5000 | 5.886448 / 3.745712 (2.140736)
read_formatted tensorflow 5000 | 8.350467 / 5.269862 (3.080605)
read_formatted torch 5000 | 7.035727 / 4.565676 (2.470050)
read_formatted_batch numpy 5000 10 | 0.701563 / 0.424275 (0.277287)
read_formatted_batch numpy 5000 1000 | 0.012444 / 0.007607 (0.004837)
shuffled read 5000 | 0.641957 / 0.226044 (0.415913)
shuffled read 50000 | 5.443047 / 2.268929 (3.174118)
shuffled read_batch 50000 10 | 3.020290 / 55.444624 (-52.424334)
shuffled read_batch 50000 100 | 2.754576 / 6.876477 (-4.121901)
shuffled read_batch 50000 1000 | 2.794981 / 2.142072 (0.652909)
shuffled read_formatted numpy 5000 | 7.405368 / 4.805227 (2.600141)
shuffled read_formatted_batch numpy 5000 10 | 4.576250 / 6.500664 (-1.924414)
shuffled read_formatted_batch numpy 5000 1000 | 8.263618 / 0.075469 (8.188149)

Benchmark: benchmark_map_filter.json

metric | new / old (diff)
filter | 11.085600 / 1.841788 (9.243813)
map fast-tokenizer batched | 13.943158 / 8.074308 (5.868850)
map identity | 29.800468 / 10.191392 (19.609076)
map identity batched | 0.954594 / 0.680424 (0.274171)
map no-op batched | 0.645966 / 0.534201 (0.111766)
map no-op batched numpy | 0.772599 / 0.579283 (0.193316)
map no-op batched pandas | 0.583637 / 0.434364 (0.149273)
map no-op batched pytorch | 0.711807 / 0.540337 (0.171469)
map no-op batched tensorflow | 1.553971 / 1.386936 (0.167035)
