huggingface · mariosasko · Oct 7, 2022 · Oct 6, 2022 · Oct 6, 2022 · Oct 6, 2022
diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py
@@ -3571,6 +3571,9 @@ def shuffle(
         if len(self) == 0:
             return self
 
+        if keep_in_memory and indices_cache_file_name is not None:
+            raise ValueError("Please use either `keep_in_memory` or `indices_cache_file_name` but not both.")
+
         if seed is not None and generator is not None:
             raise ValueError("Both `seed` and `generator` were provided. Please specify just one of them.")
 
@@ -3585,7 +3588,7 @@ def shuffle(
             generator = np.random.default_rng(seed)
 
         # Check if we've already cached this computation (indexed by a hash)
-        if self.cache_files:
+        if self.cache_files and not keep_in_memory:
             if indices_cache_file_name is None:
                 # we create a unique hash from the function, current dataset file and the mapping args
                 indices_cache_file_name = self._get_cache_file_path(new_fingerprint)