From 5279db6531f5095b0576717cdfce0dad5b280831 Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Tue, 18 Jun 2024 17:25:32 -0700 Subject: [PATCH] [Data] Disable multithreaded reads if `preserve_order` is enabled (#46135) Multithreaded reads can cause non-deterministic ordering of output rows. Signed-off-by: Balaji Veeramani --- python/ray/data/datasource/file_based_datasource.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/python/ray/data/datasource/file_based_datasource.py b/python/ray/data/datasource/file_based_datasource.py index a24d2700a57d..8253139759d0 100644 --- a/python/ray/data/datasource/file_based_datasource.py +++ b/python/ray/data/datasource/file_based_datasource.py @@ -233,6 +233,11 @@ def create_read_task_fn(read_paths, num_threads): def read_task_fn(): nonlocal num_threads, read_paths + # TODO: We should refactor the code so that we can get the results in + # order even when using multiple threads. + if ctx.execution_options.preserve_order: + num_threads = 0 + if num_threads > 0: if len(read_paths) < num_threads: num_threads = len(read_paths)