Commit 532ae18: style

lhoestq committed Nov 18, 2022
1 parent 2e270dc
Showing 3 changed files with 3 additions and 10 deletions.
4 changes: 1 addition & 3 deletions src/datasets/arrow_dataset.py

@@ -1314,9 +1314,7 @@ def save_to_disk(
                     shard_lengths[job_id], shard_sizes[job_id] = content
                 else:
                     pbar.update(content)
-        with fs.open(
-            path_join(dataset_path, config.DATASET_STATE_JSON_FILENAME), "w", encoding="utf-8"
-        ) as state_file:
+        with fs.open(path_join(dataset_path, config.DATASET_STATE_JSON_FILENAME), "w", encoding="utf-8") as state_file:
             json.dump(state, state_file, indent=2, sort_keys=True)
         with fs.open(
             path_join(dataset_path, config.DATASET_INFO_FILENAME), "w", encoding="utf-8"
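The only change above is joining the `fs.open(...)` call onto one line; behavior is unchanged. For context, a minimal standalone sketch of the same pattern (writing a state JSON file through an fsspec filesystem) is shown below. This is not the library's internal code: `fs`, `dataset_path`, and `state` are hypothetical stand-ins, a local filesystem is assumed, and the literal "state.json" is used in place of `config.DATASET_STATE_JSON_FILENAME`.

```python
import json
import os

import fsspec

# Hypothetical stand-ins for the objects save_to_disk works with
fs = fsspec.filesystem("file")    # any fsspec filesystem (s3, gcs, ...) exposes the same open() API
dataset_path = "/tmp/my_dataset"  # assumed local path for the example
state = {"_fingerprint": "abc123", "_format_type": None}

fs.makedirs(dataset_path, exist_ok=True)
# Same pattern as the reformatted line: open the state file on the target
# filesystem and dump the JSON state with sorted keys.
with fs.open(os.path.join(dataset_path, "state.json"), "w", encoding="utf-8") as state_file:
    json.dump(state, state_file, indent=2, sort_keys=True)
```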
2 changes: 1 addition & 1 deletion tests/test_arrow_dataset.py

@@ -3560,7 +3560,7 @@ def test_dummy_dataset_serialize_fs(dataset, mock_fsspec, tmp_path_factory):
     dataset_path = "mock://my_dataset"
     storage_options = {
         "local_root_dir": tmp_path_factory.mktemp("test_dummy_dataset_serialize_fs"),
-        "auto_mkdir": True
+        "auto_mkdir": True,
     }
     dataset.save_to_disk(dataset_path, storage_options=storage_options)
     reloaded = dataset.load_from_disk(dataset_path, storage_options=storage_options)
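The test above round-trips a dataset through a mocked fsspec filesystem via `storage_options`. A minimal sketch of the same round trip outside the test harness follows; the `memory://` protocol and the tiny dataset are illustrative assumptions, not part of this commit, and any fsspec-supported protocol with matching options should behave the same way.

```python
from datasets import Dataset

# Illustrative round trip through an fsspec URL; storage_options is forwarded
# to the underlying filesystem (empty here because memory:// needs none).
ds = Dataset.from_dict({"text": ["a", "b", "c"]})
ds.save_to_disk("memory://my_dataset", storage_options={})
reloaded = Dataset.load_from_disk("memory://my_dataset", storage_options={})
assert reloaded["text"] == ["a", "b", "c"]
```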
7 changes: 1 addition & 6 deletions tests/test_dataset_dict.py

@@ -12,12 +12,7 @@
 from datasets.features import ClassLabel, Features, Sequence, Value
 from datasets.splits import NamedSplit

-from .utils import (
-    assert_arrow_memory_doesnt_increase,
-    assert_arrow_memory_increases,
-    require_tf,
-    require_torch,
-)
+from .utils import assert_arrow_memory_doesnt_increase, assert_arrow_memory_increases, require_tf, require_torch


 class DatasetDictTest(TestCase):

1 comment on commit 532ae18

@github-actions
Show benchmarks

PyArrow==6.0.0

Show updated benchmarks!

Benchmark: benchmark_array_xd.json

| metric | new / old (diff) |
|---|---|
| read_batch_formatted_as_numpy after write_array2d | 0.008243 / 0.011353 (-0.003110) |
| read_batch_formatted_as_numpy after write_flattened_sequence | 0.004443 / 0.011008 (-0.006565) |
| read_batch_formatted_as_numpy after write_nested_sequence | 0.098287 / 0.038508 (0.059779) |
| read_batch_unformated after write_array2d | 0.029092 / 0.023109 (0.005983) |
| read_batch_unformated after write_flattened_sequence | 0.299848 / 0.275898 (0.023950) |
| read_batch_unformated after write_nested_sequence | 0.356731 / 0.323480 (0.033251) |
| read_col_formatted_as_numpy after write_array2d | 0.006795 / 0.007986 (-0.001191) |
| read_col_formatted_as_numpy after write_flattened_sequence | 0.003321 / 0.004328 (-0.001007) |
| read_col_formatted_as_numpy after write_nested_sequence | 0.075806 / 0.004250 (0.071555) |
| read_col_unformated after write_array2d | 0.034140 / 0.037052 (-0.002912) |
| read_col_unformated after write_flattened_sequence | 0.307256 / 0.258489 (0.048767) |
| read_col_unformated after write_nested_sequence | 0.350526 / 0.293841 (0.056685) |
| read_formatted_as_numpy after write_array2d | 0.036851 / 0.128546 (-0.091696) |
| read_formatted_as_numpy after write_flattened_sequence | 0.014413 / 0.075646 (-0.061233) |
| read_formatted_as_numpy after write_nested_sequence | 0.325244 / 0.419271 (-0.094028) |
| read_unformated after write_array2d | 0.043017 / 0.043533 (-0.000516) |
| read_unformated after write_flattened_sequence | 0.304572 / 0.255139 (0.049433) |
| read_unformated after write_nested_sequence | 0.331277 / 0.283200 (0.048078) |
| write_array2d | 0.086195 / 0.141683 (-0.055488) |
| write_flattened_sequence | 1.494207 / 1.452155 (0.042053) |
| write_nested_sequence | 1.539026 / 1.492716 (0.046310) |

Benchmark: benchmark_getitem_100B.json

| metric | new / old (diff) |
|---|---|
| get_batch_of_1024_random_rows | 0.187563 / 0.018006 (0.169556) |
| get_batch_of_1024_rows | 0.403907 / 0.000490 (0.403417) |
| get_first_row | 0.001142 / 0.000200 (0.000942) |
| get_last_row | 0.000079 / 0.000054 (0.000024) |

Benchmark: benchmark_indices_mapping.json

| metric | new / old (diff) |
|---|---|
| select | 0.023805 / 0.037411 (-0.013606) |
| shard | 0.095813 / 0.014526 (0.081287) |
| shuffle | 0.108324 / 0.176557 (-0.068233) |
| sort | 0.142357 / 0.737135 (-0.594779) |
| train_test_split | 0.110091 / 0.296338 (-0.186247) |

Benchmark: benchmark_iterating.json

| metric | new / old (diff) |
|---|---|
| read 5000 | 0.411938 / 0.215209 (0.196729) |
| read 50000 | 4.129216 / 2.077655 (2.051561) |
| read_batch 50000 10 | 1.852765 / 1.504120 (0.348645) |
| read_batch 50000 100 | 1.665457 / 1.541195 (0.124262) |
| read_batch 50000 1000 | 1.708619 / 1.468490 (0.240129) |
| read_formatted numpy 5000 | 0.688206 / 4.584777 (-3.896571) |
| read_formatted pandas 5000 | 3.338659 / 3.745712 (-0.407053) |
| read_formatted tensorflow 5000 | 2.748148 / 5.269862 (-2.521713) |
| read_formatted torch 5000 | 1.481831 / 4.565676 (-3.083845) |
| read_formatted_batch numpy 5000 10 | 0.080339 / 0.424275 (-0.343936) |
| read_formatted_batch numpy 5000 1000 | 0.011673 / 0.007607 (0.004066) |
| shuffled read 5000 | 0.517948 / 0.226044 (0.291903) |
| shuffled read 50000 | 5.199979 / 2.268929 (2.931050) |
| shuffled read_batch 50000 10 | 2.300550 / 55.444624 (-53.144074) |
| shuffled read_batch 50000 100 | 1.953120 / 6.876477 (-4.923356) |
| shuffled read_batch 50000 1000 | 2.032288 / 2.142072 (-0.109784) |
| shuffled read_formatted numpy 5000 | 0.805017 / 4.805227 (-4.000211) |
| shuffled read_formatted_batch numpy 5000 10 | 0.148193 / 6.500664 (-6.352471) |
| shuffled read_formatted_batch numpy 5000 1000 | 0.065033 / 0.075469 (-0.010436) |

Benchmark: benchmark_map_filter.json

| metric | new / old (diff) |
|---|---|
| filter | 1.552126 / 1.841788 (-0.289662) |
| map fast-tokenizer batched | 12.343958 / 8.074308 (4.269650) |
| map identity | 26.463690 / 10.191392 (16.272298) |
| map identity batched | 0.910772 / 0.680424 (0.230348) |
| map no-op batched | 0.633597 / 0.534201 (0.099396) |
| map no-op batched numpy | 0.394642 / 0.579283 (-0.184641) |
| map no-op batched pandas | 0.391675 / 0.434364 (-0.042689) |
| map no-op batched pytorch | 0.234430 / 0.540337 (-0.305907) |
| map no-op batched tensorflow | 0.242060 / 1.386936 (-1.144876) |

PyArrow==latest
Show updated benchmarks!

Benchmark: benchmark_array_xd.json

| metric | new / old (diff) |
|---|---|
| read_batch_formatted_as_numpy after write_array2d | 0.006490 / 0.011353 (-0.004863) |
| read_batch_formatted_as_numpy after write_flattened_sequence | 0.004517 / 0.011008 (-0.006491) |
| read_batch_formatted_as_numpy after write_nested_sequence | 0.096445 / 0.038508 (0.057937) |
| read_batch_unformated after write_array2d | 0.027204 / 0.023109 (0.004094) |
| read_batch_unformated after write_flattened_sequence | 0.339454 / 0.275898 (0.063556) |
| read_batch_unformated after write_nested_sequence | 0.375035 / 0.323480 (0.051556) |
| read_col_formatted_as_numpy after write_array2d | 0.004792 / 0.007986 (-0.003193) |
| read_col_formatted_as_numpy after write_flattened_sequence | 0.003419 / 0.004328 (-0.000909) |
| read_col_formatted_as_numpy after write_nested_sequence | 0.074614 / 0.004250 (0.070363) |
| read_col_unformated after write_array2d | 0.032879 / 0.037052 (-0.004174) |
| read_col_unformated after write_flattened_sequence | 0.341916 / 0.258489 (0.083427) |
| read_col_unformated after write_nested_sequence | 0.386207 / 0.293841 (0.092366) |
| read_formatted_as_numpy after write_array2d | 0.031079 / 0.128546 (-0.097468) |
| read_formatted_as_numpy after write_flattened_sequence | 0.011486 / 0.075646 (-0.064160) |
| read_formatted_as_numpy after write_nested_sequence | 0.319085 / 0.419271 (-0.100187) |
| read_unformated after write_array2d | 0.042677 / 0.043533 (-0.000856) |
| read_unformated after write_flattened_sequence | 0.342564 / 0.255139 (0.087425) |
| read_unformated after write_nested_sequence | 0.366810 / 0.283200 (0.083611) |
| write_array2d | 0.091209 / 0.141683 (-0.050474) |
| write_flattened_sequence | 1.483901 / 1.452155 (0.031746) |
| write_nested_sequence | 1.559465 / 1.492716 (0.066748) |

Benchmark: benchmark_getitem_100B.json

| metric | new / old (diff) |
|---|---|
| get_batch_of_1024_random_rows | 0.216858 / 0.018006 (0.198852) |
| get_batch_of_1024_rows | 0.397017 / 0.000490 (0.396527) |
| get_first_row | 0.001324 / 0.000200 (0.001124) |
| get_last_row | 0.000077 / 0.000054 (0.000023) |

Benchmark: benchmark_indices_mapping.json

| metric | new / old (diff) |
|---|---|
| select | 0.024396 / 0.037411 (-0.013015) |
| shard | 0.101496 / 0.014526 (0.086970) |
| shuffle | 0.108841 / 0.176557 (-0.067715) |
| sort | 0.143322 / 0.737135 (-0.593813) |
| train_test_split | 0.114300 / 0.296338 (-0.182039) |

Benchmark: benchmark_iterating.json

| metric | new / old (diff) |
|---|---|
| read 5000 | 0.453535 / 0.215209 (0.238326) |
| read 50000 | 4.528972 / 2.077655 (2.451317) |
| read_batch 50000 10 | 2.350900 / 1.504120 (0.846780) |
| read_batch 50000 100 | 2.182359 / 1.541195 (0.641164) |
| read_batch 50000 1000 | 2.220051 / 1.468490 (0.751561) |
| read_formatted numpy 5000 | 0.689232 / 4.584777 (-3.895545) |
| read_formatted pandas 5000 | 3.349990 / 3.745712 (-0.395722) |
| read_formatted tensorflow 5000 | 1.856196 / 5.269862 (-3.413665) |
| read_formatted torch 5000 | 1.163486 / 4.565676 (-3.402191) |
| read_formatted_batch numpy 5000 10 | 0.081741 / 0.424275 (-0.342534) |
| read_formatted_batch numpy 5000 1000 | 0.011668 / 0.007607 (0.004061) |
| shuffled read 5000 | 0.551253 / 0.226044 (0.325209) |
| shuffled read 50000 | 5.508426 / 2.268929 (3.239498) |
| shuffled read_batch 50000 10 | 2.777251 / 55.444624 (-52.667374) |
| shuffled read_batch 50000 100 | 2.488622 / 6.876477 (-4.387855) |
| shuffled read_batch 50000 1000 | 2.595554 / 2.142072 (0.453481) |
| shuffled read_formatted numpy 5000 | 0.807588 / 4.805227 (-3.997639) |
| shuffled read_formatted_batch numpy 5000 10 | 0.152461 / 6.500664 (-6.348203) |
| shuffled read_formatted_batch numpy 5000 1000 | 0.066141 / 0.075469 (-0.009328) |

Benchmark: benchmark_map_filter.json

| metric | new / old (diff) |
|---|---|
| filter | 1.579166 / 1.841788 (-0.262622) |
| map fast-tokenizer batched | 12.606858 / 8.074308 (4.532549) |
| map identity | 12.174640 / 10.191392 (1.983248) |
| map identity batched | 0.948105 / 0.680424 (0.267682) |
| map no-op batched | 0.664222 / 0.534201 (0.130021) |
| map no-op batched numpy | 0.376814 / 0.579283 (-0.202469) |
| map no-op batched pandas | 0.388730 / 0.434364 (-0.045634) |
| map no-op batched pytorch | 0.218941 / 0.540337 (-0.321396) |
| map no-op batched tensorflow | 0.231411 / 1.386936 (-1.155525) |
