Skip to content

Commit

Permalink
update tests
Browse files (browse the repository at this point in the history)
  • Loading branch information
lhoestq committed Oct 6, 2022
1 parent 19d630a commit 1a56944
Show file tree
Hide file tree
Showing 2 changed files with 77 additions and 3 deletions.
Binary file added tests/features/data/test_image_rgba.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
80 changes: 77 additions & 3 deletions tests/test_formatting.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
_INDICES = [1, 0]

IMAGE_PATH_1 = Path(__file__).parent / "features" / "data" / "test_image_rgb.jpg"
IMAGE_PATH_2 = Path(__file__).parent / "features" / "data" / "test_image_rgba.png"
AUDIO_PATH_1 = Path(__file__).parent / "features" / "data" / "test_audio_44100.wav"


Expand Down Expand Up @@ -117,6 +118,7 @@ def test_numpy_formatter_np_array_kwargs(self):
self.assertEqual(batch["c"].dtype, np.dtype(np.float16))

def test_numpy_formatter_image(self):
# same dimensions
pa_table = pa.table({"image": [{"bytes": None, "path": str(IMAGE_PATH_1)}] * 2})
formatter = NumpyFormatter(features=Features({"image": Image()}))
row = formatter.format_row(pa_table)
Expand All @@ -129,6 +131,23 @@ def test_numpy_formatter_image(self):
self.assertEqual(batch["image"].dtype, np.uint8)
self.assertEqual(batch["image"].shape, (2, 480, 640, 3))

# different dimensions
pa_table = pa.table(
{"image": [{"bytes": None, "path": str(IMAGE_PATH_1)}, {"bytes": None, "path": str(IMAGE_PATH_2)}]}
)
formatter = NumpyFormatter(features=Features({"image": Image()}))
row = formatter.format_row(pa_table)
self.assertEqual(row["image"].dtype, np.uint8)
self.assertEqual(row["image"].shape, (480, 640, 3))
col = formatter.format_column(pa_table)
self.assertIsInstance(col, list)
self.assertEqual(col[0].dtype, np.uint8)
self.assertEqual(col[0].shape, (480, 640, 3))
batch = formatter.format_batch(pa_table)
self.assertIsInstance(batch["image"], list)
self.assertEqual(batch["image"][0].dtype, np.uint8)
self.assertEqual(batch["image"][0].shape, (480, 640, 3))

def test_numpy_formatter_audio(self):
pa_table = pa.table({"audio": [{"bytes": None, "path": str(AUDIO_PATH_1)}]})
formatter = NumpyFormatter(features=Features({"audio": Audio()}))
Expand Down Expand Up @@ -197,6 +216,7 @@ def test_torch_formatter_image(self):

from datasets.formatting import TorchFormatter

# same dimensions
pa_table = pa.table({"image": [{"bytes": None, "path": str(IMAGE_PATH_1)}] * 2})
formatter = TorchFormatter(features=Features({"image": Image()}))
row = formatter.format_row(pa_table)
Expand All @@ -209,6 +229,23 @@ def test_torch_formatter_image(self):
self.assertEqual(batch["image"].dtype, torch.uint8)
self.assertEqual(batch["image"].shape, (2, 480, 640, 3))

# different dimensions
pa_table = pa.table(
{"image": [{"bytes": None, "path": str(IMAGE_PATH_1)}, {"bytes": None, "path": str(IMAGE_PATH_2)}]}
)
formatter = TorchFormatter(features=Features({"image": Image()}))
row = formatter.format_row(pa_table)
self.assertEqual(row["image"].dtype, torch.uint8)
self.assertEqual(row["image"].shape, (480, 640, 3))
col = formatter.format_column(pa_table)
self.assertIsInstance(col, list)
self.assertEqual(col[0].dtype, torch.uint8)
self.assertEqual(col[0].shape, (480, 640, 3))
batch = formatter.format_batch(pa_table)
self.assertIsInstance(batch["image"], list)
self.assertEqual(batch["image"][0].dtype, torch.uint8)
self.assertEqual(batch["image"][0].shape, (480, 640, 3))

@require_torch
def test_torch_formatter_audio(self):
import torch
Expand Down Expand Up @@ -270,6 +307,7 @@ def test_tf_formatter_image(self):

from datasets.formatting import TFFormatter

# same dimensions
pa_table = pa.table({"image": [{"bytes": None, "path": str(IMAGE_PATH_1)}] * 2})
formatter = TFFormatter(features=Features({"image": Image()}))
row = formatter.format_row(pa_table)
Expand All @@ -282,6 +320,23 @@ def test_tf_formatter_image(self):
self.assertEqual(batch["image"][0].dtype, tf.uint8)
self.assertEqual(batch["image"].shape, (2, 480, 640, 3))

# different dimensions
pa_table = pa.table(
{"image": [{"bytes": None, "path": str(IMAGE_PATH_1)}, {"bytes": None, "path": str(IMAGE_PATH_2)}]}
)
formatter = TFFormatter(features=Features({"image": Image()}))
row = formatter.format_row(pa_table)
self.assertEqual(row["image"].dtype, tf.uint8)
self.assertEqual(row["image"].shape, (480, 640, 3))
col = formatter.format_column(pa_table)
self.assertIsInstance(col, list)
self.assertEqual(col[0].dtype, tf.uint8)
self.assertEqual(col[0].shape, (480, 640, 3))
batch = formatter.format_batch(pa_table)
self.assertIsInstance(batch["image"], list)
self.assertEqual(batch["image"][0].dtype, tf.uint8)
self.assertEqual(batch["image"][0].shape, (480, 640, 3))

@require_tf
def test_tf_formatter_audio(self):
import tensorflow as tf
Expand All @@ -299,20 +354,21 @@ def test_tf_formatter_audio(self):

@require_jax
def test_jax_formatter(self):
import jax
import jax.numpy as jnp

from datasets.formatting import JaxFormatter

pa_table = self._create_dummy_table()
formatter = JaxFormatter()
row = formatter.format_row(pa_table)
jnp.allclose(row["a"], jnp.array(_COL_A, dtype=jnp.int64)[0])
jnp.allclose(row["a"], jnp.array(_COL_A, dtype=jnp.int64 if jax.config.jax_enable_x64 else jnp.int32)[0])
assert row["b"] == _COL_B[0]
jnp.allclose(row["c"], jnp.array(_COL_C, dtype=jnp.float32)[0])
col = formatter.format_column(pa_table)
jnp.allclose(col, jnp.array(_COL_A, dtype=jnp.int64))
jnp.allclose(col, jnp.array(_COL_A, dtype=jnp.int64 if jax.config.jax_enable_x64 else jnp.int32))
batch = formatter.format_batch(pa_table)
jnp.allclose(batch["a"], jnp.array(_COL_A, dtype=jnp.int64))
jnp.allclose(batch["a"], jnp.array(_COL_A, dtype=jnp.int64 if jax.config.jax_enable_x64 else jnp.int32))
assert batch["b"] == _COL_B
jnp.allclose(batch["c"], jnp.array(_COL_C, dtype=jnp.float32))
assert batch["c"].shape == np.array(_COL_C).shape
Expand All @@ -339,6 +395,7 @@ def test_jax_formatter_image(self):

from datasets.formatting import JaxFormatter

# same dimensions
pa_table = pa.table({"image": [{"bytes": None, "path": str(IMAGE_PATH_1)}] * 2})
formatter = JaxFormatter(features=Features({"image": Image()}))
row = formatter.format_row(pa_table)
Expand All @@ -351,6 +408,23 @@ def test_jax_formatter_image(self):
self.assertEqual(batch["image"].dtype, jnp.uint8)
self.assertEqual(batch["image"].shape, (2, 480, 640, 3))

# different dimensions
pa_table = pa.table(
{"image": [{"bytes": None, "path": str(IMAGE_PATH_1)}, {"bytes": None, "path": str(IMAGE_PATH_2)}]}
)
formatter = JaxFormatter(features=Features({"image": Image()}))
row = formatter.format_row(pa_table)
self.assertEqual(row["image"].dtype, jnp.uint8)
self.assertEqual(row["image"].shape, (480, 640, 3))
col = formatter.format_column(pa_table)
self.assertIsInstance(col, list)
self.assertEqual(col[0].dtype, jnp.uint8)
self.assertEqual(col[0].shape, (480, 640, 3))
batch = formatter.format_batch(pa_table)
self.assertIsInstance(batch["image"], list)
self.assertEqual(batch["image"][0].dtype, jnp.uint8)
self.assertEqual(batch["image"][0].shape, (480, 640, 3))

@require_jax
def test_jax_formatter_audio(self):
import jax.numpy as jnp
Expand Down

1 comment on commit 1a56944

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Show benchmarks

PyArrow==6.0.0

Show updated benchmarks!

Benchmark: benchmark_array_xd.json

metric read_batch_formatted_as_numpy after write_array2d read_batch_formatted_as_numpy after write_flattened_sequence read_batch_formatted_as_numpy after write_nested_sequence read_batch_unformated after write_array2d read_batch_unformated after write_flattened_sequence read_batch_unformated after write_nested_sequence read_col_formatted_as_numpy after write_array2d read_col_formatted_as_numpy after write_flattened_sequence read_col_formatted_as_numpy after write_nested_sequence read_col_unformated after write_array2d read_col_unformated after write_flattened_sequence read_col_unformated after write_nested_sequence read_formatted_as_numpy after write_array2d read_formatted_as_numpy after write_flattened_sequence read_formatted_as_numpy after write_nested_sequence read_unformated after write_array2d read_unformated after write_flattened_sequence read_unformated after write_nested_sequence write_array2d write_flattened_sequence write_nested_sequence
new / old (diff) 0.008378 / 0.011353 (-0.002975) 0.004541 / 0.011008 (-0.006467) 0.098282 / 0.038508 (0.059774) 0.029472 / 0.023109 (0.006363) 0.312433 / 0.275898 (0.036535) 0.366578 / 0.323480 (0.043098) 0.006919 / 0.007986 (-0.001067) 0.003460 / 0.004328 (-0.000868) 0.076887 / 0.004250 (0.072637) 0.035133 / 0.037052 (-0.001920) 0.313419 / 0.258489 (0.054930) 0.351746 / 0.293841 (0.057905) 0.038257 / 0.128546 (-0.090289) 0.014310 / 0.075646 (-0.061336) 0.322663 / 0.419271 (-0.096609) 0.044677 / 0.043533 (0.001144) 0.303865 / 0.255139 (0.048726) 0.333071 / 0.283200 (0.049871) 0.088940 / 0.141683 (-0.052743) 1.537153 / 1.452155 (0.084999) 1.552432 / 1.492716 (0.059716)

Benchmark: benchmark_getitem_100B.json

metric get_batch_of_1024_random_rows get_batch_of_1024_rows get_first_row get_last_row
new / old (diff) 0.011303 / 0.018006 (-0.006703) 0.408241 / 0.000490 (0.407751) 0.004833 / 0.000200 (0.004634) 0.000080 / 0.000054 (0.000026)

Benchmark: benchmark_indices_mapping.json

metric select shard shuffle sort train_test_split
new / old (diff) 0.021378 / 0.037411 (-0.016034) 0.090722 / 0.014526 (0.076196) 0.103155 / 0.176557 (-0.073401) 0.147013 / 0.737135 (-0.590122) 0.103827 / 0.296338 (-0.192511)

Benchmark: benchmark_iterating.json

metric read 5000 read 50000 read_batch 50000 10 read_batch 50000 100 read_batch 50000 1000 read_formatted numpy 5000 read_formatted pandas 5000 read_formatted tensorflow 5000 read_formatted torch 5000 read_formatted_batch numpy 5000 10 read_formatted_batch numpy 5000 1000 shuffled read 5000 shuffled read 50000 shuffled read_batch 50000 10 shuffled read_batch 50000 100 shuffled read_batch 50000 1000 shuffled read_formatted numpy 5000 shuffled read_formatted_batch numpy 5000 10 shuffled read_formatted_batch numpy 5000 1000
new / old (diff) 0.417183 / 0.215209 (0.201974) 4.156350 / 2.077655 (2.078696) 1.869783 / 1.504120 (0.365663) 1.667920 / 1.541195 (0.126726) 1.703935 / 1.468490 (0.235445) 0.703786 / 4.584777 (-3.880991) 3.322253 / 3.745712 (-0.423460) 1.838496 / 5.269862 (-3.431366) 1.134821 / 4.565676 (-3.430855) 0.080505 / 0.424275 (-0.343770) 0.011530 / 0.007607 (0.003923) 0.527479 / 0.226044 (0.301434) 5.283923 / 2.268929 (3.014994) 2.293246 / 55.444624 (-53.151379) 1.963884 / 6.876477 (-4.912593) 2.053404 / 2.142072 (-0.088668) 0.822195 / 4.805227 (-3.983032) 0.148391 / 6.500664 (-6.352273) 0.064959 / 0.075469 (-0.010510)

Benchmark: benchmark_map_filter.json

metric filter map fast-tokenizer batched map identity map identity batched map no-op batched map no-op batched numpy map no-op batched pandas map no-op batched pytorch map no-op batched tensorflow
new / old (diff) 1.528421 / 1.841788 (-0.313367) 12.240496 / 8.074308 (4.166188) 25.520268 / 10.191392 (15.328876) 0.880635 / 0.680424 (0.200211) 0.615103 / 0.534201 (0.080902) 0.383841 / 0.579283 (-0.195442) 0.387182 / 0.434364 (-0.047182) 0.237424 / 0.540337 (-0.302913) 0.246289 / 1.386936 (-1.140647)

PyArrow==latest

Show updated benchmarks!

Benchmark: benchmark_array_xd.json

metric read_batch_formatted_as_numpy after write_array2d read_batch_formatted_as_numpy after write_flattened_sequence read_batch_formatted_as_numpy after write_nested_sequence read_batch_unformated after write_array2d read_batch_unformated after write_flattened_sequence read_batch_unformated after write_nested_sequence read_col_formatted_as_numpy after write_array2d read_col_formatted_as_numpy after write_flattened_sequence read_col_formatted_as_numpy after write_nested_sequence read_col_unformated after write_array2d read_col_unformated after write_flattened_sequence read_col_unformated after write_nested_sequence read_formatted_as_numpy after write_array2d read_formatted_as_numpy after write_flattened_sequence read_formatted_as_numpy after write_nested_sequence read_unformated after write_array2d read_unformated after write_flattened_sequence read_unformated after write_nested_sequence write_array2d write_flattened_sequence write_nested_sequence
new / old (diff) 0.006626 / 0.011353 (-0.004727) 0.004586 / 0.011008 (-0.006422) 0.097472 / 0.038508 (0.058964) 0.027389 / 0.023109 (0.004280) 0.337568 / 0.275898 (0.061670) 0.370836 / 0.323480 (0.047356) 0.004983 / 0.007986 (-0.003003) 0.003339 / 0.004328 (-0.000989) 0.075444 / 0.004250 (0.071194) 0.031416 / 0.037052 (-0.005636) 0.340576 / 0.258489 (0.082087) 0.382619 / 0.293841 (0.088778) 0.033256 / 0.128546 (-0.095290) 0.011695 / 0.075646 (-0.063951) 0.321445 / 0.419271 (-0.097826) 0.051224 / 0.043533 (0.007691) 0.343257 / 0.255139 (0.088118) 0.364574 / 0.283200 (0.081374) 0.090410 / 0.141683 (-0.051272) 1.490095 / 1.452155 (0.037941) 1.572689 / 1.492716 (0.079973)

Benchmark: benchmark_getitem_100B.json

metric get_batch_of_1024_random_rows get_batch_of_1024_rows get_first_row get_last_row
new / old (diff) 0.203695 / 0.018006 (0.185689) 0.399545 / 0.000490 (0.399055) 0.002763 / 0.000200 (0.002563) 0.000074 / 0.000054 (0.000020)

Benchmark: benchmark_indices_mapping.json

metric select shard shuffle sort train_test_split
new / old (diff) 0.020642 / 0.037411 (-0.016769) 0.094760 / 0.014526 (0.080234) 0.102207 / 0.176557 (-0.074350) 0.138504 / 0.737135 (-0.598631) 0.104275 / 0.296338 (-0.192064)

Benchmark: benchmark_iterating.json

metric read 5000 read 50000 read_batch 50000 10 read_batch 50000 100 read_batch 50000 1000 read_formatted numpy 5000 read_formatted pandas 5000 read_formatted tensorflow 5000 read_formatted torch 5000 read_formatted_batch numpy 5000 10 read_formatted_batch numpy 5000 1000 shuffled read 5000 shuffled read 50000 shuffled read_batch 50000 10 shuffled read_batch 50000 100 shuffled read_batch 50000 1000 shuffled read_formatted numpy 5000 shuffled read_formatted_batch numpy 5000 10 shuffled read_formatted_batch numpy 5000 1000
new / old (diff) 0.438530 / 0.215209 (0.223321) 4.365201 / 2.077655 (2.287547) 2.108204 / 1.504120 (0.604084) 1.914082 / 1.541195 (0.372887) 1.944064 / 1.468490 (0.475574) 0.714388 / 4.584777 (-3.870389) 3.314901 / 3.745712 (-0.430811) 1.833715 / 5.269862 (-3.436147) 1.148384 / 4.565676 (-3.417292) 0.081973 / 0.424275 (-0.342302) 0.011517 / 0.007607 (0.003910) 0.544715 / 0.226044 (0.318671) 5.449475 / 2.268929 (3.180547) 2.545543 / 55.444624 (-52.899082) 2.219667 / 6.876477 (-4.656810) 2.222798 / 2.142072 (0.080725) 0.824837 / 4.805227 (-3.980390) 0.148723 / 6.500664 (-6.351941) 0.063999 / 0.075469 (-0.011470)

Benchmark: benchmark_map_filter.json

metric filter map fast-tokenizer batched map identity map identity batched map no-op batched map no-op batched numpy map no-op batched pandas map no-op batched pytorch map no-op batched tensorflow
new / old (diff) 1.550207 / 1.841788 (-0.291581) 12.268737 / 8.074308 (4.194429) 11.712128 / 10.191392 (1.520736) 0.900404 / 0.680424 (0.219980) 0.630119 / 0.534201 (0.095918) 0.375211 / 0.579283 (-0.204072) 0.376494 / 0.434364 (-0.057870) 0.225975 / 0.540337 (-0.314362) 0.233879 / 1.386936 (-1.153057)

CML watermark

Please sign in to comment.