Embed image/audio data in dl_and_prepare parquet
lhoestq committed Sep 16, 2022
1 parent 5b23f58 commit 5d3f90b
Showing 3 changed files with 37 additions and 1 deletion.
src/datasets/arrow_writer.py (6 changes: 5 additions & 1 deletion)
@@ -38,7 +38,7 @@
 from .filesystems import is_remote_filesystem
 from .info import DatasetInfo
 from .keyhash import DuplicatedKeysError, KeyHasher
-from .table import array_cast, cast_array_to_feature, table_cast
+from .table import array_cast, cast_array_to_feature, embed_table_storage, table_cast
 from .utils import logging
 from .utils.file_utils import hash_url_to_filename
 from .utils.py_utils import asdict, first_non_null_value
@@ -287,6 +287,7 @@ def __init__(
         update_features: bool = False,
         with_metadata: bool = True,
         unit: str = "examples",
+        embed_local_files: bool = False,
         storage_options: Optional[dict] = None,
     ):
         if path is None and stream is None:
@@ -332,6 +332,7 @@ def __init__(
         self.update_features = update_features
         self.with_metadata = with_metadata
         self.unit = unit
+        self.embed_local_files = embed_local_files

         self._num_examples = 0
         self._num_bytes = 0
@@ -536,6 +538,8 @@ def write_table(self, pa_table: pa.Table, writer_batch_size: Optional[int] = None):
         if self.pa_writer is None:
             self._build_writer(inferred_schema=pa_table.schema)
         pa_table = table_cast(pa_table, self._schema)
+        if self.embed_local_files:
+            pa_table = embed_table_storage(pa_table)
         self._num_bytes += pa_table.nbytes
         self._num_examples += pa_table.num_rows
         self.pa_writer.write_table(pa_table, writer_batch_size)
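For context, a minimal sketch (not part of the diff) of what embed_table_storage does to an Image column here: it reads each local file referenced in the "path" field and inlines its content into the "bytes" field, so the written table no longer depends on the local filesystem. The file name below is hypothetical and the snippet assumes such a file exists on disk.

import pyarrow as pa

from datasets import Features, Image
from datasets.table import embed_table_storage

features = Features({"image": Image()})
# An Image column is stored as struct<bytes: binary, path: string>;
# here only the path is filled in, pointing at a local file.
table = pa.Table.from_pydict(
    {"image": [{"bytes": None, "path": "test_image_rgb.jpg"}]},  # hypothetical local file
    schema=features.arrow_schema,  # carries the feature types as schema metadata
)
embedded = embed_table_storage(table)
assert embedded.to_pydict()["image"][0]["bytes"] is not None  # file content is now inlined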
src/datasets/builder.py (7 changes: 7 additions & 0 deletions)
@@ -619,6 +619,7 @@ def download_and_prepare(
                 If True, will get token from ~/.huggingface.
             file_format (:obj:`str`, optional): format of the data files in which the dataset will be written.
                 Supported formats: "arrow", "parquet". Default to "arrow" format.
+                If the format is "parquet", then image and audio data are embedded into the Parquet files instead of pointing to local files.
                 <Added version="2.5.0"/>
             max_shard_size (:obj:`Union[str, int]`, optional): Maximum number of bytes written per shard.
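A short usage sketch of the behavior documented above; "beans" is just an example image dataset, and any builder with Image or Audio features behaves the same way:

from datasets import load_dataset_builder

builder = load_dataset_builder("beans")  # example dataset with an Image feature
builder.download_and_prepare(file_format="parquet")  # media bytes are embedded in the shards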
@@ -1348,6 +1349,7 @@ def _prepare_split(
         generator = self._generate_examples(**split_generator.gen_kwargs)

         writer_class = ParquetWriter if file_format == "parquet" else ArrowWriter
+        embed_local_files = file_format == "parquet"

         shard_id = 0
         # TODO: embed the images/audio files inside parquet files.
@@ -1358,6 +1360,7 @@
             hash_salt=split_info.name,
             check_duplicates=check_duplicate_keys,
             storage_options=self._fs.storage_options,
+            embed_local_files=embed_local_files,
         )
         total_num_examples, total_num_bytes = 0, 0
         try:
@@ -1381,6 +1384,7 @@
                         hash_salt=split_info.name,
                         check_duplicates=check_duplicate_keys,
                         storage_options=self._fs.storage_options,
+                        embed_local_files=embed_local_files,
                     )
                 example = self.info.features.encode_example(record)
                 writer.write(example, key)
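Side note on the surrounding code: fpath contains an "SSSSS" placeholder that each new shard fills with a zero-padded shard_id. A toy illustration (the template string below is made up):

shard_id = 1
fpath = "squad-train-SSSSS.parquet"  # hypothetical shard path template
print(fpath.replace("SSSSS", f"{shard_id:05d}"))  # squad-train-00001.parquet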
@@ -1474,13 +1478,15 @@ def _prepare_split(
         generator = self._generate_tables(**split_generator.gen_kwargs)

         writer_class = ParquetWriter if file_format == "parquet" else ArrowWriter
+        embed_local_files = file_format == "parquet"

         shard_id = 0
         # TODO: embed the images/audio files inside parquet files.
         writer = writer_class(
             features=self.info.features,
             path=fpath.replace("SSSSS", f"{shard_id:05d}"),
             storage_options=self._fs.storage_options,
+            embed_local_files=embed_local_files,
         )
         total_num_examples, total_num_bytes = 0, 0
         try:
@@ -1499,6 +1505,7 @@
                         features=writer._features,
                         path=fpath.replace("SSSSS", f"{shard_id:05d}"),
                         storage_options=self._fs.storage_options,
+                        embed_local_files=embed_local_files,
                     )
                 writer.write_table(table)
         finally:
tests/test_arrow_writer.py (25 changes: 25 additions & 0 deletions)
@@ -326,3 +326,28 @@ def test_parquet_writer_write():
     stream = pa.BufferReader(output.getvalue())
     pa_table: pa.Table = pq.read_table(stream)
     assert pa_table.to_pydict() == {"col_1": ["foo", "bar"], "col_2": [1, 2]}
+
+
+@require_pil
+@pytest.mark.parametrize("embed_local_files", [False, True])
+def test_writer_embed_local_files(tmp_path, embed_local_files):
+    import PIL.Image
+
+    image_path = str(tmp_path / "test_image_rgb.jpg")
+    PIL.Image.fromarray(np.zeros((5, 5), dtype=np.uint8)).save(image_path, format="png")
+    output = pa.BufferOutputStream()
+    with ParquetWriter(
+        stream=output, features=Features({"image": Image()}), embed_local_files=embed_local_files
+    ) as writer:
+        writer.write({"image": image_path})
+        writer.finalize()
+    stream = pa.BufferReader(output.getvalue())
+    pa_table: pa.Table = pq.read_table(stream)
+    out = pa_table.to_pydict()
+    if embed_local_files:
+        assert out["image"][0]["path"] is None
+        with open(image_path, "rb") as f:
+            assert out["image"][0]["bytes"] == f.read()
+    else:
+        assert out["image"][0]["path"] == image_path
+        assert out["image"][0]["bytes"] is None
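A follow-on sketch, not part of the test: the embedded "bytes" value is the raw file content, so it can be decoded directly, for example with PIL. The path below stands in for the file the test writes, and embedded_bytes stands in for out["image"][0]["bytes"].

import io

import PIL.Image

with open("test_image_rgb.jpg", "rb") as f:  # hypothetical path, as in the test
    embedded_bytes = f.read()
image = PIL.Image.open(io.BytesIO(embedded_bytes))
print(image.size)  # (5, 5) for the test image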

1 comment on commit 5d3f90b

@github-actions
Show benchmarks

PyArrow==6.0.0


Benchmark: benchmark_array_xd.json

(metric: new / old (diff))
read_batch_formatted_as_numpy after write_array2d: 0.007129 / 0.011353 (-0.004224)
read_batch_formatted_as_numpy after write_flattened_sequence: 0.003436 / 0.011008 (-0.007572)
read_batch_formatted_as_numpy after write_nested_sequence: 0.028438 / 0.038508 (-0.010070)
read_batch_unformated after write_array2d: 0.029255 / 0.023109 (0.006146)
read_batch_unformated after write_flattened_sequence: 0.300140 / 0.275898 (0.024242)
read_batch_unformated after write_nested_sequence: 0.356710 / 0.323480 (0.033230)
read_col_formatted_as_numpy after write_array2d: 0.005181 / 0.007986 (-0.002804)
read_col_formatted_as_numpy after write_flattened_sequence: 0.004128 / 0.004328 (-0.000200)
read_col_formatted_as_numpy after write_nested_sequence: 0.006678 / 0.004250 (0.002428)
read_col_unformated after write_array2d: 0.038270 / 0.037052 (0.001218)
read_col_unformated after write_flattened_sequence: 0.310103 / 0.258489 (0.051614)
read_col_unformated after write_nested_sequence: 0.350868 / 0.293841 (0.057027)
read_formatted_as_numpy after write_array2d: 0.028345 / 0.128546 (-0.100201)
read_formatted_as_numpy after write_flattened_sequence: 0.009268 / 0.075646 (-0.066378)
read_formatted_as_numpy after write_nested_sequence: 0.247123 / 0.419271 (-0.172148)
read_unformated after write_array2d: 0.043699 / 0.043533 (0.000166)
read_unformated after write_flattened_sequence: 0.307351 / 0.255139 (0.052212)
read_unformated after write_nested_sequence: 0.331388 / 0.283200 (0.048188)
write_array2d: 0.082912 / 0.141683 (-0.058771)
write_flattened_sequence: 1.511704 / 1.452155 (0.059549)
write_nested_sequence: 1.568146 / 1.492716 (0.075430)

Benchmark: benchmark_getitem_100B.json

(metric: new / old (diff))
get_batch_of_1024_random_rows: 0.207667 / 0.018006 (0.189661)
get_batch_of_1024_rows: 0.414356 / 0.000490 (0.413866)
get_first_row: 0.002581 / 0.000200 (0.002381)
get_last_row: 0.000079 / 0.000054 (0.000024)

Benchmark: benchmark_indices_mapping.json

(metric: new / old (diff))
select: 0.020351 / 0.037411 (-0.017061)
shard: 0.091429 / 0.014526 (0.076903)
shuffle: 0.101529 / 0.176557 (-0.075028)
sort: 0.140933 / 0.737135 (-0.596203)
train_test_split: 0.105648 / 0.296338 (-0.190691)

Benchmark: benchmark_iterating.json

(metric: new / old (diff))
read 5000: 0.416549 / 0.215209 (0.201340)
read 50000: 4.167520 / 2.077655 (2.089865)
read_batch 50000 10: 1.878724 / 1.504120 (0.374604)
read_batch 50000 100: 1.674343 / 1.541195 (0.133148)
read_batch 50000 1000: 1.677563 / 1.468490 (0.209073)
read_formatted numpy 5000: 0.442713 / 4.584777 (-4.142064)
read_formatted pandas 5000: 3.327015 / 3.745712 (-0.418697)
read_formatted tensorflow 5000: 1.833088 / 5.269862 (-3.436773)
read_formatted torch 5000: 1.220767 / 4.565676 (-3.344909)
read_formatted_batch numpy 5000 10: 0.052544 / 0.424275 (-0.371731)
read_formatted_batch numpy 5000 1000: 0.010704 / 0.007607 (0.003097)
shuffled read 5000: 0.515560 / 0.226044 (0.289515)
shuffled read 50000: 5.168033 / 2.268929 (2.899105)
shuffled read_batch 50000 10: 2.285132 / 55.444624 (-53.159492)
shuffled read_batch 50000 100: 1.948361 / 6.876477 (-4.928116)
shuffled read_batch 50000 1000: 2.008439 / 2.142072 (-0.133633)
shuffled read_formatted numpy 5000: 0.557316 / 4.805227 (-4.247911)
shuffled read_formatted_batch numpy 5000 10: 0.117046 / 6.500664 (-6.383618)
shuffled read_formatted_batch numpy 5000 1000: 0.062659 / 0.075469 (-0.012810)

Benchmark: benchmark_map_filter.json

(metric: new / old (diff))
filter: 1.531125 / 1.841788 (-0.310662)
map fast-tokenizer batched: 11.987875 / 8.074308 (3.913567)
map identity: 25.659088 / 10.191392 (15.467696)
map identity batched: 0.903421 / 0.680424 (0.222998)
map no-op batched: 0.594347 / 0.534201 (0.060146)
map no-op batched numpy: 0.349292 / 0.579283 (-0.229991)
map no-op batched pandas: 0.392103 / 0.434364 (-0.042261)
map no-op batched pytorch: 0.230087 / 0.540337 (-0.310251)
map no-op batched tensorflow: 0.236201 / 1.386936 (-1.150736)
PyArrow==latest

Benchmark: benchmark_array_xd.json

(metric: new / old (diff))
read_batch_formatted_as_numpy after write_array2d: 0.005322 / 0.011353 (-0.006031)
read_batch_formatted_as_numpy after write_flattened_sequence: 0.003458 / 0.011008 (-0.007550)
read_batch_formatted_as_numpy after write_nested_sequence: 0.026599 / 0.038508 (-0.011909)
read_batch_unformated after write_array2d: 0.028051 / 0.023109 (0.004942)
read_batch_unformated after write_flattened_sequence: 0.332198 / 0.275898 (0.056300)
read_batch_unformated after write_nested_sequence: 0.391818 / 0.323480 (0.068338)
read_col_formatted_as_numpy after write_array2d: 0.003222 / 0.007986 (-0.004764)
read_col_formatted_as_numpy after write_flattened_sequence: 0.002815 / 0.004328 (-0.001513)
read_col_formatted_as_numpy after write_nested_sequence: 0.004559 / 0.004250 (0.000309)
read_col_unformated after write_array2d: 0.033765 / 0.037052 (-0.003287)
read_col_unformated after write_flattened_sequence: 0.342438 / 0.258489 (0.083949)
read_col_unformated after write_nested_sequence: 0.401147 / 0.293841 (0.107306)
read_formatted_as_numpy after write_array2d: 0.026940 / 0.128546 (-0.101606)
read_formatted_as_numpy after write_flattened_sequence: 0.009373 / 0.075646 (-0.066274)
read_formatted_as_numpy after write_nested_sequence: 0.246956 / 0.419271 (-0.172316)
read_unformated after write_array2d: 0.046462 / 0.043533 (0.002929)
read_unformated after write_flattened_sequence: 0.338426 / 0.255139 (0.083287)
read_unformated after write_nested_sequence: 0.388652 / 0.283200 (0.105453)
write_array2d: 0.086316 / 0.141683 (-0.055367)
write_flattened_sequence: 1.508509 / 1.452155 (0.056354)
write_nested_sequence: 1.551606 / 1.492716 (0.058889)

Benchmark: benchmark_getitem_100B.json

(metric: new / old (diff))
get_batch_of_1024_random_rows: 0.257769 / 0.018006 (0.239763)
get_batch_of_1024_rows: 0.401668 / 0.000490 (0.401178)
get_first_row: 0.010921 / 0.000200 (0.010722)
get_last_row: 0.000110 / 0.000054 (0.000055)

Benchmark: benchmark_indices_mapping.json

(metric: new / old (diff))
select: 0.020295 / 0.037411 (-0.017116)
shard: 0.092825 / 0.014526 (0.078299)
shuffle: 0.103814 / 0.176557 (-0.072743)
sort: 0.142680 / 0.737135 (-0.594455)
train_test_split: 0.106089 / 0.296338 (-0.190250)

Benchmark: benchmark_iterating.json

(metric: new / old (diff))
read 5000: 0.431238 / 0.215209 (0.216029)
read 50000: 4.298477 / 2.077655 (2.220822)
read_batch 50000 10: 2.020772 / 1.504120 (0.516652)
read_batch 50000 100: 1.822965 / 1.541195 (0.281770)
read_batch 50000 1000: 1.847092 / 1.468490 (0.378602)
read_formatted numpy 5000: 0.451785 / 4.584777 (-4.132992)
read_formatted pandas 5000: 3.387336 / 3.745712 (-0.358376)
read_formatted tensorflow 5000: 2.750291 / 5.269862 (-2.519571)
read_formatted torch 5000: 1.430726 / 4.565676 (-3.134951)
read_formatted_batch numpy 5000 10: 0.053630 / 0.424275 (-0.370645)
read_formatted_batch numpy 5000 1000: 0.011021 / 0.007607 (0.003414)
shuffled read 5000: 0.531922 / 0.226044 (0.305878)
shuffled read 50000: 5.328242 / 2.268929 (3.059313)
shuffled read_batch 50000 10: 2.455303 / 55.444624 (-52.989321)
shuffled read_batch 50000 100: 2.126060 / 6.876477 (-4.750417)
shuffled read_batch 50000 1000: 2.186409 / 2.142072 (0.044336)
shuffled read_formatted numpy 5000: 0.561420 / 4.805227 (-4.243808)
shuffled read_formatted_batch numpy 5000 10: 0.121496 / 6.500664 (-6.379169)
shuffled read_formatted_batch numpy 5000 1000: 0.065348 / 0.075469 (-0.010121)

Benchmark: benchmark_map_filter.json

(metric: new / old (diff))
filter: 1.572504 / 1.841788 (-0.269284)
map fast-tokenizer batched: 12.339846 / 8.074308 (4.265538)
map identity: 26.427388 / 10.191392 (16.235996)
map identity batched: 0.917341 / 0.680424 (0.236917)
map no-op batched: 0.643128 / 0.534201 (0.108927)
map no-op batched numpy: 0.344444 / 0.579283 (-0.234839)
map no-op batched pandas: 0.391168 / 0.434364 (-0.043195)
map no-op batched pytorch: 0.229617 / 0.540337 (-0.310720)
map no-op batched tensorflow: 0.234035 / 1.386936 (-1.152901)
