Skip to content

Commit

Permalink
Merge branch 'master' into always-update-metadata-in-arrow-schema
Browse files Browse the repository at this point in the history
  • Loading branch information
lhoestq committed Apr 27, 2021
2 parents 03a57d6 + 80e59ef commit db02eb1
Show file tree
Hide file tree
Showing 3 changed files with 78 additions and 2 deletions.
22 changes: 22 additions & 0 deletions src/datasets/arrow_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -2657,6 +2657,28 @@ def to_dict(self, batch_size: Optional[int] = None, batched: bool = False) -> Un
for offset in range(0, len(self), batch_size)
)

def to_json(
    self,
    path_or_buf: Union[PathLike, BinaryIO],
    batch_size: Optional[int] = None,
    **to_json_kwargs,
) -> int:
    """Exports the dataset to JSON.

    Args:
        path_or_buf (``PathLike`` or ``BinaryIO``): Either a path to a file or an open binary file object.
        batch_size (Optional ``int``): Size of the batch to load in memory and write at once.
            Defaults to :obj:`datasets.config.DEFAULT_MAX_BATCH_SIZE`.
        to_json_kwargs: Parameters to pass to pandas's :func:`pandas.DataFrame.to_json`

    Returns:
        int: The number of characters or bytes written
    """
    # Dynamic import to avoid circular dependency (io.json imports Dataset)
    from .io.json import JsonDatasetWriter

    return JsonDatasetWriter(self, path_or_buf, batch_size=batch_size, **to_json_kwargs).write()

def to_pandas(
self, batch_size: Optional[int] = None, batched: bool = False
) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
Expand Down
48 changes: 46 additions & 2 deletions src/datasets/io/json.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
from typing import Optional
import os
from typing import BinaryIO, Optional, Union

from .. import Features, NamedSplit
from .. import Dataset, Features, NamedSplit, config
from ..formatting import query_table
from ..packaged_modules.json.json import Json
from ..utils.typing import NestedDataStructureLike, PathLike
from .abc import AbstractDatasetReader
Expand Down Expand Up @@ -52,3 +54,45 @@ def read(self):
split=self.split, ignore_verifications=ignore_verifications, in_memory=self.keep_in_memory
)
return dataset


class JsonDatasetWriter:
    """Writes a :class:`Dataset` to JSON, batch by batch, via pandas.

    Args:
        dataset (:class:`Dataset`): The dataset to export.
        path_or_buf (``PathLike`` or ``BinaryIO``): Destination file path or open binary file object.
        batch_size (Optional ``int``): Number of rows loaded in memory and written at once.
            Defaults to :obj:`datasets.config.DEFAULT_MAX_BATCH_SIZE`.
        to_json_kwargs: Parameters forwarded to :func:`pandas.DataFrame.to_json`.
    """

    def __init__(
        self,
        dataset: Dataset,
        path_or_buf: Union[PathLike, BinaryIO],
        batch_size: Optional[int] = None,
        **to_json_kwargs,
    ):
        self.dataset = dataset
        self.path_or_buf = path_or_buf
        self.batch_size = batch_size
        self.to_json_kwargs = to_json_kwargs

    def write(self) -> int:
        """Write the whole dataset and return the number of bytes written."""
        batch_size = self.batch_size if self.batch_size else config.DEFAULT_MAX_BATCH_SIZE

        if isinstance(self.path_or_buf, (str, bytes, os.PathLike)):
            # A path was given: open (and close) the file ourselves, in binary mode.
            with open(self.path_or_buf, "wb+") as buffer:
                written = self._write(file_obj=buffer, batch_size=batch_size, **self.to_json_kwargs)
        else:
            # An open binary file object was given: the caller keeps ownership of it.
            written = self._write(file_obj=self.path_or_buf, batch_size=batch_size, **self.to_json_kwargs)
        return written

    def _write(self, file_obj: BinaryIO, batch_size: int, encoding: str = "utf-8", **to_json_kwargs) -> int:
        """Writes the pyarrow table as JSON to a binary file handle.

        Caller is responsible for opening and closing the handle.

        Returns:
            int: The number of bytes written.
        """
        written = 0
        # The file handle is managed here; never let pandas redirect its own output.
        _ = to_json_kwargs.pop("path_or_buf", None)

        # NOTE(review): with pandas's default orient, writing more than one batch
        # concatenates independent JSON documents into one file — callers likely need
        # ``orient="records", lines=True`` for a valid multi-batch file; confirm intent.
        for offset in range(0, len(self.dataset), batch_size):
            batch = query_table(
                table=self.dataset.data,
                key=slice(offset, offset + batch_size),
                # _indices may be None; passing it through directly is equivalent to the
                # redundant ``x if x is not None else None`` it replaces.
                indices=self.dataset._indices,
            )
            json_str = batch.to_pandas().to_json(path_or_buf=None, **to_json_kwargs)
            written += file_obj.write(json_str.encode(encoding))
        return written
10 changes: 10 additions & 0 deletions tests/test_arrow_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -2163,6 +2163,16 @@ def test_dataset_from_text(path_type, split, features, keep_in_memory, text_path
assert dataset.features[feature].dtype == expected_dtype


def test_dataset_to_json(dataset, tmp_path):
    """to_json writes the dataset to disk and reports the number of bytes written."""
    out_file = tmp_path / "test_path.jsonl"
    n_bytes = dataset.to_json(path_or_buf=out_file)
    assert out_file.is_file()
    assert n_bytes == out_file.stat().st_size
    reloaded = pd.read_json(out_file)
    assert reloaded.shape == dataset.shape
    assert list(reloaded.columns) == list(dataset.column_names)


@pytest.mark.parametrize("in_memory", [False, True])
@pytest.mark.parametrize(
"method_and_params",
Expand Down

1 comment on commit db02eb1

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Show benchmarks

PyArrow==1.0.0

Show updated benchmarks!

Benchmark: benchmark_array_xd.json

metric read_batch_formatted_as_numpy after write_array2d read_batch_formatted_as_numpy after write_flattened_sequence read_batch_formatted_as_numpy after write_nested_sequence read_batch_unformated after write_array2d read_batch_unformated after write_flattened_sequence read_batch_unformated after write_nested_sequence read_col_formatted_as_numpy after write_array2d read_col_formatted_as_numpy after write_flattened_sequence read_col_formatted_as_numpy after write_nested_sequence read_col_unformated after write_array2d read_col_unformated after write_flattened_sequence read_col_unformated after write_nested_sequence read_formatted_as_numpy after write_array2d read_formatted_as_numpy after write_flattened_sequence read_formatted_as_numpy after write_nested_sequence read_unformated after write_array2d read_unformated after write_flattened_sequence read_unformated after write_nested_sequence write_array2d write_flattened_sequence write_nested_sequence
new / old (diff) 0.025987 / 0.011353 (0.014634) 0.020289 / 0.011008 (0.009281) 0.054807 / 0.038508 (0.016299) 0.039004 / 0.023109 (0.015895) 0.370698 / 0.275898 (0.094799) 0.393465 / 0.323480 (0.069985) 0.014211 / 0.007986 (0.006225) 0.005973 / 0.004328 (0.001645) 0.013714 / 0.004250 (0.009463) 0.056616 / 0.037052 (0.019564) 0.377443 / 0.258489 (0.118954) 0.438361 / 0.293841 (0.144520) 0.188620 / 0.128546 (0.060074) 0.155780 / 0.075646 (0.080134) 0.471158 / 0.419271 (0.051886) 0.671940 / 0.043533 (0.628407) 0.383384 / 0.255139 (0.128245) 0.418606 / 0.283200 (0.135407) 2.240852 / 0.141683 (2.099169) 1.974448 / 1.452155 (0.522293) 1.978890 / 1.492716 (0.486173)

Benchmark: benchmark_getitem_100B.json

metric get_batch_of_1024_random_rows get_batch_of_1024_rows get_first_row get_last_row
new / old (diff) 0.008317 / 0.018006 (-0.009689) 0.536212 / 0.000490 (0.535722) 0.000361 / 0.000200 (0.000161) 0.000053 / 0.000054 (-0.000001)

Benchmark: benchmark_indices_mapping.json

metric select shard shuffle sort train_test_split
new / old (diff) 0.050011 / 0.037411 (0.012600) 0.030199 / 0.014526 (0.015673) 0.032914 / 0.176557 (-0.143643) 0.050308 / 0.737135 (-0.686827) 0.036507 / 0.296338 (-0.259831)

Benchmark: benchmark_iterating.json

metric read 5000 read 50000 read_batch 50000 10 read_batch 50000 100 read_batch 50000 1000 read_formatted numpy 5000 read_formatted pandas 5000 read_formatted tensorflow 5000 read_formatted torch 5000 read_formatted_batch numpy 5000 10 read_formatted_batch numpy 5000 1000 shuffled read 5000 shuffled read 50000 shuffled read_batch 50000 10 shuffled read_batch 50000 100 shuffled read_batch 50000 1000 shuffled read_formatted numpy 5000 shuffled read_formatted_batch numpy 5000 10 shuffled read_formatted_batch numpy 5000 1000
new / old (diff) 0.525701 / 0.215209 (0.310492) 5.296068 / 2.077655 (3.218413) 2.453090 / 1.504120 (0.948970) 2.178450 / 1.541195 (0.637256) 2.214200 / 1.468490 (0.745710) 8.136089 / 4.584777 (3.551312) 7.118990 / 3.745712 (3.373278) 9.920997 / 5.269862 (4.651136) 8.823823 / 4.565676 (4.258146) 0.801177 / 0.424275 (0.376902) 0.012163 / 0.007607 (0.004556) 0.690486 / 0.226044 (0.464442) 6.685452 / 2.268929 (4.416523) 3.011064 / 55.444624 (-52.433560) 2.473257 / 6.876477 (-4.403220) 2.532215 / 2.142072 (0.390142) 8.171727 / 4.805227 (3.366500) 6.289535 / 6.500664 (-0.211129) 8.947182 / 0.075469 (8.871713)

Benchmark: benchmark_map_filter.json

metric filter map fast-tokenizer batched map identity map identity batched map no-op batched map no-op batched numpy map no-op batched pandas map no-op batched pytorch map no-op batched tensorflow
new / old (diff) 13.337362 / 1.841788 (11.495575) 14.442332 / 8.074308 (6.368024) 40.189700 / 10.191392 (29.998308) 0.928414 / 0.680424 (0.247990) 0.653530 / 0.534201 (0.119329) 0.894530 / 0.579283 (0.315247) 0.709726 / 0.434364 (0.275362) 0.797356 / 0.540337 (0.257019) 1.722038 / 1.386936 (0.335102)
PyArrow==latest
Show updated benchmarks!

Benchmark: benchmark_array_xd.json

metric read_batch_formatted_as_numpy after write_array2d read_batch_formatted_as_numpy after write_flattened_sequence read_batch_formatted_as_numpy after write_nested_sequence read_batch_unformated after write_array2d read_batch_unformated after write_flattened_sequence read_batch_unformated after write_nested_sequence read_col_formatted_as_numpy after write_array2d read_col_formatted_as_numpy after write_flattened_sequence read_col_formatted_as_numpy after write_nested_sequence read_col_unformated after write_array2d read_col_unformated after write_flattened_sequence read_col_unformated after write_nested_sequence read_formatted_as_numpy after write_array2d read_formatted_as_numpy after write_flattened_sequence read_formatted_as_numpy after write_nested_sequence read_unformated after write_array2d read_unformated after write_flattened_sequence read_unformated after write_nested_sequence write_array2d write_flattened_sequence write_nested_sequence
new / old (diff) 0.027876 / 0.011353 (0.016523) 0.017817 / 0.011008 (0.006809) 0.055170 / 0.038508 (0.016662) 0.041809 / 0.023109 (0.018700) 0.352195 / 0.275898 (0.076297) 0.395005 / 0.323480 (0.071526) 0.012754 / 0.007986 (0.004768) 0.006368 / 0.004328 (0.002040) 0.012476 / 0.004250 (0.008225) 0.061833 / 0.037052 (0.024780) 0.351345 / 0.258489 (0.092856) 0.388815 / 0.293841 (0.094974) 0.192640 / 0.128546 (0.064094) 0.142931 / 0.075646 (0.067284) 0.471615 / 0.419271 (0.052344) 0.445123 / 0.043533 (0.401590) 0.357267 / 0.255139 (0.102128) 0.380835 / 0.283200 (0.097635) 1.745963 / 0.141683 (1.604280) 1.894918 / 1.452155 (0.442763) 1.957762 / 1.492716 (0.465046)

Benchmark: benchmark_getitem_100B.json

metric get_batch_of_1024_random_rows get_batch_of_1024_rows get_first_row get_last_row
new / old (diff) 0.010431 / 0.018006 (-0.007575) 0.545871 / 0.000490 (0.545381) 0.002954 / 0.000200 (0.002754) 0.000094 / 0.000054 (0.000039)

Benchmark: benchmark_indices_mapping.json

metric select shard shuffle sort train_test_split
new / old (diff) 0.046198 / 0.037411 (0.008787) 0.028493 / 0.014526 (0.013967) 0.034112 / 0.176557 (-0.142444) 0.051808 / 0.737135 (-0.685328) 0.034909 / 0.296338 (-0.261429)

Benchmark: benchmark_iterating.json

metric read 5000 read 50000 read_batch 50000 10 read_batch 50000 100 read_batch 50000 1000 read_formatted numpy 5000 read_formatted pandas 5000 read_formatted tensorflow 5000 read_formatted torch 5000 read_formatted_batch numpy 5000 10 read_formatted_batch numpy 5000 1000 shuffled read 5000 shuffled read 50000 shuffled read_batch 50000 10 shuffled read_batch 50000 100 shuffled read_batch 50000 1000 shuffled read_formatted numpy 5000 shuffled read_formatted_batch numpy 5000 10 shuffled read_formatted_batch numpy 5000 1000
new / old (diff) 0.508385 / 0.215209 (0.293176) 5.121920 / 2.077655 (3.044265) 2.457549 / 1.504120 (0.953429) 2.162790 / 1.541195 (0.621595) 2.165441 / 1.468490 (0.696951) 7.769627 / 4.584777 (3.184850) 6.844485 / 3.745712 (3.098773) 9.679675 / 5.269862 (4.409814) 8.416655 / 4.565676 (3.850979) 0.762976 / 0.424275 (0.338701) 0.011864 / 0.007607 (0.004257) 0.658210 / 0.226044 (0.432165) 6.580250 / 2.268929 (4.311322) 3.118259 / 55.444624 (-52.326365) 2.603115 / 6.876477 (-4.273361) 2.622616 / 2.142072 (0.480543) 7.886526 / 4.805227 (3.081299) 5.966182 / 6.500664 (-0.534482) 6.708427 / 0.075469 (6.632958)

Benchmark: benchmark_map_filter.json

metric filter map fast-tokenizer batched map identity map identity batched map no-op batched map no-op batched numpy map no-op batched pandas map no-op batched pytorch map no-op batched tensorflow
new / old (diff) 12.638468 / 1.841788 (10.796680) 13.663384 / 8.074308 (5.589076) 42.270958 / 10.191392 (32.079566) 0.913834 / 0.680424 (0.233410) 0.638656 / 0.534201 (0.104455) 0.841786 / 0.579283 (0.262503) 0.676326 / 0.434364 (0.241962) 0.764192 / 0.540337 (0.223854) 1.680133 / 1.386936 (0.293197)

CML watermark

Please sign in to comment.