From cf60105d9515250c8fc5594d4506546f30668bd1 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com>
Date: Thu, 25 Feb 2021 15:05:55 +0100
Subject: [PATCH 01/30] Test Dataset CSV Reader

---
 tests/io/test_csv.py | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)
 create mode 100644 tests/io/test_csv.py

diff --git a/tests/io/test_csv.py b/tests/io/test_csv.py
new file mode 100644
index 00000000000..2fe68b41eb3
--- /dev/null
+++ b/tests/io/test_csv.py
@@ -0,0 +1,36 @@
+import csv
+
+import pytest
+
+from datasets import NamedSplit
+from datasets.io.csv import CsvDatasetReader
+
+
+DATA = [
+    {"col_1": "0", "col_2": 0, "col_3": 0.0},
+    {"col_1": "1", "col_2": 1, "col_3": 1.0},
+    {"col_1": "2", "col_2": 2, "col_3": 2.0},
+    {"col_1": "3", "col_2": 3, "col_3": 3.0},
+]
+
+
+@pytest.fixture(scope="session")
+def csv_path(tmp_path_factory):
+    path = str(tmp_path_factory.mktemp("data") / "dataset.csv")
+    with open(path, "w") as f:
+        writer = csv.DictWriter(f, fieldnames=["col_1", "col_2", "col_3"])
+        writer.writeheader()
+        for item in DATA:
+            writer.writerow(item)
+    return path
+
+
+@pytest.mark.parametrize("split", [None, NamedSplit("train")])
+def test_dataset_csv_reader(split, csv_path):
+    path = csv_path
+
+    ds = CsvDatasetReader(path, split=split).read()
+    assert ds.num_rows == 4
+    assert ds.num_columns == 3
+    assert ds.column_names == ["col_1", "col_2", "col_3"]
+    assert ds.split == split

From 5ce577fd7a21fca55493f94f48d379cae0b81b44 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com>
Date: Thu, 25 Feb 2021 15:07:36 +0100
Subject: [PATCH 02/30] Implement CsvDatasetReader

---
 src/datasets/arrow_dataset.py |  3 +--
 src/datasets/io/__init__.py   |  0
 src/datasets/io/abc.py        | 20 ++++++++++++++++++++
 src/datasets/io/csv.py        | 17 +++++++++++++++++
 src/datasets/utils/typing.py  |  5 +++++
 5 files changed, 43 insertions(+), 2 deletions(-)
 create mode 100644 src/datasets/io/__init__.py
 create mode 100644 src/datasets/io/abc.py
 create mode 100644 src/datasets/io/csv.py
 create mode 100644 src/datasets/utils/typing.py

diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py
index b7c5b27ba7a..d7558b9908d 100644
--- a/src/datasets/arrow_dataset.py
+++ b/src/datasets/arrow_dataset.py
@@ -58,6 +58,7 @@
 from .utils import map_nested
 from .utils.deprecation_utils import deprecated
 from .utils.logging import WARNING, get_logger, get_verbosity, set_verbosity_warning
+from .utils.typing import PathLike
 
 
 if TYPE_CHECKING:
@@ -70,8 +71,6 @@
 else:
     PYARROW_V0 = False
 
-PathLike = Union[str, bytes, os.PathLike]
-
 
 class DatasetInfoMixin(object):
     """This base class exposes some attributes of DatasetInfo
diff --git a/src/datasets/io/__init__.py b/src/datasets/io/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/src/datasets/io/abc.py b/src/datasets/io/abc.py
new file mode 100644
index 00000000000..22ae4dc7e6a
--- /dev/null
+++ b/src/datasets/io/abc.py
@@ -0,0 +1,20 @@
+from abc import ABC, abstractmethod
+from typing import Optional
+
+from .. import DatasetInfo, NamedSplit
+from ..arrow_dataset import Dataset
+from ..utils.typing import PathLike
+
+
+class AbstractDatasetReader(ABC):
+    def __init__(
+        self, path: PathLike, info: Optional[DatasetInfo] = None, split: Optional[NamedSplit] = None, **kwargs
+    ):
+        self.path = path
+        self.info = info
+        self.split = split
+        self.kwargs = kwargs
+
+    @abstractmethod
+    def read(self) -> Dataset:
+        pass
diff --git a/src/datasets/io/csv.py b/src/datasets/io/csv.py
new file mode 100644
index 00000000000..a81084d9e3d
--- /dev/null
+++ b/src/datasets/io/csv.py
@@ -0,0 +1,17 @@
+import pandas as pd
+import pyarrow as pa
+
+from ..arrow_dataset import Dataset
+from .abc import AbstractDatasetReader
+
+
+class CsvDatasetReader(AbstractDatasetReader):
+    def read(self):
+        table = self._read_table()
+        return Dataset(table, info=self.info, split=self.split)
+
+    def _read_table(self):
+        schema = pa.schema(self.info.features.type) if self.info and self.info.features else None
+        df = pd.read_csv(self.path, **self.kwargs)
+        table = pa.Table.from_pandas(df, schema=schema)
+        return table
diff --git a/src/datasets/utils/typing.py b/src/datasets/utils/typing.py
new file mode 100644
index 00000000000..ee3d56df41f
--- /dev/null
+++ b/src/datasets/utils/typing.py
@@ -0,0 +1,5 @@
+import os
+from typing import Union
+
+
+PathLike = Union[str, bytes, os.PathLike]

From c478ed5ff18bf8b560bc29b5c2c1d00a563a7c89 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com>
Date: Thu, 25 Feb 2021 15:08:11 +0100
Subject: [PATCH 03/30] Implement Dataset.from_csv

---
 src/datasets/arrow_dataset.py | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py
index d7558b9908d..f727a888148 100644
--- a/src/datasets/arrow_dataset.py
+++ b/src/datasets/arrow_dataset.py
@@ -434,6 +434,25 @@ def from_dict(
         pa_table: pa.Table = pa.Table.from_pydict(mapping=mapping)
         return cls(pa_table, info=info, split=split)
 
+    @staticmethod
+    def from_csv(
+        path: PathLike,
+        info: Optional[DatasetInfo] = None,
+        split: Optional[NamedSplit] = None,
+    ):
+        """Create Dataset from CSV file.
+        Args:
+            path (path-like): Path of the CSV file.
+            info (DatasetInfo, optional): Dataset information, like description, citation, etc.
+            split (NamedSplit, optional): Name of the dataset split.
+        Returns:
+            datasets.Dataset
+        """
+        # Dynamic import to avoid circular dependency
+        from .io.csv import CsvDatasetReader
+
+        return CsvDatasetReader(path, info=info, split=split).read()
+
     def __del__(self):
         if hasattr(self, "_data"):
             del self._data
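[Illustrative usage, not part of the patch series: a minimal sketch of the entry point introduced in PATCH 03. The file name "dataset.csv" is a placeholder for any CSV file with a header row.]

    from datasets import Dataset, NamedSplit

    # At this stage from_csv() wraps CsvDatasetReader, which parses the file
    # with pandas.read_csv and converts the resulting DataFrame to Arrow.
    ds = Dataset.from_csv("dataset.csv", split=NamedSplit("train"))
    print(ds.num_rows, ds.column_names, ds.split)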
From eb45f5c86d143e45e21887faeb58d2bf6c4aea21 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com>
Date: Thu, 25 Feb 2021 16:39:03 +0100
Subject: [PATCH 04/30] Test CsvDatasetReader features

---
 tests/io/test_csv.py | 23 ++++++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)

diff --git a/tests/io/test_csv.py b/tests/io/test_csv.py
index 2fe68b41eb3..86b8a3c927d 100644
--- a/tests/io/test_csv.py
+++ b/tests/io/test_csv.py
@@ -2,7 +2,7 @@
 
 import pytest
 
-from datasets import NamedSplit
+from datasets import DatasetInfo, Features, NamedSplit, Value
 from datasets.io.csv import CsvDatasetReader
 
 
@@ -25,12 +25,29 @@ def csv_path(tmp_path_factory):
     return path
 
 
+@pytest.mark.parametrize(
+    "features",
+    [
+        None,
+        {"col_1": "string", "col_2": "int64", "col_3": "float64"},
+        {"col_1": "string", "col_2": "string", "col_3": "string"},
+        {"col_1": "int32", "col_2": "int32", "col_3": "int32"},
+        {"col_1": "float32", "col_2": "float32", "col_3": "float32"},
+    ],
+)
 @pytest.mark.parametrize("split", [None, NamedSplit("train")])
-def test_dataset_csv_reader(split, csv_path):
+def test_dataset_csv_reader(split, features, csv_path):
     path = csv_path
+    # CSV file loses col_1 string dtype information: default now is "int64" instead of "string"
+    default_expected_features = {"col_1": "int64", "col_2": "int64", "col_3": "float64"}
+    expected_features = features.copy() if features else default_expected_features
+    features = Features({feature: Value(dtype) for feature, dtype in features.items()}) if features else None
+    info = DatasetInfo(features=features) if features else None
 
-    ds = CsvDatasetReader(path, split=split).read()
+    ds = CsvDatasetReader(path, split=split, info=info).read()
     assert ds.num_rows == 4
     assert ds.num_columns == 3
     assert ds.column_names == ["col_1", "col_2", "col_3"]
     assert ds.split == split
+    for feature, expected_dtype in expected_features.items():
+        assert ds.features[feature].dtype == expected_dtype

From 1fe83446a2c28fa0c1baaad05960eb5e8b2287cb Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com>
Date: Thu, 25 Feb 2021 16:44:03 +0100
Subject: [PATCH 05/30] Fix CsvDatasetReader to allow cast int to str

---
 src/datasets/io/csv.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/datasets/io/csv.py b/src/datasets/io/csv.py
index a81084d9e3d..75ec02a88f0 100644
--- a/src/datasets/io/csv.py
+++ b/src/datasets/io/csv.py
@@ -12,6 +12,7 @@ def read(self):
 
     def _read_table(self):
         schema = pa.schema(self.info.features.type) if self.info and self.info.features else None
-        df = pd.read_csv(self.path, **self.kwargs)
+        dtype = {name: dtype.to_pandas_dtype() for name, dtype in zip(schema.names, schema.types)} if schema else None
+        df = pd.read_csv(self.path, dtype=dtype, **self.kwargs)  # dtype allows reading an int column as str
         table = pa.Table.from_pandas(df, schema=schema)
         return table

From cf3495fd1a96a287423d82e7b49952214cc1164d Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com>
Date: Thu, 25 Feb 2021 20:24:26 +0100
Subject: [PATCH 06/30] Use CsvDatasetBuilder instead of CsvDatasetReader

---
 src/datasets/arrow_dataset.py |  7 +++--
 src/datasets/io/csv.py        | 59 +++++++++++++++++++++++++++++++++++
 tests/io/test_csv.py          | 33 +++++++++++++++++++-
 3 files changed, 96 insertions(+), 3 deletions(-)

diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py
index f727a888148..38df0dc5f89 100644
--- a/src/datasets/arrow_dataset.py
+++ b/src/datasets/arrow_dataset.py
@@ -449,9 +449,12 @@ def from_csv(
             datasets.Dataset
         """
         # Dynamic import to avoid circular dependency
-        from .io.csv import CsvDatasetReader
+        # from .io.csv import CsvDatasetReader
+        #
+        # return CsvDatasetReader(path, info=info, split=split).read()
+        from .io.csv import CsvDatasetBuilder
 
-        return CsvDatasetReader(path, info=info, split=split).read()
+        return CsvDatasetBuilder(path, info=info, split=split).build()
 
     def __del__(self):
         if hasattr(self, "_data"):
diff --git a/src/datasets/io/csv.py b/src/datasets/io/csv.py
index 75ec02a88f0..2446051b609 100644
--- a/src/datasets/io/csv.py
+++ b/src/datasets/io/csv.py
@@ -2,6 +2,7 @@
 import pyarrow as pa
 
 from ..arrow_dataset import Dataset
+from ..packaged_modules.csv.csv import Csv
 from .abc import AbstractDatasetReader
 
 
@@ -16,3 +17,61 @@ def _read_table(self):
         df = pd.read_csv(self.path, dtype=dtype, **self.kwargs)  # dtype allows reading an int column as str
         table = pa.Table.from_pandas(df, schema=schema)
         return table
+
+
+class CsvDatasetBuilder:
+    def __init__(
+        self,
+        path,
+        name=None,
+        data_dir=None,
+        data_files=None,
+        split=None,
+        cache_dir=None,
+        features=None,
+        **config_kwargs,
+    ):
+        self.split = split
+        self.builder = Csv(
+            cache_dir=cache_dir,
+            name=name,
+            data_dir=data_dir,
+            data_files=path or data_files,
+            hash=hash,
+            features=features,
+            **config_kwargs,
+        )
+
+    def build(self):
+        # split = "train"  # None  # if None: num_rows = {'train': 4} instead of 4
+
+        download_config = None
+        download_mode = None
+        ignore_verifications = False
+
+        use_auth_token = None
+
+        keep_in_memory = False
+        save_infos = False
+
+        base_path = None
+
+        # import pdb;pdb.set_trace()
+
+        self.builder.download_and_prepare(
+            download_config=download_config,
+            download_mode=download_mode,
+            ignore_verifications=ignore_verifications,
+            # try_from_hf_gcs=try_from_hf_gcs,
+            base_path=base_path,
+            use_auth_token=use_auth_token,
+        )
+
+        # Build dataset for splits
+        ds = self.builder.as_dataset(
+            split=self.split, ignore_verifications=ignore_verifications, in_memory=keep_in_memory
+        )
+        if save_infos:
+            self.builder._save_infos()
+
+        return ds
diff --git a/tests/io/test_csv.py b/tests/io/test_csv.py
index 86b8a3c927d..5c1259bc9d6 100644
--- a/tests/io/test_csv.py
+++ b/tests/io/test_csv.py
@@ -3,7 +3,7 @@
 import pytest
 
 from datasets import DatasetInfo, Features, NamedSplit, Value
-from datasets.io.csv import CsvDatasetReader
+from datasets.io.csv import CsvDatasetBuilder, CsvDatasetReader
 
 
 DATA = [
@@ -51,3 +51,34 @@ def test_dataset_csv_reader(split, features, csv_path):
     assert ds.split == split
     for feature, expected_dtype in expected_features.items():
         assert ds.features[feature].dtype == expected_dtype
+
+
+@pytest.mark.parametrize(
+    "features",
+    [
+        None,
+        # {"col_1": "string", "col_2": "int64", "col_3": "float64"},
+        # {"col_1": "string", "col_2": "string", "col_3": "string"},
+        {"col_1": "int32", "col_2": "int32", "col_3": "int32"},
+        {"col_1": "float32", "col_2": "float32", "col_3": "float32"},
+    ],
+)
+@pytest.mark.parametrize("split", [None, NamedSplit("train")])
+def test_dataset_csv_builder(split, csv_path, features, tmp_path):
+    path = csv_path
+    cache_dir = tmp_path / "cache"
+
+    # split = "train"  # None  # if None: num_rows = {'train': 4} instead of 4
+    # CSV file loses col_1 string dtype information: default now is "int64" instead of "string"
+    default_expected_features = {"col_1": "int64", "col_2": "int64", "col_3": "float64"}
+    expected_features = features.copy() if features else default_expected_features
+    features = Features({feature: Value(dtype) for feature, dtype in features.items()}) if features else None
+
+    ds = CsvDatasetBuilder(path, split=split, features=features, cache_dir=cache_dir).build()
+    ds = ds if split else ds["train"]
+    assert ds.num_rows == 4
+    assert ds.num_columns == 3
+    assert ds.column_names == ["col_1", "col_2", "col_3"]
+    assert ds.split == "train"
+    for feature, expected_dtype in expected_features.items():
+        assert ds.features[feature].dtype == expected_dtype

From d04b2c8e43d6df481f05e173396dd6e235f28eb1 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com>
Date: Tue, 2 Mar 2021 14:11:27 +0100
Subject: [PATCH 07/30] Fix Csv(ArrowBasedBuilder) to allow cast int to str

---
 src/datasets/packaged_modules/csv/csv.py | 3 +++
 tests/io/test_csv.py                     | 4 ++--
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/datasets/packaged_modules/csv/csv.py b/src/datasets/packaged_modules/csv/csv.py
index 611725fe672..d0a4d461a79 100644
--- a/src/datasets/packaged_modules/csv/csv.py
+++ b/src/datasets/packaged_modules/csv/csv.py
@@ -86,10 +86,13 @@ def _split_generators(self, dl_manager):
 
     def _generate_tables(self, files):
         schema = pa.schema(self.config.features.type) if self.config.features is not None else None
+        # dtype allows reading an int column as str
+        dtype = {name: dtype.to_pandas_dtype() for name, dtype in zip(schema.names, schema.types)} if schema else None
         for file_idx, file in enumerate(files):
             csv_file_reader = pd.read_csv(
                 file,
                 iterator=True,
+                dtype=dtype,
                 sep=self.config.sep,
                 header=self.config.header,
                 names=self.config.names,
diff --git a/tests/io/test_csv.py b/tests/io/test_csv.py
index 5c1259bc9d6..c36ee5eb9ba 100644
--- a/tests/io/test_csv.py
+++ b/tests/io/test_csv.py
@@ -57,8 +57,8 @@
     "features",
     [
         None,
-        # {"col_1": "string", "col_2": "int64", "col_3": "float64"},
-        # {"col_1": "string", "col_2": "string", "col_3": "string"},
+        {"col_1": "string", "col_2": "int64", "col_3": "float64"},
+        {"col_1": "string", "col_2": "string", "col_3": "string"},
         {"col_1": "int32", "col_2": "int32", "col_3": "int32"},
         {"col_1": "float32", "col_2": "float32", "col_3": "float32"},
     ],
From f83daaec8c333d708cb3d46b349cf8955d1d5e0f Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com>
Date: Tue, 2 Mar 2021 15:01:01 +0100
Subject: [PATCH 08/30] Test different path types

---
 tests/io/test_csv.py | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/tests/io/test_csv.py b/tests/io/test_csv.py
index c36ee5eb9ba..6cba463f92e 100644
--- a/tests/io/test_csv.py
+++ b/tests/io/test_csv.py
@@ -64,18 +64,23 @@
     ],
 )
 @pytest.mark.parametrize("split", [None, NamedSplit("train")])
-def test_dataset_csv_builder(split, csv_path, features, tmp_path):
-    path = csv_path
+@pytest.mark.parametrize("path_type", [str, list, dict])
+def test_dataset_csv_builder(path_type, split, csv_path, features, tmp_path):
+    if issubclass(path_type, str):
+        path = csv_path
+    elif issubclass(path_type, list):
+        path = [csv_path]
+    elif issubclass(path_type, dict):
+        path = {"train": csv_path}
     cache_dir = tmp_path / "cache"
 
-    # split = "train"  # None  # if None: num_rows = {'train': 4} instead of 4
     # CSV file loses col_1 string dtype information: default now is "int64" instead of "string"
     default_expected_features = {"col_1": "int64", "col_2": "int64", "col_3": "float64"}
     expected_features = features.copy() if features else default_expected_features
     features = Features({feature: Value(dtype) for feature, dtype in features.items()}) if features else None
 
     ds = CsvDatasetBuilder(path, split=split, features=features, cache_dir=cache_dir).build()
-    ds = ds if split else ds["train"]
+    ds = ds if split else ds["train"]  # if split is None: ds.num_rows = {'train': 4} instead of 4
     assert ds.num_rows == 4
     assert ds.num_columns == 3
     assert ds.column_names == ["col_1", "col_2", "col_3"]
     assert ds.split == "train"

From c819e4982ed489b1c77cb4fb8538706d0d2c8604 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com>
Date: Tue, 2 Mar 2021 15:46:55 +0100
Subject: [PATCH 09/30] Remove CsvDatasetReader and unused attributes

---
 src/datasets/io/abc.py | 16 +++++++++++-----
 src/datasets/io/csv.py | 35 ++++-------------------------------
 2 files changed, 15 insertions(+), 36 deletions(-)

diff --git a/src/datasets/io/abc.py b/src/datasets/io/abc.py
index 22ae4dc7e6a..dcd26b89f48 100644
--- a/src/datasets/io/abc.py
+++ b/src/datasets/io/abc.py
@@ -1,20 +1,26 @@
 from abc import ABC, abstractmethod
 from typing import Optional
 
-from .. import DatasetInfo, NamedSplit
+from .. import Features, NamedSplit
 from ..arrow_dataset import Dataset
 from ..utils.typing import PathLike
 
 
-class AbstractDatasetReader(ABC):
+class AbstractDatasetBuilder(ABC):
     def __init__(
-        self, path: PathLike, info: Optional[DatasetInfo] = None, split: Optional[NamedSplit] = None, **kwargs
+        self,
+        path: PathLike,
+        split: Optional[NamedSplit] = None,
+        features: Optional[Features] = None,
+        cache_dir: str = None,
+        **kwargs,
     ):
         self.path = path
-        self.info = info
         self.split = split
+        self.features = features
+        self.cache_dir = cache_dir
         self.kwargs = kwargs
 
     @abstractmethod
-    def read(self) -> Dataset:
+    def build(self) -> Dataset:
         pass
diff --git a/src/datasets/io/csv.py b/src/datasets/io/csv.py
index 2446051b609..20f839bf4e7 100644
--- a/src/datasets/io/csv.py
+++ b/src/datasets/io/csv.py
@@ -1,50 +1,25 @@
-import pandas as pd
-import pyarrow as pa
-
-from ..arrow_dataset import Dataset
 from ..packaged_modules.csv.csv import Csv
-from .abc import AbstractDatasetReader
-
-
-class CsvDatasetReader(AbstractDatasetReader):
-    def read(self):
-        table = self._read_table()
-        return Dataset(table, info=self.info, split=self.split)
-
-    def _read_table(self):
-        schema = pa.schema(self.info.features.type) if self.info and self.info.features else None
-        dtype = {name: dtype.to_pandas_dtype() for name, dtype in zip(schema.names, schema.types)} if schema else None
-        df = pd.read_csv(self.path, dtype=dtype, **self.kwargs)  # dtype allows reading an int column as str
-        table = pa.Table.from_pandas(df, schema=schema)
-        return table
+from .abc import AbstractDatasetBuilder
 
 
-class CsvDatasetBuilder:
+class CsvDatasetBuilder(AbstractDatasetBuilder):
     def __init__(
         self,
         path,
-        name=None,
-        data_dir=None,
-        data_files=None,
         split=None,
-        cache_dir=None,
         features=None,
+        cache_dir=None,
         **config_kwargs,
     ):
         self.split = split
         self.builder = Csv(
             cache_dir=cache_dir,
-            name=name,
-            data_dir=data_dir,
-            data_files=path or data_files,
-            hash=hash,
+            data_files=path,
             features=features,
             **config_kwargs,
         )
 
     def build(self):
-        # split = "train"  # None  # if None: num_rows = {'train': 4} instead of 4
-
         download_config = None
         download_mode = None
         ignore_verifications = False
@@ -56,8 +31,6 @@ def build(self):
 
         base_path = None
 
-        # import pdb;pdb.set_trace()
-
         self.builder.download_and_prepare(
             download_config=download_config,
             download_mode=download_mode,

From 6f2fbd90f5e4a76e2c43df93a8181eab1ba74b78 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com>
Date: Tue, 2 Mar 2021 16:38:21 +0100
Subject: [PATCH 10/30] Update Dataset.from_csv signature

---
 src/datasets/arrow_dataset.py | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py
index 38df0dc5f89..cb5c330fd12 100644
--- a/src/datasets/arrow_dataset.py
+++ b/src/datasets/arrow_dataset.py
@@ -437,24 +437,23 @@ def from_dict(
     @staticmethod
     def from_csv(
         path: PathLike,
-        info: Optional[DatasetInfo] = None,
         split: Optional[NamedSplit] = None,
+        features: Optional[Features] = None,
+        cache_dir: str = None,
     ):
-        """Create Dataset from CSV file.
+        """Create Dataset from CSV file(s).
         Args:
-            path (path-like): Path of the CSV file.
-            info (DatasetInfo, optional): Dataset information, like description, citation, etc.
+            path (path-like): Path of the CSV file(s).
             split (NamedSplit, optional): Name of the dataset split.
+            features (Features, optional): Dataset features.
+            cache_dir (str, optional, default="~/datasets"): Directory to cache data.
         Returns:
             datasets.Dataset
         """
         # Dynamic import to avoid circular dependency
-        # from .io.csv import CsvDatasetReader
-        #
-        # return CsvDatasetReader(path, info=info, split=split).read()
         from .io.csv import CsvDatasetBuilder
 
-        return CsvDatasetBuilder(path, info=info, split=split).build()
+        return CsvDatasetBuilder(path, split=split, features=features, cache_dir=cache_dir).build()

From 800b8cd86086267ba1d714236782f5f21661fd9b Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com>
Date: Tue, 2 Mar 2021 16:43:53 +0100
Subject: [PATCH 11/30] Remove test for CsvDatasetReader

---
 tests/io/test_csv.py | 30 +-----------------------------
 1 file changed, 1 insertion(+), 29 deletions(-)

diff --git a/tests/io/test_csv.py b/tests/io/test_csv.py
index 6cba463f92e..48f8e8da972 100644
--- a/tests/io/test_csv.py
+++ b/tests/io/test_csv.py
@@ -3,7 +3,7 @@
 import pytest
 
 from datasets import DatasetInfo, Features, NamedSplit, Value
-from datasets.io.csv import CsvDatasetBuilder, CsvDatasetReader
+from datasets.io.csv import CsvDatasetBuilder
 
 
 DATA = [
@@ -25,34 +25,6 @@ def csv_path(tmp_path_factory):
     return path
 
 
-@pytest.mark.parametrize(
-    "features",
-    [
-        None,
-        {"col_1": "string", "col_2": "int64", "col_3": "float64"},
-        {"col_1": "string", "col_2": "string", "col_3": "string"},
-        {"col_1": "int32", "col_2": "int32", "col_3": "int32"},
-        {"col_1": "float32", "col_2": "float32", "col_3": "float32"},
-    ],
-)
-@pytest.mark.parametrize("split", [None, NamedSplit("train")])
-def test_dataset_csv_reader(split, features, csv_path):
-    path = csv_path
-    # CSV file loses col_1 string dtype information: default now is "int64" instead of "string"
-    default_expected_features = {"col_1": "int64", "col_2": "int64", "col_3": "float64"}
-    expected_features = features.copy() if features else default_expected_features
-    features = Features({feature: Value(dtype) for feature, dtype in features.items()}) if features else None
-    info = DatasetInfo(features=features) if features else None
-
-    ds = CsvDatasetReader(path, split=split, info=info).read()
-    assert ds.num_rows == 4
-    assert ds.num_columns == 3
-    assert ds.column_names == ["col_1", "col_2", "col_3"]
-    assert ds.split == split
-    for feature, expected_dtype in expected_features.items():
-        assert ds.features[feature].dtype == expected_dtype
-
-
 @pytest.mark.parametrize(
     "features",
     [

From 9d1cbe7811ce196d76d904edcb46b948337c485b Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com>
Date: Tue, 2 Mar 2021 17:05:36 +0100
Subject: [PATCH 12/30] Remove unused import

---
 tests/io/test_csv.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/io/test_csv.py b/tests/io/test_csv.py
index 48f8e8da972..d6b4aef770d 100644
--- a/tests/io/test_csv.py
+++ b/tests/io/test_csv.py
@@ -2,7 +2,7 @@
 
 import pytest
 
-from datasets import DatasetInfo, Features, NamedSplit, Value
+from datasets import Features, NamedSplit, Value
 from datasets.io.csv import CsvDatasetBuilder

From 1af8f96b84635ae6aedb3aaeb60da30ac0f95ae2 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com>
Date: Tue, 2 Mar 2021 17:13:19 +0100
Subject: [PATCH 13/30] Pass kwargs

---
 src/datasets/arrow_dataset.py | 3 ++-
 src/datasets/io/csv.py        | 6 +++---
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py
index cb5c330fd12..ae0b85d22f8 100644
--- a/src/datasets/arrow_dataset.py
+++ b/src/datasets/arrow_dataset.py
@@ -440,6 +440,7 @@ def from_csv(
         split: Optional[NamedSplit] = None,
         features: Optional[Features] = None,
         cache_dir: str = None,
+        **kwargs,
     ):
         """Create Dataset from CSV file(s).
         Args:
@@ -453,7 +454,7 @@ def from_csv(
         # Dynamic import to avoid circular dependency
         from .io.csv import CsvDatasetBuilder
 
-        return CsvDatasetBuilder(path, split=split, features=features, cache_dir=cache_dir).build()
+        return CsvDatasetBuilder(path, split=split, features=features, cache_dir=cache_dir, **kwargs).build()
diff --git a/src/datasets/io/csv.py b/src/datasets/io/csv.py
index 20f839bf4e7..5fc604ebf36 100644
--- a/src/datasets/io/csv.py
+++ b/src/datasets/io/csv.py
@@ -9,14 +9,14 @@ def __init__(
         split=None,
         features=None,
         cache_dir=None,
-        **config_kwargs,
+        **kwargs,
     ):
-        self.split = split
+        super().__init__(path, split=split, features=features, cache_dir=cache_dir, **kwargs)
         self.builder = Csv(
             cache_dir=cache_dir,
             data_files=path,
             features=features,
-            **config_kwargs,
+            **kwargs,
         )
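[Hypothetical call, not part of the patch series: with kwargs now forwarded into the Csv builder config, standard CSV parsing options such as sep (shown being passed to pandas.read_csv in PATCH 07) travel straight through. "dataset.tsv" is a placeholder.]

    from datasets import Dataset

    # The sep option reaches pandas.read_csv via the builder's CsvConfig.
    ds = Dataset.from_csv("dataset.tsv", sep="\t")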
From e40e46d0217c0775a1c15ef69495112f6a8634e3 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com>
Date: Thu, 4 Mar 2021 08:00:06 +0100
Subject: [PATCH 14/30] Rename adapter as CsvDatasetReader instead of
 CsvDatasetBuilder

---
 src/datasets/arrow_dataset.py | 4 ++--
 src/datasets/io/abc.py        | 4 ++--
 src/datasets/io/csv.py        | 6 +++---
 tests/io/test_csv.py          | 4 ++--
 4 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py
index ae0b85d22f8..d9e54782989 100644
--- a/src/datasets/arrow_dataset.py
+++ b/src/datasets/arrow_dataset.py
@@ -452,9 +452,9 @@ def from_csv(
             datasets.Dataset
         """
         # Dynamic import to avoid circular dependency
-        from .io.csv import CsvDatasetBuilder
+        from .io.csv import CsvDatasetReader
 
-        return CsvDatasetBuilder(path, split=split, features=features, cache_dir=cache_dir, **kwargs).build()
+        return CsvDatasetReader(path, split=split, features=features, cache_dir=cache_dir, **kwargs).read()
diff --git a/src/datasets/io/abc.py b/src/datasets/io/abc.py
index dcd26b89f48..15727543dc6 100644
--- a/src/datasets/io/abc.py
+++ b/src/datasets/io/abc.py
@@ -6,7 +6,7 @@
 from ..utils.typing import PathLike
 
 
-class AbstractDatasetBuilder(ABC):
+class AbstractDatasetReader(ABC):
     def __init__(
         self,
         path: PathLike,
@@ -22,5 +22,5 @@ def __init__(
         self.kwargs = kwargs
 
     @abstractmethod
-    def build(self) -> Dataset:
+    def read(self) -> Dataset:
         pass
diff --git a/src/datasets/io/csv.py b/src/datasets/io/csv.py
index 5fc604ebf36..6af2e2f3e4e 100644
--- a/src/datasets/io/csv.py
+++ b/src/datasets/io/csv.py
@@ -1,8 +1,8 @@
 from ..packaged_modules.csv.csv import Csv
-from .abc import AbstractDatasetBuilder
+from .abc import AbstractDatasetReader
 
 
-class CsvDatasetBuilder(AbstractDatasetBuilder):
+class CsvDatasetReader(AbstractDatasetReader):
     def __init__(
         self,
         path,
@@ -19,7 +19,7 @@ def __init__(
             **kwargs,
         )
 
-    def build(self):
+    def read(self):
         download_config = None
         download_mode = None
         ignore_verifications = False
diff --git a/tests/io/test_csv.py b/tests/io/test_csv.py
index d6b4aef770d..c17ed63eba4 100644
--- a/tests/io/test_csv.py
+++ b/tests/io/test_csv.py
@@ -3,7 +3,7 @@
 import pytest
 
 from datasets import Features, NamedSplit, Value
-from datasets.io.csv import CsvDatasetBuilder
+from datasets.io.csv import CsvDatasetReader
 
 
 DATA = [
@@ -51,7 +51,7 @@ def test_dataset_csv_builder(path_type, split, csv_path, features, tmp_path):
     expected_features = features.copy() if features else default_expected_features
     features = Features({feature: Value(dtype) for feature, dtype in features.items()}) if features else None
 
-    ds = CsvDatasetBuilder(path, split=split, features=features, cache_dir=cache_dir).build()
+    ds = CsvDatasetReader(path, split=split, features=features, cache_dir=cache_dir).read()
     ds = ds if split else ds["train"]  # if split is None: ds.num_rows = {'train': 4} instead of 4
     assert ds.num_rows == 4
     assert ds.num_columns == 3

From 3e4cb6f70b692581e13b57784d9376afff72cb43 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com>
Date: Thu, 4 Mar 2021 08:42:08 +0100
Subject: [PATCH 15/30] Pass keep_in_memory

---
 src/datasets/arrow_dataset.py |  6 +++++-
 src/datasets/io/abc.py        |  2 ++
 src/datasets/io/csv.py        | 23 +++++++++++++----------
 3 files changed, 20 insertions(+), 11 deletions(-)

diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py
index d9e54782989..2ca82091fbc 100644
--- a/src/datasets/arrow_dataset.py
+++ b/src/datasets/arrow_dataset.py
@@ -440,6 +440,7 @@ def from_csv(
         split: Optional[NamedSplit] = None,
         features: Optional[Features] = None,
         cache_dir: str = None,
+        keep_in_memory: bool = False,
         **kwargs,
     ):
         """Create Dataset from CSV file(s).
@@ -448,13 +449,16 @@ def from_csv(
             split (NamedSplit, optional): Name of the dataset split.
             features (Features, optional): Dataset features.
             cache_dir (str, optional, default="~/datasets"): Directory to cache data.
+            keep_in_memory (bool, default=False): Whether to copy the data in-memory.
         Returns:
             datasets.Dataset
         """
         # Dynamic import to avoid circular dependency
         from .io.csv import CsvDatasetReader
 
-        return CsvDatasetReader(path, split=split, features=features, cache_dir=cache_dir, **kwargs).read()
+        return CsvDatasetReader(
+            path, split=split, features=features, cache_dir=cache_dir, keep_in_memory=keep_in_memory, **kwargs
+        ).read()
diff --git a/src/datasets/io/abc.py b/src/datasets/io/abc.py
index 15727543dc6..eca4c0eab34 100644
--- a/src/datasets/io/abc.py
+++ b/src/datasets/io/abc.py
@@ -13,12 +13,14 @@ def __init__(
         split: Optional[NamedSplit] = None,
         features: Optional[Features] = None,
         cache_dir: str = None,
+        keep_in_memory: bool = False,
         **kwargs,
     ):
         self.path = path
         self.split = split
         self.features = features
         self.cache_dir = cache_dir
+        self.keep_in_memory = keep_in_memory
         self.kwargs = kwargs
diff --git a/src/datasets/io/csv.py b/src/datasets/io/csv.py
index 6af2e2f3e4e..680d90137be 100644
--- a/src/datasets/io/csv.py
+++ b/src/datasets/io/csv.py
@@ -1,17 +1,24 @@
+from typing import Optional
+
+from .. import Features, NamedSplit
 from ..packaged_modules.csv.csv import Csv
+from ..utils.typing import PathLike
 from .abc import AbstractDatasetReader
 
 
 class CsvDatasetReader(AbstractDatasetReader):
     def __init__(
         self,
-        path,
-        split=None,
-        features=None,
-        cache_dir=None,
+        path: PathLike,
+        split: Optional[NamedSplit] = None,
+        features: Optional[Features] = None,
+        cache_dir: str = None,
+        keep_in_memory: bool = False,
         **kwargs,
     ):
-        super().__init__(path, split=split, features=features, cache_dir=cache_dir, **kwargs)
+        super().__init__(
+            path, split=split, features=features, cache_dir=cache_dir, keep_in_memory=keep_in_memory, **kwargs
+        )
         self.builder = Csv(
             cache_dir=cache_dir,
             data_files=path,
@@ -23,12 +30,8 @@ def read(self):
     def read(self):
         download_config = None
         download_mode = None
         ignore_verifications = False
-
         use_auth_token = None
-
-        keep_in_memory = False
         save_infos = False
-
         base_path = None
 
         self.builder.download_and_prepare(
@@ -44,7 +47,7 @@ def read(self):
 
         # Build dataset for splits
         ds = self.builder.as_dataset(
-            split=self.split, ignore_verifications=ignore_verifications, in_memory=keep_in_memory
+            split=self.split, ignore_verifications=ignore_verifications, in_memory=self.keep_in_memory
         )
         if save_infos:
             self.builder._save_infos()

From b3978cdf53671f60bd847689701152e4e9de513e Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com>
Date: Thu, 4 Mar 2021 15:29:13 +0100
Subject: [PATCH 16/30] Remove save_infos

---
 src/datasets/io/csv.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/src/datasets/io/csv.py b/src/datasets/io/csv.py
index 680d90137be..f9c2ea327c6 100644
--- a/src/datasets/io/csv.py
+++ b/src/datasets/io/csv.py
@@ -31,7 +31,6 @@ def read(self):
         download_mode = None
         ignore_verifications = False
         use_auth_token = None
-        save_infos = False
         base_path = None
 
         self.builder.download_and_prepare(
@@ -44,10 +43,7 @@ def read(self):
         )
 
         # Build dataset for splits
-        ds = self.builder.as_dataset(
+        dataset = self.builder.as_dataset(
             split=self.split, ignore_verifications=ignore_verifications, in_memory=self.keep_in_memory
         )
-        if save_infos:
-            self.builder._save_infos()
-
-        return ds
+        return dataset

From dd46f5fb87b15f265a4737df730d6716b0c16a96 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com>
Date: Thu, 4 Mar 2021 16:42:15 +0100
Subject: [PATCH 17/30] Make explicit that path can be NestedStructure-like

---
 src/datasets/io/abc.py       | 6 +++---
 src/datasets/io/csv.py       | 8 ++++----
 src/datasets/utils/typing.py | 5 ++++-
 3 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/src/datasets/io/abc.py b/src/datasets/io/abc.py
index eca4c0eab34..58a3915c45e 100644
--- a/src/datasets/io/abc.py
+++ b/src/datasets/io/abc.py
@@ -3,20 +3,20 @@
 
 from .. import Features, NamedSplit
 from ..arrow_dataset import Dataset
-from ..utils.typing import PathLike
+from ..utils.typing import NestedDataStructureLike, PathLike
 
 
 class AbstractDatasetReader(ABC):
     def __init__(
         self,
-        path: PathLike,
+        path_or_paths: NestedDataStructureLike[PathLike],
         split: Optional[NamedSplit] = None,
         features: Optional[Features] = None,
         cache_dir: str = None,
         keep_in_memory: bool = False,
         **kwargs,
     ):
-        self.path = path
+        self.path_or_paths = path_or_paths
         self.split = split
         self.features = features
         self.cache_dir = cache_dir
diff --git a/src/datasets/io/csv.py b/src/datasets/io/csv.py
index f9c2ea327c6..72017ea077a 100644
--- a/src/datasets/io/csv.py
+++ b/src/datasets/io/csv.py
@@ -2,14 +2,14 @@
 
 from .. import Features, NamedSplit
 from ..packaged_modules.csv.csv import Csv
-from ..utils.typing import PathLike
+from ..utils.typing import NestedDataStructureLike, PathLike
 from .abc import AbstractDatasetReader
 
 
 class CsvDatasetReader(AbstractDatasetReader):
     def __init__(
         self,
-        path: PathLike,
+        path_or_paths: NestedDataStructureLike[PathLike],
         split: Optional[NamedSplit] = None,
         features: Optional[Features] = None,
         cache_dir: str = None,
@@ -17,11 +17,11 @@ def __init__(
         **kwargs,
     ):
         super().__init__(
-            path, split=split, features=features, cache_dir=cache_dir, keep_in_memory=keep_in_memory, **kwargs
+            path_or_paths, split=split, features=features, cache_dir=cache_dir, keep_in_memory=keep_in_memory, **kwargs
         )
         self.builder = Csv(
             cache_dir=cache_dir,
-            data_files=path,
+            data_files=path_or_paths,
             features=features,
             **kwargs,
         )
diff --git a/src/datasets/utils/typing.py b/src/datasets/utils/typing.py
index ee3d56df41f..33f8a758a66 100644
--- a/src/datasets/utils/typing.py
+++ b/src/datasets/utils/typing.py
@@ -1,5 +1,8 @@
 import os
-from typing import Union
+from typing import Dict, List, TypeVar, Union
 
 
+T = TypeVar("T")
+
+NestedDataStructureLike = Union[T, List[T], Dict[str, T]]
 PathLike = Union[str, bytes, os.PathLike]

From 16b902be3e01603803e3592c2d38711be5c3c619 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com>
Date: Tue, 9 Mar 2021 10:24:08 +0100
Subject: [PATCH 18/30] Test CsvDatasetReader for path or list of paths only,
 and split

---
 tests/io/test_csv.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/tests/io/test_csv.py b/tests/io/test_csv.py
index c17ed63eba4..3228e4cf0a0 100644
--- a/tests/io/test_csv.py
+++ b/tests/io/test_csv.py
@@ -2,7 +2,7 @@
 
 import pytest
 
-from datasets import Features, NamedSplit, Value
+from datasets import Dataset, Features, NamedSplit, Value
 from datasets.io.csv import CsvDatasetReader
 
 
@@ -35,27 +35,27 @@
         {"col_1": "float32", "col_2": "float32", "col_3": "float32"},
     ],
 )
-@pytest.mark.parametrize("split", [None, NamedSplit("train")])
-@pytest.mark.parametrize("path_type", [str, list, dict])
-def test_dataset_csv_builder(path_type, split, csv_path, features, tmp_path):
+@pytest.mark.parametrize("split", [None, NamedSplit("train"), "train", "test"])
+@pytest.mark.parametrize("path_type", [str, list])
+def test_csv_dataset_reader(path_type, split, csv_path, features, tmp_path):
     if issubclass(path_type, str):
         path = csv_path
     elif issubclass(path_type, list):
         path = [csv_path]
-    elif issubclass(path_type, dict):
-        path = {"train": csv_path}
     cache_dir = tmp_path / "cache"
 
+    expected_split = str(split) if split else "train"
+
     # CSV file loses col_1 string dtype information: default now is "int64" instead of "string"
     default_expected_features = {"col_1": "int64", "col_2": "int64", "col_3": "float64"}
     expected_features = features.copy() if features else default_expected_features
     features = Features({feature: Value(dtype) for feature, dtype in features.items()}) if features else None
 
     ds = CsvDatasetReader(path, split=split, features=features, cache_dir=cache_dir).read()
-    ds = ds if split else ds["train"]  # if split is None: ds.num_rows = {'train': 4} instead of 4
+    assert isinstance(ds, Dataset)
     assert ds.num_rows == 4
     assert ds.num_columns == 3
     assert ds.column_names == ["col_1", "col_2", "col_3"]
-    assert ds.split == "train"
+    assert ds.split == expected_split
     for feature, expected_dtype in expected_features.items():
         assert ds.features[feature].dtype == expected_dtype

From 5f5b56d12ae1667834d98dea8171b0e056fb1ac6 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com>
Date: Tue, 9 Mar 2021 10:25:36 +0100
Subject: [PATCH 19/30] Set train as default split

---
 src/datasets/io/abc.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/datasets/io/abc.py b/src/datasets/io/abc.py
index 58a3915c45e..7e7b44fc6e8 100644
--- a/src/datasets/io/abc.py
+++ b/src/datasets/io/abc.py
@@ -1,7 +1,7 @@
 from abc import ABC, abstractmethod
-from typing import Optional
+from typing import Optional, Union
 
-from .. import Features, NamedSplit
+from .. import DatasetDict, Features, NamedSplit
 from ..arrow_dataset import Dataset
 from ..utils.typing import NestedDataStructureLike, PathLike
 
@@ -17,12 +17,12 @@ def __init__(
     ):
         self.path_or_paths = path_or_paths
-        self.split = split
+        self.split = split if split or isinstance(path_or_paths, dict) else "train"
         self.features = features
         self.cache_dir = cache_dir
         self.keep_in_memory = keep_in_memory
         self.kwargs = kwargs
 
     @abstractmethod
-    def read(self) -> Dataset:
+    def read(self) -> Union[Dataset, DatasetDict]:
         pass

From b86e7cdab70f20f68d66b061417d2bda92b80b29 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com>
Date: Tue, 9 Mar 2021 10:26:20 +0100
Subject: [PATCH 20/30] Pass paths as dict to allow split renaming

---
 src/datasets/io/csv.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/datasets/io/csv.py b/src/datasets/io/csv.py
index 72017ea077a..e35342aba38 100644
--- a/src/datasets/io/csv.py
+++ b/src/datasets/io/csv.py
@@ -19,6 +19,7 @@ def __init__(
         super().__init__(
             path_or_paths, split=split, features=features, cache_dir=cache_dir, keep_in_memory=keep_in_memory, **kwargs
         )
+        path_or_paths = path_or_paths if isinstance(path_or_paths, dict) else {self.split: path_or_paths}
         self.builder = Csv(
             cache_dir=cache_dir,
             data_files=path_or_paths,
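[Illustrative sketch, not part of the patch series: how the split defaulting of PATCH 19 and the dict wrapping of PATCH 20 compose. The helper name normalize is hypothetical.]

    def normalize(path_or_paths, split=None):
        # str or list inputs fall under the default "train" split (PATCH 19);
        # dict inputs keep their user-provided split names (PATCH 20).
        split = split if split or isinstance(path_or_paths, dict) else "train"
        return path_or_paths if isinstance(path_or_paths, dict) else {split: path_or_paths}

    assert normalize("dataset.csv") == {"train": "dataset.csv"}
    assert normalize(["a.csv", "b.csv"], split="test") == {"test": ["a.csv", "b.csv"]}
    assert normalize({"train": "a.csv", "test": "b.csv"}) == {"train": "a.csv", "test": "b.csv"}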
From 7991b60164b7ea502adda7535d6c5fa610ca5a8d Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com>
Date: Tue, 9 Mar 2021 10:28:46 +0100
Subject: [PATCH 21/30] Specify split and str or list paths for
 Dataset.from_csv

---
 src/datasets/arrow_dataset.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py
index 2ca82091fbc..b3a9b00fd37 100644
--- a/src/datasets/arrow_dataset.py
+++ b/src/datasets/arrow_dataset.py
@@ -436,7 +436,7 @@ def from_dict(
     @staticmethod
     def from_csv(
-        path: PathLike,
+        path_or_paths: Union[PathLike, List[PathLike]],
         split: Optional[NamedSplit] = None,
         features: Optional[Features] = None,
         cache_dir: str = None,
@@ -445,8 +445,8 @@ def from_csv(
         """Create Dataset from CSV file(s).
         Args:
-            path (path-like): Path of the CSV file(s).
-            split (NamedSplit, optional): Name of the dataset split.
+            path_or_paths (path-like or list of path-like): Path(s) of the CSV file(s).
+            split (NamedSplit, optional): Split name to be assigned to the dataset.
             features (Features, optional): Dataset features.
             cache_dir (str, optional, default="~/datasets"): Directory to cache data.
             keep_in_memory (bool, default=False): Whether to copy the data in-memory.
@@ -457,7 +457,7 @@ def from_csv(
         from .io.csv import CsvDatasetReader
 
         return CsvDatasetReader(
-            path, split=split, features=features, cache_dir=cache_dir, keep_in_memory=keep_in_memory, **kwargs
+            path_or_paths, split=split, features=features, cache_dir=cache_dir, keep_in_memory=keep_in_memory, **kwargs
         ).read()

From c26ed08f26963dda905e7ae5c41bfb7642a095cb Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com>
Date: Tue, 9 Mar 2021 10:37:03 +0100
Subject: [PATCH 22/30] Test CSV DatasetDict Reader for dict path

---
 tests/io/test_csv.py | 37 ++++++++++++++++++++++++++++++++++++-
 1 file changed, 36 insertions(+), 1 deletion(-)

diff --git a/tests/io/test_csv.py b/tests/io/test_csv.py
index 3228e4cf0a0..b093ef8da85 100644
--- a/tests/io/test_csv.py
+++ b/tests/io/test_csv.py
@@ -2,7 +2,7 @@
 
 import pytest
 
-from datasets import Dataset, Features, NamedSplit, Value
+from datasets import Dataset, DatasetDict, Features, NamedSplit, Value
 from datasets.io.csv import CsvDatasetReader
 
 
@@ -59,3 +59,38 @@ def test_csv_dataset_reader(path_type, split, csv_path, features, tmp_path):
     assert ds.split == expected_split
     for feature, expected_dtype in expected_features.items():
         assert ds.features[feature].dtype == expected_dtype
+
+
+@pytest.mark.parametrize(
+    "features",
+    [
+        None,
+        {"col_1": "string", "col_2": "int64", "col_3": "float64"},
+        {"col_1": "string", "col_2": "string", "col_3": "string"},
+        {"col_1": "int32", "col_2": "int32", "col_3": "int32"},
+        {"col_1": "float32", "col_2": "float32", "col_3": "float32"},
+    ],
+)
+@pytest.mark.parametrize("split", [None, "train", "test"])
+def test_csv_datasetdict_reader(split, csv_path, features, tmp_path):
+    if split:
+        path = {split: csv_path}
+    else:
+        split = "train"
+        path = {"train": csv_path, "test": csv_path}
+    cache_dir = tmp_path / "cache"
+
+    # CSV file loses col_1 string dtype information: default now is "int64" instead of "string"
+    default_expected_features = {"col_1": "int64", "col_2": "int64", "col_3": "float64"}
+    expected_features = features.copy() if features else default_expected_features
+    features = Features({feature: Value(dtype) for feature, dtype in features.items()}) if features else None
+
+    ds = CsvDatasetReader(path, features=features, cache_dir=cache_dir).read()
+    assert isinstance(ds, DatasetDict)
+    ds = ds[split]
+    assert ds.num_rows == 4
+    assert ds.num_columns == 3
+    assert ds.column_names == ["col_1", "col_2", "col_3"]
+    assert ds.split == split
+    for feature, expected_dtype in expected_features.items():
+        assert ds.features[feature].dtype == expected_dtype

From 34c0ed7342cc288726d224762999c9b5461e8669 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com>
Date: Tue, 9 Mar 2021 10:38:20 +0100
Subject: [PATCH 23/30] Implement DatasetDict.from_csv

---
 src/datasets/dataset_dict.py | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/src/datasets/dataset_dict.py b/src/datasets/dataset_dict.py
index ef62b011477..0f293b16b16 100644
--- a/src/datasets/dataset_dict.py
+++ b/src/datasets/dataset_dict.py
@@ -13,6 +13,7 @@
 from .arrow_dataset import Dataset
 from .features import Features
 from .filesystems import extract_path_from_uri, is_remote_filesystem
+from .utils.typing import PathLike
 
 
 class DatasetDict(dict):
@@ -613,3 +614,27 @@ def load_from_disk(dataset_dict_path: str, fs=None) -> "DatasetDict":
             )
             dataset_dict[k] = Dataset.load_from_disk(dataset_dict_split_path, fs)
         return dataset_dict
+
+    @staticmethod
+    def from_csv(
+        path_or_paths: Dict[str, PathLike],
+        features: Optional[Features] = None,
+        cache_dir: str = None,
+        keep_in_memory: bool = False,
+        **kwargs,
+    ):
+        """Create DatasetDict from CSV file(s).
+        Args:
+            path_or_paths (dict of path-like): Path(s) of the CSV file(s).
+            features (Features, optional): Dataset features.
+            cache_dir (str, optional, default="~/datasets"): Directory to cache data.
+            keep_in_memory (bool, default=False): Whether to copy the data in-memory.
+        Returns:
+            datasets.DatasetDict
+        """
+        # Dynamic import to avoid circular dependency
+        from .io.csv import CsvDatasetReader
+
+        return CsvDatasetReader(
+            path_or_paths, features=features, cache_dir=cache_dir, keep_in_memory=keep_in_memory, **kwargs
+        ).read()

From a1596e686a465f5c5dbcda18ce91311e892e05fa Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com>
Date: Tue, 9 Mar 2021 13:30:03 +0100
Subject: [PATCH 24/30] Add pandas kwargs to from_csv docstring

---
 src/datasets/arrow_dataset.py | 1 +
 src/datasets/dataset_dict.py  | 1 +
 2 files changed, 2 insertions(+)

diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py
index b3a9b00fd37..b8250991948 100644
--- a/src/datasets/arrow_dataset.py
+++ b/src/datasets/arrow_dataset.py
@@ -450,6 +450,7 @@ def from_csv(
             features (Features, optional): Dataset features.
             cache_dir (str, optional, default="~/datasets"): Directory to cache data.
             keep_in_memory (bool, default=False): Whether to copy the data in-memory.
+            kwargs: Keyword arguments passed to `pandas.read_csv`.
         Returns:
             datasets.Dataset
         """
diff --git a/src/datasets/dataset_dict.py b/src/datasets/dataset_dict.py
index 0f293b16b16..54acc20cf01 100644
--- a/src/datasets/dataset_dict.py
+++ b/src/datasets/dataset_dict.py
@@ -629,6 +629,7 @@ def from_csv(
             features (Features, optional): Dataset features.
             cache_dir (str, optional, default="~/datasets"): Directory to cache data.
             keep_in_memory (bool, default=False): Whether to copy the data in-memory.
+            kwargs: Keyword arguments passed to `pandas.read_csv`.
         Returns:
             datasets.DatasetDict
         """
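[Illustrative usage, not part of the patch series: the public surface as it stands after PATCHES 21 to 24. File names are placeholders.]

    from datasets import Dataset, DatasetDict, Features, Value

    features = Features({"col_1": Value("string"), "col_2": Value("int64"), "col_3": Value("float64")})

    # A path or list of paths returns a single Dataset with the requested split name.
    train_ds = Dataset.from_csv(["part0.csv", "part1.csv"], split="train", features=features)

    # A dict mapping split names to path(s) returns a DatasetDict, one entry per split.
    dsets = DatasetDict.from_csv({"train": "train.csv", "test": "test.csv"}, keep_in_memory=True)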
expected_dtype @pytest.mark.parametrize( @@ -72,7 +51,7 @@ def test_csv_dataset_reader(path_type, split, csv_path, features, tmp_path): ], ) @pytest.mark.parametrize("split", [None, "train", "test"]) -def test_csv_datasetdict_reader(split, csv_path, features, tmp_path): +def test_csv_datasetdict_reader(split, features, csv_path, tmp_path): if split: path = {split: csv_path} else: @@ -85,12 +64,12 @@ def test_csv_datasetdict_reader(split, csv_path, features, tmp_path): expected_features = features.copy() if features else default_expected_features features = Features({feature: Value(dtype) for feature, dtype in features.items()}) if features else None - ds = CsvDatasetReader(path, features=features, cache_dir=cache_dir).read() - assert isinstance(ds, DatasetDict) - ds = ds[split] - assert ds.num_rows == 4 - assert ds.num_columns == 3 - assert ds.column_names == ["col_1", "col_2", "col_3"] - assert ds.split == split + dataset = CsvDatasetReader(path, features=features, cache_dir=cache_dir).read() + assert isinstance(dataset, DatasetDict) + dataset = dataset[split] + assert dataset.num_rows == 4 + assert dataset.num_columns == 3 + assert dataset.column_names == ["col_1", "col_2", "col_3"] + assert dataset.split == split for feature, expected_dtype in expected_features.items(): - assert ds.features[feature].dtype == expected_dtype + assert dataset.features[feature].dtype == expected_dtype From ba4e408acb2cfb8a6981dabbe9e61b96ae3f4ee4 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Tue, 9 Mar 2021 14:29:55 +0100 Subject: [PATCH 26/30] Test Dataset.from_csv --- tests/test_arrow_dataset.py | 36 +++++++++++++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/tests/test_arrow_dataset.py b/tests/test_arrow_dataset.py index b748d842ef6..f4e315d9232 100644 --- a/tests/test_arrow_dataset.py +++ b/tests/test_arrow_dataset.py @@ -15,7 +15,7 @@ from moto import mock_s3 import datasets.arrow_dataset -from datasets import concatenate_datasets, load_from_disk, temp_seed +from datasets import NamedSplit, concatenate_datasets, load_from_disk, temp_seed from datasets.arrow_dataset import Dataset, transmit_format from datasets.dataset_dict import DatasetDict from datasets.features import Array2D, Array3D, ClassLabel, Features, Sequence, Value @@ -1871,6 +1871,40 @@ def encode(batch): self.assertEqual(str(dset[:2]), str(encode({"text": ["hello there", "foo"]}))) +@pytest.mark.parametrize( + "features", + [ + None, + {"col_1": "string", "col_2": "int64", "col_3": "float64"}, + {"col_1": "string", "col_2": "string", "col_3": "string"}, + {"col_1": "int32", "col_2": "int32", "col_3": "int32"}, + {"col_1": "float32", "col_2": "float32", "col_3": "float32"}, + ], +) +@pytest.mark.parametrize("split", [None, NamedSplit("train"), "train", "test"]) +@pytest.mark.parametrize("path_type", [str, list]) +def test_dataset_from_csv(path_type, split, features, csv_path, tmp_path): + if issubclass(path_type, str): + path = csv_path + elif issubclass(path_type, list): + path = [csv_path] + cache_dir = tmp_path / "cache" + expected_split = str(split) if split else "train" + # CSV file loses col_1 string dtype information: default now is "int64" instead of "string" + default_expected_features = {"col_1": "int64", "col_2": "int64", "col_3": "float64"} + expected_features = features.copy() if features else default_expected_features + features = Features({feature: Value(dtype) for feature, dtype in features.items()}) if features else 
None + + dataset = Dataset.from_csv(path, split=split, features=features, cache_dir=cache_dir) + assert isinstance(dataset, Dataset) + assert dataset.num_rows == 4 + assert dataset.num_columns == 3 + assert dataset.column_names == ["col_1", "col_2", "col_3"] + assert dataset.split == expected_split + for feature, expected_dtype in expected_features.items(): + assert dataset.features[feature].dtype == expected_dtype + + @pytest.mark.parametrize("in_memory", [False, True]) def test_dataset_from_file(in_memory, dataset, arrow_file): filename = arrow_file From 310e62fb3295c67c3fd43de1d7821cfd65744e28 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Tue, 9 Mar 2021 14:30:21 +0100 Subject: [PATCH 27/30] Test DatasetDict.from_csv --- tests/test_dataset_dict.py | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/tests/test_dataset_dict.py b/tests/test_dataset_dict.py index 7f565caae71..a1c028c5dbc 100644 --- a/tests/test_dataset_dict.py +++ b/tests/test_dataset_dict.py @@ -5,6 +5,7 @@ import boto3 import numpy as np import pandas as pd +import pytest from moto import mock_s3 from datasets import Features, Sequence, Value, load_from_disk @@ -412,3 +413,37 @@ def test_save_and_load_to_s3(self): self.assertEqual(len(dsets["test"]), 30) self.assertListEqual(dsets["test"].column_names, ["filename"]) del dsets + + +@pytest.mark.parametrize( + "features", + [ + None, + {"col_1": "string", "col_2": "int64", "col_3": "float64"}, + {"col_1": "string", "col_2": "string", "col_3": "string"}, + {"col_1": "int32", "col_2": "int32", "col_3": "int32"}, + {"col_1": "float32", "col_2": "float32", "col_3": "float32"}, + ], +) +@pytest.mark.parametrize("split", [None, "train", "test"]) +def test_datasetdict_from_csv(split, features, csv_path, tmp_path): + if split: + path = {split: csv_path} + else: + split = "train" + path = {"train": csv_path, "test": csv_path} + cache_dir = tmp_path / "cache" + # CSV file loses col_1 string dtype information: default now is "int64" instead of "string" + default_expected_features = {"col_1": "int64", "col_2": "int64", "col_3": "float64"} + expected_features = features.copy() if features else default_expected_features + features = Features({feature: Value(dtype) for feature, dtype in features.items()}) if features else None + + dataset = DatasetDict.from_csv(path, features=features, cache_dir=cache_dir) + assert isinstance(dataset, DatasetDict) + dataset = dataset[split] + assert dataset.num_rows == 4 + assert dataset.num_columns == 3 + assert dataset.column_names == ["col_1", "col_2", "col_3"] + assert dataset.split == split + for feature, expected_dtype in expected_features.items(): + assert dataset.features[feature].dtype == expected_dtype From b232793ae68c56227195bf1619b6aaa136434b11 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Tue, 9 Mar 2021 14:46:54 +0100 Subject: [PATCH 28/30] Test keep_in_memory --- tests/io/test_csv.py | 21 +++++++++++++++------ tests/test_arrow_dataset.py | 11 ++++++++--- tests/test_dataset_dict.py | 10 +++++++--- 3 files changed, 30 insertions(+), 12 deletions(-) diff --git a/tests/io/test_csv.py b/tests/io/test_csv.py index dff28bb8102..6e9d9a96276 100644 --- a/tests/io/test_csv.py +++ b/tests/io/test_csv.py @@ -1,9 +1,11 @@ +import pyarrow as pa import pytest from datasets import Dataset, DatasetDict, Features, NamedSplit, Value from datasets.io.csv import CsvDatasetReader 
From b232793ae68c56227195bf1619b6aaa136434b11 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com>
Date: Tue, 9 Mar 2021 14:46:54 +0100
Subject: [PATCH 28/30] Test keep_in_memory

---
 tests/io/test_csv.py        | 21 +++++++++++++++------
 tests/test_arrow_dataset.py | 11 ++++++++---
 tests/test_dataset_dict.py  | 10 +++++++---
 3 files changed, 30 insertions(+), 12 deletions(-)

diff --git a/tests/io/test_csv.py b/tests/io/test_csv.py
index dff28bb8102..6e9d9a96276 100644
--- a/tests/io/test_csv.py
+++ b/tests/io/test_csv.py
@@ -1,9 +1,11 @@
+import pyarrow as pa
 import pytest
 
 from datasets import Dataset, DatasetDict, Features, NamedSplit, Value
 from datasets.io.csv import CsvDatasetReader
 
 
+@pytest.mark.parametrize("keep_in_memory", [False, True])
 @pytest.mark.parametrize(
     "features",
     [
@@ -16,7 +18,7 @@
 )
 @pytest.mark.parametrize("split", [None, NamedSplit("train"), "train", "test"])
 @pytest.mark.parametrize("path_type", [str, list])
-def test_csv_dataset_reader(path_type, split, features, csv_path, tmp_path):
+def test_csv_dataset_reader(path_type, split, features, keep_in_memory, csv_path, tmp_path):
     if issubclass(path_type, str):
         path = csv_path
     elif issubclass(path_type, list):
@@ -29,8 +31,11 @@ def test_csv_dataset_reader(path_type, split, features, csv_path, tmp_path):
     default_expected_features = {"col_1": "int64", "col_2": "int64", "col_3": "float64"}
     expected_features = features.copy() if features else default_expected_features
     features = Features({feature: Value(dtype) for feature, dtype in features.items()}) if features else None
-
-    dataset = CsvDatasetReader(path, split=split, features=features, cache_dir=cache_dir).read()
+    previous_allocated_memory = pa.total_allocated_bytes()
+    dataset = CsvDatasetReader(
+        path, split=split, features=features, cache_dir=cache_dir, keep_in_memory=keep_in_memory
+    ).read()
+    increased_allocated_memory = (pa.total_allocated_bytes() - previous_allocated_memory) > 0
     assert isinstance(dataset, Dataset)
     assert dataset.num_rows == 4
     assert dataset.num_columns == 3
@@ -38,8 +43,10 @@ def test_csv_dataset_reader(path_type, split, features, csv_path, tmp_path):
     assert dataset.split == expected_split
     for feature, expected_dtype in expected_features.items():
         assert dataset.features[feature].dtype == expected_dtype
+    assert increased_allocated_memory == keep_in_memory
 
 
+@pytest.mark.parametrize("keep_in_memory", [False, True])
 @pytest.mark.parametrize(
     "features",
     [
@@ -51,7 +58,7 @@ def test_csv_dataset_reader(path_type, split, features, csv_path, tmp_path):
     ],
 )
 @pytest.mark.parametrize("split", [None, "train", "test"])
-def test_csv_datasetdict_reader(split, features, csv_path, tmp_path):
+def test_csv_datasetdict_reader(split, features, keep_in_memory, csv_path, tmp_path):
     if split:
         path = {split: csv_path}
     else:
@@ -63,8 +70,9 @@ def test_csv_datasetdict_reader(split, features, csv_path, tmp_path):
     default_expected_features = {"col_1": "int64", "col_2": "int64", "col_3": "float64"}
     expected_features = features.copy() if features else default_expected_features
     features = Features({feature: Value(dtype) for feature, dtype in features.items()}) if features else None
-
-    dataset = CsvDatasetReader(path, features=features, cache_dir=cache_dir).read()
+    previous_allocated_memory = pa.total_allocated_bytes()
+    dataset = CsvDatasetReader(path, features=features, cache_dir=cache_dir, keep_in_memory=keep_in_memory).read()
+    increased_allocated_memory = (pa.total_allocated_bytes() - previous_allocated_memory) > 0
     assert isinstance(dataset, DatasetDict)
     dataset = dataset[split]
     assert dataset.num_rows == 4
@@ -73,3 +81,4 @@ def test_csv_datasetdict_reader(split, features, csv_path, tmp_path):
     assert dataset.split == split
     for feature, expected_dtype in expected_features.items():
         assert dataset.features[feature].dtype == expected_dtype
+    assert increased_allocated_memory == keep_in_memory
diff --git a/tests/test_arrow_dataset.py b/tests/test_arrow_dataset.py
index f4e315d9232..26c3d3b7476 100644
--- a/tests/test_arrow_dataset.py
+++ b/tests/test_arrow_dataset.py
@@ -1871,6 +1871,7 @@ def encode(batch):
         self.assertEqual(str(dset[:2]), str(encode({"text": ["hello there", "foo"]})))
 
 
+@pytest.mark.parametrize("keep_in_memory", [False, True])
 @pytest.mark.parametrize(
     "features",
     [
@@ -1883,7 +1884,7 @@ def encode(batch):
 )
 @pytest.mark.parametrize("split", [None, NamedSplit("train"), "train", "test"])
 @pytest.mark.parametrize("path_type", [str, list])
-def test_dataset_from_csv(path_type, split, features, csv_path, tmp_path):
+def test_dataset_from_csv(path_type, split, features, keep_in_memory, csv_path, tmp_path):
     if issubclass(path_type, str):
         path = csv_path
     elif issubclass(path_type, list):
@@ -1894,8 +1895,11 @@ def test_dataset_from_csv(path_type, split, features, csv_path, tmp_path):
     default_expected_features = {"col_1": "int64", "col_2": "int64", "col_3": "float64"}
     expected_features = features.copy() if features else default_expected_features
     features = Features({feature: Value(dtype) for feature, dtype in features.items()}) if features else None
-
-    dataset = Dataset.from_csv(path, split=split, features=features, cache_dir=cache_dir)
+    previous_allocated_memory = pa.total_allocated_bytes()
+    dataset = Dataset.from_csv(
+        path, split=split, features=features, cache_dir=cache_dir, keep_in_memory=keep_in_memory
+    )
+    increased_allocated_memory = (pa.total_allocated_bytes() - previous_allocated_memory) > 0
     assert isinstance(dataset, Dataset)
     assert dataset.num_rows == 4
     assert dataset.num_columns == 3
@@ -1903,6 +1907,7 @@ def test_dataset_from_csv(path_type, split, features, csv_path, tmp_path):
     assert dataset.split == expected_split
     for feature, expected_dtype in expected_features.items():
         assert dataset.features[feature].dtype == expected_dtype
+    assert increased_allocated_memory == keep_in_memory
 
 
 @pytest.mark.parametrize("in_memory", [False, True])
diff --git a/tests/test_dataset_dict.py b/tests/test_dataset_dict.py
index a1c028c5dbc..0f5fcf3ac66 100644
--- a/tests/test_dataset_dict.py
+++ b/tests/test_dataset_dict.py
@@ -5,6 +5,7 @@
 import boto3
 import numpy as np
 import pandas as pd
+import pyarrow as pa
 import pytest
 from moto import mock_s3
 
@@ -415,6 +416,7 @@ def test_save_and_load_to_s3(self):
         del dsets
 
 
+@pytest.mark.parametrize("keep_in_memory", [False, True])
 @pytest.mark.parametrize(
     "features",
     [
@@ -426,7 +428,7 @@ def test_save_and_load_to_s3(self):
     ],
 )
 @pytest.mark.parametrize("split", [None, "train", "test"])
-def test_datasetdict_from_csv(split, features, csv_path, tmp_path):
+def test_datasetdict_from_csv(split, features, keep_in_memory, csv_path, tmp_path):
     if split:
         path = {split: csv_path}
     else:
@@ -437,8 +439,9 @@ def test_datasetdict_from_csv(split, features, csv_path, tmp_path):
     default_expected_features = {"col_1": "int64", "col_2": "int64", "col_3": "float64"}
     expected_features = features.copy() if features else default_expected_features
     features = Features({feature: Value(dtype) for feature, dtype in features.items()}) if features else None
-
-    dataset = DatasetDict.from_csv(path, features=features, cache_dir=cache_dir)
+    previous_allocated_memory = pa.total_allocated_bytes()
+    dataset = DatasetDict.from_csv(path, features=features, cache_dir=cache_dir, keep_in_memory=keep_in_memory)
+    increased_allocated_memory = (pa.total_allocated_bytes() - previous_allocated_memory) > 0
     assert isinstance(dataset, DatasetDict)
     dataset = dataset[split]
     assert dataset.num_rows == 4
@@ -447,3 +450,4 @@ def test_datasetdict_from_csv(split, features, csv_path, tmp_path):
     assert dataset.split == split
     for feature, expected_dtype in expected_features.items():
         assert dataset.features[feature].dtype == expected_dtype
+    assert increased_allocated_memory == keep_in_memory
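The keep_in_memory assertions in patch 28 lean on pyarrow's global allocation counter: pa.total_allocated_bytes() reports the bytes currently held by Arrow's default memory pool, so it grows when a table is materialized in memory but stays flat when the dataset is memory-mapped from the on-disk cache file. A standalone sketch of the pattern (a toy in-memory table, not the actual reader):

    import pyarrow as pa

    before = pa.total_allocated_bytes()
    table = pa.Table.from_pydict({"col_1": [0, 1, 2, 3]})  # allocates from the default pool
    grew = (pa.total_allocated_bytes() - before) > 0
    assert grew  # building an in-memory table moves the counter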
From b56d96034d465d5765170d844760641c47d54e8f Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com>
Date: Wed, 10 Mar 2021 17:47:35 +0100
Subject: [PATCH 29/30] Add from_csv to docs

---
 docs/source/package_reference/main_classes.rst | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/docs/source/package_reference/main_classes.rst b/docs/source/package_reference/main_classes.rst
index 708840c9d07..987d4bf6b84 100644
--- a/docs/source/package_reference/main_classes.rst
+++ b/docs/source/package_reference/main_classes.rst
@@ -29,7 +29,8 @@ The base class :class:`datasets.Dataset` implements a Dataset backed by an Apach
         list_indexes, get_index, drop_index, search, search_batch, get_nearest_examples, get_nearest_examples_batch,
         info, split, builder_name, citation, config_name, dataset_size, description, download_checksums, download_size,
-        features, homepage, license, size_in_bytes, supervised_keys, version
+        features, homepage, license, size_in_bytes, supervised_keys, version,
+        from_csv,
 
 .. autofunction:: datasets.concatenate_datasets
@@ -51,6 +52,7 @@ It also has dataset transform methods like map or filter, to process all the spl
         flatten_, cast_, remove_columns_, rename_column_,
         flatten, cast, remove_columns, rename_column,
         save_to_disk, load_from_disk,
+        from_csv,
 
 ``Features``
@@ -106,7 +108,7 @@ The base class ``Metric`` implements a Metric backed by one or several :class:`d
 
 .. autoclass:: datasets.filesystems.S3FileSystem(anon=False, key=None, secret=None, token=None, use_ssl=True, client_kwargs=None, requester_pays=False, default_block_size=None, default_fill_cache=True, default_cache_type='bytes', version_aware=False, config_kwargs=None, s3_additional_kwargs=None, session=None, username=None, password=None, asynchronous=False, loop=None, **kwargs)
- 
+
 .. autofunction:: datasets.filesystems.extract_path_from_uri
 
 .. autofunction:: datasets.filesystems.is_remote_filesystem

From 1f2f8b6b5b644df26020030819fa1144ca4b9b3d Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com>
Date: Thu, 11 Mar 2021 10:42:06 +0100
Subject: [PATCH 30/30] Fix **kwargs in docstring

---
 src/datasets/arrow_dataset.py | 2 +-
 src/datasets/dataset_dict.py  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py
index c08084fcc6f..d41b7860039 100644
--- a/src/datasets/arrow_dataset.py
+++ b/src/datasets/arrow_dataset.py
@@ -450,7 +450,7 @@ def from_csv(
             features (Features, optional): Dataset features.
             cache_dir (str, optional, default="~/datasets"): Directory to cache data.
             keep_in_memory (bool, default=False): Whether to copy the data in-memory.
-            kwargs: Keyword arguments passed to `pandas.read_csv`.
+            **kwargs: Keyword arguments to be passed to :meth:`pandas.read_csv`.
 
         Returns:
             datasets.Dataset
diff --git a/src/datasets/dataset_dict.py b/src/datasets/dataset_dict.py
index f2815aea63d..71985a6bf51 100644
--- a/src/datasets/dataset_dict.py
+++ b/src/datasets/dataset_dict.py
@@ -687,7 +687,7 @@ def from_csv(
             features (Features, optional): Dataset features.
             cache_dir (str, optional, default="~/datasets"): Directory to cache data.
             keep_in_memory (bool, default=False): Whether to copy the data in-memory.
-            kwargs: Keyword arguments passed to `pandas.read_csv`.
+            **kwargs: Keyword arguments to be passed to :meth:`pandas.read_csv`.
 
         Returns:
             datasets.DatasetDict
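With the docstring fix in patch 30, it is explicit that extra keyword arguments to from_csv are forwarded to pandas.read_csv. A sketch, assuming a hypothetical semicolon-delimited "dataset.csv":

    from datasets import Dataset

    # sep and usecols are ordinary pandas.read_csv keyword arguments.
    dataset = Dataset.from_csv("dataset.csv", sep=";", usecols=["col_1", "col_2"])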