Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement Dataset from CSV #1946

Merged
merged 32 commits into from
Mar 12, 2021
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
cf60105
Test Dataset CSV Reader
albertvillanova Feb 25, 2021
5ce577f
Implement CsvDatasetReader
albertvillanova Feb 25, 2021
c478ed5
Implement Dataset.from_csv
albertvillanova Feb 25, 2021
eb45f5c
Test CsvDatasetReader features
albertvillanova Feb 25, 2021
1fe8344
Fix CsvDatasetReader to allow cast int to str
albertvillanova Feb 25, 2021
cf3495f
Use CsvDatasetBuilder instead of CsvDatasetReader
albertvillanova Feb 25, 2021
d04b2c8
Fix Csv(ArrowBasedBuilder) to allow cast int to str
albertvillanova Mar 2, 2021
f83daae
Test different path types
albertvillanova Mar 2, 2021
c819e49
Remove CsvDatasetReader and unused attributes
albertvillanova Mar 2, 2021
6f2fbd9
Update Dataset.from_csv signature
albertvillanova Mar 2, 2021
800b8cd
Remove test for CsvDatasetReader
albertvillanova Mar 2, 2021
9d1cbe7
Remove unused import
albertvillanova Mar 2, 2021
1af8f96
Pass kwargs
albertvillanova Mar 2, 2021
e40e46d
Rename adapter as CsvDatasetReader instead of CsvDatasetBuilder
albertvillanova Mar 4, 2021
3e4cb6f
Pass keep_in_memory
albertvillanova Mar 4, 2021
b3978cd
Remove save_infos
albertvillanova Mar 4, 2021
dd46f5f
Make explicit that path can be NestedStructure-like
albertvillanova Mar 4, 2021
16b902b
Test CsvDatasetReader for path or list of paths only, and split
albertvillanova Mar 9, 2021
5f5b56d
Set train as default split
albertvillanova Mar 9, 2021
b86e7cd
Pass paths as dict to allow split renaming
albertvillanova Mar 9, 2021
7991b60
Specify split and str or list paths for Dataset.from_csv
albertvillanova Mar 9, 2021
c26ed08
Test CSV DatasetDict Reader for dict path
albertvillanova Mar 9, 2021
34c0ed7
Implement DatasetDict.from_csv
albertvillanova Mar 9, 2021
a1596e6
Add pandas kwargs to from_csv docstring
albertvillanova Mar 9, 2021
625368c
Move csv_path fixture to conftest.py
albertvillanova Mar 9, 2021
ba4e408
Test Dataset.from_csv
albertvillanova Mar 9, 2021
310e62f
Test DatasetDict.from_csv
albertvillanova Mar 9, 2021
b232793
Test keep_in_memory
albertvillanova Mar 9, 2021
adf98ed
Merge remote-tracking branch 'upstream/master' into dataset-from-csv
albertvillanova Mar 9, 2021
b56d960
Add from_csv to docs
albertvillanova Mar 10, 2021
c1b7758
Merge remote-tracking branch 'upstream/master' into dataset-from-csv
albertvillanova Mar 11, 2021
1f2f8b6
Fix **kwargs in docstring
albertvillanova Mar 11, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 27 additions & 2 deletions src/datasets/arrow_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@
from .utils import map_nested
from .utils.deprecation_utils import deprecated
from .utils.logging import WARNING, get_logger, get_verbosity, set_verbosity_warning
from .utils.typing import PathLike


if TYPE_CHECKING:
Expand All @@ -70,8 +71,6 @@
else:
PYARROW_V0 = False

PathLike = Union[str, bytes, os.PathLike]


class DatasetInfoMixin(object):
"""This base class exposes some attributes of DatasetInfo
Expand Down Expand Up @@ -435,6 +434,32 @@ def from_dict(
pa_table: pa.Table = pa.Table.from_pydict(mapping=mapping)
return cls(pa_table, info=info, split=split)

@staticmethod
def from_csv(
    path: PathLike,
    split: Optional[NamedSplit] = None,
    features: Optional[Features] = None,
    cache_dir: Optional[str] = None,
    keep_in_memory: bool = False,
    **kwargs,
):
    """Create Dataset from CSV file(s).

    Args:
        path (path-like): Path of the CSV file(s).
        split (NamedSplit, optional): Name of the dataset split.
        features (Features, optional): Dataset features.
        cache_dir (str, optional, default="~/datasets"): Directory to cache data.
        keep_in_memory (bool, default=False): Whether to copy the data in-memory.
        **kwargs: Keyword arguments forwarded to the CSV reader (e.g. ``pandas.read_csv`` options).

    Returns:
        datasets.Dataset
    """
    # Dynamic import to avoid circular dependency
    from .io.csv import CsvDatasetReader

    return CsvDatasetReader(
        path, split=split, features=features, cache_dir=cache_dir, keep_in_memory=keep_in_memory, **kwargs
    ).read()

def __del__(self):
if hasattr(self, "_data"):
del self._data
Expand Down
Empty file added src/datasets/io/__init__.py
Empty file.
28 changes: 28 additions & 0 deletions src/datasets/io/abc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
from abc import ABC, abstractmethod
from typing import Optional

from .. import Features, NamedSplit
from ..arrow_dataset import Dataset
from ..utils.typing import PathLike


class AbstractDatasetReader(ABC):
    """Abstract base class for readers that build a :class:`Dataset` from data file(s).

    Subclasses must implement :meth:`read`.
    """

    def __init__(
        self,
        path: PathLike,
        split: Optional[NamedSplit] = None,
        features: Optional[Features] = None,
        cache_dir: Optional[str] = None,
        keep_in_memory: bool = False,
        **kwargs,
    ):
        # Path(s) of the source data file(s); exact accepted shapes (str/list/dict)
        # are defined by the concrete subclass.
        self.path = path
        # Optional split to return; subclasses decide the behavior when None.
        self.split = split
        # Optional schema to cast the data to.
        self.features = features
        # Directory used to cache the prepared data (None -> library default).
        self.cache_dir = cache_dir
        # Whether to load the dataset in memory instead of memory-mapping it.
        self.keep_in_memory = keep_in_memory
        # Extra keyword arguments forwarded to the underlying builder/parser.
        self.kwargs = kwargs

    @abstractmethod
    def read(self) -> Dataset:
        """Build and return the dataset. Must be implemented by subclasses."""
        pass
53 changes: 53 additions & 0 deletions src/datasets/io/csv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
from typing import Optional

from .. import Features, NamedSplit
from ..packaged_modules.csv.csv import Csv
from ..utils.typing import PathLike
from .abc import AbstractDatasetReader


class CsvDatasetReader(AbstractDatasetReader):
    """Reader that builds a :class:`Dataset` from CSV file(s) via the packaged ``Csv`` builder."""

    def __init__(
        self,
        path: PathLike,
        split: Optional[NamedSplit] = None,
        features: Optional[Features] = None,
        cache_dir: Optional[str] = None,
        keep_in_memory: bool = False,
        **kwargs,
    ):
        """
        Args:
            path (path-like): Path of the CSV file(s).
            split (NamedSplit, optional): Name of the dataset split.
            features (Features, optional): Dataset features.
            cache_dir (str, optional): Directory to cache data.
            keep_in_memory (bool, default=False): Whether to copy the data in-memory.
            **kwargs: Keyword arguments forwarded to the ``Csv`` builder.
        """
        super().__init__(
            path, split=split, features=features, cache_dir=cache_dir, keep_in_memory=keep_in_memory, **kwargs
        )
        # The packaged CSV builder does the actual parsing and caching work.
        self.builder = Csv(
            cache_dir=cache_dir,
            data_files=path,
            features=features,
            **kwargs,
        )

    def read(self):
        """Prepare the CSV data and return it as a :class:`Dataset`.

        Returns:
            datasets.Dataset (a dict of split datasets when ``self.split`` is None).
        """
        # Local files only: no download configuration is needed.
        download_config = None
        download_mode = None
        ignore_verifications = False
        use_auth_token = None
        base_path = None

        self.builder.download_and_prepare(
            download_config=download_config,
            download_mode=download_mode,
            ignore_verifications=ignore_verifications,
            # try_from_hf_gcs=try_from_hf_gcs,
            base_path=base_path,
            use_auth_token=use_auth_token,
        )

        # Build dataset for splits
        ds = self.builder.as_dataset(
            split=self.split, ignore_verifications=ignore_verifications, in_memory=self.keep_in_memory
        )
        return ds
3 changes: 3 additions & 0 deletions src/datasets/packaged_modules/csv/csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,10 +86,13 @@ def _split_generators(self, dl_manager):

def _generate_tables(self, files):
schema = pa.schema(self.config.features.type) if self.config.features is not None else None
# dtype allows reading an int column as str
dtype = {name: dtype.to_pandas_dtype() for name, dtype in zip(schema.names, schema.types)} if schema else None
for file_idx, file in enumerate(files):
csv_file_reader = pd.read_csv(
file,
iterator=True,
dtype=dtype,
sep=self.config.sep,
header=self.config.header,
names=self.config.names,
Expand Down
5 changes: 5 additions & 0 deletions src/datasets/utils/typing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
import os
from typing import Union


# Objects accepted anywhere the library takes a file-system path:
# plain strings, byte strings, or os.PathLike objects (e.g. pathlib.Path).
PathLike = Union[str, bytes, os.PathLike]
61 changes: 61 additions & 0 deletions tests/io/test_csv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import csv

import pytest

from datasets import Features, NamedSplit, Value
from datasets.io.csv import CsvDatasetReader


# Sample rows written to the CSV fixture. col_1 holds numeric strings on purpose:
# once round-tripped through CSV without explicit features, its dtype is inferred
# as "int64" instead of "string" (exercised by the tests below).
DATA = [
    {"col_1": "0", "col_2": 0, "col_3": 0.0},
    {"col_1": "1", "col_2": 1, "col_3": 1.0},
    {"col_1": "2", "col_2": 2, "col_3": 2.0},
    {"col_1": "3", "col_2": 3, "col_3": 3.0},
]


@pytest.fixture(scope="session")
def csv_path(tmp_path_factory):
    """Write DATA to a temporary CSV file (once per test session) and return its path."""
    path = str(tmp_path_factory.mktemp("data") / "dataset.csv")
    # newline="" is required when handing a file to the csv module; without it the
    # writer emits \r\r\n line endings on Windows (see csv module docs).
    with open(path, "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=["col_1", "col_2", "col_3"])
        writer.writeheader()
        for item in DATA:
            writer.writerow(item)
    return path


@pytest.mark.parametrize(
    "features",
    [
        None,
        {"col_1": "string", "col_2": "int64", "col_3": "float64"},
        {"col_1": "string", "col_2": "string", "col_3": "string"},
        {"col_1": "int32", "col_2": "int32", "col_3": "int32"},
        {"col_1": "float32", "col_2": "float32", "col_3": "float32"},
    ],
)
@pytest.mark.parametrize("split", [None, NamedSplit("train")])
@pytest.mark.parametrize("path_type", [str, list, dict])
def test_dataset_csv_builder(path_type, split, csv_path, features, tmp_path):
    """Read the CSV fixture through CsvDatasetReader for every path type, split and feature set."""
    # Build the path argument in the shape requested by the parametrization.
    path_builders = {
        str: lambda p: p,
        list: lambda p: [p],
        dict: lambda p: {"train": p},
    }
    path = path_builders[path_type](csv_path)
    cache_dir = tmp_path / "cache"

    # CSV file loses col_1 string dtype information: default now is "int64" instead of "string"
    default_expected_features = {"col_1": "int64", "col_2": "int64", "col_3": "float64"}
    expected_features = dict(features) if features else default_expected_features
    features = Features({name: Value(dtype) for name, dtype in features.items()}) if features else None

    ds = CsvDatasetReader(path, split=split, features=features, cache_dir=cache_dir).read()
    if not split:
        # With no split requested, read() returns a mapping of splits instead of a single dataset.
        ds = ds["train"]
    assert ds.num_rows == 4
    assert ds.num_columns == 3
    assert ds.column_names == ["col_1", "col_2", "col_3"]
    assert ds.split == "train"
    for name, expected_dtype in expected_features.items():
        assert ds.features[name].dtype == expected_dtype