From 44071698b17f6f791134a39c9c62951d6d7fca1d Mon Sep 17 00:00:00 2001 From: eunseojo Date: Thu, 24 Nov 2022 04:52:20 +0100 Subject: [PATCH 1/6] fix error where reading breaks when batch missing an assigned column feature --- src/datasets/packaged_modules/json/json.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/datasets/packaged_modules/json/json.py b/src/datasets/packaged_modules/json/json.py index 152b35fdf9d..5ee0bee54fd 100644 --- a/src/datasets/packaged_modules/json/json.py +++ b/src/datasets/packaged_modules/json/json.py @@ -128,6 +128,13 @@ def _generate_tables(self, files): pa_table = paj.read_json( io.BytesIO(batch), read_options=paj.ReadOptions(block_size=block_size) ) + feature_columns = set(self.config.features.arrow_schema.names) + pa_columns = set(pa_table.column_names) + missing_columns = feature_columns - pa_columns + if missing_columns: # some columns are missing + num_rows = len(pa_table) + for column_name in missing_columns: + pa_table = pa_table.append_column(column_name, pa.nulls(num_rows)) break except (pa.ArrowInvalid, pa.ArrowNotImplementedError) as e: if ( From bd0165e8738a64fd5d6a4854c3773d7bf691dc77 Mon Sep 17 00:00:00 2001 From: eunseojo Date: Thu, 1 Dec 2022 07:17:31 +0100 Subject: [PATCH 2/6] fix it so that it behaves normally when features are not inserted --- src/datasets/packaged_modules/json/json.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/datasets/packaged_modules/json/json.py b/src/datasets/packaged_modules/json/json.py index 5ee0bee54fd..db3c853d881 100644 --- a/src/datasets/packaged_modules/json/json.py +++ b/src/datasets/packaged_modules/json/json.py @@ -128,13 +128,14 @@ def _generate_tables(self, files): pa_table = paj.read_json( io.BytesIO(batch), read_options=paj.ReadOptions(block_size=block_size) ) - feature_columns = set(self.config.features.arrow_schema.names) - pa_columns = set(pa_table.column_names) - missing_columns = feature_columns - pa_columns - if missing_columns: # some columns are missing - num_rows = len(pa_table) - for column_name in missing_columns: - pa_table = pa_table.append_column(column_name, pa.nulls(num_rows)) + if self.config.features != None: + feature_columns = set(self.config.features.arrow_schema.names) + pa_columns = set(pa_table.column_names) + missing_columns = feature_columns - pa_columns + if missing_columns: # some columns are missing + num_rows = len(pa_table) + for column_name in missing_columns: + pa_table = pa_table.append_column(column_name, pa.nulls(num_rows)) break except (pa.ArrowInvalid, pa.ArrowNotImplementedError) as e: if ( From 6e142329c42c447168c39a0ef40b3e550fa08ec0 Mon Sep 17 00:00:00 2001 From: eunseojo Date: Thu, 1 Dec 2022 07:32:04 +0100 Subject: [PATCH 3/6] style --- src/datasets/packaged_modules/json/json.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/datasets/packaged_modules/json/json.py b/src/datasets/packaged_modules/json/json.py index db3c853d881..5df4881b8c3 100644 --- a/src/datasets/packaged_modules/json/json.py +++ b/src/datasets/packaged_modules/json/json.py @@ -128,7 +128,7 @@ def _generate_tables(self, files): pa_table = paj.read_json( io.BytesIO(batch), read_options=paj.ReadOptions(block_size=block_size) ) - if self.config.features != None: + if self.config.features is not None: feature_columns = set(self.config.features.arrow_schema.names) pa_columns = set(pa_table.column_names) missing_columns = feature_columns - pa_columns From 1fca888bd689ba747455273736c6727870df1286 Mon Sep 17 00:00:00 2001 From: eunseojo Date: Fri, 2 Dec 2022 13:57:40 +0100 Subject: [PATCH 4/6] added test and changed the null column --- src/datasets/packaged_modules/json/json.py | 7 +++++- tests/packaged_modules/test_json.py | 28 ++++++++++++++++++++++ 2 files changed, 34 insertions(+), 1 deletion(-) diff --git a/src/datasets/packaged_modules/json/json.py b/src/datasets/packaged_modules/json/json.py index 5df4881b8c3..b5259ed92c0 100644 --- a/src/datasets/packaged_modules/json/json.py +++ b/src/datasets/packaged_modules/json/json.py @@ -135,7 +135,12 @@ def _generate_tables(self, files): if missing_columns: # some columns are missing num_rows = len(pa_table) for column_name in missing_columns: - pa_table = pa_table.append_column(column_name, pa.nulls(num_rows)) + # pa_table = pa_table.append_column(column_name, pa.nulls(num_rows)) + type = self.config.features.arrow_schema.field(column_name).type + pa_table = pa_table.append_column( + column_name, pa.array([None] * num_rows, type=type) + ) + break except (pa.ArrowInvalid, pa.ArrowNotImplementedError) as e: if ( diff --git a/tests/packaged_modules/test_json.py b/tests/packaged_modules/test_json.py index 09e5572828c..0e1a2b30b4e 100644 --- a/tests/packaged_modules/test_json.py +++ b/tests/packaged_modules/test_json.py @@ -3,6 +3,7 @@ import pyarrow as pa import pytest +from datasets import Features, Value from datasets.packaged_modules.json.json import Json @@ -69,3 +70,30 @@ def test_json_generate_tables(file_fixture, config_kwargs, request): generator = json._generate_tables([[request.getfixturevalue(file_fixture)]]) pa_table = pa.concat_tables([table for _, table in generator]) assert pa_table.to_pydict() == {"col_1": [1, 10], "col_2": [2, 20]} + + +@pytest.mark.parametrize( + "file_fixture, config_kwargs", + [ + ( + "jsonl_file", + {"features": Features({"col_1": Value(dtype="int64", id=None), "col_2": Value(dtype="int64", id=None)})}, + ), + ( + "json_file_with_list_of_dicts", + {"features": Features({"col_1": Value(dtype="int64", id=None), "col_2": Value(dtype="int64", id=None)})}, + ), + ( + "json_file_with_list_of_dicts_field", + { + "field": "field3", + "features": Features({"col_1": Value(dtype="int64", id=None), "col_2": Value(dtype="int64", id=None)}), + }, + ), + ], +) +def test_json_generate_tables_with_features(file_fixture, config_kwargs, request): + json = Json(**config_kwargs) + generator = json._generate_tables([[request.getfixturevalue(file_fixture)]]) + pa_table = pa.concat_tables([table for _, table in generator]) + assert pa_table.to_pydict() == {"col_1": [1, 10], "col_2": [2, 20]} From 12bbc439c7228cec82d75b5ba891a5faf93a283d Mon Sep 17 00:00:00 2001 From: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com> Date: Fri, 2 Dec 2022 14:55:21 +0100 Subject: [PATCH 5/6] update test with missing features --- tests/packaged_modules/test_json.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/tests/packaged_modules/test_json.py b/tests/packaged_modules/test_json.py index 0e1a2b30b4e..17b7407f617 100644 --- a/tests/packaged_modules/test_json.py +++ b/tests/packaged_modules/test_json.py @@ -77,23 +77,25 @@ def test_json_generate_tables(file_fixture, config_kwargs, request): [ ( "jsonl_file", - {"features": Features({"col_1": Value(dtype="int64", id=None), "col_2": Value(dtype="int64", id=None)})}, + {"features": Features({"col_1": Value("int64"), "col_2": Value("int64"), "missing_col": Value("string")})}, ), ( "json_file_with_list_of_dicts", - {"features": Features({"col_1": Value(dtype="int64", id=None), "col_2": Value(dtype="int64", id=None)})}, + {"features": Features({"col_1": Value("int64"), "col_2": Value("int64"), "missing_col": Value("string")})}, ), ( "json_file_with_list_of_dicts_field", { "field": "field3", - "features": Features({"col_1": Value(dtype="int64", id=None), "col_2": Value(dtype="int64", id=None)}), + "features": Features( + {"col_1": Value("int64"), "col_2": Value("int64"), "missing_col": Value("string")} + ), }, ), ], ) -def test_json_generate_tables_with_features(file_fixture, config_kwargs, request): +def test_json_generate_tables_with_missing_features(file_fixture, config_kwargs, request): json = Json(**config_kwargs) generator = json._generate_tables([[request.getfixturevalue(file_fixture)]]) pa_table = pa.concat_tables([table for _, table in generator]) - assert pa_table.to_pydict() == {"col_1": [1, 10], "col_2": [2, 20]} + assert pa_table.to_pydict() == {"col_1": [1, 10], "col_2": [2, 20], "missing_col": [None, None]} From 6fa2535c2fc5a6be0b301cbc9f49b24c6fa96596 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com> Date: Fri, 2 Dec 2022 14:55:49 +0100 Subject: [PATCH 6/6] add missing columns for other JSON kinds as well --- src/datasets/packaged_modules/json/json.py | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/src/datasets/packaged_modules/json/json.py b/src/datasets/packaged_modules/json/json.py index b5259ed92c0..fba42671c3f 100644 --- a/src/datasets/packaged_modules/json/json.py +++ b/src/datasets/packaged_modules/json/json.py @@ -82,6 +82,10 @@ def _split_generators(self, dl_manager): def _cast_table(self, pa_table: pa.Table) -> pa.Table: if self.config.features is not None: + # adding missing columns + for column_name in set(self.config.features) - set(pa_table.column_names): + type = self.config.features.arrow_schema.field(column_name).type + pa_table = pa_table.append_column(column_name, pa.array([None] * len(pa_table), type=type)) # more expensive cast to support nested structures with keys in a different order # allows str <-> int/float or str to Audio for example pa_table = table_cast(pa_table, self.config.features.arrow_schema) @@ -128,19 +132,6 @@ def _generate_tables(self, files): pa_table = paj.read_json( io.BytesIO(batch), read_options=paj.ReadOptions(block_size=block_size) ) - if self.config.features is not None: - feature_columns = set(self.config.features.arrow_schema.names) - pa_columns = set(pa_table.column_names) - missing_columns = feature_columns - pa_columns - if missing_columns: # some columns are missing - num_rows = len(pa_table) - for column_name in missing_columns: - # pa_table = pa_table.append_column(column_name, pa.nulls(num_rows)) - type = self.config.features.arrow_schema.field(column_name).type - pa_table = pa_table.append_column( - column_name, pa.array([None] * num_rows, type=type) - ) - break except (pa.ArrowInvalid, pa.ArrowNotImplementedError) as e: if (