diff --git a/python/src/space/core/manifests/index.py b/python/src/space/core/manifests/index.py
index e4368fe..2ed7bea 100644
--- a/python/src/space/core/manifests/index.py
+++ b/python/src/space/core/manifests/index.py
@@ -35,9 +35,15 @@
 _MAX_FIELD = "_MAX"
 
 
-def _stats_field_name(name: str) -> str:
-  """Column stats struct field name."""
-  return f"{_STATS_FIELD}_{name}"
+def _stats_field_name(field_id_: int) -> str:
+  """Column stats struct field name.
+
+  It uses the field ID instead of the field name. A manifest file covers all
+  Parquet files and is not tied to a single Parquet schema, so we cannot
+  project table field names onto file field names. Using the field ID ensures
+  that a field is always uniquely identified.
+  """
+  return f"{_STATS_FIELD}_f{field_id_}"
 
 
 def _stats_subfields(type_: pa.DataType) -> List[pa.Field]:
@@ -62,9 +68,10 @@ def _manifest_schema(
     if f.name not in primary_keys_:
       continue
 
+    field_id_ = field_id(f)
     fields.append(
-        (_stats_field_name(f.name), pa.struct(_stats_subfields(f.type))))
-    stats_fields.append((field_id(f), f.type))
+        (_stats_field_name(field_id_), pa.struct(_stats_subfields(f.type))))
+    stats_fields.append((field_id_, f.type))
 
   return pa.schema(fields), stats_fields  # type: ignore[arg-type]
 
@@ -140,9 +147,6 @@ def __init__(self, metadata_dir: str, schema: pa.Schema,
       self._stats_column_ids.append(column_id)
       self._field_stats_dict[column_id] = _FieldStats(type_)
 
-    # Cache for manifest data to materialize.
-    self._manifest_data: List[pa.Table] = []
-
   def write(self, file_path: str,
             parquet_metadata: pq.FileMetaData) -> meta.StorageStatistics:
     """Write a new manifest row.
@@ -182,6 +186,8 @@ def write(self, file_path: str,
   def finish(self) -> Optional[str]:
     """Materialize the manifest file and return the file path."""
     # Convert cached manifest data to Arrow.
+    all_manifest_data: List[pa.Table] = []
+
     if self._file_paths:
       arrays = [
           self._file_paths, self._num_rows, self._index_compressed_bytes,
@@ -191,12 +197,15 @@ def finish(self) -> Optional[str]:
       for column_id in self._stats_column_ids:
         arrays.append(self._field_stats_dict[column_id].to_arrow())
 
-      self._manifest_data.append(
+      all_manifest_data.append(
           pa.Table.from_arrays(
               arrays=arrays,
               schema=self._manifest_schema))  # type: ignore[call-arg]
 
-    manifest_data = pa.concat_tables(self._manifest_data)
+    if not all_manifest_data:
+      return None
+
+    manifest_data = pa.concat_tables(all_manifest_data)
     if manifest_data.num_rows == 0:
       return None
 
diff --git a/python/src/space/core/ops/__init__.py b/python/src/space/core/ops/__init__.py
index 36a92ed..ded89f4 100644
--- a/python/src/space/core/ops/__init__.py
+++ b/python/src/space/core/ops/__init__.py
@@ -12,3 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+"""Space local data operations."""
+
+from space.core.ops.append import LocalAppendOp
diff --git a/python/src/space/core/ops/append.py b/python/src/space/core/ops/append.py
index 7214e9e..1ae8474 100644
--- a/python/src/space/core/ops/append.py
+++ b/python/src/space/core/ops/append.py
@@ -51,7 +51,7 @@ def finish(self) -> Optional[runtime.Patch]:
 
 
 @dataclass
-class IndexWriterInfo:
+class _IndexWriterInfo:
   """Contain information of index file writer."""
   writer: pq.ParquetWriter
   file_path: str
@@ -73,7 +73,7 @@ def __init__(self, location: str, metadata: meta.StorageMetadata):
     self._schema = arrow_schema(self._metadata.schema.fields)
 
     # Data file writers.
- self._index_writer_info: Optional[IndexWriterInfo] = None + self._index_writer_info: Optional[_IndexWriterInfo] = None # Local runtime caches. self._cached_index_data: Optional[pa.Table] = None @@ -98,9 +98,6 @@ def finish(self) -> Optional[runtime.Patch]: Returns: A patch to the storage or None if no actual storage modification happens. """ - if self._patch.storage_statistics_update.num_rows == 0: - return None - # Flush all cached index data. if self._cached_index_data is not None: self._maybe_create_index_writer() @@ -115,6 +112,9 @@ def finish(self) -> Optional[runtime.Patch]: self._patch.added_index_manifest_files.append( self.short_path(index_manifest_full_path)) + if self._patch.storage_statistics_update.num_rows == 0: + return None + return self._patch def _append_arrow(self, data: pa.Table) -> None: @@ -150,7 +150,7 @@ def _maybe_create_index_writer(self) -> None: if self._index_writer_info is None: full_file_path = paths.new_index_file_path(self._data_dir) writer = pq.ParquetWriter(full_file_path, self._schema) - self._index_writer_info = IndexWriterInfo( + self._index_writer_info = _IndexWriterInfo( writer, self.short_path(full_file_path)) def _finish_index_writer(self) -> None: diff --git a/python/src/space/core/ops/utils.py b/python/src/space/core/ops/utils.py index b7e3f9f..b619c49 100644 --- a/python/src/space/core/ops/utils.py +++ b/python/src/space/core/ops/utils.py @@ -22,6 +22,6 @@ def update_index_storage_statistics( update: meta.StorageStatistics, ) -> None: """Update index storage statistics.""" - base.num_rows += base.num_rows + base.num_rows += update.num_rows base.index_compressed_bytes += update.index_compressed_bytes base.index_uncompressed_bytes += update.index_uncompressed_bytes diff --git a/python/src/space/core/storage.py b/python/src/space/core/storage.py index cf3e894..921b883 100644 --- a/python/src/space/core/storage.py +++ b/python/src/space/core/storage.py @@ -88,6 +88,7 @@ def create( primary_keys: un-enforced primary keys. """ # TODO: to verify that location is an empty directory. + # TODO: to verify primary key fields (and types) are valid. field_id_mgr = FieldIdManager() schema = field_id_mgr.assign_field_ids(schema) diff --git a/python/tests/core/manifests/test_index.py b/python/tests/core/manifests/test_index.py new file mode 100644 index 0000000..fb821ed --- /dev/null +++ b/python/tests/core/manifests/test_index.py @@ -0,0 +1,166 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+from typing import Any, Dict, List
+
+import pyarrow as pa
+import pyarrow.parquet as pq
+
+from space.core.manifests.index import IndexManifestWriter
+from space.core.schema.arrow import field_metadata
+
+_SCHEMA = pa.schema([
+    pa.field("int64", pa.int64(), metadata=field_metadata(0)),
+    pa.field("float64", pa.float64(), metadata=field_metadata(1)),
+    pa.field("bool", pa.bool_(), metadata=field_metadata(2)),
+    pa.field("string", pa.string(), metadata=field_metadata(3))
+])
+
+
+def _write_parquet_file(
+    file_path: str, schema: pa.Schema,
+    batches: List[Dict[str, List[Any]]]) -> pq.FileMetaData:
+  writer = pq.ParquetWriter(file_path, schema)
+  for batch in batches:
+    writer.write_table(pa.Table.from_pydict(batch))
+
+  writer.close()
+  return writer.writer.metadata
+
+
+class TestIndexManifestWriter:
+
+  def test_write_all_types(self, tmp_path):
+    data_dir = tmp_path / "dataset" / "data"
+    data_dir.mkdir(parents=True)
+    metadata_dir = tmp_path / "dataset" / "metadata"
+    metadata_dir.mkdir(parents=True)
+
+    schema = _SCHEMA
+    manifest_writer = IndexManifestWriter(
+        metadata_dir=str(metadata_dir),
+        schema=schema,
+        primary_keys=["int64", "float64", "bool", "string"])
+
+    file_path = str(data_dir / "file0")
+    # TODO: the test should cover all types supported by column stats.
+    manifest_writer.write(
+        file_path,
+        _write_parquet_file(file_path, schema, [{
+            "int64": [1, 2, 3],
+            "float64": [0.1, 0.2, 0.3],
+            "bool": [True, False, False],
+            "string": ["a", "b", "c"]
+        }, {
+            "int64": [0, 10],
+            "float64": [-0.1, 100.0],
+            "bool": [False, False],
+            "string": ["A", "z"]
+        }]))
+    file_path = str(data_dir / "file1")
+    manifest_writer.write(
+        file_path,
+        _write_parquet_file(file_path, schema, [{
+            "int64": [1000, 1000000],
+            "float64": [-0.001, 0.001],
+            "bool": [False, False],
+            "string": ["abcedf", "ABCDEF"]
+        }]))
+
+    manifest_path = manifest_writer.finish()
+
+    data_dir_str = str(data_dir)
+    assert manifest_path is not None
+    assert pq.read_table(manifest_path).to_pydict() == {
+        "_FILE": [f"{data_dir_str}/file0", f"{data_dir_str}/file1"],
+        "_INDEX_COMPRESSED_BYTES": [645, 334],
+        "_INDEX_UNCOMPRESSED_BYTES": [624, 320],
+        "_NUM_ROWS": [5, 2],
+        "_STATS_f0": [{
+            "_MAX": 10,
+            "_MIN": 0
+        }, {
+            "_MAX": 1000000,
+            "_MIN": 1000
+        }],
+        "_STATS_f1": [{
+            "_MAX": 100.0,
+            "_MIN": -0.1
+        }, {
+            "_MAX": 0.001,
+            "_MIN": -0.001
+        }],
+        "_STATS_f2": [{
+            "_MAX": True,
+            "_MIN": False
+        }, {
+            "_MAX": False,
+            "_MIN": False
+        }],
+        "_STATS_f3": [{
+            "_MAX": "z",
+            "_MIN": "A"
+        }, {
+            "_MAX": "abcedf",
+            "_MIN": "ABCDEF"
+        }]
+    }
+
+  def test_write_collect_stats_for_primary_keys_only(self, tmp_path):
+    data_dir = tmp_path / "dataset" / "data"
+    data_dir.mkdir(parents=True)
+    metadata_dir = tmp_path / "dataset" / "metadata"
+    metadata_dir.mkdir(parents=True)
+
+    schema = _SCHEMA
+    manifest_writer = IndexManifestWriter(metadata_dir=str(metadata_dir),
+                                          schema=schema,
+                                          primary_keys=["int64"])
+
+    file_path = str(data_dir / "file0")
+    # TODO: the test should cover all types supported by column stats.
+ manifest_writer.write( + file_path, + _write_parquet_file(file_path, schema, [{ + "int64": [1, 2, 3], + "float64": [0.1, 0.2, 0.3], + "bool": [True, False, False], + "string": ["a", "b", "c"] + }])) + + manifest_path = manifest_writer.finish() + + data_dir_str = str(data_dir) + assert manifest_path is not None + assert pq.read_table(manifest_path).to_pydict() == { + "_FILE": [f"{data_dir_str}/file0"], + "_INDEX_COMPRESSED_BYTES": [110], + "_INDEX_UNCOMPRESSED_BYTES": [109], + "_NUM_ROWS": [3], + "_STATS_f0": [{ + "_MAX": 3, + "_MIN": 1 + }] + } + + def test_empty_manifest_should_return_none(self, tmp_path): + metadata_dir = tmp_path / "dataset" / "metadata" + metadata_dir.mkdir(parents=True) + + schema = _SCHEMA + manifest_writer = IndexManifestWriter(metadata_dir=str(metadata_dir), + schema=schema, + primary_keys=["int64"]) + + assert manifest_writer.finish() is None diff --git a/python/tests/core/ops/test_append.py b/python/tests/core/ops/test_append.py new file mode 100644 index 0000000..23122cd --- /dev/null +++ b/python/tests/core/ops/test_append.py @@ -0,0 +1,85 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pyarrow as pa +import pyarrow.parquet as pq + +from space.core.ops import LocalAppendOp +import space.core.proto.metadata_pb2 as meta +from space.core.storage import Storage + + +class TestLocalAppendOp: + + def test_write_pydict_all_types(self, tmp_path): + location = tmp_path / "dataset" + schema = pa.schema([ + pa.field("int64", pa.int64()), + pa.field("float64", pa.float64()), + pa.field("bool", pa.bool_()), + pa.field("string", pa.string()) + ]) + storage = Storage.create(location=str(location), + schema=schema, + primary_keys=["int64"]) + + op = LocalAppendOp(str(location), storage.metadata) + + # TODO: the test should cover all types supported by column stats. 
+    op.write({
+        "int64": [1, 2, 3],
+        "float64": [0.1, 0.2, 0.3],
+        "bool": [True, False, False],
+        "string": ["a", "b", "c"]
+    })
+    op.write({
+        "int64": [0, 10],
+        "float64": [-0.1, 100.0],
+        "bool": [False, False],
+        "string": ["A", "z"]
+    })
+
+    patch = op.finish()
+    assert patch is not None
+
+    index_manifests = []
+    for f in patch.added_index_manifest_files:
+      index_manifests.append(pq.read_table(storage.full_path(f)))
+
+    index_manifest = pa.concat_tables(index_manifests).to_pydict()
+    assert "_FILE" in index_manifest
+
+    assert index_manifest == {
+        "_FILE": index_manifest["_FILE"],
+        "_INDEX_COMPRESSED_BYTES": [114],
+        "_INDEX_UNCOMPRESSED_BYTES": [126],
+        "_NUM_ROWS": [5],
+        "_STATS_f0": [{
+            "_MAX": 10,
+            "_MIN": 0
+        }]
+    }
+
+    assert patch.storage_statistics_update == meta.StorageStatistics(
+        num_rows=5, index_compressed_bytes=114, index_uncompressed_bytes=126)
+
+  def test_empty_op_should_return_none(self, tmp_path):
+    location = tmp_path / "dataset"
+    schema = pa.schema([pa.field("int64", pa.int64())])
+    storage = Storage.create(location=str(location),
+                             schema=schema,
+                             primary_keys=["int64"])
+
+    op = LocalAppendOp(str(location), storage.metadata)
+    assert op.finish() is None
diff --git a/python/tests/core/ops/test_utils.py b/python/tests/core/ops/test_utils.py
new file mode 100644
index 0000000..54aa3c9
--- /dev/null
+++ b/python/tests/core/ops/test_utils.py
@@ -0,0 +1,46 @@
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from space.core.ops import utils
+from space.core.proto import metadata_pb2 as meta
+
+
+def test_update_index_storage_statistics_positive():
+  base = meta.StorageStatistics(num_rows=100,
+                                index_compressed_bytes=200,
+                                index_uncompressed_bytes=300)
+
+  utils.update_index_storage_statistics(
+      base,
+      meta.StorageStatistics(num_rows=10,
+                             index_compressed_bytes=20,
+                             index_uncompressed_bytes=30))
+  assert base == meta.StorageStatistics(num_rows=110,
+                                        index_compressed_bytes=220,
+                                        index_uncompressed_bytes=330)
+
+
+def test_update_index_storage_statistics_negative():
+  base = meta.StorageStatistics(num_rows=100,
+                                index_compressed_bytes=200,
+                                index_uncompressed_bytes=300)
+
+  utils.update_index_storage_statistics(
+      base,
+      meta.StorageStatistics(num_rows=-10,
+                             index_compressed_bytes=-20,
+                             index_uncompressed_bytes=-30))
+  assert base == meta.StorageStatistics(num_rows=90,
+                                        index_compressed_bytes=180,
+                                        index_uncompressed_bytes=270)
diff --git a/python/tests/core/schema/__init__.py b/python/tests/core/schema/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/python/tests/core/schema/conftest.py b/python/tests/core/schema/conftest.py
new file mode 100644
index 0000000..4205696
--- /dev/null
+++ b/python/tests/core/schema/conftest.py
@@ -0,0 +1,94 @@
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +import pyarrow as pa +from substrait.type_pb2 import NamedStruct, Type + +from space.core.schema.arrow import field_metadata + + +@pytest.fixture +def sample_substrait_fields(): + return NamedStruct( + names=[ + "float32", "list", "struct", "int64", "float64", "list_struct", + "bool", "struct_list", "list", "binary" + ], + struct=Type.Struct(types=[ + Type(fp32=Type.FP32(type_variation_reference=100)), + Type(list=Type.List(type=Type(i32=Type.I32( + type_variation_reference=110)), + type_variation_reference=120)), + Type(struct=Type.Struct(types=[ + Type(i64=Type.I64(type_variation_reference=130)), + Type(fp64=Type.FP64(type_variation_reference=140)) + ], + type_variation_reference=150)), + Type(list=Type.List(type=Type(struct=Type.Struct( + types=[Type(bool=Type.Boolean(type_variation_reference=200))], + type_variation_reference=210)), + type_variation_reference=220)), + Type(struct=Type.Struct(types=[ + Type(list=Type.List(type=Type(string=Type.String( + type_variation_reference=230)), + type_variation_reference=240)), + Type(binary=Type.Binary(type_variation_reference=250)) + ], + type_variation_reference=260)) + ])) + + +@pytest.fixture +def sample_arrow_schema(): + return pa.schema([ + pa.field("float32", pa.float32(), metadata=field_metadata(100)), + pa.field("list", + pa.list_( + pa.field("int32", pa.int32(), + metadata=field_metadata(110))), + metadata=field_metadata(120)), + pa.field("struct", + pa.struct([ + pa.field("int64", pa.int64(), metadata=field_metadata(130)), + pa.field("float64", + pa.float64(), + metadata=field_metadata(140)) + ]), + metadata=field_metadata(150)), + pa.field("list_struct", + pa.list_( + pa.field("struct", + pa.struct([ + pa.field("bool", + pa.bool_(), + metadata=field_metadata(200)) + ]), + metadata=field_metadata(210))), + metadata=field_metadata(220)), + pa.field("struct_list", + pa.struct([ + pa.field("list", + pa.list_( + pa.field("string", + pa.string(), + metadata=field_metadata(230))), + metadata=field_metadata(240)), + pa.field("binary", + pa.binary(), + metadata=field_metadata(250)) + ]), + metadata=field_metadata(260)) + ]) diff --git a/python/tests/core/schema/test_arrow.py b/python/tests/core/schema/test_arrow.py index e34bfd7..64dbfdd 100644 --- a/python/tests/core/schema/test_arrow.py +++ b/python/tests/core/schema/test_arrow.py @@ -14,14 +14,38 @@ import pyarrow as pa -import space.core.schema.arrow as arrow_schema +from space.core.schema import arrow def test_field_metadata(): - assert arrow_schema.field_metadata(123) == {b"PARQUET:field_id": b"123"} + assert arrow.field_metadata(123) == {b"PARQUET:field_id": b"123"} def test_field_id(): - assert arrow_schema.field_id( + assert arrow.field_id( pa.field("name", pa.int64(), metadata={b"PARQUET:field_id": b"123"})) == 123 + + +def test_arrow_schema(sample_substrait_fields, sample_arrow_schema): + assert sample_arrow_schema == arrow.arrow_schema(sample_substrait_fields) + + +def test_field_name_to_id_dict(sample_arrow_schema): + assert arrow.field_name_to_id_dict(sample_arrow_schema) == { + "float32": 100, + "list": 120, + "struct": 150, + "list_struct": 220, + 
"struct_list": 260 + } + + +def test_field_id_to_column_id_dict(sample_arrow_schema): + assert arrow.field_id_to_column_id_dict(sample_arrow_schema) == { + 100: 0, + 120: 1, + 150: 2, + 220: 3, + 260: 4 + } diff --git a/python/tests/core/schema/test_substrait.py b/python/tests/core/schema/test_substrait.py index 37834e3..5c48d28 100644 --- a/python/tests/core/schema/test_substrait.py +++ b/python/tests/core/schema/test_substrait.py @@ -12,79 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -import pyarrow as pa -from substrait.type_pb2 import NamedStruct, Type - -from space.core.schema.arrow import field_metadata from space.core.schema.substrait import substrait_fields -def test_substrait_fields(): - # The value field of list is a field instead of a type to populate field ID. - schema = pa.schema([ - pa.field("float32", pa.float32(), metadata=field_metadata(100)), - pa.field("list", - pa.list_( - pa.field("int32", pa.int32(), - metadata=field_metadata(110))), - metadata=field_metadata(120)), - pa.field("struct", - pa.struct([ - pa.field("int64", pa.int64(), metadata=field_metadata(130)), - pa.field("float64", - pa.float64(), - metadata=field_metadata(140)) - ]), - metadata=field_metadata(150)), - pa.field("list_struct", - pa.list_( - pa.field("struct", - pa.struct([ - pa.field("bool", - pa.bool_(), - metadata=field_metadata(200)) - ]), - metadata=field_metadata(210))), - metadata=field_metadata(220)), - pa.field("struct_list", - pa.struct([ - pa.field("list", - pa.list_( - pa.field("string", - pa.string(), - metadata=field_metadata(230))), - metadata=field_metadata(240)), - pa.field("binary", - pa.binary(), - metadata=field_metadata(250)) - ]), - metadata=field_metadata(260)) - ]) - - assert substrait_fields(schema) == NamedStruct( - names=[ - "float32", "list", "struct", "int64", "float64", "list_struct", - "bool", "struct_list", "list", "binary" - ], - struct=Type.Struct(types=[ - Type(fp32=Type.FP32(type_variation_reference=100)), - Type(list=Type.List(type=Type(i32=Type.I32( - type_variation_reference=110)), - type_variation_reference=120)), - Type(struct=Type.Struct(types=[ - Type(i64=Type.I64(type_variation_reference=130)), - Type(fp64=Type.FP64(type_variation_reference=140)) - ], - type_variation_reference=150)), - Type(list=Type.List(type=Type(struct=Type.Struct( - types=[Type(bool=Type.Boolean(type_variation_reference=200))], - type_variation_reference=210)), - type_variation_reference=220)), - Type(struct=Type.Struct(types=[ - Type(list=Type.List(type=Type(string=Type.String( - type_variation_reference=230)), - type_variation_reference=240)), - Type(binary=Type.Binary(type_variation_reference=250)) - ], - type_variation_reference=260)) - ])) +def test_substrait_fields(sample_arrow_schema, sample_substrait_fields): + assert substrait_fields(sample_arrow_schema) == sample_substrait_fields diff --git a/python/tests/core/test_storage.py b/python/tests/core/test_storage.py index 74db350..90c7e85 100644 --- a/python/tests/core/test_storage.py +++ b/python/tests/core/test_storage.py @@ -21,9 +21,10 @@ from space.core.storage import Storage from space.core.utils.paths import _ENTRY_POINT_FILE -_LOCATION = "location" _SNAPSHOT_ID = 100 -_SCHEMA = pa.schema([pa.field("a", pa.int64()), pa.field("b", pa.string())]) +_SCHEMA = pa.schema( + [pa.field("int64", pa.int64()), + pa.field("string", pa.string())]) class TestStorage: @@ -48,7 +49,7 @@ def metadata(self): @pytest.fixture def storage(self, metadata): - return 
Storage(_LOCATION, metadata) + return Storage("location", metadata) def test_create_dir(self, storage, metadata): assert storage.metadata == metadata @@ -64,12 +65,12 @@ def test_snapshot(self, storage): assert storage.snapshot(snapshot_id=10).snapshot_id == 10 def test_create_storage(self, tmp_path): - dir_path = tmp_path / "test_create_storage" / "dataset" - storage = Storage.create(location=str(dir_path), + location = tmp_path / "dataset" + storage = Storage.create(location=str(location), schema=_SCHEMA, - primary_keys=["a"]) + primary_keys=["int64"]) - entry_point_file = dir_path / "metadata" / _ENTRY_POINT_FILE + entry_point_file = location / "metadata" / _ENTRY_POINT_FILE assert entry_point_file.exists() metadata = storage.metadata @@ -80,22 +81,23 @@ def test_create_storage(self, tmp_path): assert (metadata.create_time == metadata.last_update_time == snapshot.create_time) - assert metadata.schema.fields == NamedStruct( - names=["a", "b"], + assert metadata.schema == meta.Schema(fields=NamedStruct( + names=["int64", "string"], struct=Type.Struct(types=[ Type(i64=Type.I64(type_variation_reference=0)), Type(string=Type.String(type_variation_reference=1)) - ])) + ])), + primary_keys=["int64"]) def test_load_storage(self, tmp_path): - dir_path = tmp_path / "test_create_storage" / "dataset" - storage = Storage.create(location=str(dir_path), + location = tmp_path / "dataset" + storage = Storage.create(location=str(location), schema=_SCHEMA, - primary_keys=["a"]) + primary_keys=["int64"]) - loaded_storage = Storage.load(str(dir_path)) + loaded_storage = Storage.load(str(location)) assert loaded_storage.metadata == storage.metadata def test_load_storage_file_not_found_should_fail(self, tmp_path): with pytest.raises(FileNotFoundError): - Storage.load(str(tmp_path / "test_create_storage" / "dataset")) + Storage.load(str(tmp_path / "dataset"))
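
For reviewers, a quick end-to-end sketch of the behavior this patch changes. This is illustration only, not part of the diff; the dataset location is a hypothetical path, and everything else mirrors the new tests above.

import pyarrow as pa

from space.core.ops import LocalAppendOp
from space.core.storage import Storage

schema = pa.schema([pa.field("int64", pa.int64()),
                    pa.field("string", pa.string())])
storage = Storage.create(location="/tmp/dataset",  # hypothetical path
                         schema=schema,
                         primary_keys=["int64"])

op = LocalAppendOp("/tmp/dataset", storage.metadata)
op.write({"int64": [1, 2, 3], "string": ["a", "b", "c"]})

# finish() now flushes cached index data before the emptiness check, so the
# rows written above are counted; an op with no writes still returns None.
patch = op.finish()
assert patch is not None
assert patch.storage_statistics_update.num_rows == 3

# Manifest stats columns are keyed by field ID rather than field name: the
# "int64" primary key is assigned field ID 0, so its min/max stats land in
# the "_STATS_f0" column, as asserted in the new manifest tests.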