Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: warn when reading a multilayer file without specifying layer #362

6 changes: 5 additions & 1 deletion CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,11 @@

### Improvements

- `read_arrow` and `open_arrow` now provide [GeoArrow-compliant extension metadata](https://geoarrow.org/extension-types.html), including the CRS, when using GDAL 3.8 or higher.
- `read_arrow` and `open_arrow` now provide
[GeoArrow-compliant extension metadata](https://geoarrow.org/extension-types.html),
including the CRS, when using GDAL 3.8 or higher (#366).
- Warn when reading from a multilayer file without specifying a layer (#362).


### Bug fixes

Expand Down
89 changes: 69 additions & 20 deletions pyogrio/_io.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -608,7 +608,7 @@ cdef apply_bbox_filter(OGRLayerH ogr_layer, bbox):
Parameters
----------
ogr_layer : pointer to open OGR layer
bbox: list or tuple of xmin, ymin, xmax, ymax
bbox : list or tuple of xmin, ymin, xmax, ymax

Raises
------
Expand All @@ -629,7 +629,7 @@ cdef apply_geometry_filter(OGRLayerH ogr_layer, wkb):
Parameters
----------
ogr_layer : pointer to open OGR layer
wkb: WKB encoding of geometry
wkb : WKB encoding of geometry
"""

cdef OGRGeometryH ogr_geometry = NULL
Expand Down Expand Up @@ -1124,9 +1124,8 @@ def ogr_read(
ogr_dataset = ogr_open(path_c, 0, dataset_options)

if sql is None:
# layer defaults to index 0
if layer is None:
layer = 0
layer = get_default_layer(ogr_dataset)
ogr_layer = get_ogr_layer(ogr_dataset, layer)
else:
ogr_layer = execute_sql(ogr_dataset, sql, sql_dialect)
Expand Down Expand Up @@ -1315,9 +1314,8 @@ def ogr_open_arrow(
ogr_dataset = ogr_open(path_c, 0, dataset_options)

if sql is None:
# layer defaults to index 0
if layer is None:
layer = 0
layer = get_default_layer(ogr_dataset)
ogr_layer = get_ogr_layer(ogr_dataset, layer)
else:
ogr_layer = execute_sql(ogr_dataset, sql, sql_dialect)
Expand Down Expand Up @@ -1471,11 +1469,10 @@ def ogr_read_bounds(
path_b = path.encode('utf-8')
path_c = path_b

# layer defaults to index 0
if layer is None:
layer = 0

ogr_dataset = ogr_open(path_c, 0, NULL)

if layer is None:
layer = get_default_layer(ogr_dataset)
ogr_layer = get_ogr_layer(ogr_dataset, layer)

# Apply the attribute filter
Expand Down Expand Up @@ -1511,13 +1508,13 @@ def ogr_read_info(
path_b = path.encode('utf-8')
path_c = path_b

# layer defaults to index 0
if layer is None:
layer = 0


try:
dataset_options = dict_to_options(dataset_kwargs)
ogr_dataset = ogr_open(path_c, 0, dataset_options)

if layer is None:
layer = get_default_layer(ogr_dataset)
ogr_layer = get_ogr_layer(ogr_dataset, layer)

# Encoding is derived from the user, from the dataset capabilities / type,
Expand Down Expand Up @@ -1564,15 +1561,71 @@ def ogr_read_info(

def ogr_list_layers(str path):
cdef const char *path_c = NULL
cdef const char *ogr_name = NULL
cdef OGRDataSourceH ogr_dataset = NULL
cdef OGRLayerH ogr_layer = NULL

path_b = path.encode('utf-8')
path_c = path_b

ogr_dataset = ogr_open(path_c, 0, NULL)

layers = get_layer_names(ogr_dataset)

if ogr_dataset != NULL:
GDALClose(ogr_dataset)
ogr_dataset = NULL

return layers


cdef str get_default_layer(OGRDataSourceH ogr_dataset):
""" Get the layer in the dataset that is read by default.

The caller is responsible for closing the dataset.

Parameters
----------
ogr_dataset : pointer to open OGR dataset

Returns
-------
str
the name of the default layer to be read.

"""
layers = get_layer_names(ogr_dataset)
first_layer_name = layers[0][0]

if len(layers) > 1:
dataset_name = os.path.basename(get_string(OGR_DS_GetName(ogr_dataset)))

other_layer_names = ', '.join([f"'{l}'" for l in layers[1:, 0]])
warnings.warn(
f"More than one layer found in '{dataset_name}': '{first_layer_name}' "
f"(default), {other_layer_names}. Specify layer parameter to avoid this "
"warning.",
stacklevel=2,
)

return first_layer_name


cdef get_layer_names(OGRDataSourceH ogr_dataset):
""" Get the layers in the dataset.

The caller is responsible for closing the dataset.

Parameters
----------
ogr_dataset : pointer to open OGR dataset

Returns
-------
ndarray(n)
array of layer names

"""
cdef OGRLayerH ogr_layer = NULL

layer_count = GDALDatasetGetLayerCount(ogr_dataset)

data = np.empty(shape=(layer_count, 2), dtype=object)
Expand All @@ -1583,10 +1636,6 @@ def ogr_list_layers(str path):
data_view[i, 0] = get_string(OGR_L_GetName(ogr_layer))
data_view[i, 1] = get_geometry_type(ogr_layer)

if ogr_dataset != NULL:
GDALClose(ogr_dataset)
ogr_dataset = NULL

return data


Expand Down
2 changes: 2 additions & 0 deletions pyogrio/_ogr.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,8 @@ cdef extern from "ogr_api.h":
OGRDataSourceH OGR_Dr_Open(OGRSFDriverH driver, const char *path, int bupdate)
const char* OGR_Dr_GetName(OGRSFDriverH driver)

const char* OGR_DS_GetName(OGRDataSourceH)

OGRFeatureH OGR_F_Create(OGRFeatureDefnH featuredefn)
void OGR_F_Destroy(OGRFeatureH feature)

Expand Down
7 changes: 7 additions & 0 deletions pyogrio/tests/test_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,12 @@ def test_read_arrow(naturalearth_lowres_all_ext):
assert_geodataframe_equal(result, expected, check_less_precise=check_less_precise)


def test_read_arrow_unspecified_layer_warning(data_dir):
"""Reading a multi-layer file without specifying a layer gives a warning."""
with pytest.warns(UserWarning, match="More than one layer found "):
read_arrow(data_dir / "sample.osm.pbf")


@pytest.mark.parametrize("skip_features, expected", [(10, 167), (200, 0)])
def test_read_arrow_skip_features(naturalearth_lowres, skip_features, expected):
table = read_arrow(naturalearth_lowres, skip_features=skip_features)[1]
Expand Down Expand Up @@ -117,6 +123,7 @@ def test_read_arrow_to_pandas_kwargs(test_fgdb_vsi):
arrow_to_pandas_kwargs = {"strings_to_categorical": True}
result = read_dataframe(
test_fgdb_vsi,
layer="basetable_2",
use_arrow=True,
arrow_to_pandas_kwargs=arrow_to_pandas_kwargs,
)
Expand Down
14 changes: 13 additions & 1 deletion pyogrio/tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,12 @@ def test_read_bounds_max_features(naturalearth_lowres):
assert bounds.shape == (4, 2)


def test_read_bounds_unspecified_layer_warning(data_dir):
"""Reading a multi-layer file without specifying a layer gives a warning."""
with pytest.warns(UserWarning, match="More than one layer found "):
read_bounds(data_dir / "sample.osm.pbf")


def test_read_bounds_negative_max_features(naturalearth_lowres):
with pytest.raises(ValueError, match="'max_features' must be >= 0"):
read_bounds(naturalearth_lowres, max_features=-1)
Expand Down Expand Up @@ -451,8 +457,14 @@ def test_read_info_force_total_bounds(
assert info["total_bounds"] is None


def test_read_info_unspecified_layer_warning(data_dir):
"""Reading a multi-layer file without specifying a layer gives a warning."""
with pytest.warns(UserWarning, match="More than one layer found "):
read_info(data_dir / "sample.osm.pbf")


def test_read_info_without_geometry(test_fgdb_vsi):
assert read_info(test_fgdb_vsi)["total_bounds"] is None
assert read_info(test_fgdb_vsi, layer="basetable_2")["total_bounds"] is None


@pytest.mark.parametrize(
Expand Down
5 changes: 4 additions & 1 deletion pyogrio/tests/test_geopandas_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,7 @@ def test_read_force_2d(test_fgdb_vsi, use_arrow):


@pytest.mark.filterwarnings("ignore: Measured")
@pytest.mark.filterwarnings("ignore: More than one layer found in")
def test_read_layer(test_fgdb_vsi, use_arrow):
layers = list_layers(test_fgdb_vsi)
kwargs = {"use_arrow": use_arrow, "read_geometry": False, "max_features": 1}
Expand Down Expand Up @@ -251,7 +252,9 @@ def test_read_write_datetime_tz_with_nulls(tmp_path):


def test_read_null_values(test_fgdb_vsi, use_arrow):
df = read_dataframe(test_fgdb_vsi, use_arrow=use_arrow, read_geometry=False)
df = read_dataframe(
test_fgdb_vsi, layer="basetable_2", use_arrow=use_arrow, read_geometry=False
)

# make sure that Null values are preserved
assert df.SEGMENT_NAME.isnull().max()
Expand Down
6 changes: 6 additions & 0 deletions pyogrio/tests/test_raw_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,12 @@ def test_read_autodetect_driver(tmp_path, naturalearth_lowres, ext):
assert len(geometry) == len(fields[0])


def test_read_arrow_unspecified_layer_warning(data_dir):
"""Reading a multi-layer file without specifying a layer gives a warning."""
with pytest.warns(UserWarning, match="More than one layer found "):
read(data_dir / "sample.osm.pbf")


def test_read_invalid_layer(naturalearth_lowres):
with pytest.raises(DataLayerError, match="Layer 'invalid' could not be opened"):
read(naturalearth_lowres, layer="invalid")
Expand Down