Skip to content

Commit

Permalink
Add total_bounds of layer to read_info (#281)
Browse files Browse the repository at this point in the history
  • Loading branch information
theroggy authored Sep 25, 2023
1 parent dd88934 commit f39ddf8
Show file tree
Hide file tree
Showing 6 changed files with 136 additions and 25 deletions.
12 changes: 11 additions & 1 deletion CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,10 @@
unknown count for a data layer (e.g., OSM driver); this may have significant
performance impacts for some data sources that would otherwise return an
unknown count (count is used in `read_info`, `read`, `read_dataframe`) (#271).
- In `read_info`, the result now also contains the `total_bounds` of the layer as well
as some extra `capabilities` of the data source driver (#281)
- Raise error if `read` or `read_dataframe` is called with parameters to read no
columns, geometry, or fids (#280)

- Automatically detect supported driver by extension for all available
write drivers and addition of `detect_write_driver` (#270)

Expand All @@ -25,6 +26,15 @@
- Always raise an exception if there is an error when writing a data source
(#284)

### Potentially breaking changes

- In `read_info` (#281):
- the `features` property in the result will now be -1 if calculating the
feature count is an expensive operation for this driver. You can force it to be
calculated using the `force_feature_count` parameter.
- for boolean values in the `capabilities` property, the values will now be
booleans instead of 1 or 0.

## 0.6.0 (2023-04-27)

### Improvements
Expand Down
53 changes: 44 additions & 9 deletions pyogrio/_io.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -320,7 +320,7 @@ cdef get_driver(OGRDataSourceH ogr_dataset):
return driver


cdef get_feature_count(OGRLayerH ogr_layer):
cdef get_feature_count(OGRLayerH ogr_layer, int force):
"""Get the feature count of a layer.
If GDAL returns an unknown count (-1), this iterates over every feature
Expand All @@ -329,6 +329,8 @@ cdef get_feature_count(OGRLayerH ogr_layer):
Parameters
----------
ogr_layer : pointer to open OGR layer
force : bool
True if the feature count should be computed even if it is expensive
Returns
-------
Expand All @@ -337,12 +339,12 @@ cdef get_feature_count(OGRLayerH ogr_layer):
"""

cdef OGRFeatureH ogr_feature = NULL
cdef int feature_count = OGR_L_GetFeatureCount(ogr_layer, 1)
cdef int feature_count = OGR_L_GetFeatureCount(ogr_layer, force)

# if GDAL refuses to give us the feature count, we have to loop over all
# features ourselves and get the count. This can happen for some drivers
# (e.g., OSM) or if a where clause is invalid but not rejected as error
if feature_count == -1:
if force and feature_count == -1:
# make sure layer is read from beginning
OGR_L_ResetReading(ogr_layer)

Expand Down Expand Up @@ -376,6 +378,34 @@ cdef get_feature_count(OGRLayerH ogr_layer):
return feature_count


cdef get_total_bounds(OGRLayerH ogr_layer, int force):
    """Return the overall extent of a layer.

    Parameters
    ----------
    ogr_layer : pointer to open OGR layer
    force : bool
        True if the total bounds should be computed even if it is expensive

    Returns
    -------
    tuple of (xmin, ymin, xmax, ymax) or None
        The total bounds of the layer, or None if they could not be determined.
    """

    cdef OGREnvelope extent

    try:
        exc_wrap_ogrerr(OGR_L_GetExtent(ogr_layer, &extent, force))
    except CPLE_BaseError:
        # The extent request was reported as an error: treat the bounds as
        # unknown instead of propagating the exception to the caller.
        return None

    return (extent.MinX, extent.MinY, extent.MaxX, extent.MaxY)


cdef set_metadata(GDALMajorObjectH obj, object metadata):
"""Set metadata on a dataset or layer
Expand Down Expand Up @@ -598,7 +628,7 @@ cdef validate_feature_range(OGRLayerH ogr_layer, int skip_features=0, int max_fe
skip_features : number of features to skip from beginning of available range
max_features : maximum number of features to read from available range
"""
feature_count = get_feature_count(ogr_layer)
feature_count = get_feature_count(ogr_layer, 1)
num_features = max_features

if feature_count == 0:
Expand Down Expand Up @@ -1369,7 +1399,9 @@ def ogr_read_info(
str path,
dataset_kwargs,
object layer=None,
object encoding=None):
object encoding=None,
int force_feature_count=False,
int force_total_bounds=False):

cdef const char *path_c = NULL
cdef char **dataset_options = NULL
Expand Down Expand Up @@ -1404,12 +1436,15 @@ def ogr_read_info(
'fields': fields[:,2], # return only names
'dtypes': fields[:,3],
'geometry_type': get_geometry_type(ogr_layer),
'features': get_feature_count(ogr_layer),
'features': get_feature_count(ogr_layer, force_feature_count),
'total_bounds': get_total_bounds(ogr_layer, force_total_bounds),
'driver': get_driver(ogr_dataset),
"capabilities": {
"random_read": OGR_L_TestCapability(ogr_layer, OLCRandomRead),
"fast_set_next_by_index": OGR_L_TestCapability(ogr_layer, OLCFastSetNextByIndex),
"fast_spatial_filter": OGR_L_TestCapability(ogr_layer, OLCFastSpatialFilter),
"random_read": OGR_L_TestCapability(ogr_layer, OLCRandomRead) == 1,
"fast_set_next_by_index": OGR_L_TestCapability(ogr_layer, OLCFastSetNextByIndex) == 1,
"fast_spatial_filter": OGR_L_TestCapability(ogr_layer, OLCFastSpatialFilter) == 1,
"fast_feature_count": OGR_L_TestCapability(ogr_layer, OLCFastFeatureCount) == 1,
"fast_total_bounds": OGR_L_TestCapability(ogr_layer, OLCFastGetExtent) == 1,
},
'layer_metadata': get_metadata(ogr_layer),
'dataset_metadata': get_metadata(ogr_dataset),
Expand Down
3 changes: 3 additions & 0 deletions pyogrio/_ogr.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -280,6 +280,7 @@ cdef extern from "ogr_api.h":
const char* OGR_L_GetName(OGRLayerH layer)
const char* OGR_L_GetFIDColumn(OGRLayerH layer)
const char* OGR_L_GetGeometryColumn(OGRLayerH layer)
OGRErr OGR_L_GetExtent(OGRLayerH layer, OGREnvelope *psExtent, int bForce)
OGRSpatialReferenceH OGR_L_GetSpatialRef(OGRLayerH layer)
int OGR_L_TestCapability(OGRLayerH layer, const char *name)
OGRFeatureDefnH OGR_L_GetLayerDefn(OGRLayerH layer)
Expand All @@ -301,6 +302,8 @@ cdef extern from "ogr_api.h":
const char* OLCRandomRead
const char* OLCFastSetNextByIndex
const char* OLCFastSpatialFilter
const char* OLCFastFeatureCount
const char* OLCFastGetExtent
const char* OLCTransactions


Expand Down
41 changes: 34 additions & 7 deletions pyogrio/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,11 +184,27 @@ def read_bounds(
return result


def read_info(path_or_buffer, /, layer=None, encoding=None, **kwargs):
def read_info(
path_or_buffer,
/,
layer=None,
encoding=None,
force_feature_count=False,
force_total_bounds=False,
**kwargs,
):
"""Read information about an OGR data source.
``crs`` and ``geometry`` will be ``None`` and ``features`` will be 0 for a
nonspatial layer.
``crs``, ``geometry`` and ``total_bounds`` will be ``None`` and ``features`` will be
0 for a nonspatial layer.
``features`` will be -1 if this is an expensive operation for this driver. You can
force it to be calculated using the ``force_feature_count`` parameter.
``total_bounds`` is the 2-dimensional extent of all features within the dataset:
(xmin, ymin, xmax, ymax). It will be None if this is an expensive operation for this
driver or if the data source is nonspatial. You can force it to be calculated using
the ``force_total_bounds`` parameter.
Parameters
----------
Expand All @@ -199,6 +215,10 @@ def read_info(path_or_buffer, /, layer=None, encoding=None, **kwargs):
If present, will be used as the encoding for reading string values from
the data source, unless encoding can be inferred directly from the data
source.
force_feature_count : bool, optional (default: False)
True if the feature count should be computed even if it is expensive.
force_total_bounds : bool, optional (default: False)
True if the total bounds should be computed even if it is expensive.
**kwargs
Additional driver-specific dataset open options passed to OGR. Invalid
options will trigger a warning.
Expand All @@ -214,10 +234,12 @@ def read_info(path_or_buffer, /, layer=None, encoding=None, **kwargs):
"dtypes": <ndarray of field dtypes>,
"encoding": "<encoding>",
"geometry": "<geometry type>",
"features": <feature count>,
"features": <feature count or -1>,
"total_bounds": <tuple with total bounds or None>,
"driver": "<driver>",
"dataset_metadata" "<dict of dataset metadata or None>"
"layer_metadata" "<dict of layer metadata or None>"
"capabilities": "<dict of driver capabilities>"
"dataset_metadata": "<dict of dataset metadata or None>"
"layer_metadata": "<dict of layer metadata or None>"
}
"""
path, buffer = get_vsi_path(path_or_buffer)
Expand All @@ -226,7 +248,12 @@ def read_info(path_or_buffer, /, layer=None, encoding=None, **kwargs):

try:
result = ogr_read_info(
path, layer=layer, encoding=encoding, dataset_kwargs=dataset_kwargs
path,
layer=layer,
encoding=encoding,
force_feature_count=force_feature_count,
force_total_bounds=force_total_bounds,
dataset_kwargs=dataset_kwargs,
)
finally:
if buffer is not None:
Expand Down
50 changes: 43 additions & 7 deletions pyogrio/tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
)
from pyogrio.core import detect_write_driver
from pyogrio.errors import DataSourceError, DataLayerError
from pyogrio.tests.conftest import prepare_testfile

from pyogrio._env import GDALEnv

Expand Down Expand Up @@ -279,7 +280,13 @@ def test_read_info(naturalearth_lowres):
assert meta["fields"].shape == (5,)
assert meta["dtypes"].tolist() == ["int64", "object", "object", "object", "float64"]
assert meta["features"] == 177
assert allclose(meta["total_bounds"], (-180, -90, 180, 83.64513))
assert meta["driver"] == "ESRI Shapefile"
assert meta["capabilities"]["random_read"] is True
assert meta["capabilities"]["fast_set_next_by_index"] is True
assert meta["capabilities"]["fast_spatial_filter"] is False
assert meta["capabilities"]["fast_feature_count"] is True
assert meta["capabilities"]["fast_total_bounds"] is True


@pytest.mark.parametrize(
Expand Down Expand Up @@ -321,19 +328,48 @@ def test_read_info_invalid_dataset_kwargs(naturalearth_lowres):

def test_read_info_force_feature_count_exception(data_dir):
with pytest.raises(DataLayerError, match="Could not iterate over features"):
read_info(data_dir / "sample.osm.pbf", layer="lines")
read_info(data_dir / "sample.osm.pbf", layer="lines", force_feature_count=True)


def test_read_info_force_feature_count(data_dir):
@pytest.mark.parametrize(
"layer, force, expected",
[
("points", False, -1),
("points", True, 8),
("lines", False, -1),
("lines", True, 36),
],
)
def test_read_info_force_feature_count(data_dir, layer, force, expected):
# the sample OSM file has non-increasing node IDs which causes the default
# custom indexing to raise an exception iterating over features
meta = read_info(data_dir / "sample.osm.pbf", USE_CUSTOM_INDEXING=False)
assert meta["features"] == 8

meta = read_info(
data_dir / "sample.osm.pbf", layer="lines", USE_CUSTOM_INDEXING=False
data_dir / "sample.osm.pbf",
layer=layer,
force_feature_count=force,
USE_CUSTOM_INDEXING=False,
)
assert meta["features"] == 36
assert meta["features"] == expected


@pytest.mark.parametrize(
    "force_total_bounds, expected_total_bounds",
    [(True, (-180.0, -90.0, 180.0, 83.64513)), (False, None)],
)
def test_read_info_force_total_bounds(
    tmpdir, naturalearth_lowres, force_total_bounds, expected_total_bounds
):
    """Check ``total_bounds`` from ``read_info`` with and without forcing."""
    # GeoJSON files don't have a fast way to determine total_bounds, so the
    # result depends on whether the computation is forced.
    geojson_path = prepare_testfile(naturalearth_lowres, dst_dir=tmpdir, ext=".geojson")
    total_bounds = read_info(geojson_path, force_total_bounds=force_total_bounds)[
        "total_bounds"
    ]

    if expected_total_bounds is None:
        assert total_bounds is None
    else:
        assert allclose(total_bounds, expected_total_bounds)


def test_read_info_without_geometry(test_fgdb_vsi):
    """Check that ``total_bounds`` is None for the ``test_fgdb_vsi`` fixture."""
    info = read_info(test_fgdb_vsi)
    assert info["total_bounds"] is None


@pytest.mark.parametrize(
Expand Down
2 changes: 1 addition & 1 deletion pyogrio/tests/test_raw_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -490,7 +490,7 @@ def test_write_append_unsupported(tmpdir, naturalearth_lowres, driver, ext):

assert os.path.exists(filename)

assert read_info(filename)["features"] == 177
assert read_info(filename, force_feature_count=True)["features"] == 177

with pytest.raises(DataSourceError):
write(filename, geometry, field_data, driver=driver, append=True, **meta)
Expand Down

0 comments on commit f39ddf8

Please sign in to comment.