diff --git a/CHANGES.md b/CHANGES.md index 50539517..e2d9ff50 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -9,9 +9,10 @@ unknown count for a data layer (e.g., OSM driver); this may have signficant performance impacts for some data sources that would otherwise return an unknown count (count is used in `read_info`, `read`, `read_dataframe`) (#271). +- In `read_info`, the result now also contains the `total_bounds` of the layer as well + as some extra `capabilities` of the data source driver (#281) - Raise error if `read` or `read_dataframe` is called with parameters to read no columns, geometry, or fids (#280) - - Automatically detect supported driver by extension for all available write drivers and addition of `detect_write_driver` (#270) @@ -25,6 +26,15 @@ - Always raise an exception if there is an error when writing a data source (#284) +### Potentially breaking changes + +- In `read_info` (#281): + - the `features` property in the result will now be -1 if calculating the + feature count is an expensive operation for this driver. You can force it to be + calculated using the `force_feature_count` parameter. + - for boolean values in the `capabilities` property, the values will now be + booleans instead of 1 or 0. + ## 0.6.0 (2023-04-27) ### Improvements diff --git a/pyogrio/_io.pyx b/pyogrio/_io.pyx index 874453f8..7b75ed8e 100644 --- a/pyogrio/_io.pyx +++ b/pyogrio/_io.pyx @@ -320,7 +320,7 @@ cdef get_driver(OGRDataSourceH ogr_dataset): return driver -cdef get_feature_count(OGRLayerH ogr_layer): +cdef get_feature_count(OGRLayerH ogr_layer, int force): """Get the feature count of a layer. 
If GDAL returns an unknown count (-1), this iterates over every feature @@ -329,6 +329,8 @@ cdef get_feature_count(OGRLayerH ogr_layer): Parameters ---------- ogr_layer : pointer to open OGR layer + force : bool + True if the feature count should be computed even if it is expensive Returns ------- @@ -337,12 +339,12 @@ cdef get_feature_count(OGRLayerH ogr_layer): """ cdef OGRFeatureH ogr_feature = NULL - cdef int feature_count = OGR_L_GetFeatureCount(ogr_layer, 1) + cdef int feature_count = OGR_L_GetFeatureCount(ogr_layer, force) # if GDAL refuses to give us the feature count, we have to loop over all # features ourselves and get the count. This can happen for some drivers # (e.g., OSM) or if a where clause is invalid but not rejected as error - if feature_count == -1: + if force and feature_count == -1: # make sure layer is read from beginning OGR_L_ResetReading(ogr_layer) @@ -376,6 +378,34 @@ cdef get_feature_count(OGRLayerH ogr_layer): return feature_count +cdef get_total_bounds(OGRLayerH ogr_layer, int force): + """Get the total bounds of a layer. + + Parameters + ---------- + ogr_layer : pointer to open OGR layer + force : bool + True if the total bounds should be computed even if it is expensive + + Returns + ------- + tuple of (xmin, ymin, xmax, ymax) or None + The total bounds of the layer, or None if they could not be determined. 
+ """ + + cdef OGREnvelope ogr_envelope + try: + exc_wrap_ogrerr(OGR_L_GetExtent(ogr_layer, &ogr_envelope, force)) + bounds = ( + ogr_envelope.MinX, ogr_envelope.MinY, ogr_envelope.MaxX, ogr_envelope.MaxY + ) + + except CPLE_BaseError: + bounds = None + + return bounds + + cdef set_metadata(GDALMajorObjectH obj, object metadata): """Set metadata on a dataset or layer @@ -598,7 +628,7 @@ cdef validate_feature_range(OGRLayerH ogr_layer, int skip_features=0, int max_fe skip_features : number of features to skip from beginning of available range max_features : maximum number of features to read from available range """ - feature_count = get_feature_count(ogr_layer) + feature_count = get_feature_count(ogr_layer, 1) num_features = max_features if feature_count == 0: @@ -1369,7 +1399,9 @@ def ogr_read_info( str path, dataset_kwargs, object layer=None, - object encoding=None): + object encoding=None, + int force_feature_count=False, + int force_total_bounds=False): cdef const char *path_c = NULL cdef char **dataset_options = NULL @@ -1404,12 +1436,15 @@ def ogr_read_info( 'fields': fields[:,2], # return only names 'dtypes': fields[:,3], 'geometry_type': get_geometry_type(ogr_layer), - 'features': get_feature_count(ogr_layer), + 'features': get_feature_count(ogr_layer, force_feature_count), + 'total_bounds': get_total_bounds(ogr_layer, force_total_bounds), 'driver': get_driver(ogr_dataset), "capabilities": { - "random_read": OGR_L_TestCapability(ogr_layer, OLCRandomRead), - "fast_set_next_by_index": OGR_L_TestCapability(ogr_layer, OLCFastSetNextByIndex), - "fast_spatial_filter": OGR_L_TestCapability(ogr_layer, OLCFastSpatialFilter), + "random_read": OGR_L_TestCapability(ogr_layer, OLCRandomRead) == 1, + "fast_set_next_by_index": OGR_L_TestCapability(ogr_layer, OLCFastSetNextByIndex) == 1, + "fast_spatial_filter": OGR_L_TestCapability(ogr_layer, OLCFastSpatialFilter) == 1, + "fast_feature_count": OGR_L_TestCapability(ogr_layer, OLCFastFeatureCount) == 1, + 
"fast_total_bounds": OGR_L_TestCapability(ogr_layer, OLCFastGetExtent) == 1, }, 'layer_metadata': get_metadata(ogr_layer), 'dataset_metadata': get_metadata(ogr_dataset), diff --git a/pyogrio/_ogr.pxd b/pyogrio/_ogr.pxd index 88f12d9d..38f777ff 100644 --- a/pyogrio/_ogr.pxd +++ b/pyogrio/_ogr.pxd @@ -280,6 +280,7 @@ cdef extern from "ogr_api.h": const char* OGR_L_GetName(OGRLayerH layer) const char* OGR_L_GetFIDColumn(OGRLayerH layer) const char* OGR_L_GetGeometryColumn(OGRLayerH layer) + OGRErr OGR_L_GetExtent(OGRLayerH layer, OGREnvelope *psExtent, int bForce) OGRSpatialReferenceH OGR_L_GetSpatialRef(OGRLayerH layer) int OGR_L_TestCapability(OGRLayerH layer, const char *name) OGRFeatureDefnH OGR_L_GetLayerDefn(OGRLayerH layer) @@ -301,6 +302,8 @@ cdef extern from "ogr_api.h": const char* OLCRandomRead const char* OLCFastSetNextByIndex const char* OLCFastSpatialFilter + const char* OLCFastFeatureCount + const char* OLCFastGetExtent const char* OLCTransactions diff --git a/pyogrio/core.py b/pyogrio/core.py index d190c603..ec91a3c9 100644 --- a/pyogrio/core.py +++ b/pyogrio/core.py @@ -184,11 +184,27 @@ def read_bounds( return result -def read_info(path_or_buffer, /, layer=None, encoding=None, **kwargs): +def read_info( + path_or_buffer, + /, + layer=None, + encoding=None, + force_feature_count=False, + force_total_bounds=False, + **kwargs, +): """Read information about an OGR data source. - ``crs`` and ``geometry`` will be ``None`` and ``features`` will be 0 for a - nonspatial layer. + ``crs``, ``geometry`` and ``total_bounds`` will be ``None`` and ``features`` will be + 0 for a nonspatial layer. + + ``features`` will be -1 if this is an expensive operation for this driver. You can + force it to be calculated using the ``force_feature_count`` parameter. + + ``total_bounds`` is the 2-dimensional extent of all features within the dataset: + (xmin, ymin, xmax, ymax). 
It will be None if this is an expensive operation for this + driver or if the data source is nonspatial. You can force it to be calculated using + the ``force_total_bounds`` parameter. Parameters ---------- @@ -199,6 +215,10 @@ def read_info(path_or_buffer, /, layer=None, encoding=None, **kwargs): If present, will be used as the encoding for reading string values from the data source, unless encoding can be inferred directly from the data source. + force_feature_count : bool, optional (default: False) + True if the feature count should be computed even if it is expensive. + force_total_bounds : bool, optional (default: False) + True if the total bounds should be computed even if it is expensive. **kwargs Additional driver-specific dataset open options passed to OGR. Invalid options will trigger a warning. @@ -214,10 +234,12 @@ def read_info(path_or_buffer, /, layer=None, encoding=None, **kwargs): "dtypes": , "encoding": "", "geometry": "", - "features": , + "features": , + "total_bounds": , "driver": "", - "dataset_metadata" "" - "layer_metadata" "" + "capabilities": "" + "dataset_metadata": "" + "layer_metadata": "" } """ path, buffer = get_vsi_path(path_or_buffer) @@ -226,7 +248,12 @@ def read_info(path_or_buffer, /, layer=None, encoding=None, **kwargs): try: result = ogr_read_info( - path, layer=layer, encoding=encoding, dataset_kwargs=dataset_kwargs + path, + layer=layer, + encoding=encoding, + force_feature_count=force_feature_count, + force_total_bounds=force_total_bounds, + dataset_kwargs=dataset_kwargs, ) finally: if buffer is not None: diff --git a/pyogrio/tests/test_core.py b/pyogrio/tests/test_core.py index 9859301e..0a07815e 100644 --- a/pyogrio/tests/test_core.py +++ b/pyogrio/tests/test_core.py @@ -14,6 +14,7 @@ ) from pyogrio.core import detect_write_driver from pyogrio.errors import DataSourceError, DataLayerError +from pyogrio.tests.conftest import prepare_testfile from pyogrio._env import GDALEnv @@ -279,7 +280,13 @@ def 
test_read_info(naturalearth_lowres): assert meta["fields"].shape == (5,) assert meta["dtypes"].tolist() == ["int64", "object", "object", "object", "float64"] assert meta["features"] == 177 + assert allclose(meta["total_bounds"], (-180, -90, 180, 83.64513)) assert meta["driver"] == "ESRI Shapefile" + assert meta["capabilities"]["random_read"] is True + assert meta["capabilities"]["fast_set_next_by_index"] is True + assert meta["capabilities"]["fast_spatial_filter"] is False + assert meta["capabilities"]["fast_feature_count"] is True + assert meta["capabilities"]["fast_total_bounds"] is True @pytest.mark.parametrize( @@ -321,19 +328,48 @@ def test_read_info_invalid_dataset_kwargs(naturalearth_lowres): def test_read_info_force_feature_count_exception(data_dir): with pytest.raises(DataLayerError, match="Could not iterate over features"): - read_info(data_dir / "sample.osm.pbf", layer="lines") + read_info(data_dir / "sample.osm.pbf", layer="lines", force_feature_count=True) -def test_read_info_force_feature_count(data_dir): +@pytest.mark.parametrize( + "layer, force, expected", + [ + ("points", False, -1), + ("points", True, 8), + ("lines", False, -1), + ("lines", True, 36), + ], +) +def test_read_info_force_feature_count(data_dir, layer, force, expected): # the sample OSM file has non-increasing node IDs which causes the default # custom indexing to raise an exception iterating over features - meta = read_info(data_dir / "sample.osm.pbf", USE_CUSTOM_INDEXING=False) - assert meta["features"] == 8 - meta = read_info( - data_dir / "sample.osm.pbf", layer="lines", USE_CUSTOM_INDEXING=False + data_dir / "sample.osm.pbf", + layer=layer, + force_feature_count=force, + USE_CUSTOM_INDEXING=False, ) - assert meta["features"] == 36 + assert meta["features"] == expected + + +@pytest.mark.parametrize( + "force_total_bounds, expected_total_bounds", + [(True, (-180.0, -90.0, 180.0, 83.64513)), (False, None)], +) +def test_read_info_force_total_bounds( + tmpdir, naturalearth_lowres, 
force_total_bounds, expected_total_bounds +): + # Geojson files don't have a fast way to determine total_bounds + geojson_path = prepare_testfile(naturalearth_lowres, dst_dir=tmpdir, ext=".geojson") + info = read_info(geojson_path, force_total_bounds=force_total_bounds) + if expected_total_bounds is not None: + assert allclose(info["total_bounds"], expected_total_bounds) + else: + assert info["total_bounds"] is None + + +def test_read_info_without_geometry(test_fgdb_vsi): + assert read_info(test_fgdb_vsi)["total_bounds"] is None @pytest.mark.parametrize( diff --git a/pyogrio/tests/test_raw_io.py b/pyogrio/tests/test_raw_io.py index aa65fb5e..3bb87c55 100644 --- a/pyogrio/tests/test_raw_io.py +++ b/pyogrio/tests/test_raw_io.py @@ -490,7 +490,7 @@ def test_write_append_unsupported(tmpdir, naturalearth_lowres, driver, ext): assert os.path.exists(filename) - assert read_info(filename)["features"] == 177 + assert read_info(filename, force_feature_count=True)["features"] == 177 with pytest.raises(DataSourceError): write(filename, geometry, field_data, driver=driver, append=True, **meta)