diff --git a/.github/workflows/cmake_builds.yml b/.github/workflows/cmake_builds.yml index af93a78bb77d..5cdacd14e71f 100644 --- a/.github/workflows/cmake_builds.yml +++ b/.github/workflows/cmake_builds.yml @@ -83,6 +83,13 @@ jobs: && sudo make install sudo ldconfig # + # Install Arrow C++ + sudo apt-get install -y -V ca-certificates lsb-release wget + wget https://apache.jfrog.io/artifactory/arrow/$(lsb_release --id --short | tr 'A-Z' 'a-z')/apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb + sudo apt-get install -y -V ./apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb + sudo apt-get update + sudo apt-get install -y -V libarrow-dev libparquet-dev + # # Workaround bug in ogdi packaging sudo ln -s /usr/lib/ogdi/libvrf.so /usr/lib # @@ -344,7 +351,8 @@ jobs: libnetcdf openjpeg poppler libtiff libpng xerces-c expat libxml2 kealib json-c \ cfitsio freexl geotiff jpeg libpq libspatialite libwebp-base pcre pcre2 postgresql \ sqlite tiledb zstd cryptopp cgal doxygen librttopo libkml openssl xz \ - openjdk ant qhull armadillo blas blas-devel libblas libcblas liblapack liblapacke blosc + openjdk ant qhull armadillo blas blas-devel libblas libcblas liblapack liblapacke blosc \ + "arrow-cpp>=7.0.0" cd $CONDA_PREFIX/Library/share/proj curl http://download.osgeo.org/proj/proj-datumgrid-1.8.tar.gz > proj-datumgrid-1.8.tar.gz tar xvzf proj-datumgrid-1.8.tar.gz diff --git a/autotest/generate_parquet_test_file.py b/autotest/generate_parquet_test_file.py new file mode 100644 index 000000000000..77597d2e20b0 --- /dev/null +++ b/autotest/generate_parquet_test_file.py @@ -0,0 +1,278 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +############################################################################### +# $Id$ +# +# Project: GDAL/OGR Test Suite +# Purpose: Test read functionality for OGR Parquet driver. 
+# Author: Even Rouault +# +############################################################################### +# Copyright (c) 2022, Planet Labs +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. 
+############################################################################### + +wkt_epsg_4326 = 'GEOGCRS["WGS 84",ENSEMBLE["World Geodetic ' + \ + 'System 1984 ensemble",MEMBER["World Geodetic ' + \ + 'System 1984 (Transit)"],MEMBER["World ' + \ + 'Geodetic System 1984 (G730)"],MEMBER["World ' + \ + 'Geodetic System 1984 (G873)"],MEMBER["World ' + \ + 'Geodetic System 1984 (G1150)"],MEMBER["World ' + \ + 'Geodetic System 1984 (G1674)"],MEMBER["World ' + \ + 'Geodetic System 1984 (G1762)"],MEMBER["World ' + \ + 'Geodetic System 1984 ' + \ + '(G2139)"],ELLIPSOID["WGS ' + \ + '84",6378137,298.257223563],ENSEMBLEACCURACY[2.0]],CS[ellipsoidal,2],AXIS["geodetic ' + \ + 'latitude (Lat)",north],AXIS["geodetic ' + \ + 'longitude ' + \ + '(Lon)",east],UNIT["degree",0.0174532925199433],USAGE[SCOPE["Horizontal ' + \ + 'component of 3D ' + \ + 'system."],AREA["World."],BBOX[-90,-180,90,180]],ID["EPSG",4326]]' + +def generate_test_parquet(): + import pyarrow as pa + import datetime + import decimal + import json + import pandas as pd + import pathlib + import pyarrow.parquet as pq + import struct + + boolean = pa.array([True, False, None, False, True], type=pa.bool_()) + uint8 = pa.array([None if i == 2 else 1 + i for i in range(5)], type=pa.uint8()) + int8 = pa.array([None if i == 2 else -2 + i for i in range(5)], type=pa.int8()) + uint16 = pa.array([None if i == 2 else 1 + i * 10000 for i in range(5)], type=pa.uint16()) + int16 = pa.array([None if i == 2 else -20000 + i * 10000 for i in range(5)], type=pa.int16()) + uint32 = pa.array([None if i == 2 else 1 + i * 1000000000 for i in range(5)], type=pa.uint32()) + int32 = pa.array([None if i == 2 else -2000000000 + i*1000000000 for i in range(5)], type=pa.int32()) + uint64 = pa.array([None if i == 2 else 1 + i * 100000000000 for i in range(5)], type=pa.uint64()) + int64 = pa.array([None if i == 2 else -200000000000 + i*100000000000 for i in range(5)], type=pa.int64()) + float32 = pa.array([None if i == 2 else 1.5 + i for i 
in range(5)], type=pa.float32()) + float64 = pa.array([None if i == 2 else 1.5 + i for i in range(5)], type=pa.float64()) + string = pa.array(["abcd", "", None, "c", "d"], type=pa.string()) + large_string = pa.array(["abcd", "", None, "c", "d"], type=pa.large_string()) + gmt_plus_2 = datetime.timezone(datetime.timedelta(hours=2)) + timestamp_ms_gmt_plus_2 = pa.array( + [pd.Timestamp(year=2019, month=1, day=1, hour=14, nanosecond=500*1e6, + tz=gmt_plus_2)] * 5, type=pa.timestamp('ms', tz=gmt_plus_2)) + gmt = datetime.timezone(datetime.timedelta(hours=0)) + timestamp_ms_gmt = pa.array( + [pd.Timestamp(year=2019, month=1, day=1, hour=14, nanosecond=500*1e6, + tz=gmt)] * 5, type=pa.timestamp('ms', tz=gmt)) + gmt_minus_0215 = datetime.timezone(datetime.timedelta(hours=-2.25)) + timestamp_ms_gmt_minus_0215 = pa.array( + [pd.Timestamp(year=2019, month=1, day=1, hour=14, nanosecond=500*1e6, + tz=gmt_minus_0215)] * 5, type=pa.timestamp('ms', tz=gmt_minus_0215)) + timestamp_s_no_tz = pa.array( + [pd.Timestamp(year=2019, month=1, day=1, hour=14, nanosecond=500*1e6)] * 5, type=pa.timestamp('s')) + time32_s = pa.array([3600 + 120 + 3,None,3,4,5], type=pa.time32('s')) + time32_ms = pa.array([(3600 + 120 + 3) * 1000 + 456,2,3,4,5], type=pa.time32('ms')) + time64_us = pa.array([(3600 + 120 + 3) * 1e6,None,3,4,5], type=pa.time64('us')) + time64_ns = pa.array([(3600 + 120 + 3) * 1e9 + 456,2,3,4,5], type=pa.time64('ns')) + date32 = pa.array([1,2,3,4,5], type=pa.date32()) + date64 = pa.array([86400*1000,2,3,4,5], type=pa.date64()) + duration_s = pa.array([1,2,3,4,5], type=pa.duration('s')) + duration_ms = pa.array([1,2,3,4,5], type=pa.duration('ms')) + binary = pa.array([b'\x00\x01'] * 5, type=pa.binary()) + large_binary = pa.array([b'\x00\x01'] * 5, type=pa.large_binary()) + fixed_size_binary = pa.array([b'\x00\x01'] * 5, type=pa.binary(2)) + decimal128 = pa.array([decimal.Decimal('1234.567'),decimal.Decimal('-1234.567'),None, + 
decimal.Decimal('1234.567'),decimal.Decimal('-1234.567')], type=pa.decimal128(7,3)) + decimal256 = pa.array([decimal.Decimal('1234.567'),decimal.Decimal('-1234.567'),None, + decimal.Decimal('1234.567'),decimal.Decimal('-1234.567')], type=pa.decimal256(7,3)) + list_boolean = pa.array([None if i == 2 else [None if j == 0 else True if (j % 2) == 0 else False for j in range(i)] for i in range(5)], type=pa.list_(pa.bool_())) + list_uint8 = pa.array([None if i == 2 else [None if j == 0 else j + i * (i-1)//2 for j in range(i)] for i in range(5)], type=pa.list_(pa.uint8())) + list_int8 = pa.array([None if i == 2 else [None if j == 0 else j + i * (i-1)//2 for j in range(i)] for i in range(5)], type=pa.list_(pa.int8())) + list_uint16 = pa.array([None if i == 2 else [None if j == 0 else j + i * (i-1)//2 for j in range(i)] for i in range(5)], type=pa.list_(pa.uint16())) + list_int16 = pa.array([None if i == 2 else [None if j == 0 else j + i * (i-1)//2 for j in range(i)] for i in range(5)], type=pa.list_(pa.int16())) + list_uint32 = pa.array([None if i == 2 else [None if j == 0 else j + i * (i-1)//2 for j in range(i)] for i in range(5)], type=pa.list_(pa.uint32())) + list_int32 = pa.array([None if i == 2 else [None if j == 0 else j + i * (i-1)//2 for j in range(i)] for i in range(5)], type=pa.list_(pa.int32())) + list_uint64 = pa.array([None if i == 2 else [None if j == 0 else j + i * (i-1)//2 for j in range(i)] for i in range(5)], type=pa.list_(pa.uint64())) + list_int64 = pa.array([None if i == 2 else [None if j == 0 else j + i * (i-1)//2 for j in range(i)] for i in range(5)], type=pa.list_(pa.int64())) + list_float32 = pa.array([None if i == 2 else [None if j == 0 else 0.5 + j + i * (i-1)//2 for j in range(i)] for i in range(5)], type=pa.list_(pa.float32())) + list_float64 = pa.array([None if i == 2 else [None if j == 0 else 0.5 + j + i * (i-1)//2 for j in range(i)] for i in range(5)], type=pa.list_(pa.float64())) + list_string = pa.array([None if i == 2 else ["".join(["%c" 
% (65+j+k) for k in range(1+j)]) for j in range(i)] for i in range(5)]) + fixed_size_list_boolean = pa.array([[True, False], [False,True], [True, False], [False,True], [True, False]], type=pa.list_(pa.bool_(), 2)) + fixed_size_list_uint8 = pa.array([[0, 1], [2,3], [4, 5], [6,7], [8, 9]], type=pa.list_(pa.uint8(), 2)) + fixed_size_list_int8 = pa.array([[0, 1], [2,3], [4, 5], [6,7], [8, 9]], type=pa.list_(pa.int8(), 2)) + fixed_size_list_uint16 = pa.array([[0, 1], [2,3], [4, 5], [6,7], [8, 9]], type=pa.list_(pa.uint16(), 2)) + fixed_size_list_int16 = pa.array([[0, 1], [2,3], [4, 5], [6,7], [8, 9]], type=pa.list_(pa.int16(), 2)) + fixed_size_list_uint32 = pa.array([[0, 1], [2,3], [4, 5], [6,7], [8, 9]], type=pa.list_(pa.uint32(), 2)) + fixed_size_list_int32 = pa.array([[0, 1], [2,3], [4, 5], [6,7], [8, 9]], type=pa.list_(pa.int32(), 2)) + fixed_size_list_uint64 = pa.array([[0, 1], [2,3], [4, 5], [6,7], [8, 9]], type=pa.list_(pa.uint64(), 2)) + fixed_size_list_int64 = pa.array([[0, 1], [2,3], [4, 5], [6,7], [8, 9]], type=pa.list_(pa.int64(), 2)) + fixed_size_list_float32 = pa.array([[0, None], [2,3], [4, 5], [6,7], [8, 9]], type=pa.list_(pa.float32(), 2)) + fixed_size_list_float64 = pa.array([[0, None], [2,3], [4, 5], [6,7], [8, 9]], type=pa.list_(pa.float64(), 2)) + fixed_size_list_string = pa.array([["a", "b"], ["c", "d"], ["e", "f"], ["g", "h"], ["i", "j"]], type=pa.list_(pa.string(), 2)) + struct_field = pa.array([{"a": 1, "b": 2.5, "c" : { "d": "e", "f": "g"}, "h":[5,6], "i":3 }] * 5) + + #struct_val = { "a": 5 } + #for i in range(123): + # struct_val = { "a": struct_val } + #struct_field = pa.array([struct_val] * 5) + + map_boolean = pa.array( [[('x', None), ('y', True)],[('z', True)],None,[],[]], type=pa.map_(pa.string(), pa.bool_())) + map_uint8 = pa.array( [[('x', 1), ('y', None)],[('z', 3)],None,[],[]], type=pa.map_(pa.string(), pa.uint8())) + map_int8 = pa.array( [[('x', 1), ('y', None)],[('z', 3)],None,[],[]], type=pa.map_(pa.string(), pa.int8())) + 
map_uint16 = pa.array( [[('x', 1), ('y', None)],[('z', 3)],None,[],[]], type=pa.map_(pa.string(), pa.uint16())) + map_int16 = pa.array( [[('x', 1), ('y', None)],[('z', 3)],None,[],[]], type=pa.map_(pa.string(), pa.int16())) + map_uint32 = pa.array( [[('x', 4*1000*1000*1000), ('y', None)],[('z', 3)],None,[],[]], type=pa.map_(pa.string(), pa.uint32())) + map_int32 = pa.array( [[('x', 2*1000*1000*1000), ('y', None)],[('z', 3)],None,[],[]], type=pa.map_(pa.string(), pa.int32())) + map_uint64 = pa.array( [[('x', 4*1000*1000*1000*1000), ('y', None)],[('z', 3)],None,[],[]], type=pa.map_(pa.string(), pa.uint64())) + map_int64 = pa.array( [[('x', -2*1000*1000*1000*1000), ('y', None)],[('z', 3)],None,[],[]], type=pa.map_(pa.string(), pa.int64())) + map_float32 = pa.array( [[('x', 1.5), ('y', None)],[('z', 3)],None,[],[]], type=pa.map_(pa.string(), pa.float32())) + map_float64 = pa.array( [[('x', 1.5), ('y', None)],[('z', 3)],None,[],[]], type=pa.map_(pa.string(), pa.float64())) + map_string = pa.array( [[('x', 'x_val'), ('y', None)],[('z', 'z_val')],None,[],[]], type=pa.map_(pa.string(), pa.string())) + + indices = pa.array([0, 1, 2, None, 2]) + dictionary = pa.array(['foo', 'bar', 'baz']) + dict = pa.DictionaryArray.from_arrays(indices, dictionary) + + map_list = pa.array( [[('x', []), ('y', [])],[('z', [])],None,[],[]], type=pa.map_(pa.string(), pa.list_(pa.uint32()))) + + geometry = pa.array( [None if i == 1 else (b'\x01\x01\x00\x00\x00' + struct.pack(' +# +############################################################################### +# Copyright (c) 2022, Planet Labs +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to 
do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. +############################################################################### + +import json +import math +from osgeo import gdal, ogr + +import gdaltest +import pytest + +pytestmark = pytest.mark.require_driver('Arrow') + +from . import ogr_parquet + + +def check(test_filename, filename_prefix, dim): + ref_filename = 'data/arrow/from_paleolimbot_geoarrow/' + filename_prefix + dim + '-wkb.feather' + ds_ref = ogr.Open(ref_filename) + lyr_ref = ds_ref.GetLayer(0) + ds = ogr.Open(test_filename) + lyr = ds.GetLayer(0) + assert lyr_ref.GetFeatureCount() == lyr.GetFeatureCount() + while True: + f_ref = lyr_ref.GetNextFeature() + f = lyr.GetNextFeature() + assert (f_ref is None) == (f is None) + if f is None: + break + g = f.GetGeometryRef() + g_ref = f_ref.GetGeometryRef() + assert (g_ref is None) == (g is None) + if g: + if g.IsEmpty(): + assert g.IsEmpty() == g_ref.IsEmpty() + else: + assert g.Equals(g_ref), (g.ExportToIsoWkt(), g_ref.ExportToIsoWkt()) + + +############################################################################### +# Test reading test files from https://github.com/paleolimbot/geoarrow/tree/master/inst/example_feather + + +@pytest.mark.parametrize("filename_prefix", ['point', + 'linestring', + 'polygon', + 'multipoint', + 'multilinestring', + 'multipolygon', + 'geometrycollection']) 
+@pytest.mark.parametrize("dim", ['', '_z', '_m', '_zm']) +def test_ogr_arrow_read_all_geom_types(filename_prefix, dim): + + test_filename = 'data/arrow/from_paleolimbot_geoarrow/' + filename_prefix + dim + '-default.feather' + check(test_filename, filename_prefix, dim) + + +############################################################################### +# Test duplicating test files from https://github.com/paleolimbot/geoarrow/tree/master/inst/example_feather + + +@pytest.mark.parametrize("filename_prefix", ['point', + 'linestring', + 'polygon', + 'multipoint', + 'multilinestring', + 'multipolygon', + 'geometrycollection']) +@pytest.mark.parametrize("dim", ['', '_z', '_m', '_zm']) +@pytest.mark.parametrize("encoding", ['WKB', 'WKT', 'GEOARROW']) +def test_ogr_arrow_write_all_geom_types(filename_prefix, dim, encoding): + + test_filename = 'data/arrow/from_paleolimbot_geoarrow/' + filename_prefix + dim + '-default.feather' + ds_ref = ogr.Open(test_filename) + lyr_ref = ds_ref.GetLayer(0) + + if encoding != 'GEOARROW' or \ + lyr_ref.GetGeomType() not in (ogr.wkbGeometryCollection, + ogr.wkbGeometryCollection25D, + ogr.wkbGeometryCollectionM, + ogr.wkbGeometryCollectionZM): + vsifilename = '/vsimem/test.feather' + with gdaltest.config_option('OGR_ARROW_ALLOW_ALL_DIMS', 'YES'): + gdal.VectorTranslate(vsifilename, test_filename, + dstSRS='EPSG:4326', + reproject=False, + layerCreationOptions = ['GEOMETRY_ENCODING='+encoding]) + check(vsifilename, filename_prefix, dim) + gdal.Unlink(vsifilename) + +############################################################################### +# Read a file with all data types + + +@pytest.mark.parametrize("use_vsi", [False, True]) +def test_ogr_arrow_1(use_vsi): + + filename = 'data/arrow/test.feather' + if use_vsi: + vsifilename = '/vsimem/test.feather' + gdal.FileFromMemBuffer(vsifilename, open(filename, 'rb').read()) + filename = vsifilename + + try: + ogr_parquet._check_test_parquet(filename, + expect_fast_get_extent=False, 
expect_ignore_fields=False) + finally: + if use_vsi: + gdal.Unlink(vsifilename) + +############################################################################### +# Run test_ogrsf on a Feather file + + +def test_ogr_arrow_test_ogrsf_test_feather(): + import test_cli_utilities + if test_cli_utilities.get_test_ogrsf_path() is None: + pytest.skip() + + ret = gdaltest.runexternal(test_cli_utilities.get_test_ogrsf_path() + ' -ro data/arrow/from_paleolimbot_geoarrow/polygon-default.feather') + + assert 'INFO' in ret + assert 'ERROR' not in ret + +############################################################################### +# Run test_ogrsf on an IPC stream file + + +def test_ogr_arrow_test_ogrsf_test_ipc(): + import test_cli_utilities + if test_cli_utilities.get_test_ogrsf_path() is None: + pytest.skip() + + ret = gdaltest.runexternal(test_cli_utilities.get_test_ogrsf_path() + ' -ro data/arrow/from_paleolimbot_geoarrow/polygon-default.ipc') + + assert 'INFO' in ret + assert 'ERROR' not in ret + +############################################################################### +# Run test_ogrsf on an IPC stream file, in streamable mode + + +def test_ogr_arrow_test_ogrsf_test_ipc_streamable(): + import test_cli_utilities + if test_cli_utilities.get_test_ogrsf_path() is None: + pytest.skip() + + ret = gdaltest.runexternal(test_cli_utilities.get_test_ogrsf_path() + ' -ro ARROW_IPC_STREAM:data/arrow/from_paleolimbot_geoarrow/polygon-default.ipc') + + assert 'INFO' in ret + assert 'ERROR' not in ret + + +############################################################################### +# Test write support + + +@pytest.mark.parametrize("use_vsi,batch_size,fid,write_gdal_footer,format,open_as_stream", + [(False, None, None, False, 'FILE', None), + (True, 2, "fid", True, 'FILE', None), + (False, None, None, False, 'STREAM', False), + (False, 2, None, False, 'STREAM', False), + (False, 2, None, False, 'STREAM', True), + ]) +def test_ogr_arrow_write_from_another_dataset(use_vsi, 
batch_size, fid, write_gdal_footer, format, open_as_stream): + + outfilename = '/vsimem/out' if use_vsi else 'tmp/out' + try: + layerCreationOptions = [ 'FORMAT=' + format ] + if batch_size: + layerCreationOptions.append('BATCH_SIZE=' + str(batch_size)) + if fid: + layerCreationOptions.append('FID=' + fid) + with gdaltest.config_option('OGR_ARROW_WRITE_GDAL_FOOTER', str(write_gdal_footer)): + gdal.VectorTranslate(outfilename, 'data/arrow/test.feather', + format='ARROW', + layerCreationOptions=layerCreationOptions) + + ds = gdal.OpenEx('ARROW_IPC_STREAM:' + outfilename if open_as_stream else outfilename) + lyr = ds.GetLayer(0) + + assert lyr.GetFIDColumn() == (fid if fid else "") + f = lyr.GetNextFeature() + assert f.GetGeometryRef() is not None + + if fid: + f = lyr.GetFeature(4) + assert f is not None + assert f.GetFID() == 4 + + assert lyr.GetFeature(5) is None + + if batch_size and format == 'FILE': + num_features = lyr.GetFeatureCount() + expected_num_row_groups = int(math.ceil(num_features / batch_size)) + assert lyr.GetMetadataItem("NUM_RECORD_BATCHES", "_ARROW_") == str(expected_num_row_groups) + for i in range(expected_num_row_groups): + got_num_rows = lyr.GetMetadataItem("RECORD_BATCHES[%d].NUM_ROWS" % i, "_ARROW_") + if i < expected_num_row_groups - 1: + assert got_num_rows == str(batch_size) + else: + assert got_num_rows == str(num_features - (expected_num_row_groups - 1) * batch_size) + + assert lyr.GetMetadataItem('FORMAT', '_ARROW_') == format + + geo = lyr.GetMetadataItem("geo", "_ARROW_METADATA_") + assert geo is not None + j = json.loads(geo) + assert j is not None + assert 'primary_column' in j + assert j['primary_column'] == 'geometry' + assert 'columns' in j + assert 'geometry' in j['columns'] + assert 'encoding' in j['columns']['geometry'] + assert j['columns']['geometry']['encoding'] == 'geoarrow.point' + assert 'bbox' not in j['columns']['geometry'] + + md = lyr.GetMetadata("_ARROW_METADATA_") + assert 'geo' in md + + if write_gdal_footer: + 
geo = lyr.GetMetadataItem("gdal:geo", "_ARROW_FOOTER_METADATA_") + assert geo is not None + j = json.loads(geo) + assert j is not None + assert 'bbox' in j['columns']['geometry'] + + + md = lyr.GetMetadata("_ARROW_FOOTER_METADATA_") + if write_gdal_footer: + assert 'gdal:geo' in md + else: + assert 'gdal:geo' not in md + + if open_as_stream: + + with gdaltest.error_handler(): + assert lyr.GetFeatureCount(force=0) == -1 + + assert lyr.GetFeatureCount() == 5 + + with gdaltest.error_handler(): + gdal.ErrorReset() + assert lyr.GetNextFeature() is None + assert gdal.GetLastErrorMsg() == 'Attempting to rewind non-seekable stream' + + elif format == 'STREAM' and batch_size: + + assert lyr.GetFeatureCount(force=0) == 5 + + ogr_parquet._check_test_parquet(outfilename, + expect_fast_feature_count=False if open_as_stream else True, + expect_fast_get_extent=False, + expect_ignore_fields=False) + + finally: + gdal.Unlink(outfilename) + + +############################################################################### +# Test compression support + + +@pytest.mark.parametrize("compression", ['uncompressed', 'lz4', 'zstd']) +def test_ogr_arrow_write_compression(compression): + + lco = gdal.GetDriverByName('Arrow').GetMetadataItem("DS_LAYER_CREATIONOPTIONLIST") + if compression.upper() not in lco: + pytest.skip() + + outfilename = '/vsimem/out.feather' + ds = gdal.GetDriverByName('Arrow').Create(outfilename, 0, 0, 0, gdal.GDT_Unknown) + options = ['FID=fid', 'COMPRESSION=' + compression] + lyr = ds.CreateLayer('out', geom_type=ogr.wkbNone, options=options) + assert lyr is not None + f = ogr.Feature(lyr.GetLayerDefn()) + lyr.CreateFeature(f) + lyr = None + ds = None + + ds = ogr.Open(outfilename) + assert ds is not None + lyr = ds.GetLayer(0) + assert lyr is not None + # TODO: it would be good to check the compression type, but I can't find anything in the arrow API for that + lyr = None + ds = None + + gdal.Unlink(outfilename) + + 
+############################################################################### +# Read invalid file .arrow + + +def test_ogr_arrow_invalid_arrow(): + + with gdaltest.error_handler(): + assert ogr.Open('data/arrow/invalid.arrow') is None + + +############################################################################### +# Read invalid file .arrows, and a non-existent file opened in IPC stream mode + + +def test_ogr_arrow_invalid_arrows(): + + with gdaltest.error_handler(): + assert ogr.Open('data/arrow/invalid.arrows') is None + + with gdaltest.error_handler(): + assert ogr.Open('ARROW_IPC_STREAM:/vsimem/i_dont_exist.bin') is None # non-existent stream source must not open diff --git a/autotest/ogr/ogr_parquet.py b/autotest/ogr/ogr_parquet.py new file mode 100755 index 000000000000..24a26dc89436 --- /dev/null +++ b/autotest/ogr/ogr_parquet.py @@ -0,0 +1,596 @@ +#!/usr/bin/env pytest +# -*- coding: utf-8 -*- +############################################################################### +# $Id$ +# +# Project: GDAL/OGR Test Suite +# Purpose: Test read/write functionality for OGR Parquet driver. +# Author: Even Rouault +# +############################################################################### +# Copyright (c) 2022, Planet Labs +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. +############################################################################### + +import json +import math +from osgeo import gdal, ogr, osr +import gdaltest +import pytest + +pytestmark = pytest.mark.require_driver('Parquet') + +############################################################################### +# Read invalid file + + +def test_ogr_parquet_invalid(): + + with gdaltest.error_handler(): + assert ogr.Open('data/parquet/invalid.parquet') is None + + +############################################################################### +# Basic tests + + +def _check_test_parquet(filename, + expect_fast_feature_count=True, + expect_fast_get_extent=True, + expect_ignore_fields=True): + with gdaltest.config_option('OGR_PARQUET_BATCH_SIZE', '2'): + ds = gdal.OpenEx(filename) + assert ds is not None, 'cannot open dataset' + assert ds.TestCapability("foo") == 0 + assert ds.GetLayerCount() == 1, 'bad layer count' + assert ds.GetLayer(-1) is None + assert ds.GetLayer(1) is None + lyr = ds.GetLayer(0) + assert lyr is not None + lyr_defn = lyr.GetLayerDefn() + assert lyr_defn.GetGeomFieldCount() == 1 + assert lyr_defn.GetGeomFieldDefn(0).GetName() == 'geometry' + srs = lyr_defn.GetGeomFieldDefn(0).GetSpatialRef() + assert srs is not None + assert srs.GetAuthorityCode(None) == '4326' + assert lyr_defn.GetGeomFieldDefn(0).GetType() == ogr.wkbPoint + assert lyr_defn.GetFieldCount() == 71 + got_field_defns = [ + (lyr_defn.GetFieldDefn(i).GetName(), + ogr.GetFieldTypeName(lyr_defn.GetFieldDefn(i).GetType()), + ogr.GetFieldSubTypeName(lyr_defn.GetFieldDefn(i).GetSubType()), + lyr_defn.GetFieldDefn(i).GetWidth(), + lyr_defn.GetFieldDefn(i).GetPrecision()) for i in range(lyr_defn.GetFieldCount()) ] + #import 
pprint + #pprint.pprint(got_field_defns) + expected_field_defns = [ + ('boolean', 'Integer', 'Boolean', 0, 0), + ('uint8', 'Integer', 'None', 0, 0), + ('int8', 'Integer', 'None', 0, 0), + ('uint16', 'Integer', 'None', 0, 0), + ('int16', 'Integer', 'Int16', 0, 0), + ('uint32', 'Integer64', 'None', 0, 0), + ('int32', 'Integer', 'None', 0, 0), + ('uint64', 'Real', 'None', 0, 0), + ('int64', 'Integer64', 'None', 0, 0), + ('float32', 'Real', 'Float32', 0, 0), + ('float64', 'Real', 'None', 0, 0), + ('string', 'String', 'None', 0, 0), + ('large_string', 'String', 'None', 0, 0), + ('timestamp_ms_gmt', 'DateTime', 'None', 0, 0), + ('timestamp_ms_gmt_plus_2', 'DateTime', 'None', 0, 0), + ('timestamp_ms_gmt_minus_0215', 'DateTime', 'None', 0, 0), + ('timestamp_s_no_tz', 'DateTime', 'None', 0, 0), + ('time32_s', 'Time', 'None', 0, 0), + ('time32_ms', 'Time', 'None', 0, 0), + ('time64_us', 'Integer64', 'None', 0, 0), + ('time64_ns', 'Integer64', 'None', 0, 0), + ('date32', 'Date', 'None', 0, 0), + ('date64', 'Date', 'None', 0, 0), + ('binary', 'Binary', 'None', 0, 0), + ('large_binary', 'Binary', 'None', 0, 0), + ('fixed_size_binary', 'Binary', 'None', 2, 0), + ('decimal128', 'Real', 'None', 7, 3), + ('decimal256', 'Real', 'None', 7, 3), + ('list_boolean', 'IntegerList', 'Boolean', 0, 0), + ('list_uint8', 'IntegerList', 'None', 0, 0), + ('list_int8', 'IntegerList', 'None', 0, 0), + ('list_uint16', 'IntegerList', 'None', 0, 0), + ('list_int16', 'IntegerList', 'None', 0, 0), + ('list_uint32', 'Integer64List', 'None', 0, 0), + ('list_int32', 'IntegerList', 'None', 0, 0), + ('list_uint64', 'RealList', 'None', 0, 0), + ('list_int64', 'Integer64List', 'None', 0, 0), + ('list_float32', 'RealList', 'Float32', 0, 0), + ('list_float64', 'RealList', 'None', 0, 0), + ('list_string', 'StringList', 'None', 0, 0), + ('fixed_size_list_boolean', 'IntegerList', 'Boolean', 0, 0), + ('fixed_size_list_uint8', 'IntegerList', 'None', 0, 0), + ('fixed_size_list_int8', 'IntegerList', 'None', 0, 0), + 
('fixed_size_list_uint16', 'IntegerList', 'None', 0, 0), + ('fixed_size_list_int16', 'IntegerList', 'None', 0, 0), + ('fixed_size_list_uint32', 'Integer64List', 'None', 0, 0), + ('fixed_size_list_int32', 'IntegerList', 'None', 0, 0), + ('fixed_size_list_uint64', 'RealList', 'None', 0, 0), + ('fixed_size_list_int64', 'Integer64List', 'None', 0, 0), + ('fixed_size_list_float32', 'RealList', 'Float32', 0, 0), + ('fixed_size_list_float64', 'RealList', 'None', 0, 0), + ('fixed_size_list_string', 'StringList', 'None', 0, 0), + ('struct_field.a', 'Integer64', 'None', 0, 0), + ('struct_field.b', 'Real', 'None', 0, 0), + ('struct_field.c.d', 'String', 'None', 0, 0), + ('struct_field.c.f', 'String', 'None', 0, 0), + ('struct_field.h', 'Integer64List', 'None', 0, 0), + ('struct_field.i', 'Integer64', 'None', 0, 0), + ('map_boolean', 'String', 'JSON', 0, 0), + ('map_uint8', 'String', 'JSON', 0, 0), + ('map_int8', 'String', 'JSON', 0, 0), + ('map_uint16', 'String', 'JSON', 0, 0), + ('map_int16', 'String', 'JSON', 0, 0), + ('map_uint32', 'String', 'JSON', 0, 0), + ('map_int32', 'String', 'JSON', 0, 0), + ('map_uint64', 'String', 'JSON', 0, 0), + ('map_int64', 'String', 'JSON', 0, 0), + ('map_float32', 'String', 'JSON', 0, 0), + ('map_float64', 'String', 'JSON', 0, 0), + ('map_string', 'String', 'JSON', 0, 0), + ('dict', 'Integer', 'None', 0, 0) + ] + assert got_field_defns == expected_field_defns + if expect_fast_feature_count: + assert lyr.TestCapability(ogr.OLCFastFeatureCount) == 1 + assert lyr.TestCapability(ogr.OLCStringsAsUTF8) == 1 + if expect_fast_get_extent: + assert lyr.TestCapability(ogr.OLCFastGetExtent) == 1 + if expect_ignore_fields: + assert lyr.TestCapability(ogr.OLCIgnoreFields) == 1 + assert lyr.GetFeatureCount() == 5 + assert lyr.GetExtent() == (0.0, 4.0, 2.0, 2.0) + assert lyr.GetExtent(geom_field=0) == (0.0, 4.0, 2.0, 2.0) + with gdaltest.error_handler(): + lyr.GetExtent(geom_field=-1) + lyr.GetExtent(geom_field=1) + + assert ds.GetFieldDomainNames() == 
['dictDomain'] + assert ds.GetFieldDomain('not_existing') is None + for _ in range(2): + domain = ds.GetFieldDomain('dictDomain') + assert domain is not None + assert domain.GetName() == 'dictDomain' + assert domain.GetDescription() == '' + assert domain.GetDomainType() == ogr.OFDT_CODED + assert domain.GetFieldType() == ogr.OFTInteger + assert domain.GetFieldSubType() == ogr.OFSTNone + assert domain.GetEnumeration() == {'0': 'foo', '1': 'bar', '2': 'baz'} + + f = lyr.GetNextFeature() + assert f.GetFID() == 0 + assert f['boolean'] + assert f['uint8'] == 1 + assert f['int8'] == -2 + assert f['uint16'] == 1 + assert f['int16'] == -20000 + assert f['uint32'] == 1 + assert f['int32'] == -2000000000 + assert f['uint64'] == 1 + assert f['int64'] == -200000000000 + assert f['float32'] == 1.5 + assert f['float64'] == 1.5 + assert f['string'] == 'abcd' + assert f['large_string'] == 'abcd' + assert f['timestamp_ms_gmt'] == '2019/01/01 14:00:00+00' + assert f['timestamp_ms_gmt_plus_2'] == '2019/01/01 14:00:00+02' + assert f['timestamp_ms_gmt_minus_0215'] == '2019/01/01 14:00:00-0215' + assert f['timestamp_s_no_tz'] == '2019/01/01 14:00:00' + assert f['time32_s'] == '01:02:03' + assert f['time32_ms'] == '01:02:03.456' + assert f['time64_us'] == 3723000000 + assert f['time64_ns'] == 3723000000456 + assert f['date32'] == '1970/01/02' + assert f['date64'] == '1970/01/02' + assert f['binary'] == '0001' + assert f['large_binary'] == '0001' + assert f['fixed_size_binary'] == '0001' + assert f['decimal128'] == 1234.567 + assert f['decimal256'] == 1234.567 + assert f['list_boolean'] == [] + assert f['list_uint8'] == [] + assert f['list_int8'] == [] + assert f['list_uint16'] == [] + assert f['list_int16'] == [] + assert f['list_uint32'] == [] + assert f['list_int32'] == [] + assert f['list_uint64'] == [] + assert f['list_int64'] == [] + assert f['list_float32'] == [] + assert f['list_float64'] == [] + assert f['list_string'] is None + assert f['fixed_size_list_boolean'] == [1, 0] + 
assert f['fixed_size_list_uint8'] == [0, 1] + assert f['fixed_size_list_int8'] == [0, 1] + assert f['fixed_size_list_uint16'] == [0, 1] + assert f['fixed_size_list_int16'] == [0, 1] + assert f['fixed_size_list_uint32'] == [0, 1] + assert f['fixed_size_list_int32'] == [0, 1] + assert f['fixed_size_list_uint64'] == [0, 1] + assert f['fixed_size_list_int64'] == [0, 1] + assert f['fixed_size_list_float32'][0] == 0 + assert math.isnan(f['fixed_size_list_float32'][1]) + assert f['fixed_size_list_float64'][0] == 0 + assert math.isnan(f['fixed_size_list_float64'][1]) + assert f['fixed_size_list_string'] == ['a', 'b'] + assert f['struct_field.a'] == 1 + assert f['struct_field.b'] == 2.5 + assert f['struct_field.c.d'] == 'e' + assert f['struct_field.c.f'] == 'g' + assert f['struct_field.h'] == [5,6] + assert f['struct_field.i'] == 3 + assert f['map_boolean'] == '{"x":null,"y":true}' + assert f['map_uint8'] == '{"x":1,"y":null}' + assert f['map_int8'] == '{"x":1,"y":null}' + assert f['map_uint16'] == '{"x":1,"y":null}' + assert f['map_int16'] == '{"x":1,"y":null}' + assert f['map_uint32'] == '{"x":4000000000,"y":null}' + assert f['map_int32'] == '{"x":2000000000,"y":null}' + assert f['map_uint64'] == '{"x":4000000000000.0,"y":null}' + assert f['map_int64'] == '{"x":-2000000000000,"y":null}' + assert f['map_float32'] == '{"x":1.5,"y":null}' + assert f['map_float64'] == '{"x":1.5,"y":null}' + assert f['map_string'] == '{"x":"x_val","y":null}' + assert f['dict'] == 0 + assert f.GetGeometryRef().ExportToWkt() == 'POINT (0 2)' + + f = lyr.GetNextFeature() + assert f.GetFID() == 1 + assert not f['boolean'] + assert f['uint8'] == 2 + assert f.GetGeometryRef() is None + + f = lyr.GetNextFeature() + assert f.GetFID() == 2 + assert f['uint8'] is None + assert f.GetGeometryRef().ExportToWkt() == 'POINT (2 2)' + + f = lyr.GetNextFeature() + assert f.GetFID() == 3 + assert f['uint8'] == 4 + assert f.GetGeometryRef().ExportToWkt() == 'POINT (3 2)' + + f = lyr.GetNextFeature() + assert 
f.GetFID() == 4 + assert f['uint8'] == 5 + assert f.GetGeometryRef().ExportToWkt() == 'POINT (4 2)' + + assert lyr.GetNextFeature() is None + + assert lyr.GetNextFeature() is None + + lyr.ResetReading() + f = lyr.GetNextFeature() + assert f.GetFID() == 0 + + lyr.SetSpatialFilterRect(4,2,4,2) + lyr.ResetReading() + f = lyr.GetNextFeature() + assert f.GetFID() == 4 + lyr.SetSpatialFilter(None) + + if expect_ignore_fields: + # Ignore just one member of a structure + assert lyr.SetIgnoredFields(['struct_field.a']) == ogr.OGRERR_NONE + lyr.ResetReading() + f = lyr.GetNextFeature() + assert f['fixed_size_list_string'] == ['a', 'b'] + assert f['struct_field.a'] is None + assert f['struct_field.b'] == 2.5 + assert f['map_boolean'] == '{"x":null,"y":true}' + assert f.GetGeometryRef().ExportToWkt() == 'POINT (0 2)' + + # Ignore all members of a structure + assert lyr.SetIgnoredFields(['struct_field.a', + 'struct_field.b', + 'struct_field.c.d', + 'struct_field.c.f', + 'struct_field.h', + 'struct_field.i']) == ogr.OGRERR_NONE + lyr.ResetReading() + f = lyr.GetNextFeature() + assert f['fixed_size_list_string'] == ['a', 'b'] + assert f['struct_field.a'] is None + assert f['struct_field.b'] is None + assert f['struct_field.c.d'] is None + assert f['struct_field.c.f'] is None + assert f['struct_field.h'] is None + assert f['struct_field.i'] is None + assert f['map_boolean'] == '{"x":null,"y":true}' + assert f.GetGeometryRef().ExportToWkt() == 'POINT (0 2)' + + # Ignore a map + assert lyr.SetIgnoredFields(['map_boolean']) == ogr.OGRERR_NONE + lyr.ResetReading() + f = lyr.GetNextFeature() + assert f['fixed_size_list_string'] == ['a', 'b'] + assert f['struct_field.a'] == 1 + assert f['struct_field.b'] == 2.5 + assert f['map_boolean'] is None + assert f['map_uint8'] == '{"x":1,"y":null}' + assert f.GetGeometryRef().ExportToWkt() == 'POINT (0 2)' + + # Ignore geometry + assert lyr.SetIgnoredFields(['geometry']) == ogr.OGRERR_NONE + lyr.ResetReading() + f = lyr.GetNextFeature() + assert 
f['fixed_size_list_string'] == ['a', 'b'] + assert f['struct_field.a'] == 1 + assert f['struct_field.b'] == 2.5 + assert f['map_boolean'] == '{"x":null,"y":true}' + assert f.GetGeometryRef() is None + + # Cancel ignored fields + assert lyr.SetIgnoredFields([]) == ogr.OGRERR_NONE + lyr.ResetReading() + f = lyr.GetNextFeature() + assert f['fixed_size_list_string'] == ['a', 'b'] + assert f['struct_field.a'] == 1 + assert f['struct_field.b'] == 2.5 + assert f['map_boolean'] == '{"x":null,"y":true}' + assert f.GetGeometryRef().ExportToWkt() == 'POINT (0 2)' + + +@pytest.mark.parametrize("use_vsi", [False, True]) +def test_ogr_parquet_1(use_vsi): + + filename = 'data/parquet/test.parquet' + if use_vsi: + vsifilename = '/vsimem/test.parquet' + gdal.FileFromMemBuffer(vsifilename, open(filename, 'rb').read()) + filename = vsifilename + + try: + _check_test_parquet(filename) + finally: + if use_vsi: + gdal.Unlink(vsifilename) + +############################################################################### +# Run test_ogrsf + + +def test_ogr_parquet_test_ogrsf_test(): + import test_cli_utilities + if test_cli_utilities.get_test_ogrsf_path() is None: + pytest.skip() + + ret = gdaltest.runexternal(test_cli_utilities.get_test_ogrsf_path() + ' -ro data/parquet/test.parquet') + + assert 'INFO' in ret + assert 'ERROR' not in ret + +############################################################################### +# Run test_ogrsf + + +def test_ogr_parquet_test_ogrsf_example(): + import test_cli_utilities + if test_cli_utilities.get_test_ogrsf_path() is None: + pytest.skip() + + ret = gdaltest.runexternal(test_cli_utilities.get_test_ogrsf_path() + ' -ro data/parquet/example.parquet') + + assert 'INFO' in ret + assert 'ERROR' not in ret + +############################################################################### +# Run test_ogrsf + + +def test_ogr_parquet_test_ogrsf_all_geoms(): + import test_cli_utilities + if test_cli_utilities.get_test_ogrsf_path() is None: + pytest.skip() + 
+ ret = gdaltest.runexternal(test_cli_utilities.get_test_ogrsf_path() + ' -ro data/parquet/all_geoms.parquet') + + assert 'INFO' in ret + assert 'ERROR' not in ret + + +############################################################################### +# Test write support + + +@pytest.mark.parametrize("use_vsi,row_group_size,fid", [(False, None, None), (True, 2, "fid")]) +def test_ogr_parquet_write_from_another_dataset(use_vsi, row_group_size, fid): + + outfilename = '/vsimem/out.parquet' if use_vsi else 'tmp/out.parquet' + try: + layerCreationOptions = [] + if row_group_size: + layerCreationOptions.append('ROW_GROUP_SIZE=' + str(row_group_size)) + if fid: + layerCreationOptions.append('FID=' + fid) + gdal.VectorTranslate(outfilename, 'data/parquet/test.parquet', + layerCreationOptions=layerCreationOptions) + + ds = gdal.OpenEx(outfilename) + lyr = ds.GetLayer(0) + + assert lyr.GetFIDColumn() == (fid if fid else "") + + if fid: + f = lyr.GetFeature(4) + assert f is not None + assert f.GetFID() == 4 + + assert lyr.GetFeature(5) is None + + lyr.SetIgnoredFields([lyr.GetLayerDefn().GetFieldDefn(0).GetName()]) + + f = lyr.GetFeature(4) + assert f is not None + assert f.GetFID() == 4 + + assert lyr.GetFeature(5) is None + + lyr.SetIgnoredFields([]) + + if row_group_size: + num_features = lyr.GetFeatureCount() + expected_num_row_groups = int(math.ceil(num_features / row_group_size)) + assert lyr.GetMetadataItem("NUM_ROW_GROUPS", "_PARQUET_") == str(expected_num_row_groups) + for i in range(expected_num_row_groups): + got_num_rows = lyr.GetMetadataItem("ROW_GROUPS[%d].NUM_ROWS" % i, "_PARQUET_") + if i < expected_num_row_groups - 1: + assert got_num_rows == str(row_group_size) + else: + assert got_num_rows == str(num_features - (expected_num_row_groups - 1) * row_group_size) + + geo = lyr.GetMetadataItem("geo", "_PARQUET_METADATA_") + assert geo is not None + j = json.loads(geo) + assert j is not None + assert 'primary_column' in j + assert j['primary_column'] == 'geometry' 
+ assert 'columns' in j + assert 'geometry' in j['columns'] + assert 'encoding' in j['columns']['geometry'] + assert j['columns']['geometry']['encoding'] == 'WKB' + + md = lyr.GetMetadata("_PARQUET_METADATA_") + assert 'geo' in md + + ds = None + + _check_test_parquet(outfilename) + + finally: + gdal.Unlink(outfilename) + + +############################################################################### +# Test write support + + +def test_ogr_parquet_write_edge_cases(): + + outfilename = '/vsimem/out.parquet' + + # No layer + ds = gdal.GetDriverByName('Parquet').Create(outfilename, 0, 0, 0, gdal.GDT_Unknown) + assert ds is not None + assert ds.GetLayerCount() == 0 + assert ds.GetLayer(0) is None + assert ds.TestCapability(ogr.ODsCCreateLayer) == 1 + assert ds.TestCapability(ogr.ODsCAddFieldDomain) == 0 + domain = ogr.CreateCodedFieldDomain('name', 'desc', ogr.OFTInteger, ogr.OFSTNone, {1: "one", "2": None}) + assert ds.AddFieldDomain(domain) == False + assert ds.GetFieldDomainNames() is None + assert ds.GetFieldDomain('foo') is None + ds = None + gdal.Unlink(outfilename) + + # No field, no record + ds = gdal.GetDriverByName('Parquet').Create(outfilename, 0, 0, 0, gdal.GDT_Unknown) + assert ds is not None + srs = osr.SpatialReference() + srs.ImportFromEPSG(4326) + with gdaltest.error_handler(): + assert ds.CreateLayer('out', srs=srs, geom_type=ogr.wkbPoint25D) is None + assert ds.CreateLayer('out', srs=srs, geom_type=ogr.wkbPoint, options=['COMPRESSION=invalid']) is None + lyr = ds.CreateLayer('out', srs=srs, geom_type=ogr.wkbPoint) + assert lyr is not None + assert ds.GetLayerCount() == 1 + assert ds.GetLayer(0) is not None + assert ds.TestCapability(ogr.ODsCCreateLayer) == 0 + assert ds.TestCapability(ogr.ODsCAddFieldDomain) == 1 + # Test creating a second layer + with gdaltest.error_handler(): + assert ds.CreateLayer('out2', srs=srs, geom_type=ogr.wkbPoint) is None + ds = None + ds = gdal.OpenEx(outfilename) + assert ds is not None + lyr = ds.GetLayer(0) + assert 
lyr.GetNextFeature() is None + lyr = None + ds = None + gdal.Unlink(outfilename) + + # No geometry field, one record + ds = gdal.GetDriverByName('Parquet').Create(outfilename, 0, 0, 0, gdal.GDT_Unknown) + assert ds is not None + lyr = ds.CreateLayer('out', geom_type=ogr.wkbNone) + assert lyr.TestCapability(ogr.OLCCreateField) == 1 + assert lyr.TestCapability(ogr.OLCCreateGeomField) == 1 + assert lyr.TestCapability(ogr.OLCSequentialWrite) == 1 + assert lyr.TestCapability(ogr.OLCStringsAsUTF8) == 1 + fld_defn = ogr.FieldDefn('foo') + fld_defn.SetNullable(False) + assert lyr.CreateField(fld_defn) == ogr.OGRERR_NONE + assert lyr is not None + f = ogr.Feature(lyr.GetLayerDefn()) + with gdaltest.error_handler(): + # violation of not-null constraint + assert lyr.CreateFeature(f) != ogr.OGRERR_NONE + f['foo'] = 'bar' + assert lyr.CreateFeature(f) == ogr.OGRERR_NONE + assert lyr.GetFeatureCount() == 1 + assert lyr.TestCapability(ogr.OLCCreateField) == 0 + assert lyr.TestCapability(ogr.OLCCreateGeomField) == 0 + with gdaltest.error_handler(): + assert lyr.CreateField(ogr.FieldDefn('bar')) != ogr.OGRERR_NONE + assert lyr.CreateGeomField(ogr.GeomFieldDefn('baz', ogr.wkbPoint)) != ogr.OGRERR_NONE + ds = None + ds = gdal.OpenEx(outfilename) + assert ds is not None + lyr = ds.GetLayer(0) + assert lyr.GetNextFeature() is not None + lyr = None + ds = None + gdal.Unlink(outfilename) + + +############################################################################### +# Test compression support + + +@pytest.mark.parametrize("compression", ['uncompressed', 'snappy', 'zstd']) +def test_ogr_parquet_write_compression(compression): + + lco = gdal.GetDriverByName('Parquet').GetMetadataItem("DS_LAYER_CREATIONOPTIONLIST") + if compression.upper() not in lco: + pytest.skip() + + outfilename = '/vsimem/out.parquet' + ds = gdal.GetDriverByName('Parquet').Create(outfilename, 0, 0, 0, gdal.GDT_Unknown) + options = ['FID=fid', 'COMPRESSION=' + compression] + lyr = ds.CreateLayer('out', 
geom_type=ogr.wkbNone, options=options) + assert lyr is not None + f = ogr.Feature(lyr.GetLayerDefn()) + lyr.CreateFeature(f) + lyr = None + ds = None + + ds = ogr.Open(outfilename) + lyr = ds.GetLayer(0) + assert lyr.GetMetadataItem('ROW_GROUPS[0].COLUMNS[0].COMPRESSION', '_PARQUET_') == compression + lyr = None + ds = None + + gdal.Unlink(outfilename) diff --git a/cmake/helpers/CheckDependentLibraries.cmake b/cmake/helpers/CheckDependentLibraries.cmake index e04e462a1afb..8bd210470b36 100644 --- a/cmake/helpers/CheckDependentLibraries.cmake +++ b/cmake/helpers/CheckDependentLibraries.cmake @@ -95,7 +95,7 @@ endfunction() macro (gdal_check_package name purpose) set(_options CONFIG CAN_DISABLE RECOMMENDED DISABLED_BY_DEFAULT ALWAYS_ON_WHEN_FOUND) set(_oneValueArgs VERSION NAMES) - set(_multiValueArgs COMPONENTS TARGETS) + set(_multiValueArgs COMPONENTS TARGETS PATHS) cmake_parse_arguments(_GCP "${_options}" "${_oneValueArgs}" "${_multiValueArgs}" ${ARGN}) string(TOUPPER ${name} key) set(_find_dependency "") @@ -112,6 +112,9 @@ macro (gdal_check_package name purpose) if (_GCP_COMPONENTS) list(APPEND _find_package_args COMPONENTS ${_GCP_COMPONENTS}) endif () + if (_GCP_PATHS) + list(APPEND _find_package_args PATHS ${_GCP_PATHS}) + endif () if (_GCP_NAMES) set(GDAL_CHECK_PACKAGE_${name}_NAMES "${_GCP_NAMES}" CACHE STRING "Config file name for ${name}") mark_as_advanced(GDAL_CHECK_PACKAGE_${name}_NAMES) @@ -679,6 +682,11 @@ option(GDAL_USE_PUBLICDECOMPWT gdal_check_package(KDU "Enable KAKADU" CAN_DISABLE) gdal_check_package(LURATECH "Enable JP2Lura driver" CAN_DISABLE) +gdal_check_package(Arrow "Apache Arrow C++ library" CONFIG CAN_DISABLE) +if (Arrow_FOUND) + gdal_check_package(Parquet "Apache Parquet C++ library" CONFIG PATHS ${Arrow_DIR} CAN_DISABLE) +endif() + # bindings gdal_check_package(SWIG "Enable language bindings" ALWAYS_ON_WHEN_FOUND) set_package_properties( diff --git a/doc/source/build_hints.rst b/doc/source/build_hints.rst index 
01e35e3fbedf..3d5a52c8be79 100644 --- a/doc/source/build_hints.rst +++ b/doc/source/build_hints.rst @@ -191,6 +191,18 @@ need to be installed: ``blas blas-devel libblas libcblas liblapack liblapacke`` Control whether to use Armadillo. Defaults to ON when Armadillo is found. +Arrow +***** + +The `Apache Arrow C++ ` library +is required for the :ref:`vector.arrow` and :ref:`vector.parquet` drivers. +Specify install prefix in the ``CMAKE_PREFIX_PATH`` variable. + +.. option:: GDAL_USE_ARROW=ON/OFF + + Control whether to use Arrow. Defaults to ON when Arrow is found. + + Blosc ***** @@ -1285,6 +1297,18 @@ The Oracle Instant Client SDK (closed source/proprietary) is required for the Control whether to use Oracle. Defaults to ON when Oracle is found. +Parquet +******* + +The Parquet component of the `Apache Arrow C++ ` +library is required for the :ref:`vector.parquet` driver. +Specify install prefix in the ``CMAKE_PREFIX_PATH`` variable. + +.. option:: GDAL_USE_PARQUET=ON/OFF + + Control whether to use Parquet. Defaults to ON when Parquet is found. + + PCRE2 ***** diff --git a/doc/source/drivers/vector/arrow.rst b/doc/source/drivers/vector/arrow.rst new file mode 100644 index 000000000000..cad89d52043c --- /dev/null +++ b/doc/source/drivers/vector/arrow.rst @@ -0,0 +1,82 @@ +.. _vector.arrow: + +(Geo)Arrow IPC File Format / Stream +=================================== + +.. shortname:: Arrow + +.. build_dependencies:: Apache Arrow C++ library + +The Arrow IPC File Format (Feather) is a portable file format for storing Arrow +tables or data frames (from languages like Python or R) that utilizes the Arrow +IPC format internally. + +The driver supports the 2 variants of the format: + +- File or Random Access format, also known as Feather: + for serializing a fixed number of record batches. + Random access is required to read such files, but they can be generated using + a streaming-only capable file. 
The recommended extension for such file is ``.arrow`` + +- Streaming IPC format: for sending an arbitrary length sequence of record batches. + The format must generally be processed from start to end, and does not require + random access. That format is not generally materialized as a file. If it is, + the recommended extension is ``.arrows`` (with a trailing s). But the + driver can support regular files as well as the /vsistdin/ and /vsistdout/ streaming files. + On opening, it might be difficult for the driver to detect that the content is + specifically an Arrow IPC stream, especially if the extension is not ``.arrows``, + and the metadata section is large. + Prefixing the filename with ``ARROW_IPC_STREAM:`` (e.g "ARROW_IPC_STREAM:/vsistdin/") + will cause the driver to unconditionally open the file as a streaming IPC format. + + +This driver also supports geometry columns using the GeoArrow specification. + +.. note:: The driver should be considered experimental as the GeoArrow specification is not finalized yet. + +Driver capabilities +------------------- + +.. supports_create:: + +.. supports_georeferencing:: + +.. supports_virtualio:: + +Creation issues +--------------- + +The driver supports creating only a single layer in a dataset. + +Layer creation options +---------------------- + +- **COMPRESSION=string**: Compression method. Can be one of ``NONE``, ``ZSTD`` + or ``LZ4``. Available values depend on how the Arrow library was compiled. + Defaults to LZ4 when available, otherwise NONE. + +- **FORMAT=FILE/STREAM**: Variant of the file format. See introduction paragraph + for the difference between both. Defaults to FILE, unless the filename is + "/vsistdout/" or its extension is ".arrows", in which case STREAM is used. + +- **GEOMETRY_ENCODING=GEOARROW/WKB/WKT**: Geometry encoding. Defaults to GEOARROW. + +- **BATCH_SIZE=integer**: Maximum number of rows per record batch. Default is 65536. + +- **GEOMETRY_NAME=string**: Name of geometry column. 
Default is ``geometry`` + +- **FID=string**: Name of the FID (Feature Identifier) column to create. If + none is specified, no FID column is created. Note that if using ogr2ogr with + the Arrow driver as the target driver and a source layer that has a named + FID column, this FID column name will be automatically used to set the FID + layer creation option of the Arrow driver (unless ``-lco FID=`` is used to + set an empty name) + +Links +----- + +- `Feather File Format `__ + +- `GeoArrow specification `__ + +- Related driver: :ref:`Parquet driver ` diff --git a/doc/source/drivers/vector/index.rst b/doc/source/drivers/vector/index.rst index bfbe398839ec..fc04ef93f56f 100644 --- a/doc/source/drivers/vector/index.rst +++ b/doc/source/drivers/vector/index.rst @@ -27,6 +27,7 @@ Vector drivers :maxdepth: 1 amigocloud + arrow avcbin avce00 cad @@ -81,6 +82,7 @@ Vector drivers ogdi openfilegdb osm + parquet pdf pds pgdump diff --git a/doc/source/drivers/vector/parquet.rst b/doc/source/drivers/vector/parquet.rst new file mode 100644 index 000000000000..518964b37f9d --- /dev/null +++ b/doc/source/drivers/vector/parquet.rst @@ -0,0 +1,69 @@ +.. _vector.parquet: + +(Geo)Parquet +============ + +.. shortname:: Parquet + +.. build_dependencies:: Parquet component of the Apache Arrow C++ library + +From https://databricks.com/glossary/what-is-parquet: +"Apache Parquet is an open source, column-oriented data file format designed +for efficient data storage and retrieval. It provides efficient data compression +and encoding schemes with enhanced performance to handle complex data in bulk. +Apache Parquet is designed to be a common interchange format for both batch and interactive workloads." + +This driver also supports geometry columns using the GeoParquet specification. + +.. note:: The driver should be considered experimental as the GeoParquet specification is not finalized yet. + + +Driver capabilities +------------------- + +.. supports_create:: + +.. 
supports_georeferencing:: + +.. supports_virtualio:: + + +Creation issues +--------------- + +The driver supports creating only a single layer in a dataset. + +Layer creation options +---------------------- + +- **COMPRESSION=string**: Compression method. Can be one of ``NONE``, ``SNAPPY``, + ``GZIP``, ``BROTLI``, ``ZSTD``, ``LZ4``, ``BZ2``, ``LZ4_HADOOP``. Available + values depend on how the Parquet library was compiled. + Defaults to SNAPPY when available, otherwise NONE. + +- **GEOMETRY_ENCODING=WKB/WKT/GEOARROW**: Geometry encoding. Defaults to WKB. + Other encodings (WKT and GEOARROW) are *not* allowed by the GeoParquet + specification, but are handled as an extension, for symmetry with the Arrow + driver. + +- **ROW_GROUP_SIZE=integer**: Maximum number of rows per group. Default is 65536. + +- **GEOMETRY_NAME=string**: Name of geometry column. Default is ``geometry`` + +- **FID=string**: Name of the FID (Feature Identifier) column to create. If + none is specified, no FID column is created. Note that if using ogr2ogr with + the Parquet driver as the target driver and a source layer that has a named + FID column, this FID column name will be automatically used to set the FID + layer creation option of the Parquet driver (unless ``-lco FID=`` is used to + set an empty name) + +Links +----- + +- `Apache Parquet home page `__ + +- `Parquet file format `__ + +- `GeoParquet specification `__ + +- Related driver: :ref:`Arrow driver ` diff --git a/docker/ubuntu-full/Dockerfile b/docker/ubuntu-full/Dockerfile index af34ea40957a..e45827d446f9 100644 --- a/docker/ubuntu-full/Dockerfile +++ b/docker/ubuntu-full/Dockerfile @@ -237,7 +237,17 @@ RUN . /buildscripts/bh-set-envvars.sh \ && make -j$(nproc) install \ && make install DESTDIR="/build_thirdparty" \ && cd ../.. 
\ - && rm -rf libjxl + && rm -rf libjxl \ + && rm -rf /var/lib/apt/lists/* + +# Install Arrow C++ +RUN apt-get update -y \ + && DEBIAN_FRONTEND=noninteractive apt-get install -y -V ca-certificates lsb-release wget \ + && wget https://apache.jfrog.io/artifactory/arrow/$(lsb_release --id --short | tr 'A-Z' 'a-z')/apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb \ + && DEBIAN_FRONTEND=noninteractive apt-get install -y -V ./apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb \ + && apt-get update \ + && DEBIAN_FRONTEND=noninteractive apt-get install -y -V libarrow-dev libparquet-dev \ + && rm -rf /var/lib/apt/lists/* RUN apt-get update -y \ && apt-get install -y --fix-missing --no-install-recommends rsync ccache \ @@ -300,6 +310,12 @@ RUN apt-get update \ python-is-python3 \ # Workaround bug in ogdi packaging && ln -s /usr/lib/ogdi/libvrf.so /usr/lib \ + # Install Arrow C++ + && DEBIAN_FRONTEND=noninteractive apt-get install -y -V ca-certificates lsb-release wget \ + && wget https://apache.jfrog.io/artifactory/arrow/$(lsb_release --id --short | tr 'A-Z' 'a-z')/apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb \ + && DEBIAN_FRONTEND=noninteractive apt-get install -y -V ./apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb \ + && apt-get update \ + && DEBIAN_FRONTEND=noninteractive apt-get install -y -V libarrow700 libparquet700 \ && rm -rf /var/lib/apt/lists/* # Attempt to order layers starting with less frequently varying ones diff --git a/frmts/drivers.ini b/frmts/drivers.ini index d043efde6853..2797d1480a87 100644 --- a/frmts/drivers.ini +++ b/frmts/drivers.ini @@ -253,6 +253,9 @@ MVT NGW MapML HANA +Parquet +Arrow + # Put TIGER and AVCBIN at end since they need poOpenInfo->GetSiblingFiles() Tiger AVCBin diff --git a/ogr/ogr_geometry.h b/ogr/ogr_geometry.h index a1f58f657e1d..b20c9ddac099 100644 --- a/ogr/ogr_geometry.h +++ b/ogr/ogr_geometry.h @@ -402,9 +402,9 @@ class CPL_DLL OGRGeometry 
virtual OGRGeometry* Normalize() const; virtual OGRBoolean IsSimple() const; /*! Returns whether the geometry has a Z component. */ - OGRBoolean Is3D() const { return flags & OGR_G_3D; } + OGRBoolean Is3D() const { return (flags & OGR_G_3D) != 0; } /*! Returns whether the geometry has a M component. */ - OGRBoolean IsMeasured() const { return flags & OGR_G_MEASURED; } + OGRBoolean IsMeasured() const { return (flags & OGR_G_MEASURED) != 0; } virtual OGRBoolean IsRing() const; virtual void empty() = 0; virtual OGRGeometry *clone() const CPL_WARN_UNUSED_RESULT = 0; diff --git a/ogr/ogr_p.h b/ogr/ogr_p.h index 7ece26853bea..e0ad1fd213e8 100644 --- a/ogr/ogr_p.h +++ b/ogr/ogr_p.h @@ -174,10 +174,17 @@ char CPL_DLL * OGRGeometryToHexEWKB( OGRGeometry * poGeometry, int nSRSId, /* WKB Type Handling encoding */ /************************************************************************/ -OGRErr OGRReadWKBGeometryType( const unsigned char * pabyData, +OGRErr CPL_DLL OGRReadWKBGeometryType( const unsigned char * pabyData, OGRwkbVariant wkbVariant, OGRwkbGeometryType *eGeometryType ); +/************************************************************************/ +/* WKT Type Handling encoding */ +/************************************************************************/ + +OGRErr CPL_DLL OGRReadWKTGeometryType( const char* pszWKT, + OGRwkbGeometryType *peGeometryType ); + /************************************************************************/ /* Other */ /************************************************************************/ diff --git a/ogr/ogrsf_frmts/CMakeLists.txt b/ogr/ogrsf_frmts/CMakeLists.txt index 782cf0f231c2..adb12743ab25 100644 --- a/ogr/ogrsf_frmts/CMakeLists.txt +++ b/ogr/ogrsf_frmts/CMakeLists.txt @@ -102,6 +102,8 @@ ogr_dependent_driver(xlsx "Microsoft Office Excel(xlsx)" "GDAL_USE_EXPAT") ogr_dependent_driver(xls "Microsoft Office Excel(xls)" "GDAL_USE_FREEXL") ogr_dependent_driver(mongodbv3 "MongoDB V3" "GDAL_USE_MONGOCXX") ogr_dependent_driver(cad "OpenCAD" 
"GDAL_USE_OPENCAD OR GDAL_USE_OPENCAD_INTERNAL") +ogr_dependent_driver(parquet "Parquet" "GDAL_USE_PARQUET") +ogr_dependent_driver(arrow "Arrow" "GDAL_USE_ARROW") # ###################################################################################################################### # proprietary libraries diff --git a/ogr/ogrsf_frmts/arrow/CMakeLists.txt b/ogr/ogrsf_frmts/arrow/CMakeLists.txt new file mode 100644 index 000000000000..c39cb3fbd1a7 --- /dev/null +++ b/ogr/ogrsf_frmts/arrow/CMakeLists.txt @@ -0,0 +1,11 @@ +add_gdal_driver(TARGET ogr_Arrow + SOURCES ogrfeatherdriver.cpp + ogrfeatherdataset.cpp + ogrfeatherlayer.cpp + ogrfeatherwriterdataset.cpp + ogrfeatherwriterlayer.cpp + PLUGIN_CAPABLE + CXX_WFLAGS_EFFCXX) +gdal_standard_includes(ogr_Arrow) +target_include_directories(ogr_Arrow PRIVATE $) +gdal_target_link_libraries(ogr_Arrow PRIVATE arrow_shared) diff --git a/ogr/ogrsf_frmts/arrow/ogr_feather.h b/ogr/ogrsf_frmts/arrow/ogr_feather.h new file mode 100644 index 000000000000..f7a0537079f7 --- /dev/null +++ b/ogr/ogrsf_frmts/arrow/ogr_feather.h @@ -0,0 +1,206 @@ +/****************************************************************************** + * + * Project: Feather Translator + * Purpose: Implements OGRFeatherDriver. + * Author: Even Rouault, + * + ****************************************************************************** + * Copyright (c) 2022, Planet Labs + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + ****************************************************************************/ + +#ifndef OGR_FEATHER_H +#define OGR_FEATHER_H + +#include "ogrsf_frmts.h" + +#include + +#include "../arrow_common/ogr_arrow.h" + +#ifdef _MSC_VER +#pragma warning( push ) +#pragma warning( disable : 4244 ) /* warning 4244: 'initializing': conversion from 'int32_t' to 'int16_t', possible loss of data */ +#pragma warning( disable : 4458 ) /* warning 4458: declaration of 'type_id' hides class member */ +#endif + +#include "arrow/ipc/writer.h" + +#ifdef _MSC_VER +#pragma warning( pop ) +#endif + +constexpr const char* GDAL_GEO_FOOTER_KEY = "gdal:geo"; +constexpr const char* ARROW_DRIVER_NAME_UC = "ARROW"; + +/************************************************************************/ +/* OGRFeatherLayer */ +/************************************************************************/ + +class OGRFeatherDataset; + +class OGRFeatherLayer final: public OGRArrowLayer + +{ + OGRFeatherLayer(const OGRFeatherLayer&) = delete; + OGRFeatherLayer& operator= (const OGRFeatherLayer&) = delete; + + OGRFeatherDataset* m_poDS = nullptr; + + // Variable only for seekable file format + std::shared_ptr m_poRecordBatchFileReader{}; + + // Variables only for streamable IPC format + std::shared_ptr m_poFile{}; + bool m_bSeekable = true; + arrow::ipc::IpcReadOptions m_oOptions{}; + std::shared_ptr m_poRecordBatchReader{}; + bool m_bResetRecordBatchReaderAsked = false; + bool m_bSingleBatch = false; + std::shared_ptr 
m_poBatchIdx0{}; + std::shared_ptr m_poBatchIdx1{}; + + CPLStringList m_aosFeatherMetadata{}; + + virtual std::string GetDriverUCName() const override { return ARROW_DRIVER_NAME_UC; } + + bool ResetRecordBatchReader(); + + void EstablishFeatureDefn(); + void LoadGeoMetadata(const arrow::KeyValueMetadata* kv_metadata, + const std::string& key); + OGRwkbGeometryType ComputeGeometryColumnType(int iGeomCol, int iCol) const; + bool ReadNextBatch() override; + void CreateFieldFromSchema( + const std::shared_ptr& field, + const std::vector& path, + const std::map>& oMapFieldNameToGDALSchemaFieldDefn); + OGRFeature* GetNextRawFeature(); + + virtual bool CanRunNonForcedGetExtent() override; + + bool ReadNextBatchFile(); + bool ReadNextBatchStream(); + void TryToCacheFirstTwoBatches(); + +public: + OGRFeatherLayer(OGRFeatherDataset* poDS, + const char* pszLayerName, + std::shared_ptr& poRecordBatchFileReader); + OGRFeatherLayer(OGRFeatherDataset* poDS, + const char* pszLayerName, + std::shared_ptr poFile, + bool bSeekable, + const arrow::ipc::IpcReadOptions& oOptions, + std::shared_ptr& poRecordBatchStreamReader); + + void ResetReading() override; + int TestCapability(const char* pszCap) override; + GIntBig GetFeatureCount(int bForce) override; + const char* GetMetadataItem( const char* pszName, + const char* pszDomain = "" ) override; + char** GetMetadata( const char* pszDomain = "" ) override; + + std::unique_ptr BuildDomain(const std::string& osDomainName, + int iFieldIndex) const override; +}; + +/************************************************************************/ +/* OGRFeatherDataset */ +/************************************************************************/ + +class OGRFeatherDataset final: public OGRArrowDataset +{ +public: + explicit OGRFeatherDataset(std::unique_ptr&& poMemoryPool); +}; + +/************************************************************************/ +/* OGRFeatherWriterLayer */ 
+/************************************************************************/ + +class OGRFeatherWriterLayer final: public OGRArrowWriterLayer + +{ + OGRFeatherWriterLayer(const OGRFeatherWriterLayer&) = delete; + OGRFeatherWriterLayer& operator= (const OGRFeatherWriterLayer&) = delete; + + bool m_bStreamFormat = false; + std::shared_ptr m_poFileWriter{}; + std::shared_ptr m_poFooterKeyValueMetadata{}; + + virtual bool IsFileWriterCreated() const override { return m_poFileWriter != nullptr; } + virtual void CreateWriter() override; + virtual void CloseFileWriter() override; + + virtual void CreateSchema() override; + virtual void DoSomethingBeforeFinalFlushGroup() override; + + virtual bool FlushGroup() override; + + virtual std::string GetDriverUCName() const override { return ARROW_DRIVER_NAME_UC; } + +public: + OGRFeatherWriterLayer( arrow::MemoryPool* poMemoryPool, + const std::shared_ptr& poOutputStream, + const char *pszLayerName ); + + ~OGRFeatherWriterLayer() override; + + bool SetOptions( const std::string& osFilename, + CSLConstList papszOptions, + OGRSpatialReference *poSpatialRef, + OGRwkbGeometryType eGType ); +}; + +/************************************************************************/ +/* OGRFeatherWriterDataset */ +/************************************************************************/ + +class OGRFeatherWriterDataset final: public GDALPamDataset +{ + const std::string m_osFilename{}; + std::unique_ptr m_poMemoryPool{}; + std::unique_ptr m_poLayer{}; + std::shared_ptr m_poOutputStream{}; + +public: + explicit OGRFeatherWriterDataset( + const char* pszFilename, + const std::shared_ptr& poOutputStream); + + arrow::MemoryPool* GetMemoryPool() const { return m_poMemoryPool.get(); } + + int GetLayerCount() override ; + OGRLayer* GetLayer(int idx) override; + int TestCapability(const char* pszCap) override; + std::vector GetFieldDomainNames(CSLConstList /*papszOptions*/ = nullptr) const override; + const OGRFieldDomain* GetFieldDomain(const 
std::string& name) const override; + bool AddFieldDomain(std::unique_ptr&& domain, + std::string& failureReason) override; +protected: + OGRLayer *ICreateLayer( const char *pszName, + OGRSpatialReference *poSpatialRef = nullptr, + OGRwkbGeometryType eGType = wkbUnknown, + char ** papszOptions = nullptr ) override; + +}; + +#endif // OGR_FEATHER_H diff --git a/ogr/ogrsf_frmts/arrow/ogrfeatherdataset.cpp b/ogr/ogrsf_frmts/arrow/ogrfeatherdataset.cpp new file mode 100644 index 000000000000..7868a6c4e7ec --- /dev/null +++ b/ogr/ogrsf_frmts/arrow/ogrfeatherdataset.cpp @@ -0,0 +1,40 @@ +/****************************************************************************** + * + * Project: Feather Translator + * Purpose: Implements OGRFeatherDriver. + * Author: Even Rouault, + * + ****************************************************************************** + * Copyright (c) 2022, Planet Labs + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ ****************************************************************************/ + +#include "ogr_feather.h" + +#include "../arrow_common/ograrrowdataset.hpp" + +/************************************************************************/ +/* OGRFeatherDataset() */ +/************************************************************************/ + +OGRFeatherDataset::OGRFeatherDataset(std::unique_ptr&& poMemoryPool): + OGRArrowDataset(std::move(poMemoryPool)) +{ +} diff --git a/ogr/ogrsf_frmts/arrow/ogrfeatherdriver.cpp b/ogr/ogrsf_frmts/arrow/ogrfeatherdriver.cpp new file mode 100644 index 000000000000..324f1fee714c --- /dev/null +++ b/ogr/ogrsf_frmts/arrow/ogrfeatherdriver.cpp @@ -0,0 +1,471 @@ +/****************************************************************************** + * + * Project: Feather Translator + * Purpose: Implements OGRFeatherDriver. + * Author: Even Rouault, + * + ****************************************************************************** + * Copyright (c) 2022, Planet Labs + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + ****************************************************************************/ + +#include "gdal_pam.h" +#include "ogrsf_frmts.h" + +#include + +#include "ogr_feather.h" +#include "../arrow_common/ograrrowrandomaccessfile.h" +#include "../arrow_common/ograrrowwritablefile.h" +#include "../arrow_common/ograrrowdataset.hpp" + +/************************************************************************/ +/* IsArrowIPCStream() */ +/************************************************************************/ + +static bool IsArrowIPCStream( GDALOpenInfo* poOpenInfo ) +{ + if( STARTS_WITH_CI(poOpenInfo->pszFilename, "ARROW_IPC_STREAM:") ) + return true; + + constexpr int CONTINUATION_SIZE = 4; // 0xFFFFFFFF + constexpr int METADATA_SIZE_SIZE = 4; + + // See https://arrow.apache.org/docs/format/Columnar.html#encapsulated-message-format + if( poOpenInfo->fpL != nullptr && + poOpenInfo->nHeaderBytes >= CONTINUATION_SIZE + METADATA_SIZE_SIZE && + memcmp(poOpenInfo->pabyHeader, "\xFF\xFF\xFF\xFF", CONTINUATION_SIZE) == 0 ) + { + const char* pszExt = CPLGetExtension(poOpenInfo->pszFilename); + if( EQUAL(pszExt, "arrows") || EQUAL(pszExt, "ipc") ) + return true; + + const uint32_t nMetadataSize = CPL_LSBUINT32PTR( + poOpenInfo->pabyHeader + CONTINUATION_SIZE); + if( strcmp(poOpenInfo->pszFilename, "/vsistdin/") == 0 ) + { + // Padding after metadata and before body is not necessarily present + // but the body must be at least 4 bytes + constexpr int PADDING_MAX_SIZE = 4; + + // /vsistdin/ cannot seek back beyond first MB + if( nMetadataSize > 1024 * 1024 - + (CONTINUATION_SIZE + METADATA_SIZE_SIZE + PADDING_MAX_SIZE) ) + { + return false; + } + const int nSizeToRead = CONTINUATION_SIZE + METADATA_SIZE_SIZE + + 
nMetadataSize + PADDING_MAX_SIZE; + if( !poOpenInfo->TryToIngest(nSizeToRead) ) + { + return false; + } + + const std::string osTmpFilename(CPLSPrintf("/vsimem/_arrow/%p", poOpenInfo)); + VSILFILE* fp = VSIFileFromMemBuffer( + osTmpFilename.c_str(), poOpenInfo->pabyHeader, nSizeToRead, false); + auto infile = std::make_shared(fp); + auto options = arrow::ipc::IpcReadOptions::Defaults(); + auto result = arrow::ipc::RecordBatchStreamReader::Open(infile, options); + CPLDebug("ARROW", "RecordBatchStreamReader::Open(): %s", + result.status().message().c_str()); + VSIUnlink(osTmpFilename.c_str()); + return result.ok(); + } + + VSIFSeekL(poOpenInfo->fpL, 0, SEEK_END); + const auto nFileSize = VSIFTellL(poOpenInfo->fpL); + VSIFSeekL(poOpenInfo->fpL, 0, SEEK_SET); + if( nMetadataSize > nFileSize - (CONTINUATION_SIZE + METADATA_SIZE_SIZE) ) + return false; + + // Do not give ownership of poOpenInfo->fpL to infile + auto infile = std::make_shared(poOpenInfo->fpL, false); + auto options = arrow::ipc::IpcReadOptions::Defaults(); + auto result = arrow::ipc::RecordBatchStreamReader::Open(infile, options); + VSIFSeekL(poOpenInfo->fpL, 0, SEEK_SET); + return result.ok(); + } + return false; +} + +/************************************************************************/ +/* IsArrowFileFormat() */ +/************************************************************************/ + +template constexpr int constexpr_length( const char (&) [N] ) +{ + return static_cast(N-1); +} + +static bool IsArrowFileFormat( GDALOpenInfo* poOpenInfo ) +{ + // See https://arrow.apache.org/docs/format/Columnar.html#ipc-file-format + bool bRet = false; + constexpr const char SIGNATURE[] = "ARROW1"; + constexpr int SIGNATURE_SIZE = constexpr_length(SIGNATURE); + static_assert(SIGNATURE_SIZE == 6, "SIGNATURE_SIZE == 6"); + constexpr int SIGNATURE_PLUS_PADDING = SIGNATURE_SIZE + 2; + constexpr int FOOTERSIZE_SIZE = 4; + if( poOpenInfo->fpL != nullptr && + poOpenInfo->nHeaderBytes >= SIGNATURE_PLUS_PADDING + 
FOOTERSIZE_SIZE + + SIGNATURE_SIZE && + memcmp(poOpenInfo->pabyHeader, SIGNATURE, SIGNATURE_SIZE) == 0 ) + { + VSIFSeekL(poOpenInfo->fpL, 0, SEEK_END); + const auto nFileSize = VSIFTellL(poOpenInfo->fpL); + VSIFSeekL(poOpenInfo->fpL, + nFileSize - (FOOTERSIZE_SIZE + SIGNATURE_SIZE), SEEK_SET); + uint32_t nFooterSize = 0; + static_assert(sizeof(nFooterSize) == FOOTERSIZE_SIZE, "sizeof(nFooterSize) == FOOTERSIZE_SIZE"); + VSIFReadL(&nFooterSize, 1, sizeof(nFooterSize), poOpenInfo->fpL); + CPL_LSBPTR32(&nFooterSize); + unsigned char abyTrailingBytes[SIGNATURE_SIZE] = {0}; + VSIFReadL(&abyTrailingBytes[0], 1, SIGNATURE_SIZE, poOpenInfo->fpL); + bRet = memcmp(abyTrailingBytes, SIGNATURE, SIGNATURE_SIZE) == 0 && + nFooterSize < nFileSize; + VSIFSeekL(poOpenInfo->fpL, 0, SEEK_SET); + } + return bRet; +} + +/************************************************************************/ +/* Identify() */ +/************************************************************************/ + +static int OGRFeatherDriverIdentify( GDALOpenInfo* poOpenInfo ) +{ + return IsArrowIPCStream(poOpenInfo) || IsArrowFileFormat(poOpenInfo); +} + +/************************************************************************/ +/* Open() */ +/************************************************************************/ + +static GDALDataset *OGRFeatherDriverOpen( GDALOpenInfo* poOpenInfo ) +{ + if( poOpenInfo->eAccess == GA_Update ) + { + return nullptr; + } + + const bool bIsStreamingFormat = IsArrowIPCStream(poOpenInfo); + if( !bIsStreamingFormat && + !IsArrowFileFormat(poOpenInfo) ) + { + return nullptr; + } + + std::shared_ptr infile; + if( STARTS_WITH_CI(poOpenInfo->pszFilename, "ARROW_IPC_STREAM:") ) + { + const std::string osFilename( + poOpenInfo->pszFilename + strlen("ARROW_IPC_STREAM:")); + VSILFILE* fp = VSIFOpenL(osFilename.c_str(), "rb"); + if( fp == nullptr ) + { + CPLError(CE_Failure, CPLE_FileIO, + "Cannot open %s", osFilename.c_str()); + return nullptr; + } + infile = std::make_shared(fp); + } + 
else if( STARTS_WITH(poOpenInfo->pszFilename, "/vsi") || + CPLTestBool(CPLGetConfigOption("OGR_ARROW_USE_VSI", "NO")) ) + { + VSILFILE* fp = poOpenInfo->fpL; + poOpenInfo->fpL = nullptr; + infile = std::make_shared(fp); + } + else + { + auto result = arrow::io::ReadableFile::Open(poOpenInfo->pszFilename); + if( !result.ok() ) + { + CPLError(CE_Failure, CPLE_AppDefined, + "ReadableFile::Open() failed with %s", + result.status().message().c_str()); + return nullptr; + } + infile = *result; + } + + auto poMemoryPool = arrow::MemoryPool::CreateDefault(); + auto options = arrow::ipc::IpcReadOptions::Defaults(); + options.memory_pool = poMemoryPool.get(); + + auto poDS = cpl::make_unique(std::move(poMemoryPool)); + if( bIsStreamingFormat ) + { + auto result = arrow::ipc::RecordBatchStreamReader::Open(infile, options); + if( !result.ok() ) + { + CPLError(CE_Failure, CPLE_AppDefined, + "RecordBatchStreamReader::Open() failed with %s", + result.status().message().c_str()); + return nullptr; + } + auto poRecordBatchStreamReader = *result; + const bool bSeekable = + !STARTS_WITH_CI(poOpenInfo->pszFilename, "ARROW_IPC_STREAM:") && + strcmp(poOpenInfo->pszFilename, "/vsistdin/") != 0; + std::string osLayername = CPLGetBasename(poOpenInfo->pszFilename); + if( osLayername.empty() ) + osLayername = "layer"; + auto poLayer = cpl::make_unique( + poDS.get(), + osLayername.c_str(), + infile, + bSeekable, + options, + poRecordBatchStreamReader); + poDS->SetLayer(std::move(poLayer)); + + // Pre-load field domains, as this depends on the first record batch + auto poLayerPtr = poDS->GetLayer(0); + const auto poFeatureDefn = poLayerPtr->GetLayerDefn(); + bool bHasReadBatch = false; + for( int i = 0; i < poFeatureDefn->GetFieldCount(); ++i ) + { + const auto poFieldDefn = poFeatureDefn->GetFieldDefn(i); + const auto& osDomainName = poFieldDefn->GetDomainName(); + if( !osDomainName.empty() ) + { + if( !bHasReadBatch ) + { + bHasReadBatch = true; + delete poLayerPtr->GetNextFeature(); + 
poLayerPtr->ResetReading(); + } + poDS->GetFieldDomain(osDomainName); + } + } + } + else + { + auto result = arrow::ipc::RecordBatchFileReader::Open(infile, options); + if( !result.ok() ) + { + CPLError(CE_Failure, CPLE_AppDefined, + "RecordBatchFileReader::Open() failed with %s", + result.status().message().c_str()); + return nullptr; + } + auto poRecordBatchReader = *result; + auto poLayer = cpl::make_unique( + poDS.get(), + CPLGetBasename(poOpenInfo->pszFilename), + poRecordBatchReader); + poDS->SetLayer(std::move(poLayer)); + } + return poDS.release(); +} + +/************************************************************************/ +/* Create() */ +/************************************************************************/ + +static GDALDataset* OGRFeatherDriverCreate(const char * pszName, + int nXSize, int nYSize, int nBands, + GDALDataType eType, + char ** /* papszOptions */ ) +{ + if( !(nXSize == 0 && nYSize == 0 && nBands == 0 && eType == GDT_Unknown) ) + return nullptr; + + std::shared_ptr out_file; + if( STARTS_WITH(pszName, "/vsi") || + CPLTestBool(CPLGetConfigOption("OGR_ARROW_USE_VSI", "YES")) ) + { + VSILFILE* fp = VSIFOpenL(pszName, "wb"); + if( fp == nullptr ) + { + CPLError(CE_Failure, CPLE_FileIO, + "Cannot create %s", pszName); + return nullptr; + } + out_file = std::make_shared(fp); + } + else + { + auto result = arrow::io::FileOutputStream::Open(pszName); + if( !result.ok() ) + { + CPLError(CE_Failure, CPLE_FileIO, + "Cannot create %s: %s", pszName, + result.status().message().c_str()); + return nullptr; + } + out_file = *result; + } + + return new OGRFeatherWriterDataset(pszName, out_file); +} + +/************************************************************************/ +/* OGRFeatherDriver() */ +/************************************************************************/ + +class OGRFeatherDriver final: public GDALDriver +{ + bool m_bMetadataInitialized = false; + void InitMetadata(); + +public: + const char* GetMetadataItem(const char* pszName, 
const char* pszDomain) override + { + if( EQUAL(pszName, GDAL_DS_LAYER_CREATIONOPTIONLIST) ) + { + InitMetadata(); + } + return GDALDriver::GetMetadataItem(pszName, pszDomain); + } + + char** GetMetadata(const char* pszDomain) override + { + InitMetadata(); + return GDALDriver::GetMetadata(pszDomain); + } +}; + +void OGRFeatherDriver::InitMetadata() +{ + if( m_bMetadataInitialized ) + return; + m_bMetadataInitialized = true; + + CPLXMLTreeCloser oTree(CPLCreateXMLNode( + nullptr, CXT_Element, "LayerCreationOptionList")); + + std::vector apszCompressionMethods; + bool bHasLZ4 = false; + for( const char* pszMethod: { "ZSTD", "LZ4" } ) + { + auto oResult = arrow::util::Codec::GetCompressionType( + CPLString(pszMethod).tolower()); + if( oResult.ok() && arrow::util::Codec::IsAvailable(*oResult) ) + { + if( EQUAL(pszMethod, "LZ4") ) + bHasLZ4 = true; + apszCompressionMethods.emplace_back(pszMethod); + } + } + + { + auto psOption = CPLCreateXMLNode(oTree.get(), CXT_Element, "Option"); + CPLAddXMLAttributeAndValue(psOption, "name", "FORMAT"); + CPLAddXMLAttributeAndValue(psOption, "type", "string-select"); + CPLAddXMLAttributeAndValue(psOption, "description", "File format variant"); + for( const char* pszEncoding : {"FILE", "STREAM"} ) + { + auto poValueNode = CPLCreateXMLNode(psOption, CXT_Element, "Value"); + CPLCreateXMLNode(poValueNode, CXT_Text, pszEncoding); + } + } + + { + auto psOption = CPLCreateXMLNode(oTree.get(), CXT_Element, "Option"); + CPLAddXMLAttributeAndValue(psOption, "name", "COMPRESSION"); + CPLAddXMLAttributeAndValue(psOption, "type", "string-select"); + CPLAddXMLAttributeAndValue(psOption, "description", "Compression method"); + CPLAddXMLAttributeAndValue(psOption, "default", bHasLZ4 ? 
"LZ4" : "NONE"); + { + auto poValueNode = CPLCreateXMLNode(psOption, CXT_Element, "Value"); + CPLAddXMLAttributeAndValue(poValueNode, "alias", "UNCOMPRESSED"); + CPLCreateXMLNode(poValueNode, CXT_Text, "NONE"); + } + for( const char* pszMethod: apszCompressionMethods ) + { + auto poValueNode = CPLCreateXMLNode(psOption, CXT_Element, "Value"); + CPLCreateXMLNode(poValueNode, CXT_Text, pszMethod); + } + } + + { + auto psOption = CPLCreateXMLNode(oTree.get(), CXT_Element, "Option"); + CPLAddXMLAttributeAndValue(psOption, "name", "GEOMETRY_ENCODING"); + CPLAddXMLAttributeAndValue(psOption, "type", "string-select"); + CPLAddXMLAttributeAndValue(psOption, "description", "Encoding of geometry columns"); + CPLAddXMLAttributeAndValue(psOption, "default", "GEOARROW"); + for( const char* pszEncoding : {"GEOARROW", "WKB", "WKT"} ) + { + auto poValueNode = CPLCreateXMLNode(psOption, CXT_Element, "Value"); + CPLCreateXMLNode(poValueNode, CXT_Text, pszEncoding); + } + } + + { + auto psOption = CPLCreateXMLNode(oTree.get(), CXT_Element, "Option"); + CPLAddXMLAttributeAndValue(psOption, "name", "BATCH_SIZE"); + CPLAddXMLAttributeAndValue(psOption, "type", "integer"); + CPLAddXMLAttributeAndValue(psOption, "description", "Maximum number of rows per batch"); + CPLAddXMLAttributeAndValue(psOption, "default", "65536"); + } + + { + auto psOption = CPLCreateXMLNode(oTree.get(), CXT_Element, "Option"); + CPLAddXMLAttributeAndValue(psOption, "name", "GEOMETRY_NAME"); + CPLAddXMLAttributeAndValue(psOption, "type", "string"); + CPLAddXMLAttributeAndValue(psOption, "description", "Name of geometry column"); + CPLAddXMLAttributeAndValue(psOption, "default", "geometry"); + } + + { + auto psOption = CPLCreateXMLNode(oTree.get(), CXT_Element, "Option"); + CPLAddXMLAttributeAndValue(psOption, "name", "FID"); + CPLAddXMLAttributeAndValue(psOption, "type", "string"); + CPLAddXMLAttributeAndValue(psOption, "description", "Name of the FID column to create"); + } + + char* pszXML = 
CPLSerializeXMLTree(oTree.get()); + GDALDriver::SetMetadataItem(GDAL_DS_LAYER_CREATIONOPTIONLIST, pszXML); + CPLFree(pszXML); +} + +/************************************************************************/ +/* RegisterOGRArrow() */ +/************************************************************************/ + +void RegisterOGRArrow() +{ + if( GDALGetDriverByName( "Arrow" ) != nullptr ) + return; + + auto poDriver = cpl::make_unique(); + + poDriver->SetDescription( "Arrow" ); + poDriver->SetMetadataItem( GDAL_DCAP_VECTOR, "YES" ); + poDriver->SetMetadataItem( GDAL_DMD_LONGNAME, "(Geo)Arrow IPC File Format / Stream" ); + poDriver->SetMetadataItem( GDAL_DMD_EXTENSIONS, "arrow feather arrows ipc" ); + poDriver->SetMetadataItem( GDAL_DMD_HELPTOPIC, "drivers/vector/feather.html" ); + poDriver->SetMetadataItem( GDAL_DCAP_VIRTUALIO, "YES" ); + + poDriver->SetMetadataItem( GDAL_DMD_CREATIONFIELDDATATYPES, + "Integer Integer64 Real String Date Time DateTime " + "Binary IntegerList Integer64List RealList StringList" ); + poDriver->SetMetadataItem( GDAL_DMD_CREATIONFIELDDATASUBTYPES, + "Boolean Int16 Float32 JSON UUID" ); + + poDriver->pfnOpen = OGRFeatherDriverOpen; + poDriver->pfnIdentify = OGRFeatherDriverIdentify; + poDriver->pfnCreate = OGRFeatherDriverCreate; + + GetGDALDriverManager()->RegisterDriver(poDriver.release()); +} diff --git a/ogr/ogrsf_frmts/arrow/ogrfeatherlayer.cpp b/ogr/ogrsf_frmts/arrow/ogrfeatherlayer.cpp new file mode 100644 index 000000000000..7bd96fc121d8 --- /dev/null +++ b/ogr/ogrsf_frmts/arrow/ogrfeatherlayer.cpp @@ -0,0 +1,816 @@ +/****************************************************************************** + * + * Project: Feather Translator + * Purpose: Implements OGRFeatherDriver. 
+ * Author: Even Rouault, + * + ****************************************************************************** + * Copyright (c) 2022, Planet Labs + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ ****************************************************************************/ + +#include "cpl_json.h" +#include "cpl_time.h" +#include "gdal_pam.h" +#include "ogrsf_frmts.h" +#include "ogr_p.h" + +#include +#include +#include +#include +#include + +#include "ogr_feather.h" + +#include "../arrow_common/ograrrowlayer.hpp" +#include "../arrow_common/ograrrowdataset.hpp" + +/************************************************************************/ +/* OGRFeatherLayer() */ +/************************************************************************/ + +OGRFeatherLayer::OGRFeatherLayer(OGRFeatherDataset* poDS, + const char* pszLayerName, + std::shared_ptr& poRecordBatchFileReader): + OGRArrowLayer(poDS, pszLayerName), + m_poDS(poDS), + m_poRecordBatchFileReader(poRecordBatchFileReader) +{ + EstablishFeatureDefn(); + CPLAssert( static_cast(m_aeGeomEncoding.size()) == m_poFeatureDefn->GetGeomFieldCount() ); +} + +/************************************************************************/ +/* OGRFeatherLayer() */ +/************************************************************************/ + +OGRFeatherLayer::OGRFeatherLayer(OGRFeatherDataset* poDS, + const char* pszLayerName, + std::shared_ptr poFile, + bool bSeekable, + const arrow::ipc::IpcReadOptions& oOptions, + std::shared_ptr& poRecordBatchStreamReader): + OGRArrowLayer(poDS, pszLayerName), + m_poDS(poDS), + m_poFile(poFile), + m_bSeekable(bSeekable), + m_oOptions(oOptions), + m_poRecordBatchReader(poRecordBatchStreamReader) +{ + EstablishFeatureDefn(); + CPLAssert( static_cast(m_aeGeomEncoding.size()) == m_poFeatureDefn->GetGeomFieldCount() ); +} + +/************************************************************************/ +/* LoadGeoMetadata() */ +/************************************************************************/ + +void OGRFeatherLayer::LoadGeoMetadata(const arrow::KeyValueMetadata* kv_metadata, + const std::string& key) +{ + if( kv_metadata && kv_metadata->Contains(key) ) + { + auto geo = 
kv_metadata->Get(key); + if( geo.ok() ) + { + CPLJSONDocument oDoc; + if( oDoc.LoadMemory(*geo) ) + { + auto oRoot = oDoc.GetRoot(); + const auto osVersion = oRoot.GetString("schema_version"); + if( key != GDAL_GEO_FOOTER_KEY && osVersion != "0.1.0" ) + { + CPLDebug("FEATHER", + "schema_version = %s not explicitly handled by the driver", + osVersion.c_str()); + } + auto oColumns = oRoot.GetObj("columns"); + if( oColumns.IsValid() ) + { + for( const auto oColumn: oColumns.GetChildren() ) + { + m_oMapGeometryColumns[oColumn.GetName()] = oColumn; + } + } + } + else + { + CPLError(CE_Warning, CPLE_AppDefined, + "Cannot parse 'geo' metadata"); + } + } + } +} + +/************************************************************************/ +/* EstablishFeatureDefn() */ +/************************************************************************/ + +void OGRFeatherLayer::EstablishFeatureDefn() +{ + m_poSchema = m_poRecordBatchFileReader ? + m_poRecordBatchFileReader->schema() : m_poRecordBatchReader->schema(); + const auto& kv_metadata = m_poSchema->metadata(); + +#ifdef DEBUG + if( kv_metadata ) + { + for(const auto& keyValue: kv_metadata->sorted_pairs() ) + { + CPLDebug("FEATHER", "%s = %s", + keyValue.first.c_str(), + keyValue.second.c_str()); + } + } +#endif + + auto poFooterMetadata = m_poRecordBatchFileReader ? 
+ m_poRecordBatchFileReader->metadata() : nullptr; + if( poFooterMetadata && poFooterMetadata->Contains(GDAL_GEO_FOOTER_KEY) && + CPLTestBool(CPLGetConfigOption("OGR_ARROW_READ_GDAL_FOOTER", "YES")) ) + { + LoadGeoMetadata(poFooterMetadata.get(), GDAL_GEO_FOOTER_KEY); + } + else + { + LoadGeoMetadata(kv_metadata.get(), "geo"); + } + const auto oMapFieldNameToGDALSchemaFieldDefn = LoadGDALMetadata(kv_metadata.get()); + + const auto fields = m_poSchema->fields(); + for( int i = 0; i < m_poSchema->num_fields(); ++i ) + { + const auto& field = fields[i]; + const auto& fieldName = field->name(); + + const auto& field_kv_metadata = field->metadata(); + std::string osExtensionName; + if( field_kv_metadata ) + { + auto extension_name = field_kv_metadata->Get("ARROW:extension:name"); /* read the extension name from the field's own metadata: the schema-level kv_metadata is a different object and may be null */ + if( extension_name.ok() ) + { + osExtensionName = *extension_name; + } +#ifdef DEBUG + CPLDebug("FEATHER", "Metadata field %s:", fieldName.c_str()); + for(const auto& keyValue: field_kv_metadata->sorted_pairs() ) + { + CPLDebug("FEATHER", " %s = %s", + keyValue.first.c_str(), + keyValue.second.c_str()); + } +#endif + } + + if( !m_osFIDColumn.empty() && + fieldName == m_osFIDColumn ) + { + m_iFIDArrowColumn = i; + continue; + } + + bool bRegularField = true; + auto oIter = m_oMapGeometryColumns.find(fieldName); + if( oIter != m_oMapGeometryColumns.end() || + STARTS_WITH(osExtensionName.c_str(), "geoarrow.") ) + { + CPLJSONObject oJSONDef; + if( oIter != m_oMapGeometryColumns.end() ) + oJSONDef = oIter->second; + auto osEncoding = oJSONDef.GetString("encoding"); + if( osEncoding.empty() && !osExtensionName.empty() ) + osEncoding = osExtensionName; + + OGRwkbGeometryType eGeomType = wkbUnknown; + auto eGeomEncoding = OGRArrowGeomEncoding::WKB; + if( IsValidGeometryEncoding(field, osEncoding, eGeomType, eGeomEncoding) ) + { + bRegularField = false; + OGRGeomFieldDefn oField(fieldName.c_str(), wkbUnknown); + + const auto osWKT = oJSONDef.GetString("crs"); + if( osWKT.empty() ) + { +#if 0 + 
CPLError(CE_Warning, CPLE_AppDefined, + "Missing required 'crs' field for geometry column %s", + fieldName.c_str()); +#endif + } + else + { + OGRSpatialReference* poSRS = new OGRSpatialReference(); + poSRS->SetAxisMappingStrategy(OAMS_TRADITIONAL_GIS_ORDER); + if( poSRS->importFromWkt(osWKT.c_str()) == OGRERR_NONE ) + oField.SetSpatialRef(poSRS); + poSRS->Release(); + } + + // m_aeGeomEncoding be filled before calling ComputeGeometryColumnType() + m_aeGeomEncoding.push_back(eGeomEncoding); + if( eGeomType == wkbUnknown ) + { + auto osType = oJSONDef.GetString("geometry_type"); + if( osType.empty() ) + osType = oJSONDef.GetString("gdal:geometry_type"); + if( m_bSeekable && + osType.empty() && CPLTestBool(CPLGetConfigOption( + "OGR_ARROW_COMPUTE_GEOMETRY_TYPE", "YES")) ) + { + eGeomType = ComputeGeometryColumnType( + m_poFeatureDefn->GetGeomFieldCount(), i); + if( m_poRecordBatchReader ) + ResetRecordBatchReader(); + } + else + eGeomType = GetGeometryTypeFromString(osType); + } + + oField.SetType(eGeomType); + oField.SetNullable(field->nullable()); + m_poFeatureDefn->AddGeomFieldDefn(&oField); + m_anMapGeomFieldIndexToArrowColumn.push_back(i); + } + } + + if( bRegularField ) + { + CreateFieldFromSchema(field, {i}, + oMapFieldNameToGDALSchemaFieldDefn); + } + } + + CPLAssert( static_cast(m_anMapFieldIndexToArrowColumn.size()) == m_poFeatureDefn->GetFieldCount() ); + CPLAssert( static_cast(m_anMapGeomFieldIndexToArrowColumn.size()) == m_poFeatureDefn->GetGeomFieldCount() ); +} + +/************************************************************************/ +/* ResetRecordBatchReader() */ +/************************************************************************/ + +bool OGRFeatherLayer::ResetRecordBatchReader() +{ + const auto nPos = *(m_poFile->Tell()); + m_poFile->Seek(0); + auto result = arrow::ipc::RecordBatchStreamReader::Open(m_poFile, m_oOptions); + if( !result.ok() ) + { + CPLError(CE_Failure, CPLE_AppDefined, + "RecordBatchStreamReader::Open() failed with %s", + 
result.status().message().c_str()); + m_poFile->Seek(nPos); + return false; + } + else + { + m_poRecordBatchReader = *result; + return true; + } +} + +/************************************************************************/ +/* ComputeGeometryColumnType() */ +/************************************************************************/ + +OGRwkbGeometryType OGRFeatherLayer::ComputeGeometryColumnType(int iGeomCol, + int iCol) const +{ + // Compute type of geometry column by iterating over each geometry, and + // looking at the WKB geometry type in the first 5 bytes of each geometry. + + OGRwkbGeometryType eGeomType = wkbNone; + + if( m_poRecordBatchReader != nullptr ) + { + std::shared_ptr poBatch; + while( true ) + { + auto status = m_poRecordBatchReader->ReadNext(&poBatch); + if( !status.ok() ) + { + CPLError(CE_Failure, CPLE_AppDefined, + "ReadNext() failed: %s", + status.message().c_str()); + break; + } + else if( !poBatch ) + break; + eGeomType = ComputeGeometryColumnTypeProcessBatch(poBatch, + iGeomCol, iCol, + eGeomType); + if( eGeomType == wkbUnknown ) + break; + } + } + else + { + for(int iBatch = 0; iBatch < m_poRecordBatchFileReader->num_record_batches(); ++iBatch ) + { + auto result = m_poRecordBatchFileReader->ReadRecordBatch(iBatch); + if( !result.ok() ) + { + CPLError(CE_Failure, CPLE_AppDefined, + "ReadRecordBatch() failed: %s", + result.status().message().c_str()); + break; + } + eGeomType = ComputeGeometryColumnTypeProcessBatch(*result, + iGeomCol, iCol, + eGeomType); + if( eGeomType == wkbUnknown ) + break; + } + } + + return eGeomType == wkbNone ? 
wkbUnknown : eGeomType; +} + +/************************************************************************/ +/* CreateFieldFromSchema() */ +/************************************************************************/ + +void OGRFeatherLayer::CreateFieldFromSchema( + const std::shared_ptr& field, + const std::vector& path, + const std::map>& oMapFieldNameToGDALSchemaFieldDefn) +{ + OGRFieldDefn oField(field->name().c_str(), OFTString); + OGRFieldType eType = OFTString; + OGRFieldSubType eSubType = OFSTNone; + bool bTypeOK = true; + + auto type = field->type(); + if( type->id() == arrow::Type::DICTIONARY && path.size() == 1 ) + { + const auto dictionaryType = std::static_pointer_cast(field->type()); + const auto indexType = dictionaryType->index_type(); + if( dictionaryType->value_type()->id() == arrow::Type::STRING && + IsIntegerArrowType(indexType->id()) ) + { + std::string osDomainName(field->name() + "Domain"); + m_poDS->RegisterDomainName(osDomainName, m_poFeatureDefn->GetFieldCount()); + oField.SetDomainName(osDomainName); + type = indexType; + } + else + { + bTypeOK = false; + } + } + + if( type->id() == arrow::Type::STRUCT ) + { + const auto subfields = field->Flatten(); + auto newpath = path; + newpath.push_back(0); + for( int j = 0; j < static_cast(subfields.size()); j++ ) + { + const auto& subfield = subfields[j]; + newpath.back() = j; + CreateFieldFromSchema(subfield, + newpath, oMapFieldNameToGDALSchemaFieldDefn); + } + } + else if( bTypeOK ) + { + MapArrowTypeToOGR(type, field, oField, eType, eSubType, + path, oMapFieldNameToGDALSchemaFieldDefn); + } +}
+
+/************************************************************************/
+/*                            BuildDomain()                             */
+/************************************************************************/
+
+// Build the field domain named osDomainName for the OGR field iFieldIndex.
+//
+// The Arrow column is the first entry of the field's column path in
+// m_anMapFieldIndexToArrowColumn and is expected to be of DICTIONARY type.
+// In stream mode the currently loaded batch (m_poBatch) is used; in file
+// mode record batch 0 is read. The actual construction is delegated to
+// BuildDomainFromBatch().
+//
+// Returns nullptr when no batch is available or when reading batch 0 fails
+// (a CE_Failure error is emitted in the latter case).
+std::unique_ptr OGRFeatherLayer::BuildDomain(const std::string& osDomainName,
+                                             int iFieldIndex) const
+{
+    const int iArrowCol = m_anMapFieldIndexToArrowColumn[iFieldIndex][0];
+    CPLAssert( m_poSchema->fields()[iArrowCol]->type()->id() == arrow::Type::DICTIONARY );
+
+    if( m_poRecordBatchReader )
+    {
+        if( m_poBatch )
+        {
+            return BuildDomainFromBatch(osDomainName, m_poBatch, iArrowCol);
+        }
+    }
+    else if( m_poRecordBatchFileReader )
+    {
+        auto result = m_poRecordBatchFileReader->ReadRecordBatch(0);
+        if( !result.ok() )
+        {
+            CPLError(CE_Failure, CPLE_AppDefined,
+                     "ReadRecordBatch() failed: %s",
+                     result.status().message().c_str());
+            // Dereferencing a failed arrow::Result aborts the process:
+            // stop here instead of falling through to *result.
+            return nullptr;
+        }
+        auto poBatch = *result;
+        if( poBatch )
+        {
+            return BuildDomainFromBatch(osDomainName, poBatch, iArrowCol);
+        }
+    }
+
+    return nullptr;
+}
+
+/************************************************************************/ +/* ResetReading() */ +/************************************************************************/ + +void OGRFeatherLayer::ResetReading() +{ + if( m_poRecordBatchReader != nullptr && m_iRecordBatch > 0 ) + { + if( m_iRecordBatch == 1 && m_poBatchIdx1 ) + { + // do nothing + } + else + { + m_bResetRecordBatchReaderAsked = true; + } + } + OGRArrowLayer::ResetReading(); +} + +/************************************************************************/ +/* ReadNextBatch() */ +/************************************************************************/ + +bool OGRFeatherLayer::ReadNextBatch() +{ + if( m_poRecordBatchFileReader == nullptr ) + { + return ReadNextBatchStream(); + } + else + { + return ReadNextBatchFile(); + } +} + +/************************************************************************/ +/* ReadNextBatchFile() */ +/************************************************************************/ + +bool OGRFeatherLayer::ReadNextBatchFile() +{ + ++m_iRecordBatch; + if( m_iRecordBatch == m_poRecordBatchFileReader->num_record_batches() ) + { + if( m_iRecordBatch == 1 ) + m_iRecordBatch = 0; + else + m_poBatch.reset(); + return false; + } + + m_nIdxInBatch = 0; + + auto result = m_poRecordBatchFileReader->ReadRecordBatch(m_iRecordBatch); + if( !result.ok() ) + { + CPLError(CE_Failure, CPLE_AppDefined, +
"ReadRecordBatch() failed: %s", + result.status().message().c_str()); + m_poBatch.reset(); + return false; + } + m_poBatch = *result; + + return true; +} + +/************************************************************************/ +/* ReadNextBatchStream() */ +/************************************************************************/ + +bool OGRFeatherLayer::ReadNextBatchStream() +{ + m_nIdxInBatch = 0; + + if( m_iRecordBatch == 0 && m_poBatchIdx0 ) + { + m_poBatch = m_poBatchIdx0; + m_iRecordBatch = 1; + return true; + } + + if( m_iRecordBatch == 1 && m_poBatchIdx1 ) + { + m_poBatch = m_poBatchIdx1; + m_iRecordBatch = 2; + return true; + } + + if( m_bSingleBatch ) + { + CPLAssert( m_iRecordBatch == 0); + CPLAssert( m_poBatch != nullptr); + return false; + } + + if( m_bResetRecordBatchReaderAsked ) + { + if( !m_bSeekable ) + { + CPLError(CE_Failure, CPLE_NotSupported, + "Attempting to rewind non-seekable stream"); + return false; + } + if( !ResetRecordBatchReader() ) + return false; + m_bResetRecordBatchReaderAsked = false; + } + + CPLAssert(m_poRecordBatchReader); + + ++m_iRecordBatch; + + std::shared_ptr poNextBatch; + auto status = m_poRecordBatchReader->ReadNext(&poNextBatch); + if( !status.ok() ) + { + CPLError(CE_Failure, CPLE_AppDefined, + "ReadNext() failed: %s", + status.message().c_str()); + poNextBatch.reset(); + } + if( poNextBatch == nullptr ) + { + if( m_iRecordBatch == 1 ) + { + m_iRecordBatch = 0; + m_bSingleBatch = true; + } + else + m_poBatch.reset(); + return false; + } + m_poBatch = std::move(poNextBatch); + + return true; +}
+
+/************************************************************************/
+/*                     TryToCacheFirstTwoBatches()                      */
+/************************************************************************/
+
+// Try to keep the first two record batches of a stream in memory.
+//
+// Only acts in stream mode (m_poRecordBatchReader set), when reading has
+// not progressed past the first batch (m_iRecordBatch <= 0), the stream is
+// not already known to hold a single batch, and nothing is cached yet.
+// When a second batch can be read, the first one is stored in
+// m_poBatchIdx0, the second one in m_poBatchIdx1, and m_poBatch is set
+// back to the first batch. The layer is reset before returning.
+void OGRFeatherLayer::TryToCacheFirstTwoBatches()
+{
+    if( m_poRecordBatchReader != nullptr && m_iRecordBatch <= 0 &&
+        !m_bSingleBatch && m_poBatchIdx0 == nullptr )
+    {
+        ResetReading();
+        if( !m_poBatch )
+        {
+            CPL_IGNORE_RET_VAL(ReadNextBatchStream());
+        }
+        if( m_poBatch )
+        {
+            auto poBatchIdx0 = m_poBatch;
+            if( ReadNextBatchStream() )
+            {
+                // ReadNextBatchStream() zeroes m_nIdxInBatch on entry, so
+                // an assertion on that member can never hold. What is
+                // expected here is that the second batch has just been
+                // read, which is tracked by the batch counter.
+                // NOTE(review): the value 1 assumes that
+                // OGRArrowLayer::ResetReading() leaves m_iRecordBatch at 0
+                // (batch kept) or -1 (batch dropped) - confirm.
+                CPLAssert(m_iRecordBatch == 1);
+                m_poBatchIdx0 = poBatchIdx0;
+                m_poBatchIdx1 = m_poBatch;
+                m_poBatch = poBatchIdx0;
+                ResetReading();
+            }
+            ResetReading();
+        }
+    }
+}
+
+/************************************************************************/ +/* GetFeatureCount() */ +/************************************************************************/ + +GIntBig OGRFeatherLayer::GetFeatureCount(int bForce) +{ + if( m_poRecordBatchFileReader != nullptr && + m_poAttrQuery == nullptr && m_poFilterGeom == nullptr ) + { + auto result = m_poRecordBatchFileReader->CountRows(); + if( result.ok() ) + return *result; + } + else if( m_poRecordBatchReader != nullptr ) + { + if( !m_bSeekable && !bForce ) + { + if( m_poAttrQuery == nullptr && m_poFilterGeom == nullptr ) + { + TryToCacheFirstTwoBatches(); + } + + if( !m_bSingleBatch ) + { + CPLError(CE_Failure, CPLE_AppDefined, + "GetFeatureCount() cannot be run in non-forced mode on " + "a non-seekable file made of several batches"); + return -1; + } + } + + if( m_poAttrQuery == nullptr && m_poFilterGeom == nullptr ) + { + GIntBig nFeatures = 0; + ResetReading(); + if( !m_poBatch ) + ReadNextBatchStream(); + while( m_poBatch ) + { + nFeatures += m_poBatch->num_rows(); + if( !ReadNextBatchStream() ) + break; + } + ResetReading(); + return nFeatures; + } + } + return OGRLayer::GetFeatureCount(bForce); +} + +/************************************************************************/ +/* CanRunNonForcedGetExtent() */ +/************************************************************************/ + +bool OGRFeatherLayer::CanRunNonForcedGetExtent() +{ + if( m_bSeekable ) + return true; + TryToCacheFirstTwoBatches(); + if( !m_bSingleBatch ) + { + CPLError(CE_Failure, CPLE_AppDefined, + "GetExtent() cannot be run in non-forced mode on " + "a non-seekable file made of several batches"); + return false; + } +
return true; +} + +/************************************************************************/ +/* TestCapability() */ +/************************************************************************/ + +int OGRFeatherLayer::TestCapability(const char* pszCap) +{ + if( EQUAL(pszCap, OLCFastFeatureCount) ) + { + return m_bSeekable && + m_poAttrQuery == nullptr && m_poFilterGeom == nullptr; + } + + if( EQUAL(pszCap, OLCFastGetExtent) ) + { + for(int i = 0; i < m_poFeatureDefn->GetGeomFieldCount(); i++ ) + { + auto oIter = m_oMapGeometryColumns.find( + m_poFeatureDefn->GetGeomFieldDefn(i)->GetNameRef() ); + if( oIter == m_oMapGeometryColumns.end() ) + { + return false; + } + const auto& oJSONDef = oIter->second; + const auto oBBox = oJSONDef.GetArray("bbox"); + if( !(oBBox.IsValid() && oBBox.Size() == 4) ) + { + return false; + } + } + return true; + } + + if( EQUAL(pszCap, OLCStringsAsUTF8) ) + return true; + + if( EQUAL(pszCap, OLCMeasuredGeometries) ) + return true; + + return false; +}
+
+/************************************************************************/
+/*                          GetMetadataItem()                           */
+/************************************************************************/
+
+// Return a metadata item, with special handling of three Arrow domains:
+//  - "_ARROW_": "FORMAT" gives "FILE" or "STREAM"; in file mode,
+//    "NUM_RECORD_BATCHES" gives the number of record batches and
+//    "RECORD_BATCHES[i].NUM_ROWS" the row count of batch i.
+//  - "_ARROW_METADATA_": key/value metadata attached to the schema.
+//  - "_ARROW_FOOTER_METADATA_" (file mode only): key/value metadata
+//    returned by the file reader's metadata().
+// Unknown items in those domains yield nullptr; any other domain is
+// forwarded to OGRLayer::GetMetadataItem().
+const char* OGRFeatherLayer::GetMetadataItem( const char* pszName,
+                                              const char* pszDomain )
+{
+    // Mostly for unit test purposes
+    if( pszDomain != nullptr && EQUAL(pszDomain, "_ARROW_") )
+    {
+        if( EQUAL(pszName, "FORMAT") )
+        {
+            return m_poRecordBatchFileReader ? "FILE": "STREAM";
+        }
+        if( m_poRecordBatchFileReader != nullptr )
+        {
+            int iBatch = -1;
+            if( EQUAL(pszName, "NUM_RECORD_BATCHES") )
+            {
+                return CPLSPrintf("%d", m_poRecordBatchFileReader->num_record_batches());
+            }
+            else if( sscanf(pszName, "RECORD_BATCHES[%d]", &iBatch) == 1 &&
+                     strstr(pszName, ".NUM_ROWS") )
+            {
+                // The index comes straight from the caller-supplied item
+                // name: refuse anything outside the valid batch range
+                // instead of handing it to the Arrow reader.
+                if( iBatch < 0 ||
+                    iBatch >= m_poRecordBatchFileReader->num_record_batches() )
+                {
+                    return nullptr;
+                }
+                auto result = m_poRecordBatchFileReader->ReadRecordBatch(iBatch);
+                if( !result.ok() )
+                {
+                    return nullptr;
+                }
+                return CPLSPrintf("%" PRId64, (*result)->num_rows());
+            }
+        }
+        return nullptr;
+    }
+    if( pszDomain != nullptr && EQUAL(pszDomain, "_ARROW_METADATA_") )
+    {
+        const auto kv_metadata = (m_poRecordBatchFileReader ?
+            m_poRecordBatchFileReader->schema() : m_poRecordBatchReader->schema())->metadata();
+        if( kv_metadata && kv_metadata->Contains(pszName) )
+        {
+            auto metadataItem = kv_metadata->Get(pszName);
+            if( metadataItem.ok() )
+            {
+                return CPLSPrintf("%s", metadataItem->c_str());
+            }
+        }
+        return nullptr;
+    }
+    if( m_poRecordBatchFileReader != nullptr &&
+        pszDomain != nullptr && EQUAL(pszDomain, "_ARROW_FOOTER_METADATA_") )
+    {
+        const auto kv_metadata = m_poRecordBatchFileReader->metadata();
+        if( kv_metadata && kv_metadata->Contains(pszName) )
+        {
+            auto metadataItem = kv_metadata->Get(pszName);
+            if( metadataItem.ok() )
+            {
+                return CPLSPrintf("%s", metadataItem->c_str());
+            }
+        }
+        return nullptr;
+    }
+    return OGRLayer::GetMetadataItem(pszName, pszDomain);
+}
+
+/************************************************************************/ +/* GetMetadata() */ +/************************************************************************/ + +char** OGRFeatherLayer::GetMetadata( const char* pszDomain ) +{ + // Mostly for unit test purposes + if( pszDomain != nullptr && EQUAL(pszDomain, "_ARROW_METADATA_") ) + { + m_aosFeatherMetadata.Clear(); + const auto kv_metadata = (m_poRecordBatchFileReader ?
+ m_poRecordBatchFileReader->schema() : m_poRecordBatchReader->schema())->metadata(); + if( kv_metadata ) + { + for( const auto& kv: kv_metadata->sorted_pairs() ) + { + m_aosFeatherMetadata.SetNameValue(kv.first.c_str(), kv.second.c_str()); + } + } + return m_aosFeatherMetadata.List(); + } + if( m_poRecordBatchFileReader != nullptr && + pszDomain != nullptr && EQUAL(pszDomain, "_ARROW_FOOTER_METADATA_") ) + { + m_aosFeatherMetadata.Clear(); + const auto kv_metadata = m_poRecordBatchFileReader->metadata(); + if( kv_metadata ) + { + for( const auto& kv: kv_metadata->sorted_pairs() ) + { + m_aosFeatherMetadata.SetNameValue(kv.first.c_str(), kv.second.c_str()); + } + } + return m_aosFeatherMetadata.List(); + } + return OGRLayer::GetMetadata(pszDomain); +} diff --git a/ogr/ogrsf_frmts/arrow/ogrfeatherwriterdataset.cpp b/ogr/ogrsf_frmts/arrow/ogrfeatherwriterdataset.cpp new file mode 100644 index 000000000000..a1ca7666a4f0 --- /dev/null +++ b/ogr/ogrsf_frmts/arrow/ogrfeatherwriterdataset.cpp @@ -0,0 +1,134 @@ +/****************************************************************************** + * + * Project: Feather Translator + * Purpose: Implements OGRFeatherDriver. + * Author: Even Rouault, + * + ****************************************************************************** + * Copyright (c) 2022, Planet Labs + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + ****************************************************************************/ + +#include "ogr_feather.h" + +#include "../arrow_common/ograrrowwriterlayer.hpp" + +/************************************************************************/ +/* OGRFeatherWriterDataset() */ +/************************************************************************/ + +OGRFeatherWriterDataset::OGRFeatherWriterDataset( + const char* pszFilename, + const std::shared_ptr& poOutputStream): + m_osFilename(pszFilename), + m_poMemoryPool(arrow::MemoryPool::CreateDefault()), + m_poOutputStream(poOutputStream) +{ +} + +/************************************************************************/ +/* GetLayerCount() */ +/************************************************************************/ + +int OGRFeatherWriterDataset::GetLayerCount() +{ + return m_poLayer ? 1 : 0; +} + +/************************************************************************/ +/* GetLayer() */ +/************************************************************************/ + +OGRLayer* OGRFeatherWriterDataset::GetLayer(int idx) +{ + return idx == 0 ? 
m_poLayer.get() : nullptr; +} + +/************************************************************************/ +/* TestCapability() */ +/************************************************************************/ + +int OGRFeatherWriterDataset::TestCapability(const char* pszCap) +{ + if( EQUAL(pszCap, ODsCCreateLayer) ) + return m_poLayer == nullptr; + if( EQUAL(pszCap, ODsCAddFieldDomain) ) + return m_poLayer != nullptr; + return false; +} + +/************************************************************************/ +/* ICreateLayer() */ +/************************************************************************/ + +OGRLayer* OGRFeatherWriterDataset::ICreateLayer( const char *pszName, + OGRSpatialReference *poSpatialRef, + OGRwkbGeometryType eGType, + char ** papszOptions ) +{ + if( m_poLayer ) + { + CPLError(CE_Failure, CPLE_NotSupported, + "Can write only one layer in a Feather file"); + return nullptr; + } + m_poLayer = cpl::make_unique(m_poMemoryPool.get(), + m_poOutputStream, + pszName); + if( !m_poLayer->SetOptions(m_osFilename, papszOptions, poSpatialRef, eGType) ) + { + m_poLayer.reset(); + return nullptr; + } + return m_poLayer.get(); +} + +/************************************************************************/ +/* AddFieldDomain() */ +/************************************************************************/ + +bool OGRFeatherWriterDataset::AddFieldDomain(std::unique_ptr&& domain, + std::string& failureReason) +{ + if( m_poLayer == nullptr ) + { + failureReason = "Layer must be created"; + return false; + } + return m_poLayer->AddFieldDomain(std::move(domain), failureReason); +} + +/************************************************************************/ +/* GetFieldDomainNames() */ +/************************************************************************/ + +std::vector OGRFeatherWriterDataset::GetFieldDomainNames(CSLConstList) const +{ + return m_poLayer ? 
m_poLayer->GetFieldDomainNames() : std::vector(); +} + +/************************************************************************/ +/* GetFieldDomain() */ +/************************************************************************/ + +const OGRFieldDomain* OGRFeatherWriterDataset::GetFieldDomain(const std::string& name) const +{ + return m_poLayer ? m_poLayer->GetFieldDomain(name): nullptr; +} diff --git a/ogr/ogrsf_frmts/arrow/ogrfeatherwriterlayer.cpp b/ogr/ogrsf_frmts/arrow/ogrfeatherwriterlayer.cpp new file mode 100644 index 000000000000..c3375f8010f8 --- /dev/null +++ b/ogr/ogrsf_frmts/arrow/ogrfeatherwriterlayer.cpp @@ -0,0 +1,427 @@ +/****************************************************************************** + * + * Project: Feather Translator + * Purpose: Implements OGRFeatherDriver. + * Author: Even Rouault, + * + ****************************************************************************** + * Copyright (c) 2022, Planet Labs + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + ****************************************************************************/ + +#include "ogr_feather.h" + +#include "../arrow_common/ograrrowwriterlayer.hpp" + +/************************************************************************/ +/* OGRFeatherWriterLayer() */ +/************************************************************************/ + +OGRFeatherWriterLayer::OGRFeatherWriterLayer( + arrow::MemoryPool* poMemoryPool, + const std::shared_ptr& poOutputStream, + const char *pszLayerName): + OGRArrowWriterLayer(poMemoryPool, poOutputStream, pszLayerName) +{ + m_bWriteFieldArrowExtensionName = true; +} + +/************************************************************************/ +/* ~OGRFeatherWriterLayer() */ +/************************************************************************/ + +OGRFeatherWriterLayer::~OGRFeatherWriterLayer() +{ + if( m_bInitializationOK ) + FinalizeWriting(); +} + +/************************************************************************/ +/* SetOptions() */ +/************************************************************************/ + +bool OGRFeatherWriterLayer::SetOptions(const std::string& osFilename, + CSLConstList papszOptions, + OGRSpatialReference *poSpatialRef, + OGRwkbGeometryType eGType) +{ + const char* pszDefaultFormat = + (EQUAL(CPLGetExtension(osFilename.c_str()), "arrows") || + STARTS_WITH_CI(osFilename.c_str(), "/vsistdout")) ? 
"STREAM" : "FILE"; + m_bStreamFormat = EQUAL( + CSLFetchNameValueDef(papszOptions, "FORMAT", pszDefaultFormat), "STREAM"); + + const char* pszGeomEncoding = CSLFetchNameValue(papszOptions, "GEOMETRY_ENCODING"); + m_eGeomEncoding = OGRArrowGeomEncoding::GEOARROW_GENERIC; + if( pszGeomEncoding ) + { + if( EQUAL(pszGeomEncoding, "WKB") ) + m_eGeomEncoding = OGRArrowGeomEncoding::WKB; + else if( EQUAL(pszGeomEncoding, "WKT") ) + m_eGeomEncoding = OGRArrowGeomEncoding::WKT; + else if( EQUAL(pszGeomEncoding, "GEOARROW") ) + m_eGeomEncoding = OGRArrowGeomEncoding::GEOARROW_GENERIC; + else + { + CPLError(CE_Failure, CPLE_NotSupported, + "Unsupported GEOMETRY_ENCODING = %s", + pszGeomEncoding); + return false; + } + } + + if( eGType != wkbNone ) + { + if( !IsSupportedGeometryType(eGType) ) + { + return false; + } + + if( poSpatialRef == nullptr ) + { + CPLError(CE_Warning, CPLE_AppDefined, + "Geometry column should have an associated CRS"); + } + + m_poFeatureDefn->SetGeomType(eGType); + auto eGeomEncoding = m_eGeomEncoding; + if( eGeomEncoding == OGRArrowGeomEncoding::GEOARROW_GENERIC ) + { + eGeomEncoding = GetPreciseArrowGeomEncoding(eGType); + if( eGeomEncoding == OGRArrowGeomEncoding::GEOARROW_GENERIC ) + return false; + } + m_aeGeomEncoding.push_back(eGeomEncoding); + m_poFeatureDefn->GetGeomFieldDefn(0)->SetName( + CSLFetchNameValueDef(papszOptions, "GEOMETRY_NAME", "geometry")); + if( poSpatialRef ) + { + auto poSRS = poSpatialRef->Clone(); + m_poFeatureDefn->GetGeomFieldDefn(0)->SetSpatialRef(poSRS); + poSRS->Release(); + } + } + + m_osFIDColumn = CSLFetchNameValueDef(papszOptions, "FID", ""); + + const char* pszCompression = CSLFetchNameValue( + papszOptions, "COMPRESSION"); + if( pszCompression == nullptr ) + { + auto oResult = arrow::util::Codec::GetCompressionType("lz4"); + if( oResult.ok() && arrow::util::Codec::IsAvailable(*oResult) ) + { + pszCompression = "LZ4"; + } + else + { + pszCompression = "NONE"; + } + } + + if( EQUAL(pszCompression, "NONE") ) + 
pszCompression = "UNCOMPRESSED"; + auto oResult = arrow::util::Codec::GetCompressionType( + CPLString(pszCompression).tolower()); + if( !oResult.ok() ) + { + CPLError(CE_Failure, CPLE_NotSupported, + "Unrecognized compression method: %s", pszCompression); + return false; + } + m_eCompression = *oResult; + if( !arrow::util::Codec::IsAvailable(m_eCompression) ) + { + CPLError(CE_Failure, CPLE_NotSupported, + "Compression method %s is known, but libarrow has not " + "been built with support for it", pszCompression); + return false; + } + + const char* pszRowGroupSize = CSLFetchNameValue(papszOptions, "BATCH_SIZE"); + if( pszRowGroupSize ) + { + auto nRowGroupSize = static_cast(atoll(pszRowGroupSize)); + if( nRowGroupSize > 0 ) + { + if( nRowGroupSize > INT_MAX ) + nRowGroupSize = INT_MAX; + m_nRowGroupSize = nRowGroupSize; + } + } + + m_bInitializationOK = true; + return true; +} + +/************************************************************************/ +/* CloseFileWriter() */ +/************************************************************************/ + +void OGRFeatherWriterLayer::CloseFileWriter() +{ + auto status = m_poFileWriter->Close(); + if( !status.ok() ) + { + CPLError(CE_Failure, CPLE_AppDefined, + "FileWriter::Close() failed with %s", + status.message().c_str()); + } +} + +/************************************************************************/ +/* CreateSchema() */ +/************************************************************************/ + +void OGRFeatherWriterLayer::CreateSchema() +{ + CreateSchemaCommon(); + + if( m_poFeatureDefn->GetGeomFieldCount() != 0 && + CPLTestBool(CPLGetConfigOption("OGR_ARROW_WRITE_GEO", "YES")) ) + { + CPLJSONObject oRoot; + oRoot.Add("schema_version", "0.1.0"); + oRoot.Add("primary_column", + m_poFeatureDefn->GetGeomFieldDefn(0)->GetNameRef()); + CPLJSONObject oColumns; + oRoot.Add("columns", oColumns); + for( int i = 0; i < m_poFeatureDefn->GetGeomFieldCount(); ++i ) + { + const auto poGeomFieldDefn = 
m_poFeatureDefn->GetGeomFieldDefn(i); + CPLJSONObject oColumn; + oColumns.Add(poGeomFieldDefn->GetNameRef(), oColumn); + oColumn.Add("encoding", + GetGeomEncodingAsString(m_aeGeomEncoding[i])); + + const auto poSRS = poGeomFieldDefn->GetSpatialRef(); + if( poSRS ) + { + const char* const apszOptions[] = { + "FORMAT=WKT2_2019", "MULTILINE=NO", nullptr }; + char* pszWKT = nullptr; + poSRS->exportToWkt(&pszWKT, apszOptions); + if( pszWKT ) + oColumn.Add("crs", pszWKT); + CPLFree(pszWKT); + } + +#if 0 + if( m_aoEnvelopes[i].IsInit() && + CPLTestBool(CPLGetConfigOption( + "OGR_ARROW_WRITE_BBOX", "YES")) ) + { + CPLJSONArray oBBOX; + oBBOX.Add(m_aoEnvelopes[i].MinX); + oBBOX.Add(m_aoEnvelopes[i].MinY); + oBBOX.Add(m_aoEnvelopes[i].MaxX); + oBBOX.Add(m_aoEnvelopes[i].MaxY); + oColumn.Add("bbox", oBBOX); + } +#endif + const auto eType = poGeomFieldDefn->GetType(); + if( CPLTestBool(CPLGetConfigOption( + "OGR_ARROW_WRITE_GDAL_GEOMETRY_TYPE", "YES")) && + eType == wkbFlatten(eType) ) + { + // Geometry type, place under a temporary "gdal:geometry_type" property + // pending acceptance of proposal at + // https://github.com/opengeospatial/geoparquet/issues/41 + const char* pszType = "mixed"; + if( wkbPoint == eType ) + pszType = "Point"; + else if( wkbLineString == eType ) + pszType = "LineString"; + else if( wkbPolygon == eType ) + pszType = "Polygon"; + else if( wkbMultiPoint == eType ) + pszType = "MultiPoint"; + else if( wkbMultiLineString == eType ) + pszType = "MultiLineString"; + else if( wkbMultiPolygon == eType ) + pszType = "MultiPolygon"; + else if( wkbGeometryCollection == eType ) + pszType = "GeometryCollection"; + oColumn.Add("gdal:geometry_type", pszType); + } + } + + auto kvMetadata = m_poSchema->metadata() ? 
m_poSchema->metadata()->Copy() : + std::make_shared(); + kvMetadata->Append("geo", oRoot.Format(CPLJSONObject::PrettyFormat::Plain)); + m_poSchema = m_poSchema->WithMetadata(kvMetadata); + CPLAssert(m_poSchema); + } +} + +/************************************************************************/ +/* CreateWriter() */ +/************************************************************************/ + +void OGRFeatherWriterLayer::CreateWriter() +{ + CPLAssert( m_poFileWriter == nullptr ); + + if( m_poSchema == nullptr ) + { + CreateSchema(); + } + else + { + FinalizeSchema(); + } + + auto options = arrow::ipc::IpcWriteOptions::Defaults(); + options.memory_pool = m_poMemoryPool; + + { + auto result = arrow::util::Codec::Create(m_eCompression); + if( !result.ok() ) + { + CPLError(CE_Failure, CPLE_AppDefined, + "Codec::Create() failed with %s", + result.status().message().c_str()); + } + else + { + options.codec.reset(result->release()); + } + } + + if( m_bStreamFormat ) + { + auto result = arrow::ipc::MakeStreamWriter(m_poOutputStream, + m_poSchema, + options); + if( !result.ok() ) + { + CPLError(CE_Failure, CPLE_AppDefined, + "arrow::ipc::MakeStreamWriter() failed with %s", + result.status().message().c_str()); + } + else + { + m_poFileWriter = *result; + } + } + else + { + m_poFooterKeyValueMetadata = std::make_shared(); + auto result = arrow::ipc::MakeFileWriter(m_poOutputStream, + m_poSchema, + options, + m_poFooterKeyValueMetadata); + if( !result.ok() ) + { + CPLError(CE_Failure, CPLE_AppDefined, + "arrow::ipc::MakeFileWriter() failed with %s", + result.status().message().c_str()); + } + else + { + m_poFileWriter = *result; + } + } +} + +/************************************************************************/ +/* DoSomethingBeforeFinalFlushGroup() */ +/************************************************************************/ + +void OGRFeatherWriterLayer::DoSomethingBeforeFinalFlushGroup() +{ + // gdal:geo extension for now. 
Embeds a bbox + if( m_poFooterKeyValueMetadata && + m_poFeatureDefn->GetGeomFieldCount() != 0 && + CPLTestBool(CPLGetConfigOption("OGR_ARROW_WRITE_GDAL_FOOTER", "YES")) ) + { + CPLJSONObject oRoot; + oRoot.Add("primary_column", + m_poFeatureDefn->GetGeomFieldDefn(0)->GetNameRef()); + CPLJSONObject oColumns; + oRoot.Add("columns", oColumns); + for( int i = 0; i < m_poFeatureDefn->GetGeomFieldCount(); ++i ) + { + const auto poGeomFieldDefn = m_poFeatureDefn->GetGeomFieldDefn(i); + CPLJSONObject oColumn; + oColumns.Add(poGeomFieldDefn->GetNameRef(), oColumn); + oColumn.Add("encoding", + GetGeomEncodingAsString(m_aeGeomEncoding[i])); + + const auto poSRS = poGeomFieldDefn->GetSpatialRef(); + if( poSRS ) + { + const char* const apszOptions[] = { + "FORMAT=WKT2_2019", "MULTILINE=NO", nullptr }; + char* pszWKT = nullptr; + poSRS->exportToWkt(&pszWKT, apszOptions); + if( pszWKT ) + oColumn.Add("crs", pszWKT); + CPLFree(pszWKT); + } + + if( m_aoEnvelopes[i].IsInit() ) + { + CPLJSONArray oBBOX; + oBBOX.Add(m_aoEnvelopes[i].MinX); + oBBOX.Add(m_aoEnvelopes[i].MinY); + oBBOX.Add(m_aoEnvelopes[i].MaxX); + oBBOX.Add(m_aoEnvelopes[i].MaxY); + oColumn.Add("bbox", oBBOX); + } + } + + m_poFooterKeyValueMetadata->Append( + GDAL_GEO_FOOTER_KEY, + oRoot.Format(CPLJSONObject::PrettyFormat::Plain)); + } +} + +/************************************************************************/ +/* FlushGroup() */ +/************************************************************************/ + +bool OGRFeatherWriterLayer::FlushGroup() +{ + std::vector> columns; + auto ret = WriteArrays([this, &columns](const std::shared_ptr&, + const std::shared_ptr& array) { + columns.emplace_back(array); + return true; + }); + + if( ret ) + { + auto poRecordBatch = arrow::RecordBatch::Make( + m_poSchema, + !columns.empty() ? 
columns[0]->length(): 0, + columns); + auto status = m_poFileWriter->WriteRecordBatch(*poRecordBatch); + if( !status.ok() ) + { + CPLError(CE_Failure, CPLE_AppDefined, + "WriteRecordBatch() failed with %s", status.message().c_str()); + ret = false; + } + } + + m_apoBuilders.clear(); + return ret; +} diff --git a/ogr/ogrsf_frmts/arrow_common/ogr_arrow.h b/ogr/ogrsf_frmts/arrow_common/ogr_arrow.h new file mode 100644 index 000000000000..ec4e6fcfbebd --- /dev/null +++ b/ogr/ogrsf_frmts/arrow_common/ogr_arrow.h @@ -0,0 +1,250 @@ +/****************************************************************************** + * + * Project: Arrow generic code + * Purpose: Arrow generic code + * Author: Even Rouault, + * + ****************************************************************************** + * Copyright (c) 2022, Planet Labs + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ ****************************************************************************/ + +#ifndef OGR_ARROW_H +#define OGR_ARROW_H + +#include "gdal_pam.h" +#include "ogrsf_frmts.h" + +#include + +#include "ogr_include_arrow.h" + +enum class OGRArrowGeomEncoding +{ + WKB, + WKT, + GEOARROW_GENERIC, // only used by OGRArrowWriterLayer::m_eGeomEncoding + GEOARROW_POINT, + GEOARROW_LINESTRING, + GEOARROW_POLYGON, + GEOARROW_MULTIPOINT, + GEOARROW_MULTILINESTRING, + GEOARROW_MULTIPOLYGON, +}; + +/************************************************************************/ +/* OGRArrowLayer */ +/************************************************************************/ + +class OGRArrowDataset; + +class OGRArrowLayer CPL_NON_FINAL: public OGRLayer, + public OGRGetNextFeatureThroughRaw +{ + OGRArrowLayer(const OGRArrowLayer&) = delete; + OGRArrowLayer& operator= (const OGRArrowLayer&) = delete; + +protected: + arrow::MemoryPool* m_poMemoryPool = nullptr; + OGRFeatureDefn* m_poFeatureDefn = nullptr; + std::shared_ptr m_poSchema{}; + std::string m_osFIDColumn{}; + int m_iFIDArrowColumn = -1; + std::vector> m_anMapFieldIndexToArrowColumn{}; + std::vector m_anMapGeomFieldIndexToArrowColumn{}; + std::vector m_aeGeomEncoding{}; + + bool m_bIgnoredFields = false; + std::vector m_anMapFieldIndexToArrayIndex{}; // only valid when m_bIgnoredFields is set + std::vector m_anMapGeomFieldIndexToArrayIndex{}; // only valid when m_bIgnoredFields is set + int m_nRequestedFIDColumn = -1; // only valid when m_bIgnoredFields is set + + bool m_bEOF = false; + int64_t m_nFeatureIdx = 0; + int64_t m_nIdxInBatch = 0; + std::map m_oMapGeometryColumns{}; + int m_iRecordBatch = -1; + std::shared_ptr m_poBatch{}; + mutable std::shared_ptr m_poReadFeatureTmpArray{}; + + std::map> LoadGDALMetadata(const arrow::KeyValueMetadata* kv_metadata); + + OGRArrowLayer(OGRArrowDataset* poDS, const char* pszLayerName); + + virtual std::string GetDriverUCName() const = 0; + static bool IsIntegerArrowType(arrow::Type::type 
typeId); + static bool IsValidGeometryEncoding(const std::shared_ptr& field, + const std::string& osEncoding, + OGRwkbGeometryType& eGeomTypeOut, + OGRArrowGeomEncoding& eGeomEncodingOut); + static OGRwkbGeometryType GetGeometryTypeFromString(const std::string& osType); + bool MapArrowTypeToOGR(const std::shared_ptr& type, + const std::shared_ptr& field, + OGRFieldDefn& oField, + OGRFieldType& eType, + OGRFieldSubType& eSubType, + const std::vector& path, + const std::map>& oMapFieldNameToGDALSchemaFieldDefn); + std::unique_ptr BuildDomainFromBatch( + const std::string& osDomainName, + const std::shared_ptr& poBatch, + int iCol) const; + OGRwkbGeometryType ComputeGeometryColumnTypeProcessBatch( + const std::shared_ptr& poBatch, + int iGeomCol, int iBatchCol, + OGRwkbGeometryType eGeomType) const; + static bool ReadWKBBoundingBox(const uint8_t* data, size_t size, OGREnvelope& sEnvelope); + OGRFeature* ReadFeature(int64_t nIdxInBatch, + const std::vector>& poColumnArrays) const; + OGRGeometry* ReadGeometry(int iGeomField, + const arrow::Array* array, + int64_t nIdxInBatch) const; + virtual bool ReadNextBatch() = 0; + OGRFeature* GetNextRawFeature(); + + virtual bool CanRunNonForcedGetExtent() { return true; } + +public: + virtual ~OGRArrowLayer() override; + + OGRFeatureDefn* GetLayerDefn() override { return m_poFeatureDefn; } + void ResetReading() override; + const char* GetFIDColumn() override { return m_osFIDColumn.c_str(); } + DEFINE_GET_NEXT_FEATURE_THROUGH_RAW(OGRArrowLayer) + OGRErr GetExtent(OGREnvelope *psExtent, int bForce = TRUE) override; + OGRErr GetExtent(int iGeomField, OGREnvelope *psExtent, + int bForce = TRUE) override; + + virtual std::unique_ptr BuildDomain(const std::string& osDomainName, + int iFieldIndex) const = 0; +}; + +/************************************************************************/ +/* OGRArrowDataset */ +/************************************************************************/ + +class OGRArrowDataset CPL_NON_FINAL: public 
GDALPamDataset +{ + std::unique_ptr m_poMemoryPool{}; + std::unique_ptr m_poLayer{}; + std::vector m_aosDomainNames{}; + std::map m_oMapDomainNameToCol{}; + +public: + explicit OGRArrowDataset(std::unique_ptr&& poMemoryPool); + + inline arrow::MemoryPool* GetMemoryPool() const { return m_poMemoryPool.get(); } + void SetLayer(std::unique_ptr&& poLayer); + + void RegisterDomainName(const std::string& osDomainName, int iFieldIndex); + + std::vector GetFieldDomainNames(CSLConstList /*papszOptions*/ = nullptr) const override; + const OGRFieldDomain* GetFieldDomain(const std::string& name) const override; + + int GetLayerCount() override ; + OGRLayer* GetLayer(int idx) override; +}; + +/************************************************************************/ +/* OGRArrowWriterLayer */ +/************************************************************************/ + +class OGRArrowWriterLayer CPL_NON_FINAL: public OGRLayer + +{ +protected: + OGRArrowWriterLayer(const OGRArrowWriterLayer&) = delete; + OGRArrowWriterLayer& operator= (const OGRArrowWriterLayer&) = delete; + + arrow::MemoryPool* m_poMemoryPool = nullptr; + bool m_bInitializationOK = false; + std::shared_ptr m_poOutputStream{}; + std::shared_ptr m_poSchema{}; + OGRFeatureDefn* m_poFeatureDefn = nullptr; + std::map> m_oMapFieldDomains{}; + std::map> m_oMapFieldDomainToStringArray{}; + + bool m_bWriteFieldArrowExtensionName = false; + OGRArrowGeomEncoding m_eGeomEncoding = OGRArrowGeomEncoding::WKB; + std::vector m_aeGeomEncoding{}; + + std::string m_osFIDColumn{}; + int64_t m_nFeatureCount = 0; + + int64_t m_nRowGroupSize = 64 * 1024; + arrow::Compression::type m_eCompression = arrow::Compression::UNCOMPRESSED; + + std::vector> m_apoBuilders{}; + + std::vector m_abyBuffer{}; + + std::vector m_anTZFlag{}; // size: GetFieldCount() + std::vector m_aoEnvelopes{}; // size: GetGeomFieldCount() + + static OGRArrowGeomEncoding GetPreciseArrowGeomEncoding( + OGRwkbGeometryType eGType); + static const char* 
GetGeomEncodingAsString( + OGRArrowGeomEncoding eGeomEncoding); + + bool IsSupportedGeometryType(OGRwkbGeometryType eGType) const; + + virtual std::string GetDriverUCName() const = 0; + + virtual bool IsFileWriterCreated() const = 0; + virtual void CreateWriter() = 0; + virtual void CloseFileWriter() = 0; + + void CreateSchemaCommon(); + void FinalizeSchema(); + virtual void CreateSchema() = 0; + virtual void DoSomethingBeforeFinalFlushGroup() {} + + void CreateArrayBuilders(); + virtual bool FlushGroup() = 0; + void FinalizeWriting(); + bool WriteArrays(std::function&, + const std::shared_ptr&)> postProcessArray); + +public: + OGRArrowWriterLayer( arrow::MemoryPool* poMemoryPool, + const std::shared_ptr& poOutputStream, + const char *pszLayerName ); + + ~OGRArrowWriterLayer() override; + + bool AddFieldDomain(std::unique_ptr&& domain, + std::string& failureReason); + std::vector GetFieldDomainNames() const; + const OGRFieldDomain* GetFieldDomain(const std::string& name) const; + + OGRFeatureDefn* GetLayerDefn() override { return m_poFeatureDefn; } + void ResetReading() override {} + OGRFeature *GetNextFeature() override { return nullptr; } + int TestCapability(const char* pszCap) override; + OGRErr CreateField( OGRFieldDefn *poField, int bApproxOK = TRUE ) override; + OGRErr CreateGeomField( OGRGeomFieldDefn *poField, int bApproxOK = TRUE ) override; + GIntBig GetFeatureCount(int bForce) override; + +protected: + OGRErr ICreateFeature( OGRFeature* poFeature ) override; +}; + + +#endif // OGR_ARROW_H diff --git a/ogr/ogrsf_frmts/arrow_common/ogr_include_arrow.h b/ogr/ogrsf_frmts/arrow_common/ogr_include_arrow.h new file mode 100644 index 000000000000..2c1ecad4d2da --- /dev/null +++ b/ogr/ogrsf_frmts/arrow_common/ogr_include_arrow.h @@ -0,0 +1,57 @@ +/****************************************************************************** + * + * Project: Arrow generic code + * Purpose: Arrow generic code + * Author: Even Rouault, + * + 
****************************************************************************** + * Copyright (c) 2022, Planet Labs + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ ****************************************************************************/ + +#ifndef OGR_INCLUDE_ARROW_H +#define OGR_INCLUDE_ARROW_H + +#if defined(__GNUC__) && !defined(_MSC_VER) +#pragma GCC system_header +#endif + +#ifdef _MSC_VER +#pragma warning( push ) +#pragma warning( disable : 4244 ) /* warning 4244: 'initializing': conversion from 'int32_t' to 'int16_t', possible loss of data */ +#pragma warning( disable : 4458 ) /* warning 4458: declaration of 'type_id' hides class member */ +#endif + +#include "arrow/array.h" +#include "arrow/builder.h" +#include "arrow/array/array_dict.h" +#include "arrow/ipc/reader.h" +#include "arrow/ipc/writer.h" +#include "arrow/io/file.h" +#include "arrow/memory_pool.h" +#include "arrow/record_batch.h" +#include "arrow/util/decimal.h" +#include "arrow/util/key_value_metadata.h" + +#ifdef _MSC_VER +#pragma warning( pop ) +#endif + +#endif diff --git a/ogr/ogrsf_frmts/arrow_common/ograrrowdataset.hpp b/ogr/ogrsf_frmts/arrow_common/ograrrowdataset.hpp new file mode 100644 index 000000000000..b19348ee5b1c --- /dev/null +++ b/ogr/ogrsf_frmts/arrow_common/ograrrowdataset.hpp @@ -0,0 +1,103 @@ +/****************************************************************************** + * + * Project: Arrow generic code + * Purpose: Arrow generic code + * Author: Even Rouault, + * + ****************************************************************************** + * Copyright (c) 2022, Planet Labs + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all 
copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + ****************************************************************************/ + +#include "ogr_arrow.h" + +/************************************************************************/ +/* OGRArrowDataset() */ +/************************************************************************/ + +inline OGRArrowDataset::OGRArrowDataset(std::unique_ptr&& poMemoryPool): + m_poMemoryPool(std::move(poMemoryPool)) +{ +} + +/************************************************************************/ +/* SetLayer() */ +/************************************************************************/ + +inline void OGRArrowDataset::SetLayer(std::unique_ptr&& poLayer) +{ + m_poLayer = std::move(poLayer); +} + +/************************************************************************/ +/* RegisterDomainName() */ +/************************************************************************/ + +inline void OGRArrowDataset::RegisterDomainName(const std::string& osDomainName, int iFieldIndex) +{ + m_aosDomainNames.push_back(osDomainName); + m_oMapDomainNameToCol[osDomainName] = iFieldIndex; +} + +/************************************************************************/ +/* GetFieldDomainNames() */ +/************************************************************************/ + +inline std::vector OGRArrowDataset::GetFieldDomainNames(CSLConstList) const +{ + return m_aosDomainNames; +} + +/************************************************************************/ +/* 
GetFieldDomain() */ +/************************************************************************/ + +inline const OGRFieldDomain* OGRArrowDataset::GetFieldDomain(const std::string& name) const +{ + { + const auto iter = m_oMapFieldDomains.find(name); + if( iter != m_oMapFieldDomains.end() ) + return iter->second.get(); + } + const auto iter = m_oMapDomainNameToCol.find(name); + if( iter == m_oMapDomainNameToCol.end() ) + return nullptr; + return m_oMapFieldDomains.insert( + std::pair>( + name, m_poLayer->BuildDomain(name, iter->second))).first->second.get(); +} + +/************************************************************************/ +/* GetLayerCount() */ +/************************************************************************/ + +inline int OGRArrowDataset::GetLayerCount() +{ + return m_poLayer ? 1 : 0; +} + +/************************************************************************/ +/* GetLayer() */ +/************************************************************************/ + +inline OGRLayer* OGRArrowDataset::GetLayer(int idx) +{ + return idx == 0 ? 
m_poLayer.get() : nullptr; +} diff --git a/ogr/ogrsf_frmts/arrow_common/ograrrowlayer.hpp b/ogr/ogrsf_frmts/arrow_common/ograrrowlayer.hpp new file mode 100644 index 000000000000..e97853d81c44 --- /dev/null +++ b/ogr/ogrsf_frmts/arrow_common/ograrrowlayer.hpp @@ -0,0 +1,2388 @@ +/****************************************************************************** + * + * Project: Arrow generic code + * Purpose: Arrow generic code + * Author: Even Rouault, + * + ****************************************************************************** + * Copyright (c) 2022, Planet Labs + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ ****************************************************************************/ + +#include "ogr_arrow.h" + +#include "cpl_json.h" +#include "cpl_time.h" +#include "ogr_p.h" + +#include + +/************************************************************************/ +/* OGRArrowLayer() */ +/************************************************************************/ + +inline +OGRArrowLayer::OGRArrowLayer(OGRArrowDataset* poDS, const char* pszLayerName): + m_poMemoryPool(poDS->GetMemoryPool()) +{ + m_poFeatureDefn = new OGRFeatureDefn(pszLayerName); + m_poFeatureDefn->SetGeomType(wkbNone); + m_poFeatureDefn->Reference(); + SetDescription(pszLayerName); +} + +/************************************************************************/ +/* ~OGRArrowLayer() */ +/************************************************************************/ + +inline OGRArrowLayer::~OGRArrowLayer() +{ + CPLDebug("ARROW", "Memory pool: bytes_allocated = %" PRId64, + m_poMemoryPool->bytes_allocated()); + CPLDebug("ARROW", "Memory pool: max_memory = %" PRId64, + m_poMemoryPool->max_memory()); + m_poFeatureDefn->Release(); +} + +/************************************************************************/ +/* LoadGDALMetadata() */ +/************************************************************************/ + +inline +std::map> OGRArrowLayer::LoadGDALMetadata(const arrow::KeyValueMetadata* kv_metadata) +{ + std::map> oMapFieldNameToGDALSchemaFieldDefn; + if( kv_metadata && kv_metadata->Contains("gdal:schema") && + CPLTestBool(CPLGetConfigOption(("OGR_" + GetDriverUCName() + "_READ_GDAL_SCHEMA").c_str(), "YES")) ) + { + auto gdalSchema = kv_metadata->Get("gdal:schema"); + if( gdalSchema.ok() ) + { + CPLDebug(GetDriverUCName().c_str(), "gdal:schema = %s", gdalSchema->c_str()); + CPLJSONDocument oDoc; + if( oDoc.LoadMemory(*gdalSchema) ) + { + auto oRoot = oDoc.GetRoot(); + + m_osFIDColumn = oRoot.GetString("fid"); + + auto oColumns = oRoot.GetObj("columns"); + if( oColumns.IsValid() ) + { + for( const auto 
oColumn: oColumns.GetChildren() ) + { + const auto osName = oColumn.GetName(); + const auto osType = oColumn.GetString("type"); + const auto osSubType = oColumn.GetString("subtype"); + auto poFieldDefn = cpl::make_unique(osName.c_str(), OFTString); + for( int iType = 0; iType <= static_cast(OFTMaxType); iType++ ) + { + if( EQUAL(osType.c_str(), OGRFieldDefn::GetFieldTypeName( + static_cast(iType))) ) + { + poFieldDefn->SetType(static_cast(iType)); + break; + } + } + if( !osSubType.empty() ) + { + for( int iSubType = 0; iSubType <= static_cast(OFSTMaxSubType); iSubType++ ) + { + if( EQUAL(osSubType.c_str(), + OGRFieldDefn::GetFieldSubTypeName( + static_cast(iSubType))) ) + { + poFieldDefn->SetSubType(static_cast(iSubType)); + break; + } + } + } + poFieldDefn->SetWidth(oColumn.GetInteger("width")); + poFieldDefn->SetPrecision(oColumn.GetInteger("precision")); + oMapFieldNameToGDALSchemaFieldDefn[osName] = std::move(poFieldDefn); + + } + } + } + } + } + return oMapFieldNameToGDALSchemaFieldDefn; +} + + +/************************************************************************/ +/* IsIntegerArrowType() */ +/************************************************************************/ + +inline bool OGRArrowLayer::IsIntegerArrowType(arrow::Type::type typeId) +{ + return typeId == arrow::Type::INT8 || + typeId == arrow::Type::UINT8 || + typeId == arrow::Type::INT16 || + typeId == arrow::Type::UINT16 || + typeId == arrow::Type::INT32 || + typeId == arrow::Type::UINT32 || + typeId == arrow::Type::INT64 || + typeId == arrow::Type::UINT64; +} + +/************************************************************************/ +/* MapArrowTypeToOGR() */ +/************************************************************************/ + +inline bool OGRArrowLayer::MapArrowTypeToOGR(const std::shared_ptr& type, + const std::shared_ptr& field, + OGRFieldDefn& oField, + OGRFieldType& eType, + OGRFieldSubType& eSubType, + const std::vector& path, + const std::map>& 
oMapFieldNameToGDALSchemaFieldDefn) +{ + bool bTypeOK = true; + switch( type->id() ) + { + case arrow::Type::NA: + break; + + case arrow::Type::BOOL: + eType = OFTInteger; + eSubType = OFSTBoolean; + break; + case arrow::Type::UINT8: + case arrow::Type::INT8: + case arrow::Type::UINT16: + eType = OFTInteger; + break; + case arrow::Type::INT16: + eType = OFTInteger; + eSubType = OFSTInt16; + break; + case arrow::Type::UINT32: + eType = OFTInteger64; + break; + case arrow::Type::INT32: + eType = OFTInteger; + break; + case arrow::Type::UINT64: + eType = OFTReal; // potential loss + break; + case arrow::Type::INT64: + eType = OFTInteger64; + break; + case arrow::Type::HALF_FLOAT: // should use OFSTFloat16 if we had it + case arrow::Type::FLOAT: + eType = OFTReal; + eSubType = OFSTFloat32; + break; + case arrow::Type::DOUBLE: + eType = OFTReal; + break; + case arrow::Type::STRING: + case arrow::Type::LARGE_STRING: + eType = OFTString; + break; + case arrow::Type::BINARY: + case arrow::Type::LARGE_BINARY: + eType = OFTBinary; + break; + case arrow::Type::FIXED_SIZE_BINARY: + eType = OFTBinary; + oField.SetWidth( + std::static_pointer_cast(type)->byte_width()); + break; + + case arrow::Type::DATE32: + case arrow::Type::DATE64: + eType = OFTDate; + break; + + case arrow::Type::TIMESTAMP: + eType = OFTDateTime; + break; + + case arrow::Type::TIME32: + eType = OFTTime; + break; + + case arrow::Type::TIME64: + eType = OFTInteger64; // our OFTTime doesn't have micro or nanosecond accuracy + break; + + case arrow::Type::DECIMAL128: + case arrow::Type::DECIMAL256: + { + const auto decimalType = std::static_pointer_cast(type); + eType = OFTReal; + oField.SetWidth(decimalType->precision()); + oField.SetPrecision(decimalType->scale()); + break; + } + + case arrow::Type::LIST: + case arrow::Type::FIXED_SIZE_LIST: + { + auto listType = std::static_pointer_cast(type); + switch( listType->value_type()->id() ) + { + case arrow::Type::BOOL: + eType = OFTIntegerList; + eSubType = 
OFSTBoolean; + break; + case arrow::Type::UINT8: + case arrow::Type::INT8: + case arrow::Type::UINT16: + case arrow::Type::INT16: + case arrow::Type::INT32: + eType = OFTIntegerList; + break; + case arrow::Type::UINT32: + eType = OFTInteger64List; + break; + case arrow::Type::UINT64: + eType = OFTRealList; // potential loss + break; + case arrow::Type::INT64: + eType = OFTInteger64List; + break; + case arrow::Type::HALF_FLOAT: // should use OFSTFloat16 if we had it + case arrow::Type::FLOAT: + eType = OFTRealList; + eSubType = OFSTFloat32; + break; + case arrow::Type::DOUBLE: + eType = OFTRealList; + break; + case arrow::Type::STRING: + eType = OFTStringList; + break; + default: + bTypeOK = false; + CPLError(CE_Warning, CPLE_AppDefined, + "Field %s of unhandled type %s ignored", + field->name().c_str(), + type->ToString().c_str()); + break; + } + break; + } + + case arrow::Type::MAP: + { + auto mapType = std::static_pointer_cast(type); + const auto itemTypeId = mapType->item_type()->id(); + if( mapType->key_type()->id() == arrow::Type::STRING && + (itemTypeId == arrow::Type::BOOL || + IsIntegerArrowType(itemTypeId) || + itemTypeId == arrow::Type::FLOAT || + itemTypeId == arrow::Type::DOUBLE || + itemTypeId == arrow::Type::STRING) ) + { + eType = OFTString; + eSubType = OFSTJSON; + } + else + { + bTypeOK = false; + CPLError(CE_Warning, CPLE_AppDefined, + "Field %s of unhandled type %s ignored", + field->name().c_str(), + type->ToString().c_str()); + } + break; + } + + case arrow::Type::STRUCT: + // should be handled by specialized code + CPLAssert(false); + break; + + // unhandled types + + case arrow::Type::INTERVAL_MONTHS: + case arrow::Type::INTERVAL_DAY_TIME: + case arrow::Type::SPARSE_UNION: + case arrow::Type::DENSE_UNION: + case arrow::Type::DICTIONARY: + case arrow::Type::EXTENSION: + case arrow::Type::DURATION: + case arrow::Type::LARGE_LIST: + case arrow::Type::INTERVAL_MONTH_DAY_NANO: + case arrow::Type::MAX_ID: + { + bTypeOK = false; + 
CPLError(CE_Warning, CPLE_AppDefined, + "Field %s of unhandled type %s ignored", + field->name().c_str(), + type->ToString().c_str()); + break; + } + } + + if( bTypeOK ) + { + const auto oIter = oMapFieldNameToGDALSchemaFieldDefn.find(field->name()); + oField.SetType(eType); + if( oIter != oMapFieldNameToGDALSchemaFieldDefn.end() ) + { + const auto& poGDALFieldDefn = oIter->second; + if( poGDALFieldDefn->GetType() == eType ) + { + if( eSubType == OFSTNone ) + { + eSubType = poGDALFieldDefn->GetSubType(); + } + else if( eSubType != poGDALFieldDefn->GetSubType() ) + { + CPLDebug(GetDriverUCName().c_str(), + "Field subtype inferred from Parquet/Arrow schema is %s, " + "whereas the one in gdal:schema is %s. " + "Using the former one.", + OGR_GetFieldSubTypeName(eSubType), + OGR_GetFieldSubTypeName(poGDALFieldDefn->GetSubType())); + } + } + else + { + CPLDebug(GetDriverUCName().c_str(), + "Field type inferred from Parquet/Arrow schema is %s, " + "whereas the one in gdal:schema is %s. " + "Using the former one.", + OGR_GetFieldTypeName(eType), + OGR_GetFieldTypeName(poGDALFieldDefn->GetType())); + } + if( poGDALFieldDefn->GetWidth() > 0 ) + oField.SetWidth(poGDALFieldDefn->GetWidth()); + if( poGDALFieldDefn->GetPrecision() > 0 ) + oField.SetPrecision(poGDALFieldDefn->GetPrecision()); + } + oField.SetSubType(eSubType); + oField.SetNullable(field->nullable()); + m_poFeatureDefn->AddFieldDefn(&oField); + m_anMapFieldIndexToArrowColumn.push_back(path); + } + + return bTypeOK; +} + +/************************************************************************/ +/* BuildDomainFromBatch() */ +/************************************************************************/ + +inline std::unique_ptr OGRArrowLayer::BuildDomainFromBatch( + const std::string& osDomainName, + const std::shared_ptr& poBatch, + int iCol) const +{ + const auto array = poBatch->column(iCol); + auto castArray = std::static_pointer_cast(array); + auto dict = castArray->dictionary(); + CPLAssert(dict->type_id() == 
arrow::Type::STRING ); + OGRFieldType eType = OFTInteger; + const auto indexTypeId = castArray->dict_type()->index_type()->id(); + if( indexTypeId == arrow::Type::UINT32 || + indexTypeId == arrow::Type::UINT64 || + indexTypeId == arrow::Type::INT64 ) + eType = OFTInteger64; + auto values = std::static_pointer_cast(dict); + std::vector asValues; + asValues.reserve(values->length()); + for( int i = 0; i < values->length(); ++i ) + { + if( !values->IsNull(i) ) + { + OGRCodedValue val; + val.pszCode = CPLStrdup(CPLSPrintf("%d", i)); + val.pszValue = CPLStrdup(values->GetString(i).c_str()); + asValues.emplace_back(val); + } + } + return cpl::make_unique( + osDomainName, std::string(), + eType, OFSTNone, std::move(asValues)); +} + +/************************************************************************/ +/* ComputeGeometryColumnTypeProcessBatch() */ +/************************************************************************/ + +inline +OGRwkbGeometryType OGRArrowLayer::ComputeGeometryColumnTypeProcessBatch( + const std::shared_ptr& poBatch, + int iGeomCol, int iBatchCol, + OGRwkbGeometryType eGeomType) const +{ + const auto array = poBatch->column(iBatchCol); + const auto castBinaryArray = + ( m_aeGeomEncoding[iGeomCol] == OGRArrowGeomEncoding::WKB ) ? + std::static_pointer_cast(array) : nullptr; + const auto castStringArray = + ( m_aeGeomEncoding[iGeomCol] == OGRArrowGeomEncoding::WKT ) ? 
+ std::static_pointer_cast(array) : nullptr; + for( int64_t i = 0; i < poBatch->num_rows(); i++ ) + { + if( !array->IsNull(i) ) + { + OGRwkbGeometryType eThisGeomType = wkbNone; + if( m_aeGeomEncoding[iGeomCol] == OGRArrowGeomEncoding::WKB && castBinaryArray ) + { + arrow::BinaryArray::offset_type out_length = 0; + const uint8_t* data = castBinaryArray->GetValue(i, &out_length); + if( out_length >= 5 ) + { + OGRReadWKBGeometryType(data, wkbVariantIso, &eThisGeomType); + } + } + else if ( m_aeGeomEncoding[iGeomCol] == OGRArrowGeomEncoding::WKT && + castStringArray ) + { + const auto osWKT = castStringArray->GetString(i); + if( !osWKT.empty() ) + { + OGRReadWKTGeometryType(osWKT.c_str(), &eThisGeomType); + } + } + + if( eThisGeomType != wkbNone ) + { + if( eGeomType == wkbNone ) + eGeomType = eThisGeomType; + else if( wkbFlatten(eThisGeomType) == wkbFlatten(eGeomType) ) + ; + else if( wkbFlatten(eThisGeomType) == wkbMultiLineString && + wkbFlatten(eGeomType) == wkbLineString ) + { + eGeomType = OGR_GT_SetModifier(wkbMultiLineString, + OGR_GT_HasZ(eThisGeomType) || OGR_GT_HasZ(eGeomType), + OGR_GT_HasM(eThisGeomType) || OGR_GT_HasM(eGeomType)); + } + else if( wkbFlatten(eThisGeomType) == wkbLineString && + wkbFlatten(eGeomType) == wkbMultiLineString ) + ; + else if( wkbFlatten(eThisGeomType) == wkbMultiPolygon && + wkbFlatten(eGeomType) == wkbPolygon ) + { + eGeomType = OGR_GT_SetModifier(wkbMultiPolygon, + OGR_GT_HasZ(eThisGeomType) || OGR_GT_HasZ(eGeomType), + OGR_GT_HasM(eThisGeomType) || OGR_GT_HasM(eGeomType)); + } + else if( wkbFlatten(eThisGeomType) == wkbPolygon && + wkbFlatten(eGeomType) == wkbMultiPolygon ) + ; + else + return wkbUnknown; + + eGeomType = OGR_GT_SetModifier(eGeomType, + OGR_GT_HasZ(eThisGeomType) || OGR_GT_HasZ(eGeomType), + OGR_GT_HasM(eThisGeomType) || OGR_GT_HasM(eGeomType)); + } + } + } + return eGeomType; +} + +/************************************************************************/ +/* IsPointType() */ 
+/************************************************************************/ + +static bool IsPointType(const std::shared_ptr& type, + bool& bHasZOut, + bool& bHasMOut) +{ + if( type->id() != arrow::Type::FIXED_SIZE_LIST ) + return false; + auto poListType = std::static_pointer_cast(type); + const int nOutDimensionality = poListType->list_size(); + const auto osValueFieldName = poListType->value_field()->name(); + if( nOutDimensionality == 2 ) + { + bHasZOut = false; + bHasMOut = false; + } + else if( nOutDimensionality == 3 ) + { + if( osValueFieldName == "xym" ) + { + bHasZOut = false; + bHasMOut = true; + } + else if( osValueFieldName == "xyz" ) + { + bHasMOut = false; + bHasZOut = true; + } + } + else if( nOutDimensionality == 4 ) + { + bHasMOut = true; + bHasZOut = true; + } + else + { + return false; + } + return poListType->value_type()->id() == arrow::Type::DOUBLE; +} + +/************************************************************************/ +/* IsListOfPointType() */ +/************************************************************************/ + +static bool IsListOfPointType(const std::shared_ptr& type, + int nDepth, + bool& bHasZOut, + bool& bHasMOut) +{ + if( type->id() != arrow::Type::LIST ) + return false; + auto poListType = std::static_pointer_cast(type); + return nDepth == 1 ? 
+ IsPointType(poListType->value_type(), bHasZOut, bHasMOut) : + IsListOfPointType(poListType->value_type(), nDepth - 1, bHasZOut, bHasMOut); +} + +/************************************************************************/ +/* IsValidGeometryEncoding() */ +/************************************************************************/ + +inline bool OGRArrowLayer::IsValidGeometryEncoding(const std::shared_ptr& field, + const std::string& osEncoding, + OGRwkbGeometryType& eGeomTypeOut, + OGRArrowGeomEncoding& eOGRArrowGeomEncodingOut) +{ + const auto& fieldName = field->name(); + const auto& fieldType = field->type(); + const auto fieldTypeId = fieldType->id(); + + eGeomTypeOut = wkbUnknown; + + if( osEncoding == "WKT" ) + { + if( fieldTypeId != arrow::Type::STRING ) + { + CPLError(CE_Warning, CPLE_AppDefined, + "Geometry column %s has a non String type: %s. " + "Handling it as a regular field", + fieldName.c_str(), + fieldType->name().c_str()); + return false; + } + eOGRArrowGeomEncodingOut = OGRArrowGeomEncoding::WKT; + return true; + } + + if( osEncoding == "WKB" ) + { + if( fieldTypeId != arrow::Type::BINARY ) + { + CPLError(CE_Warning, CPLE_AppDefined, + "Geometry column %s has a non Binary type: %s. " + "Handling it as a regular field", + fieldName.c_str(), + fieldType->name().c_str()); + return false; + } + eOGRArrowGeomEncodingOut = OGRArrowGeomEncoding::WKB; + return true; + } + + bool bHasZ = false; + bool bHasM = false; + if( osEncoding == "geoarrow.point" ) + { + if( !IsPointType(fieldType, bHasZ, bHasM) ) + { + CPLError(CE_Warning, CPLE_AppDefined, + "Geometry column %s has a type != fixed_size_list[2]>: %s. 
" + "Handling it as a regular field", + fieldName.c_str(), + fieldType->name().c_str()); + return false; + } + eGeomTypeOut = OGR_GT_SetModifier(wkbPoint, static_cast(bHasZ), static_cast(bHasM)); + eOGRArrowGeomEncodingOut = OGRArrowGeomEncoding::GEOARROW_POINT; + return true; + } + + if( osEncoding == "geoarrow.linestring" ) + { + if( !IsListOfPointType(fieldType, 1, bHasZ, bHasM) ) + { + CPLError(CE_Warning, CPLE_AppDefined, + "Geometry column %s has a type != fixed_size_list[2]>: %s. " + "Handling it as a regular field", + fieldName.c_str(), + fieldType->name().c_str()); + return false; + } + eGeomTypeOut = OGR_GT_SetModifier(wkbLineString, static_cast(bHasZ), static_cast(bHasM)); + eOGRArrowGeomEncodingOut = OGRArrowGeomEncoding::GEOARROW_LINESTRING; + return true; + } + + if( osEncoding == "geoarrow.polygon" ) + { + if( !IsListOfPointType(fieldType, 2, bHasZ, bHasM) ) + { + CPLError(CE_Warning, CPLE_AppDefined, + "Geometry column %s has a type != list[2]>>: %s. " + "Handling it as a regular field", + fieldName.c_str(), + fieldType->name().c_str()); + return false; + } + eGeomTypeOut = OGR_GT_SetModifier(wkbPolygon, static_cast(bHasZ), static_cast(bHasM)); + eOGRArrowGeomEncodingOut = OGRArrowGeomEncoding::GEOARROW_POLYGON; + return true; + } + + if( osEncoding == "geoarrow.multipoint" ) + { + if( !IsListOfPointType(fieldType, 1, bHasZ, bHasM) ) + { + CPLError(CE_Warning, CPLE_AppDefined, + "Geometry column %s has a type != fixed_size_list[2]>: %s. " + "Handling it as a regular field", + fieldName.c_str(), + fieldType->name().c_str()); + return false; + } + eGeomTypeOut = OGR_GT_SetModifier(wkbMultiPoint, static_cast(bHasZ), static_cast(bHasM)); + eOGRArrowGeomEncodingOut = OGRArrowGeomEncoding::GEOARROW_MULTIPOINT; + return true; + } + + if( osEncoding == "geoarrow.multilinestring" ) + { + if( !IsListOfPointType(fieldType, 2, bHasZ, bHasM) ) + { + CPLError(CE_Warning, CPLE_AppDefined, + "Geometry column %s has a type != list[2]>>: %s. 
" + "Handling it as a regular field", + fieldName.c_str(), + fieldType->name().c_str()); + return false; + } + eGeomTypeOut = OGR_GT_SetModifier(wkbMultiLineString, static_cast(bHasZ), static_cast(bHasM)); + eOGRArrowGeomEncodingOut = OGRArrowGeomEncoding::GEOARROW_MULTILINESTRING; + return true; + } + + if( osEncoding == "geoarrow.multipolygon" ) + { + if( !IsListOfPointType(fieldType, 3, bHasZ, bHasM) ) + { + CPLError(CE_Warning, CPLE_AppDefined, + "Geometry column %s has a type != list[2]>>>: %s. " + "Handling it as a regular field", + fieldName.c_str(), + fieldType->name().c_str()); + return false; + } + eGeomTypeOut = OGR_GT_SetModifier(wkbMultiPolygon, static_cast(bHasZ), static_cast(bHasM)); + eOGRArrowGeomEncodingOut = OGRArrowGeomEncoding::GEOARROW_MULTIPOLYGON; + return true; + } + + CPLError(CE_Warning, CPLE_AppDefined, + "Geometry column %s uses a unhandled encoding: %s. " + "Handling it as a regular field", + fieldName.c_str(), + osEncoding.c_str()); + return false; +} + +/************************************************************************/ +/* GetGeometryTypeFromString() */ +/************************************************************************/ + +inline +OGRwkbGeometryType OGRArrowLayer::GetGeometryTypeFromString(const std::string& osType) +{ + OGRwkbGeometryType eGeomType = wkbUnknown; + OGRReadWKTGeometryType(osType.c_str(), &eGeomType); + if( eGeomType == wkbUnknown && !osType.empty() && osType != "mixed" ) + { + CPLDebug("ARROW", "Unknown geometry type: %s", + osType.c_str()); + } + return eGeomType; +} + +/************************************************************************/ +/* ReadWKBUInt32() */ +/************************************************************************/ + +inline uint32_t ReadWKBUInt32(const uint8_t* data, + OGRwkbByteOrder eByteOrder, + size_t& iOffset) +{ + uint32_t v; + memcpy(&v, data + iOffset, sizeof(v)); + iOffset += sizeof(v); + if( OGR_SWAP(eByteOrder)) + { + CPL_SWAP32PTR(&v); + } + return v; +} + 
+/************************************************************************/ +/* ReadWKBPointSequence() */ +/************************************************************************/ + +inline bool ReadWKBPointSequence(const uint8_t* data, size_t size, + OGRwkbByteOrder eByteOrder, + int nDim, + size_t& iOffset, OGREnvelope& sEnvelope) +{ + const uint32_t nPoints = ReadWKBUInt32(data, eByteOrder, iOffset); + if( nPoints > (size - iOffset) / (nDim * sizeof(double)) ) + return false; + double dfX = 0; + double dfY = 0; + for( uint32_t j = 0; j < nPoints; j++ ) + { + memcpy(&dfX, data + iOffset, sizeof(double)); + memcpy(&dfY, data + iOffset + sizeof(double), sizeof(double)); + iOffset += nDim * sizeof(double); + if( OGR_SWAP(eByteOrder)) + { + CPL_SWAP64PTR(&dfX); + CPL_SWAP64PTR(&dfY); + } + sEnvelope.MinX = std::min(sEnvelope.MinX, dfX); + sEnvelope.MinY = std::min(sEnvelope.MinY, dfY); + sEnvelope.MaxX = std::max(sEnvelope.MaxX, dfX); + sEnvelope.MaxY = std::max(sEnvelope.MaxY, dfY); + } + return true; +} + +/************************************************************************/ +/* ReadWKBRingSequence() */ +/************************************************************************/ + +inline bool ReadWKBRingSequence(const uint8_t* data, size_t size, + OGRwkbByteOrder eByteOrder, + int nDim, + size_t& iOffset, OGREnvelope& sEnvelope) +{ + const uint32_t nRings = ReadWKBUInt32(data, eByteOrder, iOffset); + if( nRings > (size - iOffset) / sizeof(uint32_t) ) + return false; + for(uint32_t i = 0; i < nRings; i++ ) + { + if( iOffset + sizeof(uint32_t) > size ) + return false; + if( !ReadWKBPointSequence(data, size, eByteOrder, nDim, iOffset, sEnvelope) ) + return false; + } + return true; +} + +/************************************************************************/ +/* ReadWKBBoundingBox() */ +/************************************************************************/ + +constexpr uint32_t WKB_PREFIX_SIZE = 1 + sizeof(uint32_t); +constexpr uint32_t MIN_WKB_SIZE = 
WKB_PREFIX_SIZE + sizeof(uint32_t); + +static bool ReadWKBBoundingBoxInternal(const uint8_t* data, size_t size, + size_t& iOffset, OGREnvelope& sEnvelope, + int nRec) +{ + if( size - iOffset < MIN_WKB_SIZE ) + return false; + const int nByteOrder = DB2_V72_FIX_BYTE_ORDER(data[iOffset]); + if( !(nByteOrder == wkbXDR || nByteOrder == wkbNDR) ) + return false; + const OGRwkbByteOrder eByteOrder = static_cast(nByteOrder); + + OGRwkbGeometryType eGeometryType = wkbUnknown; + OGRReadWKBGeometryType(data + iOffset, wkbVariantIso, &eGeometryType); + iOffset += 5; + const auto eFlatType = wkbFlatten(eGeometryType); + const int nDim = 2 + (OGR_GT_HasZ(eGeometryType) ? 1 : 0) + + (OGR_GT_HasM(eGeometryType) ? 1 : 0); + + if( eFlatType == wkbPoint ) + { + if( size - iOffset < nDim * sizeof(double) ) + return false; + double dfX = 0; + double dfY = 0; + memcpy(&dfX, data + iOffset, sizeof(double)); + memcpy(&dfY, data + iOffset + sizeof(double), sizeof(double)); + iOffset += nDim * sizeof(double); + if( OGR_SWAP(eByteOrder)) + { + CPL_SWAP64PTR(&dfX); + CPL_SWAP64PTR(&dfY); + } + sEnvelope.MinX = dfX; + sEnvelope.MinY = dfY; + sEnvelope.MaxX = dfX; + sEnvelope.MaxY = dfY; + return true; + } + + if( eFlatType == wkbLineString || eFlatType == wkbCircularString ) + { + sEnvelope.MinX = std::numeric_limits::max(); + sEnvelope.MinY = std::numeric_limits::max(); + sEnvelope.MaxX = -std::numeric_limits::max(); + sEnvelope.MaxY = -std::numeric_limits::max(); + + return ReadWKBPointSequence(data, size, eByteOrder, nDim, iOffset, sEnvelope); + } + + if( eFlatType == wkbPolygon ) + { + sEnvelope.MinX = std::numeric_limits::max(); + sEnvelope.MinY = std::numeric_limits::max(); + sEnvelope.MaxX = -std::numeric_limits::max(); + sEnvelope.MaxY = -std::numeric_limits::max(); + + return ReadWKBRingSequence(data, size, eByteOrder, nDim, iOffset, sEnvelope); + } + + if( eFlatType == wkbMultiPoint ) + { + sEnvelope.MinX = std::numeric_limits::max(); + sEnvelope.MinY = std::numeric_limits::max(); + 
sEnvelope.MaxX = -std::numeric_limits::max(); + sEnvelope.MaxY = -std::numeric_limits::max(); + + uint32_t nParts = ReadWKBUInt32(data, eByteOrder, iOffset); + if( nParts > (size - iOffset) / (WKB_PREFIX_SIZE + nDim * sizeof(double)) ) + return false; + double dfX = 0; + double dfY = 0; + for( uint32_t k = 0; k < nParts; k++ ) + { + iOffset += WKB_PREFIX_SIZE; + memcpy(&dfX, data + iOffset, sizeof(double)); + memcpy(&dfY, data + iOffset + sizeof(double), sizeof(double)); + iOffset += nDim * sizeof(double); + if( OGR_SWAP(eByteOrder)) + { + CPL_SWAP64PTR(&dfX); + CPL_SWAP64PTR(&dfY); + } + sEnvelope.MinX = std::min(sEnvelope.MinX, dfX); + sEnvelope.MinY = std::min(sEnvelope.MinY, dfY); + sEnvelope.MaxX = std::max(sEnvelope.MaxX, dfX); + sEnvelope.MaxY = std::max(sEnvelope.MaxY, dfY); + } + return true; + } + + if( eFlatType == wkbMultiLineString ) + { + sEnvelope.MinX = std::numeric_limits::max(); + sEnvelope.MinY = std::numeric_limits::max(); + sEnvelope.MaxX = -std::numeric_limits::max(); + sEnvelope.MaxY = -std::numeric_limits::max(); + + const uint32_t nParts = ReadWKBUInt32(data, eByteOrder, iOffset); + if( nParts > (size - iOffset) / MIN_WKB_SIZE ) + return false; + for( uint32_t k = 0; k < nParts; k++ ) + { + if( iOffset + MIN_WKB_SIZE > size ) + return false; + iOffset += WKB_PREFIX_SIZE; + if( !ReadWKBPointSequence(data, size, eByteOrder, nDim, iOffset, sEnvelope) ) + return false; + } + return true; + } + + if( eFlatType == wkbMultiPolygon ) + { + sEnvelope.MinX = std::numeric_limits::max(); + sEnvelope.MinY = std::numeric_limits::max(); + sEnvelope.MaxX = -std::numeric_limits::max(); + sEnvelope.MaxY = -std::numeric_limits::max(); + + const uint32_t nParts = ReadWKBUInt32(data, eByteOrder, iOffset); + if( nParts > (size - iOffset) / MIN_WKB_SIZE ) + return false; + for( uint32_t k = 0; k < nParts; k++ ) + { + if( iOffset + MIN_WKB_SIZE > size ) + return false; + CPLAssert( data[iOffset] == eByteOrder ); + iOffset += WKB_PREFIX_SIZE; + if( 
!ReadWKBRingSequence(data, size, eByteOrder, nDim, iOffset, sEnvelope) ) + return false; + } + return true; + } + + if( eFlatType == wkbGeometryCollection || + eFlatType == wkbCompoundCurve || + eFlatType == wkbCurvePolygon || + eFlatType == wkbMultiCurve || + eFlatType == wkbMultiSurface ) + { + if( nRec == 128 ) + return false; + sEnvelope.MinX = std::numeric_limits::max(); + sEnvelope.MinY = std::numeric_limits::max(); + sEnvelope.MaxX = -std::numeric_limits::max(); + sEnvelope.MaxY = -std::numeric_limits::max(); + + const uint32_t nParts = ReadWKBUInt32(data, eByteOrder, iOffset); + if( nParts > (size - iOffset) / MIN_WKB_SIZE ) + return false; + OGREnvelope sEnvelopeSubGeom; + for( uint32_t k = 0; k < nParts; k++ ) + { + if( !ReadWKBBoundingBoxInternal(data, size, iOffset, sEnvelopeSubGeom, nRec + 1) ) + return false; + sEnvelope.Merge(sEnvelopeSubGeom); + } + return true; + } + + return false; +} + +inline bool OGRArrowLayer::ReadWKBBoundingBox(const uint8_t* data, size_t size, OGREnvelope& sEnvelope) +{ + size_t iOffset = 0; + return ReadWKBBoundingBoxInternal(data, size, iOffset, sEnvelope, 0); +} + +/************************************************************************/ +/* ReadList() */ +/************************************************************************/ + +template +static void ReadList(OGRFeature* poFeature, int i, + int64_t nIdxInBatch, + const ArrayType* array) +{ + const auto values = std::static_pointer_cast(array->values()); + const auto nIdxStart = array->value_offset(nIdxInBatch); + const int nCount = array->value_length(nIdxInBatch); + std::vector aValues; + aValues.reserve(nCount); + for( int k = 0; k < nCount; k++ ) + { + aValues.push_back(static_cast(values->Value(nIdxStart + k))); + } + poFeature->SetField( i, nCount, aValues.data() ); +} + +template +static void ReadListDouble(OGRFeature* poFeature, int i, + int64_t nIdxInBatch, + const ArrayType* array) +{ + const auto values = std::static_pointer_cast(array->values()); + const 
auto rawValues = values->raw_values(); + const auto nIdxStart = array->value_offset(nIdxInBatch); + const int nCount = array->value_length(nIdxInBatch); + std::vector aValues; + aValues.reserve(nCount); + for( int k = 0; k < nCount; k++ ) + { + if( values->IsNull(nIdxStart + k) ) + aValues.push_back(std::numeric_limits::quiet_NaN()); + else + aValues.push_back(rawValues[nIdxStart + k]); + } + poFeature->SetField( i, nCount, aValues.data() ); +} + +template +static void ReadList(OGRFeature* poFeature, int i, + int64_t nIdxInBatch, + const ArrayType* array, + arrow::Type::type valueTypeId) +{ + switch( valueTypeId ) + { + case arrow::Type::BOOL: + { + ReadList(poFeature, i, + nIdxInBatch, + array); + break; + } + case arrow::Type::UINT8: + { + ReadList(poFeature, i, + nIdxInBatch, + array); + break; + } + case arrow::Type::INT8: + { + ReadList(poFeature, i, + nIdxInBatch, + array); + break; + } + case arrow::Type::UINT16: + { + ReadList(poFeature, i, + nIdxInBatch, + array); + break; + } + case arrow::Type::INT16: + { + ReadList(poFeature, i, + nIdxInBatch, + array); + break; + } + case arrow::Type::INT32: + { + ReadList(poFeature, i, + nIdxInBatch, + array); + break; + } + case arrow::Type::UINT32: + { + ReadList(poFeature, i, + nIdxInBatch, + array); + break; + } + case arrow::Type::INT64: + { + ReadList(poFeature, i, + nIdxInBatch, + array); + break; + } + case arrow::Type::UINT64: + { + ReadList(poFeature, i, + nIdxInBatch, + array); + break; + } + case arrow::Type::HALF_FLOAT: + { + ReadListDouble(poFeature, i, + nIdxInBatch, + array); + break; + } + case arrow::Type::FLOAT: + { + ReadListDouble(poFeature, i, + nIdxInBatch, + array); + break; + } + case arrow::Type::DOUBLE: + { + ReadListDouble(poFeature, i, + nIdxInBatch, + array); + break; + } + case arrow::Type::STRING: + { + const auto values = std::static_pointer_cast(array->values()); + const auto nIdxStart = array->value_offset(nIdxInBatch); + const int nCount = array->value_length(nIdxInBatch); + 
CPLStringList aosList; + for( int k = 0; k < nCount; k++ ) + { + if( values->IsNull(nIdxStart + k) ) + aosList.AddString(""); // we cannot have null strings in a list + else + aosList.AddString(values->GetString(nIdxStart + k).c_str()); + } + poFeature->SetField( i, aosList.List() ); + break; + } + + default: + break; + } +} + +/************************************************************************/ +/* ReadMap() */ +/************************************************************************/ + +template +static void ReadMap(OGRFeature* poFeature, int i, + int64_t nIdxInBatch, + const arrow::MapArray* array) +{ + const auto keys = std::static_pointer_cast(array->keys()); + const auto values = std::static_pointer_cast(array->items()); + const auto nIdxStart = array->value_offset(nIdxInBatch); + const int nCount = array->value_length(nIdxInBatch); + CPLJSONObject oRoot; + for( int k = 0; k < nCount; k++ ) + { + if( !keys->IsNull(nIdxStart + k) ) + { + const auto osKey = keys->GetString(nIdxStart + k); + if( !values->IsNull(nIdxStart + k) ) + oRoot.Add(osKey, static_cast(values->Value(nIdxStart + k))); + else + oRoot.AddNull(osKey); + } + } + poFeature->SetField(i, oRoot.Format(CPLJSONObject::PrettyFormat::Plain).c_str()); +} + +/************************************************************************/ +/* SetPointsOfLine() */ +/************************************************************************/ + +template +void SetPointsOfLine(OGRLineString* poLS, + const std::shared_ptr& pointValues, + int pointOffset, + int numPoints) +{ + if( !bHasZ && !bHasM ) + { + static_assert(sizeof(OGRRawPoint) == 2 * sizeof(double), + "sizeof(OGRRawPoint) == 2 * sizeof(double)"); + poLS->setPoints(numPoints, + reinterpret_cast( + pointValues->raw_values() + pointOffset)); + return; + } + + poLS->setNumPoints(numPoints, FALSE); + for( int k = 0; k < numPoints; k++ ) + { + if( bHasZ ) + { + if( bHasM ) + { + poLS->setPoint(k, + pointValues->Value(pointOffset + nDim * k), + 
pointValues->Value(pointOffset + nDim * k + 1), + pointValues->Value(pointOffset + nDim * k + 2), + pointValues->Value(pointOffset + nDim * k + 3)); + } + else + { + poLS->setPoint(k, + pointValues->Value(pointOffset + nDim * k), + pointValues->Value(pointOffset + nDim * k + 1), + pointValues->Value(pointOffset + nDim * k + 2)); + } + } + else /* if( bHasM ) */ + { + poLS->setPointM(k, + pointValues->Value(pointOffset + nDim * k), + pointValues->Value(pointOffset + nDim * k + 1), + pointValues->Value(pointOffset + nDim * k + 2)); + } + } +} + +typedef void (*SetPointsOfLineType)(OGRLineString*, + const std::shared_ptr&, + int, int); + +static SetPointsOfLineType GetSetPointsOfLine(bool bHasZ, bool bHasM) +{ + if( bHasZ && bHasM ) + return SetPointsOfLine; + if( bHasZ ) + return SetPointsOfLine; + if( bHasM ) + return SetPointsOfLine; + return SetPointsOfLine; +} + +/************************************************************************/ +/* ReadFeature() */ +/************************************************************************/ + +inline +OGRFeature* OGRArrowLayer::ReadFeature( + int64_t nIdxInBatch, + const std::vector>& poColumnArrays) const +{ + OGRFeature* poFeature = new OGRFeature(m_poFeatureDefn); + + if( m_iFIDArrowColumn >= 0 ) + { + const int iCol = m_bIgnoredFields ? 
m_nRequestedFIDColumn : m_iFIDArrowColumn; + const arrow::Array* array = poColumnArrays[iCol].get(); + if( !array->IsNull(nIdxInBatch) ) + { + if( array->type_id() == arrow::Type::INT64 ) + { + const auto castArray = static_cast(array); + poFeature->SetFID(static_cast(castArray->Value(nIdxInBatch))); + } + else if( array->type_id() == arrow::Type::INT32 ) + { + const auto castArray = static_cast(array); + poFeature->SetFID(castArray->Value(nIdxInBatch)); + } + } + } + + const int nFieldCount = m_poFeatureDefn->GetFieldCount(); + for( int i = 0; i < nFieldCount; ++i ) + { + int iCol; + if( m_bIgnoredFields ) + { + iCol = m_anMapFieldIndexToArrayIndex[i]; + if( iCol < 0 ) + continue; + } + else + { + iCol = m_anMapFieldIndexToArrowColumn[i][0]; + } + + const arrow::Array* array = poColumnArrays[iCol].get(); + if( array->IsNull(nIdxInBatch) ) + { + poFeature->SetFieldNull(i); + continue; + } + + int j = 1; + bool bSkipToNextField = false; + while( array->type_id() == arrow::Type::STRUCT ) + { + const auto castArray = static_cast(array); + const auto& subArrays = castArray->fields(); + CPLAssert( j < static_cast(m_anMapFieldIndexToArrowColumn[i].size()) ); + const int iArrowSubcol = m_anMapFieldIndexToArrowColumn[i][j]; + j ++; + CPLAssert( iArrowSubcol < static_cast(subArrays.size()) ); + array = subArrays[iArrowSubcol].get(); + if( array->IsNull(nIdxInBatch) ) + { + poFeature->SetFieldNull(i); + bSkipToNextField = true; + break; + } + } + if( bSkipToNextField ) + continue; + + if( array->type_id() == arrow::Type::DICTIONARY ) + { + const auto castArray = static_cast(array); + m_poReadFeatureTmpArray = castArray->indices(); // does not return a const reference + array = m_poReadFeatureTmpArray.get(); + if( array->IsNull(nIdxInBatch) ) + { + poFeature->SetFieldNull(i); + continue; + } + } + + switch( array->type_id() ) + { + case arrow::Type::NA: + break; + + case arrow::Type::BOOL: + { + const auto castArray = static_cast(array); + poFeature->SetField(i, 
castArray->Value(nIdxInBatch)); + break; + } + case arrow::Type::UINT8: + { + const auto castArray = static_cast(array); + poFeature->SetField(i, castArray->Value(nIdxInBatch)); + break; + } + case arrow::Type::INT8: + { + const auto castArray = static_cast(array); + poFeature->SetField(i, castArray->Value(nIdxInBatch)); + break; + } + case arrow::Type::UINT16: + { + const auto castArray = static_cast(array); + poFeature->SetField(i, castArray->Value(nIdxInBatch)); + break; + } + case arrow::Type::INT16: + { + const auto castArray = static_cast(array); + poFeature->SetField(i, castArray->Value(nIdxInBatch)); + break; + } + case arrow::Type::UINT32: + { + const auto castArray = static_cast(array); + poFeature->SetField(i, static_cast(castArray->Value(nIdxInBatch))); + break; + } + case arrow::Type::INT32: + { + const auto castArray = static_cast(array); + poFeature->SetField(i, castArray->Value(nIdxInBatch)); + break; + } + case arrow::Type::UINT64: + { + const auto castArray = static_cast(array); + poFeature->SetField(i, static_cast(castArray->Value(nIdxInBatch))); + break; + } + case arrow::Type::INT64: + { + const auto castArray = static_cast(array); + poFeature->SetField(i, static_cast(castArray->Value(nIdxInBatch))); + break; + } + case arrow::Type::HALF_FLOAT: + { + const auto castArray = static_cast(array); + poFeature->SetField(i, castArray->Value(nIdxInBatch)); + break; + } + case arrow::Type::FLOAT: + { + const auto castArray = static_cast(array); + poFeature->SetField(i, castArray->Value(nIdxInBatch)); + break; + } + case arrow::Type::DOUBLE: + { + const auto castArray = static_cast(array); + poFeature->SetField(i, castArray->Value(nIdxInBatch)); + break; + } + case arrow::Type::STRING: + { + const auto castArray = static_cast(array); + poFeature->SetField(i, castArray->GetString(nIdxInBatch).c_str()); + break; + } + case arrow::Type::BINARY: + { + const auto castArray = static_cast(array); + int out_length = 0; + const uint8_t* data = 
castArray->GetValue(nIdxInBatch, &out_length); + poFeature->SetField(i, out_length, data); + break; + } + case arrow::Type::FIXED_SIZE_BINARY: + { + const auto castArray = static_cast(array); + const uint8_t* data = castArray->GetValue(nIdxInBatch); + poFeature->SetField(i, castArray->byte_width(), data); + break; + } + case arrow::Type::DATE32: + { + // number of days since Epoch + const auto castArray = static_cast(array); + int64_t timestamp = static_cast(castArray->Value(nIdxInBatch)) * 3600 * 24; + struct tm dt; + CPLUnixTimeToYMDHMS(timestamp, &dt); + poFeature->SetField(i, dt.tm_year + 1900, dt.tm_mon + 1, dt.tm_mday, + 0,0,0); + break; + } + case arrow::Type::DATE64: + { + // number of milliseconds since Epoch + const auto castArray = static_cast(array); + int64_t timestamp = static_cast(castArray->Value(nIdxInBatch)) / 1000; + struct tm dt; + CPLUnixTimeToYMDHMS(timestamp, &dt); + poFeature->SetField(i, dt.tm_year + 1900, dt.tm_mon + 1, dt.tm_mday, + 0,0,0); + break; + } + case arrow::Type::TIMESTAMP: + { + const auto timestampType = static_cast(array->data()->type.get()); + const auto castArray = static_cast(array); + int64_t timestamp = castArray->Value(nIdxInBatch); + const auto unit = timestampType->unit(); + double floatingPart = 0; + if( unit == arrow::TimeUnit::MILLI ) + { + floatingPart = (timestamp % 1000) / 1e3; + timestamp /= 1000; + } + else if( unit == arrow::TimeUnit::MICRO ) + { + floatingPart = (timestamp % (1000 * 1000)) / 1e6; + timestamp /= 1000 * 1000; + } + else if( unit == arrow::TimeUnit::NANO ) + { + floatingPart = (timestamp % (1000 * 1000 * 1000)) / 1e9; + timestamp /= 1000 * 1000 * 1000; + } + int nTZFlag = 0; + const auto osTZ = timestampType->timezone(); + if( osTZ == "UTC" || osTZ == "Etc/UTC" ) + { + nTZFlag = 100; + } + else if( osTZ.size() == 6 && + (osTZ[0] == '+' || osTZ[0] == '-') && + osTZ[3] == ':' ) + { + int nTZHour = atoi(osTZ.c_str() + 1); + int nTZMin = atoi(osTZ.c_str() + 4); + if( nTZHour >= 0 && nTZHour <= 14 
&& + nTZMin >= 0 && nTZMin < 60 && + (nTZMin % 15) == 0 ) + { + nTZFlag = (nTZHour * 4) + (nTZMin / 15); + if( osTZ[0] == '+' ) + { + nTZFlag = 100 + nTZFlag; + timestamp += nTZHour * 3600 + nTZMin * 60; + } + else + { + nTZFlag = 100 - nTZFlag; + timestamp -= nTZHour * 3600 + nTZMin * 60; + } + } + } + struct tm dt; + CPLUnixTimeToYMDHMS(timestamp, &dt); + poFeature->SetField(i, dt.tm_year + 1900, dt.tm_mon + 1, dt.tm_mday, + dt.tm_hour, dt.tm_min, + static_cast(dt.tm_sec + floatingPart), + nTZFlag); + break; + } + case arrow::Type::TIME32: + { + const auto timestampType = static_cast(array->data()->type.get()); + const auto castArray = static_cast(array); + const auto unit = timestampType->unit(); + int value = castArray->Value(nIdxInBatch); + double floatingPart = 0; + if( unit == arrow::TimeUnit::MILLI ) + { + floatingPart = (value % 1000) / 1e3; + value /= 1000; + } + const int nHour = value / 3600; + const int nMinute = (value / 60) % 60; + const int nSecond = value % 60; + poFeature->SetField(i, 0, 0, 0, nHour, nMinute, + static_cast(nSecond + floatingPart)); + break; + } + case arrow::Type::TIME64: + { + const auto castArray = static_cast(array); + poFeature->SetField(i, static_cast(castArray->Value(nIdxInBatch))); + break; + } + + case arrow::Type::DECIMAL128: + { + const auto castArray = static_cast(array); + poFeature->SetField(i, CPLAtof(castArray->FormatValue(nIdxInBatch).c_str())); + break; + } + + case arrow::Type::DECIMAL256: + { + const auto castArray = static_cast(array); + poFeature->SetField(i, CPLAtof(castArray->FormatValue(nIdxInBatch).c_str())); + break; + } + + case arrow::Type::LIST: + { + const auto castArray = static_cast(array); + const auto listType = static_cast(array->data()->type.get()); + ReadList(poFeature, i, nIdxInBatch, castArray, listType->value_field()->type()->id()); + break; + } + + case arrow::Type::FIXED_SIZE_LIST: + { + const auto castArray = static_cast(array); + const auto listType = 
static_cast(array->data()->type.get()); + ReadList(poFeature, i, nIdxInBatch, castArray, listType->value_field()->type()->id()); + break; + } + + case arrow::Type::LARGE_STRING: + { + const auto castArray = static_cast(array); + poFeature->SetField(i, castArray->GetString(nIdxInBatch).c_str()); + break; + } + case arrow::Type::LARGE_BINARY: + { + const auto castArray = static_cast(array); + arrow::LargeBinaryArray::offset_type out_length = 0; + const uint8_t* data = castArray->GetValue(nIdxInBatch, &out_length); + if( out_length <= INT_MAX ) + { + poFeature->SetField(i, static_cast(out_length), data); + } + else + { + // this is probably the most likely code path if people use LargeBinary... + CPLError(CE_Warning, CPLE_AppDefined, + "Too large binary: " CPL_FRMT_GUIB " bytes", + static_cast(out_length)); + } + break; + } + + case arrow::Type::MAP: + { + const auto castArray = static_cast(array); + const auto mapType = static_cast(array->data()->type.get()); + const auto itemTypeId = mapType->item_type()->id(); + if( mapType->key_type()->id() == arrow::Type::STRING ) + { + if( itemTypeId == arrow::Type::BOOL ) + { + ReadMap(poFeature, i, + nIdxInBatch, + castArray); + } + else if( itemTypeId == arrow::Type::UINT8 ) + { + ReadMap(poFeature, i, + nIdxInBatch, + castArray); + } + else if( itemTypeId == arrow::Type::INT8 ) + { + ReadMap(poFeature, i, + nIdxInBatch, + castArray); + } + else if( itemTypeId == arrow::Type::UINT16 ) + { + ReadMap(poFeature, i, + nIdxInBatch, + castArray); + } + else if( itemTypeId == arrow::Type::INT16 ) + { + ReadMap(poFeature, i, + nIdxInBatch, + castArray); + } + else if( itemTypeId == arrow::Type::UINT32 ) + { + ReadMap(poFeature, i, + nIdxInBatch, + castArray); + } + else if( itemTypeId == arrow::Type::INT32 ) + { + ReadMap(poFeature, i, + nIdxInBatch, + castArray); + } + else if( itemTypeId == arrow::Type::UINT64 ) + { + ReadMap(poFeature, i, + nIdxInBatch, + castArray); + } + else if( itemTypeId == arrow::Type::INT64 ) + { + 
ReadMap(poFeature, i, + nIdxInBatch, + castArray); + } + else if( itemTypeId == arrow::Type::FLOAT ) + { + ReadMap(poFeature, i, + nIdxInBatch, + castArray); + } + else if( itemTypeId == arrow::Type::DOUBLE ) + { + ReadMap(poFeature, i, + nIdxInBatch, + castArray); + } + else if( itemTypeId == arrow::Type::STRING ) + { + const auto keys = std::static_pointer_cast(castArray->keys()); + const auto values = std::static_pointer_cast(castArray->items()); + const auto nIdxStart = castArray->value_offset(nIdxInBatch); + const int nCount = castArray->value_length(nIdxInBatch); + CPLJSONDocument oDoc; + auto oRoot = oDoc.GetRoot(); + for( int k = 0; k < nCount; k++ ) + { + if( !keys->IsNull(nIdxStart + k) ) + { + const auto osKey = keys->GetString(nIdxStart + k); + if( !values->IsNull(nIdxStart + k) ) + oRoot.Add(osKey, values->GetString(nIdxStart + k)); + else + oRoot.AddNull(osKey); + } + } + poFeature->SetField(i, oRoot.Format(CPLJSONObject::PrettyFormat::Plain).c_str()); + } + } + break; + } + + // unhandled types + case arrow::Type::STRUCT: // should not happen + case arrow::Type::INTERVAL_MONTHS: + case arrow::Type::INTERVAL_DAY_TIME: + case arrow::Type::SPARSE_UNION: + case arrow::Type::DENSE_UNION: + case arrow::Type::DICTIONARY: + case arrow::Type::EXTENSION: + case arrow::Type::DURATION: + case arrow::Type::LARGE_LIST: + case arrow::Type::INTERVAL_MONTH_DAY_NANO: + case arrow::Type::MAX_ID: + { + // Shouldn't happen normally as we should have discarded those + // fields when creating OGR field definitions + CPLError(CE_Warning, CPLE_AppDefined, + "Cannot read content for field %s", + m_poFeatureDefn->GetFieldDefn(i)->GetNameRef()); + break; + } + } + } + + const int nGeomFieldCount = m_poFeatureDefn->GetGeomFieldCount(); + for( int i = 0; i < nGeomFieldCount; ++i ) + { + int iCol; + if( m_bIgnoredFields ) + { + iCol = m_anMapGeomFieldIndexToArrayIndex[i]; + if( iCol < 0 ) + continue; + } + else + { + iCol = m_anMapGeomFieldIndexToArrowColumn[i]; + } + + const auto 
array = poColumnArrays[iCol].get(); + auto poGeometry = ReadGeometry(i, array, nIdxInBatch); + if( poGeometry ) + { + const auto poGeomFieldDefn = m_poFeatureDefn->GetGeomFieldDefn(i); + if( poGeometry->getGeometryType() == wkbLineString && + poGeomFieldDefn->GetType() == wkbMultiLineString ) + { + poGeometry = OGRGeometryFactory::forceToMultiLineString(poGeometry); + } + else if( poGeometry->getGeometryType() == wkbPolygon && + poGeomFieldDefn->GetType() == wkbMultiPolygon ) + { + poGeometry = OGRGeometryFactory::forceToMultiPolygon(poGeometry); + } + poFeature->SetGeomFieldDirectly(i, poGeometry); + } + } + + return poFeature; +} + +/************************************************************************/ +/* ReadGeometry() */ +/************************************************************************/ + +inline +OGRGeometry* OGRArrowLayer::ReadGeometry(int iGeomField, + const arrow::Array* array, + int64_t nIdxInBatch) const +{ + if( array->IsNull(nIdxInBatch) ) + { + return nullptr; + } + OGRGeometry* poGeometry = nullptr; + const auto poGeomFieldDefn = m_poFeatureDefn->GetGeomFieldDefn(iGeomField); + const auto eGeomType = poGeomFieldDefn->GetType(); + const bool bHasZ = CPL_TO_BOOL(OGR_GT_HasZ(eGeomType)); + const bool bHasM = CPL_TO_BOOL(OGR_GT_HasM(eGeomType)); + const int nDim = 2 + (bHasZ ? 1 : 0) + (bHasM ? 
1 : 0); + + const auto CreatePoint = [bHasZ, bHasM, nDim](const std::shared_ptr& pointValues, + int pointOffset) + { + if( bHasZ ) + { + if( bHasM ) + { + return new OGRPoint(pointValues->Value(pointOffset), + pointValues->Value(pointOffset + 1), + pointValues->Value(pointOffset + 2), + pointValues->Value(pointOffset + 3)); + } + else + { + return new OGRPoint(pointValues->Value(pointOffset), + pointValues->Value(pointOffset + 1), + pointValues->Value(pointOffset + 2)); + } + } + else if( bHasM ) + { + return OGRPoint::createXYM(pointValues->Value(pointOffset), + pointValues->Value(pointOffset + 1), + pointValues->Value(pointOffset + 2)); + } + else + { + return new OGRPoint(pointValues->Value(pointOffset), + pointValues->Value(pointOffset + 1)); + } + }; + + switch( m_aeGeomEncoding[iGeomField] ) + { + case OGRArrowGeomEncoding::WKB: + { + CPLAssert( array->type_id() == arrow::Type::BINARY ); + const auto castArray = static_cast(array); + int out_length = 0; + const uint8_t* data = castArray->GetValue(nIdxInBatch, &out_length); + if( OGRGeometryFactory::createFromWkb( + data, poGeomFieldDefn->GetSpatialRef(), &poGeometry, + out_length ) == OGRERR_NONE ) + { +#ifdef DEBUG_ReadWKBBoundingBox + OGREnvelope sEnvelopeFromWKB; + bool bRet = ReadWKBBoundingBox(data, out_length, sEnvelopeFromWKB); + CPLAssert(bRet); + OGREnvelope sEnvelopeFromGeom; + poGeometry->getEnvelope(&sEnvelopeFromGeom); + CPLAssert(sEnvelopeFromWKB == sEnvelopeFromGeom); +#endif + } + break; + } + + case OGRArrowGeomEncoding::WKT: + { + CPLAssert( array->type_id() == arrow::Type::STRING ); + const auto castArray = static_cast(array); + const auto osWKT = castArray->GetString(nIdxInBatch); + OGRGeometryFactory::createFromWkt( + osWKT.c_str(), poGeomFieldDefn->GetSpatialRef(), &poGeometry ); + break; + } + + case OGRArrowGeomEncoding::GEOARROW_GENERIC: + { + CPLAssert(false); + break; + } + + case OGRArrowGeomEncoding::GEOARROW_POINT: + { + CPLAssert( array->type_id() == arrow::Type::FIXED_SIZE_LIST 
); + const auto listArray = static_cast(array); + CPLAssert( listArray->values()->type_id() == arrow::Type::DOUBLE ); + const auto pointValues = std::static_pointer_cast(listArray->values()); + if( !pointValues->IsNull(nDim * nIdxInBatch) ) + { + poGeometry = CreatePoint(pointValues, static_cast(nDim * nIdxInBatch)); + poGeometry->assignSpatialReference(poGeomFieldDefn->GetSpatialRef()); + } + break; + } + + case OGRArrowGeomEncoding::GEOARROW_LINESTRING: + { + CPLAssert( array->type_id() == arrow::Type::LIST ); + const auto listArray = static_cast(array); + CPLAssert( listArray->values()->type_id() == arrow::Type::FIXED_SIZE_LIST ); + const auto listOfPointsValues = std::static_pointer_cast(listArray->values()); + CPLAssert( listOfPointsValues->values()->type_id() == arrow::Type::DOUBLE ); + const auto pointValues = std::static_pointer_cast(listOfPointsValues->values()); + const auto nPoints = listArray->value_length(nIdxInBatch); + const auto nPointOffset = listArray->value_offset(nIdxInBatch) * nDim; + auto poLineString = new OGRLineString(); + poGeometry = poLineString; + poGeometry->assignSpatialReference(poGeomFieldDefn->GetSpatialRef()); + if( nPoints ) + { + GetSetPointsOfLine(bHasZ, bHasM)(poLineString, pointValues, nPointOffset, nPoints); + } + else + { + poGeometry->set3D(bHasZ); + poGeometry->setMeasured(bHasM); + } + break; + } + + case OGRArrowGeomEncoding::GEOARROW_POLYGON: + { + CPLAssert( array->type_id() == arrow::Type::LIST ); + const auto listOfRingsArray = static_cast(array); + CPLAssert( listOfRingsArray->values()->type_id() == arrow::Type::LIST ); + const auto listOfRingsValues = std::static_pointer_cast(listOfRingsArray->values()); + CPLAssert( listOfRingsValues->values()->type_id() == arrow::Type::FIXED_SIZE_LIST ); + const auto listOfPointsValues = std::static_pointer_cast(listOfRingsValues->values()); + CPLAssert( listOfPointsValues->values()->type_id() == arrow::Type::DOUBLE ); + const auto pointValues = 
std::static_pointer_cast(listOfPointsValues->values()); + const auto setPointsFun = GetSetPointsOfLine(bHasZ, bHasM); + const auto nRings = listOfRingsArray->value_length(nIdxInBatch); + const auto nRingOffset = listOfRingsArray->value_offset(nIdxInBatch); + auto poPoly = new OGRPolygon(); + poGeometry = poPoly; + poGeometry->assignSpatialReference(poGeomFieldDefn->GetSpatialRef()); + for( auto k = decltype(nRings){0}; k < nRings; k++ ) + { + const auto nPoints = listOfRingsValues->value_length(nRingOffset + k); + const auto nPointOffset = listOfRingsValues->value_offset(nRingOffset + k) * nDim; + auto poRing = new OGRLinearRing(); + if( nPoints ) + { + setPointsFun(poRing, pointValues, nPointOffset, nPoints); + } + poPoly->addRingDirectly(poRing); + } + if( poGeometry->IsEmpty() ) + { + poGeometry->set3D(bHasZ); + poGeometry->setMeasured(bHasM); + } + break; + } + + case OGRArrowGeomEncoding::GEOARROW_MULTIPOINT: + { + CPLAssert( array->type_id() == arrow::Type::LIST ); + const auto listArray = static_cast(array); + CPLAssert( listArray->values()->type_id() == arrow::Type::FIXED_SIZE_LIST ); + const auto listOfPointsValues = std::static_pointer_cast(listArray->values()); + CPLAssert( listOfPointsValues->values()->type_id() == arrow::Type::DOUBLE ); + const auto pointValues = std::static_pointer_cast(listOfPointsValues->values()); + const auto nPoints = listArray->value_length(nIdxInBatch); + const auto nPointOffset = listArray->value_offset(nIdxInBatch) * nDim; + auto poMultiPoint = new OGRMultiPoint(); + poGeometry = poMultiPoint; + poGeometry->assignSpatialReference(poGeomFieldDefn->GetSpatialRef()); + for( auto k = decltype(nPoints){0}; k < nPoints; k++ ) + { + poMultiPoint->addGeometryDirectly( + CreatePoint(pointValues, nPointOffset + k * nDim)); + } + if( poGeometry->IsEmpty() ) + { + poGeometry->set3D(bHasZ); + poGeometry->setMeasured(bHasM); + } + break; + } + + case OGRArrowGeomEncoding::GEOARROW_MULTILINESTRING: + { + CPLAssert( array->type_id() == 
arrow::Type::LIST ); + const auto listOfStringsArray = static_cast(array); + CPLAssert( listOfStringsArray->values()->type_id() == arrow::Type::LIST ); + const auto listOfStringsValues = std::static_pointer_cast(listOfStringsArray->values()); + CPLAssert( listOfStringsValues->values()->type_id() == arrow::Type::FIXED_SIZE_LIST ); + const auto listOfPointsValues = std::static_pointer_cast(listOfStringsValues->values()); + CPLAssert( listOfPointsValues->values()->type_id() == arrow::Type::DOUBLE ); + const auto pointValues = std::static_pointer_cast(listOfPointsValues->values()); + const auto setPointsFun = GetSetPointsOfLine(bHasZ, bHasM); + const auto nStrings = listOfStringsArray->value_length(nIdxInBatch); + const auto nRingOffset = listOfStringsArray->value_offset(nIdxInBatch); + auto poMLS = new OGRMultiLineString(); + poGeometry = poMLS; + poGeometry->assignSpatialReference(poGeomFieldDefn->GetSpatialRef()); + for( auto k = decltype(nStrings){0}; k < nStrings; k++ ) + { + const auto nPoints = listOfStringsValues->value_length(nRingOffset + k); + const auto nPointOffset = listOfStringsValues->value_offset(nRingOffset + k) * nDim; + auto poLS = new OGRLineString(); + if( nPoints ) + { + setPointsFun(poLS, pointValues, nPointOffset, nPoints); + } + poMLS->addGeometryDirectly(poLS); + } + if( poGeometry->IsEmpty() ) + { + poGeometry->set3D(bHasZ); + poGeometry->setMeasured(bHasM); + } + break; + } + + case OGRArrowGeomEncoding::GEOARROW_MULTIPOLYGON: + { + CPLAssert( array->type_id() == arrow::Type::LIST ); + const auto listOfPartsArray = static_cast(array); + CPLAssert( listOfPartsArray->values()->type_id() == arrow::Type::LIST ); + const auto listOfPartsValues = std::static_pointer_cast(listOfPartsArray->values()); + CPLAssert( listOfPartsValues->values()->type_id() == arrow::Type::LIST ); + const auto listOfRingsValues = std::static_pointer_cast(listOfPartsValues->values()); + CPLAssert( listOfRingsValues->values()->type_id() == arrow::Type::FIXED_SIZE_LIST ); 
+ const auto listOfPointsValues = std::static_pointer_cast(listOfRingsValues->values()); + CPLAssert( listOfPointsValues->values()->type_id() == arrow::Type::DOUBLE ); + const auto pointValues = std::static_pointer_cast(listOfPointsValues->values()); + auto poMP = new OGRMultiPolygon(); + poGeometry = poMP; + poGeometry->assignSpatialReference(poGeomFieldDefn->GetSpatialRef()); + const auto setPointsFun = GetSetPointsOfLine(bHasZ, bHasM); + const auto nParts = listOfPartsArray->value_length(nIdxInBatch); + const auto nPartOffset = listOfPartsArray->value_offset(nIdxInBatch); + for( auto j = decltype(nParts){0}; j < nParts; j++ ) + { + const auto nRings = listOfPartsValues->value_length(nPartOffset + j); + const auto nRingOffset = listOfPartsValues->value_offset(nPartOffset + j); + auto poPoly = new OGRPolygon(); + for( auto k = decltype(nRings){0}; k < nRings; k++ ) + { + const auto nPoints = listOfRingsValues->value_length(nRingOffset + k); + const auto nPointOffset = listOfRingsValues->value_offset(nRingOffset + k) * nDim; + auto poRing = new OGRLinearRing(); + if( nPoints ) + { + setPointsFun(poRing, pointValues, nPointOffset, nPoints); + } + poPoly->addRingDirectly(poRing); + } + poMP->addGeometryDirectly(poPoly); + } + if( poGeometry->IsEmpty() ) + { + poGeometry->set3D(bHasZ); + poGeometry->setMeasured(bHasM); + } + break; + } + } + return poGeometry; +} + +/************************************************************************/ +/* ResetReading() */ +/************************************************************************/ + +inline +void OGRArrowLayer::ResetReading() +{ + m_bEOF = false; + m_nFeatureIdx = 0; + m_nIdxInBatch = 0; + m_poReadFeatureTmpArray.reset(); + if( m_iRecordBatch != 0 ) + { + m_iRecordBatch = -1; + m_poBatch.reset(); + } +} + +/************************************************************************/ +/* GetNextRawFeature() */ +/************************************************************************/ + +inline +OGRFeature* 
OGRArrowLayer::GetNextRawFeature() +{ + if( m_bEOF ) + return nullptr; + + if( m_poBatch == nullptr || m_nIdxInBatch == m_poBatch->num_rows() ) + { + m_bEOF = !ReadNextBatch(); + if( m_bEOF ) + return nullptr; + } + + // Evaluate spatial filter by computing the bounding box of each geometry + // but without creating a OGRGeometry + if( m_poFilterGeom ) + { + int iCol; + if( m_bIgnoredFields ) + { + iCol = m_anMapGeomFieldIndexToArrayIndex[m_iGeomFieldFilter]; + } + else + { + iCol = m_anMapGeomFieldIndexToArrowColumn[m_iGeomFieldFilter]; + } + if( iCol >= 0 && + m_aeGeomEncoding[m_iGeomFieldFilter] == OGRArrowGeomEncoding::WKB ) + { + auto array = m_poBatch->columns()[iCol]; + CPLAssert( array->type_id() == arrow::Type::BINARY ); + auto castArray = std::static_pointer_cast(array); + OGREnvelope sEnvelope; + while( true ) + { + bool bSkipToNextFeature = false; + if( array->IsNull(m_nIdxInBatch) ) + { + bSkipToNextFeature = true; + } + else + { + int out_length = 0; + const uint8_t* data = castArray->GetValue(m_nIdxInBatch, &out_length); + if( ReadWKBBoundingBox(data, out_length, sEnvelope) && + !m_sFilterEnvelope.Intersects(sEnvelope) ) + { + bSkipToNextFeature = true; + } + } + if( !bSkipToNextFeature ) + { + break; + } + + m_nFeatureIdx ++; + m_nIdxInBatch ++; + if( m_nIdxInBatch == m_poBatch->num_rows() ) + { + m_bEOF = !ReadNextBatch(); + if( m_bEOF ) + return nullptr; + array = m_poBatch->columns()[iCol]; + CPLAssert( array->type_id() == arrow::Type::BINARY ); + castArray = std::static_pointer_cast(array); + } + } + } + else if( iCol >= 0 && + m_aeGeomEncoding[m_iGeomFieldFilter] == OGRArrowGeomEncoding::GEOARROW_MULTIPOLYGON ) + { + const auto poGeomFieldDefn = m_poFeatureDefn->GetGeomFieldDefn(m_iGeomFieldFilter); + const auto eGeomType = poGeomFieldDefn->GetType(); + const bool bHasZ = CPL_TO_BOOL(OGR_GT_HasZ(eGeomType)); + const bool bHasM = CPL_TO_BOOL(OGR_GT_HasM(eGeomType)); + const int nDim = 2 + (bHasZ ? 1 : 0) + (bHasM ? 
1 : 0); + +begin_multipolygon: + auto array = m_poBatch->columns()[iCol].get(); + CPLAssert( array->type_id() == arrow::Type::LIST ); + auto listOfPartsArray = static_cast(array); + CPLAssert( listOfPartsArray->values()->type_id() == arrow::Type::LIST ); + auto listOfPartsValues = std::static_pointer_cast(listOfPartsArray->values()); + CPLAssert( listOfPartsValues->values()->type_id() == arrow::Type::LIST ); + auto listOfRingsValues = std::static_pointer_cast(listOfPartsValues->values()); + CPLAssert( listOfRingsValues->values()->type_id() == arrow::Type::FIXED_SIZE_LIST ); + auto listOfPointsValues = std::static_pointer_cast(listOfRingsValues->values()); + CPLAssert( listOfPointsValues->values()->type_id() == arrow::Type::DOUBLE ); + auto pointValues = std::static_pointer_cast(listOfPointsValues->values()); + + while( true ) + { + if( !listOfPartsArray->IsNull(m_nIdxInBatch) ) + { + OGREnvelope sEnvelope; + const auto nParts = listOfPartsArray->value_length(m_nIdxInBatch); + const auto nPartOffset = listOfPartsArray->value_offset(m_nIdxInBatch); + for( auto j = decltype(nParts){0}; j < nParts; j++ ) + { + const auto nRings = listOfPartsValues->value_length(nPartOffset + j); + const auto nRingOffset = listOfPartsValues->value_offset(nPartOffset + j); + if( nRings >= 1 ) + { + const auto nPoints = listOfRingsValues->value_length(nRingOffset); + const auto nPointOffset = listOfRingsValues->value_offset(nRingOffset) * nDim; + const double* padfRawValue = pointValues->raw_values() + nPointOffset; + for( auto l = decltype(nPoints){0}; l < nPoints; ++l ) + { + sEnvelope.Merge( + padfRawValue[nDim * l], + padfRawValue[nDim * l + 1]); + } + // for bounding box, only the first ring matters + } + } + + if( nParts != 0 && + m_sFilterEnvelope.Intersects(sEnvelope) ) + { + break; + } + } + + m_nFeatureIdx ++; + m_nIdxInBatch ++; + if( m_nIdxInBatch == m_poBatch->num_rows() ) + { + m_bEOF = !ReadNextBatch(); + if( m_bEOF ) + return nullptr; + goto begin_multipolygon; + } + } + } 
+ else if( iCol >= 0 ) + { + auto array = m_poBatch->columns()[iCol].get(); + OGREnvelope sEnvelope; + while( true ) + { + bool bSkipToNextFeature = false; + auto poGeometry = std::unique_ptr( + ReadGeometry(m_iGeomFieldFilter, array, m_nIdxInBatch)); + if( poGeometry == nullptr || + poGeometry->IsEmpty() ) + { + bSkipToNextFeature = true; + } + else + { + poGeometry->getEnvelope(&sEnvelope); + if( !m_sFilterEnvelope.Intersects(sEnvelope) ) + { + bSkipToNextFeature = true; + } + } + if( !bSkipToNextFeature ) + { + break; + } + + m_nFeatureIdx ++; + m_nIdxInBatch ++; + if( m_nIdxInBatch == m_poBatch->num_rows() ) + { + m_bEOF = !ReadNextBatch(); + if( m_bEOF ) + return nullptr; + array = m_poBatch->columns()[iCol].get(); + } + } + } + } + + auto poFeature = ReadFeature(m_nIdxInBatch, m_poBatch->columns()); + + if( m_iFIDArrowColumn < 0 ) + poFeature->SetFID(m_nFeatureIdx); + + m_nFeatureIdx ++; + m_nIdxInBatch ++; + + return poFeature; +} + +/************************************************************************/ +/* GetExtent() */ +/************************************************************************/ + +inline +OGRErr OGRArrowLayer::GetExtent(OGREnvelope *psExtent, int bForce) +{ + return GetExtent(0, psExtent, bForce); +} + +/************************************************************************/ +/* GetExtent() */ +/************************************************************************/ + +inline +OGRErr OGRArrowLayer::GetExtent(int iGeomField, OGREnvelope *psExtent, + int bForce) +{ + if( iGeomField < 0 || iGeomField >= m_poFeatureDefn->GetGeomFieldCount() ) + { + if( iGeomField != 0 ) + { + CPLError(CE_Failure, CPLE_AppDefined, + "Invalid geometry field index : %d", iGeomField); + } + return OGRERR_FAILURE; + } + auto oIter = m_oMapGeometryColumns.find( + m_poFeatureDefn->GetGeomFieldDefn(iGeomField)->GetNameRef() ); + if( oIter != m_oMapGeometryColumns.end() && + CPLTestBool(CPLGetConfigOption(("OGR_" + GetDriverUCName() + "_USE_BBOX").c_str(), 
"YES")) ) + { + const auto& oJSONDef = oIter->second; + const auto oBBox = oJSONDef.GetArray("bbox"); + if( oBBox.IsValid() && oBBox.Size() == 4 ) + { + psExtent->MinX = oBBox[0].ToDouble(); + psExtent->MinY = oBBox[1].ToDouble(); + psExtent->MaxX = oBBox[2].ToDouble(); + psExtent->MaxY = oBBox[3].ToDouble(); + if( psExtent->MinX <= psExtent->MaxX ) + return OGRERR_NONE; + } + } + + if( !bForce && !CanRunNonForcedGetExtent() ) + { + return OGRERR_FAILURE; + } + + int iCol; + if( m_bIgnoredFields ) + { + iCol = m_anMapGeomFieldIndexToArrayIndex[iGeomField]; + } + else + { + iCol = m_anMapGeomFieldIndexToArrowColumn[iGeomField]; + } + if( iCol< 0 ) + { + return OGRERR_FAILURE; + } + + if( m_aeGeomEncoding[iGeomField] == OGRArrowGeomEncoding::WKB ) + { + ResetReading(); + if( m_poBatch == nullptr ) + { + m_bEOF = !ReadNextBatch(); + if( m_bEOF ) + return OGRERR_FAILURE; + } + *psExtent = OGREnvelope(); + + auto array = m_poBatch->columns()[iCol]; + CPLAssert( array->type_id() == arrow::Type::BINARY ); + auto castArray = std::static_pointer_cast(array); + OGREnvelope sEnvelope; + while( true ) + { + if( !array->IsNull(m_nIdxInBatch) ) + { + int out_length = 0; + const uint8_t* data = castArray->GetValue(m_nIdxInBatch, &out_length); + if( ReadWKBBoundingBox(data, out_length, sEnvelope) ) + { + psExtent->Merge(sEnvelope); + } + } + + m_nFeatureIdx ++; + m_nIdxInBatch ++; + if( m_nIdxInBatch == m_poBatch->num_rows() ) + { + m_bEOF = !ReadNextBatch(); + if( m_bEOF ) + { + ResetReading(); + return psExtent->IsInit() ? 
OGRERR_NONE : OGRERR_FAILURE; + } + array = m_poBatch->columns()[iCol]; + CPLAssert( array->type_id() == arrow::Type::BINARY ); + castArray = std::static_pointer_cast(array); + } + } + } + else if( m_aeGeomEncoding[iGeomField] == OGRArrowGeomEncoding::GEOARROW_MULTIPOLYGON ) + { + ResetReading(); + if( m_poBatch == nullptr ) + { + m_bEOF = !ReadNextBatch(); + if( m_bEOF ) + return OGRERR_FAILURE; + } + *psExtent = OGREnvelope(); + + const auto poGeomFieldDefn = m_poFeatureDefn->GetGeomFieldDefn(iGeomField); + const auto eGeomType = poGeomFieldDefn->GetType(); + const bool bHasZ = CPL_TO_BOOL(OGR_GT_HasZ(eGeomType)); + const bool bHasM = CPL_TO_BOOL(OGR_GT_HasM(eGeomType)); + const int nDim = 2 + (bHasZ ? 1 : 0) + (bHasM ? 1 : 0); + +begin_multipolygon: + auto array = m_poBatch->columns()[iCol].get(); + CPLAssert( array->type_id() == arrow::Type::LIST ); + auto listOfPartsArray = static_cast(array); + CPLAssert( listOfPartsArray->values()->type_id() == arrow::Type::LIST ); + auto listOfPartsValues = std::static_pointer_cast(listOfPartsArray->values()); + CPLAssert( listOfPartsValues->values()->type_id() == arrow::Type::LIST ); + auto listOfRingsValues = std::static_pointer_cast(listOfPartsValues->values()); + CPLAssert( listOfRingsValues->values()->type_id() == arrow::Type::FIXED_SIZE_LIST ); + auto listOfPointsValues = std::static_pointer_cast(listOfRingsValues->values()); + CPLAssert( listOfPointsValues->values()->type_id() == arrow::Type::DOUBLE ); + auto pointValues = std::static_pointer_cast(listOfPointsValues->values()); + + while( true ) + { + if( !listOfPartsArray->IsNull(m_nIdxInBatch) ) + { + const auto nParts = listOfPartsArray->value_length(m_nIdxInBatch); + const auto nPartOffset = listOfPartsArray->value_offset(m_nIdxInBatch); + for( auto j = decltype(nParts){0}; j < nParts; j++ ) + { + const auto nRings = listOfPartsValues->value_length(nPartOffset + j); + const auto nRingOffset = listOfPartsValues->value_offset(nPartOffset + j); + if( nRings >= 1 ) + 
{ + const auto nPoints = listOfRingsValues->value_length(nRingOffset); + const auto nPointOffset = listOfRingsValues->value_offset(nRingOffset) * nDim; + const double* padfRawValue = pointValues->raw_values() + nPointOffset; + for( auto l = decltype(nPoints){0}; l < nPoints; ++l ) + { + psExtent->Merge( + padfRawValue[nDim * l], + padfRawValue[nDim * l + 1]); + } + // for bounding box, only the first ring matters + } + } + } + + m_nFeatureIdx ++; + m_nIdxInBatch ++; + if( m_nIdxInBatch == m_poBatch->num_rows() ) + { + m_bEOF = !ReadNextBatch(); + if( m_bEOF ) + { + ResetReading(); + return psExtent->IsInit() ? OGRERR_NONE : OGRERR_FAILURE; + } + goto begin_multipolygon; + } + } + } + + return GetExtentInternal(iGeomField, psExtent, bForce); +} diff --git a/ogr/ogrsf_frmts/arrow_common/ograrrowrandomaccessfile.h b/ogr/ogrsf_frmts/arrow_common/ograrrowrandomaccessfile.h new file mode 100644 index 000000000000..8f6ee55e13c4 --- /dev/null +++ b/ogr/ogrsf_frmts/arrow_common/ograrrowrandomaccessfile.h @@ -0,0 +1,121 @@ +/****************************************************************************** + * + * Project: Arrow generic code + * Purpose: Arrow generic code + * Author: Even Rouault, + * + ****************************************************************************** + * Copyright (c) 2022, Planet Labs + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + ****************************************************************************/ + +#ifndef OGR_ARROW_RANDOM_ACCESS_FILE_H +#define OGR_ARROW_RANDOM_ACCESS_FILE_H + +#include "cpl_vsi.h" + +#include "arrow/buffer.h" +#include "arrow/io/file.h" +#include "arrow/io/interfaces.h" + +/************************************************************************/ +/* OGRArrowRandomAccessFile */ +/************************************************************************/ + +class OGRArrowRandomAccessFile final: public arrow::io::RandomAccessFile +{ + int64_t m_nSize = -1; + VSILFILE* m_fp; + bool m_bOwnFP; + + OGRArrowRandomAccessFile(const OGRArrowRandomAccessFile&) = delete; + OGRArrowRandomAccessFile& operator= (const OGRArrowRandomAccessFile&) = delete; + +public: + explicit OGRArrowRandomAccessFile(VSILFILE* fp, bool bOwnFP = true): m_fp(fp), m_bOwnFP(bOwnFP) + { + } + + ~OGRArrowRandomAccessFile() override + { + if( m_fp && m_bOwnFP ) + VSIFCloseL(m_fp); + } + + arrow::Status Close() override + { + if( !m_bOwnFP ) + return arrow::Status::IOError("Cannot close a file that we don't own"); + int ret = VSIFCloseL(m_fp); + m_fp = nullptr; + return ret == 0 ? 
arrow::Status::OK() : arrow::Status::IOError("Error while closing"); + } + + arrow::Result Tell() const override + { + return static_cast(VSIFTellL(m_fp)); + } + + bool closed() const override + { + return m_fp == nullptr; + } + + arrow::Status Seek(int64_t position) override + { + if( VSIFSeekL(m_fp, static_cast(position), SEEK_SET) == 0 ) + return arrow::Status::OK(); + return arrow::Status::IOError("Error while seeking"); + } + + arrow::Result Read(int64_t nbytes, void* out) override + { + CPLAssert(static_cast(static_cast(nbytes)) == nbytes); + return static_cast( + VSIFReadL(out, 1, static_cast(nbytes), m_fp)); + } + + arrow::Result> Read(int64_t nbytes) override + { + auto buffer = arrow::AllocateResizableBuffer(nbytes); + if (!buffer.ok()) + { + return buffer; + } + uint8_t* buffer_data = (*buffer)->mutable_data(); + auto nread = Read(nbytes, buffer_data); + (*buffer)->Resize(*nread); + return buffer; + } + + arrow::Result GetSize() override + { + if( m_nSize < 0 ) + { + const auto nPos = VSIFTellL(m_fp); + VSIFSeekL(m_fp, 0, SEEK_END); + m_nSize = static_cast(VSIFTellL(m_fp)); + VSIFSeekL(m_fp, nPos, SEEK_SET); + } + return m_nSize; + } +}; + +#endif // OGR_ARROW_RANDOM_ACCESS_FILE_H diff --git a/ogr/ogrsf_frmts/arrow_common/ograrrowwritablefile.h b/ogr/ogrsf_frmts/arrow_common/ograrrowwritablefile.h new file mode 100644 index 000000000000..e28ac3d36e61 --- /dev/null +++ b/ogr/ogrsf_frmts/arrow_common/ograrrowwritablefile.h @@ -0,0 +1,92 @@ +/****************************************************************************** + * + * Project: Arrow generic code + * Purpose: Arrow generic code + * Author: Even Rouault, + * + ****************************************************************************** + * Copyright (c) 2022, Planet Labs + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without 
limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + ****************************************************************************/ + +#ifndef OGR_ARROW_WRITABLE_FILE_H +#define OGR_ARROW_WRITABLE_FILE_H + +#include "cpl_vsi.h" + +#include "arrow/buffer.h" +#include "arrow/io/file.h" +#include "arrow/io/interfaces.h" + +/************************************************************************/ +/* OGRArrowWritableFile */ +/************************************************************************/ + +class OGRArrowWritableFile final: public arrow::io::OutputStream +{ + VSILFILE* m_fp; + + OGRArrowWritableFile(const OGRArrowWritableFile&) = delete; + OGRArrowWritableFile& operator= (const OGRArrowWritableFile&) = delete; + +public: + explicit OGRArrowWritableFile(VSILFILE* fp): m_fp(fp) + { + } + + ~OGRArrowWritableFile() override + { + if( m_fp ) + VSIFCloseL(m_fp); + } + + arrow::Status Close() override + { + int ret = VSIFCloseL(m_fp); + m_fp = nullptr; + return ret == 0 ? 
arrow::Status::OK() : arrow::Status::IOError("Error while closing"); + } + + arrow::Result Tell() const override + { + return static_cast(VSIFTellL(m_fp)); + } + + bool closed() const override + { + return m_fp == nullptr; + } + + arrow::Status Write(const void* data, int64_t nbytes) override + { + CPLAssert(static_cast(static_cast(nbytes)) == nbytes); + if( VSIFWriteL(data, 1, static_cast(nbytes), m_fp) == static_cast(nbytes) ) + return arrow::Status::OK(); + return arrow::Status::IOError("Error while writing"); + } + + arrow::Status Write(const std::shared_ptr& data) override + { + return Write(data->data(), data->size()); + } +}; + + +#endif // OGR_ARROW_WRITABLE_FILE_H diff --git a/ogr/ogrsf_frmts/arrow_common/ograrrowwriterlayer.hpp b/ogr/ogrsf_frmts/arrow_common/ograrrowwriterlayer.hpp new file mode 100644 index 000000000000..44431923487d --- /dev/null +++ b/ogr/ogrsf_frmts/arrow_common/ograrrowwriterlayer.hpp @@ -0,0 +1,1496 @@ +/****************************************************************************** + * + * Project: Arrow generic code + * Purpose: Arrow generic code + * Author: Even Rouault, + * + ****************************************************************************** + * Copyright (c) 2022, Planet Labs + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + ****************************************************************************/ + + +#include "ogr_arrow.h" + +#include "cpl_json.h" +#include "cpl_time.h" + +#include + +static constexpr int TZFLAG_UNINITIALIZED = -1; +static constexpr int TZFLAG_MIXED = -2; + +/************************************************************************/ +/* OGRArrowWriterLayer() */ +/************************************************************************/ + +inline +OGRArrowWriterLayer::OGRArrowWriterLayer( + arrow::MemoryPool* poMemoryPool, + const std::shared_ptr& poOutputStream, + const char *pszLayerName): + m_poMemoryPool(poMemoryPool), + m_poOutputStream(poOutputStream) +{ + m_poFeatureDefn = new OGRFeatureDefn(pszLayerName); + m_poFeatureDefn->SetGeomType(wkbNone); + m_poFeatureDefn->Reference(); + SetDescription(pszLayerName); +} + +/************************************************************************/ +/* ~OGRArrowWriterLayer() */ +/************************************************************************/ + +inline +OGRArrowWriterLayer::~OGRArrowWriterLayer() +{ + CPLDebug("ARROW", "Memory pool (writer layer): bytes_allocated = %" PRId64, + m_poMemoryPool->bytes_allocated()); + CPLDebug("ARROW", "Memory pool (writer layer): max_memory = %" PRId64, + m_poMemoryPool->max_memory()); + + m_poFeatureDefn->Release(); +} + +/************************************************************************/ +/* FinalizeWriting() */ 
+/************************************************************************/ + +inline +void OGRArrowWriterLayer::FinalizeWriting() +{ + if( !IsFileWriterCreated() ) + { + CreateWriter(); + } + if( IsFileWriterCreated() ) + { + DoSomethingBeforeFinalFlushGroup(); + + if( !m_apoBuilders.empty() ) + FlushGroup(); + + CloseFileWriter(); + } +} + +/************************************************************************/ +/* CreateSchemaCommon() */ +/************************************************************************/ + +inline +void OGRArrowWriterLayer::CreateSchemaCommon() +{ + CPLAssert(static_cast(m_aeGeomEncoding.size()) == m_poFeatureDefn->GetGeomFieldCount()); + + std::vector> fields; + bool bNeedGDALSchema = false; + + m_anTZFlag.resize(m_poFeatureDefn->GetFieldCount(), TZFLAG_UNINITIALIZED); + + if( !m_osFIDColumn.empty() ) + { + bNeedGDALSchema = true; + fields.emplace_back(arrow::field( + m_osFIDColumn, arrow::int64(), false)); + } + + for( int i = 0; i < m_poFeatureDefn->GetFieldCount(); ++i ) + { + const auto poFieldDefn = m_poFeatureDefn->GetFieldDefn(i); + std::shared_ptr dt; + const auto eSubDT = poFieldDefn->GetSubType(); + const auto& osDomainName = poFieldDefn->GetDomainName(); + const OGRFieldDomain* poFieldDomain = nullptr; + const int nWidth = poFieldDefn->GetWidth(); + if( !osDomainName.empty() ) + { + const auto oIter = m_oMapFieldDomains.find(osDomainName); + if( oIter == m_oMapFieldDomains.end() ) + { + CPLError(CE_Warning, CPLE_AppDefined, + "Field %s references domain %s, but the later one " + "has not been created", + poFieldDefn->GetNameRef(), + osDomainName.c_str()); + } + else + { + poFieldDomain = oIter->second.get(); + } + } + switch( poFieldDefn->GetType() ) + { + case OFTInteger: + if( eSubDT == OFSTBoolean ) + dt = arrow::boolean(); + else if( eSubDT == OFSTInt16 ) + dt = arrow::int16(); + else + dt = arrow::int32(); + if( poFieldDomain != nullptr ) + { + dt = arrow::dictionary(dt, arrow::utf8()); + } + break; + + case 
OFTInteger64: + dt = arrow::int64(); + if( poFieldDomain != nullptr ) + { + dt = arrow::dictionary(dt, arrow::utf8()); + } + break; + + case OFTReal: + { + const int nPrecision = poFieldDefn->GetPrecision(); + if( nWidth != 0 && nPrecision != 0 ) + dt = arrow::decimal(nWidth, nPrecision); + else if( eSubDT == OFSTFloat32 ) + dt = arrow::float32(); + else + dt = arrow::float64(); + break; + } + + case OFTString: + case OFTWideString: + if( eSubDT != OFSTNone || nWidth > 0 ) + bNeedGDALSchema = true; + dt = arrow::utf8(); + break; + + case OFTBinary: + if( nWidth != 0 ) + dt = arrow::fixed_size_binary(nWidth); + else + dt = arrow::binary(); + break; + + case OFTIntegerList: + if( eSubDT == OFSTBoolean ) + dt = arrow::list(arrow::boolean()); + else if( eSubDT == OFSTInt16 ) + dt = arrow::list(arrow::int16()); + else + dt = arrow::list(arrow::int32()); + break; + + case OFTInteger64List: + dt = arrow::list(arrow::int64()); + break; + + case OFTRealList: + if( eSubDT == OFSTFloat32 ) + dt = arrow::list(arrow::float32()); + else + dt = arrow::list(arrow::float64()); + break; + + case OFTStringList: + case OFTWideStringList: + dt = arrow::list(arrow::utf8()); + break; + + case OFTDate: + dt = arrow::date32(); + break; + + case OFTTime: + dt = arrow::time32(arrow::TimeUnit::MILLI); + break; + + case OFTDateTime: + dt = arrow::timestamp(arrow::TimeUnit::MILLI); + break; + } + fields.emplace_back(arrow::field( + poFieldDefn->GetNameRef(), dt, poFieldDefn->IsNullable())); + } + + for( int i = 0; i < m_poFeatureDefn->GetGeomFieldCount(); ++i ) + { + const auto poGeomFieldDefn = m_poFeatureDefn->GetGeomFieldDefn(i); + const auto eGType = poGeomFieldDefn->GetType(); + const int nDim = 2 + (OGR_GT_HasZ(eGType) ? 1 : 0) + (OGR_GT_HasM(eGType) ? 
1 : 0); + + const bool pointFieldNullable = GetDriverUCName() == "PARQUET"; + std::shared_ptr pointField; + if( nDim == 2 ) + pointField = arrow::field("xy", arrow::float64(), pointFieldNullable); + else if( nDim == 3 && OGR_GT_HasZ(eGType) ) + pointField = arrow::field("xyz", arrow::float64(), pointFieldNullable); + else if( nDim == 3 && OGR_GT_HasM(eGType) ) + pointField = arrow::field("xym", arrow::float64(), pointFieldNullable); + else + pointField = arrow::field("xyzm", arrow::float64(), pointFieldNullable); + + std::shared_ptr dt; + switch( m_aeGeomEncoding[i] ) + { + case OGRArrowGeomEncoding::WKB: + dt = arrow::binary(); + break; + + case OGRArrowGeomEncoding::WKT: + dt = arrow::utf8(); + break; + + case OGRArrowGeomEncoding::GEOARROW_GENERIC: + CPLAssert(false); + break; + + case OGRArrowGeomEncoding::GEOARROW_POINT: + dt = arrow::fixed_size_list(pointField, nDim); + break; + + case OGRArrowGeomEncoding::GEOARROW_LINESTRING: + dt = arrow::list(arrow::fixed_size_list(pointField, nDim)); + break; + + case OGRArrowGeomEncoding::GEOARROW_POLYGON: + dt = arrow::list(arrow::list(arrow::fixed_size_list(pointField, nDim))); + break; + + case OGRArrowGeomEncoding::GEOARROW_MULTIPOINT: + dt = arrow::list(arrow::fixed_size_list(pointField, nDim)); + break; + + case OGRArrowGeomEncoding::GEOARROW_MULTILINESTRING: + dt = arrow::list(arrow::list(arrow::fixed_size_list(pointField, nDim))); + break; + + case OGRArrowGeomEncoding::GEOARROW_MULTIPOLYGON: + dt = arrow::list(arrow::list(arrow::list(arrow::fixed_size_list(pointField, nDim)))); + break; + } + + auto field = arrow::field( + poGeomFieldDefn->GetNameRef(), dt, poGeomFieldDefn->IsNullable()); + if( m_bWriteFieldArrowExtensionName ) + { + auto kvMetadata = field->metadata() ? 
field->metadata()->Copy() : + std::make_shared(); + kvMetadata->Append("ARROW::extension:name", + GetGeomEncodingAsString(m_aeGeomEncoding[i])); + field = field->WithMetadata(kvMetadata); + } + + fields.emplace_back(field); + } + + m_aoEnvelopes.resize(m_poFeatureDefn->GetGeomFieldCount()); + + m_poSchema = arrow::schema(fields); + CPLAssert(m_poSchema); + if( bNeedGDALSchema && + CPLTestBool(CPLGetConfigOption(("OGR_" + GetDriverUCName() + "_WRITE_GDAL_SCHEMA").c_str(), "YES")) ) + { + CPLJSONObject oRoot; + CPLJSONObject oColumns; + + if( !m_osFIDColumn.empty() ) + oRoot.Add("fid", m_osFIDColumn); + + oRoot.Add("columns", oColumns); + for( int i = 0; i < m_poFeatureDefn->GetFieldCount(); ++i ) + { + const auto poFieldDefn = m_poFeatureDefn->GetFieldDefn(i); + CPLJSONObject oColumn; + oColumns.Add(poFieldDefn->GetNameRef(), oColumn); + oColumn.Add("type", OGR_GetFieldTypeName(poFieldDefn->GetType())); + const auto eSubDT = poFieldDefn->GetSubType(); + if( eSubDT != OFSTNone ) + oColumn.Add("subtype", OGR_GetFieldSubTypeName(eSubDT)); + const int nWidth = poFieldDefn->GetWidth(); + if( nWidth > 0 ) + oColumn.Add("width", nWidth); + const int nPrecision = poFieldDefn->GetPrecision(); + if( nPrecision > 0 ) + oColumn.Add("precision", nPrecision); + } + + auto kvMetadata = m_poSchema->metadata() ? m_poSchema->metadata()->Copy() : + std::make_shared(); + kvMetadata->Append("gdal:schema", oRoot.Format(CPLJSONObject::PrettyFormat::Plain)); + m_poSchema = m_poSchema->WithMetadata(kvMetadata); + CPLAssert(m_poSchema); + } +} +/************************************************************************/ +/* FinalizeSchema() */ +/************************************************************************/ + +inline +void OGRArrowWriterLayer::FinalizeSchema() +{ + // Final tuning of schema taking into actual timezone values + // from features + int nArrowIdxFirstField = !m_osFIDColumn.empty() ? 
1 : 0; + for( int i = 0; i < m_poFeatureDefn->GetFieldCount(); ++i ) + { + if( m_anTZFlag[i] > 1 ) + { + const int nOffset = (m_anTZFlag[i] - 100) * 15; + int nHours = static_cast(nOffset / 60); // Round towards zero. + const int nMinutes = std::abs(nOffset - nHours * 60); + + const std::string osTZ = CPLSPrintf("%c%02d:%02d", + nOffset >= 0 ? '+' : '-', + std::abs(nHours), nMinutes); + auto dt = arrow::timestamp(arrow::TimeUnit::MILLI, osTZ); + const auto poFieldDefn = m_poFeatureDefn->GetFieldDefn(i); + auto field = arrow::field( + poFieldDefn->GetNameRef(), dt, poFieldDefn->IsNullable()); + auto result = m_poSchema->SetField(nArrowIdxFirstField + i, field); + if( !result.ok() ) + { + CPLError(CE_Warning, CPLE_AppDefined, + "Schema::SetField() failed with %s", + result.status().message().c_str()); + } + else + { + m_poSchema = *result; + } + } + } +}
+
+/************************************************************************/
+/*                         AddFieldDomain()                             */
+/************************************************************************/
+
+// Register a coded field domain and pre-build the string array holding
+// its values, indexed by integer code (gaps between codes become nulls).
+//
+// Returns false, with failureReason set, when the domain is not a coded
+// domain, when a code is not an integer, or when codes are not strictly
+// increasing with a gap of at most 100. Returns false after emitting a
+// CPLError() when an Arrow builder operation fails.
+// On success the layer takes ownership of the domain.
+inline
+bool OGRArrowWriterLayer::AddFieldDomain(std::unique_ptr<OGRFieldDomain>&& domain,
+                                         std::string& failureReason)
+{
+    if( domain->GetDomainType() != OFDT_CODED )
+    {
+        failureReason = "Only coded field domains are supported by Arrow";
+        return false;
+    }
+
+    const OGRCodedFieldDomain* poDomain = static_cast<
+                        const OGRCodedFieldDomain*>(domain.get());
+    const OGRCodedValue* psIter = poDomain->GetEnumeration();
+
+    auto poStringBuilder = std::make_shared<arrow::StringBuilder>(m_poMemoryPool);
+
+    // Builder appends can fail (e.g. on allocation): report and give up
+    // instead of silently producing a truncated dictionary.
+    const auto IsOK = [](const arrow::Status& status, const char* pszOperation)
+    {
+        if( status.ok() )
+            return true;
+        CPLError(CE_Failure, CPLE_AppDefined,
+                 "%s failed with %s", pszOperation, status.message().c_str());
+        return false;
+    };
+
+    int nLastCode = -1;
+    for(; psIter->pszCode; ++psIter )
+    {
+        if( CPLGetValueType(psIter->pszCode) != CPL_VALUE_INTEGER )
+        {
+            failureReason = "Non integer code in domain ";
+            failureReason += domain->GetName();
+            return false;
+        }
+        const int nCode = atoi(psIter->pszCode);
+        // Compute the gap on 64 bits: nCode - nLastCode overflows int
+        // when nCode is INT_MAX and nLastCode is still -1.
+        if( nCode <= nLastCode ||
+            static_cast<int64_t>(nCode) - nLastCode > 100 )
+        {
+            failureReason = "Too sparse codes in domain ";
+            failureReason += domain->GetName();
+            return false;
+        }
+        // Fill the unused codes between the previous and the current one.
+        for( int i = nLastCode + 1; i < nCode; ++i )
+        {
+            if( !IsOK(poStringBuilder->AppendNull(),
+                      "StringBuilder::AppendNull()") )
+                return false;
+        }
+        if( psIter->pszValue )
+        {
+            if( !IsOK(poStringBuilder->Append(psIter->pszValue),
+                      "StringBuilder::Append()") )
+                return false;
+        }
+        else
+        {
+            if( !IsOK(poStringBuilder->AppendNull(),
+                      "StringBuilder::AppendNull()") )
+                return false;
+        }
+        nLastCode = nCode;
+    }
+
+    std::shared_ptr<arrow::Array> stringArray;
+    auto status = poStringBuilder->Finish(&stringArray);
+    if( !status.ok() )
+    {
+        CPLError(CE_Failure, CPLE_AppDefined,
+                 "StringArray::Finish() failed with %s", status.message().c_str());
+        return false;
+    }
+
+    // Read the name before moving the domain into the map.
+    m_oMapFieldDomainToStringArray[domain->GetName()] = stringArray;
+    m_oMapFieldDomains[domain->GetName()] = std::move(domain);
+    return true;
+}
+
+/************************************************************************/
+/*                       GetFieldDomainNames()                          */
+/************************************************************************/
+
+// Return the names of all field domains registered with AddFieldDomain().
+inline
+std::vector<std::string> OGRArrowWriterLayer::GetFieldDomainNames() const
+{
+    std::vector<std::string> names;
+    names.reserve( m_oMapFieldDomains.size() );
+    for(const auto& it : m_oMapFieldDomains ) {
+        names.emplace_back( it.first );
+    }
+    return names;
+}
+
+/************************************************************************/
+/*                          GetFieldDomain()                            */
+/************************************************************************/
+
+// Return the registered domain of that name, or nullptr if there is none.
+// The returned pointer remains owned by the layer.
+inline
+const OGRFieldDomain* OGRArrowWriterLayer::GetFieldDomain(const std::string& name) const
+{
+    const auto iter = m_oMapFieldDomains.find(name);
+    if( iter == m_oMapFieldDomains.end() )
+        return nullptr;
+    return iter->second.get();
+}
+
+/************************************************************************/
+/*                           CreateField()                              */
+/************************************************************************/
+
+// Add an attribute field to the layer definition. Fails once the Arrow
+// schema exists (it is created when the first feature is written).
+inline
+OGRErr OGRArrowWriterLayer::CreateField( OGRFieldDefn *poField, int /* bApproxOK */)
+{
+    if( m_poSchema )
+    {
+        CPLError(CE_Failure, CPLE_NotSupported,
+                 "Cannot add field after a first feature has been written");
+        return OGRERR_FAILURE;
+    }
+    m_poFeatureDefn->AddFieldDefn(poField);
+    return OGRERR_NONE;
+}
+
+/************************************************************************/
+/*
GetPreciseArrowGeomEncoding() */ +/************************************************************************/ + +inline +OGRArrowGeomEncoding OGRArrowWriterLayer::GetPreciseArrowGeomEncoding( + OGRwkbGeometryType eGType) +{ + const auto eFlatType = wkbFlatten(eGType); + if( eFlatType == wkbPoint ) + { + return OGRArrowGeomEncoding::GEOARROW_POINT; + } + else if( eFlatType == wkbLineString ) + { + return OGRArrowGeomEncoding::GEOARROW_LINESTRING; + } + else if( eFlatType == wkbPolygon ) + { + return OGRArrowGeomEncoding::GEOARROW_POLYGON; + } + else if( eFlatType == wkbMultiPoint ) + { + return OGRArrowGeomEncoding::GEOARROW_MULTIPOINT; + } + else if( eFlatType == wkbMultiLineString ) + { + return OGRArrowGeomEncoding::GEOARROW_MULTILINESTRING; + } + else if( eFlatType == wkbMultiPolygon ) + { + return OGRArrowGeomEncoding::GEOARROW_MULTIPOLYGON; + } + else + { + CPLError(CE_Failure, CPLE_NotSupported, + "GEOMETRY_FORMAT=GEOARROW is currently not supported for %s", + OGRGeometryTypeToName(eGType)); + return OGRArrowGeomEncoding::GEOARROW_GENERIC; + } +} + +/************************************************************************/ +/* GetGeomEncodingAsString() */ +/************************************************************************/ + +inline +const char* OGRArrowWriterLayer::GetGeomEncodingAsString(OGRArrowGeomEncoding eGeomEncoding) +{ + switch( eGeomEncoding ) + { + case OGRArrowGeomEncoding::WKB: + return "WKB"; + case OGRArrowGeomEncoding::WKT: + return "WKT"; + case OGRArrowGeomEncoding::GEOARROW_GENERIC: + CPLAssert(false); + break; + case OGRArrowGeomEncoding::GEOARROW_POINT: + return "geoarrow.point"; + case OGRArrowGeomEncoding::GEOARROW_LINESTRING: + return "geoarrow.linestring"; + case OGRArrowGeomEncoding::GEOARROW_POLYGON: + return "geoarrow.polygon"; + case OGRArrowGeomEncoding::GEOARROW_MULTIPOINT: + return "geoarrow.multipoint"; + case OGRArrowGeomEncoding::GEOARROW_MULTILINESTRING: + return "geoarrow.multilinestring"; + case 
OGRArrowGeomEncoding::GEOARROW_MULTIPOLYGON: + return "geoarrow.multipolygon"; + } + return nullptr; +} + +/************************************************************************/ +/* IsSupportedGeometryTyp */ +/************************************************************************/ + +inline +bool OGRArrowWriterLayer::IsSupportedGeometryType(OGRwkbGeometryType eGType) const +{ + if( eGType != wkbFlatten(eGType) ) + { + const auto osConfigOptionName = "OGR_" + GetDriverUCName() + "_ALLOW_ALL_DIMS"; + if( !CPLTestBool(CPLGetConfigOption(osConfigOptionName.c_str(), "NO")) ) + { + CPLError(CE_Failure, CPLE_NotSupported, + "Only 2D geometry types are supported (unless the " + "%s configuration option is set to YES)", + osConfigOptionName.c_str()); + return false; + } + } + return true; +} + +/************************************************************************/ +/* CreateGeomField() */ +/************************************************************************/ + +inline +OGRErr OGRArrowWriterLayer::CreateGeomField( OGRGeomFieldDefn *poField, int /* bApproxOK */) +{ + if( m_poSchema ) + { + CPLError(CE_Failure, CPLE_NotSupported, + "Cannot add field after a first feature has been written"); + return OGRERR_FAILURE; + } + const auto eGType = poField->GetType(); + if( !IsSupportedGeometryType(eGType) ) + { + return OGRERR_FAILURE; + } + + if( poField->GetSpatialRef() == nullptr ) + { + CPLError(CE_Warning, CPLE_AppDefined, + "Geometry column should have an associated CRS"); + } + auto eGeomEncoding = m_eGeomEncoding; + if( eGeomEncoding == OGRArrowGeomEncoding::GEOARROW_GENERIC ) + { + eGeomEncoding = GetPreciseArrowGeomEncoding(eGType); + if( eGeomEncoding == OGRArrowGeomEncoding::GEOARROW_GENERIC ) + return OGRERR_FAILURE; + } + m_aeGeomEncoding.push_back(eGeomEncoding); + m_poFeatureDefn->AddGeomFieldDefn(poField); + return OGRERR_NONE; +} + +/************************************************************************/ +/* MakeGeoArrowBuilder() */ 
+/************************************************************************/
+
+// Create the builder of a GeoArrow column: a fixed-size list of nDim
+// doubles (one point) wrapped in nDepth levels of variable-size lists.
+static std::shared_ptr<arrow::ArrayBuilder> MakeGeoArrowBuilder(
+    arrow::MemoryPool* poMemoryPool,
+    int nDim,
+    int nDepth)
+{
+    if( nDepth == 0 )
+        return std::make_shared<arrow::FixedSizeListBuilder>(poMemoryPool,
+                    std::make_shared<arrow::DoubleBuilder>(poMemoryPool), nDim);
+    else
+        return std::make_shared<arrow::ListBuilder>(poMemoryPool,
+                    MakeGeoArrowBuilder(poMemoryPool, nDim, nDepth - 1));
+}
+
+/************************************************************************/
+/*                        CreateArrayBuilders()                         */
+/************************************************************************/
+
+// Fill m_apoBuilders with one Arrow builder per column, in schema order:
+// optional FID column, then attribute fields, then geometry fields.
+// The schema (m_poSchema) must already exist: it is consulted to tell
+// decimal columns from floating point ones.
+inline
+void OGRArrowWriterLayer::CreateArrayBuilders()
+{
+    m_apoBuilders.reserve(1 + m_poFeatureDefn->GetFieldCount() + m_poFeatureDefn->GetGeomFieldCount() );
+
+    int nArrowIdx = 0;
+    if( !m_osFIDColumn.empty() )
+    {
+        m_apoBuilders.emplace_back(std::make_shared<arrow::Int64Builder>());
+        nArrowIdx ++;
+    }
+
+    for( int i = 0; i < m_poFeatureDefn->GetFieldCount(); ++i, ++nArrowIdx )
+    {
+        const auto poFieldDefn = m_poFeatureDefn->GetFieldDefn(i);
+        std::shared_ptr<arrow::ArrayBuilder> builder;
+        const auto eSubDT = poFieldDefn->GetSubType();
+        switch( poFieldDefn->GetType() )
+        {
+            case OFTInteger:
+                if( eSubDT == OFSTBoolean )
+                    builder = std::make_shared<arrow::BooleanBuilder>(m_poMemoryPool);
+                else if( eSubDT == OFSTInt16 )
+                    builder = std::make_shared<arrow::Int16Builder>(m_poMemoryPool);
+                else
+                    builder = std::make_shared<arrow::Int32Builder>(m_poMemoryPool);
+                break;
+
+            case OFTInteger64:
+                builder = std::make_shared<arrow::Int64Builder>(m_poMemoryPool);
+                break;
+
+            case OFTReal:
+            {
+                // The schema decides whether the column is a decimal.
+                const auto arrowType = m_poSchema->fields()[nArrowIdx]->type();
+                if( arrowType->id() == arrow::Type::DECIMAL128 )
+                    builder = std::make_shared<arrow::Decimal128Builder>(arrowType, m_poMemoryPool);
+                else if( arrowType->id() == arrow::Type::DECIMAL256 )
+                    builder = std::make_shared<arrow::Decimal256Builder>(arrowType, m_poMemoryPool);
+                else if( eSubDT == OFSTFloat32 )
+                    builder = std::make_shared<arrow::FloatBuilder>(m_poMemoryPool);
+                else
+                    builder = std::make_shared<arrow::DoubleBuilder>(m_poMemoryPool);
+                break;
+            }
+
+            case OFTString:
+            case OFTWideString:
+                builder = std::make_shared<arrow::StringBuilder>(m_poMemoryPool);
+                break;
+
+            case OFTBinary:
+                // A non-zero width selects a fixed-size binary column.
+                if( poFieldDefn->GetWidth() != 0 )
+                    builder = std::make_shared<arrow::FixedSizeBinaryBuilder>(
+                        arrow::fixed_size_binary(poFieldDefn->GetWidth()),
+                        m_poMemoryPool);
+                else
+                    builder = std::make_shared<arrow::BinaryBuilder>(m_poMemoryPool);
+                break;
+
+            case OFTIntegerList:
+            {
+                std::shared_ptr<arrow::ArrayBuilder> poBaseBuilder;
+                if( eSubDT == OFSTBoolean )
+                    poBaseBuilder = std::make_shared<arrow::BooleanBuilder>(m_poMemoryPool);
+                else if( eSubDT == OFSTInt16 )
+                    poBaseBuilder = std::make_shared<arrow::Int16Builder>(m_poMemoryPool);
+                else
+                    poBaseBuilder = std::make_shared<arrow::Int32Builder>(m_poMemoryPool);
+                builder = std::make_shared<arrow::ListBuilder>(m_poMemoryPool,
+                                                               poBaseBuilder);
+                break;
+            }
+
+            case OFTInteger64List:
+                builder = std::make_shared<arrow::ListBuilder>(m_poMemoryPool,
+                    std::make_shared<arrow::Int64Builder>(m_poMemoryPool));
+                break;
+
+            case OFTRealList:
+                if( eSubDT == OFSTFloat32 )
+                    builder = std::make_shared<arrow::ListBuilder>(m_poMemoryPool,
+                        std::make_shared<arrow::FloatBuilder>(m_poMemoryPool));
+                else
+                    builder = std::make_shared<arrow::ListBuilder>(m_poMemoryPool,
+                        std::make_shared<arrow::DoubleBuilder>(m_poMemoryPool));
+                break;
+
+            case OFTStringList:
+            case OFTWideStringList:
+                builder = std::make_shared<arrow::ListBuilder>(m_poMemoryPool,
+                    std::make_shared<arrow::StringBuilder>(m_poMemoryPool));
+                break;
+
+            case OFTDate:
+                builder = std::make_shared<arrow::Date32Builder>(m_poMemoryPool);
+                break;
+
+            case OFTTime:
+                builder = std::make_shared<arrow::Time32Builder>(
+                    arrow::time32(arrow::TimeUnit::MILLI),
+                    m_poMemoryPool);
+                break;
+
+            case OFTDateTime:
+                builder = std::make_shared<arrow::TimestampBuilder>(
+                    arrow::timestamp(arrow::TimeUnit::MILLI),
+                    m_poMemoryPool);
+                break;
+        }
+        m_apoBuilders.emplace_back(builder);
+    }
+
+    for( int i = 0; i < m_poFeatureDefn->GetGeomFieldCount(); ++i, ++nArrowIdx )
+    {
+        std::shared_ptr<arrow::ArrayBuilder> builder;
+        const auto poGeomFieldDefn = m_poFeatureDefn->GetGeomFieldDefn(i);
+        const auto eGType = poGeomFieldDefn->GetType();
+        // Number of ordinates per point, driven by the column type.
+        const int nDim = 2 + (OGR_GT_HasZ(eGType) ? 1 : 0) + (OGR_GT_HasM(eGType) ? 1 : 0);
+
+        // The last argument of MakeGeoArrowBuilder() is the list nesting
+        // depth around a point: 0 point, 1 line/multipoint,
+        // 2 polygon/multiline, 3 multipolygon.
+        if( m_aeGeomEncoding[i] == OGRArrowGeomEncoding::WKB )
+            builder = std::make_shared<arrow::BinaryBuilder>(m_poMemoryPool);
+        else if( m_aeGeomEncoding[i] == OGRArrowGeomEncoding::WKT )
+            builder = std::make_shared<arrow::StringBuilder>(m_poMemoryPool);
+        else if( m_aeGeomEncoding[i] == OGRArrowGeomEncoding::GEOARROW_POINT )
+        {
+            builder = MakeGeoArrowBuilder(
+                m_poMemoryPool, nDim, 0);
+        }
+        else if( m_aeGeomEncoding[i] == OGRArrowGeomEncoding::GEOARROW_LINESTRING )
+        {
+            builder = MakeGeoArrowBuilder(
+                m_poMemoryPool, nDim, 1);
+        }
+        else if( m_aeGeomEncoding[i] == OGRArrowGeomEncoding::GEOARROW_POLYGON )
+        {
+            builder = MakeGeoArrowBuilder(
+                m_poMemoryPool, nDim, 2);
+        }
+        else if( m_aeGeomEncoding[i] == OGRArrowGeomEncoding::GEOARROW_MULTIPOINT )
+        {
+            builder = MakeGeoArrowBuilder(
+                m_poMemoryPool, nDim, 1);
+        }
+        else if( m_aeGeomEncoding[i] == OGRArrowGeomEncoding::GEOARROW_MULTILINESTRING )
+        {
+            builder = MakeGeoArrowBuilder(
+                m_poMemoryPool, nDim, 2);
+        }
+        else if( m_aeGeomEncoding[i] == OGRArrowGeomEncoding::GEOARROW_MULTIPOLYGON )
+        {
+            builder = MakeGeoArrowBuilder(
+                m_poMemoryPool, nDim, 3);
+        }
+        else
+        {
+            CPLAssert(false);
+        }
+        m_apoBuilders.emplace_back(builder);
+    }
+}
+
+/************************************************************************/
+/*                          ICreateFeature()                            */
+/************************************************************************/
+
+// Append one feature to the column builders, creating the schema and the
+// builders on first use, and flush a row group once m_nRowGroupSize rows
+// are buffered.
+//
+// Returns OGRERR_FAILURE when a non-nullable field or geometry is null,
+// or when creating the writer / flushing the group fails. Values that
+// cannot be encoded (decimal out of range, wrong binary width, geometry
+// type or dimensionality mismatch, oversized WKB) are written as null
+// with a warning.
+// NOTE(review): the Status returned by the builders' Append()/AppendNull()
+// calls is not checked here.
+inline
+OGRErr OGRArrowWriterLayer::ICreateFeature( OGRFeature* poFeature )
+{
+    if( m_poSchema == nullptr )
+    {
+        CreateSchema();
+    }
+
+    if( m_apoBuilders.empty() )
+    {
+        CreateArrayBuilders();
+    }
+
+    // First pass to check not-null constraints as Arrow doesn't seem
+    // to do that on the writing side. But such files can't be read.
+    const int nFieldCount = m_poFeatureDefn->GetFieldCount();
+    for( int i = 0; i < nFieldCount; ++i )
+    {
+        const auto poFieldDefn = m_poFeatureDefn->GetFieldDefn(i);
+        if( !poFieldDefn->IsNullable() && !poFeature->IsFieldSetAndNotNull(i) )
+        {
+            CPLError(CE_Failure, CPLE_AppDefined,
+                     "Null value found in non-nullable field %s",
+                     poFieldDefn->GetNameRef());
+            return OGRERR_FAILURE;
+        }
+    }
+
+    const int nGeomFieldCount = m_poFeatureDefn->GetGeomFieldCount();
+    for( int i = 0; i < nGeomFieldCount; ++i )
+    {
+        const auto poGeomFieldDefn = m_poFeatureDefn->GetGeomFieldDefn(i);
+        if( !poGeomFieldDefn->IsNullable() && poFeature->GetGeomFieldRef(i) == nullptr )
+        {
+            CPLError(CE_Failure, CPLE_AppDefined,
+                     "Null value found in non-nullable geometry field %s",
+                     poGeomFieldDefn->GetNameRef());
+            return OGRERR_FAILURE;
+        }
+    }
+
+    // Write FID, if FID column present
+    int nArrowIdx = 0;
+    if( !m_osFIDColumn.empty() )
+    {
+        int64_t nFID = poFeature->GetFID();
+        if( nFID == OGRNullFID )
+        {
+            // Features without a FID are numbered in writing order.
+            nFID = m_nFeatureCount;
+            poFeature->SetFID(nFID);
+        }
+        auto poBuilder = static_cast<arrow::Int64Builder*>(m_apoBuilders[0].get());
+        poBuilder->Append(nFID);
+        nArrowIdx ++;
+    }
+
+    // Write attributes
+    for( int i = 0; i < nFieldCount; ++i, ++nArrowIdx )
+    {
+        auto poBuilder = m_apoBuilders[nArrowIdx].get();
+        if( !poFeature->IsFieldSetAndNotNull(i) )
+        {
+            poBuilder->AppendNull();
+            continue;
+        }
+
+        // The static_cast targets below must match the builder types
+        // chosen in CreateArrayBuilders().
+        const auto poFieldDefn = m_poFeatureDefn->GetFieldDefn(i);
+        const auto eSubDT = poFieldDefn->GetSubType();
+        switch( poFieldDefn->GetType() )
+        {
+            case OFTInteger:
+                if( eSubDT == OFSTBoolean )
+                    static_cast<arrow::BooleanBuilder*>(poBuilder)->Append(
+                        poFeature->GetFieldAsInteger(i) != 0);
+                else if( eSubDT == OFSTInt16 )
+                    static_cast<arrow::Int16Builder*>(poBuilder)->Append(
+                        static_cast<int16_t>(poFeature->GetFieldAsInteger(i)));
+                else
+                    static_cast<arrow::Int32Builder*>(poBuilder)->Append(
+                        poFeature->GetFieldAsInteger(i));
+                break;
+
+            case OFTInteger64:
+                static_cast<arrow::Int64Builder*>(poBuilder)->Append(
+                    static_cast<int64_t>(poFeature->GetFieldAsInteger64(i)));
+                break;
+
+            case OFTReal:
+            {
+                const auto arrowType = m_poSchema->fields()[nArrowIdx]->type();
+                const double dfVal = poFeature->GetFieldAsDouble(i);
+                if( arrowType->id() == arrow::Type::DECIMAL128 )
+                {
+                    auto res = arrow::Decimal128::FromReal(
+                        dfVal, poFieldDefn->GetWidth(), poFieldDefn->GetPrecision());
+                    if( res.ok() )
+                    {
+                        static_cast<arrow::Decimal128Builder*>(poBuilder)->Append(*res);
+                    }
+                    else
+                    {
+                        CPLError(CE_Warning, CPLE_AppDefined,
+                                 "Cannot parse %.18g as a %d.%d decimal",
+                                 dfVal,
+                                 poFieldDefn->GetWidth(),
+                                 poFieldDefn->GetPrecision());
+                        poBuilder->AppendNull();
+                    }
+                }
+                else if( arrowType->id() == arrow::Type::DECIMAL256 )
+                {
+                    auto res = arrow::Decimal256::FromReal(
+                        dfVal, poFieldDefn->GetWidth(), poFieldDefn->GetPrecision());
+                    if( res.ok() )
+                    {
+                        static_cast<arrow::Decimal256Builder*>(poBuilder)->Append(*res);
+                    }
+                    else
+                    {
+                        CPLError(CE_Warning, CPLE_AppDefined,
+                                 "Cannot parse %.18g as a %d.%d decimal",
+                                 dfVal,
+                                 poFieldDefn->GetWidth(),
+                                 poFieldDefn->GetPrecision());
+                        poBuilder->AppendNull();
+                    }
+                }
+                else if( eSubDT == OFSTFloat32 )
+                {
+                    static_cast<arrow::FloatBuilder*>(poBuilder)->Append(
+                        static_cast<float>(dfVal));
+                }
+                else
+                {
+                    static_cast<arrow::DoubleBuilder*>(poBuilder)->Append(dfVal);
+                }
+                break;
+            }
+
+            case OFTString:
+            case OFTWideString:
+                static_cast<arrow::StringBuilder*>(poBuilder)->Append(
+                    poFeature->GetFieldAsString(i));
+                break;
+
+            case OFTBinary:
+            {
+                int nSize = 0;
+                const auto pData = poFeature->GetFieldAsBinary(i, &nSize);
+                if( poFieldDefn->GetWidth() != 0 )
+                {
+                    // Fixed-size binary column: the value must be exactly
+                    // of the declared width.
+                    if( poFieldDefn->GetWidth() != nSize )
+                    {
+                        CPLError(CE_Warning, CPLE_AppDefined,
+                                 "Cannot write field %s. Got %d bytes, expected %d",
+                                 poFieldDefn->GetNameRef(),
+                                 nSize,
+                                 poFieldDefn->GetWidth());
+                        poBuilder->AppendNull();
+                    }
+                    else
+                    {
+                        static_cast<arrow::FixedSizeBinaryBuilder*>(poBuilder)->Append(pData);
+                    }
+                }
+                else
+                    static_cast<arrow::BinaryBuilder*>(poBuilder)->Append(pData, nSize);
+                break;
+            }
+
+            case OFTIntegerList:
+            {
+                auto poListBuilder = static_cast<arrow::ListBuilder*>(poBuilder);
+                if( eSubDT == OFSTBoolean )
+                {
+                    poListBuilder->Append();
+                    auto poValueBuilder = static_cast<arrow::BooleanBuilder*>(poListBuilder->value_builder());
+                    int nValues = 0;
+                    const auto panValues = poFeature->GetFieldAsIntegerList(i, &nValues);
+                    for(int j = 0; j < nValues; ++j )
+                        poValueBuilder->Append(panValues[j] != 0);
+                }
+                else if( eSubDT == OFSTInt16 )
+                {
+                    poListBuilder->Append();
+                    auto poValueBuilder = static_cast<arrow::Int16Builder*>(poListBuilder->value_builder());
+                    int nValues = 0;
+                    const auto panValues = poFeature->GetFieldAsIntegerList(i, &nValues);
+                    for(int j = 0; j < nValues; ++j )
+                        poValueBuilder->Append(static_cast<int16_t>(panValues[j]));
+                }
+                else
+                {
+                    poListBuilder->Append();
+                    auto poValueBuilder = static_cast<arrow::Int32Builder*>(poListBuilder->value_builder());
+                    int nValues = 0;
+                    const auto panValues = poFeature->GetFieldAsIntegerList(i, &nValues);
+                    for(int j = 0; j < nValues; ++j )
+                        poValueBuilder->Append(panValues[j]);
+                }
+                break;
+            }
+
+            case OFTInteger64List:
+            {
+                auto poListBuilder = static_cast<arrow::ListBuilder*>(poBuilder);
+                poListBuilder->Append();
+                auto poValueBuilder = static_cast<arrow::Int64Builder*>(poListBuilder->value_builder());
+                int nValues = 0;
+                const auto panValues = poFeature->GetFieldAsInteger64List(i, &nValues);
+                for(int j = 0; j < nValues; ++j )
+                    poValueBuilder->Append(static_cast<int64_t>(panValues[j]));
+                break;
+            }
+
+            case OFTRealList:
+            {
+                auto poListBuilder = static_cast<arrow::ListBuilder*>(poBuilder);
+                if( eSubDT == OFSTFloat32 )
+                {
+                    poListBuilder->Append();
+                    auto poValueBuilder = static_cast<arrow::FloatBuilder*>(poListBuilder->value_builder());
+                    int nValues = 0;
+                    const auto padfValues = poFeature->GetFieldAsDoubleList(i, &nValues);
+                    for(int j = 0; j < nValues; ++j )
+                        poValueBuilder->Append(static_cast<float>(padfValues[j]));
+                }
+                else
+                {
+                    poListBuilder->Append();
+                    auto poValueBuilder = static_cast<arrow::DoubleBuilder*>(poListBuilder->value_builder());
+                    int nValues = 0;
+                    const auto padfValues = poFeature->GetFieldAsDoubleList(i, &nValues);
+                    for(int j = 0; j < nValues; ++j )
+                        poValueBuilder->Append(padfValues[j]);
+                }
+                break;
+            }
+
+            case OFTStringList:
+            case OFTWideStringList:
+            {
+                auto poListBuilder = static_cast<arrow::ListBuilder*>(poBuilder);
+                poListBuilder->Append();
+                auto poValueBuilder = static_cast<arrow::StringBuilder*>(poListBuilder->value_builder());
+                const auto papszValues = poFeature->GetFieldAsStringList(i);
+                for(int j = 0; papszValues && papszValues[j]; ++j )
+                    poValueBuilder->Append(papszValues[j]);
+                break;
+            }
+
+            case OFTDate:
+            {
+                int nYear, nMonth, nDay, nHour, nMinute;
+                float fSec;
+                int nTZFlag;
+                poFeature->GetFieldAsDateTime(i,
+                    &nYear, &nMonth, &nDay, &nHour, &nMinute, &fSec, &nTZFlag);
+                struct tm brokenDown;
+                memset(&brokenDown, 0, sizeof(brokenDown));
+                brokenDown.tm_year = nYear - 1900;
+                brokenDown.tm_mon = nMonth - 1;
+                brokenDown.tm_mday = nDay;
+                GIntBig nVal = CPLYMDHMSToUnixTime(&brokenDown);
+                // Stored as a number of days: 86400 seconds per day.
+                static_cast<arrow::Date32Builder*>(poBuilder)->Append(static_cast<int>(nVal / 86400));
+                break;
+            }
+
+            case OFTTime:
+            {
+                int nYear, nMonth, nDay, nHour, nMinute;
+                float fSec;
+                int nTZFlag;
+                poFeature->GetFieldAsDateTime(i,
+                    &nYear, &nMonth, &nDay, &nHour, &nMinute, &fSec, &nTZFlag);
+                int nVal = nHour * 3600 + nMinute * 60;
+                // Milliseconds since midnight, rounded to nearest.
+                static_cast<arrow::Time32Builder*>(poBuilder)->Append(
+                    static_cast<int>((static_cast<double>(nVal) + fSec) * 1000 + 0.5));
+                break;
+            }
+
+            case OFTDateTime:
+            {
+                int nYear, nMonth, nDay, nHour, nMinute;
+                float fSec;
+                int nTZFlag;
+                poFeature->GetFieldAsDateTime(i,
+                    &nYear, &nMonth, &nDay, &nHour, &nMinute, &fSec, &nTZFlag);
+                struct tm brokenDown;
+                memset(&brokenDown, 0, sizeof(brokenDown));
+                brokenDown.tm_year = nYear - 1900;
+                brokenDown.tm_mon = nMonth - 1;
+                brokenDown.tm_mday = nDay;
+                brokenDown.tm_hour = nHour;
+                brokenDown.tm_min = nMinute;
+                // Seconds (with their fraction) are added below from fSec.
+                brokenDown.tm_sec = 0;
+                GIntBig nVal = CPLYMDHMSToUnixTime(&brokenDown);
+                // Until the file writer exists, track which timezone
+                // flag(s) the field has seen.
+                if( !IsFileWriterCreated() && m_anTZFlag[i] != TZFLAG_MIXED )
+                {
+                    if( m_anTZFlag[i] == TZFLAG_UNINITIALIZED )
+                        m_anTZFlag[i] = nTZFlag;
+                    else if( m_anTZFlag[i] != nTZFlag )
+                    {
+                        if( m_anTZFlag[i] > 1 && nTZFlag > 1 )
+                        {
+                            m_anTZFlag[i] = 100; // harmonize on UTC
+                        }
+                        else
+                        {
+                            CPLError(CE_Warning, CPLE_AppDefined,
+                                     "Field %s contains a mix of "
+                                     "timezone-aware and local/without "
+                                     "timezone values.",
+                                     poFieldDefn->GetNameRef());
+                            m_anTZFlag[i] = TZFLAG_MIXED;
+                        }
+                    }
+                }
+                if( nTZFlag != 0 && nTZFlag != 1 )
+                {
+                    // Timezone-aware value: shift to UTC. The flag encodes
+                    // the offset in quarter hours relative to 100.
+                    nVal -= (nTZFlag - 100) * 15 * 60;
+                }
+                static_cast<arrow::TimestampBuilder*>(poBuilder)->Append(
+                    static_cast<int64_t>((static_cast<double>(nVal) + fSec) * 1000 + 0.5));
+                break;
+            }
+
+        }
+    }
+
+    // Write geometries
+    for( int i = 0; i < nGeomFieldCount; ++i, ++nArrowIdx )
+    {
+        auto poBuilder = m_apoBuilders[nArrowIdx].get();
+        const OGRGeometry* poGeom = poFeature->GetGeomFieldRef(i);
+        const auto eGType = poGeom ? poGeom->getGeometryType() : wkbNone;
+        const auto eColumnGType = m_poFeatureDefn->GetGeomFieldDefn(i)->GetType();
+        const bool bIsEmpty = poGeom != nullptr && poGeom->IsEmpty();
+        if( poGeom != nullptr && !bIsEmpty )
+        {
+            OGREnvelope oEnvelope;
+            poGeom->getEnvelope(&oEnvelope);
+            m_aoEnvelopes[i].Merge(oEnvelope);
+        }
+
+        if( poGeom == nullptr )
+        {
+            if( m_aeGeomEncoding[i] == OGRArrowGeomEncoding::GEOARROW_POINT &&
+                GetDriverUCName() == "PARQUET" )
+            {
+                // For some reason, Parquet doesn't support a NULL FixedSizeList on reading
+                auto poPointBuilder = static_cast<arrow::FixedSizeListBuilder*>(poBuilder);
+                poPointBuilder->Append();
+                auto poValueBuilder = static_cast<arrow::DoubleBuilder*>(poPointBuilder->value_builder());
+                poValueBuilder->Append(std::numeric_limits<double>::quiet_NaN());
+                poValueBuilder->Append(std::numeric_limits<double>::quiet_NaN());
+                // There is no geometry here (eGType is wkbNone), so the
+                // number of ordinates must come from the column type, the
+                // same source CreateArrayBuilders() used to size the list.
+                if( OGR_GT_HasZ(eColumnGType) )
+                    poValueBuilder->Append(std::numeric_limits<double>::quiet_NaN());
+                if( OGR_GT_HasM(eColumnGType) )
+                    poValueBuilder->Append(std::numeric_limits<double>::quiet_NaN());
+            }
+            else
+            {
+                poBuilder->AppendNull();
+            }
+        }
+        else if( m_aeGeomEncoding[i] == OGRArrowGeomEncoding::WKB )
+        {
+            std::unique_ptr<OGRGeometry> poGeom2D;
+            if( eGType != wkbFlatten(eGType) &&
+                eColumnGType == wkbFlatten(eColumnGType) )
+            {
+                // Z/M geometry in a 2D column: write a flattened copy and
+                // warn only once per process.
+                static bool bHasWarned = false;
+                if( !bHasWarned )
+                {
+                    CPLError(CE_Warning, CPLE_AppDefined,
+                             "Flattening geometry to 2D");
+                    bHasWarned = true;
+                }
+                poGeom2D.reset(poGeom->clone());
+                poGeom2D->flattenTo2D();
+                poGeom = poGeom2D.get();
+            }
+            const auto nSize = poGeom->WkbSize();
+            if( nSize < INT_MAX )
+            {
+                m_abyBuffer.resize(nSize);
+                poGeom->exportToWkb(wkbNDR, &m_abyBuffer[0], wkbVariantIso);
+                static_cast<arrow::BinaryBuilder*>(poBuilder)->Append(
+                    m_abyBuffer.data(), static_cast<int>(m_abyBuffer.size()));
+            }
+            else
+            {
+                CPLError(CE_Warning, CPLE_AppDefined,
+                         "Too big geometry. "
+                         "Writing null geometry");
+                poBuilder->AppendNull();
+            }
+        }
+        else if( m_aeGeomEncoding[i] == OGRArrowGeomEncoding::WKT )
+        {
+            OGRWktOptions options;
+            options.variant = wkbVariantIso;
+            static_cast<arrow::StringBuilder*>(poBuilder)->Append(
+                poGeom->exportToWkt(options));
+        }
+        // The following checks are only valid for GeoArrow encoding
+        else if( (!bIsEmpty && eGType != eColumnGType) ||
+                 (bIsEmpty && wkbFlatten(eGType) != wkbFlatten(eColumnGType)) )
+        {
+            CPLError(CE_Warning, CPLE_AppDefined,
+                     "Geometry of type %s found, whereas %s is expected. "
+                     "Writing null geometry",
+                     OGRGeometryTypeToName(eGType),
+                     OGRGeometryTypeToName(eColumnGType));
+            poBuilder->AppendNull();
+        }
+        else if( !bIsEmpty &&
+                 poGeom->Is3D() != OGR_GT_HasZ(eColumnGType) )
+        {
+            CPLError(CE_Warning, CPLE_AppDefined,
+                     "Geometry Z flag (%d) != column geometry type Z flag (%d). "
+                     "Writing null geometry",
+                     poGeom->Is3D(),
+                     OGR_GT_HasZ(eColumnGType));
+            poBuilder->AppendNull();
+        }
+        else if( !bIsEmpty &&
+                 poGeom->IsMeasured() != OGR_GT_HasM(eColumnGType) )
+        {
+            CPLError(CE_Warning, CPLE_AppDefined,
+                     "Geometry M flag (%d) != column geometry type M flag (%d). "
+                     "Writing null geometry",
+                     poGeom->IsMeasured(),
+                     OGR_GT_HasM(eColumnGType));
+            poBuilder->AppendNull();
+        }
+        else if( m_aeGeomEncoding[i] == OGRArrowGeomEncoding::GEOARROW_POINT )
+        {
+            const auto poPoint = poGeom->toPoint();
+            auto poPointBuilder = static_cast<arrow::FixedSizeListBuilder*>(poBuilder);
+            poPointBuilder->Append();
+            auto poValueBuilder = static_cast<arrow::DoubleBuilder*>(poPointBuilder->value_builder());
+            poValueBuilder->Append(poPoint->getX());
+            poValueBuilder->Append(poPoint->getY());
+            if( OGR_GT_HasZ(eColumnGType) )
+                poValueBuilder->Append(poPoint->getZ());
+            if( OGR_GT_HasM(eColumnGType) )
+                poValueBuilder->Append(poPoint->getM());
+        }
+        else if( m_aeGeomEncoding[i] == OGRArrowGeomEncoding::GEOARROW_LINESTRING )
+        {
+            const auto poLS = poGeom->toLineString();
+            auto poListBuilder = static_cast<arrow::ListBuilder*>(poBuilder);
+            auto poPointBuilder = static_cast<arrow::FixedSizeListBuilder*>(poListBuilder->value_builder());
+            auto poValueBuilder = static_cast<arrow::DoubleBuilder*>(poPointBuilder->value_builder());
+            poListBuilder->Append();
+            for( int j = 0; j < poLS->getNumPoints(); ++j )
+            {
+                poPointBuilder->Append();
+                poValueBuilder->Append(poLS->getX(j));
+                poValueBuilder->Append(poLS->getY(j));
+                if( poGeom->Is3D() )
+                    poValueBuilder->Append(poLS->getZ(j));
+                if( poGeom->IsMeasured() )
+                    poValueBuilder->Append(poLS->getM(j));
+            }
+        }
+        else if( m_aeGeomEncoding[i] == OGRArrowGeomEncoding::GEOARROW_POLYGON )
+        {
+            const auto poPolygon = poGeom->toPolygon();
+            auto poPolygonBuilder = static_cast<arrow::ListBuilder*>(poBuilder);
+            auto poRingBuilder = static_cast<arrow::ListBuilder*>(poPolygonBuilder->value_builder());
+            auto poPointBuilder = static_cast<arrow::FixedSizeListBuilder*>(poRingBuilder->value_builder());
+            auto poValueBuilder = static_cast<arrow::DoubleBuilder*>(poPointBuilder->value_builder());
+            poPolygonBuilder->Append();
+            for( const auto* poRing: *poPolygon )
+            {
+                poRingBuilder->Append();
+                for( int j = 0; j < poRing->getNumPoints(); ++j )
+                {
+                    poPointBuilder->Append();
+                    poValueBuilder->Append(poRing->getX(j));
+                    poValueBuilder->Append(poRing->getY(j));
+                    if( poGeom->Is3D() )
+                        poValueBuilder->Append(poRing->getZ(j));
+                    if( poGeom->IsMeasured() )
+                        poValueBuilder->Append(poRing->getM(j));
+                }
+            }
+        }
+        else if( m_aeGeomEncoding[i] == OGRArrowGeomEncoding::GEOARROW_MULTIPOINT )
+        {
+            const auto poMultiPoint = poGeom->toMultiPoint();
+            auto poListBuilder = static_cast<arrow::ListBuilder*>(poBuilder);
+            auto poPointBuilder = static_cast<arrow::FixedSizeListBuilder*>(poListBuilder->value_builder());
+            auto poValueBuilder = static_cast<arrow::DoubleBuilder*>(poPointBuilder->value_builder());
+            poListBuilder->Append();
+            for( const auto* poPoint: *poMultiPoint )
+            {
+                poPointBuilder->Append();
+                poValueBuilder->Append(poPoint->getX());
+                poValueBuilder->Append(poPoint->getY());
+                if( poGeom->Is3D() )
+                    poValueBuilder->Append(poPoint->getZ());
+                if( poGeom->IsMeasured() )
+                    poValueBuilder->Append(poPoint->getM());
+            }
+        }
+        else if( m_aeGeomEncoding[i] == OGRArrowGeomEncoding::GEOARROW_MULTILINESTRING )
+        {
+            const auto poMLS = poGeom->toMultiLineString();
+            auto poMLSBuilder = static_cast<arrow::ListBuilder*>(poBuilder);
+            auto poLSBuilder = static_cast<arrow::ListBuilder*>(poMLSBuilder->value_builder());
+            auto poPointBuilder = static_cast<arrow::FixedSizeListBuilder*>(poLSBuilder->value_builder());
+            auto poValueBuilder = static_cast<arrow::DoubleBuilder*>(poPointBuilder->value_builder());
+            poMLSBuilder->Append();
+            for( const auto* poLS: *poMLS )
+            {
+                poLSBuilder->Append();
+                for( int j = 0; j < poLS->getNumPoints(); ++j )
+                {
+                    poPointBuilder->Append();
+                    poValueBuilder->Append(poLS->getX(j));
+                    poValueBuilder->Append(poLS->getY(j));
+                    if( poGeom->Is3D() )
+                        poValueBuilder->Append(poLS->getZ(j));
+                    if( poGeom->IsMeasured() )
+                        poValueBuilder->Append(poLS->getM(j));
+                }
+            }
+        }
+        else if( m_aeGeomEncoding[i] == OGRArrowGeomEncoding::GEOARROW_MULTIPOLYGON )
+        {
+            const auto poMPoly = poGeom->toMultiPolygon();
+            auto poMPolyBuilder = static_cast<arrow::ListBuilder*>(poBuilder);
+            auto poPolyBuilder = static_cast<arrow::ListBuilder*>(poMPolyBuilder->value_builder());
+            auto poRingBuilder = static_cast<arrow::ListBuilder*>(poPolyBuilder->value_builder());
+            auto poPointBuilder = static_cast<arrow::FixedSizeListBuilder*>(poRingBuilder->value_builder());
+            auto poValueBuilder = static_cast<arrow::DoubleBuilder*>(poPointBuilder->value_builder());
+            poMPolyBuilder->Append();
+            for( const auto* poPolygon: *poMPoly )
+            {
+                poPolyBuilder->Append();
+                for( const auto* poRing: *poPolygon )
+                {
+                    poRingBuilder->Append();
+                    for( int j = 0; j < poRing->getNumPoints(); ++j )
+                    {
+                        poPointBuilder->Append();
+                        poValueBuilder->Append(poRing->getX(j));
+                        poValueBuilder->Append(poRing->getY(j));
+                        if( poGeom->Is3D() )
+                            poValueBuilder->Append(poRing->getZ(j));
+                        if( poGeom->IsMeasured() )
+                            poValueBuilder->Append(poRing->getM(j));
+                    }
+                }
+            }
+        }
+        else
+        {
+            CPLAssert(false);
+        }
+    }
+
+    m_nFeatureCount ++;
+
+    // Flush the current row group if reaching the limit of rows per group.
+    if( !m_apoBuilders.empty() && m_apoBuilders[0]->length() == m_nRowGroupSize )
+    {
+        if( !IsFileWriterCreated() )
+        {
+            CreateWriter();
+            if( !IsFileWriterCreated() )
+                return OGRERR_FAILURE;
+        }
+
+        if( !FlushGroup() )
+            return OGRERR_FAILURE;
+    }
+
+    return OGRERR_NONE;
+}
+
+/************************************************************************/
+/*                         GetFeatureCount()                            */
+/************************************************************************/
+
+// Without attribute or spatial filter, return the number of features
+// written so far; otherwise defer to the generic OGRLayer implementation.
+inline
+GIntBig OGRArrowWriterLayer::GetFeatureCount(int bForce)
+{
+    if( m_poAttrQuery == nullptr && m_poFilterGeom == nullptr )
+    {
+        return m_nFeatureCount;
+    }
+    return OGRLayer::GetFeatureCount(bForce);
+}
+
+/************************************************************************/
+/*                         TestCapability()                             */
+/************************************************************************/
+
+// Report layer capabilities. Field and geometry field creation are only
+// possible while the Arrow schema has not been created yet.
+inline
+int OGRArrowWriterLayer::TestCapability(const char* pszCap)
+{
+    if( EQUAL(pszCap, OLCCreateField) || EQUAL(pszCap, OLCCreateGeomField) )
+        return m_poSchema == nullptr;
+
+    if( EQUAL(pszCap, OLCSequentialWrite) )
+        return true;
+
+    if( EQUAL(pszCap, OLCStringsAsUTF8) )
+        return true;
+
+    if( EQUAL(pszCap, OLCMeasuredGeometries) )
+        return true;
+
+    return false;
+}
+
+/************************************************************************/
+/*                           WriteArrays()                              */
+/************************************************************************/
+
+// Finish every column builder, in schema order, and pass each resulting
+// array with its schema field to postProcessArray. Integer columns bound
+// to a registered field domain are turned into dictionary arrays first.
+// Returns false as soon as a builder, the dictionary construction or the
+// callback fails.
+inline
+bool OGRArrowWriterLayer::WriteArrays(
+    std::function<bool(const std::shared_ptr<arrow::Field>&,
+                       const std::shared_ptr<arrow::Array>&)> postProcessArray)
+{
+    // Attribute fields come after the optional FID column in the schema.
+    const int nFirstAttrIdx = m_osFIDColumn.empty() ? 0 : 1;
+    int iArrow = 0;
+    for( const auto& poColumnBuilder: m_apoBuilders )
+    {
+        const auto& field = m_poSchema->fields()[iArrow];
+
+        std::shared_ptr<arrow::Array> array;
+        const auto finishStatus = poColumnBuilder->Finish(&array);
+        if( !finishStatus.ok() )
+        {
+            CPLError(CE_Failure, CPLE_AppDefined,
+                     "builder::Finish() for field %s failed with %s",
+                     field->name().c_str(),
+                     finishStatus.message().c_str());
+            return false;
+        }
+
+        // CPLDebug("ARROW", "%s", array->ToString().c_str());
+
+        const int iAttr = iArrow - nFirstAttrIdx;
+        const bool bIsAttribute =
+            iAttr >= 0 && iAttr < m_poFeatureDefn->GetFieldCount();
+        if( bIsAttribute )
+        {
+            const auto poFieldDefn = m_poFeatureDefn->GetFieldDefn(iAttr);
+            const auto eType = poFieldDefn->GetType();
+            if( eType == OFTInteger || eType == OFTInteger64 )
+            {
+                const auto& osDomainName = poFieldDefn->GetDomainName();
+                const auto oDomainIter =
+                    m_oMapFieldDomainToStringArray.find(osDomainName);
+                if( oDomainIter != m_oMapFieldDomainToStringArray.end() )
+                {
+                    // The codes become indices into the domain's values.
+                    auto dictResult = arrow::DictionaryArray::FromArrays(
+                        field->type(),
+                        array,
+                        oDomainIter->second);
+                    if( !dictResult.ok() )
+                    {
+                        CPLError(CE_Failure, CPLE_AppDefined,
+                                 "DictionaryArray::FromArrays() for field %s failed with %s",
+                                 field->name().c_str(),
+                                 dictResult.status().message().c_str());
+                        return false;
+                    }
+                    array = *dictResult;
+                }
+            }
+        }
+
+        if( !postProcessArray(field, array) )
+            return false;
+
+        ++iArrow;
+    }
+    return true;
+}
diff --git a/ogr/ogrsf_frmts/generic/ogrregisterall.cpp b/ogr/ogrsf_frmts/generic/ogrregisterall.cpp index 42a274a1fa46..8908aee1f42a 100644 --- a/ogr/ogrsf_frmts/generic/ogrregisterall.cpp +++ b/ogr/ogrsf_frmts/generic/ogrregisterall.cpp @@ -256,6 +256,12 @@ void OGRRegisterAllInternal() #ifdef HANA_ENABLED RegisterOGRHANA(); #endif +#ifdef PARQUET_ENABLED + 
RegisterOGRParquet(); +#endif +#ifdef ARROW_ENABLED + RegisterOGRArrow(); +#endif // NOTE: you need to generally insert your own driver before that line. diff --git a/ogr/ogrsf_frmts/ogrsf_frmts.h b/ogr/ogrsf_frmts/ogrsf_frmts.h index 343be7ef142b..13001a9316cc 100644 --- a/ogr/ogrsf_frmts/ogrsf_frmts.h +++ b/ogr/ogrsf_frmts/ogrsf_frmts.h @@ -307,6 +307,9 @@ using OGRLayerUniquePtr = std::unique_ptr; */ template class OGRGetNextFeatureThroughRaw { +protected: + ~OGRGetNextFeatureThroughRaw() = default; + public: /** Implement OGRLayer::GetNextFeature(), relying on BaseLayer::GetNextRawFeature() */ @@ -551,6 +554,8 @@ void CPL_DLL RegisterOGRNGW(); void CPL_DLL RegisterOGRMapML(); void CPL_DLL RegisterOGRLVBAG(); void CPL_DLL RegisterOGRHANA(); +void CPL_DLL RegisterOGRParquet(); +void CPL_DLL RegisterOGRArrow(); // @endcond CPL_C_END diff --git a/ogr/ogrsf_frmts/parquet/CMakeLists.txt b/ogr/ogrsf_frmts/parquet/CMakeLists.txt new file mode 100644 index 000000000000..1044ff5c7bc2 --- /dev/null +++ b/ogr/ogrsf_frmts/parquet/CMakeLists.txt @@ -0,0 +1,11 @@ +add_gdal_driver(TARGET ogr_Parquet + SOURCES ogrparquetdriver.cpp + ogrparquetdataset.cpp + ogrparquetlayer.cpp + ogrparquetwriterdataset.cpp + ogrparquetwriterlayer.cpp + PLUGIN_CAPABLE + CXX_WFLAGS_EFFCXX) +gdal_standard_includes(ogr_Parquet) +target_include_directories(ogr_Parquet PRIVATE $) +gdal_target_link_libraries(ogr_Parquet PRIVATE arrow_shared parquet_shared) diff --git a/ogr/ogrsf_frmts/parquet/ogr_include_parquet.h b/ogr/ogrsf_frmts/parquet/ogr_include_parquet.h new file mode 100644 index 000000000000..6e99a82681e7 --- /dev/null +++ b/ogr/ogrsf_frmts/parquet/ogr_include_parquet.h @@ -0,0 +1,61 @@ +/****************************************************************************** + * + * Project: Parquet Translator + * Purpose: Implements OGRParquetDriver. 
+ * Author: Even Rouault, + * + ****************************************************************************** + * Copyright (c) 2022, Planet Labs + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ ****************************************************************************/ + +#ifndef OGR_INCLUDE_PARQUET_H +#define OGR_INCLUDE_PARQUET_H + +#if defined(__GNUC__) && !defined(_MSC_VER) +#pragma GCC system_header +#endif + +#ifdef _MSC_VER +#pragma warning( push ) +#pragma warning( disable : 4244 ) /* warning 4244: 'initializing': conversion from 'int32_t' to 'int16_t', possible loss of data */ +#pragma warning( disable : 4458 ) /* warning 4458: declaration of 'type_id' hides class member */ +#endif + +#include "arrow/builder.h" +#include "arrow/memory_pool.h" +#include "arrow/array/array_dict.h" +#include "arrow/io/file.h" +#include "arrow/ipc/writer.h" +#include "arrow/util/base64.h" +#include "arrow/util/compression.h" +#include "arrow/util/decimal.h" +#include "arrow/util/key_value_metadata.h" +#include "parquet/file_writer.h" +#include "parquet/schema.h" +#include "parquet/arrow/reader.h" +#include "parquet/arrow/writer.h" +#include "parquet/arrow/schema.h" + +#ifdef _MSC_VER +#pragma warning( pop ) +#endif + +#endif diff --git a/ogr/ogrsf_frmts/parquet/ogr_parquet.h b/ogr/ogrsf_frmts/parquet/ogr_parquet.h new file mode 100644 index 000000000000..41c77f0611bf --- /dev/null +++ b/ogr/ogrsf_frmts/parquet/ogr_parquet.h @@ -0,0 +1,178 @@ +/****************************************************************************** + * + * Project: Parquet Translator + * Purpose: Implements OGRParquetDriver. 
+ * Author: Even Rouault, + * + ****************************************************************************** + * Copyright (c) 2022, Planet Labs + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ ****************************************************************************/ + +#ifndef OGR_PARQUET_H +#define OGR_PARQUET_H + +#include "ogrsf_frmts.h" + +#include + +#include "../arrow_common/ogr_arrow.h" +#include "ogr_include_parquet.h" + +/************************************************************************/ +/* OGRParquetLayer */ +/************************************************************************/ + +class OGRParquetDataset; + +class OGRParquetLayer final: public OGRArrowLayer + +{ + OGRParquetLayer(const OGRParquetLayer&) = delete; + OGRParquetLayer& operator= (const OGRParquetLayer&) = delete; + + OGRParquetDataset* m_poDS = nullptr; + std::unique_ptr m_poArrowReader{}; + std::shared_ptr m_poRecordBatchReader{}; + bool m_bSingleBatch = false; + int m_iFIDParquetColumn = -1; + std::vector m_anMapFieldIndexToParquetColumn{}; + std::vector m_anMapGeomFieldIndexToParquetColumn{}; + bool m_bHasMissingMappingToParquet = false; + + std::vector m_anRequestedParquetColumns{}; // only valid when m_bIgnoredFields is set +#ifdef DEBUG + int m_nExpectedBatchColumns = 0; // Should be equal to m_poBatch->num_columns() (when m_bIgnoredFields is set) +#endif + CPLStringList m_aosFeatherMetadata{}; + + void EstablishFeatureDefn(); + void LoadGeoMetadata(); + bool ReadNextBatch() override; + OGRwkbGeometryType ComputeGeometryColumnType(int iGeomCol, int iParquetCol) const; + void CreateFieldFromSchema( + const std::shared_ptr& field, + bool bParquetColValid, + int &iParquetCol, + const std::vector& path, + const std::map>& oMapFieldNameToGDALSchemaFieldDefn); + bool CheckMatchArrowParquetColumnNames( + int& iParquetCol, + const std::shared_ptr& field) const; + OGRFeature* GetFeatureExplicitFID(GIntBig nFID); + OGRFeature* GetFeatureByIndex(GIntBig nFID); + + virtual std::string GetDriverUCName() const override { return "PARQUET"; } + +public: + OGRParquetLayer(OGRParquetDataset* poDS, + const char* pszLayerName, + std::unique_ptr&& arrow_reader); + + void 
ResetReading() override; + OGRFeature *GetFeature(GIntBig nFID) override; + GIntBig GetFeatureCount(int bForce) override; + int TestCapability(const char* pszCap) override; + OGRErr SetIgnoredFields( const char **papszFields ) override; + const char* GetMetadataItem( const char* pszName, + const char* pszDomain = "" ) override; + char** GetMetadata( const char* pszDomain = "" ) override; + + std::unique_ptr BuildDomain(const std::string& osDomainName, + int iFieldIndex) const override; +}; + +/************************************************************************/ +/* OGRParquetDataset */ +/************************************************************************/ + +class OGRParquetDataset final: public OGRArrowDataset +{ +public: + explicit OGRParquetDataset(std::unique_ptr&& poMemoryPool); +}; + +/************************************************************************/ +/* OGRParquetWriterLayer */ +/************************************************************************/ + +class OGRParquetWriterLayer final: public OGRArrowWriterLayer +{ + OGRParquetWriterLayer(const OGRParquetWriterLayer&) = delete; + OGRParquetWriterLayer& operator= (const OGRParquetWriterLayer&) = delete; + + std::unique_ptr m_poFileWriter{}; + std::shared_ptr m_poKeyValueMetadata{}; + + virtual bool IsFileWriterCreated() const override { return m_poFileWriter != nullptr; } + virtual void CreateWriter() override; + virtual void CloseFileWriter() override; + + virtual void CreateSchema() override; + virtual void DoSomethingBeforeFinalFlushGroup() override; + + virtual bool FlushGroup() override; + + virtual std::string GetDriverUCName() const override { return "PARQUET"; } + +public: + OGRParquetWriterLayer( arrow::MemoryPool* poMemoryPool, + const std::shared_ptr& poOutputStream, + const char *pszLayerName ); + + ~OGRParquetWriterLayer() override; + + bool SetOptions( CSLConstList papszOptions, + OGRSpatialReference *poSpatialRef, + OGRwkbGeometryType eGType ); +}; + 
+/************************************************************************/ +/* OGRParquetWriterDataset */ +/************************************************************************/ + +class OGRParquetWriterDataset final: public GDALPamDataset +{ + std::unique_ptr m_poMemoryPool{}; + std::unique_ptr m_poLayer{}; + std::shared_ptr m_poOutputStream{}; + +public: + explicit OGRParquetWriterDataset( + const std::shared_ptr& poOutputStream); + + arrow::MemoryPool* GetMemoryPool() const { return m_poMemoryPool.get(); } + + int GetLayerCount() override ; + OGRLayer* GetLayer(int idx) override; + int TestCapability(const char* pszCap) override; + std::vector GetFieldDomainNames(CSLConstList /*papszOptions*/ = nullptr) const override; + const OGRFieldDomain* GetFieldDomain(const std::string& name) const override; + bool AddFieldDomain(std::unique_ptr&& domain, + std::string& failureReason) override; +protected: + OGRLayer *ICreateLayer( const char *pszName, + OGRSpatialReference *poSpatialRef = nullptr, + OGRwkbGeometryType eGType = wkbUnknown, + char ** papszOptions = nullptr ) override; + +}; + +#endif // OGR_PARQUET_H diff --git a/ogr/ogrsf_frmts/parquet/ogrparquetdataset.cpp b/ogr/ogrsf_frmts/parquet/ogrparquetdataset.cpp new file mode 100644 index 000000000000..d3eec4e3e0c7 --- /dev/null +++ b/ogr/ogrsf_frmts/parquet/ogrparquetdataset.cpp @@ -0,0 +1,40 @@ +/****************************************************************************** + * + * Project: Parquet Translator + * Purpose: Implements OGRParquetDriver. 
+ * Author: Even Rouault, + * + ****************************************************************************** + * Copyright (c) 2022, Planet Labs + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ ****************************************************************************/ + +#include "ogr_parquet.h" + +#include "../arrow_common/ograrrowdataset.hpp" + +/************************************************************************/ +/* OGRParquetDataset() */ +/************************************************************************/ + +OGRParquetDataset::OGRParquetDataset(std::unique_ptr&& poMemoryPool): + OGRArrowDataset(std::move(poMemoryPool)) +{ +} diff --git a/ogr/ogrsf_frmts/parquet/ogrparquetdriver.cpp b/ogr/ogrsf_frmts/parquet/ogrparquetdriver.cpp new file mode 100644 index 000000000000..01e7a8800486 --- /dev/null +++ b/ogr/ogrsf_frmts/parquet/ogrparquetdriver.cpp @@ -0,0 +1,320 @@ +/****************************************************************************** + * + * Project: Parquet Translator + * Purpose: Implements OGRParquetDriver. + * Author: Even Rouault, + * + ****************************************************************************** + * Copyright (c) 2022, Planet Labs + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + ****************************************************************************/ + +#include "gdal_pam.h" +#include "ogrsf_frmts.h" + +#include + +#include "ogr_parquet.h" +#include "../arrow_common/ograrrowrandomaccessfile.h" +#include "../arrow_common/ograrrowwritablefile.h" +#include "../arrow_common/ograrrowdataset.hpp" + +/************************************************************************/ +/* Identify() */ +/************************************************************************/ + +template constexpr int constexpr_length( const char (&) [N] ) +{ + return static_cast(N-1); +} + +static int OGRParquetDriverIdentify( GDALOpenInfo* poOpenInfo ) +{ + // See https://github.com/apache/parquet-format#file-format + bool bRet = false; + constexpr const char SIGNATURE[] = "PAR1"; + constexpr int SIGNATURE_SIZE = constexpr_length(SIGNATURE); + static_assert(SIGNATURE_SIZE == 4, "SIGNATURE_SIZE == 4"); + constexpr int METADATASIZE_SIZE = 4; + if( poOpenInfo->fpL != nullptr && + poOpenInfo->nHeaderBytes >= SIGNATURE_SIZE + METADATASIZE_SIZE + SIGNATURE_SIZE && + memcmp(poOpenInfo->pabyHeader, SIGNATURE, SIGNATURE_SIZE) == 0 ) + { + VSIFSeekL(poOpenInfo->fpL, 0, SEEK_END); + const auto nFileSize = VSIFTellL(poOpenInfo->fpL); + VSIFSeekL(poOpenInfo->fpL, nFileSize - (METADATASIZE_SIZE + SIGNATURE_SIZE), SEEK_SET); + uint32_t nMetadataSize = 0; + static_assert(sizeof(nMetadataSize) == METADATASIZE_SIZE, "sizeof(nMetadataSize) == METADATASIZE_SIZE"); + VSIFReadL(&nMetadataSize, 1, sizeof(nMetadataSize), poOpenInfo->fpL); + CPL_LSBPTR32(&nMetadataSize); + unsigned char abyTrailingBytes[SIGNATURE_SIZE] = {0}; + VSIFReadL(&abyTrailingBytes[0], 1, SIGNATURE_SIZE, poOpenInfo->fpL); + bRet = 
memcmp(abyTrailingBytes, SIGNATURE, SIGNATURE_SIZE) == 0 && + nMetadataSize < nFileSize; + VSIFSeekL(poOpenInfo->fpL, 0, SEEK_SET); + } + return bRet; +} + +/************************************************************************/ +/* Open() */ +/************************************************************************/ + +static GDALDataset *OGRParquetDriverOpen( GDALOpenInfo* poOpenInfo ) +{ + if( !OGRParquetDriverIdentify(poOpenInfo) || + poOpenInfo->eAccess == GA_Update ) + { + return nullptr; + } + + try + { + std::shared_ptr infile; + if( STARTS_WITH(poOpenInfo->pszFilename, "/vsi") || + CPLTestBool(CPLGetConfigOption("OGR_PARQUET_USE_VSI", "NO")) ) + { + VSILFILE* fp = poOpenInfo->fpL; + poOpenInfo->fpL = nullptr; + infile = std::make_shared(fp); + } + else + { + PARQUET_ASSIGN_OR_THROW( + infile, + arrow::io::ReadableFile::Open(poOpenInfo->pszFilename)); + } + + // Open Parquet file reader + std::unique_ptr arrow_reader; + auto poMemoryPool = arrow::MemoryPool::CreateDefault(); + auto st = parquet::arrow::OpenFile(infile, poMemoryPool.get(), &arrow_reader); + if( !st.ok() ) + { + CPLError(CE_Failure, CPLE_AppDefined, + "parquet::arrow::OpenFile() failed"); + return nullptr; + } + + auto poDS = cpl::make_unique(std::move(poMemoryPool)); + auto poLayer = cpl::make_unique( + poDS.get(), + CPLGetBasename(poOpenInfo->pszFilename), + std::move(arrow_reader)); + poDS->SetLayer(std::move(poLayer)); + return poDS.release(); + } + catch( const std::exception& e) + { + CPLError(CE_Failure, CPLE_AppDefined, + "Parquet exception: %s", e.what()); + return nullptr; + } +} + +/************************************************************************/ +/* Create() */ +/************************************************************************/ + +static GDALDataset* OGRParquetDriverCreate(const char * pszName, + int nXSize, int nYSize, int nBands, + GDALDataType eType, + char ** /* papszOptions */ ) +{ + if( !(nXSize == 0 && nYSize == 0 && nBands == 0 && eType == GDT_Unknown) 
) + return nullptr; + + try + { + std::shared_ptr out_file; + if( STARTS_WITH(pszName, "/vsi") || + CPLTestBool(CPLGetConfigOption("OGR_PARQUET_USE_VSI", "YES")) ) + { + VSILFILE* fp = VSIFOpenL(pszName, "wb"); + if( fp == nullptr ) + { + CPLError(CE_Failure, CPLE_FileIO, + "Cannot create %s", pszName); + return nullptr; + } + out_file = std::make_shared(fp); + } + else + { + PARQUET_ASSIGN_OR_THROW( + out_file, arrow::io::FileOutputStream::Open(pszName)); + } + + return new OGRParquetWriterDataset(out_file); + } + catch( const std::exception& e) + { + CPLError(CE_Failure, CPLE_AppDefined, + "Parquet exception: %s", e.what()); + return nullptr; + } +} + +/************************************************************************/ +/* OGRParquetDriver() */ +/************************************************************************/ + +class OGRParquetDriver final: public GDALDriver +{ + bool m_bMetadataInitialized = false; + void InitMetadata(); + +public: + const char* GetMetadataItem(const char* pszName, const char* pszDomain) override + { + if( EQUAL(pszName, GDAL_DS_LAYER_CREATIONOPTIONLIST) ) + { + InitMetadata(); + } + return GDALDriver::GetMetadataItem(pszName, pszDomain); + } + + char** GetMetadata(const char* pszDomain) override + { + InitMetadata(); + return GDALDriver::GetMetadata(pszDomain); + } +}; + +void OGRParquetDriver::InitMetadata() +{ + if( m_bMetadataInitialized ) + return; + m_bMetadataInitialized = true; + + CPLXMLTreeCloser oTree(CPLCreateXMLNode( + nullptr, CXT_Element, "LayerCreationOptionList")); + + std::vector apszCompressionMethods; + bool bHasSnappy = false; + for( const char* pszMethod: { "SNAPPY", + "GZIP", + "BROTLI", + "ZSTD", + "LZ4", + "LZ4_FRAME", + "LZO", + "BZ2", + "LZ4_HADOOP" } ) + { + auto oResult = arrow::util::Codec::GetCompressionType( + CPLString(pszMethod).tolower()); + if( oResult.ok() && arrow::util::Codec::IsAvailable(*oResult) ) + { + if( EQUAL(pszMethod, "SNAPPY") ) + bHasSnappy = true; + 
apszCompressionMethods.emplace_back(pszMethod); + } + } + + { + auto psOption = CPLCreateXMLNode(oTree.get(), CXT_Element, "Option"); + CPLAddXMLAttributeAndValue(psOption, "name", "COMPRESSION"); + CPLAddXMLAttributeAndValue(psOption, "type", "string-select"); + CPLAddXMLAttributeAndValue(psOption, "description", "Compression method"); + CPLAddXMLAttributeAndValue(psOption, "default", bHasSnappy ? "SNAPPY" : "NONE"); + { + auto poValueNode = CPLCreateXMLNode(psOption, CXT_Element, "Value"); + CPLAddXMLAttributeAndValue(poValueNode, "alias", "UNCOMPRESSED"); + CPLCreateXMLNode(poValueNode, CXT_Text, "NONE"); + } + for( const char* pszMethod: apszCompressionMethods ) + { + auto poValueNode = CPLCreateXMLNode(psOption, CXT_Element, "Value"); + CPLCreateXMLNode(poValueNode, CXT_Text, pszMethod); + } + } + + { + auto psOption = CPLCreateXMLNode(oTree.get(), CXT_Element, "Option"); + CPLAddXMLAttributeAndValue(psOption, "name", "GEOMETRY_ENCODING"); + CPLAddXMLAttributeAndValue(psOption, "type", "string-select"); + CPLAddXMLAttributeAndValue(psOption, "description", "Encoding of geometry columns"); + CPLAddXMLAttributeAndValue(psOption, "default", "WKB"); + for( const char* pszEncoding : {"WKB", "WKT", "GEOARROW"} ) + { + auto poValueNode = CPLCreateXMLNode(psOption, CXT_Element, "Value"); + CPLCreateXMLNode(poValueNode, CXT_Text, pszEncoding); + } + } + + { + auto psOption = CPLCreateXMLNode(oTree.get(), CXT_Element, "Option"); + CPLAddXMLAttributeAndValue(psOption, "name", "ROW_GROUP_SIZE"); + CPLAddXMLAttributeAndValue(psOption, "type", "integer"); + CPLAddXMLAttributeAndValue(psOption, "description", "Maximum number of rows per group"); + CPLAddXMLAttributeAndValue(psOption, "default", "65536"); + } + + { + auto psOption = CPLCreateXMLNode(oTree.get(), CXT_Element, "Option"); + CPLAddXMLAttributeAndValue(psOption, "name", "GEOMETRY_NAME"); + CPLAddXMLAttributeAndValue(psOption, "type", "string"); + CPLAddXMLAttributeAndValue(psOption, "description", "Name of 
geometry column"); + CPLAddXMLAttributeAndValue(psOption, "default", "geometry"); + } + + { + auto psOption = CPLCreateXMLNode(oTree.get(), CXT_Element, "Option"); + CPLAddXMLAttributeAndValue(psOption, "name", "FID"); + CPLAddXMLAttributeAndValue(psOption, "type", "string"); + CPLAddXMLAttributeAndValue(psOption, "description", "Name of the FID column to create"); + } + + char* pszXML = CPLSerializeXMLTree(oTree.get()); + GDALDriver::SetMetadataItem(GDAL_DS_LAYER_CREATIONOPTIONLIST, pszXML); + CPLFree(pszXML); +} + +/************************************************************************/ +/* RegisterOGRParquet() */ +/************************************************************************/ + +void RegisterOGRParquet() +{ + if( GDALGetDriverByName( "Parquet" ) != nullptr ) + return; + + auto poDriver = cpl::make_unique(); + + poDriver->SetDescription( "Parquet" ); + poDriver->SetMetadataItem( GDAL_DCAP_VECTOR, "YES" ); + poDriver->SetMetadataItem( GDAL_DMD_LONGNAME, "(Geo)Parquet" ); + poDriver->SetMetadataItem( GDAL_DMD_EXTENSION, "parquet" ); + poDriver->SetMetadataItem( GDAL_DMD_HELPTOPIC, "drivers/vector/parquet.html" ); + poDriver->SetMetadataItem( GDAL_DCAP_VIRTUALIO, "YES" ); + + poDriver->SetMetadataItem( GDAL_DMD_CREATIONFIELDDATATYPES, + "Integer Integer64 Real String Date Time DateTime " + "Binary IntegerList Integer64List RealList StringList" ); + poDriver->SetMetadataItem( GDAL_DMD_CREATIONFIELDDATASUBTYPES, + "Boolean Int16 Float32 JSON UUID" ); + + poDriver->pfnOpen = OGRParquetDriverOpen; + poDriver->pfnIdentify = OGRParquetDriverIdentify; + poDriver->pfnCreate = OGRParquetDriverCreate; + + GetGDALDriverManager()->RegisterDriver(poDriver.release()); +} diff --git a/ogr/ogrsf_frmts/parquet/ogrparquetlayer.cpp b/ogr/ogrsf_frmts/parquet/ogrparquetlayer.cpp new file mode 100644 index 000000000000..cb8ee61f649f --- /dev/null +++ b/ogr/ogrsf_frmts/parquet/ogrparquetlayer.cpp @@ -0,0 +1,1017 @@ 
+/****************************************************************************** + * + * Project: Parquet Translator + * Purpose: Implements OGRParquetDriver. + * Author: Even Rouault, + * + ****************************************************************************** + * Copyright (c) 2022, Planet Labs + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ ****************************************************************************/ + +#include "cpl_json.h" +#include "cpl_time.h" +#include "gdal_pam.h" +#include "ogrsf_frmts.h" +#include "ogr_p.h" + +#include +#include +#include +#include +#include + +#include "ogr_parquet.h" + +#include "../arrow_common/ograrrowlayer.hpp" +#include "../arrow_common/ograrrowdataset.hpp" + +/************************************************************************/ +/* OGRParquetLayer() */ +/************************************************************************/ + +OGRParquetLayer::OGRParquetLayer(OGRParquetDataset* poDS, + const char* pszLayerName, + std::unique_ptr&& arrow_reader): + OGRArrowLayer(poDS, pszLayerName), + m_poDS(poDS), + m_poArrowReader(std::move(arrow_reader)) +{ + const char* pszParquetBatchSize = CPLGetConfigOption("OGR_PARQUET_BATCH_SIZE", nullptr); + if( pszParquetBatchSize ) + m_poArrowReader->set_batch_size(CPLAtoGIntBig(pszParquetBatchSize)); + + EstablishFeatureDefn(); + CPLAssert( static_cast(m_aeGeomEncoding.size()) == m_poFeatureDefn->GetGeomFieldCount() ); +} + +/************************************************************************/ +/* LoadGeoMetadata() */ +/************************************************************************/ + +void OGRParquetLayer::LoadGeoMetadata() +{ + const auto metadata = m_poArrowReader->parquet_reader()->metadata(); + const auto& kv_metadata = metadata->key_value_metadata(); + if( kv_metadata && kv_metadata->Contains("geo") ) + { + auto geo = kv_metadata->Get("geo"); + if( geo.ok() ) + { + CPLDebug("PARQUET", "geo = %s", geo->c_str()); + CPLJSONDocument oDoc; + if( oDoc.LoadMemory(*geo) ) + { + auto oRoot = oDoc.GetRoot(); + const auto osVersion = oRoot.GetString("version"); + if( osVersion != "0.1.0" ) + { + CPLDebug("PARQUET", + "version = %s not explicitly handled by the driver", + osVersion.c_str()); + } + + auto oColumns = oRoot.GetObj("columns"); + if( oColumns.IsValid() ) + { + for( const auto oColumn: 
oColumns.GetChildren() ) + { + m_oMapGeometryColumns[oColumn.GetName()] = oColumn; + } + } + } + else + { + CPLError(CE_Warning, CPLE_AppDefined, + "Cannot parse 'geo' metadata"); + } + } + } +} + +/************************************************************************/ +/* EstablishFeatureDefn() */ +/************************************************************************/ + +void OGRParquetLayer::EstablishFeatureDefn() +{ + LoadGeoMetadata(); + + const auto metadata = m_poArrowReader->parquet_reader()->metadata(); + const auto& kv_metadata = metadata->key_value_metadata(); + const auto oMapFieldNameToGDALSchemaFieldDefn = LoadGDALMetadata(kv_metadata.get()); + + if( !m_poArrowReader->GetSchema(&m_poSchema).ok() ) + { + return; + } + + const auto fields = m_poSchema->fields(); + const auto poParquetSchema = metadata->schema(); + int iParquetCol = 0; + for( int i = 0; i < m_poSchema->num_fields(); ++i ) + { + const auto& field = fields[i]; + + const auto& field_kv_metadata = field->metadata(); + std::string osExtensionName; + if( field_kv_metadata ) + { + auto extension_name = kv_metadata->Get("ARROW:extension:name"); + if( extension_name.ok() ) + { + osExtensionName = *extension_name; + } +#ifdef DEBUG + CPLDebug("PARQUET", "Metadata field %s:", field->name().c_str()); + for(const auto& keyValue: field_kv_metadata->sorted_pairs() ) + { + CPLDebug("PARQUET", " %s = %s", + keyValue.first.c_str(), + keyValue.second.c_str()); + } +#endif + } + + bool bParquetColValid = CheckMatchArrowParquetColumnNames(iParquetCol, field); + if( !bParquetColValid ) + m_bHasMissingMappingToParquet = true; + + if( !m_osFIDColumn.empty() && + field->name() == m_osFIDColumn ) + { + m_iFIDArrowColumn = i; + if( bParquetColValid ) + { + m_iFIDParquetColumn = iParquetCol; + iParquetCol ++; + } + continue; + } + + bool bRegularField = true; + auto oIter = m_oMapGeometryColumns.find(field->name()); + if( oIter != m_oMapGeometryColumns.end() || + STARTS_WITH(osExtensionName.c_str(), 
"geoarrow.") ) + { + CPLJSONObject oJSONDef; + if( oIter != m_oMapGeometryColumns.end() ) + oJSONDef = oIter->second; + auto osEncoding = oJSONDef.GetString("encoding"); + if( osEncoding.empty() && !osExtensionName.empty() ) + osEncoding = osExtensionName; + + OGRwkbGeometryType eGeomType = wkbUnknown; + auto eGeomEncoding = OGRArrowGeomEncoding::WKB; + if( IsValidGeometryEncoding(field, osEncoding, eGeomType, eGeomEncoding) ) + { + bRegularField = false; + OGRGeomFieldDefn oField(field->name().c_str(), wkbUnknown); + + const auto osWKT = oJSONDef.GetString("crs"); + if( !oJSONDef.GetObj("crs").IsValid() ) + { + CPLError(CE_Warning, CPLE_AppDefined, + "Missing required 'crs' field for geometry column %s", + field->name().c_str()); + } + else if( !osWKT.empty() ) + { + OGRSpatialReference* poSRS = new OGRSpatialReference(); + poSRS->SetAxisMappingStrategy(OAMS_TRADITIONAL_GIS_ORDER); + if( poSRS->importFromWkt(osWKT.c_str()) == OGRERR_NONE ) + oField.SetSpatialRef(poSRS); + poSRS->Release(); + } + + // m_aeGeomEncoding be filled before calling ComputeGeometryColumnType() + m_aeGeomEncoding.push_back(eGeomEncoding); + if( eGeomType == wkbUnknown ) + { + auto osType = oJSONDef.GetString("geometry_type"); + if( osType.empty() ) + osType = oJSONDef.GetString("gdal:geometry_type"); + if( osType.empty() && CPLTestBool(CPLGetConfigOption( + "OGR_PARQUET_COMPUTE_GEOMETRY_TYPE", "YES")) ) + { + if( bParquetColValid && + poParquetSchema->Column(iParquetCol)->physical_type() == parquet::Type::BYTE_ARRAY ) + { + eGeomType = ComputeGeometryColumnType( + m_poFeatureDefn->GetGeomFieldCount(), iParquetCol); + } + } + else + eGeomType = GetGeometryTypeFromString(osType); + } + + oField.SetType(eGeomType); + oField.SetNullable(field->nullable()); + m_poFeatureDefn->AddGeomFieldDefn(&oField); + m_anMapGeomFieldIndexToArrowColumn.push_back(i); + m_anMapGeomFieldIndexToParquetColumn.push_back( bParquetColValid ? 
iParquetCol : -1 ); + if( bParquetColValid ) + iParquetCol ++; + } + } + + if( bRegularField ) + { + CreateFieldFromSchema(field, bParquetColValid, iParquetCol, {i}, + oMapFieldNameToGDALSchemaFieldDefn); + } + } + + CPLAssert( static_cast(m_anMapFieldIndexToArrowColumn.size()) == m_poFeatureDefn->GetFieldCount() ); + CPLAssert( static_cast(m_anMapFieldIndexToParquetColumn.size()) == m_poFeatureDefn->GetFieldCount() ); + CPLAssert( static_cast(m_anMapGeomFieldIndexToArrowColumn.size()) == m_poFeatureDefn->GetGeomFieldCount() ); + CPLAssert( static_cast(m_anMapGeomFieldIndexToParquetColumn.size()) == m_poFeatureDefn->GetGeomFieldCount() ); +} + +/************************************************************************/ +/* CheckMatchArrowParquetColumnNames() */ +/************************************************************************/ + +bool OGRParquetLayer::CheckMatchArrowParquetColumnNames(int& iParquetCol, + const std::shared_ptr& field) const +{ + const auto metadata = m_poArrowReader->parquet_reader()->metadata(); + const auto poParquetSchema = metadata->schema(); + const int nParquetColumns = poParquetSchema->num_columns(); + const auto fieldName = field->name(); + const int iParquetColBefore = iParquetCol; + + while( iParquetCol < nParquetColumns ) + { + const auto parquetColumn = poParquetSchema->Column(iParquetCol); + const auto parquetColumnName = parquetColumn->path()->ToDotString(); + if( fieldName == parquetColumnName || + (parquetColumnName.size() > fieldName.size() && + STARTS_WITH(parquetColumnName.c_str(), fieldName.c_str()) && + parquetColumnName[fieldName.size()] == '.') ) + { + return true; + } + else + { + iParquetCol ++; + } + } + + CPLError(CE_Warning, CPLE_AppDefined, + "Cannot match Arrow column name %s with a Parquet one", + fieldName.c_str()); + iParquetCol = iParquetColBefore; + return false; +} + +/************************************************************************/ +/* CreateFieldFromSchema() */ 
+/************************************************************************/ + +void OGRParquetLayer::CreateFieldFromSchema( + const std::shared_ptr& field, + bool bParquetColValid, + int& iParquetCol, + const std::vector& path, + const std::map>& oMapFieldNameToGDALSchemaFieldDefn) +{ + OGRFieldDefn oField(field->name().c_str(), OFTString); + OGRFieldType eType = OFTString; + OGRFieldSubType eSubType = OFSTNone; + bool bTypeOK = true; + + auto type = field->type(); + if( type->id() == arrow::Type::DICTIONARY && path.size() == 1 ) + { + const auto dictionaryType = std::static_pointer_cast(field->type()); + const auto indexType = dictionaryType->index_type(); + if( dictionaryType->value_type()->id() == arrow::Type::STRING && + IsIntegerArrowType(indexType->id()) ) + { + if( bParquetColValid ) + { + std::string osDomainName(field->name() + "Domain"); + m_poDS->RegisterDomainName(osDomainName, m_poFeatureDefn->GetFieldCount()); + oField.SetDomainName(osDomainName); + } + type = indexType; + } + else + { + bTypeOK = false; + } + } + + int nParquetColIncrement = 1; + switch( type->id() ) + { + case arrow::Type::STRUCT: + { + const auto subfields = field->Flatten(); + auto newpath = path; + newpath.push_back(0); + for( int j = 0; j < static_cast(subfields.size()); j++ ) + { + const auto& subfield = subfields[j]; + bParquetColValid = CheckMatchArrowParquetColumnNames(iParquetCol, subfield); + if( !bParquetColValid ) + m_bHasMissingMappingToParquet = true; + newpath.back() = j; + CreateFieldFromSchema(subfield, bParquetColValid, iParquetCol, + newpath, oMapFieldNameToGDALSchemaFieldDefn); + } + return; // return intended, not break + } + + case arrow::Type::MAP: + { + // A arrow map maps to 2 Parquet columns + nParquetColIncrement = 2; + break; + } + + default: + break; + + } + + if( bTypeOK ) + { + bTypeOK = MapArrowTypeToOGR(type, field, oField, eType, eSubType, + path, oMapFieldNameToGDALSchemaFieldDefn); + if( bTypeOK ) + { + 
m_anMapFieldIndexToParquetColumn.push_back(bParquetColValid ? iParquetCol : -1); + } + } + + if( bParquetColValid ) + iParquetCol += nParquetColIncrement; +}; + +/************************************************************************/ +/* BuildDomain() */ +/************************************************************************/ + +std::unique_ptr OGRParquetLayer::BuildDomain(const std::string& osDomainName, + int iFieldIndex) const +{ +#ifdef DEBUG + const int iArrowCol = m_anMapFieldIndexToArrowColumn[iFieldIndex][0]; + (void)iArrowCol; + CPLAssert( m_poSchema->fields()[iArrowCol]->type()->id() == arrow::Type::DICTIONARY ); +#endif + const int iParquetCol = m_anMapFieldIndexToParquetColumn[iFieldIndex]; + CPLAssert( iParquetCol >= 0 ); + std::shared_ptr poRecordBatchReader; + const auto oldBatchSize = m_poArrowReader->properties().batch_size(); + m_poArrowReader->set_batch_size(1); + m_poArrowReader->GetRecordBatchReader({0}, {iParquetCol}, + &poRecordBatchReader); + if( poRecordBatchReader != nullptr ) + { + std::shared_ptr poBatch; + auto status = poRecordBatchReader->ReadNext(&poBatch); + if( !status.ok() ) + { + CPLError(CE_Failure, CPLE_AppDefined, + "ReadNext() failed: %s", + status.message().c_str()); + } + else if( poBatch ) + { + m_poArrowReader->set_batch_size(oldBatchSize); + return BuildDomainFromBatch(osDomainName, poBatch, 0); + } + } + m_poArrowReader->set_batch_size(oldBatchSize); + return nullptr; +} + +/************************************************************************/ +/* ComputeGeometryColumnType() */ +/************************************************************************/ + +OGRwkbGeometryType OGRParquetLayer::ComputeGeometryColumnType(int iGeomCol, + int iParquetCol) const +{ + // Compute type of geometry column by iterating over each geometry, and + // looking at the WKB geometry type in the first 5 bytes of each geometry. 
+ + OGRwkbGeometryType eGeomType = wkbNone; + std::shared_ptr poRecordBatchReader; + + std::vector anRowGroups; + const int nNumGroups = m_poArrowReader->num_row_groups(); + anRowGroups.reserve(nNumGroups); + for( int i = 0; i < nNumGroups; ++i ) + anRowGroups.push_back(i); + m_poArrowReader->GetRecordBatchReader(anRowGroups, {iParquetCol}, + &poRecordBatchReader); + if( poRecordBatchReader != nullptr ) + { + std::shared_ptr poBatch; + while( true ) + { + auto status = poRecordBatchReader->ReadNext(&poBatch); + if( !status.ok() ) + { + CPLError(CE_Failure, CPLE_AppDefined, + "ReadNext() failed: %s", + status.message().c_str()); + break; + } + else if( !poBatch ) + break; + + eGeomType = ComputeGeometryColumnTypeProcessBatch( + poBatch, iGeomCol, 0, eGeomType); + if( eGeomType == wkbUnknown ) + break; + } + } + + return eGeomType == wkbNone ? wkbUnknown : eGeomType; +} + +/************************************************************************/ +/* GetFeatureExplicitFID() */ +/************************************************************************/ + +OGRFeature* OGRParquetLayer::GetFeatureExplicitFID(GIntBig nFID) +{ + std::shared_ptr poRecordBatchReader; + + std::vector anRowGroups; + const int nNumGroups = m_poArrowReader->num_row_groups(); + anRowGroups.reserve(nNumGroups); + for( int i = 0; i < nNumGroups; ++i ) + anRowGroups.push_back(i); + if( m_bIgnoredFields ) + { + m_poArrowReader->GetRecordBatchReader(anRowGroups, + m_anRequestedParquetColumns, + &poRecordBatchReader); + } + else + { + m_poArrowReader->GetRecordBatchReader(anRowGroups, + &poRecordBatchReader); + } + if( poRecordBatchReader != nullptr ) + { + std::shared_ptr poBatch; + while( true ) + { + auto status = poRecordBatchReader->ReadNext(&poBatch); + if( !status.ok() ) + { + CPLError(CE_Failure, CPLE_AppDefined, + "ReadNext() failed: %s", + status.message().c_str()); + break; + } + else if( !poBatch ) + break; + + const auto array = poBatch->column( + m_bIgnoredFields ? 
m_nRequestedFIDColumn : m_iFIDArrowColumn ); + const auto arrayPtr = array.get(); + const auto arrayTypeId = array->type_id(); + for( int64_t nIdxInBatch = 0; nIdxInBatch < poBatch->num_rows(); nIdxInBatch++ ) + { + if( !array->IsNull(nIdxInBatch) ) + { + if( arrayTypeId == arrow::Type::INT64 ) + { + const auto castArray = static_cast(arrayPtr); + if( castArray->Value(nIdxInBatch) == nFID ) + { + return ReadFeature(nIdxInBatch, poBatch->columns()); + } + } + else if( arrayTypeId == arrow::Type::INT32 ) + { + const auto castArray = static_cast(arrayPtr); + if( castArray->Value(nIdxInBatch) == nFID ) + { + return ReadFeature(nIdxInBatch, poBatch->columns()); + } + } + } + } + } + } + return nullptr; +} + +/************************************************************************/ +/* GetFeatureByIndex() */ +/************************************************************************/ + +OGRFeature* OGRParquetLayer::GetFeatureByIndex(GIntBig nFID) +{ + + if( nFID < 0 ) + return nullptr; + + const auto metadata = m_poArrowReader->parquet_reader()->metadata(); + const int nNumGroups = m_poArrowReader->num_row_groups(); + int64_t nAccRows = 0; + for( int iGroup = 0; iGroup < nNumGroups; ++iGroup ) + { + const int64_t nNextAccRows = nAccRows + metadata->RowGroup(iGroup)->num_rows(); + if( nFID < nNextAccRows ) + { + std::shared_ptr poRecordBatchReader; + if( m_bIgnoredFields ) + { + m_poArrowReader->GetRecordBatchReader({iGroup}, + m_anRequestedParquetColumns, + &poRecordBatchReader); + } + else + { + m_poArrowReader->GetRecordBatchReader({iGroup}, + &poRecordBatchReader); + } + if( poRecordBatchReader == nullptr ) + { + CPLError(CE_Failure, CPLE_AppDefined, + "GetRecordBatchReader() failed"); + return nullptr; + } + + const int64_t nExpectedIdxInGroup = nFID - nAccRows; + int64_t nIdxInGroup = 0; + while( true ) + { + std::shared_ptr poBatch; + auto status = poRecordBatchReader->ReadNext(&poBatch); + if( !status.ok() ) + { + CPLError(CE_Failure, CPLE_AppDefined, + 
"ReadNext() failed: %s", + status.message().c_str()); + return nullptr; + } + if( poBatch == nullptr ) + { + return nullptr; + } + if( nExpectedIdxInGroup < nIdxInGroup + poBatch->num_rows() ) + { + const auto nIdxInBatch = nExpectedIdxInGroup - nIdxInGroup; + auto poFeature = ReadFeature(nIdxInBatch, poBatch->columns()); + poFeature->SetFID(nFID); + return poFeature; + } + nIdxInGroup += poBatch->num_rows(); + } + } + nAccRows = nNextAccRows; + } + return nullptr; +} + +/************************************************************************/ +/* GetFeature() */ +/************************************************************************/ + +OGRFeature* OGRParquetLayer::GetFeature(GIntBig nFID) +{ + if( !m_osFIDColumn.empty() ) + { + return GetFeatureExplicitFID(nFID); + } + else + { + return GetFeatureByIndex(nFID); + } +} + +/************************************************************************/ +/* ResetReading() */ +/************************************************************************/ + +void OGRParquetLayer::ResetReading() +{ + if( m_iRecordBatch != 0 ) + { + m_poRecordBatchReader.reset(); + } + OGRArrowLayer::ResetReading(); +} + +/************************************************************************/ +/* ReadNextBatch() */ +/************************************************************************/ + +bool OGRParquetLayer::ReadNextBatch() +{ + m_nIdxInBatch = 0; + + if( m_bSingleBatch ) + { + CPLAssert( m_iRecordBatch == 0); + CPLAssert( m_poBatch != nullptr); + return false; + } + + CPLAssert( (m_iRecordBatch == -1 && m_poRecordBatchReader == nullptr) || + (m_iRecordBatch >= 0 && m_poRecordBatchReader != nullptr) ); + + if( m_poRecordBatchReader == nullptr ) + { + std::vector anRowGroups; + const int nNumGroups = m_poArrowReader->num_row_groups(); + anRowGroups.reserve(nNumGroups); + for( int i = 0; i < nNumGroups; ++i ) + anRowGroups.push_back(i); + if( m_bIgnoredFields ) + { + m_poArrowReader->GetRecordBatchReader(anRowGroups, + 
m_anRequestedParquetColumns, + &m_poRecordBatchReader); + } + else + { + m_poArrowReader->GetRecordBatchReader(anRowGroups, + &m_poRecordBatchReader); + } + if( m_poRecordBatchReader == nullptr ) + { + CPLError(CE_Failure, CPLE_AppDefined, + "GetRecordBatchReader() failed"); + return false; + } + } + + ++m_iRecordBatch; + + std::shared_ptr poNextBatch; + auto status = m_poRecordBatchReader->ReadNext(&poNextBatch); + if( !status.ok() ) + { + CPLError(CE_Failure, CPLE_AppDefined, + "ReadNext() failed: %s", + status.message().c_str()); + poNextBatch.reset(); + } + if( poNextBatch == nullptr ) + { + if( m_iRecordBatch == 1 ) + { + m_iRecordBatch = 0; + m_bSingleBatch = true; + } + else + m_poBatch.reset(); + return false; + } + m_poBatch = std::move(poNextBatch); + +#ifdef DEBUG + const auto& poColumns = m_poBatch->columns(); + + // Sanity checks + CPLAssert(m_poBatch->num_columns() == + (m_bIgnoredFields ? m_nExpectedBatchColumns : m_poSchema->num_fields())); + + for( int i = 0; i < m_poFeatureDefn->GetFieldCount(); ++i ) + { + int iCol; + if( m_bIgnoredFields ) + { + iCol = m_anMapFieldIndexToArrayIndex[i]; + if( iCol < 0 ) + continue; + } + else + { + iCol = m_anMapFieldIndexToArrowColumn[i][0]; + } + + CPLAssert(iCol < static_cast(poColumns.size())); + CPLAssert(m_poSchema->fields()[m_anMapFieldIndexToArrowColumn[i][0]]->type()->id() == + poColumns[iCol]->type_id()); + } + + for( int i = 0; i < m_poFeatureDefn->GetGeomFieldCount(); ++i ) + { + int iCol; + if( m_bIgnoredFields ) + { + iCol = m_anMapGeomFieldIndexToArrayIndex[i]; + if( iCol < 0 ) + continue; + } + else + { + iCol = m_anMapGeomFieldIndexToArrowColumn[i]; + } + + CPLAssert(iCol < static_cast(poColumns.size())); + CPLAssert(m_poSchema->fields()[m_anMapGeomFieldIndexToArrowColumn[i]]->type()->id() == + poColumns[iCol]->type_id()); + } +#endif + + return true; +} + +/************************************************************************/ +/* SetIgnoredFields() */ 
+/************************************************************************/ + +OGRErr OGRParquetLayer::SetIgnoredFields( const char **papszFields ) +{ + m_bIgnoredFields = false; + m_anRequestedParquetColumns.clear(); + m_anMapFieldIndexToArrayIndex.clear(); + m_anMapGeomFieldIndexToArrayIndex.clear(); + m_nRequestedFIDColumn = -1; + OGRErr eErr = OGRLayer::SetIgnoredFields(papszFields); + if( !m_bHasMissingMappingToParquet && eErr == OGRERR_NONE ) + { + m_bIgnoredFields = papszFields != nullptr && papszFields[0] != nullptr; + if( m_bIgnoredFields ) + { + int nBatchColumns = 0; + if( m_iFIDParquetColumn >= 0 ) + { + m_nRequestedFIDColumn = nBatchColumns; + nBatchColumns ++; + m_anRequestedParquetColumns.push_back(m_iFIDParquetColumn); + } + + for( int i = 0; i < m_poFeatureDefn->GetFieldCount(); ++i ) + { + const auto eArrowType = + m_poSchema->fields()[m_anMapFieldIndexToArrowColumn[i][0]]->type()->id(); + if( eArrowType == arrow::Type::STRUCT ) + { + // For a struct, for the sake of simplicity in GetNextRawFeature(), + // as soon as one of the member if requested, request all + // Parquet columns, so that the Arrow type doesn't change + bool bFoundNotIgnored = false; + for( int j = i; j < m_poFeatureDefn->GetFieldCount() && + m_anMapFieldIndexToArrowColumn[i][0] == + m_anMapFieldIndexToArrowColumn[j][0]; ++j ) + { + if( !m_poFeatureDefn->GetFieldDefn(j)->IsIgnored() ) + { + bFoundNotIgnored = true; + break; + } + } + if( bFoundNotIgnored ) + { + int j; + for( j = i; j < m_poFeatureDefn->GetFieldCount() && + m_anMapFieldIndexToArrowColumn[i][0] == + m_anMapFieldIndexToArrowColumn[j][0]; ++j ) + { + const int iParquetCol = m_anMapFieldIndexToParquetColumn[j]; + CPLAssert(iParquetCol >= 0); + if( !m_poFeatureDefn->GetFieldDefn(j)->IsIgnored() ) + { + m_anMapFieldIndexToArrayIndex.push_back(nBatchColumns); + } + else + { + m_anMapFieldIndexToArrayIndex.push_back(-1); + } + m_anRequestedParquetColumns.push_back(iParquetCol); + } + i = j - 1; + nBatchColumns ++; + } + 
else + { + int j; + for( j = i; j < m_poFeatureDefn->GetFieldCount() && + m_anMapFieldIndexToArrowColumn[i][0] == + m_anMapFieldIndexToArrowColumn[j][0]; ++j ) + { + m_anMapFieldIndexToArrayIndex.push_back(-1); + } + i = j - 1; + } + } + else if( !m_poFeatureDefn->GetFieldDefn(i)->IsIgnored() ) + { + const int iParquetCol = m_anMapFieldIndexToParquetColumn[i]; + CPLAssert(iParquetCol >= 0); + m_anMapFieldIndexToArrayIndex.push_back(nBatchColumns); + nBatchColumns ++; + m_anRequestedParquetColumns.push_back(iParquetCol); + if( eArrowType == arrow::Type::MAP ) + { + // For a map, request both keys and items Parquet columns + m_anRequestedParquetColumns.push_back(iParquetCol + 1); + } + } + else + { + m_anMapFieldIndexToArrayIndex.push_back(-1); + } + } + + CPLAssert(static_cast(m_anMapFieldIndexToArrayIndex.size()) == + m_poFeatureDefn->GetFieldCount() ); + + for( int i = 0; i < m_poFeatureDefn->GetGeomFieldCount(); ++i ) + { + if( !m_poFeatureDefn->GetGeomFieldDefn(i)->IsIgnored() ) + { + const int iParquetCol = m_anMapGeomFieldIndexToParquetColumn[i]; + CPLAssert(iParquetCol >= 0); + m_anMapGeomFieldIndexToArrayIndex.push_back(nBatchColumns); + nBatchColumns ++; + m_anRequestedParquetColumns.push_back(iParquetCol); + } + else + { + m_anMapGeomFieldIndexToArrayIndex.push_back(-1); + } + } + + CPLAssert(static_cast(m_anMapGeomFieldIndexToArrayIndex.size()) == + m_poFeatureDefn->GetGeomFieldCount() ); +#ifdef DEBUG + m_nExpectedBatchColumns = nBatchColumns; +#endif + } + } + + // Full invalidation + m_iRecordBatch = -1; + m_bSingleBatch = false; + ResetReading(); + + return eErr; +} + +/************************************************************************/ +/* GetFeatureCount() */ +/************************************************************************/ + +GIntBig OGRParquetLayer::GetFeatureCount(int bForce) +{ + if( m_poAttrQuery == nullptr && m_poFilterGeom == nullptr ) + { + auto metadata = m_poArrowReader->parquet_reader()->metadata(); + if( metadata ) + 
return metadata->num_rows(); + } + return OGRLayer::GetFeatureCount(bForce); +} + +/************************************************************************/ +/* TestCapability() */ +/************************************************************************/ + +int OGRParquetLayer::TestCapability(const char* pszCap) +{ + if( EQUAL(pszCap, OLCFastFeatureCount) ) + return m_poAttrQuery == nullptr && m_poFilterGeom == nullptr; + + if( EQUAL(pszCap, OLCFastGetExtent) ) + { + for(int i = 0; i < m_poFeatureDefn->GetGeomFieldCount(); i++ ) + { + auto oIter = m_oMapGeometryColumns.find( + m_poFeatureDefn->GetGeomFieldDefn(i)->GetNameRef() ); + if( oIter == m_oMapGeometryColumns.end() ) + { + return false; + } + const auto& oJSONDef = oIter->second; + const auto oBBox = oJSONDef.GetArray("bbox"); + if( !(oBBox.IsValid() && oBBox.Size() == 4) ) + { + return false; + } + } + return true; + } + + if( EQUAL(pszCap, OLCStringsAsUTF8) ) + return true; + + if( EQUAL(pszCap, OLCMeasuredGeometries) ) + return true; + + if( EQUAL(pszCap, OLCIgnoreFields) ) + return !m_bHasMissingMappingToParquet; + + return false; +} + +/************************************************************************/ +/* GetMetadataItem() */ +/************************************************************************/ + +const char* OGRParquetLayer::GetMetadataItem( const char* pszName, + const char* pszDomain ) +{ + // Mostly for unit test purposes + if( pszDomain != nullptr && EQUAL(pszDomain, "_PARQUET_") ) + { + int nRowGroupIdx = -1; + int nColumn = -1; + if( EQUAL(pszName, "NUM_ROW_GROUPS") ) + { + return CPLSPrintf("%d", m_poArrowReader->num_row_groups()); + } + else if( sscanf(pszName, "ROW_GROUPS[%d]", &nRowGroupIdx) == 1 && + strstr(pszName, ".NUM_ROWS") ) + { + try + { + auto poRowGroup = m_poArrowReader->parquet_reader()->RowGroup(nRowGroupIdx); + if( poRowGroup == nullptr ) + return nullptr; + return CPLSPrintf("%" PRId64, poRowGroup->metadata()->num_rows()); + } + catch( const std::exception& ) + 
{ + } + } + else if( sscanf(pszName, "ROW_GROUPS[%d].COLUMNS[%d]", &nRowGroupIdx, &nColumn) == 2 && + strstr(pszName, ".COMPRESSION") ) + { + try + { + auto poRowGroup = m_poArrowReader->parquet_reader()->RowGroup(nRowGroupIdx); + if( poRowGroup == nullptr ) + return nullptr; + auto poColumn = poRowGroup->metadata()->ColumnChunk(nColumn); + return CPLSPrintf("%s", + arrow::util::Codec::GetCodecAsString(poColumn->compression()).c_str()); + } + catch( const std::exception& ) + { + } + } + return nullptr; + } + if( pszDomain != nullptr && EQUAL(pszDomain, "_PARQUET_METADATA_") ) + { + const auto metadata = m_poArrowReader->parquet_reader()->metadata(); + const auto& kv_metadata = metadata->key_value_metadata(); + if( kv_metadata && kv_metadata->Contains(pszName) ) + { + auto metadataItem = kv_metadata->Get(pszName); + if( metadataItem.ok() ) + { + return CPLSPrintf("%s", metadataItem->c_str()); + } + } + return nullptr; + } + return OGRLayer::GetMetadataItem(pszName, pszDomain); +} + +/************************************************************************/ +/* GetMetadata() */ +/************************************************************************/ + +char** OGRParquetLayer::GetMetadata( const char* pszDomain ) +{ + // Mostly for unit test purposes + if( pszDomain != nullptr && EQUAL(pszDomain, "_PARQUET_METADATA_") ) + { + m_aosFeatherMetadata.Clear(); + const auto metadata = m_poArrowReader->parquet_reader()->metadata(); + const auto& kv_metadata = metadata->key_value_metadata(); + if( kv_metadata ) + { + for( const auto& kv: kv_metadata->sorted_pairs() ) + { + m_aosFeatherMetadata.SetNameValue(kv.first.c_str(), kv.second.c_str()); + } + } + return m_aosFeatherMetadata.List(); + } + return OGRLayer::GetMetadata(pszDomain); +} diff --git a/ogr/ogrsf_frmts/parquet/ogrparquetwriterdataset.cpp b/ogr/ogrsf_frmts/parquet/ogrparquetwriterdataset.cpp new file mode 100644 index 000000000000..f40de14028fd --- /dev/null +++ 
b/ogr/ogrsf_frmts/parquet/ogrparquetwriterdataset.cpp @@ -0,0 +1,132 @@ +/****************************************************************************** + * + * Project: Parquet Translator + * Purpose: Implements OGRParquetDriver. + * Author: Even Rouault, + * + ****************************************************************************** + * Copyright (c) 2022, Planet Labs + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ ****************************************************************************/ + +#include "ogr_parquet.h" + +#include "../arrow_common/ograrrowwriterlayer.hpp" + +/************************************************************************/ +/* OGRParquetWriterDataset() */ +/************************************************************************/ + +OGRParquetWriterDataset::OGRParquetWriterDataset( + const std::shared_ptr& poOutputStream): + m_poMemoryPool(arrow::MemoryPool::CreateDefault()), + m_poOutputStream(poOutputStream) +{ +} + +/************************************************************************/ +/* GetLayerCount() */ +/************************************************************************/ + +int OGRParquetWriterDataset::GetLayerCount() +{ + return m_poLayer ? 1 : 0; +} + +/************************************************************************/ +/* GetLayer() */ +/************************************************************************/ + +OGRLayer* OGRParquetWriterDataset::GetLayer(int idx) +{ + return idx == 0 ? 
m_poLayer.get() : nullptr; +} + +/************************************************************************/ +/* TestCapability() */ +/************************************************************************/ + +int OGRParquetWriterDataset::TestCapability(const char* pszCap) +{ + if( EQUAL(pszCap, ODsCCreateLayer) ) + return m_poLayer == nullptr; + if( EQUAL(pszCap, ODsCAddFieldDomain) ) + return m_poLayer != nullptr; + return false; +} + +/************************************************************************/ +/* ICreateLayer() */ +/************************************************************************/ + +OGRLayer* OGRParquetWriterDataset::ICreateLayer( const char *pszName, + OGRSpatialReference *poSpatialRef, + OGRwkbGeometryType eGType, + char ** papszOptions ) +{ + if( m_poLayer ) + { + CPLError(CE_Failure, CPLE_NotSupported, + "Can write only one layer in a Parquet file"); + return nullptr; + } + m_poLayer = cpl::make_unique(m_poMemoryPool.get(), + m_poOutputStream, + pszName); + if( !m_poLayer->SetOptions(papszOptions, poSpatialRef, eGType) ) + { + m_poLayer.reset(); + return nullptr; + } + return m_poLayer.get(); +} + +/************************************************************************/ +/* AddFieldDomain() */ +/************************************************************************/ + +bool OGRParquetWriterDataset::AddFieldDomain(std::unique_ptr&& domain, + std::string& failureReason) +{ + if( m_poLayer == nullptr ) + { + failureReason = "Layer must be created"; + return false; + } + return m_poLayer->AddFieldDomain(std::move(domain), failureReason); +} + +/************************************************************************/ +/* GetFieldDomainNames() */ +/************************************************************************/ + +std::vector OGRParquetWriterDataset::GetFieldDomainNames(CSLConstList) const +{ + return m_poLayer ? 
m_poLayer->GetFieldDomainNames() : std::vector(); +} + +/************************************************************************/ +/* GetFieldDomain() */ +/************************************************************************/ + +const OGRFieldDomain* OGRParquetWriterDataset::GetFieldDomain(const std::string& name) const +{ + return m_poLayer ? m_poLayer->GetFieldDomain(name): nullptr; +} diff --git a/ogr/ogrsf_frmts/parquet/ogrparquetwriterlayer.cpp b/ogr/ogrsf_frmts/parquet/ogrparquetwriterlayer.cpp new file mode 100644 index 000000000000..3ef75da346f9 --- /dev/null +++ b/ogr/ogrsf_frmts/parquet/ogrparquetwriterlayer.cpp @@ -0,0 +1,402 @@ +/****************************************************************************** + * + * Project: Parquet Translator + * Purpose: Implements OGRParquetDriver. + * Author: Even Rouault, + * + ****************************************************************************** + * Copyright (c) 2022, Planet Labs + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + ****************************************************************************/ + +#include "ogr_parquet.h" + +#include "../arrow_common/ograrrowwriterlayer.hpp" + +/************************************************************************/ +/* OGRParquetWriterLayer() */ +/************************************************************************/ + +OGRParquetWriterLayer::OGRParquetWriterLayer( + arrow::MemoryPool* poMemoryPool, + const std::shared_ptr& poOutputStream, + const char *pszLayerName): + OGRArrowWriterLayer(poMemoryPool, poOutputStream, pszLayerName) +{ +} + +/************************************************************************/ +/* ~OGRParquetWriterLayer() */ +/************************************************************************/ + +OGRParquetWriterLayer::~OGRParquetWriterLayer() +{ + if( m_bInitializationOK ) + FinalizeWriting(); +} + +/************************************************************************/ +/* SetOptions() */ +/************************************************************************/ + +bool OGRParquetWriterLayer::SetOptions(CSLConstList papszOptions, + OGRSpatialReference *poSpatialRef, + OGRwkbGeometryType eGType) +{ + const char* pszGeomEncoding = CSLFetchNameValue(papszOptions, "GEOMETRY_ENCODING"); + m_eGeomEncoding = OGRArrowGeomEncoding::WKB; + if( pszGeomEncoding ) + { + if( EQUAL(pszGeomEncoding, "WKB") ) + m_eGeomEncoding = OGRArrowGeomEncoding::WKB; + else if( EQUAL(pszGeomEncoding, "WKT") ) + m_eGeomEncoding = OGRArrowGeomEncoding::WKT; + else if( EQUAL(pszGeomEncoding, "GEOARROW") ) + m_eGeomEncoding = OGRArrowGeomEncoding::GEOARROW_GENERIC; + else + { + CPLError(CE_Failure, CPLE_NotSupported, + "Unsupported GEOMETRY_ENCODING = %s", + 
pszGeomEncoding); + return false; + } + } + + if( eGType != wkbNone ) + { + if( !IsSupportedGeometryType(eGType) ) + { + return false; + } + + if( poSpatialRef == nullptr ) + { + CPLError(CE_Warning, CPLE_AppDefined, + "Geometry column should have an associated CRS"); + } + + m_poFeatureDefn->SetGeomType(eGType); + auto eGeomEncoding = m_eGeomEncoding; + if( eGeomEncoding == OGRArrowGeomEncoding::GEOARROW_GENERIC ) + { + eGeomEncoding = GetPreciseArrowGeomEncoding(eGType); + if( eGeomEncoding == OGRArrowGeomEncoding::GEOARROW_GENERIC ) + return false; + } + m_aeGeomEncoding.push_back(eGeomEncoding); + m_poFeatureDefn->GetGeomFieldDefn(0)->SetName( + CSLFetchNameValueDef(papszOptions, "GEOMETRY_NAME", "geometry")); + if( poSpatialRef ) + { + auto poSRS = poSpatialRef->Clone(); + m_poFeatureDefn->GetGeomFieldDefn(0)->SetSpatialRef(poSRS); + poSRS->Release(); + } + } + + m_osFIDColumn = CSLFetchNameValueDef(papszOptions, "FID", ""); + + const char* pszCompression = CSLFetchNameValue( + papszOptions, "COMPRESSION"); + if( pszCompression == nullptr ) + { + auto oResult = arrow::util::Codec::GetCompressionType("snappy"); + if( oResult.ok() && arrow::util::Codec::IsAvailable(*oResult) ) + { + pszCompression = "SNAPPY"; + } + else + { + pszCompression = "NONE"; + } + } + + if( EQUAL(pszCompression, "NONE") ) + pszCompression = "UNCOMPRESSED"; + auto oResult = arrow::util::Codec::GetCompressionType( + CPLString(pszCompression).tolower()); + if( !oResult.ok() ) + { + CPLError(CE_Failure, CPLE_NotSupported, + "Unrecognized compression method: %s", pszCompression); + return false; + } + m_eCompression = *oResult; + if( !arrow::util::Codec::IsAvailable(m_eCompression) ) + { + CPLError(CE_Failure, CPLE_NotSupported, + "Compression method %s is known, but libarrow has not " + "been built with support for it", pszCompression); + return false; + } + + const char* pszRowGroupSize = CSLFetchNameValue(papszOptions, "ROW_GROUP_SIZE"); + if( pszRowGroupSize ) + { + auto nRowGroupSize = 
static_cast(atoll(pszRowGroupSize)); + if( nRowGroupSize > 0 ) + { + if( nRowGroupSize > INT_MAX ) + nRowGroupSize = INT_MAX; + m_nRowGroupSize = nRowGroupSize; + } + } + + m_bInitializationOK = true; + return true; +} + +/************************************************************************/ +/* CloseFileWriter() */ +/************************************************************************/ + +void OGRParquetWriterLayer::CloseFileWriter() +{ + auto status = m_poFileWriter->Close(); + if( !status.ok() ) + { + CPLError(CE_Failure, CPLE_AppDefined, + "FileWriter::Close() failed with %s", + status.message().c_str()); + } +} + +/************************************************************************/ +/* DoSomethingBeforeFinalFlushGroup() */ +/************************************************************************/ + +void OGRParquetWriterLayer::DoSomethingBeforeFinalFlushGroup() +{ + if( m_poKeyValueMetadata && + m_poFeatureDefn->GetGeomFieldCount() != 0 && + CPLTestBool(CPLGetConfigOption("OGR_PARQUET_WRITE_GEO", "YES")) ) + { + CPLJSONObject oRoot; + oRoot.Add("version", "0.1.0"); + oRoot.Add("primary_column", + m_poFeatureDefn->GetGeomFieldDefn(0)->GetNameRef()); + CPLJSONObject oColumns; + oRoot.Add("columns", oColumns); + for( int i = 0; i < m_poFeatureDefn->GetGeomFieldCount(); ++i ) + { + const auto poGeomFieldDefn = m_poFeatureDefn->GetGeomFieldDefn(i); + CPLJSONObject oColumn; + oColumns.Add(poGeomFieldDefn->GetNameRef(), oColumn); + oColumn.Add("encoding", + GetGeomEncodingAsString(m_aeGeomEncoding[i])); + + const auto poSRS = poGeomFieldDefn->GetSpatialRef(); + if( poSRS ) + { + const char* const apszOptions[] = { + "FORMAT=WKT2_2019", "MULTILINE=NO", nullptr }; + char* pszWKT = nullptr; + poSRS->exportToWkt(&pszWKT, apszOptions); + if( pszWKT ) + oColumn.Add("crs", pszWKT); + CPLFree(pszWKT); + } + + if( m_aoEnvelopes[i].IsInit() && + CPLTestBool(CPLGetConfigOption( + "OGR_PARQUET_WRITE_BBOX", "YES")) ) + { + CPLJSONArray oBBOX; + 
oBBOX.Add(m_aoEnvelopes[i].MinX); + oBBOX.Add(m_aoEnvelopes[i].MinY); + oBBOX.Add(m_aoEnvelopes[i].MaxX); + oBBOX.Add(m_aoEnvelopes[i].MaxY); + oColumn.Add("bbox", oBBOX); + } + + if( CPLTestBool(CPLGetConfigOption( + "OGR_PARQUET_WRITE_GDAL_GEOMETRY_TYPE", "YES")) ) + { + // Geometry type, place under a temporary "gdal:geometry_type" property + // pending acceptance of proposal at + // https://github.com/opengeospatial/geoparquet/issues/41 + const auto eType = poGeomFieldDefn->GetType(); + const char* pszType = "mixed"; + if( wkbPoint == eType ) + pszType = "Point"; + else if( wkbLineString == eType ) + pszType = "LineString"; + else if( wkbPolygon == eType ) + pszType = "Polygon"; + else if( wkbMultiPoint == eType ) + pszType = "MultiPoint"; + else if( wkbMultiLineString == eType ) + pszType = "MultiLineString"; + else if( wkbMultiPolygon == eType ) + pszType = "MultiPolygon"; + else if( wkbGeometryCollection == eType ) + pszType = "GeometryCollection"; + oColumn.Add("gdal:geometry_type", pszType); + } + } + + // HACK: it would be good for Arrow to provide a clean way to alter + // key value metadata before finalizing. + // We need to write metadata at end to write the bounding box. 
+        const_cast(m_poKeyValueMetadata.get())->Append(
+            "geo", oRoot.Format(CPLJSONObject::PrettyFormat::Plain));
+    }
+}
+
+/************************************************************************/
+/*                         GetSchemaMetadata()                          */
+/************************************************************************/
+
+// From ${arrow_root}/src/parquet/arrow/writer.cpp
+//
+// Builds the key/value metadata to store in the Parquet file for `schema`.
+//
+// - If properties.store_schema() is false, *out is set to nullptr.
+// - Otherwise *out receives a copy of the schema metadata (or a new empty
+//   container when the schema has none). Unless the
+//   OGR_PARQUET_WRITE_ARROW_SCHEMA configuration option evaluates to false
+//   (default "YES"), the base64-encoded serialized Arrow schema is appended
+//   to it under the "ARROW:schema" key.
+//
+// Returns arrow::Status::OK() on success, or the error raised while
+// serializing the schema.
+//
+// NOTE(review): the template arguments in this function were missing from
+// the reviewed copy and have been restored to match the upstream Arrow
+// signatures - confirm against the original patch.
+static
+arrow::Status GetSchemaMetadata(const ::arrow::Schema& schema, ::arrow::MemoryPool* pool,
+                                const parquet::ArrowWriterProperties& properties,
+                                std::shared_ptr<const arrow::KeyValueMetadata>* out) {
+  if (!properties.store_schema()) {
+    *out = nullptr;
+    return arrow::Status::OK();
+  }
+
+  static const std::string kArrowSchemaKey = "ARROW:schema";
+  std::shared_ptr<arrow::KeyValueMetadata> result;
+  if (schema.metadata()) {
+    result = schema.metadata()->Copy();
+  } else {
+    result = std::make_shared<arrow::KeyValueMetadata>();
+  }
+
+  if( CPLTestBool(CPLGetConfigOption("OGR_PARQUET_WRITE_ARROW_SCHEMA", "YES")) )
+  {
+      ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Buffer> serialized,
+                            ::arrow::ipc::SerializeSchema(schema, pool));
+
+      // The serialized schema is not UTF-8, which is required for Thrift
+      std::string schema_as_string = serialized->ToString();
+      std::string schema_base64 = ::arrow::util::base64_encode(schema_as_string);
+      result->Append(kArrowSchemaKey, schema_base64);
+  }
+  *out = result;
+  return arrow::Status::OK();
+}
+
+/************************************************************************/
+/*                                Open()                                */
+/************************************************************************/
+
+// Same as parquet::arrow::FileWriter::Open(), except we also
+// return KeyValueMetadata
+//
+// Steps: convert `schema` to a Parquet schema, build the file key/value
+// metadata with GetSchemaMetadata(), open a parquet::ParquetFileWriter on
+// `sink`, and wrap it into a parquet::arrow::FileWriter stored in *writer.
+//
+// *outMetadata receives the very metadata object handed to the file writer.
+// It is assigned before the file writer is opened, so it may be set even
+// when a later step fails. *writer is only assigned by the final
+// FileWriter::Make() call.
+//
+// Returns the status of the first failing step, or the status of
+// parquet::arrow::FileWriter::Make().
+//
+// NOTE(review): template arguments restored from the upstream Arrow
+// signature of FileWriter::Open() - confirm against the original patch.
+static
+arrow::Status Open(const ::arrow::Schema& schema, ::arrow::MemoryPool* pool,
+                   std::shared_ptr<::arrow::io::OutputStream> sink,
+                   std::shared_ptr<parquet::WriterProperties> properties,
+                   std::shared_ptr<parquet::ArrowWriterProperties> arrow_properties,
+                   std::unique_ptr<parquet::arrow::FileWriter>* writer,
+                   std::shared_ptr<const arrow::KeyValueMetadata>* outMetadata) {
+  std::shared_ptr<parquet::SchemaDescriptor> parquet_schema;
+  RETURN_NOT_OK(
+      parquet::arrow::ToParquetSchema(&schema, *properties, *arrow_properties, &parquet_schema));
+
+  auto schema_node = std::static_pointer_cast<parquet::schema::GroupNode>(parquet_schema->schema_root());
+
+  std::shared_ptr<const arrow::KeyValueMetadata> metadata;
+  RETURN_NOT_OK(GetSchemaMetadata(schema, pool, *arrow_properties, &metadata));
+
+  *outMetadata = metadata;
+
+  std::unique_ptr<parquet::ParquetFileWriter> base_writer;
+  PARQUET_CATCH_NOT_OK(base_writer = parquet::ParquetFileWriter::Open(
+                                                    std::move(sink), schema_node,
+                                                    std::move(properties),
+                                                    metadata));
+
+  auto schema_ptr = std::make_shared<::arrow::Schema>(schema);
+  return parquet::arrow::FileWriter::Make(
+        pool, std::move(base_writer), std::move(schema_ptr),
+        std::move(arrow_properties), writer);
+}
+
+/************************************************************************/
+/*                            CreateSchema()                            */
+/************************************************************************/
+
+// Builds the layer schema by delegating to CreateSchemaCommon().
+void OGRParquetWriterLayer::CreateSchema()
+{
+    CreateSchemaCommon();
+}
+
+/************************************************************************/
+/*                            CreateWriter()                            */
+/************************************************************************/
+
+// Creates the Parquet file writer of the layer.
+//
+// The schema is created with CreateSchema() when none exists yet, and
+// finalized with FinalizeSchema() otherwise. The writer is then opened on
+// m_poOutputStream with the m_eCompression codec and with storage of the
+// Arrow schema enabled. Must only be called while m_poFileWriter is null.
+//
+// On failure a CPLError is emitted; the function itself returns nothing.
+void OGRParquetWriterLayer::CreateWriter()
+{
+    CPLAssert( m_poFileWriter == nullptr );
+
+    if( m_poSchema == nullptr )
+    {
+        CreateSchema();
+    }
+    else
+    {
+        FinalizeSchema();
+    }
+
+    auto writerProperties = parquet::WriterProperties::Builder().compression(m_eCompression)->build();
+    auto arrowWriterProperties = parquet::ArrowWriterProperties::Builder().store_schema()->build();
+
+    // Do not drop the returned status: Open() only assigns m_poFileWriter in
+    // its last step, so an unreported failure would leave it null.
+    const auto status = Open(*m_poSchema, m_poMemoryPool, m_poOutputStream,
+                             writerProperties,
+                             arrowWriterProperties,
+                             &m_poFileWriter,
+                             &m_poKeyValueMetadata);
+    if( !status.ok() )
+    {
+        CPLError(CE_Failure, CPLE_AppDefined,
+                 "Cannot create Parquet file writer: %s",
+                 status.message().c_str());
+    }
+}
+
+/************************************************************************/
+/*                             FlushGroup()                             */
+/************************************************************************/
+
+// Writes the pending content of the array builders as a new row group.
+//
+// A row group sized from the length of the first builder is started, then
+// the callback handed to WriteArrays() writes each array it receives as a
+// column chunk. m_apoBuilders is cleared on every path.
+//
+// Returns true on success and false otherwise; failures detected in this
+// function are reported with CPLError.
+//
+// NOTE(review): the lambda parameter types were missing their template
+// arguments in the reviewed copy; arrow::Field / arrow::Array are restored
+// from how the parameters are used - confirm against the original patch.
+bool OGRParquetWriterLayer::FlushGroup()
+{
+    // The writer stays null when CreateWriter() could not open it.
+    if( m_poFileWriter == nullptr )
+    {
+        CPLError(CE_Failure, CPLE_AppDefined,
+                 "Cannot flush row group: Parquet file writer is not open");
+        m_apoBuilders.clear();
+        return false;
+    }
+
+    auto status = m_poFileWriter->NewRowGroup(m_apoBuilders[0]->length());
+    if( !status.ok() )
+    {
+        CPLError(CE_Failure, CPLE_AppDefined,
+                 "NewRowGroup() failed with %s", status.message().c_str());
+        m_apoBuilders.clear();
+        return false;
+    }
+
+    auto ret = WriteArrays([this](const std::shared_ptr<arrow::Field>& field,
+                                  const std::shared_ptr<arrow::Array>& array)
+    {
+        auto l_status = m_poFileWriter->WriteColumnChunk(*array);
+        if( !l_status.ok() )
+        {
+            CPLError(CE_Failure, CPLE_AppDefined,
+                     "WriteColumnChunk() failed for field %s: %s",
+                     field->name().c_str(),
+                     l_status.message().c_str());
+            return false;
+        }
+        return true;
+    });
+
+    m_apoBuilders.clear();
+    return ret;
+}
diff --git a/ogr/ogrutils.cpp b/ogr/ogrutils.cpp
index 11205cfea568..4d90cfd09728 100644
--- a/ogr/ogrutils.cpp
+++ b/ogr/ogrutils.cpp
@@ -1768,6 +1768,74 @@ OGRErr OGRReadWKBGeometryType( const unsigned char * pabyData,
     return OGRERR_NONE;
 }
 
+/************************************************************************/
+/*                       OGRReadWKTGeometryType()                       */
+/************************************************************************/
+
+// Determines the OGR geometry type of a WKT string from its leading keyword.
+//
+// The keyword is matched case-insensitively as a prefix of pszWKT. The
+// dimensionality modifiers are then detected with a case-sensitive search
+// for " ZM", " Z" or " M" (tested in that order) anywhere in the string.
+//
+// pszWKT:          nul-terminated WKT string to inspect.
+// peGeometryType:  receives the geometry type; only written on success.
+//
+// Returns OGRERR_NONE on success, OGRERR_FAILURE if either pointer is null,
+// or OGRERR_UNSUPPORTED_GEOMETRY_TYPE if the keyword is not recognized.
+OGRErr OGRReadWKTGeometryType( const char* pszWKT,
+                               OGRwkbGeometryType *peGeometryType )
+{
+    // pszWKT is dereferenced by the prefix tests below, so reject a null
+    // input string as well as a null output pointer.
+    if( !pszWKT || !peGeometryType )
+        return OGRERR_FAILURE;
+
+    OGRwkbGeometryType eGeomType = wkbUnknown;
+    if( STARTS_WITH_CI(pszWKT, "POINT") )
+        eGeomType = wkbPoint;
+    else if( STARTS_WITH_CI(pszWKT, "LINESTRING") )
+        eGeomType = wkbLineString;
+    else if( STARTS_WITH_CI(pszWKT, "POLYGON") )
+        eGeomType = wkbPolygon;
+    else if( STARTS_WITH_CI(pszWKT, "MULTIPOINT") )
+        eGeomType = wkbMultiPoint;
+    else if( STARTS_WITH_CI(pszWKT, "MULTILINESTRING") )
+        eGeomType = wkbMultiLineString;
+    else if( STARTS_WITH_CI(pszWKT, "MULTIPOLYGON") )
+        eGeomType = wkbMultiPolygon;
+    else if( STARTS_WITH_CI(pszWKT, "GEOMETRYCOLLECTION") )
+        eGeomType = wkbGeometryCollection;
+    else if( STARTS_WITH_CI(pszWKT, "CIRCULARSTRING") )
+        eGeomType = wkbCircularString;
+    else if( STARTS_WITH_CI(pszWKT, "COMPOUNDCURVE") )
+        eGeomType = wkbCompoundCurve;
+    else if( STARTS_WITH_CI(pszWKT, "CURVEPOLYGON") )
+        eGeomType = wkbCurvePolygon;
+    else if( STARTS_WITH_CI(pszWKT, "MULTICURVE") )
+        eGeomType = wkbMultiCurve;
+    else if( STARTS_WITH_CI(pszWKT, "MULTISURFACE") )
+        eGeomType = wkbMultiSurface;
+    else if( STARTS_WITH_CI(pszWKT, "POLYHEDRALSURFACE") )
+        eGeomType = wkbPolyhedralSurface;
+    else if( STARTS_WITH_CI(pszWKT, "TIN") )
+        eGeomType = wkbTIN;
+    else
+        return OGRERR_UNSUPPORTED_GEOMETRY_TYPE;
+
+    // " ZM" must be tested before " Z", which is a prefix of it.
+    if( strstr(pszWKT, " ZM") )
+        eGeomType = OGR_GT_SetModifier(eGeomType, true, true);
+    else if( strstr(pszWKT, " Z") )
+        eGeomType = OGR_GT_SetModifier(eGeomType, true, false);
+    else if( strstr(pszWKT, " M") )
+        eGeomType = OGR_GT_SetModifier(eGeomType, false, true);
+
+    *peGeometryType = eGeomType;
+
+    return OGRERR_NONE;
+}
+
 /************************************************************************/
 /*                           OGRFormatFloat()                           */
 /************************************************************************/