
ENH: support for reading and writing datetimes with timezones #253

Merged Oct 20, 2023 · 55 commits (changes shown from 24 commits)

Commits
e075091
minimal working pandas layer without timezones
m-richards May 7, 2023
3df7936
implement datetime_as_string toggle to get numpy layer working
m-richards May 7, 2023
d68b473
make tests pass
m-richards May 8, 2023
9aa5a8c
add tests showing existing behaviour no tz
m-richards May 8, 2023
1a2af4d
working read
m-richards May 8, 2023
fbd2898
commit my test file
m-richards May 9, 2023
127d0a7
actually fix tests with read working
m-richards May 10, 2023
016778a
good enough wip progress for now
m-richards May 21, 2023
faa0631
make these failures easier to read
m-richards May 21, 2023
a8c200e
fix for non tz
m-richards May 21, 2023
6047375
fix some tests
m-richards May 22, 2023
6061563
run pre commit
m-richards May 22, 2023
3ba42cf
maybe old pandas, can't reproduce locally
m-richards May 23, 2023
d983140
try and find something pandas 1.5 also happy with
m-richards May 23, 2023
e9993bd
lint
m-richards May 23, 2023
b6ca5cf
simple answer
m-richards May 23, 2023
05cc1cf
cleanup
m-richards May 25, 2023
a78a76c
wip, use strings to make multi timezones round trip
m-richards Jun 3, 2023
b681656
use tmp path fixture
m-richards Jun 3, 2023
3426fdc
cleanups
m-richards Jun 3, 2023
bb6fd4e
try cleanup datetime parsing
m-richards Jun 3, 2023
87419ac
more cleanup, realise we can get dt resolution
m-richards Jun 3, 2023
fc78bd9
more careful pandas 1.5 compat
m-richards Jun 3, 2023
5fab348
delete line
m-richards Jun 3, 2023
26c403a
replace write support with working datetime object solution
m-richards Aug 8, 2023
ebdb71b
fixes
m-richards Aug 8, 2023
f46e716
rewrite datetime reading to handle mixed offset to utc
m-richards Aug 8, 2023
44686f9
fix nat handling for datetime as string
m-richards Aug 8, 2023
6b946f5
don't expose datetime_as_string in pandas layer
m-richards Aug 8, 2023
ec16ed3
incorrect variable in 1.5.3 compat
m-richards Aug 8, 2023
da0639a
CLN: tidy up pandas 2.0 compat
m-richards Aug 9, 2023
85a67c2
suggested alternative implementation
m-richards Sep 24, 2023
d96d67e
code review suggestion
m-richards Sep 24, 2023
3eb70dc
Update pyogrio/tests/test_geopandas_io.py
m-richards Sep 24, 2023
c37c1ed
Merge remote-tracking branch 'upstream/main' into matt/timezones_redo
m-richards Sep 28, 2023
4064f25
Merge branches 'matt/timezones_redo' and 'matt/timezones_redo' of git…
m-richards Sep 28, 2023
3df12c0
time tests and suggestions
m-richards Sep 28, 2023
8fd30a5
remove breakpoint
m-richards Sep 28, 2023
55293c0
catch warning
m-richards Sep 30, 2023
8040c21
really need to fix my local gdal
m-richards Sep 30, 2023
fccc8fb
fix fix
m-richards Sep 30, 2023
200cc1d
Apply suggestions from code review
m-richards Sep 30, 2023
ebfc01c
add suggested exception handling
m-richards Sep 30, 2023
c8c186a
move pandas compat to _compat
m-richards Oct 7, 2023
95030c0
address review comments
m-richards Oct 7, 2023
c5c272b
Merge remote-tracking branch 'upstream/main' into matt/timezones_redo
m-richards Oct 7, 2023
086e52e
update known issues
m-richards Oct 7, 2023
2b2dd5f
reword
m-richards Oct 7, 2023
2167d0f
move documentation
m-richards Oct 17, 2023
ab0fbf6
rename field as suggested
m-richards Oct 17, 2023
e3f4d6a
Merge remote-tracking branch 'upstream/main' into matt/timezones_redo
m-richards Oct 17, 2023
0f02115
final missing gdal tz offset change
m-richards Oct 17, 2023
52a922d
Update pyogrio/tests/test_geopandas_io.py
m-richards Oct 17, 2023
7c99e51
Apply suggestions from code review
m-richards Oct 17, 2023
a5f5f9d
add changelog entry
brendan-ward Oct 20, 2023
2 changes: 1 addition & 1 deletion .github/workflows/tests-conda.yml
@@ -66,4 +66,4 @@ jobs:

- name: Test
run: |
pytest -v -r s pyogrio/tests
pytest -v --color=yes -r s pyogrio/tests
61 changes: 38 additions & 23 deletions pyogrio/_io.pyx
@@ -599,7 +599,8 @@ cdef process_fields(
object field_data_view,
object field_indexes,
object field_ogr_types,
encoding
encoding,
bint datetime_as_string
):
cdef int j
cdef int success
@@ -657,22 +658,27 @@
data[i] = bin_value[:ret_length]

elif field_type == OFTDateTime or field_type == OFTDate:
success = OGR_F_GetFieldAsDateTimeEx(
ogr_feature, field_index, &year, &month, &day, &hour, &minute, &fsecond, &timezone)

if datetime_as_string:
# defer datetime parsing to user/ pandas layer
data[i] = get_string(OGR_F_GetFieldAsString(ogr_feature, field_index), encoding=encoding)
else:
success = OGR_F_GetFieldAsDateTimeEx(
ogr_feature, field_index, &year, &month, &day, &hour, &minute, &fsecond, &timezone)

ms, ss = math.modf(fsecond)
second = int(ss)
# fsecond has millisecond accuracy
microsecond = round(ms * 1000) * 1000
ms, ss = math.modf(fsecond)
second = int(ss)
# fsecond has millisecond accuracy
microsecond = round(ms * 1000) * 1000

if not success:
data[i] = np.datetime64('NaT')
if not success:
data[i] = np.datetime64('NaT')

elif field_type == OFTDate:
data[i] = datetime.date(year, month, day).isoformat()
elif field_type == OFTDate:
data[i] = datetime.date(year, month, day).isoformat()

elif field_type == OFTDateTime:
data[i] = datetime.datetime(year, month, day, hour, minute, second, microsecond).isoformat()
elif field_type == OFTDateTime:
data[i] = datetime.datetime(year, month, day, hour, minute, second, microsecond).isoformat()
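
The millisecond handling in the non-string branch above can be sketched in plain Python (same arithmetic as the Cython code; the literal values are illustrative):

```python
import datetime
import math

# GDAL reports seconds as a float with millisecond accuracy
fsecond = 3.123
ms, ss = math.modf(fsecond)            # fractional and whole parts: ~0.123, 3.0
second = int(ss)
microsecond = round(ms * 1000) * 1000  # 123 ms -> 123000 microseconds

dt = datetime.datetime(2020, 1, 1, 9, 0, second, microsecond)
print(dt.isoformat())  # 2020-01-01T09:00:03.123000
```

Rounding at millisecond precision before scaling to microseconds avoids float noise from `modf` (e.g. `0.12299999…`) leaking into the result.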


@cython.boundscheck(False) # Deactivate bounds checking
@@ -685,7 +691,8 @@ cdef get_features(
uint8_t force_2d,
int skip_features,
int num_features,
uint8_t return_fids
uint8_t return_fids,
bint datetime_as_string
):

cdef OGRFeatureH ogr_feature = NULL
@@ -718,7 +725,9 @@

field_data = [
np.empty(shape=(num_features, ),
dtype=fields[field_index,3]) for field_index in range(n_fields)
dtype = ("object" if datetime_as_string and
fields[field_index,3].startswith("datetime") else fields[field_index,3])
) for field_index in range(n_fields)
]

field_data_view = [field_data[field_index][:] for field_index in range(n_fields)]
@@ -758,7 +767,7 @@

process_fields(
ogr_feature, i, n_fields, field_data, field_data_view,
field_indexes, field_ogr_types, encoding
field_indexes, field_ogr_types, encoding, datetime_as_string
)
i += 1
finally:
@@ -788,7 +797,8 @@ cdef get_features_by_fid(
object[:,:] fields,
encoding,
uint8_t read_geometry,
uint8_t force_2d
uint8_t force_2d,
bint datetime_as_string
):

cdef OGRFeatureH ogr_feature = NULL
@@ -811,10 +821,11 @@
n_fields = fields.shape[0]
field_indexes = fields[:,0]
field_ogr_types = fields[:,1]

field_data = [
np.empty(shape=(count, ),
dtype=fields[field_index,3]) for field_index in range(n_fields)
dtype=("object" if datetime_as_string and fields[field_index,3].startswith("datetime")
else fields[field_index,3]))
for field_index in range(n_fields)
]

field_data_view = [field_data[field_index][:] for field_index in range(n_fields)]
@@ -837,7 +848,7 @@

process_fields(
ogr_feature, i, n_fields, field_data, field_data_view,
field_indexes, field_ogr_types, encoding
field_indexes, field_ogr_types, encoding, datetime_as_string
)
finally:
if ogr_feature != NULL:
@@ -939,7 +950,9 @@ def ogr_read(
object fids=None,
str sql=None,
str sql_dialect=None,
int return_fids=False):
int return_fids=False,
bint datetime_as_string=False
):

cdef int err = 0
cdef const char *path_c = NULL
@@ -1022,6 +1035,7 @@
encoding,
read_geometry=read_geometry and geometry_type is not None,
force_2d=force_2d,
datetime_as_string=datetime_as_string
)

# bypass reading fids since these should match fids used for read
@@ -1051,13 +1065,15 @@
force_2d=force_2d,
skip_features=skip_features,
num_features=num_features,
return_fids=return_fids
return_fids=return_fids,
datetime_as_string=datetime_as_string
)

meta = {
'crs': crs,
'encoding': encoding,
'fields': fields[:,2], # return only names
'dtypes':fields[:,3],
'geometry_type': geometry_type,
}

@@ -1796,7 +1812,6 @@ def ogr_write(
if np.isnat(field_value):
OGR_F_SetFieldNull(ogr_feature, field_idx)
else:
# TODO: add support for timezones
datetime = field_value.astype("datetime64[ms]").item()
OGR_F_SetFieldDateTimeEx(
ogr_feature,
26 changes: 24 additions & 2 deletions pyogrio/geopandas.py
@@ -2,6 +2,7 @@
from pyogrio.raw import DRIVERS_NO_MIXED_SINGLE_MULTI, DRIVERS_NO_MIXED_DIMENSIONS
from pyogrio.raw import detect_driver, read, read_arrow, write
from pyogrio.errors import DataSourceError
from packaging.version import Version


def _stringify_path(path):
@@ -19,6 +20,18 @@ def _stringify_path(path):
return path


def _try_parse_datetime(ser):
import pandas as pd # only called in a block where pandas is known to be installed

if Version(pd.__version__) >= Version("2.0.0"):
res = pd.to_datetime(ser, format="ISO8601")
if res.dtype != "object":
res = res.dt.as_unit("ms")
(Review comment — jorisvandenbossche, Member:)

We might need to detect whether there are timezones (e.g. check the first few strings for a "+" or "-"), because if so we have to pass utc=True to avoid getting object dtype:

In [20]: pd.to_datetime(["2021/01/02 01:02:03+01:00", "2021/01/02 01:02:03+01:00"], format="ISO8601")
Out[20]: DatetimeIndex(['2021-01-02 01:02:03+01:00', '2021-01-02 01:02:03+01:00'], dtype='datetime64[ns, UTC+01:00]', freq=None)

In [21]: pd.to_datetime(["2021/01/02 01:02:03+01:00", "2021/01/02 01:02:03+02:00"], format="ISO8601")
Out[21]: Index([2021-01-02 01:02:03+01:00, 2021-01-02 01:02:03+02:00], dtype='object')

(With longer time series it is very common to have mixed offsets for the same timezone, i.e. values with and without DST.)

(Reply — m-richards, Member, Author:)

That's a good point. I checked what we did in geopandas for the fiona IO and did the same thing here to be consistent: read without utc=True, and if the result is still object dtype, read again with utc=True and see if that makes a difference.

I could also check for "+" and "-" as you suggest; I'd imagine that's faster, but maybe there are edge cases?

return res
else:
return pd.to_datetime(ser, yearfirst=True)


def read_dataframe(
path_or_buffer,
/,
@@ -146,6 +159,8 @@ def read_dataframe(
path_or_buffer = _stringify_path(path_or_buffer)

read_func = read_arrow if use_arrow else read
if not use_arrow and "datetime_as_string" not in kwargs:
kwargs["datetime_as_string"] = True
result = read_func(
path_or_buffer,
layer=layer,
@@ -182,8 +197,10 @@
index = pd.Index(index, name="fid")
else:
index = None

df = pd.DataFrame(data, columns=columns, index=index)
for dtype, c in zip(meta["dtypes"], df.columns):
if dtype.startswith("datetime"):
df[c] = _try_parse_datetime(df[c])

if geometry is None or not read_geometry:
return df
@@ -327,7 +344,12 @@
field_data = []
field_mask = []
for name in fields:
col = df[name].values
ser = df[name]
col = ser.values
if isinstance(ser.dtype, pd.DatetimeTZDtype):
# Deal with datetimes with timezones as strings
col = ser.astype(str)

if isinstance(col, pd.api.extensions.ExtensionArray):
from pandas.arrays import IntegerArray, FloatingArray, BooleanArray

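The fallback strategy discussed in the review thread above — parse without utc=True, then retry with utc=True when mixed offsets leave object dtype — can be sketched as follows (a sketch assuming pandas >= 2.0 for format="ISO8601"; `try_parse_datetime` is an illustrative name, not the helper this PR adds):

```python
import pandas as pd

def try_parse_datetime(values):
    """Parse ISO 8601 strings, falling back to UTC when offsets are mixed."""
    try:
        res = pd.to_datetime(values, format="ISO8601")
    except (ValueError, TypeError):
        # newer pandas raises for mixed offsets unless utc=True is passed
        return pd.to_datetime(values, format="ISO8601", utc=True)
    if res.dtype == "object":
        # older pandas instead silently falls back to object dtype; retry as UTC
        res = pd.to_datetime(values, format="ISO8601", utc=True)
    return res

# a single fixed offset keeps a tz-aware datetime dtype
single = try_parse_datetime(["2021-01-02 01:02:03+01:00", "2021-01-03 04:05:06+01:00"])
# mixed offsets (e.g. DST transitions) fall back to UTC
mixed = try_parse_datetime(["2021-01-02 01:02:03+01:00", "2021-06-02 01:02:03+02:00"])
```

The try/except makes the sketch robust across pandas versions, since the mixed-offset case changed from returning object dtype to raising.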
7 changes: 7 additions & 0 deletions pyogrio/raw.py
@@ -53,6 +53,7 @@ def read(
sql=None,
sql_dialect=None,
return_fids=False,
datetime_as_string=False,
**kwargs,
):
"""Read OGR data source into numpy arrays.
@@ -108,6 +109,10 @@
number of features using FIDs is also driver specific.
return_fids : bool, optional (default: False)
If True, will return the FIDs of the feature that were read.
datetime_as_string : bool, optional (default: False)
If True, will return datetime dtypes as detected by GDAL as a string
array, instead of a datetime64 array (used to extract timezone info).

**kwargs
Additional driver-specific dataset open options passed to OGR. Invalid
options will trigger a warning.
@@ -150,6 +155,7 @@
sql_dialect=sql_dialect,
return_fids=return_fids,
dataset_kwargs=dataset_kwargs,
datetime_as_string=datetime_as_string,
)
finally:
if buffer is not None:
@@ -387,6 +393,7 @@ def write(
layer_options=None,
**kwargs,
):
kwargs.pop("dtypes", None)
if geometry_type is None:
raise ValueError("geometry_type must be provided")

5 changes: 5 additions & 0 deletions pyogrio/tests/conftest.py
@@ -97,3 +97,8 @@ def test_ogr_types_list():
@pytest.fixture(scope="session")
def test_datetime():
return _data_dir / "test_datetime.geojson"


@pytest.fixture(scope="session")
def test_datetime_tz():
return _data_dir / "test_datetime_tz.geojson"
7 changes: 7 additions & 0 deletions pyogrio/tests/fixtures/test_datetime_tz.geojson
@@ -0,0 +1,7 @@
{
"type": "FeatureCollection",
"features": [
{ "type": "Feature", "properties": { "col": "2020-01-01T09:00:00.123-05:00" }, "geometry": { "type": "Point", "coordinates": [ 1.0, 1.0 ] } },
{ "type": "Feature", "properties": { "col": "2020-01-01T10:00:00-05:00" }, "geometry": { "type": "Point", "coordinates": [ 2.0, 2.0 ] } }
]
}
51 changes: 49 additions & 2 deletions pyogrio/tests/test_geopandas_io.py
@@ -2,7 +2,6 @@
from datetime import datetime
import os
from packaging.version import Version

import numpy as np
import pytest

@@ -18,7 +17,12 @@

try:
import pandas as pd
from pandas.testing import assert_frame_equal, assert_index_equal
from pandas.testing import (
assert_frame_equal,
assert_index_equal,
assert_series_equal,
)
import pytz

import geopandas as gp
from geopandas.array import from_wkt
@@ -139,6 +143,49 @@ def test_read_datetime(test_fgdb_vsi):
assert df.SURVEY_DAT.dtype.name == "datetime64[ns]"


def test_read_datetime_tz(test_datetime_tz, tmp_path):
df = read_dataframe(test_datetime_tz)
raw_expected = ["2020-01-01T09:00:00.123-05:00", "2020-01-01T10:00:00-05:00"]

if Version(pd.__version__) >= Version("2.0.0"):
expected = pd.to_datetime(raw_expected, format="ISO8601").as_unit("ms")
else:
expected = pd.to_datetime(raw_expected)
expected = pd.Series(expected, name="col")
(Review suggestion — Member:)

Suggested change:
-    expected = pd.Series(expected, name="col")
+    expected = pd.Series(expected, name="datetime_col")


assert_series_equal(df.col, expected)
# test write and read round trips
# TODO gpkg doesn't work here, at least for my local gdal, writes NaT
fpath = tmp_path / "test.geojson"
write_dataframe(df, fpath)
df_read = read_dataframe(fpath)
assert_series_equal(df_read.col, expected)


def test_write_datetime_mixed_offset(tmp_path):
# Summer Time (GMT+11), standard time (GMT+10)
dates = ["2023-01-01 11:00:01.111", "2023-06-01 10:00:01.111"]
tz = pytz.timezone("Australia/Sydney")
ser_naive = pd.Series(pd.to_datetime(dates), name="dates")
ser_localised = ser_naive.dt.tz_localize(tz)
df = gp.GeoDataFrame(
{"dates": ser_localised, "geometry": [Point(1, 1), Point(1, 1)]}
)
fpath = tmp_path / "test.geojson"
write_dataframe(df, fpath)
df_no_tz = read_dataframe(
fpath, datetime_as_string=False
) # TODO this shouldn't be called datetime as string in the pandas layer,
# should it even be accessible?
(Review comment — Member:)

Unless we have some users asking for it, I wouldn't expose it in read_dataframe (so only in raw.read), just to keep read_dataframe simpler.

# datetime_as_string=False ignores tz info, returns datetime objects
expected = ser_naive.astype("datetime64[ms]")
assert_series_equal(expected, df_no_tz["dates"])
# datetime_as_string=True keeps tz info, but pandas can't handle multiple offsets
# unless given a timezone to identify them with -> returned as strings
df_local = read_dataframe(fpath, datetime_as_string=True)
assert_series_equal(ser_localised.astype("object"), df_local["dates"])
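
The mixed offsets this test relies on come from DST: Australia/Sydney is UTC+11 (AEDT) in January and UTC+10 (AEST) in June. A stdlib-only sketch of the same localisation (the test itself uses pytz; `zoneinfo` needs Python >= 3.9 and available tz data):

```python
from datetime import datetime
from zoneinfo import ZoneInfo

sydney = ZoneInfo("Australia/Sydney")
jan = datetime(2023, 1, 1, 11, 0, 1, 111000, tzinfo=sydney)  # summer time (AEDT)
jun = datetime(2023, 6, 1, 10, 0, 1, 111000, tzinfo=sydney)  # standard time (AEST)

print(jan.utcoffset())   # 11:00:00
print(jun.utcoffset())   # 10:00:00
print(jan.isoformat())   # 2023-01-01T11:00:01.111000+11:00
print(jun.isoformat())   # 2023-06-01T10:00:01.111000+10:00
```

Because the two offsets differ, round-tripping through a file stores "+11:00" and "+10:00" strings, which is exactly the case where pandas falls back to object dtype without utc=True.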


def test_read_null_values(test_fgdb_vsi):
df = read_dataframe(test_fgdb_vsi, read_geometry=False)

Expand Down
13 changes: 13 additions & 0 deletions pyogrio/tests/test_raw_io.py
@@ -613,6 +613,19 @@ def test_read_datetime_millisecond(test_datetime):
assert field[1] == np.datetime64("2020-01-01 10:00:00.000")


def test_read_datetime_tz(test_datetime_tz):
field = read(test_datetime_tz)[3][0]
assert field.dtype == "datetime64[ms]"
# timezone is ignored in numpy layer
assert field[0] == np.datetime64("2020-01-01 09:00:00.123")
assert field[1] == np.datetime64("2020-01-01 10:00:00.000")
field = read(test_datetime_tz, datetime_as_string=True)[3][0]
assert field.dtype == "object"
# GDAL doesn't return strings in ISO format (yet)
assert field[0] == "2020/01/01 09:00:00.123-05"
assert field[1] == "2020/01/01 10:00:00-05"
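
As the comment notes, GDAL returns strings like "2020/01/01 09:00:00.123-05" — slashes instead of dashes, and a bare-hour UTC offset. A hypothetical helper (not part of pyogrio's API) that normalises such strings into aware datetime objects:

```python
import re
from datetime import datetime

def parse_gdal_datetime(s):
    """Parse a GDAL-style datetime string such as '2020/01/01 09:00:00.123-05'.

    Illustrative only: normalises slashes to dashes and pads a bare-hour
    offset like '-05' to '-0500' so that strptime's %z accepts it.
    """
    s = s.replace("/", "-")
    if re.search(r"[+-]\d{2}$", s):
        s = s + "00"
    for fmt in ("%Y-%m-%d %H:%M:%S.%f%z", "%Y-%m-%d %H:%M:%S%z",
                "%Y-%m-%d %H:%M:%S.%f", "%Y-%m-%d %H:%M:%S"):
        try:
            return datetime.strptime(s, fmt)
        except ValueError:
            continue
    raise ValueError(f"unrecognised datetime string: {s!r}")

print(parse_gdal_datetime("2020/01/01 09:00:00.123-05").isoformat())
# 2020-01-01T09:00:00.123000-05:00
```

Trying several formats in order covers the combinations GDAL can emit: with or without fractional seconds, and with or without a UTC offset.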


@pytest.mark.parametrize("ext", ["gpkg", "geojson"])
def test_read_write_null_geometry(tmp_path, ext):
# Point(0, 0), null