
ENH: support for reading and writing datetimes with timezones #253

Merged: 55 commits, Oct 20, 2023
e075091
minimal working pandas layer without timezones
m-richards May 7, 2023
3df7936
implement datetime_as_string toggle to get numpy layer working
m-richards May 7, 2023
d68b473
make tests pass
m-richards May 8, 2023
9aa5a8c
add tests showing existing behaviour no tz
m-richards May 8, 2023
1a2af4d
working read
m-richards May 8, 2023
fbd2898
commit my test file
m-richards May 9, 2023
127d0a7
actually fix tests with read working
m-richards May 10, 2023
016778a
good enough wip progress for now
m-richards May 21, 2023
faa0631
make these failures easier to read
m-richards May 21, 2023
a8c200e
fix for non tz
m-richards May 21, 2023
6047375
fix some tests
m-richards May 22, 2023
6061563
run pre commit
m-richards May 22, 2023
3ba42cf
maybe old pandas, can't reproduce locally
m-richards May 23, 2023
d983140
try and find something pandas 1.5 also happy with
m-richards May 23, 2023
e9993bd
lint
m-richards May 23, 2023
b6ca5cf
simple answer
m-richards May 23, 2023
05cc1cf
cleanup
m-richards May 25, 2023
a78a76c
wip, use strings to make multi timezones round trip
m-richards Jun 3, 2023
b681656
use tmp path fixture
m-richards Jun 3, 2023
3426fdc
cleanups
m-richards Jun 3, 2023
bb6fd4e
try cleanup datetime parsing
m-richards Jun 3, 2023
87419ac
more cleanup, realise we can get dt resolution
m-richards Jun 3, 2023
fc78bd9
more careful pandas 1.5 compat
m-richards Jun 3, 2023
5fab348
delete line
m-richards Jun 3, 2023
26c403a
replace write support with working datetime object solution
m-richards Aug 8, 2023
ebdb71b
fixes
m-richards Aug 8, 2023
f46e716
rewrite datetime reading to handle mixed offset to utc
m-richards Aug 8, 2023
44686f9
fix nat handling for datetime as string
m-richards Aug 8, 2023
6b946f5
don't expose datetime_as_string in pandas layer
m-richards Aug 8, 2023
ec16ed3
incorrect variable in 1.5.3 compat
m-richards Aug 8, 2023
da0639a
CLN: tidy up pandas 2.0 compat
m-richards Aug 9, 2023
85a67c2
suggested alternative implementation
m-richards Sep 24, 2023
d96d67e
code review suggestion
m-richards Sep 24, 2023
3eb70dc
Update pyogrio/tests/test_geopandas_io.py
m-richards Sep 24, 2023
c37c1ed
Merge remote-tracking branch 'upstream/main' into matt/timezones_redo
m-richards Sep 28, 2023
4064f25
Merge branches 'matt/timezones_redo' and 'matt/timezones_redo' of git…
m-richards Sep 28, 2023
3df12c0
time tests and suggestions
m-richards Sep 28, 2023
8fd30a5
remove breakpoint
m-richards Sep 28, 2023
55293c0
catch warning
m-richards Sep 30, 2023
8040c21
really need to fix my local gdal
m-richards Sep 30, 2023
fccc8fb
fix fix
m-richards Sep 30, 2023
200cc1d
Apply suggestions from code review
m-richards Sep 30, 2023
ebfc01c
add suggested exception handling
m-richards Sep 30, 2023
c8c186a
move pandas compat to _compat
m-richards Oct 7, 2023
95030c0
address review comments
m-richards Oct 7, 2023
c5c272b
Merge remote-tracking branch 'upstream/main' into matt/timezones_redo
m-richards Oct 7, 2023
086e52e
update known issues
m-richards Oct 7, 2023
2b2dd5f
reword
m-richards Oct 7, 2023
2167d0f
move documentation
m-richards Oct 17, 2023
ab0fbf6
rename field as suggested
m-richards Oct 17, 2023
e3f4d6a
Merge remote-tracking branch 'upstream/main' into matt/timezones_redo
m-richards Oct 17, 2023
0f02115
final missing gdal tz offset change
m-richards Oct 17, 2023
52a922d
Update pyogrio/tests/test_geopandas_io.py
m-richards Oct 17, 2023
7c99e51
Apply suggestions from code review
m-richards Oct 17, 2023
a5f5f9d
add changelog entry
brendan-ward Oct 20, 2023
2 changes: 1 addition & 1 deletion .github/workflows/tests-conda.yml
@@ -66,4 +66,4 @@ jobs:

- name: Test
run: |
pytest -v -r s pyogrio/tests
pytest -v --color=yes -r s pyogrio/tests
17 changes: 12 additions & 5 deletions docs/source/known_issues.md
@@ -55,16 +55,23 @@ with obscure error messages.
## Support for reading and writing DateTimes

GDAL only supports datetimes at a millisecond resolution. Reading data will thus
give at most millisecond resolution (`datetime64[ms]` data type), even though
the data is cast `datetime64[ns]` data type when reading into a data frame
using `pyogrio.read_dataframe()`. When writing, only precision up to ms is retained.
give at most millisecond resolution (`datetime64[ms]` data type). With pandas 2.0
`pyogrio.read_dataframe()` will return datetime data as `datetime64[ms]`
correspondingly. For previous versions of pandas, `datetime64[ns]` is used as
ms precision was not supported. When writing, only precision up to
ms is retained.

Not all file formats have dedicated support for storing datetime data (e.g. ESRI
Shapefile). For such formats, or if you require precision greater than ms, a
workaround is to convert the datetimes to strings.

Timezone information is ignored at the moment, both when reading and when writing
datetime columns.
Timezone information is preserved where possible; however, GDAL only represents
time zones as UTC offsets, while pandas uses IANA time zones (via `pytz` or
`zoneinfo`). This means that dataframes with columns containing multiple offsets
(e.g. when switching from standard time to summer time) will be written correctly,
but when read back via `pyogrio.read_dataframe()` they will be returned as a UTC
datetime column, as there is no way to reconstruct the original timezone from the
individual offsets present.
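A minimal sketch of the mixed-offset behaviour described above (the sample values are invented for illustration):

```python
import pandas as pd

# ISO strings as GDAL would return them: the same zone, but two different
# UTC offsets because of the standard time / summer time switch.
raw = pd.Series(["2023-01-15T12:00:00+11:00", "2023-07-15T12:00:00+10:00"])

# No single fixed offset covers both values, so the column is parsed as
# UTC instead, which is what read_dataframe returns in this situation.
parsed = pd.to_datetime(raw, utc=True)
print(parsed.dtype)  # datetime64[ns, UTC]
```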

## Support for OpenStreetMap (OSM) data

7 changes: 7 additions & 0 deletions pyogrio/_compat.py
@@ -18,11 +18,18 @@
except ImportError:
geopandas = None

try:
import pandas
except ImportError:
pandas = None


HAS_ARROW_API = __gdal_version__ >= (3, 6, 0) and pyarrow is not None

HAS_GEOPANDAS = geopandas is not None

PANDAS_GE_20 = pandas is not None and Version(pandas.__version__) >= Version("2.0.0")

HAS_GDAL_GEOS = __gdal_geos_version__ is not None

HAS_SHAPELY = shapely is not None and Version(shapely.__version__) >= Version("2.0.0")
77 changes: 51 additions & 26 deletions pyogrio/_io.pyx
@@ -719,7 +719,8 @@ cdef process_fields(
object field_data_view,
object field_indexes,
object field_ogr_types,
encoding
encoding,
bint datetime_as_string
):
cdef int j
cdef int success
@@ -751,7 +752,7 @@ cdef process_fields(
else:
data[i] = np.nan

elif field_type in ( OFTDate, OFTDateTime):
elif field_type in ( OFTDate, OFTDateTime) and not datetime_as_string:
data[i] = np.datetime64('NaT')

else:
@@ -777,22 +778,28 @@
data[i] = bin_value[:ret_length]

elif field_type == OFTDateTime or field_type == OFTDate:
success = OGR_F_GetFieldAsDateTimeEx(
ogr_feature, field_index, &year, &month, &day, &hour, &minute, &fsecond, &timezone)

if datetime_as_string:
# defer datetime parsing to user/ pandas layer
# Update to OGR_F_GetFieldAsISO8601DateTime when GDAL 3.7+ only
data[i] = get_string(OGR_F_GetFieldAsString(ogr_feature, field_index), encoding=encoding)
else:
success = OGR_F_GetFieldAsDateTimeEx(
ogr_feature, field_index, &year, &month, &day, &hour, &minute, &fsecond, &timezone)

ms, ss = math.modf(fsecond)
second = int(ss)
# fsecond has millisecond accuracy
microsecond = round(ms * 1000) * 1000
ms, ss = math.modf(fsecond)
second = int(ss)
# fsecond has millisecond accuracy
microsecond = round(ms * 1000) * 1000

if not success:
data[i] = np.datetime64('NaT')
if not success:
data[i] = np.datetime64('NaT')

elif field_type == OFTDate:
data[i] = datetime.date(year, month, day).isoformat()
elif field_type == OFTDate:
data[i] = datetime.date(year, month, day).isoformat()

elif field_type == OFTDateTime:
data[i] = datetime.datetime(year, month, day, hour, minute, second, microsecond).isoformat()
elif field_type == OFTDateTime:
data[i] = datetime.datetime(year, month, day, hour, minute, second, microsecond).isoformat()


@cython.boundscheck(False) # Deactivate bounds checking
Expand All @@ -805,7 +812,8 @@ cdef get_features(
uint8_t force_2d,
int skip_features,
int num_features,
uint8_t return_fids
uint8_t return_fids,
bint datetime_as_string
):

cdef OGRFeatureH ogr_feature = NULL
@@ -838,7 +846,9 @@

field_data = [
np.empty(shape=(num_features, ),
dtype=fields[field_index,3]) for field_index in range(n_fields)
dtype = ("object" if datetime_as_string and
fields[field_index,3].startswith("datetime") else fields[field_index,3])
) for field_index in range(n_fields)
]

field_data_view = [field_data[field_index][:] for field_index in range(n_fields)]
@@ -879,7 +889,7 @@

process_fields(
ogr_feature, i, n_fields, field_data, field_data_view,
field_indexes, field_ogr_types, encoding
field_indexes, field_ogr_types, encoding, datetime_as_string
)
i += 1
finally:
@@ -909,7 +919,8 @@ cdef get_features_by_fid(
object[:,:] fields,
encoding,
uint8_t read_geometry,
uint8_t force_2d
uint8_t force_2d,
bint datetime_as_string
):

cdef OGRFeatureH ogr_feature = NULL
@@ -932,10 +943,11 @@
n_fields = fields.shape[0]
field_indexes = fields[:,0]
field_ogr_types = fields[:,1]

field_data = [
np.empty(shape=(count, ),
dtype=fields[field_index,3]) for field_index in range(n_fields)
dtype=("object" if datetime_as_string and fields[field_index,3].startswith("datetime")
else fields[field_index,3]))
for field_index in range(n_fields)
]

field_data_view = [field_data[field_index][:] for field_index in range(n_fields)]
@@ -958,7 +970,7 @@

process_fields(
ogr_feature, i, n_fields, field_data, field_data_view,
field_indexes, field_ogr_types, encoding
field_indexes, field_ogr_types, encoding, datetime_as_string
)
finally:
if ogr_feature != NULL:
@@ -1058,7 +1070,9 @@ def ogr_read(
object fids=None,
str sql=None,
str sql_dialect=None,
int return_fids=False):
int return_fids=False,
bint datetime_as_string=False
):

cdef int err = 0
cdef const char *path_c = NULL
@@ -1150,6 +1164,7 @@
encoding,
read_geometry=read_geometry and geometry_type is not None,
force_2d=force_2d,
datetime_as_string=datetime_as_string
)

# bypass reading fids since these should match fids used for read
@@ -1182,13 +1197,15 @@
force_2d=force_2d,
skip_features=skip_features,
num_features=num_features,
return_fids=return_fids
return_fids=return_fids,
datetime_as_string=datetime_as_string
)

meta = {
'crs': crs,
'encoding': encoding,
'fields': fields[:,2], # return only names
'dtypes':fields[:,3],
'geometry_type': geometry_type,
}

@@ -1647,7 +1664,8 @@ def ogr_write(
str path, str layer, str driver, geometry, fields, field_data, field_mask,
str crs, str geometry_type, str encoding, object dataset_kwargs,
object layer_kwargs, bint promote_to_multi=False, bint nan_as_null=True,
bint append=False, dataset_metadata=None, layer_metadata=None
bint append=False, dataset_metadata=None, layer_metadata=None,
timezone_cols_metadata=None
):
cdef const char *path_c = NULL
cdef const char *layer_c = NULL
@@ -1718,6 +1736,9 @@
if not layer:
layer = os.path.splitext(os.path.split(path)[1])[0]

if timezone_cols_metadata is None:
timezone_cols_metadata = {}


# if shapefile, GeoJSON, or FlatGeobuf, always delete first
# for other types, check if we can create layers
@@ -1990,8 +2011,12 @@
if np.isnat(field_value):
OGR_F_SetFieldNull(ogr_feature, field_idx)
else:
# TODO: add support for timezones
datetime = field_value.astype("datetime64[ms]").item()
tz_array = timezone_cols_metadata.get(fields[field_idx], None)
if tz_array is None:
gdal_tz = 0
else:
gdal_tz = tz_array[i]
OGR_F_SetFieldDateTimeEx(
ogr_feature,
field_idx,
@@ -2001,7 +2026,7 @@
datetime.hour,
datetime.minute,
datetime.second + datetime.microsecond / 10**6,
0
gdal_tz
)

else:
82 changes: 72 additions & 10 deletions pyogrio/geopandas.py
@@ -2,7 +2,7 @@

import numpy as np

from pyogrio._compat import HAS_GEOPANDAS
from pyogrio._compat import HAS_GEOPANDAS, PANDAS_GE_20
from pyogrio.raw import (
DRIVERS_NO_MIXED_SINGLE_MULTI,
DRIVERS_NO_MIXED_DIMENSIONS,
@@ -12,6 +12,7 @@
write,
)
from pyogrio.errors import DataSourceError
import warnings


def _stringify_path(path):
@@ -29,6 +30,40 @@ def _stringify_path(path):
return path


def _try_parse_datetime(ser):
    import pandas as pd  # only called when pandas is known to be installed

    if PANDAS_GE_20:
        datetime_kwargs = dict(format="ISO8601", errors="ignore")
    else:
        datetime_kwargs = dict(yearfirst=True)
    with warnings.catch_warnings():
        warnings.filterwarnings(
            "ignore",
            ".*parsing datetimes with mixed time zones will raise.*",
            FutureWarning,
        )
        # pre-emptive try/except for when pandas will raise
        # (can tighten the exception type in future when it does);
        # fall back to the unparsed values so `res` is always defined
        try:
            res = pd.to_datetime(ser, **datetime_kwargs)
        except Exception:
            res = ser
        # if object dtype, try parse as utc instead
        if res.dtype == "object":
            res = pd.to_datetime(ser, utc=True, **datetime_kwargs)

    if res.dtype != "object":
        # GDAL only supports ms precision; convert outputs to match.
        # Pandas 2.0 supports datetime64[ms] directly; prior versions only
        # support [ns], so round the values to [ms] precision instead.
        if PANDAS_GE_20:
            res = res.dt.as_unit("ms")
        else:
            res = res.dt.round(freq="ms")
    return res
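Assuming pandas >= 2.0, the happy path of this helper behaves roughly as below (sample values invented; `format="ISO8601"` and `as_unit` are the pandas 2.0 APIs used above):

```python
import pandas as pd

# A column with one consistent UTC offset parses directly to a tz-aware dtype.
ser = pd.Series(["2023-05-01T10:00:00+02:00", "2023-05-02T11:30:00+02:00"])
res = pd.to_datetime(ser, format="ISO8601")

# GDAL only stores millisecond precision, so the parsed result is converted
# to a matching unit (pandas >= 2.0 supports non-nanosecond datetime64).
res = res.dt.as_unit("ms")
print(res.dtype)  # e.g. datetime64[ms, UTC+02:00]
```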


def read_dataframe(
path_or_buffer,
/,
@@ -194,6 +229,11 @@
use_arrow = bool(int(os.environ.get("PYOGRIO_USE_ARROW", "0")))

read_func = read_arrow if use_arrow else read
if not use_arrow:
# For arrow, datetimes are read as is.
# For numpy IO, datetimes are read as string values to preserve timezone info
# as numpy does not directly support timezones.
kwargs["datetime_as_string"] = True
result = read_func(
path_or_buffer,
layer=layer,
@@ -246,8 +286,10 @@
index = pd.Index(index, name="fid")
else:
index = None

df = pd.DataFrame(data, columns=columns, index=index)
for dtype, c in zip(meta["dtypes"], df.columns):
if dtype.startswith("datetime"):
df[c] = _try_parse_datetime(df[c])

if geometry is None or not read_geometry:
return df
@@ -389,19 +431,38 @@
# TODO: may need to fill in pd.NA, etc
field_data = []
field_mask = []
# dict[str, np.array(datetime.datetime)] special case for dt-tz fields
gdal_tz_offsets = {}
for name in fields:
col = df[name].values
if isinstance(col, pd.api.extensions.ExtensionArray):
col = df[name]
values = col.values
if isinstance(col.dtype, pd.DatetimeTZDtype):
# Deal with datetimes with timezones by passing down timezone separately
# pass down naive datetime
naive = col.dt.tz_localize(None)
values = naive.values
# compute offset relative to UTC explicitly
tz_offset = naive - col.dt.tz_convert("UTC").dt.tz_localize(None)
# Convert to GDAL timezone offset representation.
# GMT is represented as 100 and offsets are represented by adding /
# subtracting 1 for every 15 minutes different from GMT.
# https://gdal.org/development/rfc/rfc56_millisecond_precision.html#core-changes
# Convert each row offset to a signed multiple of 15m and add to GMT value
gdal_offset_representation = tz_offset // pd.Timedelta("15m") + 100
gdal_tz_offsets[name] = gdal_offset_representation
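The 15-minute encoding computed here can be sanity-checked in isolation; a sketch (the zone is chosen to exercise a half-hour offset):

```python
import pandas as pd

# One tz-aware column; per GDAL RFC 56 the UTC offset is encoded as
# 100 + (offset / 15 minutes), so UTC itself is 100.
col = pd.Series(pd.to_datetime(["2023-06-01T12:00:00"])).dt.tz_localize(
    "Australia/Adelaide"
)

# Same computation as the writer: naive local values minus naive UTC values.
naive = col.dt.tz_localize(None)
tz_offset = naive - col.dt.tz_convert("UTC").dt.tz_localize(None)
gdal_offset = tz_offset // pd.Timedelta(minutes=15) + 100

# Adelaide is on UTC+09:30 standard time in June: 9.5 h / 15 min = 38
print(int(gdal_offset.iloc[0]))  # 138
```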
else:
values = col.values
if isinstance(values, pd.api.extensions.ExtensionArray):
from pandas.arrays import IntegerArray, FloatingArray, BooleanArray

if isinstance(col, (IntegerArray, FloatingArray, BooleanArray)):
field_data.append(col._data)
field_mask.append(col._mask)
if isinstance(values, (IntegerArray, FloatingArray, BooleanArray)):
field_data.append(values._data)
field_mask.append(values._mask)
else:
field_data.append(np.asarray(col))
field_mask.append(np.asarray(col.isna()))
field_data.append(np.asarray(values))
field_mask.append(np.asarray(values.isna()))
else:
field_data.append(col)
field_data.append(values)
field_mask.append(None)

# Determine geometry_type and/or promote_to_multi
@@ -496,5 +557,6 @@
metadata=metadata,
dataset_options=dataset_options,
layer_options=layer_options,
timezone_cols_metadata=gdal_tz_offsets,
**kwargs,
)