try @thetorpedodog advice re _flags

single-cell-data · Sep 17, 2024 · 8ff4ec3 · 8ff4ec3
1 parent a96c72d
commit 8ff4ec3
Show file tree

Hide file tree

Showing 7 changed files with 294 additions and 23 deletions.
diff --git a/apis/python/src/tiledbsoma/__init__.py b/apis/python/src/tiledbsoma/__init__.py
@@ -134,19 +134,6 @@
     tiledbsoma_stats_json,
 )
 
-# Temporary for https://github.com/single-cell-data/TileDB-SOMA/issues/2407
-_new_shape_feature_flag = os.getenv("SOMA_PY_NEW_SHAPE") is not None
-
-
-def _new_shape_feature_flag_enabled() -> bool:
-    """
-    This is temporary only and will be removed once
-    https://github.com/single-cell-data/TileDB-SOMA/issues/2407
-    is complete.
-    """
-    return _new_shape_feature_flag
-
-
 # Load native libraries. On wheel builds, we may have a shared library
 # already linked. In this case, we can import directly
 try:

diff --git a/apis/python/src/tiledbsoma/_dataframe.py b/apis/python/src/tiledbsoma/_dataframe.py
@@ -14,7 +14,7 @@
 from somacore import options
 from typing_extensions import Self
 
-from tiledbsoma import _new_shape_feature_flag_enabled
+from tiledbsoma._flags import _new_shape_feature_flag_enabled
 
 from . import _arrow_types, _util
 from . import pytiledbsoma as clib

diff --git a/apis/python/src/tiledbsoma/_flags.py b/apis/python/src/tiledbsoma/_flags.py
@@ -0,0 +1,284 @@
+# Copyright (c) 2021-2023 The Chan Zuckerberg Initiative Foundation
+# Copyright (c) 2021-2023 TileDB, Inc.
+#
+# Licensed under the MIT License.
+
+"""Conversion to/from Arrow and TileDB type systems. Must be capable
+of representing full type semantics, and correctly performing a
+round trip conversion (e.g., T == to_arrow(to_tiledb(T)))
+
+Most primitive types are simple -- e.g., uint8. Of particular challenge
+are datetime/timestamps as TileDB has no distinction between a "datetime" and
+a "timedelta". The best Arrow match is TimestampType, as long as that
+TimestampType instance does NOT have a timezone set.
+
+Because of our round-trip requirement, all other Arrow temporal types
+are unsupported (even though they are just int64 under the covers).
+
+We auto-promote Arrow's string and binary to large_string and large_binary,
+respectively, as this is what TileDB stores -- a sequence of bytes preceded
+by a 64-bit (not 32-bit) length int.
+
+DataFrame-specific note: currently (as of 2.14), TileDB does not support
+Unicode array dimensions. All Arrow string types used in a DataFrame index
+columns (i.e., TileDB dimension) are coerced to ASCII. This equirement for
+ASCII-only dimensions will be relaxed in a future release. Unicode/UTF-8 is
+fully supported in SOMA DataFrame non-indexed columns.
+"""
+
+from typing import Any, Dict, Union
+
+import numpy as np
+import numpy.typing as npt
+import pandas as pd
+import pyarrow as pa
+
+_ARROW_TO_TDB_ATTR: Dict[Any, Union[str, TypeError]] = {
+    pa.string(): "U1",
+    pa.large_string(): "U1",
+    pa.binary(): "bytes",
+    pa.large_binary(): "bytes",
+    pa.timestamp("s"): "datetime64[s]",
+    pa.timestamp("ms"): "datetime64[ms]",
+    pa.timestamp("us"): "datetime64[us]",
+    pa.timestamp("ns"): "datetime64[ns]",
+    #
+    # Unsupported types in TileDB type system
+    pa.float16(): TypeError("float16 - unsupported type (use float32)"),
+    pa.date32(): TypeError("32-bit date - unsupported type (use TimestampType)"),
+    pa.date64(): TypeError("64-bit date - unsupported type (use TimestampType)"),
+}
+"""Dict of types unsupported by to_pandas_dtype, which require overrides for
+use in TileDB Attributes (aka DataFrame non-indexe columns).
+
+If the value is an instance of Exception, it will be raised.
+
+IMPORTANT: ALL non-primitive types supported by TileDB must be in this table.
+"""
+
+_PYARROW_TO_CARROW: Dict[pa.DataType, str] = {
+    pa.bool_(): "b",
+    pa.int8(): "c",
+    pa.int16(): "s",
+    pa.int32(): "i",
+    pa.int64(): "l",
+    pa.uint8(): "C",
+    pa.uint16(): "S",
+    pa.uint32(): "I",
+    pa.uint64(): "L",
+    pa.float32(): "f",
+    pa.float64(): "g",
+    pa.timestamp("s"): "tss:",
+    pa.timestamp("ms"): "tsm:",
+    pa.timestamp("us"): "tsu:",
+    pa.timestamp("ns"): "tsn:",
+}
+
+# Same as _ARROW_TO_TDB_ATTR, but used for DataFrame indexed columns, aka TileDB Dimensions.
+# Any type system differences from the base-case Attr should be added here.
+_ARROW_TO_TDB_DIM: Dict[Any, Union[str, TypeError]] = _ARROW_TO_TDB_ATTR.copy()
+"""Same as _ARROW_TO_TDB_ATTR, but used for DataFrame indexed columns, aka TileDB Dimensions.
+Any type system differences from the base-case Attr should be added here.
+"""
+_ARROW_TO_TDB_DIM.update(
+    {
+        pa.string(): "ascii",  # TODO: temporary work-around until Dimension UTF8 support is available.
+        pa.large_string(): "ascii",  # TODO: temporary work-around until Dimension UTF8 support is available.
+    }
+)
+
+
+def tiledb_type_from_arrow_type(
+    t: pa.DataType, is_indexed_column: bool = False
+) -> npt.DTypeLike:
+    """Given an Arrow type, return the corresponding TileDB type as a NumPy dtype.
+    Building block for Arrow-to-TileDB schema translation.
+
+    TileDB currently has different Unicode handling for dimensions and attributes.
+    Set the ``is_dimension`` parameter to True for indexed-column (AKA dimension)
+    rules, which currently requires all strings to be ASCII.
+
+    If type is unsupported, with raise a TypeError exception.
+
+    Args:
+        t:
+            Arrow DataType instance, e.g., pyarrow.int8().
+        is_indexed_column:
+            Use TileDB dimension type conversion rules.
+
+    Returns:
+        The numpy dtype corresponding to the ``t`` parameter.
+
+    Raises:
+        TypeError: if the type is unsupported.
+    """
+    if pa.types.is_dictionary(t):
+        t = t.index_type
+
+    arrow_to_tdb = _ARROW_TO_TDB_DIM if is_indexed_column else _ARROW_TO_TDB_ATTR
+    if t in arrow_to_tdb:
+        arrow_type = arrow_to_tdb[t]
+        if isinstance(arrow_type, Exception):
+            raise arrow_type
+        if arrow_type in ["ascii", "bytes"]:
+            return arrow_type
+        return np.dtype(arrow_type)
+
+    if not pa.types.is_primitive(t):
+        raise TypeError(f"Type {str(t)} - unsupported type")
+    if pa.types.is_timestamp(t):
+        raise TypeError("TimeStampType - unsupported type (timezone not supported)")
+    if pa.types.is_time32(t):
+        raise TypeError("Time64Type - unsupported type (use TimestampType)")
+    if pa.types.is_time64(t):
+        raise TypeError("Time32Type - unsupported type (use TimestampType)")
+    if pa.types.is_duration(t):
+        raise TypeError("DurationType - unsupported type (use TimestampType)")
+
+    # else lets try the default conversion path
+    try:
+        # Must force into a dtype to catch places where the Pandas type
+        # system has extra information that can't be expressed
+        return np.dtype(t.to_pandas_dtype())
+    except NotImplementedError as exc:
+        raise TypeError("Unsupported Arrow type") from exc
+
+
+def arrow_type_from_tiledb_dtype(
+    tiledb_dtype: npt.DTypeLike, bytes_are_ascii: bool = True
+) -> pa.DataType:
+    """Maps a TileDB dtype (``'bytes'``, ``'ascii'``, or an ``np.dtype``) to an Arrow type.  Note that
+    when we read tiledb schema off storage, ``ascii`` and ``bytes`` both have ``dtype`` of `"S"`
+    which is equal to ``bytes`` -- so, the caller should disambgiuate.
+    """
+    if tiledb_dtype == "bytes":
+        if bytes_are_ascii:
+            return pa.large_string()
+        else:
+            return pa.large_binary()
+    elif tiledb_dtype == "ascii" or tiledb_dtype == np.dtype(str):
+        return pa.large_string()
+    else:
+        return pa.from_numpy_dtype(tiledb_dtype)
+
+
+def is_string_dtypelike(dtype: npt.DTypeLike) -> bool:
+    # Much of this (including the null-check) is to make the type-checker happy,
+    # as npt.DTypeLike is a complex union including 'str' and None.
+    if dtype is None:
+        return False
+    if dtype == "str":
+        return True
+    if isinstance(dtype, np.dtype):
+        return is_string_dtype(dtype)
+    return False
+
+
+def is_string_dtype(dtype: Any) -> bool:
+    return dtype.name in ["object", "string", "str32", "str64"]
+
+
+def df_to_arrow(df: pd.DataFrame) -> pa.Table:
+    """
+    Handle special cases where pa.Table.from_pandas is not sufficient.
+    """
+    nullable_fields = set()
+    # Not for name, col in df.items() since we need df[k] on the left-hand sides
+    for key in df:
+        # Make attributes nullable. Context:
+        # * df_to_arrow is _solely_ for use of tiledbsoma.io
+        #   o Anyone calling the SOMA API directly has user-provided Arrow
+        #     schema which must be respected
+        #   o Anyone calling tiledbsoma.io -- including from_h5ad/from_anndata,
+        #     and update_obs/update_var -- does not provide an Arrow schema
+        #     explicitly.  We compute an Arrow schema for them here.
+        # * Even when the _initial_ data is all non-null down a particular
+        #   string column, there are two ways a _subsequent_ write can provide
+        #   nulls: append-mode ingest, or, update_obs/update_var wherein the new
+        #   data has nulls even when the data used at schema-create time was
+        #   non-null.
+        # * We have no way of knowing at initial ingest time whether or not
+        #   users will later be appending, or updating, with null data.
+        # * Note that Arrow has a per-field nullable flag in its schema metadata
+        #   -- and so do TileDB array schemas.
+        #
+        # Note in particular this is for the use of tiledbsoma.io:
+        #
+        # * In the tiledbsoma API (e.g. DataFrame.create) the user passes an
+        #   Arrow schema and we respect it as-is. They specify nullability, or
+        #   not, as they wish.
+        # * In tiledbsoma.io, the user-provided inputs are AnnData objects.
+        #   We compute the Arrow schema _for_ them. And we must accommodate
+        #   reasonable/predictable needs.
+
+        nullable_fields.add(key)
+
+        # Handle special cases for all null columns where the dtype is "object"
+        # or "category" and must be explicitly casted to the correct pandas
+        # extension dtype.
+        #
+        # Note: with
+        #   anndata.obs['new_col'] = pd.Series(data=np.nan, dtype=np.dtype(str))
+        # the dtype comes in to us via `tiledbsoma.io.from_anndata` not
+        # as `pd.StringDtype()` but rather as `object`.
+        if df[key].isnull().all():
+            if df[key].dtype.name == "object":
+                df[key] = pd.Series([None] * df.shape[0], dtype=pd.StringDtype())
+            elif df[key].dtype.name == "category":
+                df[key] = pd.Series([None] * df.shape[0], dtype=pd.CategoricalDtype())
+
+    # For categoricals, it's possible to get
+    #   TypeError: Object of type bool_ is not JSON serializable
+    # deep within library functions. Debugging reveals that this happens when
+    # the df[key].values.ordered is of type np.bool_ rather than Python bool.
+    # So, we cast and reconstruct.
+    for key in df:
+        column = df[key]
+        if isinstance(column.dtype, pd.CategoricalDtype):
+            if hasattr(column.values, "categories"):
+                categories = column.values.categories
+
+            if hasattr(column.values, "ordered"):
+                ordered = bool(column.values.ordered)
+
+            df[key] = pd.Categorical(
+                values=column, categories=categories, ordered=ordered
+            )
+
+    arrow_table = pa.Table.from_pandas(df)
+
+    md = arrow_table.schema.metadata
+    md.update(dict.fromkeys(nullable_fields, "nullable"))
+    arrow_table = arrow_table.replace_schema_metadata(md)
+
+    # For tiledbsoma.io (for which this method exists) _any_ dataset can be appended to
+    # later on. This means that on fresh ingest we must use a larger bit-width than
+    # the bare minimum necessary.
+    new_map = {}
+    for field in arrow_table.schema:
+        if pa.types.is_dictionary(field.type):
+            old_index_type = field.type.index_type
+            new_index_type = (
+                pa.int32()
+                if old_index_type in [pa.int8(), pa.int16()]
+                else old_index_type
+            )
+            new_map[field.name] = pa.dictionary(
+                new_index_type,
+                field.type.value_type,
+                field.type.ordered,
+            )
+        else:
+            new_map[field.name] = field.type
+    new_schema = pa.schema(new_map, metadata=arrow_table.schema.metadata)
+
+    arrow_table = pa.Table.from_pandas(df, schema=new_schema)
+
+    return arrow_table
+
+
+def pyarrow_to_carrow_type(pa_type: pa.DataType) -> str:
+    try:
+        return _PYARROW_TO_CARROW[pa_type]
+    except KeyError:
+        raise TypeError(f"Invalid pyarrow type {pa_type}") from None
diff --git a/apis/python/src/tiledbsoma/_sparse_nd_array.py b/apis/python/src/tiledbsoma/_sparse_nd_array.py
@@ -26,7 +26,7 @@
 from somacore.options import PlatformConfig
 from typing_extensions import Self
 
-from tiledbsoma import _new_shape_feature_flag_enabled
+from tiledbsoma._flags import _new_shape_feature_flag_enabled
 
 from . import _util
 

diff --git a/apis/python/tests/test_dataframe.py b/apis/python/tests/test_dataframe.py
@@ -92,7 +92,7 @@ def test_dataframe(tmp_path, arrow_schema):
 
         # More to come on https://github.com/single-cell-data/TileDB-SOMA/issues/2407
         assert (
-            sdf.tiledbsoma_has_upgraded_domain == soma._new_shape_feature_flag_enabled()
+            sdf.tiledbsoma_has_upgraded_domain == soma._flags._new_shape_feature_flag_enabled()
         )
 
         with pytest.raises(AttributeError):

diff --git a/apis/python/tests/test_shape.py b/apis/python/tests/test_shape.py
@@ -48,7 +48,7 @@ def test_sparse_nd_array_basics(
         # More to come on https://github.com/single-cell-data/TileDB-SOMA/issues/2407
         assert (
             snda.tiledbsoma_has_upgraded_shape
-            == tiledbsoma._new_shape_feature_flag_enabled()
+            == tiledbsoma._flags._new_shape_feature_flag_enabled()
         )
 
         # Before current-domain support: shape is maxshape.
@@ -58,7 +58,7 @@ def test_sparse_nd_array_basics(
         # involving R compatibility, and leaving room for a single tile
         # capacity, etc ...  we could check for some magic value but it suffices
         # to check that it's over 2 billion.)
-        if tiledbsoma._new_shape_feature_flag_enabled():
+        if tiledbsoma._flags._new_shape_feature_flag_enabled():
             for e in snda.maxshape:
                 assert e > 2_000_000_000
         else:
@@ -89,7 +89,7 @@ def test_sparse_nd_array_basics(
     with tiledbsoma.SparseNDArray.open(uri) as snda:
         assert snda.shape == arg_shape
         # This will change with current-domain support
-        if tiledbsoma._new_shape_feature_flag_enabled():
+        if tiledbsoma._flags._new_shape_feature_flag_enabled():
             for e in snda.maxshape:
                 assert e > 2_000_000_000
         else:
@@ -113,7 +113,7 @@ def test_sparse_nd_array_basics(
     with tiledbsoma.SparseNDArray.open(uri) as snda:
         assert snda.shape == arg_shape
 
-    if tiledbsoma._new_shape_feature_flag_enabled():
+    if tiledbsoma._flags._new_shape_feature_flag_enabled():
 
         # Test resize down
         new_shape = tuple([arg_shape[i] - 50 for i in range(ndim)])
@@ -267,11 +267,11 @@ def test_dataframe_basics(tmp_path, soma_joinid_domain, index_column_names):
         has_sjid_dim = "soma_joinid" in index_column_names
         if has_sjid_dim:
             assert sdf._maybe_soma_joinid_shape == 1 + soma_joinid_domain[1]
-            if not tiledbsoma._new_shape_feature_flag_enabled():
+            if not tiledbsoma._flags._new_shape_feature_flag_enabled():
                 assert sdf._maybe_soma_joinid_maxshape == 1 + soma_joinid_domain[1]
         else:
             assert sdf._maybe_soma_joinid_shape is None
-            if not tiledbsoma._new_shape_feature_flag_enabled():
+            if not tiledbsoma._flags._new_shape_feature_flag_enabled():
                 assert sdf._maybe_soma_joinid_maxshape is None
 
         assert len(sdf.non_empty_domain()) == len(index_column_names)
diff --git a/apis/python/tests/test_sparse_nd_array.py b/apis/python/tests/test_sparse_nd_array.py
@@ -1088,7 +1088,7 @@ def test_tile_extents(tmp_path):
     ).close()
 
     with tiledb.open(uri) as A:
-        if soma._new_shape_feature_flag_enabled():
+        if soma._flags._new_shape_feature_flag_enabled():
             assert A.schema.domain.dim(0).tile == 2048
             assert A.schema.domain.dim(1).tile == 2048
         else: