diff --git a/altair/_magics.py b/altair/_magics.py
index bac190aa3..99bc8bc67 100644
--- a/altair/_magics.py
+++ b/altair/_magics.py
@@ -9,7 +9,7 @@
 
 import IPython
 from IPython.core import magic_arguments
-import pandas as pd
+from altair.utils.core import DataFrameLike, _is_pandas_dataframe
 from toolz import curried
 
 from altair.vegalite import v5 as vegalite_v5
@@ -40,10 +40,12 @@ def _prepare_data(data, data_transformers):
     """Convert input data to data for use within schema"""
     if data is None or isinstance(data, dict):
         return data
-    elif isinstance(data, pd.DataFrame):
+    elif isinstance(data, DataFrameLike):
         return curried.pipe(data, data_transformers.get())
     elif isinstance(data, str):
         return {"url": data}
+    elif _is_pandas_dataframe(data):
+        return curried.pipe(data, data_transformers.get())
     else:
         warnings.warn("data of type {} not recognized".format(type(data)), stacklevel=1)
         return data
diff --git a/altair/utils/_importers.py b/altair/utils/_importers.py
index 718fa9129..2a8c6aee6 100644
--- a/altair/utils/_importers.py
+++ b/altair/utils/_importers.py
@@ -95,3 +95,27 @@ def pyarrow_available() -> bool:
         return True
     except ImportError:
         return False
+
+
+def import_pandas() -> ModuleType:
+    min_version = "0.25"
+    try:
+        version = importlib_version("pandas")
+        if Version(version) < Version(min_version):
+            raise RuntimeError(
+                f"The pandas package must be version {min_version} or greater. "
+                f"Found version {version}"
+            )
+        import pandas as pd
+
+        return pd
+    except ImportError as err:
+        raise ImportError(
+            f"Serialization of the DataFrame requires\n"
+            f"version {min_version} or greater of the 'pandas' package.\n"
+            f"This can be installed with pip using:\n"
+            f'  pip install "pandas>={min_version}"\n'
+            "or conda:\n"
+            f'  conda install -c conda-forge "pandas>={min_version}"\n\n'
+            f"ImportError: {err.args[0]}"
+        ) from err
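# ---------------------------------------------------------------------------
# Reviewer sketch (not part of the patch): import_pandas() above mirrors the
# lazy-import-with-version-gate pattern already used by the pyarrow helpers
# in _importers.py. The generic shape of the pattern is sketched below;
# require_package() is a hypothetical name, not Altair API. Note that the
# version is read from package metadata *before* the module is imported, so
# an unsupported version fails fast without paying the import cost.
import importlib
from importlib.metadata import version as importlib_version
from types import ModuleType

from packaging.version import Version


def require_package(name: str, min_version: str) -> ModuleType:
    try:
        # importlib.metadata raises PackageNotFoundError (an ImportError
        # subclass) when the package is absent, so one except clause covers
        # both "not installed" and "failed to import".
        installed = importlib_version(name)
        if Version(installed) < Version(min_version):
            raise RuntimeError(
                f"The {name} package must be version {min_version} or greater. "
                f"Found version {installed}"
            )
        return importlib.import_module(name)
    except ImportError as err:
        raise ImportError(
            f"This feature requires {name}>={min_version}. "
            f'Install it with: pip install "{name}>={min_version}"'
        ) from err


# Example: pd = require_package("pandas", "0.25")
# ---------------------------------------------------------------------------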
diff --git a/altair/utils/core.py b/altair/utils/core.py
index baf1013f7..b8ecaba07 100644
--- a/altair/utils/core.py
+++ b/altair/utils/core.py
@@ -25,9 +25,7 @@
 from types import ModuleType
 
 import jsonschema
-import pandas as pd
 import numpy as np
-from pandas.api.types import infer_dtype
 
 from altair.utils.schemapi import SchemaBase
 from altair.utils._dfi_types import Column, DtypeKind, DataFrame as DfiDataFrame
@@ -40,6 +38,7 @@
 from typing import Literal, Protocol, TYPE_CHECKING, runtime_checkable
 
 if TYPE_CHECKING:
+    import pandas as pd
     from pandas.core.interchange.dataframe_protocol import Column as PandasColumn
 
 V = TypeVar("V")
@@ -53,6 +52,11 @@ def __dataframe__(
     ) -> DfiDataFrame: ...
 
 
+def _is_pandas_dataframe(obj: Any) -> bool:
+    """Duck-type check for a pandas DataFrame without importing pandas."""
+    return all(attr in dir(obj) for attr in ["iloc", "columns", "index"])
+
+
 TYPECODE_MAP = {
     "ordinal": "O",
     "nominal": "N",
@@ -208,7 +212,10 @@
     ----------
     data: object
     """
-    typ = infer_dtype(data, skipna=False)
+    from altair.utils._importers import import_pandas
+
+    pd = import_pandas()
+    typ = pd.api.types.infer_dtype(data, skipna=False)
 
     if typ in [
         "floating",
@@ -299,7 +306,7 @@ def numpy_is_subtype(dtype: Any, subtype: Any) -> bool:
     return False
 
 
-def sanitize_dataframe(df: pd.DataFrame) -> pd.DataFrame:  # noqa: C901
+def sanitize_dataframe(df: "pd.DataFrame") -> "pd.DataFrame":  # noqa: C901
     """Sanitize a DataFrame to prepare it for serialization.
 
     * Make a copy
@@ -316,6 +323,9 @@ def sanitize_dataframe(df: pd.DataFrame) -> pd.DataFrame:  # noqa: C901
     * convert dedicated string column to objects and replace NaN with None
     * Raise a ValueError for TimeDelta dtypes
     """
+    from altair.utils._importers import import_pandas
+
+    pd = import_pandas()
     df = df.copy()
 
     if isinstance(df.columns, pd.RangeIndex):
@@ -448,7 +458,7 @@ def sanitize_arrow_table(pa_table):
 
 def parse_shorthand(
     shorthand: Union[Dict[str, Any], str],
-    data: Optional[Union[pd.DataFrame, DataFrameLike]] = None,
+    data: Optional[Union[DataFrameLike, "pd.DataFrame"]] = None,
     parse_aggregates: bool = True,
     parse_window_ops: bool = False,
    parse_timeunits: bool = True,
@@ -601,7 +611,7 @@
             # Fall back to pandas-based inference.
             # Note: The AttributeError catch is a workaround for
             # https://github.com/pandas-dev/pandas/issues/55332
-            if isinstance(data, pd.DataFrame):
+            if _is_pandas_dataframe(data):
                 attrs["type"] = infer_vegalite_type(data[unescaped_field])
             else:
                 raise
@@ -609,7 +619,7 @@
         if isinstance(attrs["type"], tuple):
             attrs["sort"] = attrs["type"][1]
             attrs["type"] = attrs["type"][0]
-    elif isinstance(data, pd.DataFrame):
+    elif _is_pandas_dataframe(data):
         # Fallback if pyarrow is not installed or if pandas is older than 1.5
         #
         # Remove escape sequences so that types can be inferred for columns with special characters
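# ---------------------------------------------------------------------------
# Reviewer sketch (not part of the patch): core.py now relies on two distinct
# runtime checks. isinstance() against the @runtime_checkable DataFrameLike
# Protocol only verifies that a __dataframe__ method exists (the DataFrame
# interchange protocol), while _is_pandas_dataframe() sniffs attributes so
# pandas never has to be imported. InterchangeOnlyFrame is a made-up stand-in
# used to show the difference.
from typing import Any, Protocol, runtime_checkable


@runtime_checkable
class DataFrameLike(Protocol):
    def __dataframe__(
        self, nan_as_null: bool = False, allow_copy: bool = True
    ) -> Any: ...


def _is_pandas_dataframe(obj: Any) -> bool:
    return all(attr in dir(obj) for attr in ["iloc", "columns", "index"])


class InterchangeOnlyFrame:
    # Implements the interchange protocol but is not pandas-shaped.
    def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True) -> Any:
        return self


frame = InterchangeOnlyFrame()
print(isinstance(frame, DataFrameLike))  # True: __dataframe__ is present
print(_is_pandas_dataframe(frame))       # False: no iloc/columns/index
# A real pandas DataFrame (pandas >= 1.5) satisfies both checks, which is why
# the call sites test DataFrameLike first and fall back to the duck check.
# ---------------------------------------------------------------------------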
"csv", filename: str = "{prefix}-{hash}.{extension}", @@ -216,13 +221,13 @@ def to_values(data: DataType) -> ToValuesReturnType: """Replace a DataFrame by a data model with values.""" check_data_type(data) if hasattr(data, "__geo_interface__"): - if isinstance(data, pd.DataFrame): + if _is_pandas_dataframe(data): data = sanitize_dataframe(data) # Maybe the type could be further clarified here that it is # SupportGeoInterface and then the ignore statement is not needed? data_sanitized = sanitize_geo_interface(data.__geo_interface__) # type: ignore[arg-type] return {"values": data_sanitized} - elif isinstance(data, pd.DataFrame): + elif _is_pandas_dataframe(data): data = sanitize_dataframe(data) return {"values": data.to_dict(orient="records")} elif isinstance(data, dict): @@ -238,8 +243,10 @@ def to_values(data: DataType) -> ToValuesReturnType: def check_data_type(data: DataType) -> None: - if not isinstance(data, (dict, pd.DataFrame, DataFrameLike)) and not any( - hasattr(data, attr) for attr in ["__geo_interface__"] + if ( + not isinstance(data, (dict, DataFrameLike)) + and not _is_pandas_dataframe(data) + and not any(hasattr(data, attr) for attr in ["__geo_interface__"]) ): raise TypeError( "Expected dict, DataFrame or a __geo_interface__ attribute, got: {}".format( @@ -259,13 +266,13 @@ def _data_to_json_string(data: DataType) -> str: """Return a JSON string representation of the input data""" check_data_type(data) if hasattr(data, "__geo_interface__"): - if isinstance(data, pd.DataFrame): + if _is_pandas_dataframe(data): data = sanitize_dataframe(data) # Maybe the type could be further clarified here that it is # SupportGeoInterface and then the ignore statement is not needed? data = sanitize_geo_interface(data.__geo_interface__) # type: ignore[arg-type] return json.dumps(data) - elif isinstance(data, pd.DataFrame): + elif _is_pandas_dataframe(data): data = sanitize_dataframe(data) return data.to_json(orient="records", double_precision=15) elif isinstance(data, dict): @@ -281,7 +288,7 @@ def _data_to_json_string(data: DataType) -> str: ) -def _data_to_csv_string(data: Union[dict, pd.DataFrame, DataFrameLike]) -> str: +def _data_to_csv_string(data: Union[dict, DataFrameLike, "pd.DataFrame"]) -> str: """return a CSV string representation of the input data""" check_data_type(data) if hasattr(data, "__geo_interface__"): @@ -289,15 +296,17 @@ def _data_to_csv_string(data: Union[dict, pd.DataFrame, DataFrameLike]) -> str: "to_csv does not work with data that " "contains the __geo_interface__ attribute" ) - elif isinstance(data, pd.DataFrame): + elif _is_pandas_dataframe(data): data = sanitize_dataframe(data) return data.to_csv(index=False) elif isinstance(data, dict): + from altair.utils._importers import import_pandas + if "values" not in data: raise KeyError("values expected in data dict, but not present") + pd = import_pandas() return pd.DataFrame.from_dict(data["values"]).to_csv(index=False) elif isinstance(data, DataFrameLike): - # experimental interchange dataframe support import pyarrow as pa import pyarrow.csv as pa_csv diff --git a/altair/utils/schemapi.py b/altair/utils/schemapi.py index b6cfa0ded..c34d0d4ff 100644 --- a/altair/utils/schemapi.py +++ b/altair/utils/schemapi.py @@ -22,6 +22,7 @@ overload, Literal, TypeVar, + TYPE_CHECKING, ) from itertools import zip_longest from importlib.metadata import version as importlib_version @@ -31,7 +32,6 @@ import jsonschema.exceptions import jsonschema.validators import numpy as np -import pandas as pd from packaging.version import 
diff --git a/altair/utils/schemapi.py b/altair/utils/schemapi.py
index b6cfa0ded..c34d0d4ff 100644
--- a/altair/utils/schemapi.py
+++ b/altair/utils/schemapi.py
@@ -22,6 +22,8 @@
     overload,
     Literal,
     TypeVar,
+    Protocol,
+    runtime_checkable,
 )
 from itertools import zip_longest
 from importlib.metadata import version as importlib_version
@@ -31,7 +32,6 @@
 import jsonschema.exceptions
 import jsonschema.validators
 import numpy as np
-import pandas as pd
 from packaging.version import Version
 
 # This leads to circular imports with the vegalite module. Currently, this works
@@ -44,6 +44,15 @@
 else:
     from typing_extensions import Self
 
+
+@runtime_checkable
+class _PandasTimestamp(Protocol):
+    # Structural stand-in for pandas.Timestamp: isinstance() passes for any
+    # object exposing an isoformat() method, so pandas does not have to be
+    # imported unless such a value is actually encountered.
+    def isoformat(self) -> str: ...
+
+
 TSchemaBase = TypeVar("TSchemaBase", bound=Type["SchemaBase"])
 
 ValidationErrorList = List[jsonschema.exceptions.ValidationError]
@@ -477,7 +486,9 @@ def _todict(obj: Any, context: Optional[Dict[str, Any]]) -> Any:
         return obj.to_dict()
     elif isinstance(obj, np.number):
         return float(obj)
-    elif isinstance(obj, (pd.Timestamp, np.datetime64)):
+    elif isinstance(obj, (_PandasTimestamp, np.datetime64)):
+        import pandas as pd
+
         return pd.Timestamp(obj).isoformat()
     else:
         return obj
@@ -936,7 +947,7 @@ def to_dict(
         # parsed_shorthand is removed from context if it exists so that it is
         # not passed to child to_dict function calls
         parsed_shorthand = context.pop("parsed_shorthand", {})
-        # Prevent that pandas categorical data is automatically sorted
+        # Prevent categorical data from being automatically sorted
         # when a non-ordinal data type is specified manually
         # or if the encoding channel does not support sorting
         if "sort" in parsed_shorthand and (
diff --git a/altair/vegalite/v5/api.py b/altair/vegalite/v5/api.py
index dfde5ee7e..789b0fc4f 100644
--- a/altair/vegalite/v5/api.py
+++ b/altair/vegalite/v5/api.py
@@ -4,7 +4,6 @@
 import io
 import json
 import jsonschema
-import pandas as pd
 from toolz.curried import pipe as _pipe
 import itertools
 import sys
@@ -26,7 +25,7 @@
     using_vegafusion as _using_vegafusion,
     compile_with_vegafusion as _compile_with_vegafusion,
 )
-from ...utils.core import DataFrameLike
+from ...utils.core import DataFrameLike, _is_pandas_dataframe
 from ...utils.data import DataType
 
 if sys.version_info >= (3, 11):
@@ -106,15 +105,15 @@ def _prepare_data(data, context=None):
     if data is Undefined:
         return data
 
-    # convert dataframes or objects with __geo_interface__ to dict
-    elif isinstance(data, pd.DataFrame) or hasattr(data, "__geo_interface__"):
+    # convert dataframes or objects with __geo_interface__ to dict
+    elif isinstance(data, DataFrameLike) or hasattr(data, "__geo_interface__"):
         data = _pipe(data, data_transformers.get())
 
     # convert string input to a URLData
     elif isinstance(data, str):
         data = core.UrlData(data)
 
-    elif isinstance(data, DataFrameLike):
+    elif _is_pandas_dataframe(data):
         data = _pipe(data, data_transformers.get())
 
     # consolidate inline data to top-level datasets
diff --git a/altair/vegalite/v5/schema/channels.py b/altair/vegalite/v5/schema/channels.py
index 089a534a6..62adc8301 100644
--- a/altair/vegalite/v5/schema/channels.py
+++ b/altair/vegalite/v5/schema/channels.py
@@ -11,7 +11,7 @@
 
 import sys
 from . import core
-import pandas as pd
+from altair.utils.core import DataFrameLike
 from altair.utils.schemapi import Undefined, UndefinedType, with_property_setters
 from altair.utils import parse_shorthand
 from typing import Any, overload, Sequence, List, Literal, Union, Optional
@@ -60,7 +60,7 @@ def to_dict(
             # We still parse it out of the shorthand, but drop it here.
             parsed.pop("type", None)
         elif not (type_in_shorthand or type_defined_explicitly):
-            if isinstance(context.get("data", None), pd.DataFrame):
+            if isinstance(context.get("data", None), DataFrameLike):
                 raise ValueError(
                     'Unable to determine data type for the field "{}";'
                     " verify that the field name is not misspelled."
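# ---------------------------------------------------------------------------
# Reviewer sketch (not part of the patch): why a @runtime_checkable Protocol
# can replace the isinstance check against pd.Timestamp in _todict(). The
# Protocol matches structurally, so pandas timestamps (which expose
# isoformat()) are caught without this module ever importing pandas;
# _TimestampLike is an illustrative analogue of _PandasTimestamp.
import datetime as dt
from typing import Protocol, runtime_checkable


@runtime_checkable
class _TimestampLike(Protocol):
    def isoformat(self) -> str: ...


print(isinstance(dt.datetime(2024, 1, 1), _TimestampLike))  # True
print(isinstance("2024-01-01", _TimestampLike))             # False
# Caveat: anything with an isoformat() method matches (datetime.date included),
# so the branch should sit after any more specific type checks in _todict().
# ---------------------------------------------------------------------------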
diff --git a/tests/utils/test_core.py b/tests/utils/test_core.py
index 27cd3b7ee..185588456 100644
--- a/tests/utils/test_core.py
+++ b/tests/utils/test_core.py
@@ -4,11 +4,11 @@
 
 import numpy as np
 import pandas as pd
+from pandas.api.types import infer_dtype
 import pytest
 
 import altair as alt
 from altair.utils.core import parse_shorthand, update_nested, infer_encoding_types
-from altair.utils.core import infer_dtype
 
 json_schema_specification = alt.load_schema()["$schema"]
 json_schema_dict_str = f'{{"$schema": "{json_schema_specification}"}}'
diff --git a/tools/generate_schema_wrapper.py b/tools/generate_schema_wrapper.py
index fdd73df54..9481693b8 100644
--- a/tools/generate_schema_wrapper.py
+++ b/tools/generate_schema_wrapper.py
@@ -139,7 +139,7 @@ def to_dict(
             # We still parse it out of the shorthand, but drop it here.
             parsed.pop("type", None)
         elif not (type_in_shorthand or type_defined_explicitly):
-            if isinstance(context.get("data", None), pd.DataFrame):
+            if isinstance(context.get("data", None), DataFrameLike):
                 raise ValueError(
                     'Unable to determine data type for the field "{}";'
                     " verify that the field name is not misspelled."
@@ -547,7 +547,7 @@ def generate_vegalite_channel_wrappers(
     imports = [
         "import sys",
         "from . import core",
-        "import pandas as pd",
+        "from altair.utils.core import DataFrameLike",
         "from altair.utils.schemapi import Undefined, UndefinedType, with_property_setters",
         "from altair.utils import parse_shorthand",
         "from typing import Any, overload, Sequence, List, Literal, Union, Optional",
diff --git a/tools/schemapi/schemapi.py b/tools/schemapi/schemapi.py
index 52d58d1d3..ef494f3ff 100644
--- a/tools/schemapi/schemapi.py
+++ b/tools/schemapi/schemapi.py
@@ -20,6 +20,8 @@
     overload,
     Literal,
     TypeVar,
+    Protocol,
+    runtime_checkable,
 )
 from itertools import zip_longest
 from importlib.metadata import version as importlib_version
@@ -29,7 +30,6 @@
 import jsonschema.exceptions
 import jsonschema.validators
 import numpy as np
-import pandas as pd
 from packaging.version import Version
 
 # This leads to circular imports with the vegalite module. Currently, this works
@@ -42,6 +42,15 @@
 else:
     from typing_extensions import Self
 
+
+@runtime_checkable
+class _PandasTimestamp(Protocol):
+    # Structural stand-in for pandas.Timestamp: isinstance() passes for any
+    # object exposing an isoformat() method, so pandas does not have to be
+    # imported unless such a value is actually encountered.
+    def isoformat(self) -> str: ...
+
+
 TSchemaBase = TypeVar("TSchemaBase", bound=Type["SchemaBase"])
 
 ValidationErrorList = List[jsonschema.exceptions.ValidationError]
@@ -475,7 +484,9 @@ def _todict(obj: Any, context: Optional[Dict[str, Any]]) -> Any:
         return obj.to_dict()
     elif isinstance(obj, np.number):
         return float(obj)
-    elif isinstance(obj, (pd.Timestamp, np.datetime64)):
+    elif isinstance(obj, (_PandasTimestamp, np.datetime64)):
+        import pandas as pd
+
         return pd.Timestamp(obj).isoformat()
     else:
         return obj
@@ -934,7 +945,7 @@ def to_dict(
         # parsed_shorthand is removed from context if it exists so that it is
         # not passed to child to_dict function calls
         parsed_shorthand = context.pop("parsed_shorthand", {})
-        # Prevent that pandas categorical data is automatically sorted
+        # Prevent categorical data from being automatically sorted
         # when a non-ordinal data type is specified manually
         # or if the encoding channel does not support sorting
         if "sort" in parsed_shorthand and (