diff --git a/py-polars/polars/api.py b/py-polars/polars/api.py index 9d714e26930f..84c3e79b2446 100644 --- a/py-polars/polars/api.py +++ b/py-polars/polars/api.py @@ -89,7 +89,7 @@ def register_expr_namespace(name: str) -> Callable[[type[NS]], type[NS]]: ... def nearest(self, p: int) -> pl.Expr: ... return (p ** (self._expr.log(p)).round(0).cast(pl.Int64)).cast(pl.Int64) >>> - >>> df = pl.DataFrame([1.4, 24.3, 55.0, 64.001], columns=["n"]) + >>> df = pl.DataFrame([1.4, 24.3, 55.0, 64.001], schema=["n"]) >>> df.select( ... [ ... pl.col("n"), @@ -152,7 +152,7 @@ def register_dataframe_namespace(name: str) -> Callable[[type[NS]], type[NS]]: >>> >>> df = pl.DataFrame( ... data=[["xx", 2, 3, 4], ["xy", 4, 5, 6], ["yy", 5, 6, 7], ["yz", 6, 7, 8]], - ... columns=["a1", "a2", "b1", "b2"], + ... schema=["a1", "a2", "b1", "b2"], ... orient="row", ... ) >>> df @@ -248,7 +248,7 @@ def register_lazyframe_namespace(name: str) -> Callable[[type[NS]], type[NS]]: >>> >>> ldf = pl.DataFrame( ... data={"a": [1, 2], "b": [3, 4], "c": [5.6, 6.7]}, - ... columns=[("a", pl.Int16), ("b", pl.Int32), ("c", pl.Float32)], + ... schema=[("a", pl.Int16), ("b", pl.Int32), ("c", pl.Float32)], ... ).lazy() >>> >>> ldf.collect() @@ -274,7 +274,7 @@ def register_lazyframe_namespace(name: str) -> Callable[[type[NS]], type[NS]]: >>> >>> ldf = pl.DataFrame( ... data=[["xx", 2, 3, 4], ["xy", 4, 5, 6], ["yy", 5, 6, 7], ["yz", 6, 7, 8]], - ... columns=["a1", "a2", "b1", "b2"], + ... schema=["a1", "a2", "b1", "b2"], ... orient="row", ... ).lazy() >>> diff --git a/py-polars/polars/convert.py b/py-polars/polars/convert.py index 08263d842116..94138c06922d 100644 --- a/py-polars/polars/convert.py +++ b/py-polars/polars/convert.py @@ -7,31 +7,40 @@ from polars.dependencies import pandas as pd from polars.dependencies import pyarrow as pa from polars.internals import DataFrame, Series +from polars.internals.construction import _unpack_schema, include_unknowns from polars.utils import deprecated_alias if TYPE_CHECKING: from polars.internals.type_aliases import Orientation +@deprecated_alias(columns="schema") def from_dict( data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | Series], - columns: SchemaDefinition | None = None, + schema: SchemaDefinition | None = None, *, schema_overrides: SchemaDict | None = None, ) -> DataFrame: """ Construct a DataFrame from a dictionary of sequences. - This operation clones data, unless you pass in a ``Dict[str, pl.Series]``. + This operation clones data, unless you pass a ``{str: pl.Series,}`` dict. Parameters ---------- data : dict of sequences Two-dimensional data represented as a dictionary. dict must contain Sequences. - columns : Sequence of str, default None - Column labels to use for resulting DataFrame. If specified, overrides any - labels already present in the data. Must match data dimensions. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. 
schema_overrides : dict, default None
        Support type specification or override of one or more columns; note that
        any dtypes inferred from the columns param will be overridden.
@@ -56,15 +65,15 @@ def from_dict(

    """
    return DataFrame._from_dict(
-        data=data, schema=columns, schema_overrides=schema_overrides
+        data=data, schema=schema, schema_overrides=schema_overrides
    )


-@deprecated_alias(schema="schema_overrides")
 def from_dicts(
     dicts: Sequence[dict[str, Any]],
     infer_schema_length: int | None = N_INFER_DEFAULT,
     *,
+    schema: SchemaDefinition | None = None,
     schema_overrides: SchemaDict | None = None,
 ) -> DataFrame:
     """
@@ -76,7 +85,17 @@
         Sequence with dictionaries mapping column name to value
     infer_schema_length
         How many dictionaries/rows to scan to determine the data types
-        if set to `None` all rows are scanned. This will be slow.
+        if set to `None` then ALL dicts are scanned; this will be slow.
+    schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict
+        The DataFrame schema may be declared in several ways:
+
+        * As a dict of {name:type} pairs; if type is None, it will be auto-inferred.
+        * As a list of column names; in this case types are automatically inferred.
+        * As a list of (name,type) pairs; this is equivalent to the dictionary form.
+
+        If you supply a list of column names that does not match the names in the
+        underlying data, the names given here will overwrite them. The number
+        of names given in the schema should match the underlying data dimensions.
     schema_overrides : dict, default None
         Support override of inferred types for one or more columns.
@@ -100,24 +119,9 @@
     │ 3   ┆ 6   │
     └─────┴─────┘

-    >>> # overwrite first column name and dtype
-    >>> pl.from_dicts(data, schema_overrides={"c": pl.Int32})
-    shape: (3, 2)
-    ┌─────┬─────┐
-    │ c   ┆ b   │
-    │ --- ┆ --- │
-    │ i32 ┆ i64 │
-    ╞═════╪═════╡
-    │ 1   ┆ 4   │
-    │ 2   ┆ 5   │
-    │ 3   ┆ 6   │
-    └─────┴─────┘
-
-    >>> # let polars infer the dtypes
-    >>> # but inform about a 3rd column
-    >>> pl.from_dicts(
-    ...     data, schema_overrides={"a": pl.Unknown, "b": pl.Unknown, "c": pl.Int32}
-    ... )
+    >>> # let polars infer the first two column dtypes, and
+    >>> # explicitly inform the constructor about a third column
+    >>> pl.from_dicts(data, schema=["a", "b"], schema_overrides={"c": pl.Int32})
     shape: (3, 3)
     ┌─────┬─────┬──────┐
     │ a   ┆ b   ┆ c    │
@@ -130,15 +134,21 @@
     └─────┴─────┴──────┘

     """
+    columns, schema = _unpack_schema(
+        schema, schema_overrides=schema_overrides, include_overrides_in_columns=True
+    )
+    schema_overrides = include_unknowns(schema, columns or list(schema))
     return DataFrame._from_dicts(
         dicts, infer_schema_length, schema_overrides=schema_overrides
     )


+@deprecated_alias(columns="schema")
 def from_records(
     data: Sequence[Sequence[Any]],
-    columns: Sequence[str] | None = None,
+    schema: SchemaDefinition | None = None,
     orient: Orientation | None = None,
+    *,
     infer_schema_length: int | None = N_INFER_DEFAULT,
     schema_overrides: SchemaDict | None = None,
 ) -> DataFrame:
     """
@@ -151,9 +161,16 @@
     ----------
     data : Sequence of sequences
         Two-dimensional data represented as a sequence of sequences.
-    columns : Sequence of str, default None
-        Column labels to use for resulting DataFrame. Must match data dimensions.
-        If not specified, columns will be named `column_0`, `column_1`, etc.
+ schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. orient : {None, 'col', 'row'} Whether to interpret two-dimensional data as columns or as rows. If None, the orientation is inferred by matching the columns and data dimensions. If @@ -172,7 +189,7 @@ def from_records( Examples -------- >>> data = [[1, 2, 3], [4, 5, 6]] - >>> df = pl.from_records(data, columns=["a", "b"]) + >>> df = pl.from_records(data, schema=["a", "b"]) >>> df shape: (3, 2) ┌─────┬─────┐ @@ -188,16 +205,17 @@ def from_records( """ return DataFrame._from_records( data, - columns=columns, + schema=schema, schema_overrides=schema_overrides, orient=orient, infer_schema_length=infer_schema_length, ) +@deprecated_alias(columns="schema") def from_numpy( data: np.ndarray[Any, Any], - columns: Sequence[str] | None = None, + schema: SchemaDefinition | None = None, orient: Orientation | None = None, schema_overrides: SchemaDict | None = None, ) -> DataFrame: @@ -210,9 +228,16 @@ def from_numpy( ---------- data : :class:`numpy.ndarray` Two-dimensional data represented as a numpy ndarray. - columns : Sequence of str, default None - Column labels to use for resulting DataFrame. Must match data dimensions. - If not specified, columns will be named `column_0`, `column_1`, etc. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. orient : {None, 'col', 'row'} Whether to interpret two-dimensional data as columns or as rows. If None, the orientation is inferred by matching the columns and data dimensions. If @@ -229,7 +254,7 @@ def from_numpy( -------- >>> import numpy as np >>> data = np.array([[1, 2, 3], [4, 5, 6]]) - >>> df = pl.from_numpy(data, columns=["a", "b"], orient="col") + >>> df = pl.from_numpy(data, schema=["a", "b"], orient="col") >>> df shape: (3, 2) ┌─────┬─────┐ @@ -244,7 +269,7 @@ def from_numpy( """ return DataFrame._from_numpy( - data, columns=columns, orient=orient, schema_overrides=schema_overrides + data, schema=schema, orient=orient, schema_overrides=schema_overrides ) @@ -271,14 +296,14 @@ def from_arrow( If not specified, existing Array table columns are used, with missing names named as `column_0`, `column_1`, etc. schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict - The resulting DataFrame schema may be declared in several ways: + The DataFrame schema may be declared in several ways: - * As a dict of {name:type} pairs; if the type is None, it will be auto-inferred. 
- * As a list of column names; in this case types are all automatically inferred. + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. * As a list of (name,type) pairs; this is equivalent to the dictionary form. If you supply a list of column names that does not match the names in the - underlying data, the names supplied here will overwrite them. The number + underlying data, the names given here will overwrite them. The number of names given in the schema should match the underlying data dimensions. schema_overrides : dict, default None @@ -325,7 +350,7 @@ def from_arrow( """ if isinstance(a, pa.Table): return DataFrame._from_arrow( - a, rechunk=rechunk, columns=schema, schema_overrides=schema_overrides + a, rechunk=rechunk, schema=schema, schema_overrides=schema_overrides ) elif isinstance(a, (pa.Array, pa.ChunkedArray)): return Series._from_arrow("", a, rechunk) diff --git a/py-polars/polars/internals/construction.py b/py-polars/polars/internals/construction.py index 4e305322aaa8..f73240b0323e 100644 --- a/py-polars/polars/internals/construction.py +++ b/py-polars/polars/internals/construction.py @@ -90,7 +90,10 @@ def include_unknowns( schema: SchemaDict, cols: Sequence[str] ) -> MutableMapping[str, PolarsDataType]: """Complete partial schema dict by including Unknown type.""" - return {col: schema.get(col, Unknown) for col in cols} + return { + col: (schema.get(col, Unknown) or Unknown) # type: ignore[truthy-bool] + for col in cols + } ################################ @@ -307,7 +310,7 @@ def sequence_to_pyseries( empty = {} # type: ignore[var-annotated] return sequence_to_pydf( data=[(empty if v is None else v) for v in values], - columns=struct_schema, + schema=struct_schema, orient="row", ).to_struct(name) else: @@ -478,12 +481,10 @@ def _pandas_series_to_arrow( dtype = getattr(values, "dtype", None) if dtype == "object": first_non_none = _get_first_non_none(values.values) # type: ignore[arg-type] - if isinstance(first_non_none, str): return pa.array(values, pa.large_utf8(), from_pandas=nan_to_none) - if first_non_none is None: + elif first_non_none is None: return pa.nulls(min_len, pa.large_utf8()) - return pa.array(values, from_pandas=nan_to_none) elif dtype: return pa.array(values, from_pandas=nan_to_none) @@ -543,7 +544,7 @@ def _post_apply_columns( ) -> PyDataFrame: """Apply 'columns' param _after_ PyDataFrame creation (if no alternative).""" pydf_columns, pydf_dtypes = pydf.columns(), pydf.dtypes() - columns, dtypes = _unpack_columns( + columns, dtypes = _unpack_schema( (columns or pydf_columns), schema_overrides=schema_overrides ) column_subset: list[str] = [] @@ -573,11 +574,12 @@ def _post_apply_columns( return pydf -def _unpack_columns( - columns: SchemaDefinition | None, +def _unpack_schema( + schema: SchemaDefinition | None, schema_overrides: SchemaDict | None = None, n_expected: int | None = None, lookup_names: Iterable[str] | None = None, + include_overrides_in_columns: bool = False, ) -> tuple[list[str], SchemaDict]: """ Unpack column names and create dtype lookup. @@ -585,11 +587,11 @@ def _unpack_columns( Works for any (name, dtype) pairs or schema dict input, overriding any inferred dtypes with explicit dtypes if supplied. 
""" - if isinstance(columns, dict): - columns = list(columns.items()) + if isinstance(schema, dict): + schema = list(schema.items()) column_names = [ (col or f"column_{i}") if isinstance(col, str) else col[0] - for i, col in enumerate(columns or []) + for i, col in enumerate(schema or []) ] if not column_names and n_expected: column_names = [f"column_{i}" for i in range(n_expected)] @@ -598,11 +600,15 @@ def _unpack_columns( } column_dtypes = { lookup.get(col[0], col[0]): col[1] - for col in (columns or []) + for col in (schema or []) if not isinstance(col, str) and col[1] } if schema_overrides: column_dtypes.update(schema_overrides) + if schema and include_overrides_in_columns: + column_names = column_names + [ + col for col in column_dtypes if col not in column_names + ] return ( column_names or None, # type: ignore[return-value] @@ -660,23 +666,23 @@ def _expand_dict_scalars( def dict_to_pydf( data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | pli.Series], - columns: SchemaDefinition | None = None, + schema: SchemaDefinition | None = None, schema_overrides: SchemaDict | None = None, ) -> PyDataFrame: """Construct a PyDataFrame from a dictionary of sequences.""" - if not columns: - columns = list(data) - if columns: + if not schema: + schema = list(data) + if schema: # the columns arg may also set the dtype/column order of the series - if isinstance(columns, dict) and data: - if not all((col in columns) for col in data): + if isinstance(schema, dict) and data: + if not all((col in schema) for col in data): raise ValueError( "The given column-schema names do not match the data dictionary" ) - data = {col: data[col] for col in columns} + data = {col: data[col] for col in schema} - columns, schema_overrides = _unpack_columns( - columns, lookup_names=data.keys(), schema_overrides=schema_overrides + columns, schema_overrides = _unpack_schema( + schema, lookup_names=data.keys(), schema_overrides=schema_overrides ) if not data and schema_overrides: data_series = [ @@ -724,7 +730,7 @@ def dict_to_pydf( def sequence_to_pydf( data: Sequence[Any], - columns: SchemaDefinition | None = None, + schema: SchemaDefinition | None = None, schema_overrides: SchemaDict | None = None, orient: Orientation | None = None, infer_schema_length: int | None = N_INFER_DEFAULT, @@ -732,30 +738,30 @@ def sequence_to_pydf( """Construct a PyDataFrame from a sequence.""" data_series: list[PySeries] if len(data) == 0: - return dict_to_pydf({}, columns=columns, schema_overrides=schema_overrides) + return dict_to_pydf({}, schema=schema, schema_overrides=schema_overrides) if isinstance(data[0], Generator): data = [list(row) for row in data] if isinstance(data[0], pli.Series): series_names = [s.name for s in data] - columns, schema_overrides = _unpack_columns( - columns or series_names, + column_names, schema_overrides = _unpack_schema( + schema or series_names, schema_overrides=schema_overrides, n_expected=len(data), ) data_series = [] for i, s in enumerate(data): if not s.name: # TODO: Replace by `if s.name is None` once allowed - s.rename(columns[i], in_place=True) - new_dtype = schema_overrides.get(columns[i]) + s.rename(column_names[i], in_place=True) + new_dtype = schema_overrides.get(column_names[i]) if new_dtype and new_dtype != s.dtype: s = s.cast(new_dtype) data_series.append(s._s) elif isinstance(data[0], dict): - column_names, schema_overrides = _unpack_columns( - columns, schema_overrides=schema_overrides + column_names, schema_overrides = _unpack_schema( + schema, 
schema_overrides=schema_overrides ) dtypes = ( include_unknowns(schema_overrides, column_names) @@ -769,22 +775,22 @@ def sequence_to_pydf( elif isinstance(data[0], (list, tuple, Sequence)) and not isinstance(data[0], str): if is_namedtuple(data[0]): - if columns is None: - columns = data[0]._fields # type: ignore[attr-defined] - if len(data[0].__annotations__) == len(columns): - columns = [ + if schema is None: + schema = data[0]._fields # type: ignore[attr-defined] + if len(data[0].__annotations__) == len(schema): + schema = [ (name, py_type_to_dtype(tp, raise_unmatched=False)) for name, tp in data[0].__annotations__.items() ] elif orient is None: orient = "row" - if orient is None and columns is not None: - orient = "col" if len(columns) == len(data) else "row" + if orient is None and schema is not None: + orient = "col" if len(schema) == len(data) else "row" if orient == "row": - column_names, schema_overrides = _unpack_columns( - columns, schema_overrides=schema_overrides, n_expected=len(data[0]) + column_names, schema_overrides = _unpack_schema( + schema, schema_overrides=schema_overrides, n_expected=len(data[0]) ) schema_override = ( include_unknowns(schema_overrides, column_names) @@ -810,11 +816,13 @@ def sequence_to_pydf( return pydf elif orient == "col" or orient is None: - columns, schema_overrides = _unpack_columns( - columns, schema_overrides=schema_overrides, n_expected=len(data) + column_names, schema_overrides = _unpack_schema( + schema, schema_overrides=schema_overrides, n_expected=len(data) ) data_series = [ - pli.Series(columns[i], data[i], schema_overrides.get(columns[i]))._s + pli.Series( + column_names[i], data[i], schema_overrides.get(column_names[i]) + )._s for i in range(len(data)) ] else: @@ -823,15 +831,15 @@ def sequence_to_pydf( ) elif is_dataclass(data[0]): - if columns: - columns, schema_overrides = _unpack_columns( - columns, schema_overrides=schema_overrides + if schema: + column_names, schema_overrides = _unpack_schema( + schema, schema_overrides=schema_overrides ) schema_override = { - col: schema_overrides.get(col, Unknown) for col in columns + col: schema_overrides.get(col, Unknown) for col in column_names } else: - columns = None + column_names = None schema_override = { col: (py_type_to_dtype(tp, raise_unmatched=False) or Unknown) for col, tp in dataclass_type_hints(data[0].__class__).items() @@ -850,44 +858,46 @@ def sequence_to_pydf( col: tp for col, tp in schema_override.items() if isinstance(tp, Struct) } pydf = _post_apply_columns( - pydf, columns, structs, schema_overrides=schema_overrides + pydf, column_names, structs, schema_overrides=schema_overrides ) return pydf elif _check_for_pandas(data[0]) and isinstance( data[0], (pd.Series, pd.DatetimeIndex) ): - if columns is not None: - columns, schema_overrides = _unpack_columns( - columns, schema_overrides=schema_overrides, n_expected=1 + if schema is None: + column_names = None + else: + column_names, schema_overrides = _unpack_schema( + schema, schema_overrides=schema_overrides, n_expected=1 ) schema_overrides = schema_overrides or {} data_series = [] for i, s in enumerate(data): - name = columns[i] if columns else s.name + name = column_names[i] if column_names else s.name dtype = schema_overrides.get(name, None) pyseries = pandas_to_pyseries(name=name, values=s) if dtype is not None and dtype != pyseries.dtype(): pyseries = pyseries.cast(dtype, strict=True) data_series.append(pyseries) - columns = None + column_names = None else: - columns, schema_overrides = _unpack_columns( - columns, 
schema_overrides=schema_overrides, n_expected=1 + column_names, schema_overrides = _unpack_schema( + schema, schema_overrides=schema_overrides, n_expected=1 ) data_series = [ - pli.Series(columns[0], data, schema_overrides.get(columns[0]))._s + pli.Series(column_names[0], data, schema_overrides.get(column_names[0]))._s ] - data_series = _handle_columns_arg(data_series, columns=columns) + data_series = _handle_columns_arg(data_series, columns=column_names) return PyDataFrame(data_series) def numpy_to_pydf( data: np.ndarray[Any, Any], - columns: SchemaDefinition | None = None, + schema: SchemaDefinition | None = None, schema_overrides: SchemaDict | None = None, orient: Orientation | None = None, ) -> PyDataFrame: @@ -904,13 +914,13 @@ def numpy_to_pydf( elif len(shape) == 2: # default convention # first axis is rows, second axis is columns - if orient is None and columns is None: + if orient is None and schema is None: n_columns = shape[1] orient = "row" # Infer orientation if columns argument is given - elif orient is None and columns is not None: - if len(columns) == shape[0]: + elif orient is None and schema is not None: + if len(schema) == shape[0]: orient = "col" n_columns = shape[0] else: @@ -930,11 +940,11 @@ def numpy_to_pydf( "Cannot create DataFrame from numpy array with more than two dimensions." ) - if columns is not None and len(columns) != n_columns: + if schema is not None and len(schema) != n_columns: raise ValueError("Dimensions of columns arg must match data dimensions.") - columns, schema_overrides = _unpack_columns( - columns, schema_overrides=schema_overrides, n_expected=n_columns + column_names, schema_overrides = _unpack_schema( + schema, schema_overrides=schema_overrides, n_expected=n_columns ) # Convert data to series @@ -943,39 +953,43 @@ def numpy_to_pydf( elif len(shape) == 1: data_series = [ - pli.Series(columns[0], data, schema_overrides.get(columns[0]))._s + pli.Series(column_names[0], data, schema_overrides.get(column_names[0]))._s ] else: if orient == "row": data_series = [ - pli.Series(columns[i], data[:, i], schema_overrides.get(columns[i]))._s + pli.Series( + column_names[i], data[:, i], schema_overrides.get(column_names[i]) + )._s for i in range(n_columns) ] else: data_series = [ - pli.Series(columns[i], data[i], schema_overrides.get(columns[i]))._s + pli.Series( + column_names[i], data[i], schema_overrides.get(column_names[i]) + )._s for i in range(n_columns) ] - data_series = _handle_columns_arg(data_series, columns=columns) + data_series = _handle_columns_arg(data_series, columns=column_names) return PyDataFrame(data_series) def arrow_to_pydf( data: pa.Table, - columns: SchemaDefinition | None = None, + schema: SchemaDefinition | None = None, schema_overrides: SchemaDict | None = None, rechunk: bool = True, ) -> PyDataFrame: """Construct a PyDataFrame from an Arrow Table.""" - original_columns = columns - columns, schema_overrides = _unpack_columns( - (columns or data.column_names), schema_overrides=schema_overrides + original_schema = schema + column_names, schema_overrides = _unpack_schema( + (schema or data.column_names), schema_overrides=schema_overrides ) try: - if columns and columns != data.column_names: - data = data.rename_columns(columns) + if column_names and column_names != data.column_names: + data = data.rename_columns(column_names) except pa.lib.ArrowInvalid as e: raise ValueError("Dimensions of columns arg must match data dimensions.") from e @@ -1039,16 +1053,16 @@ def arrow_to_pydf( df = df[names] pydf = df._df - if columns != 
original_columns and (schema_overrides or original_columns):
+    if column_names != original_schema and (schema_overrides or original_schema):
        pydf = _post_apply_columns(
-            pydf, original_columns, schema_overrides=schema_overrides
+            pydf, original_schema, schema_overrides=schema_overrides
        )
    elif schema_overrides:
        for col, dtype in zip(pydf.columns(), pydf.dtypes()):
            override_dtype = schema_overrides.get(col)
            if override_dtype is not None and dtype != override_dtype:
                pydf = _post_apply_columns(
-                    pydf, original_columns, schema_overrides=schema_overrides
+                    pydf, original_schema, schema_overrides=schema_overrides
                )
                break
@@ -1057,69 +1071,69 @@


 def series_to_pydf(
     data: pli.Series,
-    columns: SchemaDefinition | None = None,
+    schema: SchemaDefinition | None = None,
     schema_overrides: SchemaDict | None = None,
 ) -> PyDataFrame:
     """Construct a PyDataFrame from a Polars Series."""
     data_series = [data._s]
     series_name = [s.name() for s in data_series]
-    columns, schema_overrides = _unpack_columns(
-        columns or series_name, schema_overrides=schema_overrides, n_expected=1
+    column_names, schema_overrides = _unpack_schema(
+        schema or series_name, schema_overrides=schema_overrides, n_expected=1
     )
     if schema_overrides:
         new_dtype = list(schema_overrides.values())[0]
         if new_dtype != data.dtype:
             data_series[0] = data_series[0].cast(new_dtype, True)

-    data_series = _handle_columns_arg(data_series, columns=columns)
+    data_series = _handle_columns_arg(data_series, columns=column_names)
     return PyDataFrame(data_series)


 def iterable_to_pydf(
     data: Iterable[Any],
-    columns: SchemaDefinition | None = None,
+    schema: SchemaDefinition | None = None,
     schema_overrides: SchemaDict | None = None,
     orient: Orientation | None = None,
     chunk_size: int | None = None,
     infer_schema_length: int | None = N_INFER_DEFAULT,
 ) -> PyDataFrame:
     """Construct a PyDataFrame from an iterable/generator."""
-    original_columns = columns
+    original_schema, column_names = schema, None
     dtypes_by_idx: dict[int, PolarsDataType] = {}
-    if columns is not None:
-        columns, schema_overrides = _unpack_columns(
-            columns, schema_overrides=schema_overrides
+    if schema is not None:
+        column_names, schema_overrides = _unpack_schema(
+            schema, schema_overrides=schema_overrides
         )
     elif schema_overrides:
-        _columns, schema_overrides = _unpack_columns(
-            columns, schema_overrides=schema_overrides
+        _column_names, schema_overrides = _unpack_schema(
+            schema, schema_overrides=schema_overrides
         )

     if not isinstance(data, Generator):
         data = iter(data)

     if orient == "col":
-        if columns and schema_overrides:
+        if column_names and schema_overrides:
             dtypes_by_idx = {
                 idx: schema_overrides.get(col, Unknown)
-                for idx, col in enumerate(columns)
+                for idx, col in enumerate(column_names)
             }

         return pli.DataFrame(
             {
-                (f"column_{idx}" if columns is None else columns[idx]): pli.Series(
-                    coldata, dtype=dtypes_by_idx.get(idx)
-                )
+                (
+                    f"column_{idx}" if column_names is None else column_names[idx]
+                ): pli.Series(coldata, dtype=dtypes_by_idx.get(idx))
                 for idx, coldata in enumerate(data)
             }
         )._df

     def to_frame_chunk(
-        values: list[Any], columns: SchemaDefinition | None
+        values: list[Any], schema: SchemaDefinition | None
     ) -> pli.DataFrame:
         return pli.DataFrame(
             data=values,
-            columns=columns,
+            schema=schema,
             orient="row",
             infer_schema_length=infer_schema_length,
         )
@@ -1129,8 +1143,8 @@ def to_frame_chunk(

     if chunk_size:
         adaptive_chunk_size = chunk_size
-    elif columns:
-        adaptive_chunk_size = n_chunk_elems // len(columns)
+    elif column_names:
+        adaptive_chunk_size = 
n_chunk_elems // len(column_names) else: adaptive_chunk_size = None @@ -1139,11 +1153,11 @@ def to_frame_chunk( values = list(islice(data, adaptive_chunk_size or 1000)) if not values: break - frame_chunk = to_frame_chunk(values, original_columns) + frame_chunk = to_frame_chunk(values, original_schema) if df is None: df = frame_chunk - if not original_columns: - original_columns = list(df.schema.items()) + if not original_schema: + original_schema = list(df.schema.items()) if not adaptive_chunk_size: adaptive_chunk_size = n_chunk_elems // len(df.columns) else: @@ -1151,14 +1165,14 @@ def to_frame_chunk( n_chunks += 1 if df is None: - df = to_frame_chunk([], original_columns) + df = to_frame_chunk([], original_schema) return (df.rechunk() if n_chunks > 0 else df)._df def pandas_to_pydf( data: pd.DataFrame, - columns: SchemaDefinition | None = None, + schema: SchemaDefinition | None = None, schema_overrides: SchemaDict | None = None, rechunk: bool = True, nan_to_none: bool = True, @@ -1173,7 +1187,7 @@ def pandas_to_pydf( } arrow_table = pa.table(arrow_dict) return arrow_to_pydf( - arrow_table, columns=columns, schema_overrides=schema_overrides, rechunk=rechunk + arrow_table, schema=schema, schema_overrides=schema_overrides, rechunk=rechunk ) diff --git a/py-polars/polars/internals/dataframe/frame.py b/py-polars/polars/internals/dataframe/frame.py index 45d88965fba4..a1c4c6b48f2c 100644 --- a/py-polars/polars/internals/dataframe/frame.py +++ b/py-polars/polars/internals/dataframe/frame.py @@ -76,6 +76,7 @@ _prepare_row_count_args, _process_null_values, _timedelta_to_pl_duration, + deprecated_alias, handle_projection_columns, is_bool_sequence, is_int_sequence, @@ -152,9 +153,16 @@ class DataFrame: data : dict, Sequence, ndarray, Series, or pandas.DataFrame Two-dimensional data in various forms. dict must contain Sequences. Sequence may contain Series or other Sequences. - columns : Sequence of str, (str,DataType) pairs, or {str:DataType,} dict - Column labels (with optional type) to use for resulting DataFrame. If specified, - overrides any labels already present in the data. Must match data dimensions. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. orient : {'col', 'row'}, default None Whether to interpret two-dimensional data as columns or as rows. If None, the orientation is inferred by matching the columns and data dimensions. If @@ -164,7 +172,7 @@ class DataFrame: data is a sequence or generator of rows; other input is read as-is. schema_overrides : dict, default None Support type specification or override of one or more columns; note that - any dtypes inferred from the columns param will be overridden. + any dtypes inferred from the schema param will be overridden. 
Examples -------- @@ -188,14 +196,11 @@ class DataFrame: >>> df.dtypes [Int64, Int64] - In order to specify dtypes for your columns, initialize the DataFrame with a list - of typed Series: + To specify the frame schema you supply the `schema` parameter with a dictionary + of (name,dtype) pairs... - >>> data = [ - ... pl.Series("col1", [1, 2], dtype=pl.Float32), - ... pl.Series("col2", [3, 4], dtype=pl.Int64), - ... ] - >>> df2 = pl.DataFrame(data) + >>> data = {"col1": [0, 2], "col2": [3, 7]} + >>> df2 = pl.DataFrame(data, schema={"col1": pl.Float32, "col2": pl.Int64}) >>> df2 shape: (2, 2) ┌──────┬──────┐ @@ -203,15 +208,14 @@ class DataFrame: │ --- ┆ --- │ │ f32 ┆ i64 │ ╞══════╪══════╡ - │ 1.0 ┆ 3 │ - │ 2.0 ┆ 4 │ + │ 0.0 ┆ 3 │ + │ 2.0 ┆ 7 │ └──────┴──────┘ - Or set the `columns` parameter with a list of (name,dtype) pairs (compatible with - all of the other valid data parameter types): + ...a sequence of (name,dtype) pairs... >>> data = {"col1": [1, 2], "col2": [3, 4]} - >>> df3 = pl.DataFrame(data, columns=[("col1", pl.Float32), ("col2", pl.Int64)]) + >>> df3 = pl.DataFrame(data, schema=[("col1", pl.Float32), ("col2", pl.Int64)]) >>> df3 shape: (2, 2) ┌──────┬──────┐ @@ -223,11 +227,13 @@ class DataFrame: │ 2.0 ┆ 4 │ └──────┴──────┘ - The `columns` parameter could also be set with a `dict` containing the schema of the - expected DataFrame + ...or a list of typed Series. - >>> data = {"col1": [0, 2], "col2": [3, 7]} - >>> df4 = pl.DataFrame(data, columns={"col1": pl.Float32, "col2": pl.Int64}) + >>> data = [ + ... pl.Series("col1", [1, 2], dtype=pl.Float32), + ... pl.Series("col2", [3, 4], dtype=pl.Int64), + ... ] + >>> df4 = pl.DataFrame(data) >>> df4 shape: (2, 2) ┌──────┬──────┐ @@ -235,15 +241,15 @@ class DataFrame: │ --- ┆ --- │ │ f32 ┆ i64 │ ╞══════╪══════╡ - │ 0.0 ┆ 3 │ - │ 2.0 ┆ 7 │ + │ 1.0 ┆ 3 │ + │ 2.0 ┆ 4 │ └──────┴──────┘ Constructing a DataFrame from a numpy ndarray, specifying column names: >>> import numpy as np >>> data = np.array([(1, 2), (3, 4)], dtype=np.int64) - >>> df5 = pl.DataFrame(data, columns=["a", "b"], orient="col") + >>> df5 = pl.DataFrame(data, schema=["a", "b"], orient="col") >>> df5 shape: (2, 2) ┌─────┬─────┐ @@ -258,7 +264,7 @@ class DataFrame: Constructing a DataFrame from a list of lists, row orientation inferred: >>> data = [[1, 2, 3], [4, 5, 6]] - >>> df6 = pl.DataFrame(data, columns=["a", "b", "c"]) + >>> df6 = pl.DataFrame(data, schema=["a", "b", "c"]) >>> df6 shape: (2, 3) ┌─────┬─────┬─────┐ @@ -286,6 +292,7 @@ class DataFrame: _accessors: set[str] = set() + @deprecated_alias(columns="schema") def __init__( self, data: ( @@ -297,7 +304,7 @@ def __init__( | pli.Series | None ) = None, - columns: SchemaDefinition | None = None, + schema: SchemaDefinition | None = None, orient: Orientation | None = None, *, infer_schema_length: int | None = N_INFER_DEFAULT, @@ -305,46 +312,46 @@ def __init__( ): if data is None: self._df = dict_to_pydf( - {}, columns=columns, schema_overrides=schema_overrides + {}, schema=schema, schema_overrides=schema_overrides ) elif isinstance(data, dict): self._df = dict_to_pydf( - data, columns=columns, schema_overrides=schema_overrides + data, schema=schema, schema_overrides=schema_overrides ) elif isinstance(data, (list, tuple, Sequence)): self._df = sequence_to_pydf( data, - columns=columns, + schema=schema, schema_overrides=schema_overrides, orient=orient, infer_schema_length=infer_schema_length, ) elif isinstance(data, pli.Series): self._df = series_to_pydf( - data, columns=columns, schema_overrides=schema_overrides + 
data, schema=schema, schema_overrides=schema_overrides ) elif _check_for_numpy(data) and isinstance(data, np.ndarray): self._df = numpy_to_pydf( - data, columns=columns, schema_overrides=schema_overrides, orient=orient + data, schema=schema, schema_overrides=schema_overrides, orient=orient ) elif _check_for_pyarrow(data) and isinstance(data, pa.Table): self._df = arrow_to_pydf( - data, columns=columns, schema_overrides=schema_overrides + data, schema=schema, schema_overrides=schema_overrides ) elif _check_for_pandas(data) and isinstance(data, pd.DataFrame): self._df = pandas_to_pydf( - data, columns=columns, schema_overrides=schema_overrides + data, schema=schema, schema_overrides=schema_overrides ) elif not isinstance(data, Sized) and isinstance(data, (Generator, Iterable)): self._df = iterable_to_pydf( data, - columns=columns, + schema=schema, schema_overrides=schema_overrides, orient=orient, infer_schema_length=infer_schema_length, @@ -389,8 +396,15 @@ def _from_dict( Two-dimensional data represented as a dictionary. dict must contain Sequences. schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict - Column labels to use for resulting DataFrame. If specified, overrides any - labels already present in the data. Must match data dimensions. + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. schema_overrides : dict, default None Support type specification or override of one or more columns; note that any dtypes inferred from the columns param will be overridden. @@ -401,14 +415,15 @@ def _from_dict( """ return cls._from_pydf( - dict_to_pydf(data, columns=schema, schema_overrides=schema_overrides) + dict_to_pydf(data, schema=schema, schema_overrides=schema_overrides) ) @classmethod + @deprecated_alias(columns="schema") def _from_records( cls: type[DF], data: Sequence[Sequence[Any]], - columns: Sequence[str] | None = None, + schema: SchemaDefinition | None = None, schema_overrides: SchemaDict | None = None, orient: Orientation | None = None, infer_schema_length: int | None = N_INFER_DEFAULT, @@ -420,9 +435,16 @@ def _from_records( ---------- data : Sequence of sequences Two-dimensional data represented as a sequence of sequences. - columns : Sequence of str, default None - Column labels to use for resulting DataFrame. Must match data dimensions. - If not specified, columns will be named `column_0`, `column_1`, etc. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. 
schema_overrides : dict, default None Support type specification or override of one or more columns; note that any dtypes inferred from the columns param will be overridden. @@ -441,7 +463,7 @@ def _from_records( return cls._from_pydf( sequence_to_pydf( data, - columns=columns, + schema=schema, schema_overrides=schema_overrides, orient=orient, infer_schema_length=infer_schema_length, @@ -449,10 +471,11 @@ def _from_records( ) @classmethod + @deprecated_alias(columns="schema") def _from_numpy( cls: type[DF], data: np.ndarray[Any, Any], - columns: Sequence[str] | None = None, + schema: SchemaDefinition | None = None, schema_overrides: SchemaDict | None = None, orient: Orientation | None = None, ) -> DF: @@ -463,9 +486,16 @@ def _from_numpy( ---------- data : numpy ndarray Two-dimensional data represented as a numpy ndarray. - columns : Sequence of str, default None - Column labels to use for resulting DataFrame. Must match data dimensions. - If not specified, columns will be named `column_0`, `column_1`, etc. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. schema_overrides : dict, default None Support type specification or override of one or more columns; note that any dtypes inferred from the columns param will be overridden. @@ -481,15 +511,16 @@ def _from_numpy( """ return cls._from_pydf( numpy_to_pydf( - data, columns=columns, schema_overrides=schema_overrides, orient=orient + data, schema=schema, schema_overrides=schema_overrides, orient=orient ) ) @classmethod + @deprecated_alias(columns="schema") def _from_arrow( cls: type[DF], data: pa.Table, - columns: Sequence[str] | None = None, + schema: SchemaDefinition | None = None, schema_overrides: SchemaDict | None = None, rechunk: bool = True, ) -> DF: @@ -503,10 +534,16 @@ def _from_arrow( ---------- data : arrow table, array, or sequence of sequences Data representing an Arrow Table or Array. - columns : Sequence of str, default None - Column labels to use for resulting DataFrame. Must match data dimensions. - If not specified, existing Array table columns are used, with missing names - named as `column_0`, `column_1`, etc. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. schema_overrides : dict, default None Support type specification or override of one or more columns; note that any dtypes inferred from the columns param will be overridden. 
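Reviewer note: the three schema declaration forms that these docstrings repeat are interchangeable. A minimal sketch of that equivalence against the renamed keyword (illustrative usage of the new API, not part of the patch itself):

```python
import polars as pl

data = {"a": [1, 2], "b": [3.0, 4.0]}

# As a dict of {name: type} pairs (a None type would be auto-inferred).
df1 = pl.DataFrame(data, schema={"a": pl.Int8, "b": pl.Float32})

# As a list of column names only: dtypes are inferred from the data.
df2 = pl.DataFrame(data, schema=["a", "b"])

# As a list of (name, type) pairs: equivalent to the dict form.
df3 = pl.DataFrame(data, schema=[("a", pl.Int8), ("b", pl.Float32)])

assert df1.schema == df3.schema == {"a": pl.Int8, "b": pl.Float32}
assert df2.schema == {"a": pl.Int64, "b": pl.Float64}  # inferred dtypes
```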
@@ -521,17 +558,18 @@ def _from_arrow( return cls._from_pydf( arrow_to_pydf( data, - columns=columns, + schema=schema, schema_overrides=schema_overrides, rechunk=rechunk, ) ) @classmethod + @deprecated_alias(columns="schema") def _from_pandas( cls: type[DF], data: pd.DataFrame, - columns: Sequence[str] | None = None, + schema: SchemaDefinition | None = None, schema_overrides: SchemaDict | None = None, rechunk: bool = True, nan_to_none: bool = True, @@ -543,9 +581,16 @@ def _from_pandas( ---------- data : pandas DataFrame Two-dimensional data represented as a pandas DataFrame. - columns : Sequence of str, default None - Column labels to use for resulting DataFrame. If specified, overrides any - labels already present in the data. Must match data dimensions. + schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict + The DataFrame schema may be declared in several ways: + + * As a dict of {name:type} pairs; if type is None, it will be auto-inferred. + * As a list of column names; in this case types are automatically inferred. + * As a list of (name,type) pairs; this is equivalent to the dictionary form. + + If you supply a list of column names that does not match the names in the + underlying data, the names given here will overwrite them. The number + of names given in the schema should match the underlying data dimensions. schema_overrides : dict, default None Support type specification or override of one or more columns; note that any dtypes inferred from the columns param will be overridden. @@ -562,7 +607,7 @@ def _from_pandas( return cls._from_pydf( pandas_to_pydf( data, - columns=columns, + schema=schema, schema_overrides=schema_overrides, rechunk=rechunk, nan_to_none=nan_to_none, @@ -2385,7 +2430,7 @@ def estimated_size(self, unit: SizeUnit = "b") -> int | float: ... "y": [v / 1000 for v in range(1_000_000)], ... "z": [str(v) for v in range(1_000_000)], ... }, - ... columns=[("x", pl.UInt32), ("y", pl.Float64), ("z", pl.Utf8)], + ... schema=[("x", pl.UInt32), ("y", pl.Float64), ("z", pl.Utf8)], ... ) >>> df.estimated_size() 25888898 @@ -6253,7 +6298,7 @@ def n_unique( If instead you want to count the number of unique values per-column, you can also use expression-level syntax to return a new frame containing that result: - >>> df = pl.DataFrame([[1, 2, 3], [1, 2, 4]], columns=["a", "b", "c"]) + >>> df = pl.DataFrame([[1, 2, 3], [1, 2, 4]], schema=["a", "b", "c"]) >>> df_nunique = df.select(pl.all().n_unique()) In aggregate context there is also an equivalent method for returning the @@ -7003,7 +7048,7 @@ def pearson_corr(self, **kwargs: dict[str, Any]) -> DataFrame: """ return DataFrame( np.corrcoef(self, **kwargs), - columns=self.columns, + schema=self.columns, ) def merge_sorted(self: DF, other: DataFrame, key: str) -> DF: diff --git a/py-polars/polars/internals/lazy_functions.py b/py-polars/polars/internals/lazy_functions.py index 9e67eb205932..3816cdf9b48a 100644 --- a/py-polars/polars/internals/lazy_functions.py +++ b/py-polars/polars/internals/lazy_functions.py @@ -2543,7 +2543,7 @@ def coalesce( ... (None, None, 3.0), ... (None, None, None), ... ], - ... columns=[("a", pl.Float64), ("b", pl.Float64), ("c", pl.Float64)], + ... schema=[("a", pl.Float64), ("b", pl.Float64), ("c", pl.Float64)], ... 
)
    >>> df.with_column(pl.coalesce(["a", "b", "c", 99.9]).alias("d"))
    shape: (4, 4)
diff --git a/py-polars/polars/internals/lazyframe/frame.py b/py-polars/polars/internals/lazyframe/frame.py
index 9048de8936f6..58aa4906d740 100644
--- a/py-polars/polars/internals/lazyframe/frame.py
+++ b/py-polars/polars/internals/lazyframe/frame.py
@@ -1485,7 +1485,7 @@ def cleared(self) -> LazyFrame:
        └─────┴─────┴──────┘

        """
-        return pli.DataFrame(columns=self.schema).lazy()
+        return pli.DataFrame(schema=self.schema).lazy()

    def clone(self: LDF) -> LDF:
        """
diff --git a/py-polars/polars/internals/lazyframe/groupby.py b/py-polars/polars/internals/lazyframe/groupby.py
index 34b0bba3521d..97c13ef0d2a5 100644
--- a/py-polars/polars/internals/lazyframe/groupby.py
+++ b/py-polars/polars/internals/lazyframe/groupby.py
@@ -176,10 +176,9 @@ def apply(
        f
            Function to apply over each group of the `LazyFrame`.
        schema
-            Schema of the output function. This has to be known statically.
-            If the schema provided is incorrect, this is a bug in the callers
-            query and may lead to errors.
-            If set to None, polars assumes the schema is unchanged.
+            Schema of the output of the function; this has to be known statically.
+            If the given schema is incorrect, this is a bug in the caller's query
+            and may lead to errors. If set to None, polars assumes the schema is unchanged.

        Examples
diff --git a/py-polars/polars/internals/slice.py b/py-polars/polars/internals/slice.py
index 559ba69d3fdf..a327f1df29ae 100644
--- a/py-polars/polars/internals/slice.py
+++ b/py-polars/polars/internals/slice.py
@@ -32,7 +32,7 @@ def _as_original(lazy: "pli.LazyFrame", original: FrameOrSeries) -> FrameOrSerie
    @staticmethod
    def _lazify(obj: FrameOrSeries) -> "pli.LazyFrame":
        """Make lazy to ensure efficient/consistent handling."""
-        return obj.lazy() if isinstance(obj, pli.DataFrame) else obj.to_frame().lazy()
+        return obj.to_frame().lazy() if isinstance(obj, pli.Series) else obj.lazy()

    def _slice_positive(self, obj: "pli.LazyFrame") -> "pli.LazyFrame":
        """Logic for slices with positive stride."""
diff --git a/py-polars/polars/testing/_parametric.py b/py-polars/polars/testing/_parametric.py
index cea0a56643ab..8143f973a2ea 100644
--- a/py-polars/polars/testing/_parametric.py
+++ b/py-polars/polars/testing/_parametric.py
@@ -251,8 +251,8 @@ def columns(
    >>> from string import punctuation
    >>>
    >>> def test_special_char_colname_init() -> None:
-    ...     cols = [(c.name, c.dtype) for c in columns(punctuation)]
-    ...     df = pl.DataFrame(columns=cols)
-    ...     assert len(cols) == len(df.columns)
+    ...     schema = [(c.name, c.dtype) for c in columns(punctuation)]
+    ...     df = pl.DataFrame(schema=schema)
+    ...     assert len(schema) == len(df.columns)
    ...     assert 0 == len(df.rows())
    ... 
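Reviewer note: the old `columns=` keyword keeps working across these call sites via `@deprecated_alias(columns="schema")`, whose implementation is imported but not shown in this diff. A minimal sketch of what such a keyword-alias decorator needs to do (the warning text and structure here are assumptions, not the actual `polars.utils` code):

```python
import warnings
from functools import wraps
from typing import Any, Callable, TypeVar

T = TypeVar("T")


def deprecated_alias(**aliases: str) -> Callable[[Callable[..., T]], Callable[..., T]]:
    """Remap deprecated keyword arguments to their new names (sketch)."""

    def decorate(fn: Callable[..., T]) -> Callable[..., T]:
        @wraps(fn)
        def wrapper(*args: Any, **kwargs: Any) -> T:
            for old_name, new_name in aliases.items():
                if old_name in kwargs:
                    # warn, then forward the value under the new keyword
                    warnings.warn(
                        f"`{old_name}` is deprecated; use `{new_name}` instead.",
                        category=DeprecationWarning,
                        stacklevel=2,
                    )
                    kwargs[new_name] = kwargs.pop(old_name)
            return fn(*args, **kwargs)

        return wrapper

    return decorate
```

With this in place, `pl.DataFrame(data, columns=[...])` still runs but emits a `DeprecationWarning` and forwards the value to the new `schema` parameter.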
@@ -617,7 +617,7 @@ def draw_frames(draw: DrawFn) -> pli.DataFrame | pli.LazyFrame: ) for c in coldefs }, - columns=frame_columns, # type: ignore[arg-type] + schema=frame_columns, # type: ignore[arg-type] ) # optionally generate frames with n_chunks > 1 if series_size > 1 and chunked is True: diff --git a/py-polars/tests/db-benchmark/various.py b/py-polars/tests/db-benchmark/various.py index 8f449faa258a..3f4af1992351 100644 --- a/py-polars/tests/db-benchmark/various.py +++ b/py-polars/tests/db-benchmark/various.py @@ -15,7 +15,7 @@ # test mean overflow issues np.random.seed(1) mean = 769.5607652 -df = pl.DataFrame(np.random.randint(500, 1040, 5000000), columns=["value"]) +df = pl.DataFrame(np.random.randint(500, 1040, 5000000), schema=["value"]) assert np.isclose(df.with_column(pl.mean("value"))[0, 0], mean) assert np.isclose( df.with_column(pl.col("value").cast(pl.Int32)).with_column(pl.mean("value"))[0, 0], diff --git a/py-polars/tests/parametric/test_dataframe.py b/py-polars/tests/parametric/test_dataframe.py index e661ed3ce375..6053058642c7 100644 --- a/py-polars/tests/parametric/test_dataframe.py +++ b/py-polars/tests/parametric/test_dataframe.py @@ -23,7 +23,7 @@ def test_repr(df: pl.DataFrame) -> None: min_size=1, min_cols=1, null_probability=0.25, excluded_dtypes=[pl.Utf8] ) ) -@example(df=pl.DataFrame(columns=["x", "y", "z"])) +@example(df=pl.DataFrame(schema=["x", "y", "z"])) @example(df=pl.DataFrame()) def test_null_count(df: pl.DataFrame) -> None: # note: the zero-row and zero-col cases are always passed as explicit examples diff --git a/py-polars/tests/unit/io/test_csv.py b/py-polars/tests/unit/io/test_csv.py index 26165b6ef11a..ac7aaea618c4 100644 --- a/py-polars/tests/unit/io/test_csv.py +++ b/py-polars/tests/unit/io/test_csv.py @@ -887,7 +887,7 @@ def test_datetime_format_inferred_precision( "x": [datetime(2022, 9, 4, 10, 30, 45, 123000)], "y": [datetime(2022, 9, 4, 10, 30, 45, 123000)], }, - columns=[ + schema=[ ("x", pl.Datetime(tu1)), ("y", pl.Datetime(tu2)), ], diff --git a/py-polars/tests/unit/test_api.py b/py-polars/tests/unit/test_api.py index 70282be8f897..7b9e79b5971c 100644 --- a/py-polars/tests/unit/test_api.py +++ b/py-polars/tests/unit/test_api.py @@ -26,7 +26,7 @@ def by_first_letter_of_column_values(self, col: str) -> list[pl.DataFrame]: df = pl.DataFrame( data=[["xx", 2, 3, 4], ["xy", 4, 5, 6], ["yy", 5, 6, 7], ["yz", 6, 7, 8]], - columns=["a1", "a2", "b1", "b2"], + schema=["a1", "a2", "b1", "b2"], orient="row", ) @@ -58,7 +58,7 @@ def previous(self, p: int) -> pl.Expr: def nearest(self, p: int) -> pl.Expr: return (p ** (self._expr.log(p)).round(0).cast(pl.Int64)).cast(pl.Int64) - df = pl.DataFrame([1.4, 24.3, 55.0, 64.001], columns=["n"]) + df = pl.DataFrame([1.4, 24.3, 55.0, 64.001], schema=["n"]) assert df.select( [ pl.col("n"), @@ -88,18 +88,18 @@ def by_column_dtypes(self) -> list[pl.LazyFrame]: ldf = pl.DataFrame( data=[["xx", 2, 3, 4], ["xy", 4, 5, 6], ["yy", 5, 6, 7], ["yz", 6, 7, 8]], - columns=["a1", "a2", "b1", "b2"], + schema=["a1", "a2", "b1", "b2"], orient="row", ).lazy() df1, df2 = (d.collect() for d in ldf.split.by_column_dtypes()) assert_frame_equal( - df1, pl.DataFrame([("xx",), ("xy",), ("yy",), ("yz",)], columns=["a1"]) + df1, pl.DataFrame([("xx",), ("xy",), ("yy",), ("yz",)], schema=["a1"]) ) assert_frame_equal( df2, pl.DataFrame( - [(2, 3, 4), (4, 5, 6), (5, 6, 7), (6, 7, 8)], columns=["a2", "b1", "b2"] + [(2, 3, 4), (4, 5, 6), (5, 6, 7), (6, 7, 8)], schema=["a2", "b1", "b2"] ), ) diff --git a/py-polars/tests/unit/test_binary.py 
b/py-polars/tests/unit/test_binary.py index 1dad982fe01d..a5330c93e514 100644 --- a/py-polars/tests/unit/test_binary.py +++ b/py-polars/tests/unit/test_binary.py @@ -22,7 +22,7 @@ def test_contains() -> None: (2, b"(with) special\n * chars"), (3, b"**etc...?$"), ], - columns=["idx", "bin"], + schema=["idx", "bin"], ) for pattern, expected in ( (b"e * ", [True, False, False]), diff --git a/py-polars/tests/unit/test_categorical.py b/py-polars/tests/unit/test_categorical.py index c5378f1ca5aa..9a452874a55d 100644 --- a/py-polars/tests/unit/test_categorical.py +++ b/py-polars/tests/unit/test_categorical.py @@ -128,7 +128,7 @@ def test_cat_to_dummies() -> None: def test_comp_categorical_lit_dtype() -> None: df = pl.DataFrame( data={"column": ["a", "b", "e"], "values": [1, 5, 9]}, - columns=[("column", pl.Categorical), ("more", pl.Int32)], + schema=[("column", pl.Categorical), ("more", pl.Int32)], ) assert df.with_column( diff --git a/py-polars/tests/unit/test_constructors.py b/py-polars/tests/unit/test_constructors.py index a785ab3861e0..8a872b04655f 100644 --- a/py-polars/tests/unit/test_constructors.py +++ b/py-polars/tests/unit/test_constructors.py @@ -21,8 +21,8 @@ def test_init_dict() -> None: assert df.schema == {"a": pl.Float32, "b": pl.Float32} for df in ( - pl.DataFrame({}, columns={"a": pl.Date, "b": pl.Utf8}), - pl.DataFrame({"a": [], "b": []}, columns={"a": pl.Date, "b": pl.Utf8}), + pl.DataFrame({}, schema={"a": pl.Date, "b": pl.Utf8}), + pl.DataFrame({"a": [], "b": []}, schema={"a": pl.Date, "b": pl.Utf8}), ): assert df.shape == (0, 2) assert df.schema == {"a": pl.Date, "b": pl.Utf8} @@ -41,7 +41,7 @@ def test_init_dict() -> None: df = pl.DataFrame( data={"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]}, - columns=[("a", pl.Int8), ("b", pl.Float32)], + schema=[("a", pl.Int8), ("b", pl.Float32)], ) assert df.schema == {"a": pl.Int8, "b": pl.Float32} @@ -68,23 +68,23 @@ def test_init_dict() -> None: ): df = pl.DataFrame( data={"dt": dates, "dtm": datetimes}, - columns=coldefs, + schema=coldefs, ) assert df.schema == {"dt": pl.Date, "dtm": pl.Datetime} assert df.rows() == list(zip(py_dates, py_datetimes)) # Overriding dict column names/types - df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, columns=["c", "d"]) + df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, schema=["c", "d"]) assert df.columns == ["c", "d"] df = pl.DataFrame( {"a": [1, 2, 3], "b": [4, 5, 6]}, - columns=["c", ("d", pl.Int8)], + schema=["c", ("d", pl.Int8)], ) # partial type info (allowed, but mypy doesn't like it ;p) assert df.schema == {"c": pl.Int64, "d": pl.Int8} df = pl.DataFrame( - {"a": [1, 2, 3], "b": [4, 5, 6]}, columns=[("c", pl.Int8), ("d", pl.Int16)] + {"a": [1, 2, 3], "b": [4, 5, 6]}, schema=[("c", pl.Int8), ("d", pl.Int16)] ) assert df.schema == {"c": pl.Int8, "d": pl.Int16} @@ -99,11 +99,11 @@ def test_init_ndarray(monkeypatch: Any) -> None: assert df.frame_equal(pl.DataFrame()) # 1D array - df = pl.DataFrame(np.array([1, 2, 3]), columns=["a"]) + df = pl.DataFrame(np.array([1, 2, 3]), schema=["a"]) truth = pl.DataFrame({"a": [1, 2, 3]}) assert df.frame_equal(truth) - df = pl.DataFrame(np.array([1, 2, 3]), columns=[("a", pl.Int32)]) + df = pl.DataFrame(np.array([1, 2, 3]), schema=[("a", pl.Int32)]) truth = pl.DataFrame({"a": [1, 2, 3]}).with_column(pl.col("a").cast(pl.Int32)) assert df.frame_equal(truth) @@ -120,7 +120,7 @@ def test_init_ndarray(monkeypatch: Any) -> None: df = pl.DataFrame( data=[[1, 2.0, "a"], [None, None, None]], - columns=[("x", pl.Boolean), ("y", pl.Int32), "z"], + schema=[("x", 
pl.Boolean), ("y", pl.Int32), "z"], orient="row", ) assert df.rows() == [(True, 2, "a"), (None, None, None)] @@ -136,22 +136,22 @@ def test_init_ndarray(monkeypatch: Any) -> None: assert df.shape == (3, 1) # 2D array - row orientation inferred - df = pl.DataFrame(np.array([[1, 2, 3], [4, 5, 6]]), columns=["a", "b", "c"]) + df = pl.DataFrame(np.array([[1, 2, 3], [4, 5, 6]]), schema=["a", "b", "c"]) truth = pl.DataFrame({"a": [1, 4], "b": [2, 5], "c": [3, 6]}) assert df.frame_equal(truth) # 2D array - column orientation inferred - df = pl.DataFrame(np.array([[1, 2, 3], [4, 5, 6]]), columns=["a", "b"]) + df = pl.DataFrame(np.array([[1, 2, 3], [4, 5, 6]]), schema=["a", "b"]) truth = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) assert df.frame_equal(truth) # 2D array - orientation conflicts with columns with pytest.raises(ValueError): - pl.DataFrame(np.array([[1, 2, 3], [4, 5, 6]]), columns=["a", "b"], orient="row") + pl.DataFrame(np.array([[1, 2, 3], [4, 5, 6]]), schema=["a", "b"], orient="row") with pytest.raises(ValueError): pl.DataFrame( np.array([[1, 2, 3], [4, 5, 6]]), - columns=[("a", pl.UInt32), ("b", pl.UInt32)], + schema=[("a", pl.UInt32), ("b", pl.UInt32)], orient="row", ) @@ -168,16 +168,16 @@ def test_init_ndarray(monkeypatch: Any) -> None: # Dimensions mismatch with pytest.raises(ValueError): - _ = pl.DataFrame(np.array([1, 2, 3]), columns=[]) + _ = pl.DataFrame(np.array([1, 2, 3]), schema=[]) with pytest.raises(ValueError): - _ = pl.DataFrame(np.array([[1, 2], [3, 4]]), columns=["a"]) + _ = pl.DataFrame(np.array([[1, 2], [3, 4]]), schema=["a"]) # NumPy not available monkeypatch.setattr( pl.internals.dataframe.frame, "_check_for_numpy", lambda x: False ) with pytest.raises(ValueError): - pl.DataFrame(np.array([1, 2, 3]), columns=["a"]) + pl.DataFrame(np.array([1, 2, 3]), schema=["a"]) # 2D numpy arrays df = pl.DataFrame({"a": np.arange(5, dtype=np.int64).reshape(1, -1)}) @@ -196,22 +196,20 @@ def test_init_arrow() -> None: assert df.frame_equal(truth) # Rename columns - df = pl.DataFrame(pa.table({"a": [1, 2], "b": [3, 4]}), columns=["c", "d"]) + df = pl.DataFrame(pa.table({"a": [1, 2], "b": [3, 4]}), schema=["c", "d"]) truth = pl.DataFrame({"c": [1, 2], "d": [3, 4]}) assert df.frame_equal(truth) df = pl.DataFrame( pa.table({"a": [1, 2], None: [3, 4]}), - columns=[("c", pl.Int32), ("d", pl.Float32)], + schema=[("c", pl.Int32), ("d", pl.Float32)], ) assert df.schema == {"c": pl.Int32, "d": pl.Float32} assert df.rows() == [(1, 3.0), (2, 4.0)] # Bad columns argument with pytest.raises(ValueError): - pl.DataFrame( - pa.table({"a": [1, 2, 3], "b": [4, 5, 6]}), columns=["c", "d", "e"] - ) + pl.DataFrame(pa.table({"a": [1, 2, 3], "b": [4, 5, 6]}), schema=["c", "d", "e"]) def test_init_series() -> None: @@ -226,7 +224,7 @@ def test_init_series() -> None: df = pl.DataFrame( (pl.Series("a", (1, 2, 3)), pl.Series("b", (4, 5, 6))), - columns=[("x", pl.Float64), ("y", pl.Float64)], + schema=[("x", pl.Float64), ("y", pl.Float64)], ) assert df.schema == {"x": pl.Float64, "y": pl.Float64} assert df.rows() == [(1.0, 4.0), (2.0, 5.0), (3.0, 6.0)] @@ -244,7 +242,7 @@ def test_init_series() -> None: df = pl.DataFrame( [pl.Series([None]), pl.Series([1.0])], - columns=[("x", pl.Date), ("y", pl.Boolean)], + schema=[("x", pl.Date), ("y", pl.Boolean)], ) assert df.schema == {"x": pl.Date, "y": pl.Boolean} assert df.rows() == [(None, True)] @@ -255,7 +253,7 @@ def test_init_series() -> None: assert df.schema == {"a": pl.Int64} assert df.frame_equal(truth) - df = pl.DataFrame(pl.Series("a", [1, 2, 3]), 
columns=[("a", pl.UInt32)]) + df = pl.DataFrame(pl.Series("a", [1, 2, 3]), schema=[("a", pl.UInt32)]) assert df.rows() == [(1,), (2,), (3,)] assert df.schema == {"a": pl.UInt32} @@ -265,13 +263,13 @@ def test_init_series() -> None: def test_init_seq_of_seq() -> None: # List of lists - df = pl.DataFrame([[1, 2, 3], [4, 5, 6]], columns=["a", "b", "c"]) + df = pl.DataFrame([[1, 2, 3], [4, 5, 6]], schema=["a", "b", "c"]) truth = pl.DataFrame({"a": [1, 4], "b": [2, 5], "c": [3, 6]}) assert df.frame_equal(truth) df = pl.DataFrame( [[1, 2, 3], [4, 5, 6]], - columns=[("a", pl.Int8), ("b", pl.Int16), ("c", pl.Int32)], + schema=[("a", pl.Int8), ("b", pl.Int16), ("c", pl.Int32)], ) assert df.schema == {"a": pl.Int8, "b": pl.Int16, "c": pl.Int32} assert df.rows() == [(1, 2, 3), (4, 5, 6)] @@ -282,12 +280,12 @@ def test_init_seq_of_seq() -> None: assert df.frame_equal(truth) # Row orientation - df = pl.DataFrame(((1, 2), (3, 4)), columns=("a", "b"), orient="row") + df = pl.DataFrame(((1, 2), (3, 4)), schema=("a", "b"), orient="row") truth = pl.DataFrame({"a": [1, 3], "b": [2, 4]}) assert df.frame_equal(truth) df = pl.DataFrame( - ((1, 2), (3, 4)), columns=(("a", pl.Float32), ("b", pl.Float32)), orient="row" + ((1, 2), (3, 4)), schema=(("a", pl.Float32), ("b", pl.Float32)), orient="row" ) assert df.schema == {"a": pl.Float32, "b": pl.Float32} assert df.rows() == [(1.0, 2.0), (3.0, 4.0)] @@ -303,16 +301,16 @@ def test_init_1d_sequence() -> None: assert df.frame_equal(pl.DataFrame()) # List of strings - df = pl.DataFrame(["a", "b", "c"], columns=["hi"]) + df = pl.DataFrame(["a", "b", "c"], schema=["hi"]) truth = pl.DataFrame({"hi": ["a", "b", "c"]}) assert df.frame_equal(truth) - df = pl.DataFrame([None, True, False], columns=[("xx", pl.Int8)]) + df = pl.DataFrame([None, True, False], schema=[("xx", pl.Int8)]) assert df.schema == {"xx": pl.Int8} assert df.rows() == [(None,), (1,), (0,)] # String sequence - assert pl.DataFrame("abc", columns=["s"]).to_dict(False) == {"s": ["a", "b", "c"]} + assert pl.DataFrame("abc", schema=["s"]).to_dict(False) == {"s": ["a", "b", "c"]} def test_init_pandas(monkeypatch: Any) -> None: @@ -325,7 +323,7 @@ def test_init_pandas(monkeypatch: Any) -> None: assert df.schema == {"1": pl.Int64, "2": pl.Int64} # override column names, types - df = pl.DataFrame(pandas_df, columns=[("x", pl.Float64), ("y", pl.Float64)]) + df = pl.DataFrame(pandas_df, schema=[("x", pl.Float64), ("y", pl.Float64)]) assert df.schema == {"x": pl.Float64, "y": pl.Float64} assert df.rows() == [(1.0, 2.0), (3.0, 4.0)] @@ -353,7 +351,7 @@ def _constructor(self) -> type: dtype=np.dtype(" None: # Columns don't match data dimensions with pytest.raises(pl.ShapeError): - pl.DataFrame([[1, 2], [3, 4]], columns=["a", "b", "c"]) + pl.DataFrame([[1, 2], [3, 4]], schema=["a", "b", "c"]) # Unmatched input with pytest.raises(ValueError): @@ -391,11 +389,11 @@ def test_init_records() -> None: assert df.frame_equal(expected) assert df.to_dicts() == dicts - df_cd = pl.DataFrame(dicts, columns=["c", "d"]) + df_cd = pl.DataFrame(dicts, schema=["c", "d"]) expected = pl.DataFrame({"c": [1, 2, 1], "d": [2, 1, 2]}) assert df_cd.frame_equal(expected) - df_xy = pl.DataFrame(dicts, columns=[("x", pl.UInt32), ("y", pl.UInt32)]) + df_xy = pl.DataFrame(dicts, schema=[("x", pl.UInt32), ("y", pl.UInt32)]) expected = pl.DataFrame({"x": [1, 2, 1], "y": [2, 1, 2]}).with_columns( [pl.col("x").cast(pl.UInt32), pl.col("y").cast(pl.UInt32)] ) @@ -405,7 +403,7 @@ def test_init_records() -> None: def test_init_only_columns() -> None: - df = 
pl.DataFrame(columns=["a", "b", "c"]) + df = pl.DataFrame(schema=["a", "b", "c"]) truth = pl.DataFrame({"a": [], "b": [], "c": []}) assert df.shape == (0, 3) assert df.frame_equal(truth, null_equal=True) @@ -416,7 +414,7 @@ def test_init_only_columns() -> None: for no_data in (None, {}, []): df = pl.DataFrame( data=no_data, - columns=[ + schema=[ ("a", pl.Date), ("b", pl.UInt64), ("c", pl.Int8), @@ -510,7 +508,7 @@ def test_from_rows_dtype() -> None: # 5182 df = pl.DataFrame( data=[(None, None)] * 50 + [("1.23", None)], - columns=[("foo", pl.Utf8), ("bar", pl.Utf8)], + schema=[("foo", pl.Utf8), ("bar", pl.Utf8)], orient="row", ) assert df.dtypes == [pl.Utf8, pl.Utf8] @@ -523,7 +521,7 @@ def test_from_rows_dtype() -> None: df = pl.DataFrame( data=type1 * 50 + type2, - columns=[("c1", pl.Int32), ("c2", pl.Object), ("c3", pl.Object)], + schema=[("c1", pl.Int32), ("c2", pl.Object), ("c3", pl.Object)], ) assert df.dtypes == [pl.Int32, pl.Object, pl.Object] @@ -536,7 +534,7 @@ def test_from_rows_dtype() -> None: df = pl.DataFrame( data=type1 * 50 + type2, - columns=[("c1", pl.Int32), ("c2", pl.Object), ("c3", pl.Object)], + schema=[("c1", pl.Int32), ("c2", pl.Object), ("c3", pl.Object)], ) assert df.dtypes == [pl.Int32, pl.Object, pl.Object] assert df.null_count().row(0) == (0, 0, 0) @@ -545,16 +543,29 @@ def test_from_rows_dtype() -> None: def test_from_dicts_schema() -> None: data = [{"a": 1, "b": 4}, {"a": 2, "b": 5}, {"a": 3, "b": 6}] - # let polars infer the dtypes - # but inform about a 3rd column - df = pl.from_dicts( - data, schema_overrides={"a": pl.Unknown, "b": pl.Unknown, "c": pl.Int32} - ) - assert df.dtypes == [pl.Int64, pl.Int64, pl.Int32] + # let polars infer the dtypes, but inform it about a 3rd column. + # can supply as schema, or as mixed schema/overrides. 
diff --git a/py-polars/tests/unit/test_empty.py b/py-polars/tests/unit/test_empty.py
index 2b87b95cfddf..c5e3cbafe5a8 100644
--- a/py-polars/tests/unit/test_empty.py
+++ b/py-polars/tests/unit/test_empty.py
@@ -2,7 +2,7 @@
 
 
 def test_empty_str_concat_lit() -> None:
-    df = pl.DataFrame({"a": [], "b": []}, columns=[("a", pl.Utf8), ("b", pl.Utf8)])
+    df = pl.DataFrame({"a": [], "b": []}, schema=[("a", pl.Utf8), ("b", pl.Utf8)])
     assert df.with_column(pl.lit("asd") + pl.col("a")).schema == {
         "a": pl.Utf8,
         "b": pl.Utf8,
diff --git a/py-polars/tests/unit/test_exprs.py b/py-polars/tests/unit/test_exprs.py
index 096f30eb9d81..52a4f59c03b9 100644
--- a/py-polars/tests/unit/test_exprs.py
+++ b/py-polars/tests/unit/test_exprs.py
@@ -40,7 +40,7 @@ def test_prefix(fruits_cars: pl.DataFrame) -> None:
 
 
 def test_cumcount() -> None:
-    df = pl.DataFrame([["a"], ["a"], ["a"], ["b"], ["b"], ["a"]], columns=["A"])
+    df = pl.DataFrame([["a"], ["a"], ["a"], ["b"], ["b"], ["a"]], schema=["A"])
 
     out = df.groupby("A", maintain_order=True).agg(
         [pl.col("A").cumcount(reverse=False).alias("foo")]
@@ -280,7 +280,7 @@ def test_dot_in_groupby() -> None:
 def test_dtype_col_selection() -> None:
     df = pl.DataFrame(
         data=[],
-        columns={
+        schema={
             "a1": pl.Datetime,
             "a2": pl.Datetime("ms"),
             "a3": pl.Datetime("ms"),
@@ -526,7 +526,7 @@ def test_ewm_with_multiple_chunks() -> None:
             ("y", 4.0, 3.0),
             ("z", 3.0, 4.0),
         ],
-        columns=["a", "b", "c"],
+        schema=["a", "b", "c"],
     ).with_columns(
         [
             pl.col(pl.Float64).log().diff().prefix("ld_"),
diff --git a/py-polars/tests/unit/test_functions.py b/py-polars/tests/unit/test_functions.py
index 93905b733428..4f6700ecb358 100644
--- a/py-polars/tests/unit/test_functions.py
+++ b/py-polars/tests/unit/test_functions.py
@@ -73,7 +73,7 @@ def test_all_any_horizontally() -> None:
             [False, None, True],
             [None, None, False],
         ],
-        columns=["var1", "var2", "var3"],
+        schema=["var1", "var2", "var3"],
     )
     expected = pl.DataFrame(
         {
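`test_dtype_col_selection` and `test_ewm_with_multiple_chunks` above both select columns by dtype rather than by name, which keeps the schemas under test honest. Approximately, under the same assumed API:

```python
import polars as pl

df = pl.DataFrame(
    data=[("x", 5.0, 2.0), ("y", 4.0, 3.0), ("z", 3.0, 4.0)],
    schema=["a", "b", "c"],
).with_columns(
    # pl.col also accepts a dtype: this hits every Float64 column at once
    [pl.col(pl.Float64).log().prefix("log_")]
)
assert df.columns == ["a", "b", "c", "log_b", "log_c"]
```
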
diff --git a/py-polars/tests/unit/test_interop.py b/py-polars/tests/unit/test_interop.py
index 4afbfddb5d47..d25c0f3186d3 100644
--- a/py-polars/tests/unit/test_interop.py
+++ b/py-polars/tests/unit/test_interop.py
@@ -296,7 +296,7 @@ def test_from_dicts_schema_override() -> None:
     for n_infer in (0, 3, 5, 8, 10, 100):
         df = pl.DataFrame(
             data=(data1 + data2),
-            columns=schema,  # type: ignore[arg-type]
+            schema=schema,  # type: ignore[arg-type]
             infer_schema_length=n_infer,
         )
         assert df.schema == schema
@@ -332,7 +332,7 @@ def test_from_dicts_struct() -> None:
 
 
 def test_from_records() -> None:
     data = [[1, 2, 3], [4, 5, 6]]
-    df = pl.from_records(data, columns=["a", "b"])
+    df = pl.from_records(data, schema=["a", "b"])
     assert df.shape == (3, 2)
     assert df.rows() == [(1, 4), (2, 5), (3, 6)]
 
@@ -341,7 +341,7 @@ def test_from_numpy() -> None:
     data = np.array([[1, 2, 3], [4, 5, 6]])
     df = pl.from_numpy(
         data,
-        columns=["a", "b"],
+        schema=["a", "b"],
         orient="col",
         schema_overrides={"a": pl.UInt32, "b": pl.UInt32},
     )
@@ -393,7 +393,7 @@ def test_from_optional_not_available() -> None:
 
     np = _LazyModule("numpy", module_available=False)
     with pytest.raises(ImportError, match=r"np\.array requires 'numpy'"):
-        pl.from_numpy(np.array([[1, 2], [3, 4]]), columns=["a", "b"])
+        pl.from_numpy(np.array([[1, 2], [3, 4]]), schema=["a", "b"])
 
     pa = _LazyModule("pyarrow", module_available=False)
     with pytest.raises(ImportError, match=r"pa\.table requires 'pyarrow'"):
@@ -467,7 +467,7 @@ def test_from_empty_pandas_with_dtypes() -> None:
 
     df = pl.DataFrame(
         data=[],
-        columns={
+        schema={
             "a": pl.Int32,
             "b": pl.Datetime,
             "c": pl.Float32,
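
For reference, the `from_numpy` path being renamed in `test_interop.py`, with explicit orientation and unsigned overrides (mirroring `test_from_numpy`):

```python
import numpy as np
import polars as pl

data = np.array([[1, 2, 3], [4, 5, 6]])
df = pl.from_numpy(
    data,
    schema=["a", "b"],
    orient="col",  # each inner array becomes one column
    schema_overrides={"a": pl.UInt32, "b": pl.UInt32},
)
assert df.schema == {"a": pl.UInt32, "b": pl.UInt32}
assert df.shape == (3, 2)
```
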
diff --git a/py-polars/tests/unit/test_joins.py b/py-polars/tests/unit/test_joins.py
index 88c46655f001..c61538ff3f6a 100644
--- a/py-polars/tests/unit/test_joins.py
+++ b/py-polars/tests/unit/test_joins.py
@@ -64,7 +64,7 @@ def test_semi_anti_join() -> None:
 def test_join_same_cat_src() -> None:
     df = pl.DataFrame(
         data={"column": ["a", "a", "b"], "more": [1, 2, 3]},
-        columns=[("column", pl.Categorical), ("more", pl.Int32)],
+        schema=[("column", pl.Categorical), ("more", pl.Int32)],
     )
     df_agg = df.groupby("column").agg(pl.col("more").mean())
     assert df.join(df_agg, on="column").to_dict(False) == {
diff --git a/py-polars/tests/unit/test_lazy.py b/py-polars/tests/unit/test_lazy.py
index 99457df536e0..854507d7d0b0 100644
--- a/py-polars/tests/unit/test_lazy.py
+++ b/py-polars/tests/unit/test_lazy.py
@@ -1543,10 +1543,8 @@ def test_lazy_cache_hit(capfd: Any) -> None:
 
 
 def test_quadratic_behavior_4736() -> None:
-    # we don't assert anything.
-    # If this function does not stall
-    # our tests it has passed.
-    df = pl.DataFrame(columns=list(ascii_letters))
+    # no assert; if this function does not stall our tests it has passed!
+    df = pl.DataFrame(schema=list(ascii_letters))
     df.lazy().select(reduce(add, (pl.col(fld) for fld in df.columns)))
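
`test_join_same_cat_src` above encodes a useful rule: Categorical columns built from the same frame join cleanly without a global string cache. Sketched, under the same assumed API:

```python
import polars as pl

df = pl.DataFrame(
    data={"column": ["a", "a", "b"], "more": [1, 2, 3]},
    schema=[("column", pl.Categorical), ("more", pl.Int32)],
)
# both sides of the join derive from the same frame, so their categorical
# encodings line up and no StringCache context manager is required
df_agg = df.groupby("column").agg(pl.col("more").mean())
assert df.join(df_agg, on="column").height == 3
```
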
diff --git a/py-polars/tests/unit/test_lists.py b/py-polars/tests/unit/test_lists.py
index 91b19b05ab91..b1da5a4544a5 100644
--- a/py-polars/tests/unit/test_lists.py
+++ b/py-polars/tests/unit/test_lists.py
@@ -85,7 +85,7 @@ def test_dtype() -> None:
             "dt": [[date(2022, 12, 31)]],
             "dtm": [[datetime(2022, 12, 31, 1, 2, 3)]],
         },
-        columns=[
+        schema=[
             ("i", pl.List(pl.Int8)),
             ("tm", pl.List(pl.Time)),
             ("dt", pl.List(pl.Date)),
@@ -355,7 +355,7 @@ def test_empty_list_construction() -> None:
         False
     ) == {"array": [[]], "not_array": [1234]}
 
-    df = pl.DataFrame(columns=[("col", pl.List)])
+    df = pl.DataFrame(schema=[("col", pl.List)])
     assert df.schema == {"col": pl.List}
     assert df.rows() == []
 
@@ -636,7 +636,7 @@ def test_fast_explode_on_list_struct_6208() -> None:
 
     df = pl.DataFrame(
         data,
-        columns={
+        schema={
             "label": pl.Utf8,
             "tag": pl.Utf8,
             "ref": pl.Int64,
diff --git a/py-polars/tests/unit/test_object.py b/py-polars/tests/unit/test_object.py
index eec4ee378563..1b2e6c74d275 100644
--- a/py-polars/tests/unit/test_object.py
+++ b/py-polars/tests/unit/test_object.py
@@ -24,7 +24,7 @@ def test_object_empty_filter_5911() -> None:
         data=[
             (1, "dog", {}),
         ],
-        columns=[
+        schema=[
             ("pet_id", pl.Int64),
             ("pet_type", pl.Categorical),
             ("pet_obj", pl.Object),
@@ -57,7 +57,7 @@ def test_empty_sort() -> None:
             ({"name": "bar", "sort_key": 2},),
             ({"name": "foo", "sort_key": 1},),
         ],
-        columns=[
+        schema=[
             ("blob", pl.Object),
         ],
         orient="row",
diff --git a/py-polars/tests/unit/test_pivot.py b/py-polars/tests/unit/test_pivot.py
index 1b1a9e208a25..df13bcd2170f 100644
--- a/py-polars/tests/unit/test_pivot.py
+++ b/py-polars/tests/unit/test_pivot.py
@@ -27,7 +27,7 @@ def test_pivot() -> None:
             ("B", None, None, 2, 4, None),
             ("C", None, None, None, None, 2),
         ],
-        columns=["foo", "k", "l", "m", "n", "o"],
+        schema=["foo", "k", "l", "m", "n", "o"],
     )
     assert_frame_equal(result, expected)
 
@@ -122,7 +122,7 @@ def test_pivot_categorical_3968() -> None:
 def test_pivot_categorical_index() -> None:
     df = pl.DataFrame(
         {"A": ["Fire", "Water", "Water", "Fire"], "B": ["Car", "Car", "Car", "Ship"]},
-        columns=[("A", pl.Categorical), ("B", pl.Categorical)],
+        schema=[("A", pl.Categorical), ("B", pl.Categorical)],
     )
 
     result = df.pivot(values="B", index=["A"], columns="B", aggregate_fn="count")
@@ -139,7 +139,7 @@ def test_pivot_categorical_index() -> None:
             "B": ["Car", "Car", "Car", "Ship"],
             "C": ["Paper", "Paper", "Paper", "Paper"],
         },
-        columns=[("A", pl.Categorical), ("B", pl.Categorical), ("C", pl.Categorical)],
+        schema=[("A", pl.Categorical), ("B", pl.Categorical), ("C", pl.Categorical)],
     )
     result = df.pivot(values="B", index=["A", "C"], columns="B", aggregate_fn="count")
     expected = {
diff --git a/py-polars/tests/unit/test_queries.py b/py-polars/tests/unit/test_queries.py
index c82c4017c1c2..437bd74d8c95 100644
--- a/py-polars/tests/unit/test_queries.py
+++ b/py-polars/tests/unit/test_queries.py
@@ -177,7 +177,7 @@ def test_groupby_agg_equals_zero_3535() -> None:
             ("cc", -99, 10.5),
             ("cc", None, 0.0),
         ],
-        columns=[
+        schema=[
             ("key", pl.Utf8),
             ("val1", pl.Int16),
             ("val2", pl.Float32),
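
Note that `DataFrame.pivot` keeps its own `columns=` argument (the column being pivoted) even as the constructor's is renamed; mirroring `test_pivot_categorical_index` above:

```python
import polars as pl

df = pl.DataFrame(
    {"A": ["Fire", "Water", "Water", "Fire"], "B": ["Car", "Car", "Car", "Ship"]},
    schema=[("A", pl.Categorical), ("B", pl.Categorical)],
)
# the constructor's schema= and pivot's columns= are unrelated parameters
result = df.pivot(values="B", index=["A"], columns="B", aggregate_fn="count")
assert set(result.columns) == {"A", "Car", "Ship"}
```
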
diff --git a/py-polars/tests/unit/test_schema.py b/py-polars/tests/unit/test_schema.py
index 078bd95b9826..456c9a6a5d3f 100644
--- a/py-polars/tests/unit/test_schema.py
+++ b/py-polars/tests/unit/test_schema.py
@@ -283,7 +283,7 @@ def test_schema_owned_arithmetic_5669() -> None:
 
 def test_fill_null_f32_with_lit() -> None:
     # ensure the literal integer does not upcast the f32 to an f64
-    df = pl.DataFrame({"a": [1.1, 1.2]}, columns=[("a", pl.Float32)])
+    df = pl.DataFrame({"a": [1.1, 1.2]}, schema=[("a", pl.Float32)])
     assert df.fill_null(value=0).dtypes == [pl.Float32]
diff --git a/py-polars/tests/unit/test_series.py b/py-polars/tests/unit/test_series.py
index 222bfd5b734c..ed6868d9b0fa 100644
--- a/py-polars/tests/unit/test_series.py
+++ b/py-polars/tests/unit/test_series.py
@@ -136,7 +136,7 @@ def test_init_inputs(monkeypatch: Any) -> None:
     # numpy not available
     monkeypatch.setattr(pl.internals.series.series, "_check_for_numpy", lambda x: False)
     with pytest.raises(ValueError):
-        pl.DataFrame(np.array([1, 2, 3]), columns=["a"])
+        pl.DataFrame(np.array([1, 2, 3]), schema=["a"])
 
 
 def test_init_dataclass_namedtuple() -> None:
diff --git a/py-polars/tests/unit/test_strings.py b/py-polars/tests/unit/test_strings.py
index 533e9de5f094..a5daae2892cc 100644
--- a/py-polars/tests/unit/test_strings.py
+++ b/py-polars/tests/unit/test_strings.py
@@ -27,7 +27,7 @@ def test_auto_explode() -> None:
 def test_contains() -> None:
     df = pl.DataFrame(
         data=[(1, "some * * text"), (2, "(with) special\n * chars"), (3, "**etc...?$")],
-        columns=["idx", "text"],
+        schema=["idx", "text"],
     )
     for pattern, as_literal, expected in (
         (r"\* \*", False, [True, False, False]),
@@ -67,7 +67,7 @@ def test_null_comparisons() -> None:
 def test_replace() -> None:
     df = pl.DataFrame(
         data=[(1, "* * text"), (2, "(with) special\n * chars **etc...?$")],
-        columns=["idx", "text"],
+        schema=["idx", "text"],
         orient="row",
     )
     for pattern, replacement, as_literal, expected in (
@@ -97,7 +97,7 @@ def test_replace() -> None:
 def test_replace_all() -> None:
     df = pl.DataFrame(
         data=[(1, "* * text"), (2, "(with) special * chars **etc...?$")],
-        columns=["idx", "text"],
+        schema=["idx", "text"],
         orient="row",
    )
     for pattern, replacement, as_literal, expected in (
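
The string tests above parametrise over literal vs. regex matching. The sketch below assumes the `literal` keyword of `Expr.str.contains` from this era of the API (the flag itself is not visible in the hunks, only the tests' `as_literal` parametrisation):

```python
import polars as pl

df = pl.DataFrame(
    data=[(1, "some * * text"), (2, "(with) special\n * chars"), (3, "**etc...?$")],
    schema=["idx", "text"],
    orient="row",
)
# literal=True sidesteps having to regex-escape "*" and friends
out = df.select(pl.col("text").str.contains("* *", literal=True).alias("hit"))
assert out["hit"].to_list() == [True, False, False]
```
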
diff --git a/py-polars/tests/unit/test_struct.py b/py-polars/tests/unit/test_struct.py
index d53c9f278dbc..bee76478ad72 100644
--- a/py-polars/tests/unit/test_struct.py
+++ b/py-polars/tests/unit/test_struct.py
@@ -773,7 +773,7 @@ class TestData:
     for frame_data in (dict_data, dataclass_data):
         df = pl.DataFrame(
             data=frame_data,
-            columns=frame_schema,  # type: ignore[arg-type]
+            schema=frame_schema,  # type: ignore[arg-type]
         )
         assert df.schema == frame_schema
         assert df.unnest("y").columns == ["x", "a", "b", "c"]
diff --git a/py-polars/tests/unit/test_testing.py b/py-polars/tests/unit/test_testing.py
index 9dcacb99adf8..cce128ff880c 100644
--- a/py-polars/tests/unit/test_testing.py
+++ b/py-polars/tests/unit/test_testing.py
@@ -142,13 +142,13 @@ def test_compare_frame_equal_nans() -> None:
 
     df1 = pl.DataFrame(
         data={"x": [1.0, nan], "y": [nan, 2.0]},
-        columns=[("x", pl.Float32), ("y", pl.Float64)],
+        schema=[("x", pl.Float32), ("y", pl.Float64)],
     )
     assert_frame_equal(df1, df1, check_exact=True)
 
     df2 = pl.DataFrame(
         data={"x": [1.0, nan], "y": [None, 2.0]},
-        columns=[("x", pl.Float32), ("y", pl.Float64)],
+        schema=[("x", pl.Float32), ("y", pl.Float64)],
     )
     with pytest.raises(
         AssertionError, match="DataFrames are different\n\nExact value mismatch"