Commit 8876323: Merge branch 'master' into numpy_arithmetic

ritchie46 committed Feb 18, 2023
2 parents 40006b9 + b7178f1
Showing 21 changed files with 483 additions and 162 deletions.

polars/polars-arrow/Cargo.toml (1 addition & 1 deletion)

@@ -10,7 +10,7 @@ description = "Arrow interfaces for Polars DataFrame library"
 
 [dependencies]
 arrow.workspace = true
-chrono = { version = "0.4", optional = true }
+chrono = { version = "0.4", default-features = false, features = ["std"], optional = true }
 chrono-tz = { version = "0.8", optional = true }
 hashbrown.workspace = true
 num.workspace = true

polars/polars-core/Cargo.toml (1 addition & 1 deletion)

@@ -155,7 +155,7 @@ ahash.workspace = true
 anyhow.workspace = true
 arrow.workspace = true
 bitflags.workspace = true
-chrono = { version = "0.4", optional = true }
+chrono = { version = "0.4", default-features = false, features = ["std"], optional = true }
 chrono-tz = { version = "0.8", optional = true }
 comfy-table = { version = "6.1.4", optional = true, default_features = false }
 hashbrown.workspace = true

polars/polars-core/src/chunked_array/cast.rs (22 additions & 1 deletion)

@@ -122,7 +122,28 @@ where
     }
 
     fn cast_unchecked(&self, data_type: &DataType) -> PolarsResult<Series> {
-        self.cast_impl(data_type, false)
+        match data_type {
+            #[cfg(feature = "dtype-categorical")]
+            DataType::Categorical(Some(rev_map)) => {
+                if self.dtype() == &DataType::UInt32 {
+                    // safety:
+                    // we are guarded by the type system.
+                    let ca = unsafe { &*(self as *const ChunkedArray<T> as *const UInt32Chunked) };
+                    Ok(unsafe {
+                        CategoricalChunked::from_cats_and_rev_map_unchecked(
+                            ca.clone(),
+                            rev_map.clone(),
+                        )
+                    }
+                    .into_series())
+                } else {
+                    Err(PolarsError::ComputeError(
+                        "Cannot cast numeric types to 'Categorical'".into(),
+                    ))
+                }
+            }
+            _ => self.cast_impl(data_type, false),
+        }
     }
 }

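The user-visible contract this new guard encodes: strings can be cast to Categorical, numeric data cannot. A minimal py-polars sketch of the expected behavior (assumed from the error message above; requires a build with the dtype-categorical feature):

>>> import polars as pl
>>> pl.Series(["a", "b", "a"]).cast(pl.Categorical)  # strings cast fine
>>> pl.Series([1, 2, 3]).cast(pl.Categorical)
# raises ComputeError: Cannot cast numeric types to 'Categorical'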

polars/polars-io/Cargo.toml (1 addition & 1 deletion)

@@ -55,7 +55,7 @@ anyhow.workspace = true
 arrow.workspace = true
 async-trait = { version = "0.1.59", optional = true }
 bytes = "1.3.0"
-chrono = { version = "0.4.23", optional = true }
+chrono = { version = "0.4", default-features = false, features = ["std"], optional = true }
 chrono-tz = { version = "0.8.1", optional = true }
 dirs = "4.0"
 flate2 = { version = "1", optional = true, default-features = false }

(file name not shown in this capture)

@@ -33,7 +33,6 @@ impl FunctionExpr {
             Ok(fld)
         };
 
-        #[cfg(any(feature = "rolling_window", feature = "trigonometry", feature = "log"))]
         // set float supertype
         let float_dtype = || {
             map_dtype(&|dtype| match dtype {
@@ -108,7 +107,7 @@ impl FunctionExpr {
         use FunctionExpr::*;
         match self {
             NullCount => with_dtype(IDX_DTYPE),
-            Pow => super_type(),
+            Pow => float_dtype(),
             Coalesce => super_type(),
             #[cfg(feature = "row_hash")]
             Hash(..) => with_dtype(DataType::UInt64),
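The practical effect of swapping `super_type()` for `float_dtype()` on `Pow` is that the planned schema now resolves to a float dtype even for integer inputs, keeping the planner in sync with the kernel. A quick check against this build might look like (a hedged sketch; exact output dtype could vary with input types):

>>> import polars as pl
>>> df = pl.DataFrame({"a": [1, 2, 3]})
>>> df.select(pl.col("a") ** 2).dtypes
[Float64]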

polars/polars-ops/src/series/ops/is_unique.rs (2 additions & 1 deletion)

@@ -56,7 +56,8 @@ fn dispatcher(s: &Series, invert: bool) -> PolarsResult<BooleanChunked> {
#[cfg(feature = "dtype-binary")]
Utf8 => {
let s = s.cast(&Binary).unwrap();
return is_unique(&s);
let ca = s.binary().unwrap();
is_unique_ca(ca, invert)
}
Float32 => {
let ca = s.bit_repr_small();
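Note that the old arm re-entered `is_unique` and dropped `invert`, while the new one threads it through to `is_unique_ca`. The user-facing semantics on strings stay the familiar ones:

>>> import polars as pl
>>> pl.Series(["a", "b", "a"]).is_unique().to_list()
[False, True, False]
>>> pl.Series(["a", "b", "a"]).is_duplicated().to_list()  # the inverted variant
[True, False, True]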

polars/polars-time/Cargo.toml (1 addition & 1 deletion)

@@ -9,7 +9,7 @@ description = "Time related code for the polars dataframe library"
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
 [dependencies]
-chrono = "0.4"
+chrono = { version = "0.4", default-features = false, features = ["std"] }
 chrono-tz = { version = "0.8", optional = true }
 lexical = { version = "6", default-features = false, features = ["std", "parse-floats", "parse-integers"] }
 now = "0.1"

py-polars/docs/source/reference/series/descriptive.rst (2 additions)

@@ -28,8 +28,10 @@ Descriptive
     Series.is_unique
     Series.is_utf8
     Series.len
+    Series.lower_bound
     Series.n_chunks
     Series.n_unique
     Series.null_count
     Series.unique_counts
+    Series.upper_bound
     Series.value_counts
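Assuming the new Series.lower_bound/Series.upper_bound mirror their Expr counterparts, they return the smallest and largest value representable by the Series' dtype:

>>> import polars as pl
>>> s = pl.Series("a", [1, 2, 3], dtype=pl.Int8)
>>> s.lower_bound().to_list()
[-128]
>>> s.upper_bound().to_list()
[127]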

py-polars/polars/internals/dataframe/frame.py (2 additions & 2 deletions)

@@ -3596,8 +3596,8 @@ def groupby(
         Parameters
         ----------
         by
-            Column or columns to group by. Accepts expression input. Strings are parsed
-            as column names.
+            Column(s) to group by. Accepts expression input. Strings are parsed as
+            column names.
         *more_by
             Additional columns to group by, specified as positional arguments.
         maintain_order
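As the reworded docstring says, `by` accepts either strings (parsed as column names) or expressions. A small illustration:

>>> import polars as pl
>>> df = pl.DataFrame({"a": ["x", "x", "y"], "b": [1, 2, 3]})
>>> df.groupby("a", maintain_order=True).agg(pl.col("b").sum())
shape: (2, 2)
┌─────┬─────┐
│ a   ┆ b   │
│ --- ┆ --- │
│ str ┆ i64 │
╞═════╪═════╡
│ x   ┆ 3   │
│ y   ┆ 3   │
└─────┴─────┘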

py-polars/polars/internals/expr/expr.py (87 additions & 51 deletions)

@@ -110,7 +110,9 @@ def expr_to_lit_or_expr(
     Expr
 
     """
-    if isinstance(expr, str) and not str_to_lit:
+    if isinstance(expr, Expr):
+        pass
+    elif isinstance(expr, str) and not str_to_lit:
         expr = pli.col(expr)
     elif (
         isinstance(expr, (int, float, str, pli.Series, datetime, date, time, timedelta))
@@ -123,7 +125,7 @@
         structify = False
     elif isinstance(expr, (pli.WhenThen, pli.WhenThenThen)):
         expr = expr.otherwise(None)  # implicitly add the null branch.
-    elif not isinstance(expr, Expr):
+    else:
         raise TypeError(
             f"did not expect value {expr} of type {type(expr)}, maybe disambiguate with"
             " pl.lit or pl.col"
@@ -2766,68 +2768,102 @@ def last(self) -> Self:
"""
return self._from_pyexpr(self._pyexpr.last())

-    def over(self, expr: str | Expr | list[Expr | str]) -> Self:
+    def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Self:
         """
-        Apply window function over a subgroup.
+        Compute expressions over the given groups.
 
-        This is similar to a groupby + aggregation + self join.
-        Or similar to `window functions in Postgres
-        <https://www.postgresql.org/docs/current/tutorial-window.html>`_.
+        This expression is similar to performing a groupby aggregation and joining the
+        result back into the original dataframe.
+
+        The outcome is similar to how `window functions
+        <https://www.postgresql.org/docs/current/tutorial-window.html>`_
+        work in PostgreSQL.
 
         Parameters
         ----------
         expr
-            Column(s) to group by.
+            Column(s) to group by. Accepts expression input. Strings are parsed as
+            column names.
+        *more_exprs
+            Additional columns to group by, specified as positional arguments.
 
         Examples
         --------
+        Pass the name of a column to compute the expression over that column.
+
-        >>> df = pl.DataFrame(
-        ...     {
-        ...         "groups": ["g1", "g1", "g2"],
-        ...         "values": [1, 2, 3],
-        ...     }
-        ... )
-        >>> df.with_columns(pl.col("values").max().over("groups").alias("max_by_group"))
-        shape: (3, 3)
-        ┌────────┬────────┬──────────────┐
-        │ groups ┆ values ┆ max_by_group │
-        │ ---    ┆ ---    ┆ ---          │
-        │ str    ┆ i64    ┆ i64          │
-        ╞════════╪════════╪══════════════╡
-        │ g1     ┆ 1      ┆ 2            │
-        │ g1     ┆ 2      ┆ 2            │
-        │ g2     ┆ 3      ┆ 3            │
-        └────────┴────────┴──────────────┘
         >>> df = pl.DataFrame(
         ...     {
-        ...         "groups": [1, 1, 2, 2, 1, 2, 3, 3, 1],
-        ...         "values": [1, 2, 3, 4, 5, 6, 7, 8, 8],
+        ...         "a": ["a", "a", "b", "b", "b"],
+        ...         "b": [1, 2, 3, 5, 3],
+        ...         "c": [5, 4, 3, 2, 1],
         ...     }
         ... )
-        >>> df.lazy().select(
-        ...     pl.col("groups").sum().over("groups"),
-        ... ).collect()
-        shape: (9, 1)
-        ┌────────┐
-        │ groups │
-        │ ---    │
-        │ i64    │
-        ╞════════╡
-        │ 4      │
-        │ 4      │
-        │ 6      │
-        │ 6      │
-        │ ...    │
-        │ 6      │
-        │ 6      │
-        │ 6      │
-        │ 4      │
-        └────────┘
-        """
-        pyexprs = selection_to_pyexpr_list(expr)
-
-        return self._from_pyexpr(self._pyexpr.over(pyexprs))
+        >>> df.with_columns(pl.col("c").max().over("a").suffix("_max"))
+        shape: (5, 4)
+        ┌─────┬─────┬─────┬───────┐
+        │ a   ┆ b   ┆ c   ┆ c_max │
+        │ --- ┆ --- ┆ --- ┆ ---   │
+        │ str ┆ i64 ┆ i64 ┆ i64   │
+        ╞═════╪═════╪═════╪═══════╡
+        │ a   ┆ 1   ┆ 5   ┆ 5     │
+        │ a   ┆ 2   ┆ 4   ┆ 5     │
+        │ b   ┆ 3   ┆ 3   ┆ 3     │
+        │ b   ┆ 5   ┆ 2   ┆ 3     │
+        │ b   ┆ 3   ┆ 1   ┆ 3     │
+        └─────┴─────┴─────┴───────┘
+
+        Expression input is supported.
+
+        >>> df.with_columns(pl.col("c").max().over(pl.col("b") // 2).suffix("_max"))
+        shape: (5, 4)
+        ┌─────┬─────┬─────┬───────┐
+        │ a   ┆ b   ┆ c   ┆ c_max │
+        │ --- ┆ --- ┆ --- ┆ ---   │
+        │ str ┆ i64 ┆ i64 ┆ i64   │
+        ╞═════╪═════╪═════╪═══════╡
+        │ a   ┆ 1   ┆ 5   ┆ 5     │
+        │ a   ┆ 2   ┆ 4   ┆ 4     │
+        │ b   ┆ 3   ┆ 3   ┆ 4     │
+        │ b   ┆ 5   ┆ 2   ┆ 2     │
+        │ b   ┆ 3   ┆ 1   ┆ 4     │
+        └─────┴─────┴─────┴───────┘
+
+        Group by multiple columns by passing a list of column names or expressions.
+
+        >>> df.with_columns(pl.col("c").min().over(["a", "b"]).suffix("_min"))
+        shape: (5, 4)
+        ┌─────┬─────┬─────┬───────┐
+        │ a   ┆ b   ┆ c   ┆ c_min │
+        │ --- ┆ --- ┆ --- ┆ ---   │
+        │ str ┆ i64 ┆ i64 ┆ i64   │
+        ╞═════╪═════╪═════╪═══════╡
+        │ a   ┆ 1   ┆ 5   ┆ 5     │
+        │ a   ┆ 2   ┆ 4   ┆ 4     │
+        │ b   ┆ 3   ┆ 3   ┆ 1     │
+        │ b   ┆ 5   ┆ 2   ┆ 2     │
+        │ b   ┆ 3   ┆ 1   ┆ 1     │
+        └─────┴─────┴─────┴───────┘
+
+        Or use positional arguments to group by multiple columns in the same way.
+
+        >>> df.with_columns(pl.col("c").min().over("a", pl.col("b") % 2).suffix("_min"))
+        shape: (5, 4)
+        ┌─────┬─────┬─────┬───────┐
+        │ a   ┆ b   ┆ c   ┆ c_min │
+        │ --- ┆ --- ┆ --- ┆ ---   │
+        │ str ┆ i64 ┆ i64 ┆ i64   │
+        ╞═════╪═════╪═════╪═══════╡
+        │ a   ┆ 1   ┆ 5   ┆ 5     │
+        │ a   ┆ 2   ┆ 4   ┆ 4     │
+        │ b   ┆ 3   ┆ 3   ┆ 1     │
+        │ b   ┆ 5   ┆ 2   ┆ 1     │
+        │ b   ┆ 3   ┆ 1   ┆ 1     │
+        └─────┴─────┴─────┴───────┘
+        """
+        exprs = selection_to_pyexpr_list(expr)
+        exprs.extend(selection_to_pyexpr_list(more_exprs))
+        return self._from_pyexpr(self._pyexpr.over(exprs))

     def is_unique(self) -> Self:
         """

py-polars/polars/internals/io.py (10 additions & 4 deletions)
@@ -101,6 +101,10 @@ def managed_file(file: Any) -> Iterator[Any]:
             )
     encoding_str = encoding if encoding else "utf8"
 
+    # PyArrow allows directories, so we only check that something is not
+    # a dir if we are not using PyArrow
+    check_not_dir = not use_pyarrow
+
     if isinstance(file, bytes):
         if has_non_utf8_non_utf8_lossy_encoding:
             return _check_empty(
@@ -138,7 +142,7 @@ def managed_file(file: Any) -> Iterator[Any]:
                     BytesIO(file.read_bytes().decode(encoding_str).encode("utf8")),
                     context=f"Path ({file!r})",
                 )
-        return managed_file(normalise_filepath(file))
+        return managed_file(normalise_filepath(file, check_not_dir))
 
     if isinstance(file, str):
         # make sure that this is before fsspec
@@ -151,7 +155,7 @@
 
         if not has_non_utf8_non_utf8_lossy_encoding:
             if infer_storage_options(file)["protocol"] == "file":
-                return managed_file(normalise_filepath(file))
+                return managed_file(normalise_filepath(file, check_not_dir))
         kwargs["encoding"] = encoding
         return fsspec.open(file, **kwargs)

@@ -161,12 +165,14 @@
 
         if not has_non_utf8_non_utf8_lossy_encoding:
             if all(infer_storage_options(f)["protocol"] == "file" for f in file):
-                return managed_file([normalise_filepath(f) for f in file])
+                return managed_file(
+                    [normalise_filepath(f, check_not_dir) for f in file]
+                )
         kwargs["encoding"] = encoding
         return fsspec.open_files(file, **kwargs)
 
     if isinstance(file, str):
-        file = normalise_filepath(file)
+        file = normalise_filepath(file, check_not_dir)
         if has_non_utf8_non_utf8_lossy_encoding:
             with open(file, encoding=encoding_str) as f:
                 return _check_empty(
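`normalise_filepath` itself is not shown in this diff; its second argument is only inferred from the call sites above. A hedged sketch of what such a helper plausibly does (the parameter name here is hypothetical):

from __future__ import annotations

from pathlib import Path


def normalise_filepath(path: str | Path, check_not_directory: bool = True) -> str:
    # Expand the user's home directory and normalise to a string path.
    p = Path(path).expanduser()
    # PyArrow readers can consume dataset directories, which is why the
    # callers above pass `check_not_dir = not use_pyarrow`.
    if check_not_directory and p.is_dir():
        raise IsADirectoryError(f"expected a file path, got a directory: {p}")
    return str(p)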