Commit 8876323: Merge branch 'master' into numpy_arithmetic

ritchie46 committed Feb 18, 2023
2 parents 40006b9 + b7178f1
Showing 21 changed files with 483 additions and 162 deletions.

polars/polars-arrow/Cargo.toml (1 addition & 1 deletion)

@@ -10,7 +10,7 @@ description = "Arrow interfaces for Polars DataFrame library"
 
 [dependencies]
 arrow.workspace = true
-chrono = { version = "0.4", optional = true }
+chrono = { version = "0.4", default-features = false, features = ["std"], optional = true }
 chrono-tz = { version = "0.8", optional = true }
 hashbrown.workspace = true
 num.workspace = true

polars/polars-core/Cargo.toml (1 addition & 1 deletion)

@@ -155,7 +155,7 @@ ahash.workspace = true
 anyhow.workspace = true
 arrow.workspace = true
 bitflags.workspace = true
-chrono = { version = "0.4", optional = true }
+chrono = { version = "0.4", default-features = false, features = ["std"], optional = true }
 chrono-tz = { version = "0.8", optional = true }
 comfy-table = { version = "6.1.4", optional = true, default_features = false }
 hashbrown.workspace = true

polars/polars-core/src/chunked_array/cast.rs (22 additions & 1 deletion)

@@ -122,7 +122,28 @@ where
     }
 
     fn cast_unchecked(&self, data_type: &DataType) -> PolarsResult<Series> {
-        self.cast_impl(data_type, false)
+        match data_type {
+            #[cfg(feature = "dtype-categorical")]
+            DataType::Categorical(Some(rev_map)) => {
+                if self.dtype() == &DataType::UInt32 {
+                    // safety:
+                    // we are guarded by the type system.
+                    let ca = unsafe { &*(self as *const ChunkedArray<T> as *const UInt32Chunked) };
+                    Ok(unsafe {
+                        CategoricalChunked::from_cats_and_rev_map_unchecked(
+                            ca.clone(),
+                            rev_map.clone(),
+                        )
+                    }
+                    .into_series())
+                } else {
+                    Err(PolarsError::ComputeError(
+                        "Cannot cast numeric types to 'Categorical'".into(),
+                    ))
+                }
+            }
+            _ => self.cast_impl(data_type, false),
+        }
     }
 }

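The user-visible contract this new guard encodes: strings can be cast to Categorical, numeric data cannot. A minimal py-polars sketch of the expected behavior (assumed from the error message above; requires a build with the dtype-categorical feature):

>>> import polars as pl
>>> pl.Series(["a", "b", "a"]).cast(pl.Categorical)  # strings cast fine
>>> pl.Series([1, 2, 3]).cast(pl.Categorical)
# raises ComputeError: Cannot cast numeric types to 'Categorical'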

polars/polars-io/Cargo.toml (1 addition & 1 deletion)

@@ -55,7 +55,7 @@ anyhow.workspace = true
 arrow.workspace = true
 async-trait = { version = "0.1.59", optional = true }
 bytes = "1.3.0"
-chrono = { version = "0.4.23", optional = true }
+chrono = { version = "0.4", default-features = false, features = ["std"], optional = true }
 chrono-tz = { version = "0.8.1", optional = true }
 dirs = "4.0"
 flate2 = { version = "1", optional = true, default-features = false }

(file name not shown in this capture)

@@ -33,7 +33,6 @@ impl FunctionExpr {
             Ok(fld)
         };
 
-        #[cfg(any(feature = "rolling_window", feature = "trigonometry", feature = "log"))]
         // set float supertype
         let float_dtype = || {
             map_dtype(&|dtype| match dtype {
@@ -108,7 +107,7 @@ impl FunctionExpr {
         use FunctionExpr::*;
         match self {
             NullCount => with_dtype(IDX_DTYPE),
-            Pow => super_type(),
+            Pow => float_dtype(),
             Coalesce => super_type(),
             #[cfg(feature = "row_hash")]
             Hash(..) => with_dtype(DataType::UInt64),
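The practical effect of swapping `super_type()` for `float_dtype()` on `Pow` is that the planned schema now resolves to a float dtype even for integer inputs, keeping the planner in sync with the kernel. A quick check against this build might look like (a hedged sketch; exact output dtype could vary with input types):

>>> import polars as pl
>>> df = pl.DataFrame({"a": [1, 2, 3]})
>>> df.select(pl.col("a") ** 2).dtypes
[Float64]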

polars/polars-ops/src/series/ops/is_unique.rs (2 additions & 1 deletion)

@@ -56,7 +56,8 @@ fn dispatcher(s: &Series, invert: bool) -> PolarsResult<BooleanChunked> {
#[cfg(feature = "dtype-binary")]
Utf8 => {
let s = s.cast(&Binary).unwrap();
return is_unique(&s);
let ca = s.binary().unwrap();
is_unique_ca(ca, invert)
}
Float32 => {
let ca = s.bit_repr_small();
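Note that the old arm re-entered `is_unique` and dropped `invert`, while the new one threads it through to `is_unique_ca`. The user-facing semantics on strings stay the familiar ones:

>>> import polars as pl
>>> pl.Series(["a", "b", "a"]).is_unique().to_list()
[False, True, False]
>>> pl.Series(["a", "b", "a"]).is_duplicated().to_list()  # the inverted variant
[True, False, True]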

polars/polars-time/Cargo.toml (1 addition & 1 deletion)

@@ -9,7 +9,7 @@ description = "Time related code for the polars dataframe library"
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
 [dependencies]
-chrono = "0.4"
+chrono = { version = "0.4", default-features = false, features = ["std"] }
 chrono-tz = { version = "0.8", optional = true }
 lexical = { version = "6", default-features = false, features = ["std", "parse-floats", "parse-integers"] }
 now = "0.1"

py-polars/docs/source/reference/series/descriptive.rst (2 additions)

@@ -28,8 +28,10 @@ Descriptive
     Series.is_unique
     Series.is_utf8
     Series.len
+    Series.lower_bound
     Series.n_chunks
     Series.n_unique
     Series.null_count
     Series.unique_counts
+    Series.upper_bound
     Series.value_counts
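Assuming the new Series.lower_bound/Series.upper_bound mirror their Expr counterparts, they return the smallest and largest value representable by the Series' dtype:

>>> import polars as pl
>>> s = pl.Series("a", [1, 2, 3], dtype=pl.Int8)
>>> s.lower_bound().to_list()
[-128]
>>> s.upper_bound().to_list()
[127]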

py-polars/polars/internals/dataframe/frame.py (2 additions & 2 deletions)

@@ -3596,8 +3596,8 @@ def groupby(
         Parameters
         ----------
         by
-            Column or columns to group by. Accepts expression input. Strings are parsed
-            as column names.
+            Column(s) to group by. Accepts expression input. Strings are parsed as
+            column names.
         *more_by
             Additional columns to group by, specified as positional arguments.
         maintain_order
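As the reworded docstring says, `by` accepts either strings (parsed as column names) or expressions. A small illustration:

>>> import polars as pl
>>> df = pl.DataFrame({"a": ["x", "x", "y"], "b": [1, 2, 3]})
>>> df.groupby("a", maintain_order=True).agg(pl.col("b").sum())
shape: (2, 2)
┌─────┬─────┐
│ a   ┆ b   │
│ --- ┆ --- │
│ str ┆ i64 │
╞═════╪═════╡
│ x   ┆ 3   │
│ y   ┆ 3   │
└─────┴─────┘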

py-polars/polars/internals/expr/expr.py (87 additions & 51 deletions)

@@ -110,7 +110,9 @@ def expr_to_lit_or_expr(
     Expr
 
     """
-    if isinstance(expr, str) and not str_to_lit:
+    if isinstance(expr, Expr):
+        pass
+    elif isinstance(expr, str) and not str_to_lit:
         expr = pli.col(expr)
     elif (
         isinstance(expr, (int, float, str, pli.Series, datetime, date, time, timedelta))
@@ -123,7 +125,7 @@
         structify = False
     elif isinstance(expr, (pli.WhenThen, pli.WhenThenThen)):
         expr = expr.otherwise(None)  # implicitly add the null branch.
-    elif not isinstance(expr, Expr):
+    else:
         raise TypeError(
             f"did not expect value {expr} of type {type(expr)}, maybe disambiguate with"
             " pl.lit or pl.col"
@@ -2766,68 +2768,102 @@ def last(self) -> Self:
"""
return self._from_pyexpr(self._pyexpr.last())

-    def over(self, expr: str | Expr | list[Expr | str]) -> Self:
+    def over(self, expr: IntoExpr | Iterable[IntoExpr], *more_exprs: IntoExpr) -> Self:
         """
-        Apply window function over a subgroup.
+        Compute expressions over the given groups.
 
-        This is similar to a groupby + aggregation + self join.
-        Or similar to `window functions in Postgres
-        <https://www.postgresql.org/docs/current/tutorial-window.html>`_.
+        This expression is similar to performing a groupby aggregation and joining the
+        result back into the original dataframe.
+
+        The outcome is similar to how `window functions
+        <https://www.postgresql.org/docs/current/tutorial-window.html>`_
+        work in PostgreSQL.
 
         Parameters
         ----------
         expr
-            Column(s) to group by.
+            Column(s) to group by. Accepts expression input. Strings are parsed as
+            column names.
+        *more_exprs
+            Additional columns to group by, specified as positional arguments.
 
         Examples
         --------
+        Pass the name of a column to compute the expression over that column.
+
-        >>> df = pl.DataFrame(
-        ...     {
-        ...         "groups": ["g1", "g1", "g2"],
-        ...         "values": [1, 2, 3],
-        ...     }
-        ... )
-        >>> df.with_columns(pl.col("values").max().over("groups").alias("max_by_group"))
-        shape: (3, 3)
-        ┌────────┬────────┬──────────────┐
-        │ groups ┆ values ┆ max_by_group │
-        │ ---    ┆ ---    ┆ ---          │
-        │ str    ┆ i64    ┆ i64          │
-        ╞════════╪════════╪══════════════╡
-        │ g1     ┆ 1      ┆ 2            │
-        │ g1     ┆ 2      ┆ 2            │
-        │ g2     ┆ 3      ┆ 3            │
-        └────────┴────────┴──────────────┘
         >>> df = pl.DataFrame(
         ...     {
-        ...         "groups": [1, 1, 2, 2, 1, 2, 3, 3, 1],
-        ...         "values": [1, 2, 3, 4, 5, 6, 7, 8, 8],
+        ...         "a": ["a", "a", "b", "b", "b"],
+        ...         "b": [1, 2, 3, 5, 3],
+        ...         "c": [5, 4, 3, 2, 1],
         ...     }
         ... )
-        >>> df.lazy().select(
-        ...     pl.col("groups").sum().over("groups"),
-        ... ).collect()
-        shape: (9, 1)
-        ┌────────┐
-        │ groups │
-        │ ---    │
-        │ i64    │
-        ╞════════╡
-        │ 4      │
-        │ 4      │
-        │ 6      │
-        │ 6      │
-        │ ...    │
-        │ 6      │
-        │ 6      │
-        │ 6      │
-        │ 4      │
-        └────────┘
-        """
-        pyexprs = selection_to_pyexpr_list(expr)
-
-        return self._from_pyexpr(self._pyexpr.over(pyexprs))
+        >>> df.with_columns(pl.col("c").max().over("a").suffix("_max"))
+        shape: (5, 4)
+        ┌─────┬─────┬─────┬───────┐
+        │ a   ┆ b   ┆ c   ┆ c_max │
+        │ --- ┆ --- ┆ --- ┆ ---   │
+        │ str ┆ i64 ┆ i64 ┆ i64   │
+        ╞═════╪═════╪═════╪═══════╡
+        │ a   ┆ 1   ┆ 5   ┆ 5     │
+        │ a   ┆ 2   ┆ 4   ┆ 5     │
+        │ b   ┆ 3   ┆ 3   ┆ 3     │
+        │ b   ┆ 5   ┆ 2   ┆ 3     │
+        │ b   ┆ 3   ┆ 1   ┆ 3     │
+        └─────┴─────┴─────┴───────┘
+
+        Expression input is supported.
+
+        >>> df.with_columns(pl.col("c").max().over(pl.col("b") // 2).suffix("_max"))
+        shape: (5, 4)
+        ┌─────┬─────┬─────┬───────┐
+        │ a   ┆ b   ┆ c   ┆ c_max │
+        │ --- ┆ --- ┆ --- ┆ ---   │
+        │ str ┆ i64 ┆ i64 ┆ i64   │
+        ╞═════╪═════╪═════╪═══════╡
+        │ a   ┆ 1   ┆ 5   ┆ 5     │
+        │ a   ┆ 2   ┆ 4   ┆ 4     │
+        │ b   ┆ 3   ┆ 3   ┆ 4     │
+        │ b   ┆ 5   ┆ 2   ┆ 2     │
+        │ b   ┆ 3   ┆ 1   ┆ 4     │
+        └─────┴─────┴─────┴───────┘
+
+        Group by multiple columns by passing a list of column names or expressions.
+
+        >>> df.with_columns(pl.col("c").min().over(["a", "b"]).suffix("_min"))
+        shape: (5, 4)
+        ┌─────┬─────┬─────┬───────┐
+        │ a   ┆ b   ┆ c   ┆ c_min │
+        │ --- ┆ --- ┆ --- ┆ ---   │
+        │ str ┆ i64 ┆ i64 ┆ i64   │
+        ╞═════╪═════╪═════╪═══════╡
+        │ a   ┆ 1   ┆ 5   ┆ 5     │
+        │ a   ┆ 2   ┆ 4   ┆ 4     │
+        │ b   ┆ 3   ┆ 3   ┆ 1     │
+        │ b   ┆ 5   ┆ 2   ┆ 2     │
+        │ b   ┆ 3   ┆ 1   ┆ 1     │
+        └─────┴─────┴─────┴───────┘
+
+        Or use positional arguments to group by multiple columns in the same way.
+
+        >>> df.with_columns(pl.col("c").min().over("a", pl.col("b") % 2).suffix("_min"))
+        shape: (5, 4)
+        ┌─────┬─────┬─────┬───────┐
+        │ a   ┆ b   ┆ c   ┆ c_min │
+        │ --- ┆ --- ┆ --- ┆ ---   │
+        │ str ┆ i64 ┆ i64 ┆ i64   │
+        ╞═════╪═════╪═════╪═══════╡
+        │ a   ┆ 1   ┆ 5   ┆ 5     │
+        │ a   ┆ 2   ┆ 4   ┆ 4     │
+        │ b   ┆ 3   ┆ 3   ┆ 1     │
+        │ b   ┆ 5   ┆ 2   ┆ 1     │
+        │ b   ┆ 3   ┆ 1   ┆ 1     │
+        └─────┴─────┴─────┴───────┘
+        """
+        exprs = selection_to_pyexpr_list(expr)
+        exprs.extend(selection_to_pyexpr_list(more_exprs))
+        return self._from_pyexpr(self._pyexpr.over(exprs))

     def is_unique(self) -> Self:
         """

py-polars/polars/internals/io.py (10 additions & 4 deletions)
@@ -101,6 +101,10 @@ def managed_file(file: Any) -> Iterator[Any]:
             )
     encoding_str = encoding if encoding else "utf8"
 
+    # PyArrow allows directories, so we only check that something is not
+    # a dir if we are not using PyArrow
+    check_not_dir = not use_pyarrow
+
     if isinstance(file, bytes):
         if has_non_utf8_non_utf8_lossy_encoding:
             return _check_empty(
@@ -138,7 +142,7 @@ def managed_file(file: Any) -> Iterator[Any]:
                     BytesIO(file.read_bytes().decode(encoding_str).encode("utf8")),
                     context=f"Path ({file!r})",
                 )
-        return managed_file(normalise_filepath(file))
+        return managed_file(normalise_filepath(file, check_not_dir))
 
     if isinstance(file, str):
         # make sure that this is before fsspec
@@ -151,7 +155,7 @@
 
         if not has_non_utf8_non_utf8_lossy_encoding:
             if infer_storage_options(file)["protocol"] == "file":
-                return managed_file(normalise_filepath(file))
+                return managed_file(normalise_filepath(file, check_not_dir))
         kwargs["encoding"] = encoding
         return fsspec.open(file, **kwargs)

@@ -161,12 +165,14 @@
 
         if not has_non_utf8_non_utf8_lossy_encoding:
             if all(infer_storage_options(f)["protocol"] == "file" for f in file):
-                return managed_file([normalise_filepath(f) for f in file])
+                return managed_file(
+                    [normalise_filepath(f, check_not_dir) for f in file]
+                )
         kwargs["encoding"] = encoding
         return fsspec.open_files(file, **kwargs)
 
     if isinstance(file, str):
-        file = normalise_filepath(file)
+        file = normalise_filepath(file, check_not_dir)
         if has_non_utf8_non_utf8_lossy_encoding:
             with open(file, encoding=encoding_str) as f:
                 return _check_empty(
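`normalise_filepath` itself is not shown in this diff; its second argument is only inferred from the call sites above. A hedged sketch of what such a helper plausibly does (the parameter name here is hypothetical):

from __future__ import annotations

from pathlib import Path


def normalise_filepath(path: str | Path, check_not_directory: bool = True) -> str:
    # Expand the user's home directory and normalise to a string path.
    p = Path(path).expanduser()
    # PyArrow readers can consume dataset directories, which is why the
    # callers above pass `check_not_dir = not use_pyarrow`.
    if check_not_directory and p.is_dir():
        raise IsADirectoryError(f"expected a file path, got a directory: {p}")
    return str(p)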