Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(python): add map_dict method for Series #6946

Merged
merged 2 commits into from
Feb 17, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions py-polars/docs/source/reference/series/computation.rst
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ Computation
Series.kurtosis
Series.log
Series.log10
Series.map_dict
Series.pct_change
Series.peak_max
Series.peak_min
Expand Down
10 changes: 5 additions & 5 deletions py-polars/polars/internals/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -1233,12 +1233,12 @@ def pandas_has_default_index(df: pd.DataFrame) -> bool:

index_cols = df.index.names

if len(index_cols) > 1: # noqa: SIM114
return False # not default: more than one index
elif index_cols not in ([None], [""]):
return False # not default: index is named
if len(index_cols) > 1 or index_cols not in ([None], [""]):
# not default: more than one index, or index is named
return False
elif df.index.equals(RangeIndex(start=0, stop=len(df), step=1)):
return True # is default: simple range index
# is default: simple range index
return True
else:
# finally, is the index _equivalent_ to a default unnamed
# integer index with frame data that was previously sorted
Expand Down
15 changes: 7 additions & 8 deletions py-polars/polars/internals/expr/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -6047,9 +6047,9 @@ def map_dict(
Parameters
----------
remapping
Mapping dictionary to use for remapping the values.
Dictionary containing the before/after values to map.
default
Value to use when original value was not found in remapping dictionary.
Value to use when the remapping dict does not contain the lookup value.

Warnings
--------
Expand Down Expand Up @@ -6097,7 +6097,7 @@ def map_dict(
│ 3 ┆ DE ┆ Germany │
└────────┴──────────────┴───────────────┘

Set a default value for values that couldn't be mapped.
Set a default value for values that cannot be mapped...

>>> df.with_columns(
... pl.col("country_code")
Expand All @@ -6116,7 +6116,7 @@ def map_dict(
│ 3 ┆ DE ┆ Germany │
└────────┴──────────────┴───────────────┘

Keep the original value for values that couldn't be mapped.
...or keep the original value:

>>> df.with_columns(
... pl.col("country_code")
Expand All @@ -6135,10 +6135,9 @@ def map_dict(
│ 3 ┆ DE ┆ Germany │
└────────┴──────────────┴───────────────┘

If you need to access different columns to set a default value for values that
couldn't be mapped, a struct needs to be constructed, with in the first field
the column that you want to remap and the rest of the fields the other columns
you use in the default expression.
If you need to access different columns to set a default value, a struct needs
to be constructed; in the first field is the column that you want to remap and
the rest of the fields are the other columns used in the default expression.

>>> df.with_columns(
... pl.struct(pl.col(["country_code", "row_nr"])).map_dict(
Expand Down
61 changes: 56 additions & 5 deletions py-polars/polars/internals/series/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -4780,6 +4780,57 @@ def clip_max(self, max_val: int | float) -> Series:

"""

def map_dict(
    self,
    remapping: dict[Any, Any],
    *,
    default: Any = None,
) -> Self:
    """
    Replace values in the Series using a remapping dictionary.

    Every value that appears as a key in ``remapping`` is replaced by the
    corresponding dictionary value; all other values are replaced by ``default``.

    Parameters
    ----------
    remapping
        Dictionary containing the before/after values to map.
    default
        Value to use when the remapping dict does not contain the lookup value.
        This can be a single value, or a Series supplying a per-row fallback;
        passing the Series itself keeps the original value wherever no mapping
        exists (see the examples below).

    Examples
    --------
    >>> s = pl.Series("iso3166", ["TUR", "???", "JPN", "NLD"])
    >>> country_lookup = {
    ...     "JPN": "Japan",
    ...     "TUR": "Türkiye",
    ...     "NLD": "Netherlands",
    ... }

    Remap, setting a default for unrecognised values...

    >>> s.map_dict(country_lookup, default="Unspecified").rename("country_name")
    shape: (4,)
    Series: 'country_name' [str]
    [
        "Türkiye"
        "Unspecified"
        "Japan"
        "Netherlands"
    ]

    ...or keep the original value:

    >>> s.map_dict(country_lookup, default=s).rename("country_name")
    shape: (4,)
    Series: 'country_name' [str]
    [
        "Türkiye"
        "???"
        "Japan"
        "Netherlands"
    ]

    """
    # NOTE(review): no implementation is visible in this block; presumably the
    # call is dispatched to the expression method of the same name - confirm.

def reshape(self, dims: tuple[int, ...]) -> Series:
"""
Reshape this Series to a flat Series or a Series of Lists.
Expand Down Expand Up @@ -5136,6 +5187,11 @@ def arr(self) -> ListNameSpace:
"""Create an object namespace of all list related methods."""
return ListNameSpace(self)

@property
def bin(self) -> BinaryNameSpace:
    """Return the namespace object exposing all binary related methods."""
    namespace = BinaryNameSpace(self)
    return namespace

@property
def cat(self) -> CatNameSpace:
"""Create an object namespace of all categorical related methods."""
Expand All @@ -5151,11 +5207,6 @@ def str(self) -> StringNameSpace:
"""Create an object namespace of all string related methods."""
return StringNameSpace(self)

@property
def bin(self) -> BinaryNameSpace:
    """Return the namespace object exposing all binary related methods."""
    namespace = BinaryNameSpace(self)
    return namespace

@property
def struct(self) -> StructNameSpace:
"""Create an object namespace of all struct related methods."""
Expand Down
2 changes: 1 addition & 1 deletion py-polars/tests/unit/test_exprs.py
Original file line number Diff line number Diff line change
Expand Up @@ -508,7 +508,7 @@ def test_map_dict() -> None:

assert (
df.with_columns(
pl.struct(pl.col(["country_code", "row_nr"])) # type: ignore[union-attr]
pl.struct(pl.col(["country_code", "row_nr"]))
.map_dict(
country_code_dict,
default=pl.col("row_nr").cast(pl.Utf8),
Expand Down
14 changes: 14 additions & 0 deletions py-polars/tests/unit/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2392,3 +2392,17 @@ def test_is_between() -> None:
True,
False,
]


def test_map_dict() -> None:
    """Check ``Series.map_dict`` with a scalar default and with a Series default."""
    source = pl.Series("s", [-1, 2, None, 4, -5])
    lookup = {1: "one", 2: "two", 3: "three", 4: "four", 5: "five"}

    # Scalar default: after abs() every non-null value has a key in the lookup,
    # so only the null entry is expected to fall back to "?".
    with_scalar_default = source.abs().map_dict(lookup, default="?")
    expected_scalar = pl.Series("s", ["one", "two", "?", "four", "five"])
    assert_series_equal(with_scalar_default, expected_scalar)

    # Series default: entries without a mapping are expected to take the value
    # at the same position of the stringified input, so the null stays null.
    fallback = source.cast(pl.Utf8)
    with_series_default = source.map_dict(lookup, default=fallback)
    expected_series = pl.Series("s", ["-1", "two", None, "four", "-5"])
    assert_series_equal(with_series_default, expected_series)