From 63cf839c09e02e23aff015b712c9c94c84f0a8f7 Mon Sep 17 00:00:00 2001 From: Alexey Prutskov Date: Wed, 3 Jun 2020 23:13:27 +0300 Subject: [PATCH 1/2] Add implementation of dt property (#1450) Signed-off-by: Alexey Prutskov --- docs/supported_apis/series_supported.rst | 2 +- modin/backends/pandas/query_compiler.py | 99 ++++++++++++++ modin/pandas/series.py | 159 ++++++++++++++++++++++- modin/pandas/test/test_series.py | 57 ++++++++ 4 files changed, 315 insertions(+), 2 deletions(-) diff --git a/docs/supported_apis/series_supported.rst b/docs/supported_apis/series_supported.rst index d443830845f..9a7a128446e 100644 --- a/docs/supported_apis/series_supported.rst +++ b/docs/supported_apis/series_supported.rst @@ -130,7 +130,7 @@ the related section on `Defaulting to pandas`_. +-----------------------------+---------------------------------+ | ``dropna`` | Y | +-----------------------------+---------------------------------+ -| ``dt`` | D | +| ``dt`` | Y | +-----------------------------+---------------------------------+ | ``dtype`` | Y | +-----------------------------+---------------------------------+ diff --git a/modin/backends/pandas/query_compiler.py b/modin/backends/pandas/query_compiler.py index ee947829660..f2300e89884 100644 --- a/modin/backends/pandas/query_compiler.py +++ b/modin/backends/pandas/query_compiler.py @@ -60,6 +60,61 @@ def str_op_builder(df, *args, **kwargs): return str_op_builder +def _dt_prop_map(property_name): + """ + Create a function that reads a property of the `dt` accessor of the series. + + Parameters + ---------- + property_name + The property of `dt`, which will be applied. + + Returns + ------- + A callable function to be applied in the partitions + + Notes + ----- + This applies non-callable properties of `Series.dt`. 
+ """ + + def dt_op_builder(df, *args, **kwargs): + prop_val = getattr(df.squeeze().dt, property_name) + if isinstance(prop_val, pandas.Series): + return prop_val.to_frame() + else: + return pandas.DataFrame([prop_val]) + + return dt_op_builder + + +def _dt_func_map(func_name): + """ + Create a function that calls a method of the `dt` accessor of the series. + + Parameters + ---------- + func_name + The method of `dt`, which will be applied. + + Returns + ------- + A callable function to be applied in the partitions + + Notes + ----- + This applies callable methods of `Series.dt`. + """ + + def dt_op_builder(df, *args, **kwargs): + dt_s = df.squeeze().dt + return pandas.DataFrame( + getattr(pandas.Series.dt, func_name)(dt_s, *args, **kwargs) + ) + + return dt_op_builder + + def copy_df_for_func(func): """Create a function that copies the dataframe, likely because `func` is inplace. @@ -493,6 +548,50 @@ def unique(self): ) return self.__constructor__(new_modin_frame) + # Dt map partitions operations + + dt_date = MapFunction.register(_dt_prop_map("date")) + dt_time = MapFunction.register(_dt_prop_map("time")) + dt_timetz = MapFunction.register(_dt_prop_map("timetz")) + dt_year = MapFunction.register(_dt_prop_map("year")) + dt_month = MapFunction.register(_dt_prop_map("month")) + dt_day = MapFunction.register(_dt_prop_map("day")) + dt_hour = MapFunction.register(_dt_prop_map("hour")) + dt_minute = MapFunction.register(_dt_prop_map("minute")) + dt_second = MapFunction.register(_dt_prop_map("second")) + dt_microsecond = MapFunction.register(_dt_prop_map("microsecond")) + dt_nanosecond = MapFunction.register(_dt_prop_map("nanosecond")) + dt_week = MapFunction.register(_dt_prop_map("week")) + dt_weekofyear = MapFunction.register(_dt_prop_map("weekofyear")) + dt_dayofweek = MapFunction.register(_dt_prop_map("dayofweek")) + dt_weekday = MapFunction.register(_dt_prop_map("weekday")) + dt_dayofyear = MapFunction.register(_dt_prop_map("dayofyear")) + dt_quarter = 
MapFunction.register(_dt_prop_map("quarter")) + dt_is_month_start = MapFunction.register(_dt_prop_map("is_month_start")) + dt_is_month_end = MapFunction.register(_dt_prop_map("is_month_end")) + dt_is_quarter_start = MapFunction.register(_dt_prop_map("is_quarter_start")) + dt_is_quarter_end = MapFunction.register(_dt_prop_map("is_quarter_end")) + dt_is_year_start = MapFunction.register(_dt_prop_map("is_year_start")) + dt_is_year_end = MapFunction.register(_dt_prop_map("is_year_end")) + dt_is_leap_year = MapFunction.register(_dt_prop_map("is_leap_year")) + dt_daysinmonth = MapFunction.register(_dt_prop_map("daysinmonth")) + dt_days_in_month = MapFunction.register(_dt_prop_map("days_in_month")) + dt_tz = MapFunction.register(_dt_prop_map("tz")) + dt_freq = MapFunction.register(_dt_prop_map("freq")) + dt_to_period = MapFunction.register(_dt_func_map("to_period")) + dt_to_pydatetime = MapFunction.register(_dt_func_map("to_pydatetime")) + dt_tz_localize = MapFunction.register(_dt_func_map("tz_localize")) + dt_tz_convert = MapFunction.register(_dt_func_map("tz_convert")) + dt_normalize = MapFunction.register(_dt_func_map("normalize")) + dt_strftime = MapFunction.register(_dt_func_map("strftime")) + dt_round = MapFunction.register(_dt_func_map("round")) + dt_floor = MapFunction.register(_dt_func_map("floor")) + dt_ceil = MapFunction.register(_dt_func_map("ceil")) + dt_month_name = MapFunction.register(_dt_func_map("month_name")) + dt_day_name = MapFunction.register(_dt_func_map("day_name")) + + # END Dt map partitions operations + def astype(self, col_dtypes, **kwargs): """Converts columns dtypes to given dtypes. 
diff --git a/modin/pandas/series.py b/modin/pandas/series.py index d022020df35..f19254a5722 100644 --- a/modin/pandas/series.py +++ b/modin/pandas/series.py @@ -1311,7 +1311,7 @@ def cat(self): @property def dt(self): - return self._default_to_pandas(pandas.Series.dt) + return DatetimeProperties(self) @property def dtype(self): @@ -1399,6 +1399,163 @@ def _to_pandas(self): return series +class DatetimeProperties(object): + def __init__(self, series): + self._series = series + self._query_compiler = series._query_compiler + + @property + def date(self): + return Series(query_compiler=self._query_compiler.dt_date()) + + @property + def time(self): + return Series(query_compiler=self._query_compiler.dt_time()) + + @property + def timetz(self): + return Series(query_compiler=self._query_compiler.dt_timetz()) + + @property + def year(self): + return Series(query_compiler=self._query_compiler.dt_year()) + + @property + def month(self): + return Series(query_compiler=self._query_compiler.dt_month()) + + @property + def day(self): + return Series(query_compiler=self._query_compiler.dt_day()) + + @property + def hour(self): + return Series(query_compiler=self._query_compiler.dt_hour()) + + @property + def minute(self): + return Series(query_compiler=self._query_compiler.dt_minute()) + + @property + def second(self): + return Series(query_compiler=self._query_compiler.dt_second()) + + @property + def microsecond(self): + return Series(query_compiler=self._query_compiler.dt_microsecond()) + + @property + def nanosecond(self): + return Series(query_compiler=self._query_compiler.dt_nanosecond()) + + @property + def week(self): + return Series(query_compiler=self._query_compiler.dt_week()) + + @property + def weekofyear(self): + return Series(query_compiler=self._query_compiler.dt_weekofyear()) + + @property + def dayofweek(self): + return Series(query_compiler=self._query_compiler.dt_dayofweek()) + + @property + def weekday(self): + return 
Series(query_compiler=self._query_compiler.dt_weekday()) + + @property + def dayofyear(self): + return Series(query_compiler=self._query_compiler.dt_dayofyear()) + + @property + def quarter(self): + return Series(query_compiler=self._query_compiler.dt_quarter()) + + @property + def is_month_start(self): + return Series(query_compiler=self._query_compiler.dt_is_month_start()) + + @property + def is_month_end(self): + return Series(query_compiler=self._query_compiler.dt_is_month_end()) + + @property + def is_quarter_start(self): + return Series(query_compiler=self._query_compiler.dt_is_quarter_start()) + + @property + def is_quarter_end(self): + return Series(query_compiler=self._query_compiler.dt_is_quarter_end()) + + @property + def is_year_start(self): + return Series(query_compiler=self._query_compiler.dt_is_year_start()) + + @property + def is_year_end(self): + return Series(query_compiler=self._query_compiler.dt_is_year_end()) + + @property + def is_leap_year(self): + return Series(query_compiler=self._query_compiler.dt_is_leap_year()) + + @property + def daysinmonth(self): + return Series(query_compiler=self._query_compiler.dt_daysinmonth()) + + @property + def days_in_month(self): + return Series(query_compiler=self._query_compiler.dt_days_in_month()) + + @property + def tz(self): + return self._query_compiler.dt_tz().to_pandas().squeeze() + + @property + def freq(self): + return self._query_compiler.dt_freq().to_pandas().squeeze() + + def to_period(self, *args, **kwargs): + return Series(query_compiler=self._query_compiler.dt_to_period(*args, **kwargs)) + + def to_pydatetime(self): + return Series(query_compiler=self._query_compiler.dt_to_pydatetime()).to_numpy() + + def tz_localize(self, *args, **kwargs): + return Series( + query_compiler=self._query_compiler.dt_tz_localize(*args, **kwargs) + ) + + def tz_convert(self, *args, **kwargs): + return Series( + query_compiler=self._query_compiler.dt_tz_convert(*args, **kwargs) + ) + + def normalize(self, *args, 
**kwargs): + return Series(query_compiler=self._query_compiler.dt_normalize(*args, **kwargs)) + + def strftime(self, *args, **kwargs): + return Series(query_compiler=self._query_compiler.dt_strftime(*args, **kwargs)) + + def round(self, *args, **kwargs): + return Series(query_compiler=self._query_compiler.dt_round(*args, **kwargs)) + + def floor(self, *args, **kwargs): + return Series(query_compiler=self._query_compiler.dt_floor(*args, **kwargs)) + + def ceil(self, *args, **kwargs): + return Series(query_compiler=self._query_compiler.dt_ceil(*args, **kwargs)) + + def month_name(self, *args, **kwargs): + return Series( + query_compiler=self._query_compiler.dt_month_name(*args, **kwargs) + ) + + def day_name(self, *args, **kwargs): + return Series(query_compiler=self._query_compiler.dt_day_name(*args, **kwargs)) + + class StringMethods(object): def __init__(self, series): # Check if dtypes is objects diff --git a/modin/pandas/test/test_series.py b/modin/pandas/test/test_series.py index f588c32cb52..11b6f02986d 100644 --- a/modin/pandas/test/test_series.py +++ b/modin/pandas/test/test_series.py @@ -1338,6 +1338,63 @@ def test_dtype(data): df_equals(modin_series.dtype, pandas_series.dtypes) +def test_dt(): + data = pd.date_range("2016-12-31", "2017-01-08", freq="D", tz="Europe/Berlin") + modin_series = pd.Series(data) + pandas_series = pandas.Series(data) + + df_equals(modin_series.dt.date, pandas_series.dt.date) + df_equals(modin_series.dt.time, pandas_series.dt.time) + df_equals(modin_series.dt.timetz, pandas_series.dt.timetz) + df_equals(modin_series.dt.year, pandas_series.dt.year) + df_equals(modin_series.dt.month, pandas_series.dt.month) + df_equals(modin_series.dt.day, pandas_series.dt.day) + df_equals(modin_series.dt.hour, pandas_series.dt.hour) + df_equals(modin_series.dt.minute, pandas_series.dt.minute) + df_equals(modin_series.dt.second, pandas_series.dt.second) + df_equals(modin_series.dt.microsecond, pandas_series.dt.microsecond) + 
df_equals(modin_series.dt.nanosecond, pandas_series.dt.nanosecond) + df_equals(modin_series.dt.week, pandas_series.dt.week) + df_equals(modin_series.dt.weekofyear, pandas_series.dt.weekofyear) + df_equals(modin_series.dt.dayofweek, pandas_series.dt.dayofweek) + df_equals(modin_series.dt.weekday, pandas_series.dt.weekday) + df_equals(modin_series.dt.dayofyear, pandas_series.dt.dayofyear) + df_equals(modin_series.dt.quarter, pandas_series.dt.quarter) + df_equals(modin_series.dt.is_month_start, pandas_series.dt.is_month_start) + df_equals(modin_series.dt.is_month_end, pandas_series.dt.is_month_end) + df_equals(modin_series.dt.is_quarter_start, pandas_series.dt.is_quarter_start) + df_equals(modin_series.dt.is_quarter_end, pandas_series.dt.is_quarter_end) + df_equals(modin_series.dt.is_year_start, pandas_series.dt.is_year_start) + df_equals(modin_series.dt.is_year_end, pandas_series.dt.is_year_end) + df_equals(modin_series.dt.is_leap_year, pandas_series.dt.is_leap_year) + df_equals(modin_series.dt.daysinmonth, pandas_series.dt.daysinmonth) + df_equals(modin_series.dt.days_in_month, pandas_series.dt.days_in_month) + assert modin_series.dt.tz == pandas_series.dt.tz + assert modin_series.dt.freq == pandas_series.dt.freq + df_equals(modin_series.dt.to_period("W"), pandas_series.dt.to_period("W")) + assert_array_equal( + modin_series.dt.to_pydatetime(), pandas_series.dt.to_pydatetime() + ) + df_equals( + modin_series.dt.tz_localize(None), pandas_series.dt.tz_localize(None), + ) + df_equals( + modin_series.dt.tz_convert(tz="Europe/Berlin"), + pandas_series.dt.tz_convert(tz="Europe/Berlin"), + ) + + df_equals(modin_series.dt.normalize(), pandas_series.dt.normalize()) + df_equals( + modin_series.dt.strftime("%B %d, %Y, %r"), + pandas_series.dt.strftime("%B %d, %Y, %r"), + ) + df_equals(modin_series.dt.round("H"), pandas_series.dt.round("H")) + df_equals(modin_series.dt.floor("H"), pandas_series.dt.floor("H")) + df_equals(modin_series.dt.ceil("H"), pandas_series.dt.ceil("H")) + 
df_equals(modin_series.dt.month_name(), pandas_series.dt.month_name()) + df_equals(modin_series.dt.day_name(), pandas_series.dt.day_name()) + + @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize( "keep", ["last", "first", False], ids=["last", "first", "False"] From 2d74813d11fb42e64054ee5ca0c69269cc2d9ee0 Mon Sep 17 00:00:00 2001 From: YarShev Date: Thu, 4 Jun 2020 19:02:23 +0300 Subject: [PATCH 2/2] Add `unique` implementation as free function (#1537) Signed-off-by: Yaroslav Igoshev yaroslav.igoshev@intel.com --- docs/supported_apis/utilities_supported.rst | 2 +- modin/pandas/__init__.py | 4 +- modin/pandas/general.py | 16 ++++++ modin/pandas/test/test_general.py | 59 +++++++++++++++++++++ 4 files changed, 78 insertions(+), 3 deletions(-) diff --git a/docs/supported_apis/utilities_supported.rst b/docs/supported_apis/utilities_supported.rst index 419b9ec8006..894a8a97f6b 100644 --- a/docs/supported_apis/utilities_supported.rst +++ b/docs/supported_apis/utilities_supported.rst @@ -19,7 +19,7 @@ default to pandas. +---------------------------+---------------------------------+----------------------------------------------------+ | `pd.eval`_ | Y | | +---------------------------+---------------------------------+----------------------------------------------------+ -| `pd.unique`_ | D | | +| `pd.unique`_ | Y | | +---------------------------+---------------------------------+----------------------------------------------------+ | ``pd.value_counts`` | D | | +---------------------------+---------------------------------+----------------------------------------------------+ diff --git a/modin/pandas/__init__.py b/modin/pandas/__init__.py index ab61e2a5821..6e79f2c7548 100644 --- a/modin/pandas/__init__.py +++ b/modin/pandas/__init__.py @@ -27,7 +27,6 @@ from pandas import ( eval, - unique, value_counts, cut, to_numeric, @@ -132,6 +131,7 @@ notnull, notna, pivot, + unique, ) from .plotting import Plotting as plotting from .. 
import __execution_engine__ as execution_engine @@ -283,7 +283,6 @@ def import_pandas(*args): "json_normalize", "concat", "eval", - "unique", "value_counts", "cut", "to_numeric", @@ -363,6 +362,7 @@ def import_pandas(*args): "notnull", "notna", "pivot", + "unique", "datetime", "NamedAgg", "DEFAULT_NPARTITIONS", diff --git a/modin/pandas/general.py b/modin/pandas/general.py index 636d98f80c2..8d1f29f0b89 100644 --- a/modin/pandas/general.py +++ b/modin/pandas/general.py @@ -16,6 +16,7 @@ from modin.error_message import ErrorMessage from .base import BasePandasDataset from .dataframe import DataFrame +from .series import Series from .utils import to_pandas @@ -217,3 +218,18 @@ def pivot(data, index=None, columns=None, values=None): if not isinstance(data, DataFrame): raise ValueError("can not pivot with instance of type {}".format(type(data))) return data.pivot(index=index, columns=columns, values=values) + + +def unique(values): + """ + Return unique values of input data. + + Uniques are returned in order of appearance. Hash table-based unique, + therefore does NOT sort. + + Returns + ------- + ndarray + The unique values returned as a NumPy array. 
+ """ + return Series(values).unique() diff --git a/modin/pandas/test/test_general.py b/modin/pandas/test/test_general.py index c5d93e88a92..d7418792f30 100644 --- a/modin/pandas/test/test_general.py +++ b/modin/pandas/test/test_general.py @@ -15,6 +15,7 @@ import pytest import modin.pandas as pd import numpy as np +from numpy.testing import assert_array_equal from .utils import test_data_values, test_data_keys, df_equals @@ -260,6 +261,64 @@ def test_pivot_table(): ) +def test_unique(): + modin_result = pd.unique([2, 1, 3, 3]) + pandas_result = pandas.unique([2, 1, 3, 3]) + assert_array_equal(modin_result, pandas_result) + + modin_result = pd.unique(pd.Series([2] + [1] * 5)) + pandas_result = pandas.unique(pandas.Series([2] + [1] * 5)) + assert_array_equal(modin_result, pandas_result) + + modin_result = pd.unique( + pd.Series([pd.Timestamp("20160101"), pd.Timestamp("20160101")]) + ) + pandas_result = pandas.unique( + pandas.Series([pandas.Timestamp("20160101"), pandas.Timestamp("20160101")]) + ) + assert_array_equal(modin_result, pandas_result) + + modin_result = pd.unique( + pd.Series( + [ + pd.Timestamp("20160101", tz="US/Eastern"), + pd.Timestamp("20160101", tz="US/Eastern"), + ] + ) + ) + pandas_result = pandas.unique( + pandas.Series( + [ + pandas.Timestamp("20160101", tz="US/Eastern"), + pandas.Timestamp("20160101", tz="US/Eastern"), + ] + ) + ) + assert_array_equal(modin_result, pandas_result) + + modin_result = pd.unique( + pd.Index( + [ + pd.Timestamp("20160101", tz="US/Eastern"), + pd.Timestamp("20160101", tz="US/Eastern"), + ] + ) + ) + pandas_result = pandas.unique( + pandas.Index( + [ + pandas.Timestamp("20160101", tz="US/Eastern"), + pandas.Timestamp("20160101", tz="US/Eastern"), + ] + ) + ) + assert_array_equal(modin_result, pandas_result) + + modin_result = pd.unique(pd.Series(pd.Categorical(list("baabc")))) + pandas_result = pandas.unique(pandas.Series(pandas.Categorical(list("baabc")))) + assert_array_equal(modin_result, pandas_result) + + def 
test_to_datetime(): # DataFrame input for to_datetime modin_df = pd.DataFrame({"year": [2015, 2016], "month": [2, 3], "day": [4, 5]})