diff --git a/modin/pandas/general.py b/modin/pandas/general.py
index 581579cafe6..1aa39f9e731 100644
--- a/modin/pandas/general.py
+++ b/modin/pandas/general.py
@@ -159,27 +159,146 @@ def merge_asof(
             "can not merge DataFrame with instance of type {}".format(type(right))
         )
     ErrorMessage.default_to_pandas("`merge_asof`")
-    if isinstance(right, DataFrame):
-        right = to_pandas(right)
-    return DataFrame(
-        pandas.merge_asof(
-            to_pandas(left),
-            right,
-            on=on,
-            left_on=left_on,
-            right_on=right_on,
-            left_index=left_index,
-            right_index=right_index,
-            by=by,
-            left_by=left_by,
-            right_by=right_by,
-            suffixes=suffixes,
-            tolerance=tolerance,
-            allow_exact_matches=allow_exact_matches,
-            direction=direction,
+
+    # As of Pandas 1.2 these should raise; before that it did something likely
+    # random. Test against None, not truthiness, so a falsy label like 0 counts:
+    if (
+        (on is not None and (left_index or right_index))
+        or (left_on is not None and left_index)
+        or (right_on is not None and right_index)
+    ):
+        raise ValueError("Can't combine left/right_index with left/right_on or on.")
+
+    # Pandas fallbacks for tricky cases:
+    if (
+        # left_index combined with right_on: what Pandas does here is unclear,
+        # and a Pandas bug report suggests it is wrong, so defer to Pandas:
+        # https://github.com/pandas-dev/pandas/issues/33463
+        (left_index and right_on is not None)
+        # `by`/`left_by`/`right_by` is not a single str, e.g. a list of columns.
+        # If we're copying lots of columns over to Pandas, maybe not worth
+        # trying our path; it's not clear it's any better:
+        or not isinstance(by, (str, type(None)))
+        or not isinstance(left_by, (str, type(None)))
+        or not isinstance(right_by, (str, type(None)))
+    ):
+        if isinstance(right, DataFrame):
+            right = to_pandas(right)
+        return DataFrame(
+            pandas.merge_asof(
+                to_pandas(left),
+                right,
+                on=on,
+                left_on=left_on,
+                right_on=right_on,
+                left_index=left_index,
+                right_index=right_index,
+                by=by,
+                left_by=left_by,
+                right_by=right_by,
+                suffixes=suffixes,
+                tolerance=tolerance,
+                allow_exact_matches=allow_exact_matches,
+                direction=direction,
+            )
         )
+
+    left_column = None
+    right_column = None
+
+    if on is not None:
+        if left_on is not None or right_on is not None:
+            raise ValueError("If 'on' is set, 'left_on' and 'right_on' can't be set.")
+        left_on = on
+        right_on = on
+
+    if left_on is not None:
+        left_column = to_pandas(left[left_on])
+    elif left_index:
+        left_column = left.index
+    else:
+        raise ValueError("Need some sort of 'on' spec")
+
+    if right_on is not None:
+        right_column = to_pandas(right[right_on])
+    elif right_index:
+        right_column = right.index
+    else:
+        raise ValueError("Need some sort of 'on' spec")
+
+    # If we haven't set these by now, there's a bug in this function.
+    assert left_column is not None
+    assert right_column is not None
+
+    if by is not None:
+        if left_by is not None or right_by is not None:
+            raise ValueError("Can't have both 'by' and 'left_by' or 'right_by'")
+        left_by = right_by = by
+
+    # The list-of-columns case should have been handled by the direct Pandas
+    # fallback earlier:
+    assert isinstance(left_by, (str, type(None)))
+    assert isinstance(right_by, (str, type(None)))
+
+    left_pandas_limited = {"on": left_column}
+    right_pandas_limited = {"on": right_column, "right_labels": right.index}
+    extra_kwargs = {}  # extra arguments to Pandas merge_asof
+
+    if left_by is not None or right_by is not None:
+        extra_kwargs["by"] = "by"
+        left_pandas_limited["by"] = to_pandas(left[left_by])
+        right_pandas_limited["by"] = to_pandas(right[right_by])
+
+    # 1. Construct Pandas DataFrames with just the 'on' and optional 'by'
+    # columns; right also carries its index labels in a 'right_labels' column.
+    left_pandas_limited = pandas.DataFrame(left_pandas_limited, index=left.index)
+    right_pandas_limited = pandas.DataFrame(right_pandas_limited)
+
+    # 2. Use Pandas' merge_asof to figure out how to map labels on left to
+    # labels on the right.
+    merged = pandas.merge_asof(
+        left_pandas_limited,
+        right_pandas_limited,
+        on="on",
+        direction=direction,
+        allow_exact_matches=allow_exact_matches,
+        tolerance=tolerance,
+        **extra_kwargs,
+    )
+    # Now merged["right_labels"] shows which labels from right map to left's index.
+
+    # 3. Re-index right using the merged["right_labels"]; at this point right
+    # should be same length and (semantically) same order as left:
+    right_subset = right.reindex(index=pandas.Index(merged["right_labels"]))
+    if not right_index:
+        right_subset.drop(columns=[right_on], inplace=True)
+    if right_by is not None and left_by == right_by:
+        right_subset.drop(columns=[right_by], inplace=True)
+    right_subset.index = left.index
+
+    # 4. Merge left and the new shrunken right:
+    result = merge(
+        left,
+        right_subset,
+        left_index=True,
+        right_index=True,
+        suffixes=suffixes,
+        how="left",
     )
 
+    # 5. Clean up to match Pandas output:
+    if left_on is not None and right_index:
+        result.insert(
+            # In theory this could use get_indexer_for(), but that causes an error:
+            list(result.columns).index(left_on + suffixes[0]),
+            left_on,
+            result[left_on + suffixes[0]],
+        )
+    if not left_index and not right_index:
+        result.index = pandas.RangeIndex(start=0, stop=len(result))
+
+    return result
+
 
 @_inherit_docstrings(pandas.pivot_table)
 def pivot_table(
diff --git a/modin/pandas/test/test_general.py b/modin/pandas/test/test_general.py
index cab6ddc6c9d..d24d9f78474 100644
--- a/modin/pandas/test/test_general.py
+++ b/modin/pandas/test/test_general.py
@@ -16,7 +16,7 @@
 import modin.pandas as pd
 import numpy as np
 from numpy.testing import assert_array_equal
-from modin.utils import get_current_backend
+from modin.utils import get_current_backend, to_pandas
 
 from .utils import test_data_values, test_data_keys, df_equals
 
@@ -217,6 +217,264 @@ def test_merge_asof():
         )
 
 
+def test_merge_asof_on_variations():
+    """on=,left_on=,right_on=,right_index=,left_index= options match Pandas."""
+    left = {"a": [1, 5, 10], "left_val": ["a", "b", "c"]}
+    left_index = [6, 8, 12]
+    right = {"a": [1, 2, 3, 6, 7], "right_val": ["d", "e", "f", "g", "h"]}
+    right_index = [6, 7, 8, 9, 15]
+    pandas_left, pandas_right = (
+        pandas.DataFrame(left, index=left_index),
+        pandas.DataFrame(right, index=right_index),
+    )
+    modin_left, modin_right = pd.DataFrame(left, index=left_index), pd.DataFrame(
+        right, index=right_index
+    )
+    for on_arguments in [
+        {"on": "a"},
+        {"left_on": "a", "right_on": "a"},
+        {"left_on": "a", "right_index": True},
+        {"left_index": True, "right_on": "a"},
+        {"left_index": True, "right_index": True},
+    ]:
+        pandas_merged = pandas.merge_asof(pandas_left, pandas_right, **on_arguments)
+        modin_merged = pd.merge_asof(modin_left, modin_right, **on_arguments)
+        df_equals(pandas_merged, modin_merged)
+
+
+def test_merge_asof_suffixes():
+    """Suffix variations are handled the same as Pandas."""
+    left = {"a": [1, 5, 10]}
+    right = {"a": [2, 3, 6]}
+    pandas_left, pandas_right = (pandas.DataFrame(left), pandas.DataFrame(right))
+    modin_left, modin_right = pd.DataFrame(left), pd.DataFrame(right)
+    for suffixes in [("a", "b"), (False, "c"), ("d", False)]:
+        pandas_merged = pandas.merge_asof(
+            pandas_left,
+            pandas_right,
+            left_index=True,
+            right_index=True,
+            suffixes=suffixes,
+        )
+        modin_merged = pd.merge_asof(
+            modin_left,
+            modin_right,
+            left_index=True,
+            right_index=True,
+            suffixes=suffixes,
+        )
+        df_equals(pandas_merged, modin_merged)
+
+    with pytest.raises(ValueError):
+        pandas.merge_asof(
+            pandas_left,
+            pandas_right,
+            left_index=True,
+            right_index=True,
+            suffixes=(False, False),
+        )
+    with pytest.raises(ValueError):
+        pd.merge_asof(
+            modin_left,
+            modin_right,
+            left_index=True,
+            right_index=True,
+            suffixes=(False, False),
+        )
+
+
+def test_merge_asof_bad_arguments():
+    left = {"a": [1, 5, 10], "b": [5, 7, 9]}
+    right = {"a": [2, 3, 6], "b": [6, 5, 20]}
+    pandas_left, pandas_right = (pandas.DataFrame(left), pandas.DataFrame(right))
+    modin_left, modin_right = pd.DataFrame(left), pd.DataFrame(right)
+
+    # Can't mix by with left_by/right_by
+    with pytest.raises(ValueError):
+        pandas.merge_asof(
+            pandas_left, pandas_right, on="a", by="b", left_by="can't do with by"
+        )
+    with pytest.raises(ValueError):
+        pd.merge_asof(
+            modin_left, modin_right, on="a", by="b", left_by="can't do with by"
+        )
+    with pytest.raises(ValueError):
+        pandas.merge_asof(
+            pandas_left, pandas_right, by="b", on="a", right_by="can't do with by"
+        )
+    with pytest.raises(ValueError):
+        pd.merge_asof(
+            modin_left, modin_right, by="b", on="a", right_by="can't do with by"
+        )
+
+    # Can't mix on with left_on/right_on
+    with pytest.raises(ValueError):
+        pandas.merge_asof(pandas_left, pandas_right, on="a", left_on="can't do with by")
+    with pytest.raises(ValueError):
+        pd.merge_asof(modin_left, modin_right, on="a", left_on="can't do with by")
+    with pytest.raises(ValueError):
+        pandas.merge_asof(
+            pandas_left, pandas_right, on="a", right_on="can't do with by"
+        )
+    with pytest.raises(ValueError):
+        pd.merge_asof(modin_left, modin_right, on="a", right_on="can't do with by")
+
+    # Can't mix left_index with left_on or on, similarly for right.
+    with pytest.raises(ValueError):
+        pd.merge_asof(modin_left, modin_right, on="a", right_index=True)
+    with pytest.raises(ValueError):
+        pd.merge_asof(
+            modin_left, modin_right, left_on="a", right_on="a", right_index=True
+        )
+    with pytest.raises(ValueError):
+        pd.merge_asof(modin_left, modin_right, on="a", left_index=True)
+    with pytest.raises(ValueError):
+        pd.merge_asof(
+            modin_left, modin_right, left_on="a", right_on="a", left_index=True
+        )
+
+    # Need an 'on' spec for both left and right
+    with pytest.raises(Exception):  # Pandas bug, didn't validate inputs sufficiently
+        pandas.merge_asof(pandas_left, pandas_right, left_on="a")
+    with pytest.raises(ValueError):
+        pd.merge_asof(modin_left, modin_right, left_on="a")
+    with pytest.raises(Exception):  # Pandas bug, didn't validate inputs sufficiently
+        pandas.merge_asof(pandas_left, pandas_right, right_on="a")
+    with pytest.raises(ValueError):
+        pd.merge_asof(modin_left, modin_right, right_on="a")
+    with pytest.raises(ValueError):
+        pandas.merge_asof(pandas_left, pandas_right)
+    with pytest.raises(ValueError):
+        pd.merge_asof(modin_left, modin_right)
+
+
+def test_merge_asof_merge_options():
+    modin_quotes = pd.DataFrame(
+        {
+            "time": [
+                pd.Timestamp("2016-05-25 13:30:00.023"),
+                pd.Timestamp("2016-05-25 13:30:00.023"),
+                pd.Timestamp("2016-05-25 13:30:00.030"),
+                pd.Timestamp("2016-05-25 13:30:00.041"),
+                pd.Timestamp("2016-05-25 13:30:00.048"),
+                pd.Timestamp("2016-05-25 13:30:00.049"),
+                pd.Timestamp("2016-05-25 13:30:00.072"),
+                pd.Timestamp("2016-05-25 13:30:00.075"),
+            ],
+            "ticker": ["GOOG", "MSFT", "MSFT", "MSFT", "GOOG", "AAPL", "GOOG", "MSFT"],
+            "bid": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01],
+            "ask": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03],
+        }
+    )
+    modin_trades = pd.DataFrame(
+        {
+            "time": [
+                pd.Timestamp("2016-05-25 13:30:00.023"),
+                pd.Timestamp("2016-05-25 13:30:00.038"),
+                pd.Timestamp("2016-05-25 13:30:00.048"),
+                pd.Timestamp("2016-05-25 13:30:00.048"),
+                pd.Timestamp("2016-05-25 13:30:00.048"),
+            ],
+            "ticker2": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"],
+            "price": [51.95, 51.95, 720.77, 720.92, 98.0],
+            "quantity": [75, 155, 100, 100, 100],
+        }
+    )
+    pandas_quotes, pandas_trades = to_pandas(modin_quotes), to_pandas(modin_trades)
+
+    # left_by + right_by
+    df_equals(
+        pandas.merge_asof(
+            pandas_quotes,
+            pandas_trades,
+            on="time",
+            left_by="ticker",
+            right_by="ticker2",
+        ),
+        pd.merge_asof(
+            modin_quotes,
+            modin_trades,
+            on="time",
+            left_by="ticker",
+            right_by="ticker2",
+        ),
+    )
+
+    # Just by:
+    pandas_trades["ticker"] = pandas_trades["ticker2"]
+    modin_trades["ticker"] = modin_trades["ticker2"]
+    df_equals(
+        pandas.merge_asof(
+            pandas_quotes,
+            pandas_trades,
+            on="time",
+            by="ticker",
+        ),
+        pd.merge_asof(
+            modin_quotes,
+            modin_trades,
+            on="time",
+            by="ticker",
+        ),
+    )
+
+    # Tolerance
+    df_equals(
+        pandas.merge_asof(
+            pandas_quotes,
+            pandas_trades,
+            on="time",
+            by="ticker",
+            tolerance=pd.Timedelta("2ms"),
+        ),
+        pd.merge_asof(
+            modin_quotes,
+            modin_trades,
+            on="time",
+            by="ticker",
+            tolerance=pd.Timedelta("2ms"),
+        ),
+    )
+
+    # Direction
+    df_equals(
+        pandas.merge_asof(
+            pandas_quotes,
+            pandas_trades,
+            on="time",
+            by="ticker",
+            direction="forward",
+        ),
+        pd.merge_asof(
+            modin_quotes,
+            modin_trades,
+            on="time",
+            by="ticker",
+            direction="forward",
+        ),
+    )
+
+    # Allow exact matches
+    df_equals(
+        pandas.merge_asof(
+            pandas_quotes,
+            pandas_trades,
+            on="time",
+            by="ticker",
+            tolerance=pd.Timedelta("10ms"),
+            allow_exact_matches=False,
+        ),
+        pd.merge_asof(
+            modin_quotes,
+            modin_trades,
+            on="time",
+            by="ticker",
+            tolerance=pd.Timedelta("10ms"),
+            allow_exact_matches=False,
+        ),
+    )
+
+
 def test_pivot():
     test_df = pd.DataFrame(
         {