From 5ae1a59bc940947ffb45c73bf7f927effada6070 Mon Sep 17 00:00:00 2001
From: Wes McKinney
Date: Tue, 11 Oct 2011 15:32:50 -0400
Subject: [PATCH] ENH: implement multi-key joining. fairly naive impl for now

---
 pandas/core/common.py      |  6 +++---
 pandas/core/frame.py       |  8 +++++++-
 pandas/core/index.py       | 11 +++++++----
 pandas/core/internals.py   |  3 +--
 pandas/tests/test_frame.py | 30 ++++++++++++++++++++++++++++++
 5 files changed, 48 insertions(+), 10 deletions(-)

diff --git a/pandas/core/common.py b/pandas/core/common.py
index ea2aea1ef7e6b..fd2863735d0bf 100644
--- a/pandas/core/common.py
+++ b/pandas/core/common.py
@@ -514,11 +514,11 @@ def intersection(*seqs):
         result &= seq
     return type(seqs[0])(list(result))
 
-def _asarray_tuplesafe(values):
-    if not isinstance(values, (list, np.ndarray)):
+def _asarray_tuplesafe(values, dtype=None):
+    if not isinstance(values, (list, tuple, np.ndarray)):
         values = list(values)
 
-    result = np.asarray(values)
+    result = np.asarray(values, dtype=dtype)
 
     if issubclass(result.dtype.type, basestring):
         result = np.asarray(values, dtype=object)
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index b6f5154d66c92..125120d2ecb41 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -2219,7 +2219,13 @@ def _join_on(self, other, on, lsuffix, rsuffix):
         if len(other.index) == 0:
             return self
 
-        new_data = self._data.join_on(other._data, self[on], axis=1,
+        if isinstance(on, (list, tuple)):
+            join_key = zip(*[self[k] for k in on])
+            join_key = common._asarray_tuplesafe(join_key, dtype=object)
+        else:
+            join_key = np.asarray(self[on])
+
+        new_data = self._data.join_on(other._data, join_key, axis=1,
                                       lsuffix=lsuffix, rsuffix=rsuffix)
         return self._constructor(new_data)
 
diff --git a/pandas/core/index.py b/pandas/core/index.py
index 672736956c8d0..8335ec0429b43 100644
--- a/pandas/core/index.py
+++ b/pandas/core/index.py
@@ -49,10 +49,13 @@ def __new__(cls, data, dtype=None, copy=False, name=None):
                              'of some kind, %s was passed' % repr(data))
         else:
             # other iterable of some kind
-            if not isinstance(data, (list, tuple)):
-                data = list(data)
-            subarr = np.empty(len(data), dtype=object)
-            subarr[:] = data
+            subarr = _asarray_tuplesafe(data, dtype=object)
+
+            # if not isinstance(data, (list, tuple)):
+            #     data = list(data)
+
+            # subarr = np.empty(len(data), dtype=object)
+            # subarr[:] = data
 
         subarr = subarr.view(cls)
         subarr.name = name
diff --git a/pandas/core/internals.py b/pandas/core/internals.py
index 5f1d9c5be5035..a420cc1e615ff 100644
--- a/pandas/core/internals.py
+++ b/pandas/core/internals.py
@@ -732,8 +732,7 @@ def join_on(self, other, on, axis=1, lsuffix=None, rsuffix=None):
         this, other = self._maybe_rename_join(other, lsuffix, rsuffix)
 
         other_axis = other.axes[axis]
-        indexer = lib.merge_indexer_object(on.astype(object),
-                                           other_axis.indexMap)
+        indexer = other_axis.get_indexer(on)
 
         # TODO: deal with length-0 case? or does it fall out?
         mask = indexer == -1
diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
index 3f2ebfc6e9ddf..cf64e5a5d377f 100644
--- a/pandas/tests/test_frame.py
+++ b/pandas/tests/test_frame.py
@@ -2551,6 +2551,36 @@ def test_join_on(self):
         self.assertRaises(Exception, target.join, source,
                           on='C', how='left')
 
+    def test_join_on_multikey(self):
+        index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
+                                   ['one', 'two', 'three']],
+                           labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
+                                   [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
+                           names=['first', 'second'])
+        to_join = DataFrame(np.random.randn(10, 3), index=index,
+                            columns=['j_one', 'j_two', 'j_three'])
+
+        # a little relevant example with NAs
+        key1 = ['bar', 'bar', 'bar', 'foo', 'foo', 'baz', 'baz', 'qux',
+                'qux', 'snap']
+        key2 = ['two', 'one', 'three', 'one', 'two', 'one', 'two', 'two',
+                'three', 'one']
+
+        data = np.random.randn(len(key1))
+        data = DataFrame({'key1' : key1, 'key2' : key2,
+                          'data' : data})
+
+        joined = data.join(to_join, on=['key1', 'key2'])
+
+        join_key = Index(zip(key1, key2))
+        indexer = to_join.index.get_indexer(join_key)
+        ex_values = to_join.values.take(indexer, axis=0)
+        ex_values[indexer == -1] = np.nan
+        expected = data.join(DataFrame(ex_values, columns=to_join.columns))
+
+        # TODO: columns aren't in the same order yet
+        assert_frame_equal(joined, expected.ix[:, joined.columns])
+
     def test_join_index_mixed(self):
 
         df1 = DataFrame({'A' : 1., 'B' : 2, 'C' : 'foo', 'D' : True},