diff --git a/RELEASE.rst b/RELEASE.rst index b88580bdf2aa1..c9508c6a47e73 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -39,6 +39,7 @@ pandas 0.9.0 **Improvements to existing features** + - Proper handling of NA values in merge operations (#1990) - Add ``flags`` option for ``re.compile`` in some Series.str methods (#1659) - Parsing of UTC date strings in read_* functions (#1693) - Handle generator input to Series (#1679) diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py index d64d79913b10f..4a50016c39927 100644 --- a/pandas/tools/merge.py +++ b/pandas/tools/merge.py @@ -142,7 +142,6 @@ def _merger(x, y): -# TODO: NA group handling # TODO: transformations?? # TODO: only copy DataFrames when modification necessary @@ -572,7 +571,16 @@ def _factorize_keys(lk, rk, sort=True): if sort: llab, rlab = _sort_labels(rizer.uniques, llab, rlab) - # TODO: na handling + # NA group + lmask = llab == -1; lany = lmask.any() + rmask = rlab == -1; rany = rmask.any() + + if lany or rany: + if lany: + np.putmask(llab, lmask, count) + if rany: + np.putmask(rlab, rmask, count) + count += 1 return llab, rlab, count diff --git a/pandas/tools/tests/test_merge.py b/pandas/tools/tests/test_merge.py index 829471deb9e6d..d7ad584e8f62f 100644 --- a/pandas/tools/tests/test_merge.py +++ b/pandas/tools/tests/test_merge.py @@ -777,6 +777,34 @@ def test_left_merge_na_buglet(self): expected = left.join(rdf) tm.assert_frame_equal(merged, expected) + def test_merge_na_keys(self): + data = [[1950, "A", 1.5], + [1950, "B", 1.5], + [1955, "B", 1.5], + [1960, "B", np.nan], + [1970, "B", 4.], + [1950, "C", 4.], + [1960, "C", np.nan], + [1965, "C", 3.], + [1970, "C", 4.]] + + frame = DataFrame(data, columns=["year", "panel", "data"]) + + other_data = [[1960, 'A', np.nan], + [1970, 'A', np.nan], + [1955, 'A', np.nan], + [1965, 'A', np.nan], + [1965, 'B', np.nan], + [1955, 'C', np.nan]] + other = DataFrame(other_data, columns=['year', 'panel', 'data']) + + result = frame.merge(other, how='outer') + + expected = frame.fillna(-999).merge(other.fillna(-999), how='outer') + expected = expected.replace(-999, np.nan) + + tm.assert_frame_equal(result, expected) + def _check_join(left, right, result, join_col, how='left', lsuffix='_x', rsuffix='_y'): diff --git a/setup.py b/setup.py index a91a3b63181f8..67cd8e8aad43e 100755 --- a/setup.py +++ b/setup.py @@ -372,8 +372,7 @@ def srcpath(name=None, suffix='.pyx', subdir='src'): algos_ext = Extension('pandas._algos', sources=[srcpath('generated', suffix=suffix)], - include_dirs=[np.get_include()], - ) + include_dirs=[np.get_include()]) lib_depends = tseries_depends + ['pandas/src/numpy_helper.h', 'pandas/src/datetime/np_datetime.h',