Skip to content

Commit

Permalink
BUG/ENH: treat NA as additional group in merge operations. close #1990
Browse files Browse the repository at this point in the history
  • Loading branch information
wesm committed Sep 29, 2012
1 parent eb48812 commit f82d931
Show file tree
Hide file tree
Showing 4 changed files with 40 additions and 4 deletions.
1 change: 1 addition & 0 deletions RELEASE.rst
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ pandas 0.9.0

**Improvements to existing features**

- Proper handling of NA values in merge keys: NA is now treated as an additional group in merge operations (#1990)
- Add ``flags`` option for ``re.compile`` in some Series.str methods (#1659)
- Parsing of UTC date strings in read_* functions (#1693)
- Handle generator input to Series (#1679)
Expand Down
12 changes: 10 additions & 2 deletions pandas/tools/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,6 @@ def _merger(x, y):



# TODO: NA group handling
# TODO: transformations??
# TODO: only copy DataFrames when modification necessary

Expand Down Expand Up @@ -572,7 +571,16 @@ def _factorize_keys(lk, rk, sort=True):
if sort:
llab, rlab = _sort_labels(rizer.uniques, llab, rlab)

# TODO: na handling
# NA group
lmask = llab == -1; lany = lmask.any()
rmask = rlab == -1; rany = rmask.any()

if lany or rany:
if lany:
np.putmask(llab, lmask, count)
if rany:
np.putmask(rlab, rmask, count)
count += 1

return llab, rlab, count

Expand Down
28 changes: 28 additions & 0 deletions pandas/tools/tests/test_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -777,6 +777,34 @@ def test_left_merge_na_buglet(self):
expected = left.join(rdf)
tm.assert_frame_equal(merged, expected)

def test_merge_na_keys(self):
    """Outer-merge two frames whose shared columns contain NaN.

    Both frames carry the same three columns and no ``on=`` is passed, so
    the merge keys are whatever columns the frames have in common. The
    reference result is built by swapping NaN for the sentinel -999 before
    merging and swapping it back afterwards; merging the raw frames must
    produce the same frame.
    """
    columns = ["year", "panel", "data"]
    nan = np.nan

    # Left side: a mix of real values and NaN in the "data" column.
    left = DataFrame([[1950, "A", 1.5],
                      [1950, "B", 1.5],
                      [1955, "B", 1.5],
                      [1960, "B", nan],
                      [1970, "B", 4.],
                      [1950, "C", 4.],
                      [1960, "C", nan],
                      [1965, "C", 3.],
                      [1970, "C", 4.]],
                     columns=columns)

    # Right side: every row has NaN in the "data" column.
    right_keys = [(1960, 'A'),
                  (1970, 'A'),
                  (1955, 'A'),
                  (1965, 'A'),
                  (1965, 'B'),
                  (1955, 'C')]
    right = DataFrame([[year, panel, nan] for year, panel in right_keys],
                      columns=columns)

    merged = left.merge(right, how='outer')

    # Same merge with NaN replaced by a sentinel value, then restored.
    sentinel = -999
    reference = left.fillna(sentinel).merge(right.fillna(sentinel),
                                            how='outer')
    reference = reference.replace(sentinel, nan)

    tm.assert_frame_equal(merged, reference)


def _check_join(left, right, result, join_col, how='left',
lsuffix='_x', rsuffix='_y'):
Expand Down
3 changes: 1 addition & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -372,8 +372,7 @@ def srcpath(name=None, suffix='.pyx', subdir='src'):

algos_ext = Extension('pandas._algos',
sources=[srcpath('generated', suffix=suffix)],
include_dirs=[np.get_include()],
)
include_dirs=[np.get_include()])

lib_depends = tseries_depends + ['pandas/src/numpy_helper.h',
'pandas/src/datetime/np_datetime.h',
Expand Down

0 comments on commit f82d931

Please sign in to comment.