From 5c5bb46068f58b8285fff2bb04a0091db58d2e39 Mon Sep 17 00:00:00 2001 From: Pietro Battiston Date: Mon, 28 Aug 2017 23:55:45 +0200 Subject: [PATCH] BUG: make order of index from pd.concat deterministic closes #17344 --- doc/source/whatsnew/v0.21.0.txt | 1 + pandas/core/common.py | 14 ++++++++++++++ pandas/core/indexes/api.py | 9 ++------- pandas/tests/reshape/test_concat.py | 13 ++++++++++++- 4 files changed, 29 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index fcadd26156b1d4..4fbe289f25acf1 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -405,6 +405,7 @@ Reshaping - Bug in :func:`crosstab` where passing two ``Series`` with the same name raised a ``KeyError`` (:issue:`13279`) - :func:`Series.argmin`, :func:`Series.argmax`, and their counterparts on ``DataFrame`` and groupby objects work correctly with floating point data that contains infinite values (:issue:`13595`). - Bug in :func:`unique` where checking a tuple of strings raised a ``TypeError`` (:issue:`17108`) +- Bug in :func:`concat` which would randomly determine the order of the index along the common dimension (:issue:`17344`) Numeric ^^^^^^^ diff --git a/pandas/core/common.py b/pandas/core/common.py index 44cb36b8a32076..515a4010961205 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -629,3 +629,17 @@ def _random_state(state=None): else: raise ValueError("random_state must be an integer, a numpy " "RandomState, or None") + + +def _get_distinct_objs(objs): + """ + Return a list with distinct elements of "objs" (different ids). + Preserves order. + """ + ids = set() + res = [] + for obj in objs: + if not id(obj) in ids: + ids.add(id(obj)) + res.append(obj) + return res diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index db73a6878258ad..323d50166e7b6f 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -23,8 +23,7 @@ 'PeriodIndex', 'DatetimeIndex', '_new_Index', 'NaT', '_ensure_index', '_get_na_value', '_get_combined_index', - '_get_objs_combined_axis', - '_get_distinct_indexes', '_union_indexes', + '_get_objs_combined_axis', '_union_indexes', '_get_consensus_names', '_all_indexes_same'] @@ -41,7 +40,7 @@ def _get_objs_combined_axis(objs, intersect=False, axis=0): def _get_combined_index(indexes, intersect=False): # TODO: handle index names! - indexes = _get_distinct_indexes(indexes) + indexes = com._get_distinct_objs(indexes) if len(indexes) == 0: return Index([]) if len(indexes) == 1: @@ -55,10 +54,6 @@ def _get_combined_index(indexes, intersect=False): return _ensure_index(union) -def _get_distinct_indexes(indexes): - return list(dict((id(x), x) for x in indexes).values()) - - def _union_indexes(indexes): if len(indexes) == 0: raise AssertionError('Must have at least 1 Index to union') diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index 52cd18126859a1..6e646f9b294429 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -5,7 +5,7 @@ from numpy.random import randn from datetime import datetime -from pandas.compat import StringIO, iteritems +from pandas.compat import StringIO, iteritems, PY2 import pandas as pd from pandas import (DataFrame, concat, read_csv, isna, Series, date_range, @@ -1944,6 +1944,17 @@ def test_concat_categoricalindex(self): index=exp_idx) tm.assert_frame_equal(result, exp) + def test_concat_order(self): + # GH 17344 + dfs = [pd.DataFrame(index=range(3), columns=['a', 1, None])] + dfs += [pd.DataFrame(index=range(3), columns=[None, 1, 'a']) + for i in range(100)] + result = pd.concat(dfs).columns + expected = dfs[0].columns + if PY2: + expected = expected.sort_values() + tm.assert_index_equal(result, expected) + @pytest.mark.parametrize('pdt', [pd.Series, pd.DataFrame, pd.Panel]) @pytest.mark.parametrize('dt', np.sctypes['float'])