Skip to content

Commit

Permalink
ARROW-1754: [Python] alternative fix for duplicate index/column name …
Browse files Browse the repository at this point in the history
…that preserves index name if available
  • Loading branch information
jorisvandenbossche committed Dec 10, 2017
1 parent 501d60e commit f8e5b79
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 16 deletions.
45 changes: 31 additions & 14 deletions python/pyarrow/pandas_compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,10 +179,8 @@ def get_column_metadata(column, name, arrow_type, field_name):
}


index_level_name = '__index_level_{:d}__'.format


def construct_metadata(df, column_names, index_levels, preserve_index, types):
def construct_metadata(df, column_names, index_levels, index_column_names,
preserve_index, types):
"""Returns a dictionary containing enough metadata to reconstruct a pandas
DataFrame as an Arrow Table, including index columns.
Expand All @@ -197,9 +195,8 @@ def construct_metadata(df, column_names, index_levels, preserve_index, types):
-------
dict
"""
ncolumns = len(column_names)
df_types = types[:ncolumns - len(index_levels)]
index_types = types[ncolumns - len(index_levels):]
df_types = types[:-len(index_levels)]
index_types = types[-len(index_levels):]

column_metadata = [
get_column_metadata(
Expand All @@ -213,9 +210,6 @@ def construct_metadata(df, column_names, index_levels, preserve_index, types):
]

if preserve_index:
index_column_names = list(map(
index_level_name, range(len(index_levels))
))
index_column_metadata = [
get_column_metadata(
level,
Expand Down Expand Up @@ -294,9 +288,29 @@ def _column_name_to_strings(name):
return str(name)


def _index_level_name(index, i, column_names):
"""Return the name of an index level or a default name if `index.name` is
None or is already a column name.
Parameters
----------
index : pandas.Index
i : int
Returns
-------
name : str
"""
if index.name is not None and index.name not in column_names:
return index.name
else:
return '__index_level_{:d}__'.format(i)


def dataframe_to_arrays(df, schema, preserve_index, nthreads=1):
names = []
column_names = []
index_columns = []
index_column_names = []
type = None

if preserve_index:
Expand Down Expand Up @@ -324,12 +338,13 @@ def dataframe_to_arrays(df, schema, preserve_index, nthreads=1):

columns_to_convert.append(col)
convert_types.append(type)
names.append(name)
column_names.append(name)

for i, column in enumerate(index_columns):
columns_to_convert.append(column)
convert_types.append(None)
names.append(index_level_name(i))
name = _index_level_name(column, i, column_names)
index_column_names.append(name)

# NOTE(wesm): If nthreads=None, then we use a heuristic to decide whether
# using a thread pool is worth it. Currently the heuristic is whether the
Expand Down Expand Up @@ -358,8 +373,10 @@ def convert_column(col, ty):
types = [x.type for x in arrays]

metadata = construct_metadata(
df, names, index_columns, preserve_index, types
df, column_names, index_columns, index_column_names, preserve_index,
types
)
names = column_names + index_column_names
return names, arrays, metadata


Expand Down
5 changes: 3 additions & 2 deletions python/pyarrow/tests/test_convert_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,8 +191,9 @@ def test_index_metadata_field_name(self):
assert idx0['field_name'] == idx0_name
assert idx0['name'] is None

assert foo_name == '__index_level_1__'
assert foo['name'] == 'foo'
assert foo_name == 'foo'
assert foo['field_name'] == foo_name
assert foo['name'] == foo_name

def test_categorical_column_index(self):
df = pd.DataFrame(
Expand Down

0 comments on commit f8e5b79

Please sign in to comment.