Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ARROW-1754: [Python] alternative fix for duplicate index/column name that preserves index name if available #1408

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 31 additions & 14 deletions python/pyarrow/pandas_compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,10 +179,8 @@ def get_column_metadata(column, name, arrow_type, field_name):
}


index_level_name = '__index_level_{:d}__'.format


def construct_metadata(df, column_names, index_levels, preserve_index, types):
def construct_metadata(df, column_names, index_levels, index_column_names,
preserve_index, types):
"""Returns a dictionary containing enough metadata to reconstruct a pandas
DataFrame as an Arrow Table, including index columns.

Expand All @@ -197,9 +195,8 @@ def construct_metadata(df, column_names, index_levels, preserve_index, types):
-------
dict
"""
ncolumns = len(column_names)
df_types = types[:ncolumns - len(index_levels)]
index_types = types[ncolumns - len(index_levels):]
df_types = types[:-len(index_levels)]
index_types = types[-len(index_levels):]

column_metadata = [
get_column_metadata(
Expand All @@ -213,9 +210,6 @@ def construct_metadata(df, column_names, index_levels, preserve_index, types):
]

if preserve_index:
index_column_names = list(map(
index_level_name, range(len(index_levels))
))
index_column_metadata = [
get_column_metadata(
level,
Expand Down Expand Up @@ -294,9 +288,29 @@ def _column_name_to_strings(name):
return str(name)


def _index_level_name(index, i, column_names):
"""Return the name of an index level or a default name if `index.name` is
None or is already a column name.

Parameters
----------
index : pandas.Index
i : int
column_names : list

Returns
-------
name : str
"""
if index.name is not None and index.name not in column_names:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we be concerned about the linear search in `index.name not in column_names`? If so, let's create a set outside the loop below that we can check, so that we don't need to do a full scan of the column names for every index column.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I did some timings, and conversion to a set typically takes twice the time of a single search in the list. So you need at least 3 index levels to benefit from this, and I don't think that is the typical use case.
So I would personally leave it as is, but I can easily add the suggestion if you prefer.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fine by me.

return index.name
else:
return '__index_level_{:d}__'.format(i)


def dataframe_to_arrays(df, schema, preserve_index, nthreads=1):
names = []
column_names = []
index_columns = []
index_column_names = []
type = None

if preserve_index:
Expand Down Expand Up @@ -324,12 +338,13 @@ def dataframe_to_arrays(df, schema, preserve_index, nthreads=1):

columns_to_convert.append(col)
convert_types.append(type)
names.append(name)
column_names.append(name)

for i, column in enumerate(index_columns):
columns_to_convert.append(column)
convert_types.append(None)
names.append(index_level_name(i))
name = _index_level_name(column, i, column_names)
index_column_names.append(name)

# NOTE(wesm): If nthreads=None, then we use a heuristic to decide whether
# using a thread pool is worth it. Currently the heuristic is whether the
Expand Down Expand Up @@ -358,8 +373,10 @@ def convert_column(col, ty):
types = [x.type for x in arrays]

metadata = construct_metadata(
df, names, index_columns, preserve_index, types
df, column_names, index_columns, index_column_names, preserve_index,
types
)
names = column_names + index_column_names
return names, arrays, metadata


Expand Down
5 changes: 3 additions & 2 deletions python/pyarrow/tests/test_convert_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,8 +191,9 @@ def test_index_metadata_field_name(self):
assert idx0['field_name'] == idx0_name
assert idx0['name'] is None

assert foo_name == '__index_level_1__'
assert foo['name'] == 'foo'
assert foo_name == 'foo'
assert foo['field_name'] == foo_name
assert foo['name'] == foo_name

def test_categorical_column_index(self):
df = pd.DataFrame(
Expand Down