Skip to content

Commit

Permalink
BUG: Thoroughly dedup column names in read_csv
Browse files Browse the repository at this point in the history
  • Loading branch information
gfyoung committed Jul 27, 2017
1 parent c6e5bf6 commit 8b55754
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 3 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.21.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -278,6 +278,7 @@ I/O
^^^

- Bug in :func:`read_csv` in which columns were not being thoroughly de-duplicated (:issue:`17060`)
- Bug in :func:`read_csv` in which specified column names were not being thoroughly de-duplicated (:issue:`17095`)
- Bug in :func:`read_csv` in which non-integer values for the header argument generated an unhelpful / unrelated error message (:issue:`16338`)
- Bug in :func:`read_csv` in which memory management issues in exception handling, under certain conditions, would cause the interpreter to segfault (:issue:`14696`, :issue:`16798`).
- Bug in :func:`read_csv` when called with ``low_memory=False`` in which a CSV with at least one column > 2GB in size would incorrectly raise a ``MemoryError`` (:issue:`16798`).
Expand Down
7 changes: 5 additions & 2 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1323,9 +1323,12 @@ def _maybe_dedup_names(self, names):
for i, col in enumerate(names):
cur_count = counts.get(col, 0)

if cur_count > 0:
names[i] = '%s.%d' % (col, cur_count)
while cur_count > 0:
counts[col] = cur_count + 1
col = '%s.%d' % (col, cur_count)
cur_count = counts.get(col, 0)

names[i] = col
counts[col] = cur_count + 1

return names
Expand Down
24 changes: 23 additions & 1 deletion pandas/tests/io/parser/mangle_dupes.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def test_basic(self):
mangle_dupe_cols=True)
assert list(df.columns) == expected

def test_thorough_mangle(self):
def test_thorough_mangle_columns(self):
# see gh-17060
data = "a,a,a.1\n1,2,3"
df = self.read_csv(StringIO(data), sep=",", mangle_dupe_cols=True)
Expand All @@ -40,3 +40,25 @@ def test_thorough_mangle(self):
df = self.read_csv(StringIO(data), sep=",", mangle_dupe_cols=True)
assert list(df.columns) == ["a", "a.1", "a.3", "a.1.1",
"a.2", "a.2.1", "a.3.1"]

def test_thorough_mangle_names(self):
    """Check that user-supplied ``names`` are fully de-duplicated.

    Each scenario feeds ``self.read_csv`` a small CSV together with an
    explicit ``names`` list containing duplicates (including entries that
    already look like mangled names, e.g. ``"a.1"``) and asserts the exact
    column labels produced when ``mangle_dupe_cols=True``.
    """
    # see gh-17095
    # Each entry is (csv text, names passed in, expected resulting columns).
    scenarios = [
        # A duplicate whose mangled form collides with a later given name.
        ("a,b,b\n1,2,3",
         ["a", "a", "a.1"],
         ["a", "a.1", "a.1.1"]),
        # A chain where every mangled name collides with the next entry.
        ("a,b,c,d,e,f\n1,2,3,4,5,6",
         ["a", "a", "a.1", "a.1.1", "a.1.1.1", "a.1.1.1.1"],
         ["a", "a.1", "a.1.1", "a.1.1.1",
          "a.1.1.1.1", "a.1.1.1.1.1"]),
        # Repeats of "a" interleaved with pre-suffixed names.
        ("a,b,c,d,e,f,g\n1,2,3,4,5,6,7",
         ["a", "a", "a.3", "a.1", "a.2", "a", "a"],
         ["a", "a.1", "a.3", "a.1.1",
          "a.2", "a.2.1", "a.3.1"]),
    ]

    for data, names, expected in scenarios:
        df = self.read_csv(StringIO(data), sep=",", names=names,
                           mangle_dupe_cols=True)
        assert list(df.columns) == expected

0 comments on commit 8b55754

Please sign in to comment.