From 97636ec932ca1620cbcf0ddcfc1600e95b99e237 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Sun, 23 Jul 2017 20:15:02 -0700 Subject: [PATCH] BUG: Thoroughly dedup columns in read_csv --- doc/source/whatsnew/v0.21.0.txt | 2 +- pandas/_libs/parsers.pyx | 13 ++++---- pandas/io/parsers.py | 10 +++++-- pandas/tests/io/parser/common.py | 19 ------------ pandas/tests/io/parser/mangle_dupes.py | 41 ++++++++++++++++++++++++++ pandas/tests/io/parser/test_parsers.py | 4 ++- 6 files changed, 61 insertions(+), 28 deletions(-) create mode 100644 pandas/tests/io/parser/mangle_dupes.py diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 096040bb85a10..ff194fb130516 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -263,11 +263,11 @@ Indexing I/O ^^^ +- Bug in :func:`read_csv` in which columns were not being thoroughly de-duplicated (:issue:`17060`) - Bug in :func:`read_csv` in which non integer values for the header argument generated an unhelpful / unrelated error message (:issue:`16338`) - Bug in :func:`read_csv` in which memory management issues in exception handling, under certain conditions, would cause the interpreter to segfault (:issue:`14696, :issue:`16798`). - Bug in :func:`read_csv` when called with ``low_memory=False`` in which a CSV with at least one column > 2GB in size would incorrectly raise a ``MemoryError`` (:issue:`16798`). - Bug in :func:`read_stata` where value labels could not be read when using an iterator (:issue:`16923`) - - Bug in :func:`read_html` where import check fails when run in multiple threads (:issue:`16928`) Plotting diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 9866eff3e5f31..543a943aea311 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -788,11 +788,14 @@ cdef class TextReader: unnamed_count += 1 count = counts.get(name, 0) - if (count > 0 and self.mangle_dupe_cols - and not self.has_mi_columns): - this_header.append('%s.%d' % (name, count)) - else: - this_header.append(name) + + if not self.has_mi_columns and self.mangle_dupe_cols: + while count > 0: + counts[name] = count + 1 + name = '%s.%d' % (name, count) + count = counts.get(name, 0) + + this_header.append(name) counts[name] = count + 1 if self.has_mi_columns: diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 1e7d9d420b35d..b0a13234782ec 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2331,10 +2331,16 @@ def _infer_columns(self): if not have_mi_columns and self.mangle_dupe_cols: counts = {} + for i, col in enumerate(this_columns): cur_count = counts.get(col, 0) - if cur_count > 0: - this_columns[i] = '%s.%d' % (col, cur_count) + + while cur_count > 0: + counts[col] = cur_count + 1 + col = "%s.%d" % (col, cur_count) + cur_count = counts.get(col, 0) + + this_columns[i] = col counts[col] = cur_count + 1 elif have_mi_columns: diff --git a/pandas/tests/io/parser/common.py b/pandas/tests/io/parser/common.py index 4d1f9936af983..91cf238391252 100644 --- a/pandas/tests/io/parser/common.py +++ b/pandas/tests/io/parser/common.py @@ -224,25 +224,6 @@ def test_unnamed_columns(self): Index(['A', 'B', 'C', 'Unnamed: 3', 'Unnamed: 4'])) - def test_duplicate_columns(self): - # TODO: add test for condition 'mangle_dupe_cols=False' - # once it is actually supported (gh-12935) - data = """A,A,B,B,B -1,2,3,4,5 -6,7,8,9,10 -11,12,13,14,15 -""" - - for method in ('read_csv', 'read_table'): - - # check default behavior - df = getattr(self, method)(StringIO(data), sep=',') - assert list(df.columns) == ['A', 'A.1', 'B', 'B.1', 'B.2'] - - df = getattr(self, method)(StringIO(data), sep=',', - mangle_dupe_cols=True) - assert list(df.columns) == ['A', 'A.1', 'B', 'B.1', 'B.2'] - def test_csv_mixed_type(self): data = """A,B,C a,1,2 diff --git a/pandas/tests/io/parser/mangle_dupes.py b/pandas/tests/io/parser/mangle_dupes.py new file mode 100644 index 0000000000000..ee3ba2ecf0810 --- /dev/null +++ b/pandas/tests/io/parser/mangle_dupes.py @@ -0,0 +1,41 @@ +# -*- coding: utf-8 -*- + +""" +Tests that duplicate columns are handled appropriately when parsed by the +CSV engine. They are either ignored or are thoroughly de-duplicated. +""" + +from pandas.compat import StringIO + + +class DupeColumnTests(object): + def test_basic(self): + # TODO: add test for condition "mangle_dupe_cols=False" + # once it is actually supported (gh-12935) + data = "a,a,b,b,b\n1,2,3,4,5" + + for method in ("read_csv", "read_table"): + # Check default behavior. + expected = ["a", "a.1", "b", "b.1", "b.2"] + df = getattr(self, method)(StringIO(data), sep=",") + assert list(df.columns) == expected + + df = getattr(self, method)(StringIO(data), sep=",", + mangle_dupe_cols=True) + assert list(df.columns) == expected + + def test_thorough_mangle(self): + # see gh-17060 + data = "a,a,a.1\n1,2,3" + df = self.read_csv(StringIO(data), sep=",", mangle_dupe_cols=True) + assert list(df.columns) == ["a", "a.1", "a.1.1"] + + data = "a,a,a.1,a.1.1,a.1.1.1,a.1.1.1.1\n1,2,3,4,5,6" + df = self.read_csv(StringIO(data), sep=",", mangle_dupe_cols=True) + assert list(df.columns) == ["a", "a.1", "a.1.1", "a.1.1.1", + "a.1.1.1.1", "a.1.1.1.1.1"] + + data = "a,a,a.3,a.1,a.2,a,a\n1,2,3,4,5,6,7" + df = self.read_csv(StringIO(data), sep=",", mangle_dupe_cols=True) + assert list(df.columns) == ["a", "a.1", "a.3", "a.1.1", + "a.2", "a.2.1", "a.3.1"] diff --git a/pandas/tests/io/parser/test_parsers.py b/pandas/tests/io/parser/test_parsers.py index 9bbc624dff90f..2fee2451c5e36 100644 --- a/pandas/tests/io/parser/test_parsers.py +++ b/pandas/tests/io/parser/test_parsers.py @@ -19,6 +19,7 @@ from .c_parser_only import CParserTests from .parse_dates import ParseDatesTests from .compression import CompressionTests +from .mangle_dupes import DupeColumnTests from .multithread import MultithreadTests from .python_parser_only import PythonParserTests from .dtypes import DtypeTests @@ -26,11 +27,12 @@ class BaseParser(CommentTests, CompressionTests, ConverterTests, DialectTests, + DtypeTests, DupeColumnTests, HeaderTests, IndexColTests, MultithreadTests, NAvaluesTests, ParseDatesTests, ParserTests, SkipRowsTests, UsecolsTests, - QuotingTests, DtypeTests): + QuotingTests): def read_csv(self, *args, **kwargs): raise NotImplementedError