BUG: Thoroughly dedup columns in read_csv

pandas-dev · Jul 24, 2017 · 2ae3988 · 2ae3988
1 parent e7c10bb
commit 2ae3988
Show file tree

Hide file tree

Showing 6 changed files with 62 additions and 28 deletions.
diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt
@@ -263,11 +263,11 @@ Indexing
 I/O
 ^^^
 
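+- Bug in :func:`read_csv` in which duplicate column names were not thoroughly de-duplicated: a mangled name could collide with an existing column name (e.g. a header of ``a,a,a.1`` produced two ``a.1`` columns) (:issue:`17060`)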
 - Bug in :func:`read_csv` in which non-integer values for the header argument generated an unhelpful / unrelated error message (:issue:`16338`)
 - Bug in :func:`read_csv` in which memory management issues in exception handling, under certain conditions, would cause the interpreter to segfault (:issue:`14696`, :issue:`16798`).
 - Bug in :func:`read_csv` when called with ``low_memory=False`` in which a CSV with at least one column > 2GB in size would incorrectly raise a ``MemoryError`` (:issue:`16798`).
 - Bug in :func:`read_stata` where value labels could not be read when using an iterator (:issue:`16923`)
-
 - Bug in :func:`read_html` where import check fails when run in multiple threads (:issue:`16928`)
 
 Plotting

diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
@@ -788,11 +788,14 @@ cdef class TextReader:
                         unnamed_count += 1
 
                     count = counts.get(name, 0)
-                    if (count > 0 and self.mangle_dupe_cols
-                        and not self.has_mi_columns):
-                        this_header.append('%s.%d' % (name, count))
-                    else:
-                        this_header.append(name)
+
+                    if not self.has_mi_columns and self.mangle_dupe_cols:
+                        while count > 0:
+                            counts[name] = count + 1
+                            name = '%s.%d' % (name, count)
+                            count = counts.get(name, 0)
+
+                    this_header.append(name)
                     counts[name] = count + 1
 
                 if self.has_mi_columns:

diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -2331,10 +2331,16 @@ def _infer_columns(self):
 
                 if not have_mi_columns and self.mangle_dupe_cols:
                     counts = {}
+
                     for i, col in enumerate(this_columns):
                         cur_count = counts.get(col, 0)
-                        if cur_count > 0:
-                            this_columns[i] = '%s.%d' % (col, cur_count)
+
+                        while cur_count > 0:
+                            counts[col] = cur_count + 1
+                            col = "%s.%d" % (col, cur_count)
+                            cur_count = counts.get(col, 0)
+
+                        this_columns[i] = col
                         counts[col] = cur_count + 1
                 elif have_mi_columns:
 

diff --git a/pandas/tests/io/parser/common.py b/pandas/tests/io/parser/common.py
@@ -224,25 +224,6 @@ def test_unnamed_columns(self):
                               Index(['A', 'B', 'C', 'Unnamed: 3',
                                      'Unnamed: 4']))
 
-    def test_duplicate_columns(self):
-        # TODO: add test for condition 'mangle_dupe_cols=False'
-        # once it is actually supported (gh-12935)
-        data = """A,A,B,B,B
-1,2,3,4,5
-6,7,8,9,10
-11,12,13,14,15
-"""
-
-        for method in ('read_csv', 'read_table'):
-
-            # check default behavior
-            df = getattr(self, method)(StringIO(data), sep=',')
-            assert list(df.columns) == ['A', 'A.1', 'B', 'B.1', 'B.2']
-
-            df = getattr(self, method)(StringIO(data), sep=',',
-                                       mangle_dupe_cols=True)
-            assert list(df.columns) == ['A', 'A.1', 'B', 'B.1', 'B.2']
-
     def test_csv_mixed_type(self):
         data = """A,B,C
 a,1,2

diff --git a/pandas/tests/io/parser/mangle_dupes.py b/pandas/tests/io/parser/mangle_dupes.py
@@ -0,0 +1,42 @@
+# -*- coding: utf-8 -*-
+
+"""
+Tests that duplicate columns are handled appropriately when parsed by the
+CSV engine. In general, the expected result is that they are either thoroughly
+de-duplicated (if mangling requested) or ignored otherwise.
+"""
+
+from pandas.compat import StringIO
+
+
+class DupeColumnTests(object):
+    def test_basic(self):
+        # TODO: add test for condition "mangle_dupe_cols=False"
+        # once it is actually supported (gh-12935)
+        data = "a,a,b,b,b\n1,2,3,4,5"
+
+        for method in ("read_csv", "read_table"):
+            # Check default behavior.
+            expected = ["a", "a.1", "b", "b.1", "b.2"]
+            df = getattr(self, method)(StringIO(data), sep=",")
+            assert list(df.columns) == expected
+
+            df = getattr(self, method)(StringIO(data), sep=",",
+                                       mangle_dupe_cols=True)
+            assert list(df.columns) == expected
+
+    def test_thorough_mangle(self):
+        # see gh-17060
+        data = "a,a,a.1\n1,2,3"
+        df = self.read_csv(StringIO(data), sep=",", mangle_dupe_cols=True)
+        assert list(df.columns) == ["a", "a.1", "a.1.1"]
+
+        data = "a,a,a.1,a.1.1,a.1.1.1,a.1.1.1.1\n1,2,3,4,5,6"
+        df = self.read_csv(StringIO(data), sep=",", mangle_dupe_cols=True)
+        assert list(df.columns) == ["a", "a.1", "a.1.1", "a.1.1.1",
+                                    "a.1.1.1.1", "a.1.1.1.1.1"]
+
+        data = "a,a,a.3,a.1,a.2,a,a\n1,2,3,4,5,6,7"
+        df = self.read_csv(StringIO(data), sep=",", mangle_dupe_cols=True)
+        assert list(df.columns) == ["a", "a.1", "a.3", "a.1.1",
+                                    "a.2", "a.2.1", "a.3.1"]
diff --git a/pandas/tests/io/parser/test_parsers.py b/pandas/tests/io/parser/test_parsers.py
@@ -19,18 +19,20 @@
 from .c_parser_only import CParserTests
 from .parse_dates import ParseDatesTests
 from .compression import CompressionTests
+from .mangle_dupes import DupeColumnTests
 from .multithread import MultithreadTests
 from .python_parser_only import PythonParserTests
 from .dtypes import DtypeTests
 
 
 class BaseParser(CommentTests, CompressionTests,
                  ConverterTests, DialectTests,
+                 DtypeTests, DupeColumnTests,
                  HeaderTests, IndexColTests,
                  MultithreadTests, NAvaluesTests,
                  ParseDatesTests, ParserTests,
                  SkipRowsTests, UsecolsTests,
-                 QuotingTests, DtypeTests):
+                 QuotingTests):
 
     def read_csv(self, *args, **kwargs):
         raise NotImplementedError