Merge pull request #3978 from jreback/parser_iterator
BUG (GH3967) csv parsers would loop infinitely if iterator=True but no chunksize specified
jreback committed Jun 21, 2013
2 parents 40064ec + 79e81a0 commit 78a71b1
Showing 3 changed files with 70 additions and 36 deletions.
2 changes: 2 additions & 0 deletions doc/source/release.rst
@@ -258,6 +258,8 @@ pandas 0.11.1
- Fixed ``__truediv__`` in Python 2.7 with ``numexpr`` installed to actually do true division when dividing
two integer arrays with at least 10000 cells total (:issue:`3764`)
- Indexing with a string with seconds resolution not selecting from a time index (:issue:`3925`)
+ - csv parsers would loop infinitely if ``iterator=True`` but no ``chunksize`` was
+   specified (:issue:`3967`); the python parser was also failing with ``chunksize=1``

.. _Gh3616: https://github.com/pydata/pandas/issues/3616

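For context, a minimal sketch of the behaviour this commit establishes, mirroring the new test added below. The data string and variable names are illustrative, and the snippet assumes a pandas build that includes this fix (written in modern Python with io.StringIO; the test file itself uses the Python 2 StringIO import already present there):

    from io import StringIO

    import pandas as pd

    data = """A,B,C
    foo,1,2,3
    bar,4,5,6
    baz,7,8,9
    """

    # iterator=True with no chunksize: before this fix list(reader) never
    # terminated; with it, the reader yields the whole frame exactly once.
    reader = pd.read_csv(StringIO(data), iterator=True)
    chunks = list(reader)
    print(len(chunks))              # 1
    print(chunks[0].shape)          # (3, 3) -- the extra leading field becomes the index

    # chunksize=1: one single-row DataFrame per iteration (python engine included).
    reader = pd.read_csv(StringIO(data), chunksize=1)
    pieces = list(reader)
    print(len(pieces))              # 3
    print(pd.concat(pieces).shape)  # (3, 3)
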
86 changes: 50 additions & 36 deletions pandas/io/parsers.py
@@ -186,7 +186,7 @@ def _read(filepath_or_buffer, kwds):
kwds['parse_dates'] = True

# Extract some of the arguments (pass chunksize on).
-    iterator = kwds.pop('iterator', False)
+    iterator = kwds.get('iterator', False)
nrows = kwds.pop('nrows', None)
chunksize = kwds.get('chunksize', None)
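
The one-line change above swaps dict.pop for dict.get when reading the iterator flag. A minimal sketch of the difference it relies on, using plain dict behaviour (nothing pandas-specific): get reads the value but leaves the key in kwds for any later consumer of the keyword dict, whereas pop removes it.

    # Minimal illustration (not pandas code) of dict.pop vs dict.get:
    # pop() removes the key from the mapping, get() leaves it in place.

    kwds = {'iterator': True, 'nrows': None, 'chunksize': None}

    via_pop = dict(kwds)
    iterator = via_pop.pop('iterator', False)
    print(iterator, 'iterator' in via_pop)   # True False -- key consumed

    via_get = dict(kwds)
    iterator = via_get.get('iterator', False)
    print(iterator, 'iterator' in via_get)   # True True -- key still available
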

@@ -569,8 +569,11 @@ def _clean_options(self, options, engine):

def __iter__(self):
try:
-            while True:
-                yield self.read(self.chunksize)
+            if self.chunksize:
+                while True:
+                    yield self.read(self.chunksize)
+            else:
+                yield self.read()
except StopIteration:
pass
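
A standalone sketch of the iteration protocol fixed above; ToyReader and its read() semantics are invented for illustration, not pandas code, and the failure mode shown is a plausible reconstruction: if a full read() returns an empty result instead of raising StopIteration, the old unconditional "while True: yield self.read(self.chunksize)" never terminates, whereas the chunksize guard yields the full result exactly once.

    class ToyReader(object):
        def __init__(self, rows, chunksize=None):
            self.rows = list(rows)
            self.chunksize = chunksize
            self.pos = 0

        def read(self, nrows=None):
            if nrows is None:                  # full read: return whatever is left
                out, self.pos = self.rows[self.pos:], len(self.rows)
                return out                     # empty list when exhausted, never raises
            if self.pos >= len(self.rows):     # sized read past the end
                raise StopIteration
            out = self.rows[self.pos:self.pos + nrows]
            self.pos += nrows
            return out

        def __iter__(self):                    # mirrors the fixed __iter__ above
            try:
                if self.chunksize:
                    while True:
                        yield self.read(self.chunksize)
                else:
                    yield self.read()
            except StopIteration:
                pass

    print(list(ToyReader('abcdef', chunksize=2)))  # [['a', 'b'], ['c', 'd'], ['e', 'f']]
    print(list(ToyReader('abcdef')))               # [['a', 'b', 'c', 'd', 'e', 'f']]
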

@@ -1594,47 +1597,58 @@ def _rows_to_cols(self, content):
    def _get_lines(self, rows=None):
        source = self.data
        lines = self.buf
+        new_rows = None

        # already fetched some number
        if rows is not None:
-            rows -= len(self.buf)
-
-        if isinstance(source, list):
-            if self.pos > len(source):
-                raise StopIteration
-            if rows is None:
-                lines.extend(source[self.pos:])
-                self.pos = len(source)
-            else:
-                lines.extend(source[self.pos:self.pos + rows])
-                self.pos += rows
-        else:
-            new_rows = []
-            try:
-                if rows is not None:
-                    for _ in xrange(rows):
-                        new_rows.append(next(source))
-                    lines.extend(new_rows)
-                else:
-                    rows = 0
-                    while True:
-                        try:
-                            new_rows.append(next(source))
-                            rows += 1
-                        except csv.Error, inst:
-                            if 'newline inside string' in str(inst):
-                                row_num = str(self.pos + rows)
-                                msg = ('EOF inside string starting with line '
-                                       + row_num)
-                                raise Exception(msg)
-                            raise
-            except StopIteration:
-                lines.extend(new_rows)
-                if len(lines) == 0:
-                    raise
-            self.pos += len(new_rows)
-
-        self.buf = []
+
+            # we already have the lines in the buffer
+            if len(self.buf) >= rows:
+                new_rows, self.buf = self.buf[:rows], self.buf[rows:]
+
+            # need some lines
+            else:
+                rows -= len(self.buf)
+
+        if new_rows is None:
+            if isinstance(source, list):
+                if self.pos > len(source):
+                    raise StopIteration
+                if rows is None:
+                    lines.extend(source[self.pos:])
+                    self.pos = len(source)
+                else:
+                    lines.extend(source[self.pos:self.pos + rows])
+                    self.pos += rows
+            else:
+                new_rows = []
+                try:
+                    if rows is not None:
+                        for _ in xrange(rows):
+                            new_rows.append(next(source))
+                        lines.extend(new_rows)
+                    else:
+                        rows = 0
+                        while True:
+                            try:
+                                new_rows.append(next(source))
+                                rows += 1
+                            except csv.Error, inst:
+                                if 'newline inside string' in str(inst):
+                                    row_num = str(self.pos + rows)
+                                    msg = ('EOF inside string starting with line '
+                                           + row_num)
+                                    raise Exception(msg)
+                                raise
+                except StopIteration:
+                    lines.extend(new_rows)
+                    if len(lines) == 0:
+                        raise
+                self.pos += len(new_rows)
+
+            self.buf = []
+        else:
+            lines = new_rows

if self.skip_footer:
lines = lines[:-self.skip_footer]
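
The _get_lines change above makes the python parser serve a request from self.buf whenever enough lines are already buffered (the case chunksize=1 is likely to exercise), and only fall back to the underlying source otherwise. A rough standalone sketch of that buffer-first idea; take_rows is a hypothetical helper written for illustration, not pandas API:

    # If the buffer already holds enough lines, slice them off and leave the
    # source untouched; otherwise drain the buffer and pull the remainder
    # from the source iterator.

    def take_rows(buf, source, rows):
        """Return (lines, remaining_buffer), preferring already-buffered lines."""
        if len(buf) >= rows:
            return buf[:rows], buf[rows:]
        lines, need = list(buf), rows - len(buf)
        for _ in range(need):
            try:
                lines.append(next(source))
            except StopIteration:
                break
        return lines, []

    source = iter(['a,1', 'b,2', 'c,3'])
    lines, buf = take_rows(['x,0'], source, 1)   # served entirely from the buffer
    print(lines, buf)                            # ['x,0'] []
    lines, buf = take_rows(buf, source, 2)       # buffer empty: read from the source
    print(lines, buf)                            # ['a,1', 'b,2'] []
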
18 changes: 18 additions & 0 deletions pandas/io/tests/test_parsers.py
@@ -1037,6 +1037,24 @@ def test_iterator(self):
iterator=True)
self.assert_(isinstance(treader, TextFileReader))

+        # stopping iteration when no chunksize is specified, GH 3967
+        data = """A,B,C
+foo,1,2,3
+bar,4,5,6
+baz,7,8,9
+"""
+        reader = self.read_csv(StringIO(data), iterator=True)
+        result = list(reader)
+        expected = DataFrame(dict(A = [1,4,7], B = [2,5,8], C = [3,6,9]), index=['foo','bar','baz'])
+        tm.assert_frame_equal(result[0], expected)
+
+        # chunksize = 1
+        reader = self.read_csv(StringIO(data), chunksize=1)
+        result = list(reader)
+        expected = DataFrame(dict(A = [1,4,7], B = [2,5,8], C = [3,6,9]), index=['foo','bar','baz'])
+        self.assert_(len(result) == 3)
+        tm.assert_frame_equal(pd.concat(result), expected)
+
def test_header_not_first_line(self):
data = """got,to,ignore,this,line
got,to,ignore,this,line
