Skip to content

Commit

Permalink
Avoid loading file into memory in read_csv
Browse files Browse the repository at this point in the history
agate.from_csv reads the entire csv file into memory _twice_ before actually creating a Table. This PR keeps things as iterators so that the file is only loaded line by line as we create the table, unless `sniff_limit` is None, in which case the file is loaded into memory upfront.
  • Loading branch information
scottgigante committed Nov 20, 2023
1 parent 9e79a33 commit 1db7277
Showing 1 changed file with 10 additions and 6 deletions.
16 changes: 10 additions & 6 deletions agate/table/from_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,14 +59,18 @@ def from_csv(cls, path, column_names=None, column_types=None, row_names=None, sk
else:
raise ValueError('skip_lines argument must be an int')

contents = StringIO(f.read())
handle = f

if sniff_limit is None:
kwargs['dialect'] = csv.Sniffer().sniff(contents.getvalue())
# avoid reading the file twice
handle = StringIO(f.read())
kwargs['dialect'] = csv.Sniffer().sniff(handle.getvalue())
elif sniff_limit > 0:
kwargs['dialect'] = csv.Sniffer().sniff(contents.getvalue()[:sniff_limit])
kwargs['dialect'] = csv.Sniffer().sniff(f.read(sniff_limit))
# return to the start of the file
f.seek(0)

reader = csv.reader(contents, header=header, **kwargs)
reader = csv.reader(f, header=header, **kwargs)

if header:
if column_names is None:
Expand All @@ -75,9 +79,9 @@ def from_csv(cls, path, column_names=None, column_types=None, row_names=None, sk
next(reader)

if row_limit is None:
rows = tuple(reader)
rows = reader
else:
rows = tuple(itertools.islice(reader, row_limit))
rows = itertools.islice(reader, row_limit)

finally:
if close:
Expand Down

0 comments on commit 1db7277

Please sign in to comment.