From 1db72776ab6b72d8d45d0c9e27cef851f303113e Mon Sep 17 00:00:00 2001 From: Scott Gigante Date: Mon, 20 Nov 2023 14:30:14 -0500 Subject: [PATCH] Avoid loading file into memory in read_csv agate.from_csv reads the entire csv file into memory _twice_ before actually creating a Table. This PR keeps things as iterators so that the file is only loaded line by line as we create the table, unless `sniff_limit` is None in which case the file is loaded into memory upfront. --- agate/table/from_csv.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/agate/table/from_csv.py b/agate/table/from_csv.py index 3b1ac074..5b9d10c2 100644 --- a/agate/table/from_csv.py +++ b/agate/table/from_csv.py @@ -59,14 +59,18 @@ def from_csv(cls, path, column_names=None, column_types=None, row_names=None, sk else: raise ValueError('skip_lines argument must be an int') - contents = StringIO(f.read()) + handle = f if sniff_limit is None: - kwargs['dialect'] = csv.Sniffer().sniff(contents.getvalue()) + # avoid reading the file twice + handle = StringIO(f.read()) + kwargs['dialect'] = csv.Sniffer().sniff(handle.getvalue()) elif sniff_limit > 0: - kwargs['dialect'] = csv.Sniffer().sniff(contents.getvalue()[:sniff_limit]) + kwargs['dialect'] = csv.Sniffer().sniff(f.read(sniff_limit)) + # return to the start of the file + f.seek(0) - reader = csv.reader(contents, header=header, **kwargs) + reader = csv.reader(f, header=header, **kwargs) if header: if column_names is None: @@ -75,9 +79,9 @@ def from_csv(cls, path, column_names=None, column_types=None, row_names=None, sk next(reader) if row_limit is None: - rows = tuple(reader) + rows = reader else: - rows = tuple(itertools.islice(reader, row_limit)) + rows = itertools.islice(reader, row_limit) finally: if close: