Skip to content

Commit

Permalink
Avoid loading file into memory in read_csv
Browse files Browse the repository at this point in the history
agate.from_csv reads the entire csv file into memory _twice_ before actually creating a Table. This PR keeps things as iterators so that the file is only loaded line by line as we create the table, unless `sniff_limit` is None, in which case the file is loaded into memory upfront.
  • Loading branch information
scottgigante committed Nov 20, 2023
1 parent 9e79a33 commit 1db7277
Showing 1 changed file with 10 additions and 6 deletions.
16 changes: 10 additions & 6 deletions agate/table/from_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,14 +59,18 @@ def from_csv(cls, path, column_names=None, column_types=None, row_names=None, sk
else:
raise ValueError('skip_lines argument must be an int')

contents = StringIO(f.read())
handle = f

if sniff_limit is None:
kwargs['dialect'] = csv.Sniffer().sniff(contents.getvalue())
# avoid reading the file twice
handle = StringIO(f.read())
kwargs['dialect'] = csv.Sniffer().sniff(handle.getvalue())
elif sniff_limit > 0:
kwargs['dialect'] = csv.Sniffer().sniff(contents.getvalue()[:sniff_limit])
kwargs['dialect'] = csv.Sniffer().sniff(f.read(sniff_limit))
# return to the start of the file
f.seek(0)

reader = csv.reader(contents, header=header, **kwargs)
reader = csv.reader(f, header=header, **kwargs)

if header:
if column_names is None:
Expand All @@ -75,9 +79,9 @@ def from_csv(cls, path, column_names=None, column_types=None, row_names=None, sk
next(reader)

if row_limit is None:
rows = tuple(reader)
rows = reader
else:
rows = tuple(itertools.islice(reader, row_limit))
rows = itertools.islice(reader, row_limit)

finally:
if close:
Expand Down

0 comments on commit 1db7277

Please sign in to comment.