Expose 'read_only' parameter for 'import_set' and 'import_book' (#483)

jazzband · Dec 4, 2020 · e8f5481 · e8f5481
1 parent e877404
commit e8f5481
Show file tree

Hide file tree

Showing 5 changed files with 21 additions and 4 deletions.
diff --git a/HISTORY.md b/HISTORY.md
@@ -9,6 +9,7 @@
 ### Improvements
 
 - Added Python 3.9 support
+- Added read_only option to xlsx file reader (#482).
 
 ### Bugfixes
 

diff --git a/docs/formats.rst b/docs/formats.rst
@@ -206,6 +206,15 @@ Import/export data in Excel 07+ Spreadsheet representation.
 This format is optional, install Tablib with ``pip install "tablib[xlsx]"`` to
 make the format available.
 
+The ``import_set()`` and ``import_book()`` methods accept a keyword
+argument ``read_only``.  If its value is ``True`` (the default), the
+XLSX data source is read lazily.  Lazy reading generally reduces time
+and memory consumption, especially for large spreadsheets.  However,
+it relies on the XLSX data source declaring correct dimensions.  Some
+programs generate XLSX files with incorrect dimensions.  Such files
+may need to be loaded with this optimization turned off by passing
+``read_only=False``.
+
 .. note::
 
     When reading an ``xlsx`` file containing formulas in its cells, Tablib will

diff --git a/src/tablib/formats/_xlsx.py b/src/tablib/formats/_xlsx.py
@@ -59,12 +59,12 @@ def export_book(cls, databook, freeze_panes=True):
         return stream.getvalue()
 
     @classmethod
-    def import_set(cls, dset, in_stream, headers=True):
+    def import_set(cls, dset, in_stream, headers=True, read_only=True):
         """Returns databook from XLS stream."""
 
         dset.wipe()
 
-        xls_book = load_workbook(in_stream, read_only=True, data_only=True)
+        xls_book = load_workbook(in_stream, read_only=read_only, data_only=True)
         sheet = xls_book.active
 
         dset.title = sheet.title
@@ -77,12 +77,12 @@ def import_set(cls, dset, in_stream, headers=True):
                 dset.append(row_vals)
 
     @classmethod
-    def import_book(cls, dbook, in_stream, headers=True):
+    def import_book(cls, dbook, in_stream, headers=True, read_only=True):
         """Returns databook from XLS stream."""
 
         dbook.wipe()
 
-        xls_book = load_workbook(in_stream, read_only=True, data_only=True)
+        xls_book = load_workbook(in_stream, read_only=read_only, data_only=True)
 
         for sheet in xls_book.worksheets:
             data = tablib.Dataset()

diff --git a/tests/files/bad_dimensions.xlsx b/tests/files/bad_dimensions.xlsx
diff --git a/tests/test_tablib.py b/tests/test_tablib.py
@@ -1040,6 +1040,13 @@ def test_xlsx_cell_values(self):
             data = tablib.Dataset().load(fh)
         self.assertEqual(data.headers[0], 'Hello World')
 
+    def test_xlsx_bad_dimensions(self):
+        """Test loading file with bad dimension.  Must be done with
+        read_only=False."""
+        xls_source = Path(__file__).parent / 'files' / 'bad_dimensions.xlsx'
+        with xls_source.open('rb') as fh:
+            data = tablib.Dataset().load(fh, read_only=False)
+        self.assertEqual(data.height, 3)
 
 class JSONTests(BaseTestCase):
     def test_json_format_detect(self):