From de6c672d71912ea15cfded2f4a232c6dd238519f Mon Sep 17 00:00:00 2001 From: geisserml Date: Fri, 23 Jun 2023 21:12:09 +0200 Subject: [PATCH 1/9] Add pypdfium2 rendering backend (experimental patch) --- camelot/backends/image_conversion.py | 12 ++++++--- camelot/backends/pdfium_backend.py | 15 ++++++++++++ setup.py | 8 +++++- tests/test_common.py | 34 ++++++++++++++++++++++++++ tests/test_image_conversion_backend.py | 6 ++--- 5 files changed, 68 insertions(+), 7 deletions(-) create mode 100644 camelot/backends/pdfium_backend.py diff --git a/camelot/backends/image_conversion.py b/camelot/backends/image_conversion.py index 7d2c4d7a..f43017b1 100644 --- a/camelot/backends/image_conversion.py +++ b/camelot/backends/image_conversion.py @@ -1,19 +1,25 @@ # -*- coding: utf-8 -*- +from .pdfium_backend import PdfiumBackend from .poppler_backend import PopplerBackend from .ghostscript_backend import GhostscriptBackend -BACKENDS = {"poppler": PopplerBackend, "ghostscript": GhostscriptBackend} +BACKENDS = { + "pdfium": PdfiumBackend, + "poppler": PopplerBackend, + "ghostscript": GhostscriptBackend, +} class ImageConversionBackend(object): - def __init__(self, backend="poppler", use_fallback=True): + def __init__(self, backend="pdfium", use_fallback=True): if backend not in BACKENDS.keys(): raise ValueError(f"Image conversion backend '{backend}' not supported") self.backend = backend self.use_fallback = use_fallback - self.fallbacks = list(filter(lambda x: x != backend, BACKENDS.keys())) + self.fallbacks = list(BACKENDS.keys()) + self.fallbacks.remove(self.backend) def convert(self, pdf_path, png_path): try: diff --git a/camelot/backends/pdfium_backend.py b/camelot/backends/pdfium_backend.py new file mode 100644 index 00000000..cc8374a2 --- /dev/null +++ b/camelot/backends/pdfium_backend.py @@ -0,0 +1,15 @@ +# -*- coding: utf-8 -*- + +try: + import pypdfium2 as pdfium +except Exception: + pdfium = None + +class PdfiumBackend(object): + def convert(self, pdf_path, png_path, 
resolution=300): + if not pdfium: + raise OSError("pypdfium2 is not installed.") + doc = pdfium.PdfDocument(pdf_path) + assert len(doc) == 1 + image = doc[0].render(scale=resolution/72).to_pil() + image.save(png_path) diff --git a/setup.py b/setup.py index b0274d6d..169bcb9c 100644 --- a/setup.py +++ b/setup.py @@ -24,7 +24,13 @@ "tabulate>=0.8.9", ] -base_requires = ["ghostscript>=0.7", "opencv-python>=3.4.2.17", "pdftopng>=0.2.3"] +base_requires = [ + "opencv-python>=3.4.2.17", + "pypdfium2>=4,<5", + "pillow", + "ghostscript>=0.7", # deprecate? + "pdftopng>=0.2.3", # deprecate? +] plot_requires = [ "matplotlib>=2.2.3", diff --git a/tests/test_common.py b/tests/test_common.py index 5d0054b8..78db149b 100644 --- a/tests/test_common.py +++ b/tests/test_common.py @@ -59,6 +59,14 @@ def test_password(): assert_frame_equal(df, tables[0].df) +def test_repr_pdfium(): + filename = os.path.join(testdir, "foo.pdf") + tables = camelot.read_pdf(filename, backend="pdfium") + assert repr(tables) == "" + assert repr(tables[0]) == "" + assert repr(tables[0].cells[0][0]) == "" + + def test_repr_poppler(): filename = os.path.join(testdir, "foo.pdf") tables = camelot.read_pdf(filename, backend="poppler") @@ -76,6 +84,14 @@ def test_repr_ghostscript(): assert repr(tables[0].cells[0][0]) == "" +def test_url_pdfium(): + url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf" + tables = camelot.read_pdf(url, backend="pdfium") + assert repr(tables) == "" + assert repr(tables[0]) == "
" + assert repr(tables[0].cells[0][0]) == "" + + def test_url_poppler(): url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf" tables = camelot.read_pdf(url, backend="poppler") @@ -93,6 +109,24 @@ def test_url_ghostscript(): assert repr(tables[0].cells[0][0]) == "" +def test_pages_pdfium(): + url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf" + tables = camelot.read_pdf(url, backend="pdfium") + assert repr(tables) == "" + assert repr(tables[0]) == "
" + assert repr(tables[0].cells[0][0]) == "" + + tables = camelot.read_pdf(url, pages="1-end", backend="pdfium") + assert repr(tables) == "" + assert repr(tables[0]) == "
" + assert repr(tables[0].cells[0][0]) == "" + + tables = camelot.read_pdf(url, pages="all", backend="pdfium") + assert repr(tables) == "" + assert repr(tables[0]) == "
" + assert repr(tables[0].cells[0][0]) == "" + + def test_pages_poppler(): url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf" tables = camelot.read_pdf(url, backend="poppler") diff --git a/tests/test_image_conversion_backend.py b/tests/test_image_conversion_backend.py index 39f56e69..a8d9948c 100644 --- a/tests/test_image_conversion_backend.py +++ b/tests/test_image_conversion_backend.py @@ -29,7 +29,7 @@ def test_poppler_backend_error_when_no_use_fallback(monkeypatch): monkeypatch.setattr( "camelot.backends.image_conversion.BACKENDS", BACKENDS, raising=True ) - backend = ImageConversionBackend(use_fallback=False) + backend = ImageConversionBackend(backend="poppler", use_fallback=False) message = "Image conversion failed with image conversion backend 'poppler'" with pytest.raises(ValueError, match=message): @@ -44,7 +44,7 @@ def test_ghostscript_backend_when_use_fallback(monkeypatch): monkeypatch.setattr( "camelot.backends.image_conversion.BACKENDS", BACKENDS, raising=True ) - backend = ImageConversionBackend() + backend = ImageConversionBackend(backend="poppler") backend.convert("foo", "bar") @@ -53,7 +53,7 @@ def test_ghostscript_backend_error_when_use_fallback(monkeypatch): monkeypatch.setattr( "camelot.backends.image_conversion.BACKENDS", BACKENDS, raising=True ) - backend = ImageConversionBackend() + backend = ImageConversionBackend(backend="poppler") message = "Image conversion failed with image conversion backend 'ghostscript'" with pytest.raises(ValueError, match=message): From 6d60163efe24c0d525fab3a83235588fa9f09654 Mon Sep 17 00:00:00 2001 From: mara004 Date: Sat, 24 Jun 2023 13:07:44 +0200 Subject: [PATCH 2/9] Add init_forms() call --- camelot/backends/pdfium_backend.py | 1 + 1 file changed, 1 insertion(+) diff --git a/camelot/backends/pdfium_backend.py b/camelot/backends/pdfium_backend.py index cc8374a2..00daf3fc 100644 --- a/camelot/backends/pdfium_backend.py +++ b/camelot/backends/pdfium_backend.py @@ -11,5 +11,6 @@ def 
convert(self, pdf_path, png_path, resolution=300): raise OSError("pypdfium2 is not installed.") doc = pdfium.PdfDocument(pdf_path) assert len(doc) == 1 + doc.init_forms() image = doc[0].render(scale=resolution/72).to_pil() image.save(png_path) From edb686c496ee2e832098bbcfb7d86dc28ea57918 Mon Sep 17 00:00:00 2001 From: geisserml Date: Sun, 24 Sep 2023 22:33:59 +0200 Subject: [PATCH 3/9] Post-merge fixup --- tests/test_image_conversion_backend.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/test_image_conversion_backend.py b/tests/test_image_conversion_backend.py index 0c6e52d8..98b2c45f 100644 --- a/tests/test_image_conversion_backend.py +++ b/tests/test_image_conversion_backend.py @@ -2,6 +2,8 @@ from camelot.backends import ImageConversionBackend +# TODO consider adding pdfium backend + @pytest.fixture def patch_backends(monkeypatch): @@ -31,7 +33,7 @@ def convert(self, pdf_path, png_path): def test_poppler_backend_error_when_no_use_fallback(patch_backends): - backend = ImageConversionBackend(use_fallback=False) + backend = ImageConversionBackend(backend="poppler", use_fallback=False) message = "Image conversion failed with image conversion backend 'poppler'" with pytest.raises(ValueError, match=message): @@ -39,7 +41,7 @@ def test_poppler_backend_error_when_no_use_fallback(patch_backends): def test_ghostscript_backend_when_use_fallback(patch_backends): - backend = ImageConversionBackend() + backend = ImageConversionBackend(backend="ghostscript") backend.convert("foo", "bar") From 83ff330250ea5ca4fcf9acf355af5a8549d8598a Mon Sep 17 00:00:00 2001 From: geisserml Date: Sun, 24 Sep 2023 22:35:01 +0200 Subject: [PATCH 4/9] Post-merge fixup 2 --- setup.py | 97 -------------------------------------------------------- 1 file changed, 97 deletions(-) delete mode 100644 setup.py diff --git a/setup.py b/setup.py deleted file mode 100644 index 169bcb9c..00000000 --- a/setup.py +++ /dev/null @@ -1,97 +0,0 @@ -# -*- coding: utf-8 -*- - -import 
os -from setuptools import find_packages - - -here = os.path.abspath(os.path.dirname(__file__)) -about = {} -with open(os.path.join(here, "camelot", "__version__.py"), "r") as f: - exec(f.read(), about) - -with open("README.md", "r") as f: - readme = f.read() - - -requires = [ - "chardet>=3.0.4", - "click>=6.7", - "numpy>=1.13.3", - "openpyxl>=2.5.8", - "pandas>=0.23.4", - "pdfminer.six>=20200726", - "pypdf>=3.0.0", - "tabulate>=0.8.9", -] - -base_requires = [ - "opencv-python>=3.4.2.17", - "pypdfium2>=4,<5", - "pillow", - "ghostscript>=0.7", # deprecate? - "pdftopng>=0.2.3", # deprecate? -] - -plot_requires = [ - "matplotlib>=2.2.3", -] - -dev_requires = [ - "codecov>=2.0.15", - "pytest>=5.4.3", - "pytest-cov>=2.10.0", - "pytest-mpl>=0.11", - "pytest-runner>=5.2", - "Sphinx>=3.1.2", - "sphinx-autobuild>=2021.3.14", -] - -all_requires = base_requires + plot_requires -dev_requires = dev_requires + all_requires - - -def setup_package(): - metadata = dict( - name=about["__title__"], - version=about["__version__"], - description=about["__description__"], - long_description=readme, - long_description_content_type="text/markdown", - url=about["__url__"], - author=about["__author__"], - author_email=about["__author_email__"], - license=about["__license__"], - packages=find_packages(exclude=("tests",)), - install_requires=requires, - extras_require={ - "all": all_requires, - "base": base_requires, - "cv": base_requires, # deprecate - "dev": dev_requires, - "plot": plot_requires, - }, - entry_points={ - "console_scripts": [ - "camelot = camelot.cli:cli", - ], - }, - classifiers=[ - # Trove classifiers - # Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers - "License :: OSI Approved :: MIT License", - "Programming Language :: Python :: 3.7", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", - ], - ) - - try: - from setuptools import setup - except ImportError: - from distutils.core import setup - - setup(**metadata) - - -if 
__name__ == "__main__": - setup_package() From cf65d1ab29a65253557d388ccdb83cc62bb97fc1 Mon Sep 17 00:00:00 2001 From: geisserml Date: Mon, 25 Sep 2023 18:36:24 +0200 Subject: [PATCH 5/9] Add pypdfium2 to nox --- noxfile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/noxfile.py b/noxfile.py index 0f02b6e1..654eb7d4 100644 --- a/noxfile.py +++ b/noxfile.py @@ -155,7 +155,7 @@ def mypy(session: Session) -> None: session.run("mypy", f"--python-executable={sys.executable}", "noxfile.py") -base_requires = ["ghostscript>=0.7", "opencv-python>=3.4.2.17"] +base_requires = ["ghostscript>=0.7", "opencv-python>=3.4.2.17", "pypdfium2>=4,<5"] plot_requires = [ "matplotlib>=2.2.3", From cbeca807db6c2f311554857071ca87fd7b8ea4c6 Mon Sep 17 00:00:00 2001 From: geisserml Date: Mon, 25 Sep 2023 18:36:55 +0200 Subject: [PATCH 6/9] Use poetry-style version bounds --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 42ff97a1..734e1921 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -57,7 +57,7 @@ myst-parser = {version = ">=0.16.1"} camelot = "camelot.__main__:main" [tool.poetry.group.base.dependencies] -pypdfium2 = ">=4,<5" +pypdfium2 = "^4" ghostscript = "^0.7" # remove in favor of pypdfium2? 
opencv-python = "^4.7.0.68" From 65ee4ae419aca699feb5cf7c7385727ff2717140 Mon Sep 17 00:00:00 2001 From: geisserml Date: Mon, 25 Sep 2023 18:38:20 +0200 Subject: [PATCH 7/9] Remove `coding: utf-8` --- camelot/backends/pdfium_backend.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/camelot/backends/pdfium_backend.py b/camelot/backends/pdfium_backend.py index 00daf3fc..c30154b6 100644 --- a/camelot/backends/pdfium_backend.py +++ b/camelot/backends/pdfium_backend.py @@ -1,5 +1,3 @@ -# -*- coding: utf-8 -*- - try: import pypdfium2 as pdfium except Exception: From 60d3f2f43b0cf60039798f5079816821b1321607 Mon Sep 17 00:00:00 2001 From: geisserml Date: Mon, 25 Sep 2023 18:45:49 +0200 Subject: [PATCH 8/9] Add windows skips Fails with `PermissionError: [WinError 32] The process cannot access the file because it is being used by another process` That seems to be an issue with camelot, or rather its test suite, not pypdfium2. --- tests/test_common.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/test_common.py b/tests/test_common.py index 05ab4ad3..6db6215f 100644 --- a/tests/test_common.py +++ b/tests/test_common.py @@ -50,6 +50,7 @@ def test_password(testdir): assert_frame_equal(df, tables[0].df) +@skip_on_windows def test_repr_pdfium(testdir): filename = os.path.join(testdir, "foo.pdf") tables = camelot.read_pdf(filename, backend="pdfium") @@ -76,6 +77,7 @@ def test_repr_ghostscript(testdir): assert repr(tables[0].cells[0][0]) == "" +@skip_on_windows def test_url_pdfium(): url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf" tables = camelot.read_pdf(url, backend="pdfium") @@ -102,6 +104,7 @@ def test_url_ghostscript(testdir): assert repr(tables[0].cells[0][0]) == "" +@skip_on_windows def test_pages_pdfium(): url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf" tables = camelot.read_pdf(url, backend="pdfium") From 3fe5625735fbe221010f1f8eb6b5fe68fa5a136e Mon Sep 17 00:00:00 2001 From: geisserml Date: Mon, 25 Sep 2023
20:25:43 +0200 Subject: [PATCH 9/9] Capture possible error message In case the `try/except` captures something other than a classical ModuleNotFoundError, we want to know what happened (e.g. library load error) --- camelot/backends/pdfium_backend.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/camelot/backends/pdfium_backend.py b/camelot/backends/pdfium_backend.py index c30154b6..a658ff9e 100644 --- a/camelot/backends/pdfium_backend.py +++ b/camelot/backends/pdfium_backend.py @@ -1,12 +1,16 @@ try: import pypdfium2 as pdfium -except Exception: +except Exception as e: pdfium = None + pdfium_exc = e +else: + pdfium_exc = None -class PdfiumBackend(object): + +class PdfiumBackend: def convert(self, pdf_path, png_path, resolution=300): if not pdfium: - raise OSError("pypdfium2 is not installed.") + raise OSError(f"pypdfium2 is not available: {pdfium_exc!r}") doc = pdfium.PdfDocument(pdf_path) assert len(doc) == 1 doc.init_forms()