Add support for WebP image format

Belval · May 7, 2023 · 530f006 · 530f006
1 parent 1915dbd
commit 530f006
Show file tree

Hide file tree

Showing 4 changed files with 96 additions and 20 deletions.
diff --git a/.gitignore b/.gitignore
@@ -102,3 +102,9 @@ ENV/
 
 # Ignore my vscode config
 .vscode/
+
+# Ignore macOS Finder metadata files
+.DS_Store
+
+# Ignore PyCharm files
+.idea
diff --git a/pdf2image/parsers.py b/pdf2image/parsers.py
@@ -1,9 +1,10 @@
 """
     pdf2image custom buffer parsers
 """
-
+import os
+import tempfile
 from io import BytesIO
-from typing import List
+from typing import List, Tuple
 
 from PIL import Image
 
@@ -22,10 +23,10 @@ def parse_buffer_to_ppm(data: bytes) -> List[Image.Image]:
     index = 0
 
     while index < len(data):
-        code, size, rgb = tuple(data[index : index + 40].split(b"\n")[0:3])
+        code, size, rgb = _parse_ppm_pgm_buffer(data, index)
         size_x, size_y = tuple(size.split(b" "))
         file_size = len(code) + len(size) + len(rgb) + 3 + int(size_x) * int(size_y) * 3
-        images.append(Image.open(BytesIO(data[index : index + file_size])))
+        images.append(Image.open(BytesIO(data[index: index + file_size])))
         index += file_size
 
     return images
@@ -45,10 +46,10 @@ def parse_buffer_to_pgm(data: bytes) -> List[Image.Image]:
     index = 0
 
     while index < len(data):
-        code, size, maxval = tuple(data[index : index + 40].split(b"\n")[0:3])
+        code, size, maxval = _parse_ppm_pgm_buffer(data, index)
         size_x, size_y = tuple(size.split(b" "))
         file_size = len(code) + len(size) + len(maxval) + 3 + int(size_x) * int(size_y)
-        images.append(Image.open(BytesIO(data[index : index + file_size])))
+        images.append(Image.open(BytesIO(data[index: index + file_size])))
         index += file_size
 
     return images
@@ -87,12 +88,48 @@ def parse_buffer_to_png(data: bytes) -> List[Image.Image]:
     data_len = len(data)
     while c1 < data_len:
         # IEND can appear in a PNG without being the actual end
-        if data[c2 : c2 + 4] == b"IEND" and (
-            c2 + 8 == data_len or data[c2 + 9 : c2 + 12] == b"PNG"
+        if data[c2: c2 + 4] == b"IEND" and (
+            c2 + 8 == data_len or data[c2 + 9: c2 + 12] == b"PNG"
+        ):
+            images.append(Image.open(BytesIO(data[c1: c2 + 8])))
+            c1 = c2 + 8
+            c2 = c1
+        c2 += 1
+
+    return images
+
+
+def parse_buffer_to_webp(data: bytes) -> List[Image.Image]:
+    """Split a buffer of PNG files and return each one re-encoded as WebP
+
+    The buffer is scanned for PNG ``IEND`` markers to find where each image
+    ends. Every image found is saved as WebP into an in-memory buffer and
+    reopened from it, so the returned images are WebP-encoded.
+
+    :param data: pdftoppm/pdftocairo output bytes
+    :type data: bytes
+    :return: List of WebP images parsed from the output
+    :rtype: List[Image.Image]
+    """
+    images = []
+
+    c1 = c2 = 0
+    data_len = len(data)
+
+    while c1 < data_len:
+        # IEND can appear in a PNG without being the actual end
+        if data[c2: c2 + 4] == b"IEND" and (
+            c2 + 8 == data_len or data[c2 + 9: c2 + 12] == b"PNG"
         ):
-            images.append(Image.open(BytesIO(data[c1 : c2 + 8])))
+            cur_image = Image.open(BytesIO(data[c1: c2 + 8]))
             c1 = c2 + 8
             c2 = c1
+
+            # Re-encode in memory rather than through tempfile.mkstemp():
+            # mkstemp() hands back an open descriptor that was never closed,
+            # and the temp file was deleted while Pillow still had it open.
+            webp_buffer = BytesIO()
+            cur_image.save(webp_buffer, format='webp')
+            webp_buffer.seek(0)
+            images.append(Image.open(webp_buffer))
+
         c2 += 1
 
     return images
+
+
+def _parse_ppm_pgm_buffer(data: bytes, index: int) -> Tuple[bytes, ...]:
+    """Return up to three newline-separated header fields found at ``index``
+
+    Only the 40 bytes starting at ``index`` are inspected. The PPM and PGM
+    parsers unpack the result as ``(code, size, rgb)`` and
+    ``(code, size, maxval)`` respectively.
+
+    :param data: Buffer holding one or more PPM/PGM files back to back
+    :type data: bytes
+    :param index: Offset in ``data`` where the current file's header starts
+    :type index: int
+    :return: The first (at most three) header fields, without the newlines
+    :rtype: Tuple[bytes, ...]
+    """
+    # typing.Tuple rather than tuple[...]: the built-in generic form is
+    # evaluated when the function is defined and fails before Python 3.9.
+    return tuple(data[index: index + 40].split(b"\n")[0:3])
diff --git a/pdf2image/pdf2image.py b/pdf2image/pdf2image.py
@@ -21,6 +21,7 @@
     parse_buffer_to_ppm,
     parse_buffer_to_jpeg,
     parse_buffer_to_png,
+    parse_buffer_to_webp
 )
 
 from pdf2image.exceptions import (
@@ -464,14 +465,16 @@ def _parse_format(fmt: str, grayscale: bool = False) -> Tuple[str, str, Callable
         fmt = fmt[1:]
     if fmt in ("jpeg", "jpg"):
         return "jpeg", "jpg", parse_buffer_to_jpeg, False
-    if fmt == "png":
+    elif fmt == "png":
         return "png", "png", parse_buffer_to_png, False
-    if fmt in ("tif", "tiff"):
-        return "tiff", "tif", None, True
-    if fmt == "ppm" and grayscale:
+    elif fmt in ("tif", "tiff"):
+        return "tiff", "tif", lambda _: None, True
+    elif fmt == "ppm" and grayscale:
         return "pgm", "pgm", parse_buffer_to_pgm, False
-    # Unable to parse the format so we'll use the default
-    return "ppm", "ppm", parse_buffer_to_ppm, False
+    elif fmt == "webp":
+        return "png", "webp", parse_buffer_to_webp, False
+    else:  # Unable to parse the format, so we'll use the default
+        return "ppm", "ppm", parse_buffer_to_ppm, False
 
 
 def _parse_jpegopt(jpegopt: Dict) -> str:

diff --git a/tests.py b/tests.py
@@ -2,7 +2,6 @@
 import sys
 import errno
 import pathlib
-import tempfile
 import unittest
 import time
 import shutil
@@ -467,7 +466,7 @@ def test_conversion_to_jpeg_from_bytes(self):
             images_from_bytes = convert_from_bytes(pdf_file.read(), fmt="jpg")
             self.assertTrue(images_from_bytes[0].format == "JPEG")
         print(
-            "test_conversion_to_jpeg_from_bytes_14: {} sec".format(
+            "test_conversion_to_jpeg_from_bytes: {} sec".format(
                 (time.time() - start_time) / 14.0
             )
         )
@@ -483,7 +482,7 @@ def test_conversion_to_jpeg_from_path_using_dir(self):
             self.assertTrue(images_from_path[0].format == "JPEG")
             [im.close() for im in images_from_path]
         print(
-            "test_conversion_to_jpeg_from_path_using_dir_14: {} sec".format(
+            "test_conversion_to_jpeg_from_path_using_dir: {} sec".format(
                 (time.time() - start_time) / 14.0
             )
         )
@@ -498,7 +497,7 @@ def test_conversion_to_png_from_bytes(self):
             images_from_bytes = convert_from_bytes(pdf_file.read(), fmt="png")
             self.assertTrue(images_from_bytes[0].format == "PNG")
         print(
-            "test_conversion_to_png_from_bytes_14: {} sec".format(
+            "test_conversion_to_png_from_bytes: {} sec".format(
                 (time.time() - start_time) / 14.0
             )
         )
@@ -514,7 +513,38 @@ def test_conversion_to_png_from_path_using_dir(self):
             self.assertTrue(images_from_path[0].format == "PNG")
             [im.close() for im in images_from_path]
         print(
-            "test_conversion_to_png_from_path_using_dir_14: {} sec".format(
+            "test_conversion_to_png_from_path_using_dir: {} sec".format(
+                (time.time() - start_time) / 14.0
+            )
+        )
+
+    ## Test output as webp
+
+    @profile
+    @unittest.skipIf(not POPPLER_INSTALLED, "Poppler is not installed!")
+    def test_conversion_to_webp_from_bytes(self):
+        # Convert the sample PDF from raw bytes with fmt="webp" and check
+        # that the first returned image reports the WEBP format.
+        start_time = time.time()
+        with open("./tests/test.pdf", "rb") as pdf_file:
+            pdf_bytes = pdf_file.read()
+            images_from_bytes = convert_from_bytes(pdf_bytes, fmt="webp")
+            first_format = images_from_bytes[0].format
+            self.assertTrue(first_format == "WEBP")
+        # Same timing report as the neighbouring conversion tests.
+        elapsed = (time.time() - start_time) / 14.0
+        print("test_conversion_to_webp_from_bytes: {} sec".format(elapsed))
+
+    # Converts ./tests/test.pdf from its path into a temporary output folder
+    # with fmt="webp" and checks that the first returned image reports the
+    # WEBP format, then prints the elapsed time divided by 14.0.
+    # NOTE(review): the 14.0 divisor mirrors the neighbouring tests; confirm
+    # it is the intended normalisation for this PDF.
+    @profile
+    @unittest.skipIf(not POPPLER_INSTALLED, "Poppler is not installed!")
+    def test_conversion_to_webp_from_path_using_dir(self):
+        start_time = time.time()
+        with TemporaryDirectory() as path:
+            images_from_path = convert_from_path(
+                "./tests/test.pdf", output_folder=path, fmt="webp"
+            )
+            self.assertTrue(images_from_path[0].format == "WEBP")
+            # Close every image while still inside the TemporaryDirectory
+            # block, presumably so the files are released before cleanup.
+            [im.close() for im in images_from_path]
+        print(
+            "test_conversion_to_webp_from_path_using_dir: {} sec".format(
                 (time.time() - start_time) / 14.0
             )
         )