Skip to content

Commit

Permalink
Add support for WebP image format
Browse files Browse the repository at this point in the history
  • Loading branch information
drnushooz authored May 7, 2023
1 parent 1915dbd commit 530f006
Show file tree
Hide file tree
Showing 4 changed files with 96 additions and 20 deletions.
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -102,3 +102,9 @@ ENV/

# Ignore my vscode config
.vscode/

# Ignore MacOS thumbnails
.DS_Store

# Ignore PyCharm files
.idea
55 changes: 46 additions & 9 deletions pdf2image/parsers.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
"""
pdf2image custom buffer parsers
"""

import os
import tempfile
from io import BytesIO
from typing import List
from typing import List, Tuple

from PIL import Image

Expand All @@ -22,10 +23,10 @@ def parse_buffer_to_ppm(data: bytes) -> List[Image.Image]:
index = 0

while index < len(data):
code, size, rgb = tuple(data[index : index + 40].split(b"\n")[0:3])
code, size, rgb = _parse_ppm_pgm_buffer(data, index)
size_x, size_y = tuple(size.split(b" "))
file_size = len(code) + len(size) + len(rgb) + 3 + int(size_x) * int(size_y) * 3
images.append(Image.open(BytesIO(data[index : index + file_size])))
images.append(Image.open(BytesIO(data[index: index + file_size])))
index += file_size

return images
Expand All @@ -45,10 +46,10 @@ def parse_buffer_to_pgm(data: bytes) -> List[Image.Image]:
index = 0

while index < len(data):
code, size, maxval = tuple(data[index : index + 40].split(b"\n")[0:3])
code, size, maxval = _parse_ppm_pgm_buffer(data, index)
size_x, size_y = tuple(size.split(b" "))
file_size = len(code) + len(size) + len(maxval) + 3 + int(size_x) * int(size_y)
images.append(Image.open(BytesIO(data[index : index + file_size])))
images.append(Image.open(BytesIO(data[index: index + file_size])))
index += file_size

return images
Expand Down Expand Up @@ -87,12 +88,48 @@ def parse_buffer_to_png(data: bytes) -> List[Image.Image]:
data_len = len(data)
while c1 < data_len:
# IEND can appear in a PNG without being the actual end
if data[c2 : c2 + 4] == b"IEND" and (
c2 + 8 == data_len or data[c2 + 9 : c2 + 12] == b"PNG"
if data[c2: c2 + 4] == b"IEND" and (
c2 + 8 == data_len or data[c2 + 9: c2 + 12] == b"PNG"
):
images.append(Image.open(BytesIO(data[c1: c2 + 8])))
c1 = c2 + 8
c2 = c1
c2 += 1

return images


def parse_buffer_to_webp(data: bytes) -> List[Image.Image]:
    """Convert concatenated PNG output bytes into WebP-encoded Pillow Images

    The buffer is split into individual PNG files at their IEND chunks and
    each PNG is re-encoded as WebP in memory (no temporary file is used).

    :param data: pdftoppm/pdftocairo output bytes (concatenated PNG files)
    :type data: bytes
    :return: List of WebP images parsed from the output
    :rtype: List[Image.Image]
    """
    images = []

    data_len = len(data)
    start = 0  # Offset of the first byte of the PNG currently being scanned
    cursor = 0  # Offset from which the next IEND marker is searched

    while start < data_len:
        marker = data.find(b"IEND", cursor)
        if marker == -1:
            # No further end marker (truncated or non-PNG data): stop rather
            # than scanning past the end of the buffer forever
            break

        # The 4-byte IEND chunk type is followed by its 4-byte CRC
        end = marker + 8

        # IEND can appear in a PNG without being the actual end, so it is
        # only accepted at the end of the buffer or right before the
        # signature of the next PNG
        if end == data_len or data[marker + 9: marker + 12] == b"PNG":
            # Re-encode in memory: avoids leaking the descriptor returned by
            # tempfile.mkstemp() and deleting a file that is still open
            webp_buffer = BytesIO()
            with Image.open(BytesIO(data[start:end])) as png_image:
                png_image.save(webp_buffer, format="webp")
            webp_buffer.seek(0)
            images.append(Image.open(webp_buffer))

            start = end
            cursor = end + 1
        else:
            cursor = marker + 1

    return images


def _parse_ppm_pgm_buffer(data: bytes, index: int) -> tuple[bytes, ...]:
return tuple(data[index: index + 40].split(b"\n")[0:3])
15 changes: 9 additions & 6 deletions pdf2image/pdf2image.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
parse_buffer_to_ppm,
parse_buffer_to_jpeg,
parse_buffer_to_png,
parse_buffer_to_webp
)

from pdf2image.exceptions import (
Expand Down Expand Up @@ -464,14 +465,16 @@ def _parse_format(fmt: str, grayscale: bool = False) -> Tuple[str, str, Callable
fmt = fmt[1:]
if fmt in ("jpeg", "jpg"):
return "jpeg", "jpg", parse_buffer_to_jpeg, False
if fmt == "png":
elif fmt == "png":
return "png", "png", parse_buffer_to_png, False
if fmt in ("tif", "tiff"):
return "tiff", "tif", None, True
if fmt == "ppm" and grayscale:
elif fmt in ("tif", "tiff"):
return "tiff", "tif", lambda _: None, True
elif fmt == "ppm" and grayscale:
return "pgm", "pgm", parse_buffer_to_pgm, False
# Unable to parse the format so we'll use the default
return "ppm", "ppm", parse_buffer_to_ppm, False
elif fmt == "webp":
return "png", "webp", parse_buffer_to_webp, False
else: # Unable to parse the format, so we'll use the default
return "ppm", "ppm", parse_buffer_to_ppm, False


def _parse_jpegopt(jpegopt: Dict) -> str:
Expand Down
40 changes: 35 additions & 5 deletions tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
import sys
import errno
import pathlib
import tempfile
import unittest
import time
import shutil
Expand Down Expand Up @@ -467,7 +466,7 @@ def test_conversion_to_jpeg_from_bytes(self):
images_from_bytes = convert_from_bytes(pdf_file.read(), fmt="jpg")
self.assertTrue(images_from_bytes[0].format == "JPEG")
print(
"test_conversion_to_jpeg_from_bytes_14: {} sec".format(
"test_conversion_to_jpeg_from_bytes: {} sec".format(
(time.time() - start_time) / 14.0
)
)
Expand All @@ -483,7 +482,7 @@ def test_conversion_to_jpeg_from_path_using_dir(self):
self.assertTrue(images_from_path[0].format == "JPEG")
[im.close() for im in images_from_path]
print(
"test_conversion_to_jpeg_from_path_using_dir_14: {} sec".format(
"test_conversion_to_jpeg_from_path_using_dir: {} sec".format(
(time.time() - start_time) / 14.0
)
)
Expand All @@ -498,7 +497,7 @@ def test_conversion_to_png_from_bytes(self):
images_from_bytes = convert_from_bytes(pdf_file.read(), fmt="png")
self.assertTrue(images_from_bytes[0].format == "PNG")
print(
"test_conversion_to_png_from_bytes_14: {} sec".format(
"test_conversion_to_png_from_bytes: {} sec".format(
(time.time() - start_time) / 14.0
)
)
Expand All @@ -514,7 +513,38 @@ def test_conversion_to_png_from_path_using_dir(self):
self.assertTrue(images_from_path[0].format == "PNG")
[im.close() for im in images_from_path]
print(
"test_conversion_to_png_from_path_using_dir_14: {} sec".format(
"test_conversion_to_png_from_path_using_dir: {} sec".format(
(time.time() - start_time) / 14.0
)
)

## Test output as webp

@profile
@unittest.skipIf(not POPPLER_INSTALLED, "Poppler is not installed!")
def test_conversion_to_webp_from_bytes(self):
    """Convert the sample PDF from in-memory bytes with fmt="webp" and check
    that the first returned image reports the WEBP format."""
    start_time = time.time()
    with open("./tests/test.pdf", "rb") as pdf_file:
        images_from_bytes = convert_from_bytes(pdf_file.read(), fmt="webp")
        # assertEqual shows the actual format on failure, unlike
        # assertTrue(a == b) which only reports "False is not true"
        self.assertEqual(images_from_bytes[0].format, "WEBP")
    # NOTE(review): the 14.0 divisor mirrors the sibling tests in this file;
    # confirm it matches what this timing is meant to be averaged over
    print(
        "test_conversion_to_webp_from_bytes: {} sec".format(
            (time.time() - start_time) / 14.0
        )
    )

@profile
@unittest.skipIf(not POPPLER_INSTALLED, "Poppler is not installed!")
def test_conversion_to_webp_from_path_using_dir(self):
    """Convert the sample PDF from its path into a temporary output folder
    with fmt="webp" and check that the first image reports the WEBP format."""
    start_time = time.time()
    with TemporaryDirectory() as path:
        images_from_path = convert_from_path(
            "./tests/test.pdf", output_folder=path, fmt="webp"
        )
        # assertEqual shows the actual format on failure, unlike
        # assertTrue(a == b) which only reports "False is not true"
        self.assertEqual(images_from_path[0].format, "WEBP")
        # Close the images while the temporary folder still exists; a plain
        # loop is used because the calls are made only for their side effect
        for im in images_from_path:
            im.close()
    # NOTE(review): the 14.0 divisor mirrors the sibling tests in this file;
    # confirm it matches what this timing is meant to be averaged over
    print(
        "test_conversion_to_webp_from_path_using_dir: {} sec".format(
            (time.time() - start_time) / 14.0
        )
    )
Expand Down

0 comments on commit 530f006

Please sign in to comment.