From a73501059636927beeacbaa719e94cb5237d727f Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Mon, 6 Jun 2022 09:40:23 +0200 Subject: [PATCH 01/18] STY: Apply black and isort --- PyPDF2/_reader.py | 5 ++++- PyPDF2/_utils.py | 2 +- tests/test_reader.py | 6 +++--- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/PyPDF2/_reader.py b/PyPDF2/_reader.py index e82d8b0e3..8b345d4fc 100644 --- a/PyPDF2/_reader.py +++ b/PyPDF2/_reader.py @@ -980,8 +980,11 @@ def _get_object_from_stream( if self.strict and idx != i: raise PdfReadError("Object is in wrong index.") stream_data.seek(int(obj_stm["/First"] + offset), 0) # type: ignore - read_non_whitespace(stream_data) # to cope with some case where the 'pointer' is on a white space + + # to cope with some case where the 'pointer' is on a white space + read_non_whitespace(stream_data) stream_data.seek(-1, 1) + try: obj = read_object(stream_data, self) except PdfStreamError as exc: diff --git a/PyPDF2/_utils.py b/PyPDF2/_utils.py index 9ae1bb582..61e864697 100644 --- a/PyPDF2/_utils.py +++ b/PyPDF2/_utils.py @@ -31,10 +31,10 @@ __author__ = "Mathieu Fenniak" __author_email__ = "biziqe@mathieu.fenniak.net" +import warnings from codecs import getencoder from io import BufferedReader, BufferedWriter, BytesIO, FileIO from typing import Any, Dict, Optional, Tuple, Union, overload -import warnings try: # Python 3.10+: https://www.python.org/dev/peps/pep-0484/ diff --git a/tests/test_reader.py b/tests/test_reader.py index 397331918..d92c18896 100644 --- a/tests/test_reader.py +++ b/tests/test_reader.py @@ -1,8 +1,8 @@ import io import os import time -from io import BytesIO import urllib.request +from io import BytesIO import pytest @@ -670,8 +670,8 @@ def test_convertToInt_deprecated(): def test_iss925(): - reader = PdfReader(BytesIO(urllib.request.urlopen( - "https://github.com/py-pdf/PyPDF2/files/8796328/1.pdf").read())) + url = "https://github.com/py-pdf/PyPDF2/files/8796328/1.pdf" + reader = PdfReader(BytesIO(urllib.request.urlopen(url).read())) for page_sliced in reader.pages: page_object = page_sliced.get_object() From acb9723b08ab471c3f61822348dadbe73934b126 Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Mon, 6 Jun 2022 09:48:25 +0200 Subject: [PATCH 02/18] MAINT: Use new functions in tests This should make our tests easier to read --- tests/test_basic_features.py | 26 ++++++++++---------------- tests/test_generic.py | 2 -- tests/test_page.py | 16 ++++++---------- tests/test_reader.py | 3 +-- tests/test_workflows.py | 6 ++---- tests/test_xmp.py | 9 ++++----- 6 files changed, 23 insertions(+), 39 deletions(-) diff --git a/tests/test_basic_features.py b/tests/test_basic_features.py index 541334a6c..ae2a55bcf 100644 --- a/tests/test_basic_features.py +++ b/tests/test_basic_features.py @@ -22,26 +22,20 @@ def test_basic_features(): # add page 2 from input1, but rotated clockwise 90 degrees writer.add_page(reader.pages[0].rotate(90)) - # add page 3 from input1, rotated the other way: - with pytest.warns(PendingDeprecationWarning): - rotated = reader.pages[0].rotateCounterClockwise(90) - writer.add_page(rotated) - # alt: output.addPage(input1.pages[0].rotate(270)) - - # add page 4 from input1, but first add a watermark from another PDF: - page4 = reader.pages[0] + # add page 3 from input1, but first add a watermark from another PDF: + page3 = reader.pages[0] watermark_pdf = pdf_path watermark = PdfReader(watermark_pdf) - page4.merge_page(watermark.pages[0]) - writer.add_page(page4) + page3.merge_page(watermark.pages[0]) + writer.add_page(page3) 
- # add page 5 from input1, but crop it to half size: - page5 = reader.pages[0] - page5.mediabox.upper_right = ( - page5.mediabox.right / 2, - page5.mediabox.top / 2, + # add page 4 from input1, but crop it to half size: + page4 = reader.pages[0] + page4.mediabox.upper_right = ( + page4.mediabox.right / 2, + page4.mediabox.top / 2, ) - writer.add_page(page5) + writer.add_page(page4) # add some Javascript to launch the print window on opening this PDF. # the password dialog may prevent the print dialog from being shown, diff --git a/tests/test_generic.py b/tests/test_generic.py index a56972690..3c6c024d4 100644 --- a/tests/test_generic.py +++ b/tests/test_generic.py @@ -243,8 +243,6 @@ def test_DictionaryObject_key_is_no_pdfobject(): def test_DictionaryObject_xmp_meta(): do = DictionaryObject({NameObject("/S"): NameObject("/GoTo")}) assert do.xmp_metadata is None - with pytest.warns(PendingDeprecationWarning): - assert do.xmpMetadata is None def test_DictionaryObject_value_is_no_pdfobject(): diff --git a/tests/test_page.py b/tests/test_page.py index cd53d796e..273bbe649 100644 --- a/tests/test_page.py +++ b/tests/test_page.py @@ -73,22 +73,18 @@ def test_page_operations(pdf_path, password): reader.decrypt(password) page: PageObject = reader.pages[0] - with pytest.warns(PendingDeprecationWarning): - page.mergeRotatedScaledTranslatedPage( - page, 90, scale=1, tx=1, ty=1, expand=True - ) + + transformation = Transformation().rotate(90).scale(1).translate(1, 1) + page.add_transformation(transformation, expand=True) page.add_transformation((1, 0, 0, 0, 0, 0)) page.scale(2, 2) page.scale_by(0.5) page.scale_to(100, 100) page.compress_content_streams() page.extract_text() - with pytest.warns(PendingDeprecationWarning): - page.scaleBy(0.5) - with pytest.warns(PendingDeprecationWarning): - page.scaleTo(100, 100) - with pytest.warns(PendingDeprecationWarning): - page.extractText() + page.scale_by(0.5) + page.scale_to(100, 100) + page.extract_text() def test_transformation_equivalence(): diff --git a/tests/test_reader.py b/tests/test_reader.py index d92c18896..06da28156 100644 --- a/tests/test_reader.py +++ b/tests/test_reader.py @@ -65,8 +65,7 @@ def test_get_num_pages(src, num_pages): def test_read_metadata(pdf_path, expected): with open(pdf_path, "rb") as inputfile: reader = PdfReader(inputfile) - with pytest.warns(PendingDeprecationWarning): - docinfo = reader.documentInfo + docinfo = reader.metadata assert docinfo is not None metadict = dict(docinfo) assert metadict == expected diff --git a/tests/test_workflows.py b/tests/test_workflows.py index 296d65a53..944e12cdc 100644 --- a/tests/test_workflows.py +++ b/tests/test_workflows.py @@ -92,8 +92,7 @@ def test_rotate(degree): with open(os.path.join(RESOURCE_ROOT, "crazyones.pdf"), "rb") as inputfile: reader = PdfReader(inputfile) page = reader.pages[0] - with pytest.warns(PendingDeprecationWarning): - page.rotateCounterClockwise(degree) + page.rotate(degree) def test_rotate_45(): @@ -101,8 +100,7 @@ def test_rotate_45(): reader = PdfReader(inputfile) page = reader.pages[0] with pytest.raises(ValueError) as exc: - with pytest.warns(PendingDeprecationWarning): - page.rotateCounterClockwise(45) + page.rotate(45) assert exc.value.args[0] == "Rotation angle must be a multiple of 90" diff --git a/tests/test_xmp.py b/tests/test_xmp.py index 152e2900f..31f8741c2 100644 --- a/tests/test_xmp.py +++ b/tests/test_xmp.py @@ -32,12 +32,11 @@ def test_read_xmp(src, has_xmp): assert xmp.dc_contributor == [] -def get_all_tiff(xmp): +def get_all_tiff(xmp: 
PyPDF2.xmp.XmpInformation): data = {} - with pytest.warns(PendingDeprecationWarning): - tiff_ns = xmp.getNodesInNamespace( - aboutUri="", namespace="http://ns.adobe.com/tiff/1.0/" - ) + tiff_ns = xmp.get_nodes_in_namespace( + about_uri="", namespace="http://ns.adobe.com/tiff/1.0/" + ) for tag in tiff_ns: contents = [] for content in tag.childNodes: From f58771f1adb88e99a1f3b1fa374bd8a93ae6b8fd Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Mon, 6 Jun 2022 09:50:24 +0200 Subject: [PATCH 03/18] Flake8 fix --- tests/test_basic_features.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/test_basic_features.py b/tests/test_basic_features.py index ae2a55bcf..5a6d23bfd 100644 --- a/tests/test_basic_features.py +++ b/tests/test_basic_features.py @@ -1,7 +1,5 @@ import os -import pytest - from PyPDF2 import PdfReader, PdfWriter TESTS_ROOT = os.path.abspath(os.path.dirname(__file__)) From 8e6e00ce12f0fab9964fab07439fff663887be02 Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Mon, 6 Jun 2022 10:07:52 +0200 Subject: [PATCH 04/18] Don't shaddow builtins --- PyPDF2/generic.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/PyPDF2/generic.py b/PyPDF2/generic.py index c81e6da5f..5fb5ca8cf 100644 --- a/PyPDF2/generic.py +++ b/PyPDF2/generic.py @@ -895,8 +895,8 @@ def remove_child(self, child: Any) -> None: if NameObject("/Next") in cur: # Removing first tree node next_ref = cur[NameObject("/Next")] - next = next_ref.get_object() - del next[NameObject("/Prev")] + next_obj = next_ref.get_object() + del next_obj[NameObject("/Prev")] self[NameObject("/First")] = next_ref self[NameObject("/Count")] -= 1 # type: ignore @@ -911,8 +911,8 @@ def remove_child(self, child: Any) -> None: if NameObject("/Next") in cur: # Removing middle tree node next_ref = cur[NameObject("/Next")] - next = next_ref.get_object() - next[NameObject("/Prev")] = prev_ref + next_obj = next_ref.get_object() + next_obj[NameObject("/Prev")] = prev_ref prev[NameObject("/Next")] = next_ref self[NameObject("/Count")] -= 1 else: From 5011b9af48b8e2c4456abf70cf99e467218625bf Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Mon, 6 Jun 2022 10:08:57 +0200 Subject: [PATCH 05/18] Names --- tests/test_generic.py | 6 +++--- tests/test_utils.py | 16 ++++++++-------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/tests/test_generic.py b/tests/test_generic.py index 3c6c024d4..65ec0682b 100644 --- a/tests/test_generic.py +++ b/tests/test_generic.py @@ -320,7 +320,7 @@ class tst: # to replace pdf @pytest.mark.parametrize( - ("strict", "length", "shouldFail"), + ("strict", "length", "should_fail"), [ (True, 6, False), (True, 10, False), @@ -330,7 +330,7 @@ class tst: # to replace pdf ], ) def test_DictionaryObject_read_from_stream_stream_stream_valid( - strict, length, shouldFail + strict, length, should_fail ): stream = BytesIO(b"<< /S /GoTo /Length %d >>stream\nBT /F1\nendstream\n" % length) @@ -347,7 +347,7 @@ class tst: # to replace pdf assert b"BT /F1" in do._StreamObject__data raise PdfReadError("__ALLGOOD__") print(exc.value) - assert shouldFail ^ (exc.value.args[0] == "__ALLGOOD__") + assert should_fail ^ (exc.value.args[0] == "__ALLGOOD__") def test_RectangleObject(): diff --git a/tests/test_utils.py b/tests/test_utils.py index 5fb1a49d0..5d2393233 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -22,11 +22,11 @@ (io.BytesIO(b" \n"), True), ], ) -def test_skipOverWhitespace(stream, expected): +def test_skip_over_whitespace(stream, expected): assert 
PyPDF2._utils.skip_over_whitespace(stream) == expected -def test_readUntilWhitespace(): +def test_read_until_whitespace(): assert PyPDF2._utils.read_until_whitespace(io.BytesIO(b"foo"), maxchars=1) == b"f" @@ -39,12 +39,12 @@ def test_readUntilWhitespace(): (io.BytesIO(b"% foo%\nbar"), b"bar"), ], ) -def test_skipOverComment(stream, remainder): +def test_skip_over_comment(stream, remainder): PyPDF2._utils.skip_over_comment(stream) assert stream.read() == remainder -def test_readUntilRegex_premature_ending_raise(): +def test_read_until_regex_premature_ending_raise(): import re stream = io.BytesIO(b"") @@ -53,7 +53,7 @@ def test_readUntilRegex_premature_ending_raise(): assert exc.value.args[0] == "Stream has ended unexpectedly" -def test_readUntilRegex_premature_ending_name(): +def test_read_until_regex_premature_ending_name(): import re stream = io.BytesIO(b"") @@ -70,17 +70,17 @@ def test_readUntilRegex_premature_ending_name(): (((3,), (7,)), ((5, 13),), ((3 * 5, 3 * 13), (7 * 5, 7 * 13))), ], ) -def test_matrixMultiply(a, b, expected): +def test_matrix_multiply(a, b, expected): assert PyPDF2._utils.matrix_multiply(a, b) == expected -def test_markLocation(): +def test_mark_location(): stream = io.BytesIO(b"abde" * 6000) PyPDF2._utils.mark_location(stream) os.remove("PyPDF2_pdfLocation.txt") # cleanup -def test_hexStr(): +def test_hex_str(): assert PyPDF2._utils.hex_str(10) == "0xa" From 1d7d75ad337f34038413fbe714754817c17a5387 Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Mon, 6 Jun 2022 10:09:21 +0200 Subject: [PATCH 06/18] Ignore ASS001 for tests --- .flake8 | 4 +++- tests/test_reader.py | 6 +++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/.flake8 b/.flake8 index fbc73fdc2..c56c75ff6 100644 --- a/.flake8 +++ b/.flake8 @@ -1,5 +1,7 @@ [flake8] # The flake8 config should work well with black, # see https://black.readthedocs.io/en/stable/guides/using_black_with_other_tools.html#flake8 -ignore = E203,E501,E741,W503,W604 +ignore = E203,E501,E741,W503,W604,N817,N814 exclude = build,sample-files +per-file-ignores = + tests/*: ASS001 diff --git a/tests/test_reader.py b/tests/test_reader.py index 06da28156..bfc4bf27e 100644 --- a/tests/test_reader.py +++ b/tests/test_reader.py @@ -461,7 +461,7 @@ def test_read_missing_startxref(): b"%010d 00000 n\n" b"%010d 00000 n\n" b"trailer << /Root 5 0 R /Size 6 >>\n" - # b"startxref %d\n" + # Removed for this test: b"startxref %d\n" b"%%%%EOF" ) pdf_data = pdf_data % ( @@ -470,7 +470,7 @@ def test_read_missing_startxref(): pdf_data.find(b"3 0 obj"), pdf_data.find(b"4 0 obj"), pdf_data.find(b"5 0 obj"), - # pdf_data.find(b"xref") - 1, + # Removed for this test: pdf_data.find(b"xref") - 1, ) pdf_stream = io.BytesIO(pdf_data) with pytest.raises(PdfReadError) as exc: @@ -554,7 +554,7 @@ def test_do_not_get_stuck_on_large_files_without_start_xref(): assert parse_duration < 60 -def test_PdfReaderDecryptWhenNoID(): +def test_PdfReader_decrypt_when_no_id(): """ Decrypt an encrypted file that's missing the 'ID' value in its trailer. 
From 883adf5616c0fdd59baf2b9c900c4cdf33908b3b Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Mon, 6 Jun 2022 10:38:17 +0200 Subject: [PATCH 07/18] Add caching mechanism for downloaded files to speed up local testing --- .gitignore | 1 + tests/__init__.py | 27 +++++++++++++++++++++++++++ tests/test_page.py | 4 ++-- tests/test_reader.py | 8 ++++---- tests/test_workflows.py | 20 ++++++++++++++++---- 5 files changed, 50 insertions(+), 10 deletions(-) diff --git a/.gitignore b/.gitignore index 9a3837b31..48a4771fa 100644 --- a/.gitignore +++ b/.gitignore @@ -29,3 +29,4 @@ Image9.png PyPDF2_pdfLocation.txt .python-version +tests/pdf_cache/ diff --git a/tests/__init__.py b/tests/__init__.py index e69de29bb..5c4b01d5f 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -0,0 +1,27 @@ +import os +import urllib.request + + +def get_pdf_from_url(url: str, name: str) -> bytes: + """ + Download a PDF from a URL and return its contents. + + This function makes sure the PDF is not downloaded too often. + This function is a last resort for PDF files where we are uncertain if + we may add it for testing purposes to https://github.com/py-pdf/sample-files + + URL: location of the PDF file + name: unique name accross all files + """ + cache_dir = os.path.join(os.path.dirname(__file__), "pdf_cache") + if not os.path.exists(cache_dir): + os.mkdir(cache_dir) + cache_path = os.path.join(cache_dir, name) + if not os.path.exists(cache_path): + with urllib.request.urlopen(url) as response, open( + cache_path, "wb" + ) as out_file: + out_file.write(response.read()) + with open(cache_path, "rb") as fp: + data = fp.read() + return data diff --git a/tests/test_page.py b/tests/test_page.py index 273bbe649..e996a32b5 100644 --- a/tests/test_page.py +++ b/tests/test_page.py @@ -1,6 +1,5 @@ import json import os -import urllib.request from copy import deepcopy from io import BytesIO @@ -10,6 +9,7 @@ from PyPDF2._page import PageObject from PyPDF2.constants import PageAttributes as PG from PyPDF2.generic import DictionaryObject, NameObject, RectangleObject +from tests import get_pdf_from_url TESTS_ROOT = os.path.abspath(os.path.dirname(__file__)) PROJECT_ROOT = os.path.dirname(TESTS_ROOT) @@ -64,7 +64,7 @@ def test_page_operations(pdf_path, password): output is as expected. """ if pdf_path.startswith("http"): - pdf_path = BytesIO(urllib.request.urlopen(pdf_path).read()) + pdf_path = BytesIO(get_pdf_from_url(pdf_path, pdf_path.split("/")[-1])) else: pdf_path = os.path.join(RESOURCE_ROOT, pdf_path) reader = PdfReader(pdf_path) diff --git a/tests/test_reader.py b/tests/test_reader.py index bfc4bf27e..c0b69f37d 100644 --- a/tests/test_reader.py +++ b/tests/test_reader.py @@ -1,7 +1,6 @@ import io import os import time -import urllib.request from io import BytesIO import pytest @@ -13,6 +12,7 @@ from PyPDF2.constants import Ressources as RES from PyPDF2.errors import PdfReadError, PdfReadWarning from PyPDF2.filters import _xobj_to_image +from tests import get_pdf_from_url TESTS_ROOT = os.path.abspath(os.path.dirname(__file__)) PROJECT_ROOT = os.path.dirname(TESTS_ROOT) @@ -554,7 +554,7 @@ def test_do_not_get_stuck_on_large_files_without_start_xref(): assert parse_duration < 60 -def test_PdfReader_decrypt_when_no_id(): +def test_decrypt_when_no_id(): """ Decrypt an encrypted file that's missing the 'ID' value in its trailer. 
@@ -638,7 +638,7 @@ def test_decode_permissions(): assert reader.decode_permissions(8) == modify -def test_VirtualList(): +def test_pages_attribute(): pdf_path = os.path.join(RESOURCE_ROOT, "crazyones.pdf") reader = PdfReader(pdf_path) @@ -670,7 +670,7 @@ def test_convertToInt_deprecated(): def test_iss925(): url = "https://github.com/py-pdf/PyPDF2/files/8796328/1.pdf" - reader = PdfReader(BytesIO(urllib.request.urlopen(url).read())) + reader = PdfReader(BytesIO(get_pdf_from_url(url, name="iss925.pdf"))) for page_sliced in reader.pages: page_object = page_sliced.get_object() diff --git a/tests/test_workflows.py b/tests/test_workflows.py index 944e12cdc..680a00af3 100644 --- a/tests/test_workflows.py +++ b/tests/test_workflows.py @@ -1,13 +1,13 @@ import binascii import os import sys -import urllib.request from io import BytesIO import pytest from PyPDF2 import PdfReader from PyPDF2.constants import PageAttributes as PG +from tests import get_pdf_from_url TESTS_ROOT = os.path.abspath(os.path.dirname(__file__)) PROJECT_ROOT = os.path.dirname(TESTS_ROOT) @@ -83,8 +83,20 @@ def test_decrypt(): "/Creator": "Writer", "/Producer": "LibreOffice 6.4", } - # Is extract_text() broken for encrypted files? - # assert reader.pages[0].extract_text().replace('\n', '') == "\n˘\n\u02c7\u02c6˙\n\n\n˘\u02c7\u02c6˙\n\n" + + +def test_text_extraction_encrypted(): + inputfile = os.path.join(RESOURCE_ROOT, "libreoffice-writer-password.pdf") + reader = PdfReader(inputfile) + assert reader.is_encrypted is True + reader.decrypt("openpassword") + assert ( + reader.pages[0] + .extract_text() + .replace("\n", "") + .strip() + .startswith("Lorem ipsum dolor sit amet") + ) @pytest.mark.parametrize("degree", [0, 90, 180, 270, 360, -90]) @@ -132,7 +144,7 @@ def test_rotate_45(): def test_extract_textbench(enable, url, pages, print_result=False): if not enable: return - reader = PdfReader(BytesIO(urllib.request.urlopen(url).read())) + reader = PdfReader(BytesIO(get_pdf_from_url(url, url.split("/")[-1]))) for page_number in pages: if print_result: print(f"**************** {url} / page {page_number} ****************") From a4127acef7d98200cc584bf974f72a515b885f57 Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Mon, 6 Jun 2022 10:38:53 +0200 Subject: [PATCH 08/18] Minor flake8 issues --- .flake8 | 4 ++-- tests/bench.py | 4 ++-- tests/test_generic.py | 9 ++------- tests/test_utils.py | 27 +++++++++++++++++---------- 4 files changed, 23 insertions(+), 21 deletions(-) diff --git a/.flake8 b/.flake8 index c56c75ff6..0a93c97d6 100644 --- a/.flake8 +++ b/.flake8 @@ -1,7 +1,7 @@ [flake8] # The flake8 config should work well with black, # see https://black.readthedocs.io/en/stable/guides/using_black_with_other_tools.html#flake8 -ignore = E203,E501,E741,W503,W604,N817,N814 +ignore = E203,E501,E741,W503,W604,N817,N814,VNE001,VNE002,N802 exclude = build,sample-files per-file-ignores = - tests/*: ASS001 + tests/*: ASS001,PT011 diff --git a/tests/bench.py b/tests/bench.py index d10fb767c..d8f526ed9 100644 --- a/tests/bench.py +++ b/tests/bench.py @@ -129,5 +129,5 @@ def text_extraction(pdf_path): @pytest.mark.filterwarnings("ignore::PyPDF2.errors.PdfReadWarning") def test_text_extraction(benchmark): - file = os.path.join(SAMPLE_ROOT, "009-pdflatex-geotopo/GeoTopo.pdf") - benchmark(text_extraction, file) + file_path = os.path.join(SAMPLE_ROOT, "009-pdflatex-geotopo/GeoTopo.pdf") + benchmark(text_extraction, file_path) diff --git a/tests/test_generic.py b/tests/test_generic.py index 65ec0682b..33cdc5cfe 100644 --- a/tests/test_generic.py 
+++ b/tests/test_generic.py @@ -309,10 +309,10 @@ def test_DictionaryObject_read_from_stream_stream_no_newline(): def test_DictionaryObject_read_from_stream_stream_no_stream_length(strict): stream = BytesIO(b"<< /S /GoTo >>stream\n") - class tst: # to replace pdf + class Tst: # to replace pdf strict = False - pdf = tst() + pdf = Tst() pdf.strict = strict with pytest.raises(PdfReadError) as exc: DictionaryObject.read_from_stream(stream, pdf) @@ -399,11 +399,6 @@ def test_remove_child_in_tree(): writer.add_page(reader.pages[0]) writer.add_bookmark("foo", 0) obj = writer._objects[-1] - # print(dict) - # print(type(dict)) - # for obj in writer._objects: - # print(obj) - # print(type(obj)) tree.add_child(obj, writer) tree.remove_child(obj) tree.add_child(obj, writer) diff --git a/tests/test_utils.py b/tests/test_utils.py index 5d2393233..321883e2f 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -4,6 +4,14 @@ import pytest import PyPDF2._utils +from PyPDF2._utils import ( + mark_location, + matrix_multiply, + read_until_regex, + read_until_whitespace, + skip_over_comment, + skip_over_whitespace, +) from PyPDF2.errors import PdfStreamError TESTS_ROOT = os.path.abspath(os.path.dirname(__file__)) @@ -23,11 +31,11 @@ ], ) def test_skip_over_whitespace(stream, expected): - assert PyPDF2._utils.skip_over_whitespace(stream) == expected + assert skip_over_whitespace(stream) == expected def test_read_until_whitespace(): - assert PyPDF2._utils.read_until_whitespace(io.BytesIO(b"foo"), maxchars=1) == b"f" + assert read_until_whitespace(io.BytesIO(b"foo"), maxchars=1) == b"f" @pytest.mark.parametrize( @@ -40,7 +48,7 @@ def test_read_until_whitespace(): ], ) def test_skip_over_comment(stream, remainder): - PyPDF2._utils.skip_over_comment(stream) + skip_over_comment(stream) assert stream.read() == remainder @@ -49,7 +57,7 @@ def test_read_until_regex_premature_ending_raise(): stream = io.BytesIO(b"") with pytest.raises(PdfStreamError) as exc: - PyPDF2._utils.read_until_regex(stream, re.compile(b".")) + read_until_regex(stream, re.compile(b".")) assert exc.value.args[0] == "Stream has ended unexpectedly" @@ -57,9 +65,7 @@ def test_read_until_regex_premature_ending_name(): import re stream = io.BytesIO(b"") - assert ( - PyPDF2._utils.read_until_regex(stream, re.compile(b"."), ignore_eof=True) == b"" - ) + assert read_until_regex(stream, re.compile(b"."), ignore_eof=True) == b"" @pytest.mark.parametrize( @@ -71,12 +77,12 @@ def test_read_until_regex_premature_ending_name(): ], ) def test_matrix_multiply(a, b, expected): - assert PyPDF2._utils.matrix_multiply(a, b) == expected + assert matrix_multiply(a, b) == expected def test_mark_location(): stream = io.BytesIO(b"abde" * 6000) - PyPDF2._utils.mark_location(stream) + mark_location(stream) os.remove("PyPDF2_pdfLocation.txt") # cleanup @@ -94,4 +100,5 @@ def test_b(): def test_deprecate_no_replacement(): with pytest.raises(PendingDeprecationWarning) as exc: PyPDF2._utils.deprecate_no_replacement("foo") - assert exc.value.args[0] == "foo is deprecated and will be removed in PyPDF2 3.0.0." + error_msg = "foo is deprecated and will be removed in PyPDF2 3.0.0." 
+ assert exc.value.args[0] == error_msg From 3ae9943cb3775bf80179c132724be5902443e680 Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Mon, 6 Jun 2022 11:07:47 +0200 Subject: [PATCH 09/18] Move buildCharMap to its own function - makes it easier to read --- PyPDF2/_page.py | 328 ++++++++++++++++++++++++------------------------ 1 file changed, 163 insertions(+), 165 deletions(-) diff --git a/PyPDF2/_page.py b/PyPDF2/_page.py index 8b3367e1b..96bcb768b 100644 --- a/PyPDF2/_page.py +++ b/PyPDF2/_page.py @@ -1116,170 +1116,6 @@ def _extract_text( default = "/Content" :return: a string object. """ - # code freely inspired from @twiggy ; see #711 - def buildCharMap(font_name: str) -> Tuple[str, float, Dict, Dict]: - map_dict: Any = {} - process_rg: bool = False - process_char: bool = False - encoding: List[str] = [] - ft: DictionaryObject = obj["/Resources"]["/Font"][font_name] # type: ignore - font_type: str = cast(str, ft["/Subtype"]) - sp_width: float = space_width * 2 # default value - w = [] - # encoding - space_code = 32 - if "/Encoding" in ft: - enc: Union(str, DictionaryObject) = ft["/Encoding"].get_object() # type: ignore - if isinstance(enc, str): - try: - if enc in ("/Identity-H", "/Identity-V"): - encoding = [] - else: - encoding = charset_encoding[enc].copy() - except Exception: - warnings.warn( - f"Advanced encoding {encoding} not implemented yet", - PdfReadWarning, - ) - encoding = charset_encoding["/StandardCoding"].copy() - elif isinstance(enc, DictionaryObject) and "/BaseEncoding" in enc: - try: - encoding = charset_encoding[ - cast(str, enc["/BaseEncoding"]) - ].copy() - except Exception: - warnings.warn( - f"Advanced encoding {encoding} not implemented yet", - PdfReadWarning, - ) - encoding = charset_encoding["/StandardCoding"].copy() - else: - encoding = charset_encoding["/StandardCoding"].copy() - if "/Differences" in enc: - x = 0 - for o in cast( - DictionaryObject, cast(DictionaryObject, enc)["/Differences"] - ): - if isinstance(o, int): - x = o - else: - try: - encoding[x] = adobe_glyphs[o] - except Exception: - encoding[x] = o - if o == " ": - space_code = x - x += 1 - if "/ToUnicode" in ft: - cm: str = ( - cast(DecodedStreamObject, ft["/ToUnicode"]) - .get_data() - .decode("utf-8") - ) - for l in ( - cm.strip() - .replace("<", " ") - .replace(">", "") - .replace("[", " [ ") - .replace("]", " ] ") - .split("\n") - ): - if l == "": - continue - if "beginbfrange" in l: - process_rg = True - elif "endbfrange" in l: - process_rg = False - elif "beginbfchar" in l: - process_char = True - elif "endbfchar" in l: - process_char = False - elif process_rg: - lst = [x for x in l.split(" ") if x] - a = int(lst[0], 16) - b = int(lst[1], 16) - if lst[2] == "[": - # lst = lst[3:].trim(' []').split(' ') - for sq in lst[3:]: - if "]": - break - map_dict[a] = unhexlify(sq).decode("utf-16-be") - a += 1 - assert a > b - else: - c = int(lst[2], 16) - fmt = b"%%0%dX" % len(lst[2]) - while a <= b: - map_dict[a] = unhexlify(fmt % c).decode("utf-16-be") - a += 1 - c += 1 - elif process_char: - lst = [x for x in l.split(" ") if x] - a = int(lst[0], 16) - map_dict[a] = unhexlify("".join(lst[1:])).decode( - "utf-16-be" - ) # join is here as some cases where the code was split - - # get - for a in map_dict: - if map_dict[a] == " ": - space_code = a - - # compute space width - st: int = 0 # declaration for mypy - if "/W" in ft: - if "/DW" in ft: - sp_width = cast(float, ft["/DW"]) - w = [x for x in ft["/W"]] # type: ignore - while len(w) > 0: - st = w[0] - second = w[1] - if isinstance(int, second): 
- if st <= space_code and space_code <= second: - sp_width = w[2] - break - w = w[3:] - if isinstance(list, second): - if st <= space_code and space_code <= st + len(second) - 1: - sp_width = second[space_code - st] - w = w[2:] - else: - warnings.warn( - "unknown widths : \n" + (ft["/W"]).__repr__(), - PdfReadWarning, - ) - break - if "/Widths" in ft: - w = [x for x in ft["/Widths"]] # type: ignore - try: - st = cast(int, ft["/FirstChar"]) - en: int = cast(int, ft["/LastChar"]) - if st > space_code or en < space_code: - raise Exception("Not in range") - if w[space_code - st] == 0: - raise Exception("null width") - sp_width = w[space_code - st] - except Exception: - if "/FontDescriptor" in ft and "/MissingWidth" in cast( - DictionaryObject, ft["/FontDescriptor"] - ): - sp_width = ft["/FontDescriptor"]["/MissingWidth"] # type: ignore - else: - # will consider width of char as avg(width)/2 - m = 0 - cpt = 0 - for x in w: - if x > 0: - m += x - cpt += 1 - sp_width = m / max(1, cpt) / 2 - - return ( - font_type, - float(sp_width / 2), - dict(zip(range(256), encoding)), - "".maketrans(map_dict), - ) text: str = "" output: str = "" @@ -1287,7 +1123,7 @@ def buildCharMap(font_name: str) -> Tuple[str, float, Dict, Dict]: resources_dict = cast(DictionaryObject, obj["/Resources"]) if "/Font" in resources_dict: for f in cast(DictionaryObject, resources_dict["/Font"]): - cmaps[f] = buildCharMap(f) + cmaps[f] = _build_char_map(f, space_width, obj) cmap: Union[str, Dict[int, str]] = {} content = obj[content_key].get_object() if isinstance(content_key, str) else obj if not isinstance(content, ContentStream): @@ -1600,3 +1436,165 @@ def __getitem__(self, index: int) -> PageObject: def __iter__(self) -> Iterator[PageObject]: for i in range(len(self)): yield self[i] + + +# code freely inspired from @twiggy ; see #711 +def _build_char_map( + font_name: str, space_width: float, obj: DictionaryObject +) -> Tuple[str, float, Dict, Dict]: + map_dict: Any = {} + process_rg: bool = False + process_char: bool = False + encoding: List[str] = [] + ft: DictionaryObject = obj["/Resources"]["/Font"][font_name] # type: ignore + font_type: str = cast(str, ft["/Subtype"]) + sp_width: float = space_width * 2 # default value + w = [] + # encoding + space_code = 32 + if "/Encoding" in ft: + enc: Union(str, DictionaryObject) = ft["/Encoding"].get_object() # type: ignore + if isinstance(enc, str): + try: + if enc in ("/Identity-H", "/Identity-V"): + encoding = [] + else: + encoding = charset_encoding[enc].copy() + except Exception: + warnings.warn( + f"Advanced encoding {encoding} not implemented yet", + PdfReadWarning, + ) + encoding = charset_encoding["/StandardCoding"].copy() + elif isinstance(enc, DictionaryObject) and "/BaseEncoding" in enc: + try: + encoding = charset_encoding[cast(str, enc["/BaseEncoding"])].copy() + except Exception: + warnings.warn( + f"Advanced encoding {encoding} not implemented yet", + PdfReadWarning, + ) + encoding = charset_encoding["/StandardCoding"].copy() + else: + encoding = charset_encoding["/StandardCoding"].copy() + if "/Differences" in enc: + x = 0 + for o in cast( + DictionaryObject, cast(DictionaryObject, enc)["/Differences"] + ): + if isinstance(o, int): + x = o + else: + try: + encoding[x] = adobe_glyphs[o] + except Exception: + encoding[x] = o + if o == " ": + space_code = x + x += 1 + if "/ToUnicode" in ft: + cm: str = cast(DecodedStreamObject, ft["/ToUnicode"]).get_data().decode("utf-8") + for l in ( + cm.strip() + .replace("<", " ") + .replace(">", "") + .replace("[", " [ ") + 
.replace("]", " ] ") + .split("\n") + ): + if l == "": + continue + if "beginbfrange" in l: + process_rg = True + elif "endbfrange" in l: + process_rg = False + elif "beginbfchar" in l: + process_char = True + elif "endbfchar" in l: + process_char = False + elif process_rg: + lst = [x for x in l.split(" ") if x] + a = int(lst[0], 16) + b = int(lst[1], 16) + if lst[2] == "[": + # lst = lst[3:].trim(' []').split(' ') + for sq in lst[3:]: + if "]": + break + map_dict[a] = unhexlify(sq).decode("utf-16-be") + a += 1 + assert a > b + else: + c = int(lst[2], 16) + fmt = b"%%0%dX" % len(lst[2]) + while a <= b: + map_dict[a] = unhexlify(fmt % c).decode("utf-16-be") + a += 1 + c += 1 + elif process_char: + lst = [x for x in l.split(" ") if x] + a = int(lst[0], 16) + map_dict[a] = unhexlify("".join(lst[1:])).decode( + "utf-16-be" + ) # join is here as some cases where the code was split + + # get + for a in map_dict: + if map_dict[a] == " ": + space_code = a + + # compute space width + st: int = 0 # declaration for mypy + if "/W" in ft: + if "/DW" in ft: + sp_width = cast(float, ft["/DW"]) + w = [x for x in ft["/W"]] # type: ignore + while len(w) > 0: + st = w[0] + second = w[1] + if isinstance(int, second): + if st <= space_code and space_code <= second: + sp_width = w[2] + break + w = w[3:] + if isinstance(list, second): + if st <= space_code and space_code <= st + len(second) - 1: + sp_width = second[space_code - st] + w = w[2:] + else: + warnings.warn( + "unknown widths : \n" + (ft["/W"]).__repr__(), + PdfReadWarning, + ) + break + if "/Widths" in ft: + w = [x for x in ft["/Widths"]] # type: ignore + try: + st = cast(int, ft["/FirstChar"]) + en: int = cast(int, ft["/LastChar"]) + if st > space_code or en < space_code: + raise Exception("Not in range") + if w[space_code - st] == 0: + raise Exception("null width") + sp_width = w[space_code - st] + except Exception: + if "/FontDescriptor" in ft and "/MissingWidth" in cast( + DictionaryObject, ft["/FontDescriptor"] + ): + sp_width = ft["/FontDescriptor"]["/MissingWidth"] # type: ignore + else: + # will consider width of char as avg(width)/2 + m = 0 + cpt = 0 + for x in w: + if x > 0: + m += x + cpt += 1 + sp_width = m / max(1, cpt) / 2 + + return ( + font_type, + float(sp_width / 2), + dict(zip(range(256), encoding)), + "".maketrans(map_dict), + ) From af00e73446fef216d8885bdf4f8dd5298cb84a9a Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Mon, 6 Jun 2022 11:08:48 +0200 Subject: [PATCH 10/18] Minor test improvements --- tests/__init__.py | 4 ++-- tests/test_filters.py | 6 +++--- tests/test_generic.py | 5 ++--- tests/test_javascript.py | 2 +- tests/test_page.py | 1 - tests/test_reader.py | 42 ++++++++++++++++------------------------ 6 files changed, 25 insertions(+), 35 deletions(-) diff --git a/tests/__init__.py b/tests/__init__.py index 5c4b01d5f..3ec9b8785 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -10,8 +10,8 @@ def get_pdf_from_url(url: str, name: str) -> bytes: This function is a last resort for PDF files where we are uncertain if we may add it for testing purposes to https://github.com/py-pdf/sample-files - URL: location of the PDF file - name: unique name accross all files + :param str url: location of the PDF file + :param str name: unique name accross all files """ cache_dir = os.path.join(os.path.dirname(__file__), "pdf_cache") if not os.path.exists(cache_dir): diff --git a/tests/test_filters.py b/tests/test_filters.py index db9f8078a..b2b6e1be0 100644 --- a/tests/test_filters.py +++ b/tests/test_filters.py @@ -54,7 +54,7 
@@ def test_FlateDecode_unsupported_predictor(): @pytest.mark.parametrize( - ("input", "expected"), + ("data", "expected"), [ (">", ""), ( @@ -88,7 +88,7 @@ def test_FlateDecode_unsupported_predictor(): "whitespace", ], ) -def test_ASCIIHexDecode(input, expected): +def test_ASCIIHexDecode(data, expected): """ Feeds a bunch of values to ASCIIHexDecode.decode() and ensures the correct output is returned. @@ -97,7 +97,7 @@ def test_ASCIIHexDecode(input, expected): is currently raised.) """ - assert ASCIIHexDecode.decode(input) == expected + assert ASCIIHexDecode.decode(data) == expected def test_ASCIIHexDecode_no_eod(): diff --git a/tests/test_generic.py b/tests/test_generic.py index 33cdc5cfe..41fd8ae79 100644 --- a/tests/test_generic.py +++ b/tests/test_generic.py @@ -137,7 +137,6 @@ def test_readStringFromStream_not_in_escapedict_no_digit(): with pytest.raises(PdfReadError) as exc: readStringFromStream(stream) assert exc.value.args[0] == "Stream has ended unexpectedly" - # "Unexpected escaped string: y" def test_readStringFromStream_multichar_eol(): @@ -334,10 +333,10 @@ def test_DictionaryObject_read_from_stream_stream_stream_valid( ): stream = BytesIO(b"<< /S /GoTo /Length %d >>stream\nBT /F1\nendstream\n" % length) - class tst: # to replace pdf + class Tst: # to replace pdf strict = True - pdf = tst() + pdf = Tst() pdf.strict = strict with pytest.raises(PdfReadError) as exc: do = DictionaryObject.read_from_stream(stream, pdf) diff --git a/tests/test_javascript.py b/tests/test_javascript.py index 3376fd862..83e08ff21 100644 --- a/tests/test_javascript.py +++ b/tests/test_javascript.py @@ -15,7 +15,7 @@ def pdf_file_writer(): reader = PdfReader(os.path.join(RESOURCE_ROOT, "crazyones.pdf")) writer = PdfWriter() writer.append_pages_from_reader(reader) - yield writer + return writer def test_add_js(pdf_file_writer): diff --git a/tests/test_page.py b/tests/test_page.py index e996a32b5..6ac28c762 100644 --- a/tests/test_page.py +++ b/tests/test_page.py @@ -45,7 +45,6 @@ def test_read(meta): [ ("crazyones.pdf", None), ("attachment.pdf", None), - # ("side-by-side-subfig.pdf", None), ( "libreoffice-writer-password.pdf", "openpassword", diff --git a/tests/test_reader.py b/tests/test_reader.py index c0b69f37d..6270dabef 100644 --- a/tests/test_reader.py +++ b/tests/test_reader.py @@ -160,11 +160,11 @@ def test_get_images(src, nb_images): images_extracted = [] if RES.XOBJECT in page[PG.RESOURCES]: - xObject = page[PG.RESOURCES][RES.XOBJECT].get_object() + x_object = page[PG.RESOURCES][RES.XOBJECT].get_object() - for obj in xObject: - if xObject[obj][IA.SUBTYPE] == "/Image": - extension, byte_stream = _xobj_to_image(xObject[obj]) + for obj in x_object: + if x_object[obj][IA.SUBTYPE] == "/Image": + extension, byte_stream = _xobj_to_image(x_object[obj]) if extension is not None: filename = obj[1:] + ".png" with open(filename, "wb") as img: @@ -229,9 +229,8 @@ def test_get_images_raw(strict, with_prev_0, startx_correction, should_fail): ) pdf_stream = io.BytesIO(pdf_data) if should_fail: - with pytest.raises(PdfReadError) as exc: - with pytest.warns(PdfReadWarning): - PdfReader(pdf_stream, strict=strict) + with pytest.raises(PdfReadError) as exc, pytest.warns(PdfReadWarning): + PdfReader(pdf_stream, strict=strict) assert exc.type == PdfReadError if startx_correction == -1: assert ( @@ -245,9 +244,8 @@ def test_get_images_raw(strict, with_prev_0, startx_correction, should_fail): def test_issue297(): path = os.path.join(RESOURCE_ROOT, "issue-297.pdf") - with pytest.raises(PdfReadError) as exc: - with 
pytest.warns(PdfReadWarning): - reader = PdfReader(path, strict=True) + with pytest.raises(PdfReadError) as exc, pytest.warns(PdfReadWarning): + reader = PdfReader(path, strict=True) assert "Broken xref table" in exc.value.args[0] with pytest.warns(PdfReadWarning): reader = PdfReader(path, strict=False) @@ -437,9 +435,8 @@ def test_read_prev_0_trailer(): pdf_data.find(b"xref") - 1, ) pdf_stream = io.BytesIO(pdf_data) - with pytest.raises(PdfReadError) as exc: - with pytest.warns(PdfReadWarning): - PdfReader(pdf_stream, strict=True) + with pytest.raises(PdfReadError) as exc, pytest.warns(PdfReadWarning): + PdfReader(pdf_stream, strict=True) assert exc.value.args[0] == "/Prev=0 in the trailer (try opening with strict=False)" @@ -511,16 +508,14 @@ def test_read_unknown_zero_pages(): pdf_stream = io.BytesIO(pdf_data) with pytest.warns(PdfReadWarning): reader = PdfReader(pdf_stream, strict=True) - with pytest.raises(PdfReadError) as exc: - with pytest.warns(PdfReadWarning): - len(reader.pages) + with pytest.raises(PdfReadError) as exc, pytest.warns(PdfReadWarning): + len(reader.pages) assert exc.value.args[0] == "Could not find object." with pytest.warns(PdfReadWarning): reader = PdfReader(pdf_stream, strict=False) - with pytest.raises(AttributeError) as exc: - with pytest.warns(PdfReadWarning): - len(reader.pages) + with pytest.raises(AttributeError) as exc, pytest.warns(PdfReadWarning): + len(reader.pages) assert exc.value.args[0] == "'NoneType' object has no attribute 'get_object'" @@ -588,10 +583,9 @@ def test_issue604(strict): pdf = None bookmarks = None if strict: - with pytest.raises(PdfReadError) as exc: - pdf = PdfReader(f, strict=strict) - with pytest.warns(PdfReadWarning): - bookmarks = pdf._get_outlines() + pdf = PdfReader(f, strict=strict) + with pytest.raises(PdfReadError) as exc, pytest.warns(PdfReadWarning): + bookmarks = pdf._get_outlines() if "Unknown Destination" not in exc.value.args[0]: raise Exception("Expected exception not raised") return # bookmarks not correct @@ -601,7 +595,6 @@ def test_issue604(strict): bookmarks = pdf._get_outlines() def get_dest_pages(x): - # print(x) if isinstance(x, list): r = [get_dest_pages(y) for y in x] return r @@ -613,7 +606,6 @@ def get_dest_pages(x): b ) in bookmarks: # b can be destination or a list:preferred to just print them out.append(get_dest_pages(b)) - # print(out) def test_decode_permissions(): From 55e241a6fea94e8ad9f083c8aef336a4f2014326 Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Mon, 6 Jun 2022 11:10:03 +0200 Subject: [PATCH 11/18] PEP8 fixes --- .flake8 | 2 +- PyPDF2/_merger.py | 10 ++++---- PyPDF2/_reader.py | 4 ++-- PyPDF2/filters.py | 59 ++++++++++++++++++++++++++--------------------- PyPDF2/generic.py | 14 +++++------ PyPDF2/xmp.py | 6 +++++ 6 files changed, 54 insertions(+), 41 deletions(-) diff --git a/.flake8 b/.flake8 index 0a93c97d6..4799bab0c 100644 --- a/.flake8 +++ b/.flake8 @@ -1,7 +1,7 @@ [flake8] # The flake8 config should work well with black, # see https://black.readthedocs.io/en/stable/guides/using_black_with_other_tools.html#flake8 -ignore = E203,E501,E741,W503,W604,N817,N814,VNE001,VNE002,N802 +ignore = E203,E501,E741,W503,W604,N817,N814,VNE001,VNE002,N802,SIM105 exclude = build,sample-files per-file-ignores = tests/*: ASS001,PT011 diff --git a/PyPDF2/_merger.py b/PyPDF2/_merger.py index 64b53c001..abf56d86b 100644 --- a/PyPDF2/_merger.py +++ b/PyPDF2/_merger.py @@ -703,13 +703,13 @@ def add_bookmark( {NameObject("/C"): ArrayObject([FloatObject(c) for c in color])} ) - format = 0 + format_flag = 0 
if italic: - format += 1 + format_flag += 1 if bold: - format += 2 - if format: - bookmark.update({NameObject("/F"): NumberObject(format)}) + format_flag += 2 + if format_flag: + bookmark.update({NameObject("/F"): NumberObject(format_flag)}) bookmark_ref = self.output._add_object(bookmark) parent = cast(Bookmark, parent.get_object()) diff --git a/PyPDF2/_reader.py b/PyPDF2/_reader.py index 8b345d4fc..38052f02e 100644 --- a/PyPDF2/_reader.py +++ b/PyPDF2/_reader.py @@ -913,7 +913,7 @@ def _flatten( inherit: Optional[Dict[str, Any]] = None, indirect_ref: Optional[IndirectObject] = None, ) -> None: - inheritablePageAttributes = ( + inheritable_page_attributes = ( NameObject(PG.RESOURCES), NameObject(PG.MEDIABOX), NameObject(PG.CROPBOX), @@ -933,7 +933,7 @@ def _flatten( t = pages[PA.TYPE] # type: ignore if t == "/Pages": - for attr in inheritablePageAttributes: + for attr in inheritable_page_attributes: if attr in pages: inherit[attr] = pages[attr] for page in pages[PA.KIDS]: # type: ignore diff --git a/PyPDF2/filters.py b/PyPDF2/filters.py index 6bf4619c3..ca21f4aec 100644 --- a/PyPDF2/filters.py +++ b/PyPDF2/filters.py @@ -76,7 +76,9 @@ def compress(data: bytes) -> bytes: class FlateDecode: @staticmethod def decode( - data: bytes, decodeParms: Union[None, ArrayObject, DictionaryObject] + # TODO: PEP8 + data: bytes, + decodeParms: Union[None, ArrayObject, DictionaryObject], ) -> bytes: """ :param data: flate-encoded data. @@ -90,9 +92,9 @@ def decode( if decodeParms: try: if isinstance(decodeParms, ArrayObject): - for decodeParm in decodeParms: - if "/Predictor" in decodeParm: - predictor = decodeParm["/Predictor"] + for decode_parm in decodeParms: + if "/Predictor" in decode_parm: + predictor = decode_parm["/Predictor"] else: predictor = decodeParms.get("/Predictor", 1) except AttributeError: @@ -103,9 +105,9 @@ def decode( # §7.4.4.3 LZWDecode and FlateDecode Parameters, Table 8 if isinstance(decodeParms, ArrayObject): columns = 1 - for decodeParm in decodeParms: - if "/Columns" in decodeParm: - columns = decodeParm["/Columns"] + for decode_parm in decodeParms: + if "/Columns" in decode_parm: + columns = decode_parm["/Columns"] else: columns = 1 if decodeParms is None else decodeParms.get(LZW.COLUMNS, 1) @@ -169,7 +171,9 @@ class ASCIIHexDecode: @staticmethod def decode( - data: str, decodeParms: Union[None, ArrayObject, DictionaryObject] = None + # TODO: PEP8 + data: str, + decodeParms: Union[None, ArrayObject, DictionaryObject] = None, ) -> str: """ :param data: a str sequence of hexadecimal-encoded values to be @@ -285,7 +289,9 @@ def decode(self) -> str: @staticmethod def decode( - data: bytes, decodeParms: Union[None, ArrayObject, DictionaryObject] = None + # TODO: PEP8 + data: bytes, + decodeParms: Union[None, ArrayObject, DictionaryObject] = None, ) -> str: """ :param data: ``bytes`` or ``str`` text to decode. 
@@ -384,11 +390,11 @@ def _get_parameters( columns = 0 if parameters: if isinstance(parameters, ArrayObject): - for decodeParm in parameters: - if CCITT.COLUMNS in decodeParm: - columns = decodeParm[CCITT.COLUMNS] - if CCITT.K in decodeParm: - k = decodeParm[CCITT.K] + for decode_parm in parameters: + if CCITT.COLUMNS in decode_parm: + columns = decode_parm[CCITT.COLUMNS] + if CCITT.K in decode_parm: + k = decode_parm[CCITT.K] else: columns = parameters[CCITT.COLUMNS] # type: ignore k = parameters[CCITT.K] # type: ignore @@ -398,6 +404,7 @@ def _get_parameters( @staticmethod def decode( data: bytes, + # TODO: PEP8 decodeParms: Union[None, ArrayObject, DictionaryObject] = None, height: int = 0, ) -> bytes: @@ -460,25 +467,25 @@ def decodeStreamData(stream: Any) -> Union[str, bytes]: # utils.StreamObject data: bytes = stream._data # If there is not data to decode we should not try to decode the data. if data: - for filterType in filters: - if filterType == FT.FLATE_DECODE or filterType == FTA.FL: + for filter_type in filters: + if filter_type == FT.FLATE_DECODE or filter_type == FTA.FL: data = FlateDecode.decode(data, stream.get(SA.DECODE_PARMS)) - elif filterType == FT.ASCII_HEX_DECODE or filterType == FTA.AHx: + elif filter_type == FT.ASCII_HEX_DECODE or filter_type == FTA.AHx: data = ASCIIHexDecode.decode(data) # type: ignore - elif filterType == FT.LZW_DECODE or filterType == FTA.LZW: + elif filter_type == FT.LZW_DECODE or filter_type == FTA.LZW: data = LZWDecode.decode(data, stream.get(SA.DECODE_PARMS)) # type: ignore - elif filterType == FT.ASCII_85_DECODE or filterType == FTA.A85: + elif filter_type == FT.ASCII_85_DECODE or filter_type == FTA.A85: data = ASCII85Decode.decode(data) - elif filterType == FT.DCT_DECODE: + elif filter_type == FT.DCT_DECODE: data = DCTDecode.decode(data) - elif filterType == "/JPXDecode": + elif filter_type == "/JPXDecode": data = JPXDecode.decode(data) - elif filterType == FT.CCITT_FAX_DECODE: + elif filter_type == FT.CCITT_FAX_DECODE: height = stream.get(IA.HEIGHT, ()) data = CCITTFaxDecode.decode(data, stream.get(SA.DECODE_PARMS), height) - elif filterType == "/Crypt": - decodeParms = stream.get(SA.DECODE_PARMS, {}) - if "/Name" not in decodeParms and "/Type" not in decodeParms: + elif filter_type == "/Crypt": + decode_parms = stream.get(SA.DECODE_PARMS, {}) + if "/Name" not in decode_parms and "/Type" not in decode_parms: pass else: raise NotImplementedError( @@ -486,7 +493,7 @@ def decodeStreamData(stream: Any) -> Union[str, bytes]: # utils.StreamObject ) else: # Unsupported filter - raise NotImplementedError("unsupported filter %s" % filterType) + raise NotImplementedError("unsupported filter %s" % filter_type) return data diff --git a/PyPDF2/generic.py b/PyPDF2/generic.py index 5fb5ca8cf..e43375386 100644 --- a/PyPDF2/generic.py +++ b/PyPDF2/generic.py @@ -555,7 +555,7 @@ def writeToStream( class NameObject(str, PdfObject): - delimiterPattern = re.compile(b_(r"\s+|[\(\)<>\[\]{}/%]")) + delimiter_pattern = re.compile(b_(r"\s+|[\(\)<>\[\]{}/%]")) surfix = b_("/") def write_to_stream( @@ -574,7 +574,7 @@ def read_from_stream(stream: StreamType, pdf: Any) -> "NameObject": # PdfReader name = stream.read(1) if name != NameObject.surfix: raise PdfReadError("name read error") - name += read_until_regex(stream, NameObject.delimiterPattern, ignore_eof=True) + name += read_until_regex(stream, NameObject.delimiter_pattern, ignore_eof=True) try: try: ret = name.decode("utf-8") @@ -687,16 +687,16 @@ def read_from_stream( forced_encoding: Union[None, str, 
List[str], Dict[int, str]] = None, ) -> "DictionaryObject": def get_next_obj_pos( - p: int, p1: int, remGens: List[int], pdf: Any + p: int, p1: int, rem_gens: List[int], pdf: Any ) -> int: # PdfReader - l = pdf.xref[remGens[0]] + l = pdf.xref[rem_gens[0]] for o in l: if p1 > l[o] and p < l[o]: p1 = l[o] - if len(remGens) == 1: + if len(rem_gens) == 1: return p1 else: - return get_next_obj_pos(p, p1, remGens[1:], pdf) + return get_next_obj_pos(p, p1, rem_gens[1:], pdf) def read_unsized_from_steam(stream: StreamType, pdf: Any) -> bytes: # PdfReader # we are just pointing at beginning of the stream @@ -1129,7 +1129,7 @@ def __parseContentStream(self, stream: StreamType) -> None: break stream.seek(-1, 1) if peek.isalpha() or peek == b_("'") or peek == b_('"'): - operator = read_until_regex(stream, NameObject.delimiterPattern, True) + operator = read_until_regex(stream, NameObject.delimiter_pattern, True) if operator == b_("BI"): # begin inline image - a completely different parsing # mechanism is required, of course... thanks buddy... diff --git a/PyPDF2/xmp.py b/PyPDF2/xmp.py index 2364cac25..f2deef1e1 100644 --- a/PyPDF2/xmp.py +++ b/PyPDF2/xmp.py @@ -370,6 +370,7 @@ def _get_text(self, element: XmlElement) -> str: The name of the tool that created the PDF document. """ + # TODO: PEP8 xmp_createDate = property( _getter_single(XMP_NAMESPACE, "CreateDate", _converter_date) ) @@ -378,6 +379,7 @@ def _get_text(self, element: XmlElement) -> str: time are returned as a UTC datetime.datetime object. """ + # TODO: PEP8 xmp_modifyDate = property( _getter_single(XMP_NAMESPACE, "ModifyDate", _converter_date) ) @@ -386,6 +388,7 @@ def _get_text(self, element: XmlElement) -> str: are returned as a UTC datetime.datetime object. """ + # TODO: PEP8 xmp_metadataDate = property( _getter_single(XMP_NAMESPACE, "MetadataDate", _converter_date) ) @@ -395,16 +398,19 @@ def _get_text(self, element: XmlElement) -> str: object. """ + # TODO: PEP8 xmp_creatorTool = property(_getter_single(XMP_NAMESPACE, "CreatorTool")) """ The name of the first known tool used to create the resource. """ + # TODO: PEP8 xmpmm_documentId = property(_getter_single(XMPMM_NAMESPACE, "DocumentID")) """ The common identifier for all versions and renditions of this resource. """ + # TODO: PEP8 xmpmm_instanceId = property(_getter_single(XMPMM_NAMESPACE, "InstanceID")) """ An identifier for a specific incarnation of a document, updated each From 3d83dbb707ec8118c8a20f393f861b8456be63d1 Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Mon, 6 Jun 2022 11:18:27 +0200 Subject: [PATCH 12/18] Dont check old text extraction for coverage --- PyPDF2/_page.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/PyPDF2/_page.py b/PyPDF2/_page.py index 96bcb768b..10155b520 100644 --- a/PyPDF2/_page.py +++ b/PyPDF2/_page.py @@ -984,7 +984,9 @@ def compressContentStreams(self) -> None: # pragma: no cover deprecate_with_replacement("compressContentStreams", "compress_content_streams") self.compress_content_streams() - def _extract_text_old(self, Tj_sep: str = "", TJ_sep: str = "") -> str: + def _extract_text_old( + self, Tj_sep: str = "", TJ_sep: str = "" + ) -> str: # pragma: no cover """ Locate all text drawing commands, in the order they are provided in the content stream, and extract the text. 
This works well for some PDF From 24051194fb8d40551008a9559f26ae9a1ae35a7e Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Mon, 6 Jun 2022 11:25:24 +0200 Subject: [PATCH 13/18] add bookmark italic and bold --- tests/test_generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_generic.py b/tests/test_generic.py index 41fd8ae79..e51d21ef8 100644 --- a/tests/test_generic.py +++ b/tests/test_generic.py @@ -396,7 +396,7 @@ def test_remove_child_in_tree(): reader = PdfReader(pdf) writer = PdfWriter() writer.add_page(reader.pages[0]) - writer.add_bookmark("foo", 0) + writer.add_bookmark("foo", pagenum=0, italic=True, bold=True) obj = writer._objects[-1] tree.add_child(obj, writer) tree.remove_child(obj) From a5a776b1e1327b00625f53fffebaf7964b7b6221 Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Mon, 6 Jun 2022 11:42:26 +0200 Subject: [PATCH 14/18] merger add bookmark italic / bold --- tests/test_generic.py | 2 +- tests/test_merger.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_generic.py b/tests/test_generic.py index e51d21ef8..2bb36bb4b 100644 --- a/tests/test_generic.py +++ b/tests/test_generic.py @@ -396,7 +396,7 @@ def test_remove_child_in_tree(): reader = PdfReader(pdf) writer = PdfWriter() writer.add_page(reader.pages[0]) - writer.add_bookmark("foo", pagenum=0, italic=True, bold=True) + writer.add_bookmark("foo", pagenum=0) obj = writer._objects[-1] tree.add_child(obj, writer) tree.remove_child(obj) diff --git a/tests/test_merger.py b/tests/test_merger.py index 2d137fcad..12cba53c4 100644 --- a/tests/test_merger.py +++ b/tests/test_merger.py @@ -38,7 +38,7 @@ def test_merge(): merger.append(fh) bookmark = merger.add_bookmark("A bookmark", 0) - merger.add_bookmark("deeper", 0, parent=bookmark) + merger.add_bookmark("deeper", 0, parent=bookmark, italic=True, bold=True) merger.add_metadata({"author": "Martin Thoma"}) merger.add_named_destination("title", 0) merger.set_page_layout("/SinglePage") From 6957c00b6ccee070245648a0c4040b62075d1e82 Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Mon, 6 Jun 2022 11:52:50 +0200 Subject: [PATCH 15/18] Use new module _cmap --- PyPDF2/_cmap.py | 168 ++++++++++++++++++++++++++++++++++++++++++++ PyPDF2/_page.py | 167 +------------------------------------------ PyPDF2/pagerange.py | 1 - 3 files changed, 169 insertions(+), 167 deletions(-) create mode 100644 PyPDF2/_cmap.py diff --git a/PyPDF2/_cmap.py b/PyPDF2/_cmap.py new file mode 100644 index 000000000..c10e4bba2 --- /dev/null +++ b/PyPDF2/_cmap.py @@ -0,0 +1,168 @@ +import warnings +from binascii import unhexlify +from typing import Any, Dict, List, Tuple, Union, cast + +from ._adobe_glyphs import adobe_glyphs +from .errors import PdfReadWarning +from .generic import DecodedStreamObject, DictionaryObject, charset_encoding + + +# code freely inspired from @twiggy ; see #711 +def _build_char_map( + font_name: str, space_width: float, obj: DictionaryObject +) -> Tuple[str, float, Dict, Dict]: + map_dict: Any = {} + process_rg: bool = False + process_char: bool = False + encoding: List[str] = [] + ft: DictionaryObject = obj["/Resources"]["/Font"][font_name] # type: ignore + font_type: str = cast(str, ft["/Subtype"]) + sp_width: float = space_width * 2 # default value + w = [] + # encoding + space_code = 32 + if "/Encoding" in ft: + enc: Union(str, DictionaryObject) = ft["/Encoding"].get_object() # type: ignore + if isinstance(enc, str): + try: + if enc in ("/Identity-H", "/Identity-V"): + encoding = [] + else: + encoding = 
charset_encoding[enc].copy() + except Exception: + warnings.warn( + f"Advanced encoding {encoding} not implemented yet", + PdfReadWarning, + ) + encoding = charset_encoding["/StandardCoding"].copy() + elif isinstance(enc, DictionaryObject) and "/BaseEncoding" in enc: + try: + encoding = charset_encoding[cast(str, enc["/BaseEncoding"])].copy() + except Exception: + warnings.warn( + f"Advanced encoding {encoding} not implemented yet", + PdfReadWarning, + ) + encoding = charset_encoding["/StandardCoding"].copy() + else: + encoding = charset_encoding["/StandardCoding"].copy() + if "/Differences" in enc: + x = 0 + for o in cast( + DictionaryObject, cast(DictionaryObject, enc)["/Differences"] + ): + if isinstance(o, int): + x = o + else: + try: + encoding[x] = adobe_glyphs[o] + except Exception: + encoding[x] = o + if o == " ": + space_code = x + x += 1 + if "/ToUnicode" in ft: + cm: str = cast(DecodedStreamObject, ft["/ToUnicode"]).get_data().decode("utf-8") + for l in ( + cm.strip() + .replace("<", " ") + .replace(">", "") + .replace("[", " [ ") + .replace("]", " ] ") + .split("\n") + ): + if l == "": + continue + if "beginbfrange" in l: + process_rg = True + elif "endbfrange" in l: + process_rg = False + elif "beginbfchar" in l: + process_char = True + elif "endbfchar" in l: + process_char = False + elif process_rg: + lst = [x for x in l.split(" ") if x] + a = int(lst[0], 16) + b = int(lst[1], 16) + if lst[2] == "[": + for sq in lst[3:]: + if "]": + break + map_dict[a] = unhexlify(sq).decode("utf-16-be") + a += 1 + assert a > b + else: + c = int(lst[2], 16) + fmt = b"%%0%dX" % len(lst[2]) + while a <= b: + map_dict[a] = unhexlify(fmt % c).decode("utf-16-be") + a += 1 + c += 1 + elif process_char: + lst = [x for x in l.split(" ") if x] + a = int(lst[0], 16) + map_dict[a] = unhexlify("".join(lst[1:])).decode( + "utf-16-be" + ) # join is here as some cases where the code was split + + # get + for a in map_dict: + if map_dict[a] == " ": + space_code = a + + # compute space width + st: int = 0 # declaration for mypy + if "/W" in ft: + if "/DW" in ft: + sp_width = cast(float, ft["/DW"]) + w = list(ft["/W"]) # type: ignore + while len(w) > 0: + st = w[0] + second = w[1] + if isinstance(int, second): + if st <= space_code and space_code <= second: + sp_width = w[2] + break + w = w[3:] + if isinstance(list, second): + if st <= space_code and space_code <= st + len(second) - 1: + sp_width = second[space_code - st] + w = w[2:] + else: + warnings.warn( + "unknown widths : \n" + (ft["/W"]).__repr__(), + PdfReadWarning, + ) + break + if "/Widths" in ft: + w = list(ft["/Widths"]) # type: ignore + try: + st = cast(int, ft["/FirstChar"]) + en: int = cast(int, ft["/LastChar"]) + if st > space_code or en < space_code: + raise Exception("Not in range") + if w[space_code - st] == 0: + raise Exception("null width") + sp_width = w[space_code - st] + except Exception: + if "/FontDescriptor" in ft and "/MissingWidth" in cast( + DictionaryObject, ft["/FontDescriptor"] + ): + sp_width = ft["/FontDescriptor"]["/MissingWidth"] # type: ignore + else: + # will consider width of char as avg(width)/2 + m = 0 + cpt = 0 + for x in w: + if x > 0: + m += x + cpt += 1 + sp_width = m / max(1, cpt) / 2 + + return ( + font_type, + float(sp_width / 2), + dict(zip(range(256), encoding)), + "".maketrans(map_dict), + ) diff --git a/PyPDF2/_page.py b/PyPDF2/_page.py index 10155b520..805177958 100644 --- a/PyPDF2/_page.py +++ b/PyPDF2/_page.py @@ -30,7 +30,6 @@ import math import uuid import warnings -from binascii import unhexlify from 
decimal import Decimal from math import sqrt from typing import ( @@ -46,7 +45,7 @@ cast, ) -from ._adobe_glyphs import adobe_glyphs +from ._cmap import _build_char_map from ._utils import ( CompressedTransformationMatrix, TransformationMatrixType, @@ -60,7 +59,6 @@ from .generic import ( ArrayObject, ContentStream, - DecodedStreamObject, DictionaryObject, EncodedStreamObject, FloatObject, @@ -70,7 +68,6 @@ NumberObject, RectangleObject, TextStringObject, - charset_encoding, ) @@ -1438,165 +1435,3 @@ def __getitem__(self, index: int) -> PageObject: def __iter__(self) -> Iterator[PageObject]: for i in range(len(self)): yield self[i] - - -# code freely inspired from @twiggy ; see #711 -def _build_char_map( - font_name: str, space_width: float, obj: DictionaryObject -) -> Tuple[str, float, Dict, Dict]: - map_dict: Any = {} - process_rg: bool = False - process_char: bool = False - encoding: List[str] = [] - ft: DictionaryObject = obj["/Resources"]["/Font"][font_name] # type: ignore - font_type: str = cast(str, ft["/Subtype"]) - sp_width: float = space_width * 2 # default value - w = [] - # encoding - space_code = 32 - if "/Encoding" in ft: - enc: Union(str, DictionaryObject) = ft["/Encoding"].get_object() # type: ignore - if isinstance(enc, str): - try: - if enc in ("/Identity-H", "/Identity-V"): - encoding = [] - else: - encoding = charset_encoding[enc].copy() - except Exception: - warnings.warn( - f"Advanced encoding {encoding} not implemented yet", - PdfReadWarning, - ) - encoding = charset_encoding["/StandardCoding"].copy() - elif isinstance(enc, DictionaryObject) and "/BaseEncoding" in enc: - try: - encoding = charset_encoding[cast(str, enc["/BaseEncoding"])].copy() - except Exception: - warnings.warn( - f"Advanced encoding {encoding} not implemented yet", - PdfReadWarning, - ) - encoding = charset_encoding["/StandardCoding"].copy() - else: - encoding = charset_encoding["/StandardCoding"].copy() - if "/Differences" in enc: - x = 0 - for o in cast( - DictionaryObject, cast(DictionaryObject, enc)["/Differences"] - ): - if isinstance(o, int): - x = o - else: - try: - encoding[x] = adobe_glyphs[o] - except Exception: - encoding[x] = o - if o == " ": - space_code = x - x += 1 - if "/ToUnicode" in ft: - cm: str = cast(DecodedStreamObject, ft["/ToUnicode"]).get_data().decode("utf-8") - for l in ( - cm.strip() - .replace("<", " ") - .replace(">", "") - .replace("[", " [ ") - .replace("]", " ] ") - .split("\n") - ): - if l == "": - continue - if "beginbfrange" in l: - process_rg = True - elif "endbfrange" in l: - process_rg = False - elif "beginbfchar" in l: - process_char = True - elif "endbfchar" in l: - process_char = False - elif process_rg: - lst = [x for x in l.split(" ") if x] - a = int(lst[0], 16) - b = int(lst[1], 16) - if lst[2] == "[": - # lst = lst[3:].trim(' []').split(' ') - for sq in lst[3:]: - if "]": - break - map_dict[a] = unhexlify(sq).decode("utf-16-be") - a += 1 - assert a > b - else: - c = int(lst[2], 16) - fmt = b"%%0%dX" % len(lst[2]) - while a <= b: - map_dict[a] = unhexlify(fmt % c).decode("utf-16-be") - a += 1 - c += 1 - elif process_char: - lst = [x for x in l.split(" ") if x] - a = int(lst[0], 16) - map_dict[a] = unhexlify("".join(lst[1:])).decode( - "utf-16-be" - ) # join is here as some cases where the code was split - - # get - for a in map_dict: - if map_dict[a] == " ": - space_code = a - - # compute space width - st: int = 0 # declaration for mypy - if "/W" in ft: - if "/DW" in ft: - sp_width = cast(float, ft["/DW"]) - w = [x for x in ft["/W"]] # type: ignore - 
while len(w) > 0: - st = w[0] - second = w[1] - if isinstance(int, second): - if st <= space_code and space_code <= second: - sp_width = w[2] - break - w = w[3:] - if isinstance(list, second): - if st <= space_code and space_code <= st + len(second) - 1: - sp_width = second[space_code - st] - w = w[2:] - else: - warnings.warn( - "unknown widths : \n" + (ft["/W"]).__repr__(), - PdfReadWarning, - ) - break - if "/Widths" in ft: - w = [x for x in ft["/Widths"]] # type: ignore - try: - st = cast(int, ft["/FirstChar"]) - en: int = cast(int, ft["/LastChar"]) - if st > space_code or en < space_code: - raise Exception("Not in range") - if w[space_code - st] == 0: - raise Exception("null width") - sp_width = w[space_code - st] - except Exception: - if "/FontDescriptor" in ft and "/MissingWidth" in cast( - DictionaryObject, ft["/FontDescriptor"] - ): - sp_width = ft["/FontDescriptor"]["/MissingWidth"] # type: ignore - else: - # will consider width of char as avg(width)/2 - m = 0 - cpt = 0 - for x in w: - if x > 0: - m += x - cpt += 1 - sp_width = m / max(1, cpt) / 2 - - return ( - font_type, - float(sp_width / 2), - dict(zip(range(256), encoding)), - "".maketrans(map_dict), - ) diff --git a/PyPDF2/pagerange.py b/PyPDF2/pagerange.py index cd5ab5660..c9090d0f3 100644 --- a/PyPDF2/pagerange.py +++ b/PyPDF2/pagerange.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python """ Representation and utils for ranges of PDF file pages. From 9fedc783dd070bd66217b044d75fc4f6df9c67a8 Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Mon, 6 Jun 2022 12:11:31 +0200 Subject: [PATCH 16/18] Split build_char_map --- PyPDF2/_cmap.py | 226 ++++++++++++++++++++++++++---------------------- PyPDF2/_page.py | 4 +- 2 files changed, 123 insertions(+), 107 deletions(-) diff --git a/PyPDF2/_cmap.py b/PyPDF2/_cmap.py index c10e4bba2..2d25f9e8d 100644 --- a/PyPDF2/_cmap.py +++ b/PyPDF2/_cmap.py @@ -8,111 +8,133 @@ # code freely inspired from @twiggy ; see #711 -def _build_char_map( +def build_char_map( font_name: str, space_width: float, obj: DictionaryObject -) -> Tuple[str, float, Dict, Dict]: - map_dict: Any = {} - process_rg: bool = False - process_char: bool = False - encoding: List[str] = [] +) -> Tuple[str, float, Dict[int, str], Dict]: ft: DictionaryObject = obj["/Resources"]["/Font"][font_name] # type: ignore font_type: str = cast(str, ft["/Subtype"]) - sp_width: float = space_width * 2 # default value - w = [] - # encoding + space_code = 32 - if "/Encoding" in ft: - enc: Union(str, DictionaryObject) = ft["/Encoding"].get_object() # type: ignore - if isinstance(enc, str): - try: - if enc in ("/Identity-H", "/Identity-V"): - encoding = [] - else: - encoding = charset_encoding[enc].copy() - except Exception: - warnings.warn( - f"Advanced encoding {encoding} not implemented yet", - PdfReadWarning, - ) - encoding = charset_encoding["/StandardCoding"].copy() - elif isinstance(enc, DictionaryObject) and "/BaseEncoding" in enc: - try: - encoding = charset_encoding[cast(str, enc["/BaseEncoding"])].copy() - except Exception: - warnings.warn( - f"Advanced encoding {encoding} not implemented yet", - PdfReadWarning, - ) - encoding = charset_encoding["/StandardCoding"].copy() - else: + encoding, space_code = parse_encoding(ft, space_code) + map_dict, space_code = parse_to_unicode(ft, space_code) + sp_width = compute_space_width(ft, space_code, space_width) + + return ( + font_type, + float(sp_width / 2), + dict(zip(range(256), encoding)), + # https://github.com/python/mypy/issues/4374 + "".maketrans(map_dict), # type: ignore + ) + + +def 
parse_encoding(ft: DictionaryObject, space_code: int) -> Tuple[List[str], int]: + encoding: List[str] = [] + if "/Encoding" not in ft: + return encoding, space_code + enc: Union(str, DictionaryObject) = ft["/Encoding"].get_object() # type: ignore + if isinstance(enc, str): + try: + if enc in ("/Identity-H", "/Identity-V"): + encoding = [] + else: + encoding = charset_encoding[enc].copy() + except Exception: + warnings.warn( + f"Advanced encoding {encoding} not implemented yet", + PdfReadWarning, + ) encoding = charset_encoding["/StandardCoding"].copy() - if "/Differences" in enc: - x = 0 - for o in cast( - DictionaryObject, cast(DictionaryObject, enc)["/Differences"] - ): - if isinstance(o, int): - x = o - else: - try: - encoding[x] = adobe_glyphs[o] - except Exception: - encoding[x] = o - if o == " ": - space_code = x - x += 1 - if "/ToUnicode" in ft: - cm: str = cast(DecodedStreamObject, ft["/ToUnicode"]).get_data().decode("utf-8") - for l in ( - cm.strip() - .replace("<", " ") - .replace(">", "") - .replace("[", " [ ") - .replace("]", " ] ") - .split("\n") - ): - if l == "": - continue - if "beginbfrange" in l: - process_rg = True - elif "endbfrange" in l: - process_rg = False - elif "beginbfchar" in l: - process_char = True - elif "endbfchar" in l: - process_char = False - elif process_rg: - lst = [x for x in l.split(" ") if x] - a = int(lst[0], 16) - b = int(lst[1], 16) - if lst[2] == "[": - for sq in lst[3:]: - if "]": - break - map_dict[a] = unhexlify(sq).decode("utf-16-be") - a += 1 - assert a > b - else: - c = int(lst[2], 16) - fmt = b"%%0%dX" % len(lst[2]) - while a <= b: - map_dict[a] = unhexlify(fmt % c).decode("utf-16-be") - a += 1 - c += 1 - elif process_char: - lst = [x for x in l.split(" ") if x] - a = int(lst[0], 16) - map_dict[a] = unhexlify("".join(lst[1:])).decode( - "utf-16-be" - ) # join is here as some cases where the code was split + elif isinstance(enc, DictionaryObject) and "/BaseEncoding" in enc: + try: + encoding = charset_encoding[cast(str, enc["/BaseEncoding"])].copy() + except Exception: + warnings.warn( + f"Advanced encoding {encoding} not implemented yet", + PdfReadWarning, + ) + encoding = charset_encoding["/StandardCoding"].copy() + else: + encoding = charset_encoding["/StandardCoding"].copy() + if "/Differences" in enc: + x = 0 + for o in cast(DictionaryObject, cast(DictionaryObject, enc)["/Differences"]): + if isinstance(o, int): + x = o + else: + try: + encoding[x] = adobe_glyphs[o] + except Exception: + encoding[x] = o + if o == " ": + space_code = x + x += 1 + return encoding, space_code + - # get - for a in map_dict: - if map_dict[a] == " ": - space_code = a +def parse_to_unicode(ft: DictionaryObject, space_code: int) -> Tuple[Dict, int]: + map_dict: Dict[Any, Any] = {} + if "/ToUnicode" not in ft: + return map_dict, space_code + process_rg: bool = False + process_char: bool = False + cm: str = cast(DecodedStreamObject, ft["/ToUnicode"]).get_data().decode("utf-8") + for l in ( + cm.strip() + .replace("<", " ") + .replace(">", "") + .replace("[", " [ ") + .replace("]", " ] ") + .split("\n") + ): + if l == "": + continue + if "beginbfrange" in l: + process_rg = True + elif "endbfrange" in l: + process_rg = False + elif "beginbfchar" in l: + process_char = True + elif "endbfchar" in l: + process_char = False + elif process_rg: + lst = [x for x in l.split(" ") if x] + a = int(lst[0], 16) + b = int(lst[1], 16) + if lst[2] == "[": + for sq in lst[3:]: + if "]": + break + map_dict[a] = unhexlify(sq).decode("utf-16-be") + a += 1 + assert a > b + else: + c = 
int(lst[2], 16) + fmt = b"%%0%dX" % len(lst[2]) + while a <= b: + map_dict[a] = unhexlify(fmt % c).decode("utf-16-be") + a += 1 + c += 1 + elif process_char: + lst = [x for x in l.split(" ") if x] + a = int(lst[0], 16) + map_dict[a] = unhexlify("".join(lst[1:])).decode( + "utf-16-be" + ) # join is here as some cases where the code was split - # compute space width - st: int = 0 # declaration for mypy + # get + for a in map_dict: + if map_dict[a] == " ": + space_code = a + return map_dict, space_code + + +def compute_space_width( + ft: DictionaryObject, space_code: int, space_width: float +) -> float: + sp_width: float = space_width * 2 # default value + w = [] + st: int = 0 if "/W" in ft: if "/DW" in ft: sp_width = cast(float, ft["/DW"]) @@ -159,10 +181,4 @@ def _build_char_map( m += x cpt += 1 sp_width = m / max(1, cpt) / 2 - - return ( - font_type, - float(sp_width / 2), - dict(zip(range(256), encoding)), - "".maketrans(map_dict), - ) + return sp_width diff --git a/PyPDF2/_page.py b/PyPDF2/_page.py index 805177958..5333e95c6 100644 --- a/PyPDF2/_page.py +++ b/PyPDF2/_page.py @@ -45,7 +45,7 @@ cast, ) -from ._cmap import _build_char_map +from ._cmap import build_char_map from ._utils import ( CompressedTransformationMatrix, TransformationMatrixType, @@ -1122,7 +1122,7 @@ def _extract_text( resources_dict = cast(DictionaryObject, obj["/Resources"]) if "/Font" in resources_dict: for f in cast(DictionaryObject, resources_dict["/Font"]): - cmaps[f] = _build_char_map(f, space_width, obj) + cmaps[f] = build_char_map(f, space_width, obj) cmap: Union[str, Dict[int, str]] = {} content = obj[content_key].get_object() if isinstance(content_key, str) else obj if not isinstance(content, ContentStream): From efc71f86ba2877b5c53989d3372e3c49d50b9986 Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Mon, 6 Jun 2022 13:03:47 +0200 Subject: [PATCH 17/18] More flake8 nitpicks --- .flake8 | 2 +- PyPDF2/_page.py | 83 +++++++++++++++++++++++++---------------------- PyPDF2/_reader.py | 10 +++--- PyPDF2/_writer.py | 24 +++++++------- PyPDF2/generic.py | 10 +++--- setup.py | 0 6 files changed, 66 insertions(+), 63 deletions(-) mode change 100644 => 100755 setup.py diff --git a/.flake8 b/.flake8 index 4799bab0c..95a1289f4 100644 --- a/.flake8 +++ b/.flake8 @@ -1,7 +1,7 @@ [flake8] # The flake8 config should work well with black, # see https://black.readthedocs.io/en/stable/guides/using_black_with_other_tools.html#flake8 -ignore = E203,E501,E741,W503,W604,N817,N814,VNE001,VNE002,N802,SIM105 +ignore = E203,E501,E741,W503,W604,N817,N814,VNE001,VNE002,VNE003,N802,SIM105,P101 exclude = build,sample-files per-file-ignores = tests/*: ASS001,PT011 diff --git a/PyPDF2/_page.py b/PyPDF2/_page.py index 5333e95c6..c16ea09ec 100644 --- a/PyPDF2/_page.py +++ b/PyPDF2/_page.py @@ -539,50 +539,55 @@ def _merge_page( # if expanding the page to fit a new page, calculate the new media box size if expand: - corners1 = [ - self.mediabox.left.as_numeric(), - self.mediabox.bottom.as_numeric(), - self.mediabox.right.as_numeric(), - self.mediabox.top.as_numeric(), - ] - corners2 = [ - page2.mediabox.left.as_numeric(), - page2.mediabox.bottom.as_numeric(), - page2.mediabox.left.as_numeric(), - page2.mediabox.top.as_numeric(), - page2.mediabox.right.as_numeric(), - page2.mediabox.top.as_numeric(), - page2.mediabox.right.as_numeric(), - page2.mediabox.bottom.as_numeric(), - ] - if ctm is not None: - ctm = tuple(float(x) for x in ctm) # type: ignore[assignment] - new_x = [ - ctm[0] * corners2[i] + ctm[2] * corners2[i + 1] + ctm[4] - for i in 
range(0, 8, 2) - ] - new_y = [ - ctm[1] * corners2[i] + ctm[3] * corners2[i + 1] + ctm[5] - for i in range(0, 8, 2) - ] - else: - new_x = corners2[0:8:2] - new_y = corners2[1:8:2] - lowerleft = (min(new_x), min(new_y)) - upperright = (max(new_x), max(new_y)) - lowerleft = (min(corners1[0], lowerleft[0]), min(corners1[1], lowerleft[1])) - upperright = ( - max(corners1[2], upperright[0]), - max(corners1[3], upperright[1]), - ) - - self.mediabox.lower_left = lowerleft - self.mediabox.upper_right = upperright + self._expand_mediabox(page2, ctm) self[NameObject(PG.CONTENTS)] = ContentStream(new_content_array, self.pdf) self[NameObject(PG.RESOURCES)] = new_resources self[NameObject(PG.ANNOTS)] = new_annots + def _expand_mediabox( + self, page2: "PageObject", ctm: Optional[CompressedTransformationMatrix] + ) -> None: + corners1 = [ + self.mediabox.left.as_numeric(), + self.mediabox.bottom.as_numeric(), + self.mediabox.right.as_numeric(), + self.mediabox.top.as_numeric(), + ] + corners2 = [ + page2.mediabox.left.as_numeric(), + page2.mediabox.bottom.as_numeric(), + page2.mediabox.left.as_numeric(), + page2.mediabox.top.as_numeric(), + page2.mediabox.right.as_numeric(), + page2.mediabox.top.as_numeric(), + page2.mediabox.right.as_numeric(), + page2.mediabox.bottom.as_numeric(), + ] + if ctm is not None: + ctm = tuple(float(x) for x in ctm) # type: ignore[assignment] + new_x = [ + ctm[0] * corners2[i] + ctm[2] * corners2[i + 1] + ctm[4] + for i in range(0, 8, 2) + ] + new_y = [ + ctm[1] * corners2[i] + ctm[3] * corners2[i + 1] + ctm[5] + for i in range(0, 8, 2) + ] + else: + new_x = corners2[0:8:2] + new_y = corners2[1:8:2] + lowerleft = (min(new_x), min(new_y)) + upperright = (max(new_x), max(new_y)) + lowerleft = (min(corners1[0], lowerleft[0]), min(corners1[1], lowerleft[1])) + upperright = ( + max(corners1[2], upperright[0]), + max(corners1[3], upperright[1]), + ) + + self.mediabox.lower_left = lowerleft + self.mediabox.upper_right = upperright + def mergeTransformedPage( self, page2: "PageObject", diff --git a/PyPDF2/_reader.py b/PyPDF2/_reader.py index 38052f02e..45173c415 100644 --- a/PyPDF2/_reader.py +++ b/PyPDF2/_reader.py @@ -393,7 +393,7 @@ def _get_page(self, page_number: int) -> PageObject: """ Retrieves a page by number from this PDF file. - :param int pageNumber: The page number to retrieve + :param int page_number: The page number to retrieve (pages begin at zero) :return: a :class:`PageObject` instance. 
:rtype: :class:`PageObject` @@ -498,7 +498,7 @@ def _build_field( field: Union[TreeObject, DictionaryObject], retval: Dict[Any, Any], fileobj: Any, - fieldAttributes: Any, + field_attributes: Any, ) -> None: self._check_kids(field, retval, fileobj) try: @@ -510,7 +510,7 @@ def _build_field( # Ignore no-name field for now return if fileobj: - self._write_field(fileobj, field, fieldAttributes) + self._write_field(fileobj, field, field_attributes) fileobj.write("\n") retval[key] = Field(field) @@ -1215,10 +1215,10 @@ def read(self, stream: StreamType) -> None: ) ) stream.seek(-1, 2) - last1M = stream.tell() - 1024 * 1024 + 1 # offset of last MB of stream + last_mb = stream.tell() - 1024 * 1024 + 1 # offset of last MB of stream line = b_("") while line[:5] != b_("%%EOF"): - if stream.tell() < last1M: + if stream.tell() < last_mb: raise PdfReadError("EOF marker not found") line = self.read_next_end_line(stream) diff --git a/PyPDF2/_writer.py b/PyPDF2/_writer.py index 95648d9fa..819c3aae5 100644 --- a/PyPDF2/_writer.py +++ b/PyPDF2/_writer.py @@ -215,7 +215,7 @@ def insertPage(self, page: PageObject, index: int = 0) -> None: # pragma: no co deprecate_with_replacement("insertPage", "insert_page") self.insert_page(page, index) - def get_page(self, pageNumber: int) -> PageObject: + def get_page(self, pageNumber: int) -> PageObject: # TODO: PEP8 """ Retrieve a page by number from this PDF file. @@ -453,8 +453,8 @@ def add_attachment(self, filename: str, data: Union[str, bytes]) -> None: endobj """ - embeddedFilesNamesDictionary = DictionaryObject() - embeddedFilesNamesDictionary.update( + embedded_files_names_dictionary = DictionaryObject() + embedded_files_names_dictionary.update( { NameObject(CA.NAMES): ArrayObject( [createStringObject(filename), filespec] @@ -462,12 +462,12 @@ def add_attachment(self, filename: str, data: Union[str, bytes]) -> None: } ) - embeddedFilesDictionary = DictionaryObject() - embeddedFilesDictionary.update( - {NameObject("/EmbeddedFiles"): embeddedFilesNamesDictionary} + embedded_files_dictionary = DictionaryObject() + embedded_files_dictionary.update( + {NameObject("/EmbeddedFiles"): embedded_files_names_dictionary} ) # Update the root - self._root_object.update({NameObject(CA.NAMES): embeddedFilesDictionary}) + self._root_object.update({NameObject(CA.NAMES): embedded_files_dictionary}) def addAttachment( self, fname: str, fdata: Union[str, bytes] @@ -1097,13 +1097,13 @@ def add_bookmark( {NameObject("/C"): ArrayObject([FloatObject(c) for c in color])} ) - format = 0 + format_flag = 0 if italic: - format += 1 + format_flag += 1 if bold: - format += 2 - if format: - bookmark.update({NameObject("/F"): NumberObject(format)}) + format_flag += 2 + if format_flag: + bookmark.update({NameObject("/F"): NumberObject(format_flag)}) bookmark_ref = self._add_object(bookmark) diff --git a/PyPDF2/generic.py b/PyPDF2/generic.py index e43375386..667f60136 100644 --- a/PyPDF2/generic.py +++ b/PyPDF2/generic.py @@ -84,7 +84,7 @@ def getObject(self) -> Optional["PdfObject"]: # pragma: no cover def write_to_stream( self, stream: StreamType, encryption_key: Union[None, str, bytes] ) -> None: - raise NotImplementedError() + raise NotImplementedError class NullObject(PdfObject): @@ -581,14 +581,14 @@ def read_from_stream(stream: StreamType, pdf: Any) -> "NameObject": # PdfReader except (UnicodeEncodeError, UnicodeDecodeError): ret = name.decode("gbk") return NameObject(ret) - except (UnicodeEncodeError, UnicodeDecodeError): + except (UnicodeEncodeError, UnicodeDecodeError) as e: # Name 
objects should represent irregular characters
             # with a '#' followed by the symbol's hex number
             if not pdf.strict:
                 warnings.warn("Illegal character in Name Object", PdfReadWarning)
                 return NameObject(name)
             else:
-                raise PdfReadError("Illegal character in Name Object")
+                raise PdfReadError("Illegal character in Name Object") from e
 
     @staticmethod
     def readFromStream(
@@ -700,9 +700,7 @@ def get_next_obj_pos(
 
 def read_unsized_from_steam(stream: StreamType, pdf: Any) -> bytes:  # PdfReader
     # we are just pointing at beginning of the stream
-    eon = (
-        get_next_obj_pos(stream.tell(), 2**32, [g for g in pdf.xref], pdf) - 1
-    )
+    eon = get_next_obj_pos(stream.tell(), 2**32, list(pdf.xref), pdf) - 1
     curr = stream.tell()
     rw = stream.read(eon - stream.tell())
     p = rw.find(b_("endstream"))
diff --git a/setup.py b/setup.py
old mode 100644
new mode 100755

From b7b85ed8acfe05af5eecb1c68187991196e38c85 Mon Sep 17 00:00:00 2001
From: Martin Thoma
Date: Mon, 6 Jun 2022 13:21:40 +0200
Subject: [PATCH 18/18] More nitpicks

---
 PyPDF2/_page.py   | 30 +++++++++++-------------------
 PyPDF2/_writer.py |  7 ++++---
 2 files changed, 15 insertions(+), 22 deletions(-)

diff --git a/PyPDF2/_page.py b/PyPDF2/_page.py
index c16ea09ec..ae44506e6 100644
--- a/PyPDF2/_page.py
+++ b/PyPDF2/_page.py
@@ -271,7 +271,7 @@ def create_blank_page(
             width = lastpage.mediabox.width
             height = lastpage.mediabox.height
         else:
-            raise PageSizeNotDefinedError()
+            raise PageSizeNotDefinedError
         page.__setitem__(
             NameObject(PG.MEDIABOX), RectangleObject((0, 0, width, height))  # type: ignore
         )
@@ -1010,28 +1010,20 @@ def _extract_text_old(
         space_scale = 1.0
 
         for operands, operator in content.operations:
-            if operator == b"Tf":  # text font
-                pass
-            elif operator == b"Tfs":  # text font size
-                pass
-            elif operator == b"Tc":  # character spacing
-                # See '5.2.1 Character Spacing'
+            # Missing operators:
+            #  Tf: text font
+            #  Tfs: text font size
+            #  Tc: '5.2.1 Character Spacing'
+            #  Th: '5.2.3 Horizontal Scaling'
+            #  Tl: '5.2.4 Leading'
+            #  Tmode: '5.2.5 Text Rendering Mode'
+            #  Trise: '5.2.6 Text Rise'
+
+            if operator in [b"Tf", b"Tfs", b"Tc", b"Th", b"Tl", b"Tmode"]:
                 pass
             elif operator == b"Tw":  # word spacing
                 # See '5.2.2 Word Spacing'
                 space_scale = 1.0 + float(operands[0])
-            elif operator == b"Th":  # horizontal scaling
-                # See '5.2.3 Horizontal Scaling'
-                pass
-            elif operator == b"Tl":  # leading
-                # See '5.2.4 Leading'
-                pass
-            elif operator == b"Tmode":  # text rendering mode
-                # See '5.2.5 Text Rendering Mode'
-                pass
-            elif operator == b"Trise":  # text rise
-                # See '5.2.6 Text Rise'
-                pass
             elif operator == b"Tj":
                 # See 'TABLE 5.6 Text-showing operators'
                 _text = operands[0]
diff --git a/PyPDF2/_writer.py b/PyPDF2/_writer.py
index 819c3aae5..e38af4d74 100644
--- a/PyPDF2/_writer.py
+++ b/PyPDF2/_writer.py
@@ -1248,9 +1248,10 @@ def remove_images(self, ignore_byte_string_object: bool = False) -> None:
         for operands, operator in content.operations:
             if operator in [b_("Tj"), b_("'")]:
                 text = operands[0]
-                if ignore_byte_string_object:
-                    if not isinstance(text, TextStringObject):
-                        operands[0] = TextStringObject()
+                if ignore_byte_string_object and not isinstance(
+                    text, TextStringObject
+                ):
+                    operands[0] = TextStringObject()
             elif operator == b_('"'):
                 text = operands[2]
                 if ignore_byte_string_object and not isinstance(