Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add standard compliant default identifier #21

Merged
merged 2 commits into from
Feb 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 17 additions & 7 deletions pydyf/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -411,11 +411,14 @@ def data(self):

class PDF:
"""PDF document."""
def __init__(self, version=b'1.7', identifier=None):
def __init__(self, version=b'1.7', identifier=False):
"""Create a PDF document.

:param bytes version: PDF version.
:param bytes identifier: PDF file identifier.
:param identifier: PDF file identifier. Default is :obj:`False`
to include no identifier, can be set to :obj:`True` to generate an
automatic identifier.
:type identifier: :obj:`bytes` or :obj:`bool`

"""
#: PDF version, as :obj:`bytes`.
Expand Down Expand Up @@ -489,18 +492,21 @@ def write_line(self, content, output):
self.current_position += len(content) + 1
output.write(content + b'\n')

def write(self, output, version=None, identifier=None, compress=False):
def write(self, output, version=None, identifier=False, compress=False):
"""Write PDF to output.

:param output: Output stream.
:type output: binary :term:`file object`
:param bytes version: PDF version.
:param bytes identifier: PDF file identifier.
:param identifier: PDF file identifier. Default is :obj:`False`
to include no identifier, can be set to :obj:`True` to generate an
automatic identifier.
:type identifier: :obj:`bytes` or :obj:`bool`
:param bool compress: whether the PDF uses a compressed object stream.

"""
version = self.version if version is None else _to_bytes(version)
identifier = self.identifier if identifier is None else identifier
identifier = self.identifier or identifier

# Write header
self.write_line(b'%PDF-' + version, output)
Expand Down Expand Up @@ -568,10 +574,12 @@ def write(self, output, version=None, identifier=None, compress=False):
'Root': self.catalog.reference,
'Info': self.info.reference,
}
if identifier is not None:
if identifier:
data = b''.join(
obj.data for obj in self.objects if obj.free != 'f')
data_hash = md5(data).hexdigest().encode()
if identifier is True:
identifier = data_hash
extra['ID'] = Array((
String(identifier).data, String(data_hash).data))
dict_stream = Stream([xref_stream], extra, compress)
Expand Down Expand Up @@ -601,10 +609,12 @@ def write(self, output, version=None, identifier=None, compress=False):
self.write_line(f'/Size {len(self.objects)}'.encode(), output)
self.write_line(b'/Root ' + self.catalog.reference, output)
self.write_line(b'/Info ' + self.info.reference, output)
if identifier is not None:
if identifier:
data = b''.join(
obj.data for obj in self.objects if obj.free != 'f')
data_hash = md5(data).hexdigest().encode()
if identifier is True:
identifier = data_hash
self.write_line(
b'/ID [' + String(identifier).data + b' ' +
String(data_hash).data + b']', output)
Expand Down
28 changes: 26 additions & 2 deletions tests/test_pydyf.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import io
import re

import pydyf

Expand Down Expand Up @@ -704,11 +705,34 @@ def test_text():
''')


def test_identifier():
def test_no_identifier():
    """Check that ``identifier=False`` writes no ``/ID`` entry at all."""
    document = pydyf.PDF()
    pdf = io.BytesIO()
    document.write(pdf, identifier=False)
    output = pdf.getvalue()
    # No automatically generated identifier (the same hash written twice).
    assert re.search(
        b'/ID \\[\\((?P<hash>[0-9a-f]{32})\\) \\((?P=hash)\\)\\]',
        output
    ) is None
    # The pattern above only matches two identical 32-hex entries, so it
    # would miss any other identifier; reject the ``/ID`` key altogether.
    assert b'/ID' not in output


def test_default_identifier():
    """Check that ``identifier=True`` writes the same hash twice in ``/ID``."""
    buffer = io.BytesIO()
    pydyf.PDF().write(buffer, identifier=True)
    # The named group and its back-reference force both entries to be equal.
    id_pattern = re.compile(
        b'/ID \\[\\((?P<hash>[0-9a-f]{32})\\) \\((?P=hash)\\)\\]')
    match = id_pattern.search(buffer.getvalue())
    assert match is not None


def test_custom_identifier():
    """Check that a bytes identifier comes first in ``/ID``, then a hash."""
    buffer = io.BytesIO()
    pydyf.PDF().write(buffer, identifier=b'abc')
    written = buffer.getvalue()
    assert b'abc' in written
    # First entry is the caller's identifier, second is a 32-hex digest.
    id_pattern = re.compile(b'/ID \\[\\(abc\\) \\(([0-9a-f]{32})\\)\\]')
    assert id_pattern.search(written) is not None


def test_version():
Expand Down