Skip to content

Commit

Permalink
Merge branch 'pdf-attachments' from PR #177
Browse files Browse the repository at this point in the history
  • Loading branch information
SimonSapin committed Apr 27, 2014
2 parents 908b1ec + 96dd798 commit 830598c
Show file tree
Hide file tree
Showing 12 changed files with 517 additions and 71 deletions.
1 change: 1 addition & 0 deletions AUTHORS
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ Contributors:
- Aymeric Bois
- Chung Wu
- Clément Plasse
- Colin Leitner
- Florian Mounier
- Frédérick Deslandes
- Glwadys Fayolle
Expand Down
8 changes: 7 additions & 1 deletion docs/features.rst
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ PDF
---

In addition to text, raster and vector graphics, WeasyPrint’s PDF files
can contain hyperlinks and bookmarks.
can contain hyperlinks, bookmarks and attachments.

Hyperlinks will be clickable in PDF viewers that support them. They can
be either internal, to another part of the same document (eg.
Expand All @@ -58,6 +58,12 @@ sidebar. Clicking on an entry scrolls the matching part of the document
into view. By default all ``<h1>`` to ``<h6>`` titles generate bookmarks,
but this can be controlled with CSS (see :ref:`bookmarks`.)

Attachments are related files, embedded in the PDF itself. They can be
specified through ``<link rel=attachment>`` elements to add resources globally
or through regular links with ``<a rel=attachment>`` to attach a resource that
can be saved by clicking on said link. The ``title`` attribute can be used as
a description of the attachment.


Fonts
-----
Expand Down
29 changes: 26 additions & 3 deletions weasyprint/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
# Used for 'User-Agent' in HTTP and 'Creator' in PDF
VERSION_STRING = 'WeasyPrint %s (http://weasyprint.org/)' % VERSION

__all__ = ['HTML', 'CSS', 'Document', 'Page', 'default_url_fetcher',
__all__ = ['HTML', 'CSS', 'Attachment', 'Document', 'Page', 'default_url_fetcher',
'VERSION']


Expand Down Expand Up @@ -131,7 +131,8 @@ def render(self, stylesheets=None, enable_hinting=False):
"""
return Document._render(self, stylesheets, enable_hinting)

def write_pdf(self, target=None, stylesheets=None, zoom=1):
def write_pdf(self, target=None, stylesheets=None, zoom=1,
attachments=None):
"""Render the document to a PDF file.
This is a shortcut for calling :meth:`render`, then
Expand All @@ -151,13 +152,16 @@ def write_pdf(self, target=None, stylesheets=None, zoom=1):
For values other than 1, physical CSS units will thus be “wrong”.
Page size declarations are affected too, even with keyword values
like ``@page { size: A3 landscape; }``
:param attachments: A list of additional file attachments for the
generated PDF document or :obj:`None`. The list's elements are
:class:`Attachment` objects, filenames, URLs or file-like objects.
:returns:
The PDF as byte string if :obj:`target` is not provided or
:obj:`None`, otherwise :obj:`None` (the PDF is written to
:obj:`target`.)
"""
return self.render(stylesheets).write_pdf(target, zoom)
return self.render(stylesheets).write_pdf(target, zoom, attachments)

def write_image_surface(self, stylesheets=None, resolution=96):
surface, _width, _height = (
Expand Down Expand Up @@ -234,6 +238,25 @@ def __init__(self, guess=None, filename=None, url=None, file_obj=None,
for error in self.stylesheet.errors:
LOGGER.warning(error)

class Attachment(object):
    """Represents a file attachment for a PDF document.

    An instance is created in the same way as :class:`HTML`, except that
    the HTML specific parameters are not supported. The source arguments
    (``guess``, ``filename``, ``url``, ``file_obj``, ``string``,
    ``base_url`` and ``url_fetcher``) are handed to ``_select_source``
    unchanged, with ``tree`` fixed to :obj:`None`.

    :param description: A description of the attachment to be included in the
        PDF document. May be :obj:`None`.

    """
    def __init__(self, guess=None, filename=None, url=None, file_obj=None,
                 string=None, base_url=None, url_fetcher=default_url_fetcher,
                 description=None):
        self.description = description
        # Whatever ``_select_source`` returns is stored as-is; it is not
        # consumed here.
        source = _select_source(
            guess, filename, url, file_obj, string, tree=None,
            base_url=base_url, url_fetcher=url_fetcher)
        self.source = source


@contextlib.contextmanager
def _select_source(guess=None, filename=None, url=None, file_obj=None,
Expand Down
16 changes: 15 additions & 1 deletion weasyprint/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
import sys
import argparse

from . import VERSION, HTML
from . import VERSION, HTML, Attachment


def main(argv=None, stdout=None, stdin=None):
Expand Down Expand Up @@ -61,6 +61,11 @@ def main(argv=None, stdout=None, stdin=None):
Set the media type to use for ``@media``. Defaults to ``print``.
.. option:: -a <file>, --attachment <file>
Adds an attachment to the document which is included in the PDF output.
This option can be added multiple times to attach more files.
.. option:: --version
Show the version number. Other options and arguments are ignored.
Expand Down Expand Up @@ -92,6 +97,8 @@ def main(argv=None, stdout=None, stdin=None):
help='Base for relative URLs in the HTML input. '
"Defaults to the input's own filename or URL "
'or the current directory for stdin.')
parser.add_argument('-a', '--attachment', action='append',
help='URL or filename of a file to attach to the PDF document')
parser.add_argument(
'input', help='URL or filename of the HTML input, or - for stdin')
parser.add_argument(
Expand Down Expand Up @@ -136,6 +143,13 @@ def main(argv=None, stdout=None, stdin=None):
kwargs['resolution'] = args.resolution
else:
parser.error('--resolution only applies for the PNG format.')

if args.attachment:
    if format_ == 'pdf':
        # argparse names the attribute after the long option string
        # ``--attachment`` (singular); ``args.attachments`` does not exist
        # and would raise AttributeError as soon as -a is used.
        kwargs['attachments'] = args.attachment
    else:
        parser.error('--attachment only applies for the PDF format.')

html = HTML(source, base_url=args.base_url, encoding=args.encoding,
media_type=args.media_type)
getattr(html, 'write_' + format_)(output, **kwargs)
Expand Down
33 changes: 18 additions & 15 deletions weasyprint/compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@
__all__ = ['Request', 'base64_decode', 'base64_encode', 'basestring',
'ints_from_bytes', 'iteritems', 'izip', 'parse_email', 'parse_qs',
'pathname2url', 'quote', 'unicode', 'unquote', 'unquote_to_bytes',
'urlencode', 'urljoin', 'urlopen', 'urlopen_contenttype',
'urlencode', 'urljoin', 'urlopen', 'urllib_get_content_type',
'urllib_get_charset', 'urllib_get_filename',
'urlparse_uses_relative', 'urlsplit', 'xrange']


Expand All @@ -39,13 +40,14 @@
iteritems = dict.items
izip = zip

def urlopen_contenttype(url):
"""Return (file_obj, mime_type, encoding)"""
result = urlopen(url)
info = result.info()
mime_type = info.get_content_type()
charset = info.get_param('charset')
return result, mime_type, charset
def urllib_get_content_type(urlobj):
    """Return ``get_content_type()`` of the message from ``urlobj.info()``."""
    headers = urlobj.info()
    return headers.get_content_type()

def urllib_get_charset(urlobj):
    """Return the ``charset`` parameter of the message from ``urlobj.info()``.

    The value is whatever ``get_param('charset')`` yields on that message.
    """
    headers = urlobj.info()
    return headers.get_param('charset')

def urllib_get_filename(urlobj):
    """Return ``get_filename()`` of the message from ``urlobj.info()``."""
    headers = urlobj.info()
    return headers.get_filename()

def parse_email(data):
if isinstance(data, bytes):
Expand Down Expand Up @@ -75,13 +77,14 @@ def ints_from_bytes(byte_string):
def array(typecode, initializer):
return _array(typecode.encode('ascii'), initializer)

def urlopen_contenttype(url):
"""Return (file_obj, mime_type, encoding)"""
result = urlopen(url)
info = result.info()
mime_type = info.gettype()
charset = info.getparam('charset')
return result, mime_type, charset
def urllib_get_content_type(urlobj):
    """Return ``gettype()`` of the message from ``urlobj.info()``."""
    headers = urlobj.info()
    return headers.gettype()

def urllib_get_charset(urlobj):
    """Return ``getparam('charset')`` of the message from ``urlobj.info()``."""
    headers = urlobj.info()
    return headers.getparam('charset')

def urllib_get_filename(urlobj):
    """Always return :obj:`None`; ``urlobj`` is not inspected here."""
    # NOTE(review): presumably the message API used by this variant
    # (``gettype``/``getparam``) offers no filename accessor -- confirm.
    return None

def unquote_to_bytes(data):
if isinstance(data, unicode):
Expand Down
34 changes: 26 additions & 8 deletions weasyprint/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,11 +113,14 @@ def _gather_links_and_bookmarks(box, bookmarks, links, anchors, matrix):
has_link = link and not isinstance(box, boxes.TextBox)
# In case of duplicate IDs, only the first is an anchor.
has_anchor = anchor_name and anchor_name not in anchors
is_attachment = hasattr(box, 'is_attachment') and box.is_attachment

if has_bookmark or has_link or has_anchor:
pos_x, pos_y, width, height = box.hit_area()
if has_link:
link_type, target = link
if link_type == 'external' and is_attachment:
link_type = 'attachment'
if matrix:
link = _TaggedTuple(
(link_type, target, rectangle_aabb(
Expand Down Expand Up @@ -171,6 +174,8 @@ def __init__(self, page_box, enable_hinting=False):
# The anchor might be defined in another page,
# in multiple pages (in which case the first occurrence is used),
# or not at all.
#: * ``'attachment'``: :obj:`target` is an absolute URL and points
#: to a resource to attach to the document.
self.links = links = []

#: A dict mapping anchor names to their target, ``(x, y)`` points
Expand Down Expand Up @@ -246,7 +251,8 @@ class DocumentMetadata(object):
"""
def __init__(self, title=None, authors=None, description=None,
keywords=None, generator=None, created=None, modified=None):
keywords=None, generator=None, created=None, modified=None,
attachments=None):
#: The title of the document, as a string or :obj:`None`.
#: Extracted from the ``<title>`` element in HTML
#: and written to the ``/Title`` info field in PDF.
Expand Down Expand Up @@ -281,6 +287,11 @@ def __init__(self, title=None, authors=None, description=None,
#: Extracted from the ``<meta name=dcterms.modified>`` element in HTML
#: and written to the ``/ModDate`` info field in PDF.
self.modified = modified
#: File attachments as a list of tuples of URL and a description or
#: :obj:`None`.
#: Extracted from the ``<link rel=attachment>`` elements in HTML
#: and written to the ``/EmbeddedFiles`` dictionary in PDF.
self.attachments = attachments or []


class Document(object):
Expand All @@ -289,8 +300,8 @@ class Document(object):
Typically obtained from :meth:`HTML.render() <weasyprint.HTML.render>`,
but can also be instantiated directly
with a list of :class:`pages <Page>`
and a set of :class:`metadata <DocumentMetadata>`.
with a list of :class:`pages <Page>`,
a set of :class:`metadata <DocumentMetadata>` and a ``url_fetcher``.
"""
@classmethod
Expand All @@ -306,15 +317,18 @@ def _render(cls, html, stylesheets, enable_hinting):
build_formatting_structure(
html.root_element, style_for, get_image_from_uri))
return cls([Page(p, enable_hinting) for p in page_boxes],
DocumentMetadata(**html._get_metadata()))
DocumentMetadata(**html._get_metadata()), html.url_fetcher)

def __init__(self, pages, metadata):
def __init__(self, pages, metadata, url_fetcher):
#: A list of :class:`Page` objects.
self.pages = pages
#: A :class:`DocumentMetadata` object.
#: Contains information that does not belong to a specific page
#: but to the whole document.
self.metadata = metadata
#: A ``url_fetcher`` for resources that have to be read when writing
#: the output.
self.url_fetcher = url_fetcher

def copy(self, pages='all'):
"""Take a subset of the pages.
Expand Down Expand Up @@ -349,7 +363,7 @@ def copy(self, pages='all'):
pages = self.pages
elif not isinstance(pages, list):
pages = list(pages)
return type(self)(pages, self.metadata)
return type(self)(pages, self.metadata, self.url_fetcher)

def resolve_links(self):
"""Resolve internal hyperlinks.
Expand Down Expand Up @@ -431,7 +445,7 @@ def make_bookmark_tree(self):
last_by_depth.append(children)
return root

def write_pdf(self, target=None, zoom=1):
def write_pdf(self, target=None, zoom=1, attachments=None):
"""Paint the pages in a PDF file, with meta-data.
PDF files written directly by cairo do not have meta-data such as
Expand All @@ -447,6 +461,9 @@ def write_pdf(self, target=None, zoom=1):
For values other than 1, physical CSS units will thus be “wrong”.
Page size declarations are affected too, even with keyword values
like ``@page { size: A3 landscape; }``
:param attachments: A list of additional file attachments for the
generated PDF document or :obj:`None`. The list's elements are
:class:`Attachment` objects, filenames, URLs or file-like objects.
:returns:
The PDF as byte string if :obj:`target` is :obj:`None`, otherwise
:obj:`None` (the PDF is written to :obj:`target`.)
Expand All @@ -466,7 +483,8 @@ def write_pdf(self, target=None, zoom=1):
surface.show_page()
surface.finish()

write_pdf_metadata(self, file_obj, scale, self.metadata)
write_pdf_metadata(self, file_obj, scale, self.metadata, attachments,
self.url_fetcher)

if target is None:
return file_obj.getvalue()
Expand Down
22 changes: 20 additions & 2 deletions weasyprint/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,13 @@ def handle_td(element, box, _get_image_from_uri):
return [box]


@handler('a')
def handle_a(element, box, _get_image_from_uri):
    """Record on the box whether its ``<a>`` element has ``rel=attachment``.

    The result of ``element_has_link_type(element, 'attachment')`` is stored
    in ``box.is_attachment``; the box is returned in a one-element list.
    """
    is_attachment = element_has_link_type(element, 'attachment')
    box.is_attachment = is_attachment
    return [box]


def find_base_url(html_document, fallback_base_url):
"""Return the base URL for the document.
Expand All @@ -264,6 +271,7 @@ def get_html_metadata(html_document):
http://www.whatwg.org/html#the-title-element
http://www.whatwg.org/html#standard-metadata-names
http://wiki.whatwg.org/wiki/MetaExtensions
http://microformats.org/wiki/existing-rel-values#HTML5_link_type_extensions
"""
title = None
Expand All @@ -273,7 +281,8 @@ def get_html_metadata(html_document):
authors = []
created = None
modified = None
for element in html_document.iter('title', 'meta'):
attachments = []
for element in html_document.iter('title', 'meta', 'link'):
if element.tag == 'title' and title is None:
title = get_child_text(element)
elif element.tag == 'meta':
Expand All @@ -293,9 +302,18 @@ def get_html_metadata(html_document):
created = parse_w3c_date(name, element.sourceline, content)
elif name == 'dcterms.modified' and modified is None:
modified = parse_w3c_date(name, element.sourceline, content)
elif element.tag == 'link' and element_has_link_type(element,
'attachment'):
url = get_url_attribute(element, 'href')
title = element.get('title', None)
if url is None:
LOGGER.warning('Missing href in <link rel="%s">', rel)
else:
attachments.append((url, title))
return dict(title=title, description=description, generator=generator,
keywords=keywords, authors=authors,
created=created, modified=modified)
created=created, modified=modified,
attachments=attachments)


def strip_whitespace(string):
Expand Down
Loading

0 comments on commit 830598c

Please sign in to comment.