Merge branch 'pdf-attachments' from PR #177

Kozea · Apr 27, 2014 · 830598c · 830598c
2 parents 908b1ec + 96dd798
commit 830598c
Show file tree

Hide file tree

Showing 12 changed files with 517 additions and 71 deletions.
diff --git a/AUTHORS b/AUTHORS
@@ -15,6 +15,7 @@ Contributors:
 - Aymeric Bois
 - Chung Wu
 - Clément Plasse
+- Colin Leitner
 - Florian Mounier
 - Frédérick Deslandes
 - Glwadys Fayolle

diff --git a/docs/features.rst b/docs/features.rst
@@ -45,7 +45,7 @@ PDF
 ---
 
 In addition to text, raster and vector graphics, WeasyPrint’s PDF files
-can contain hyperlinks and bookmarks.
+can contain hyperlinks, bookmarks and attachments.
 
 Hyperlinks will be clickable in PDF viewers that support them. They can
 be either internal, to another part of the same document (eg.
@@ -58,6 +58,12 @@ sidebar. Clicking on an entry scrolls the matching part of the document
 into view. By default all ``<h1>`` to ``<h6>`` titles generate bookmarks,
 but this can be controlled with CSS (see :ref:`bookmarks`.)
 
+Attachments are related files, embedded in the PDF itself. They can be
+specified through ``<link rel=attachment>`` elements to add resources globally
+or through regular links with ``<a rel=attachment>`` to attach a resource that
+can be saved by clicking on said link. The ``title`` attribute can be used as
+a description of the attachment.
+
 
 Fonts
 -----

diff --git a/weasyprint/__init__.py b/weasyprint/__init__.py
@@ -22,7 +22,7 @@
 # Used for 'User-Agent' in HTTP and 'Creator' in PDF
 VERSION_STRING = 'WeasyPrint %s (http://weasyprint.org/)' % VERSION
 
-__all__ = ['HTML', 'CSS', 'Document', 'Page', 'default_url_fetcher',
+__all__ = ['HTML', 'CSS', 'Attachment', 'Document', 'Page', 'default_url_fetcher',
            'VERSION']
 
 
@@ -131,7 +131,8 @@ def render(self, stylesheets=None, enable_hinting=False):
         """
         return Document._render(self, stylesheets, enable_hinting)
 
-    def write_pdf(self, target=None, stylesheets=None, zoom=1):
+    def write_pdf(self, target=None, stylesheets=None, zoom=1,
+        attachments=None):
         """Render the document to a PDF file.
 
         This is a shortcut for calling :meth:`render`, then
@@ -151,13 +152,16 @@ def write_pdf(self, target=None, stylesheets=None, zoom=1):
             For values other than 1, physical CSS units will thus be “wrong”.
             Page size declarations are affected too, even with keyword values
             like ``@page { size: A3 landscape; }``
+        :param attachments: A list of additional file attachments for the
+            generated PDF document or :obj:`None`. The list's elements are
+            :class:`Attachment` objects, filenames, URLs or file-like objects.
         :returns:
             The PDF as byte string if :obj:`target` is not provided or
             :obj:`None`, otherwise :obj:`None` (the PDF is written to
             :obj:`target`.)
 
         """
-        return self.render(stylesheets).write_pdf(target, zoom)
+        return self.render(stylesheets).write_pdf(target, zoom, attachments)
 
     def write_image_surface(self, stylesheets=None, resolution=96):
         surface, _width, _height = (
@@ -234,6 +238,25 @@ def __init__(self, guess=None, filename=None, url=None, file_obj=None,
         for error in self.stylesheet.errors:
             LOGGER.warning(error)
 
+class Attachment(object):
+    """Represents a file attachment for a PDF document.
+
+    An instance is created in the same way as :class:`HTML`, except that
+    the HTML specific parameters are not supported. An optional description can
+    be provided with the ``description`` parameter.
+
+    The source arguments (``guess``, ``filename``, ``url``, ``file_obj``,
+    ``string``, ``base_url`` and ``url_fetcher``) are forwarded unchanged to
+    ``_select_source``.
+
+    :param description: A description of the attachment to be included in the
+        PDF document. May be :obj:`None`.
+
+    """
+    def __init__(self, guess=None, filename=None, url=None, file_obj=None,
+                 string=None, base_url=None, url_fetcher=default_url_fetcher,
+                 description=None):
+        # ``_select_source`` is decorated with ``contextlib.contextmanager``,
+        # so ``self.source`` holds a context manager that has not been
+        # entered yet; whoever consumes the attachment must enter it.
+        # ``tree`` is always ``None`` here -- presumably because a parsed
+        # tree is one of the HTML specific inputs (TODO confirm).
+        self.source = _select_source(
+            guess, filename, url, file_obj, string, tree=None,
+            base_url=base_url, url_fetcher=url_fetcher)
+        # Optional free-text description, stored as given (may be None).
+        self.description = description
+
 
 @contextlib.contextmanager
 def _select_source(guess=None, filename=None, url=None, file_obj=None,

diff --git a/weasyprint/__main__.py b/weasyprint/__main__.py
@@ -16,7 +16,7 @@
 import sys
 import argparse
 
-from . import VERSION, HTML
+from . import VERSION, HTML, Attachment
 
 
 def main(argv=None, stdout=None, stdin=None):
@@ -61,6 +61,11 @@ def main(argv=None, stdout=None, stdin=None):
 
         Set the media type to use for ``@media``. Defaults to ``print``.
 
+    .. option:: -a <file>, --attachment <file>
+
+        Adds an attachment to the document which is included in the PDF output.
+        This option can be added multiple times to attach more files.
+
     .. option:: --version
 
         Show the version number. Other options and arguments are ignored.
@@ -92,6 +97,8 @@ def main(argv=None, stdout=None, stdin=None):
                         help='Base for relative URLs in the HTML input. '
                              "Defaults to the input's own filename or URL "
                              'or the current directory for stdin.')
+    parser.add_argument('-a', '--attachment', action='append',
+                        help='URL or filename of a file to attach to the PDF document')
     parser.add_argument(
         'input', help='URL or filename of the HTML input, or - for stdin')
     parser.add_argument(
@@ -136,6 +143,13 @@ def main(argv=None, stdout=None, stdin=None):
             kwargs['resolution'] = args.resolution
         else:
             parser.error('--resolution only applies for the PNG format.')
+
+    # Forward ``-a/--attachment`` values to ``write_pdf``; the option is
+    # rejected for any other output format.
+    if args.attachment:
+        if format_ == 'pdf':
+            # argparse stores the ``--attachment`` option (action='append')
+            # under ``args.attachment``; ``args.attachments`` does not exist
+            # and would raise AttributeError.
+            kwargs['attachments'] = args.attachment
+        else:
+            parser.error('--attachment only applies for the PDF format.')
+
     html = HTML(source, base_url=args.base_url, encoding=args.encoding,
                 media_type=args.media_type)
     getattr(html, 'write_' + format_)(output, **kwargs)

diff --git a/weasyprint/compat.py b/weasyprint/compat.py
@@ -19,7 +19,8 @@
 __all__ = ['Request', 'base64_decode', 'base64_encode', 'basestring',
            'ints_from_bytes', 'iteritems', 'izip', 'parse_email', 'parse_qs',
            'pathname2url', 'quote', 'unicode', 'unquote', 'unquote_to_bytes',
-           'urlencode', 'urljoin', 'urlopen', 'urlopen_contenttype',
+           'urlencode', 'urljoin', 'urlopen', 'urllib_get_content_type',
+           'urllib_get_charset', 'urllib_get_filename',
            'urlparse_uses_relative', 'urlsplit', 'xrange']
 
 
@@ -39,13 +40,14 @@
     iteritems = dict.items
     izip = zip
 
-    def urlopen_contenttype(url):
-        """Return (file_obj, mime_type, encoding)"""
-        result = urlopen(url)
-        info = result.info()
-        mime_type = info.get_content_type()
-        charset = info.get_param('charset')
-        return result, mime_type, charset
+    def urllib_get_content_type(urlobj):
+        """Return the MIME type reported by ``urlobj.info()``.
+
+        ``urlobj`` is expected to be a response object exposing ``info()``,
+        such as the result of ``urlopen``.
+
+        """
+        return urlobj.info().get_content_type()
+
+    def urllib_get_charset(urlobj):
+        """Return the ``charset`` parameter reported by ``urlobj.info()``.
+
+        ``urlobj`` is expected to be a response object exposing ``info()``,
+        such as the result of ``urlopen``.
+
+        """
+        return urlobj.info().get_param('charset')
+
+    def urllib_get_filename(urlobj):
+        """Return the result of ``get_filename()`` on ``urlobj.info()``.
+
+        ``urlobj`` is expected to be a response object exposing ``info()``,
+        such as the result of ``urlopen``.
+
+        """
+        return urlobj.info().get_filename()
 
     def parse_email(data):
         if isinstance(data, bytes):
@@ -75,13 +77,14 @@ def ints_from_bytes(byte_string):
     def array(typecode, initializer):
         return _array(typecode.encode('ascii'), initializer)
 
-    def urlopen_contenttype(url):
-        """Return (file_obj, mime_type, encoding)"""
-        result = urlopen(url)
-        info = result.info()
-        mime_type = info.gettype()
-        charset = info.getparam('charset')
-        return result, mime_type, charset
+    def urllib_get_content_type(urlobj):
+        """Return the MIME type reported by ``urlobj.info()``.
+
+        Variant for message objects that expose ``gettype()``.
+
+        """
+        return urlobj.info().gettype()
+
+    def urllib_get_charset(urlobj):
+        """Return the ``charset`` parameter reported by ``urlobj.info()``.
+
+        Variant for message objects that expose ``getparam()``.
+
+        """
+        return urlobj.info().getparam('charset')
+
+    def urllib_get_filename(urlobj):
+        """Always return :obj:`None`; ``urlobj`` is not inspected.
+
+        NOTE(review): presumably the message object available in this
+        branch has no ``get_filename()`` equivalent -- confirm.
+
+        """
+        return None
 
     def unquote_to_bytes(data):
         if isinstance(data, unicode):

diff --git a/weasyprint/document.py b/weasyprint/document.py
@@ -113,11 +113,14 @@ def _gather_links_and_bookmarks(box, bookmarks, links, anchors, matrix):
     has_link = link and not isinstance(box, boxes.TextBox)
     # In case of duplicate IDs, only the first is an anchor.
     has_anchor = anchor_name and anchor_name not in anchors
+    is_attachment = hasattr(box, 'is_attachment') and box.is_attachment
 
     if has_bookmark or has_link or has_anchor:
         pos_x, pos_y, width, height = box.hit_area()
         if has_link:
             link_type, target = link
+            if link_type == 'external' and is_attachment:
+                link_type = 'attachment'
             if matrix:
                 link = _TaggedTuple(
                     (link_type, target, rectangle_aabb(
@@ -171,6 +174,8 @@ def __init__(self, page_box, enable_hinting=False):
         #    The anchor might be defined in another page,
         #    in multiple pages (in which case the first occurrence is used),
         #    or not at all.
+        #: * ``'attachment'``: :obj:`target` is an absolute URL and points
+        #:   to a resource to attach to the document.
         self.links = links = []
 
         #: A dict mapping anchor names to their target, ``(x, y)`` points
@@ -246,7 +251,8 @@ class DocumentMetadata(object):
 
     """
     def __init__(self, title=None, authors=None, description=None,
-                 keywords=None, generator=None, created=None, modified=None):
+                 keywords=None, generator=None, created=None, modified=None,
+                 attachments=None):
         #: The title of the document, as a string or :obj:`None`.
         #: Extracted from the ``<title>`` element in HTML
         #: and written to the ``/Title`` info field in PDF.
@@ -281,6 +287,11 @@ def __init__(self, title=None, authors=None, description=None,
         #: Extracted from the ``<meta name=dcterms.modified>`` element in HTML
         #: and written to the ``/ModDate`` info field in PDF.
         self.modified = modified
+        #: File attachments as a list of tuples of URL and a description or
+        #: :obj:`None`.
+        #: Extracted from the ``<link rel=attachment>`` elements in HTML
+        #: and written to the ``/EmbeddedFiles`` dictionary in PDF.
+        self.attachments = attachments or []
 
 
 class Document(object):
@@ -289,8 +300,8 @@ class Document(object):
 
     Typically obtained from :meth:`HTML.render() <weasyprint.HTML.render>`,
     but can also be instantiated directly
-    with a list of :class:`pages <Page>`
-    and a set of :class:`metadata <DocumentMetadata>`.
+    with a list of :class:`pages <Page>`,
+    a set of :class:`metadata <DocumentMetadata>` and a ``url_fetcher``.
 
     """
     @classmethod
@@ -306,15 +317,18 @@ def _render(cls, html, stylesheets, enable_hinting):
             build_formatting_structure(
                 html.root_element, style_for, get_image_from_uri))
         return cls([Page(p, enable_hinting) for p in page_boxes],
-                   DocumentMetadata(**html._get_metadata()))
+                   DocumentMetadata(**html._get_metadata()), html.url_fetcher)
 
-    def __init__(self, pages, metadata):
+    def __init__(self, pages, metadata, url_fetcher):
         #: A list of :class:`Page` objects.
         self.pages = pages
         #: A :class:`DocumentMetadata` object.
         #: Contains information that does not belong to a specific page
         #: but to the whole document.
         self.metadata = metadata
+        #: A ``url_fetcher`` for resources that have to be read when writing
+        #: the output.
+        self.url_fetcher = url_fetcher
 
     def copy(self, pages='all'):
         """Take a subset of the pages.
@@ -349,7 +363,7 @@ def copy(self, pages='all'):
             pages = self.pages
         elif not isinstance(pages, list):
             pages = list(pages)
-        return type(self)(pages, self.metadata)
+        return type(self)(pages, self.metadata, self.url_fetcher)
 
     def resolve_links(self):
         """Resolve internal hyperlinks.
@@ -431,7 +445,7 @@ def make_bookmark_tree(self):
                 last_by_depth.append(children)
         return root
 
-    def write_pdf(self, target=None, zoom=1):
+    def write_pdf(self, target=None, zoom=1, attachments=None):
         """Paint the pages in a PDF file, with meta-data.
 
         PDF files written directly by cairo do not have meta-data such as
@@ -447,6 +461,9 @@ def write_pdf(self, target=None, zoom=1):
             For values other than 1, physical CSS units will thus be “wrong”.
             Page size declarations are affected too, even with keyword values
             like ``@page { size: A3 landscape; }``
+        :param attachments: A list of additional file attachments for the
+            generated PDF document or :obj:`None`. The list's elements are
+            :class:`Attachment` objects, filenames, URLs or file-like objects.
         :returns:
             The PDF as byte string if :obj:`target` is :obj:`None`, otherwise
             :obj:`None` (the PDF is written to :obj:`target`.)
@@ -466,7 +483,8 @@ def write_pdf(self, target=None, zoom=1):
             surface.show_page()
         surface.finish()
 
-        write_pdf_metadata(self, file_obj, scale, self.metadata)
+        write_pdf_metadata(self, file_obj, scale, self.metadata, attachments,
+            self.url_fetcher)
 
         if target is None:
             return file_obj.getvalue()

diff --git a/weasyprint/html.py b/weasyprint/html.py
@@ -243,6 +243,13 @@ def handle_td(element, box, _get_image_from_uri):
     return [box]
 
 
+@handler('a')
+def handle_a(element, box, _get_image_from_uri):
+    """Handle the ``rel`` attribute of ``<a>`` elements.
+
+    Sets ``box.is_attachment`` to the result of
+    ``element_has_link_type(element, 'attachment')`` and returns the box
+    in a one-element list. ``weasyprint/document.py`` reads this flag to
+    turn an ``'external'`` link into an ``'attachment'`` link.
+
+    """
+    box.is_attachment = element_has_link_type(element, 'attachment')
+    return [box]
+
+
 def find_base_url(html_document, fallback_base_url):
     """Return the base URL for the document.
 
@@ -264,6 +271,7 @@ def get_html_metadata(html_document):
     http://www.whatwg.org/html#the-title-element
     http://www.whatwg.org/html#standard-metadata-names
     http://wiki.whatwg.org/wiki/MetaExtensions
+    http://microformats.org/wiki/existing-rel-values#HTML5_link_type_extensions
 
     """
     title = None
@@ -273,7 +281,8 @@ def get_html_metadata(html_document):
     authors = []
     created = None
     modified = None
-    for element in html_document.iter('title', 'meta'):
+    attachments = []
+    for element in html_document.iter('title', 'meta', 'link'):
         if element.tag == 'title' and title is None:
             title = get_child_text(element)
         elif element.tag == 'meta':
@@ -293,9 +302,18 @@ def get_html_metadata(html_document):
                 created = parse_w3c_date(name, element.sourceline, content)
             elif name == 'dcterms.modified' and modified is None:
                 modified = parse_w3c_date(name, element.sourceline, content)
+        elif element.tag == 'link' and element_has_link_type(element,
+            'attachment'):
+            # ``<link rel=attachment href=... title=...>``: collect the
+            # resource URL together with its optional description.
+            url = get_url_attribute(element, 'href')
+            # Use a dedicated name: assigning to ``title`` here would
+            # overwrite the document title returned by this function.
+            attachment_title = element.get('title', None)
+            if url is None:
+                # The link type is fixed in this branch, so spell it out;
+                # no ``rel`` variable is defined in the visible code.
+                LOGGER.warning('Missing href in <link rel="attachment">')
+            else:
+                attachments.append((url, attachment_title))
     return dict(title=title, description=description, generator=generator,
                 keywords=keywords, authors=authors,
-                created=created, modified=modified)
+                created=created, modified=modified,
+                attachments=attachments)
 
 
 def strip_whitespace(string):