Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Convert <source> tags. #9

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 34 additions & 6 deletions htmlark.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
from urllib.parse import quote
from urllib.parse import urljoin
from urllib.parse import urlparse
from urllib.request import pathname2url
from urllib.request import url2pathname

import bs4
# Import requests if available, dummy it if not
Expand Down Expand Up @@ -41,6 +43,10 @@ def get_available_parsers():
return available


def path2uri(path: str) -> str:
    """Convert a local filesystem path into a ``file:`` URI.

    ``pathname2url`` percent-encodes the path and normalizes OS-specific
    separators (e.g. Windows drive letters/backslashes), and ``urljoin``
    prepends the ``file:`` scheme to produce a well-formed URI.

    Parameters:
        path (str): Local filesystem path (absolute or relative).
    Returns:
        str: A ``file:`` URI referring to *path*.
    """
    return urljoin('file:', pathname2url(path))


def _get_resource(resource_url: str) -> (str, bytes):
"""Download or reads a file (online or local).

Expand All @@ -64,11 +70,18 @@ def _get_resource(resource_url: str) -> (str, bytes):
mimetype = mimetypes.guess_type(resource_url)
else:
raise NameError("HTTP URL found but requests not available")
elif url_parsed.scheme == '':
# '' is local file
with open(resource_url, 'rb') as f:
elif url_parsed.scheme in ['', 'file']:
if url_parsed.scheme == 'file':
# Yank the scheme instead of using url_parsed.path because
# Windows drive letters interfere with parsing.
path = url2pathname(resource_url[len(url_parsed.scheme)+1:])
else:
# '' is local file.
path = resource_url

with open(path, 'rb') as f:
data = f.read()
mimetype, _ = mimetypes.guess_type(resource_url)
mimetype, _ = mimetypes.guess_type(path)
elif url_parsed.scheme == 'data':
raise ValueError("Resource path is a data URI", url_parsed.scheme)
else:
Expand Down Expand Up @@ -101,7 +114,9 @@ def make_data_uri(mimetype: str, data: bytes) -> str:
def convert_page(page_path: str, parser: str='auto',
callback: Callable[[str, str, str], None]=lambda *_: None,
ignore_errors: bool=False, ignore_images: bool=False,
ignore_css: bool=False, ignore_js: bool=False) -> str:
ignore_css: bool=False, ignore_js: bool=False,
ignore_source: bool=False,
resource_callback: Callable[[str, str], str]=lambda m, c: c) -> str:
"""Take an HTML file or URL and outputs new HTML with resources as data URIs.

Parameters:
Expand All @@ -120,10 +135,16 @@ def convert_page(page_path: str, parser: str='auto',
Default: ``False``
ignore_js (bool): If ``True`` do not process ``<script>`` tags.
Default: ``False``
ignore_source (bool): If ``True`` do not process ``<source>`` tags.
Default: ``False``
callback (function): Called before a new resource is processed. Takes
three parameters: message type ('INFO' or 'ERROR'), a string with
the category of the callback (usually the tag related to the
message), and the message data (usually a string to be printed).
resource_callback (function): Callback before a new resource is processed.
Takes two parameters: mime_type and resource contents. Returns the
modified contents for the resource. This can be used to minify
content or compress images, for example.
Returns:
str: The new webpage HTML.
Raises:
Expand Down Expand Up @@ -196,6 +217,11 @@ def convert_page(page_path: str, parser: str='auto',
for script in scripttags:
if 'src' in script.attrs:
tags.append(script)
if not ignore_source:
sourcetags = soup('source')
for source in sourcetags:
if 'src' in source.attrs:
tags.append(source)

# Convert the linked resources
for tag in tags:
Expand Down Expand Up @@ -229,6 +255,7 @@ def convert_page(page_path: str, parser: str='auto',
if not ignore_errors:
raise
else:
tag_data = resource_callback(tag_mime, tag_data)
encoded_resource = make_data_uri(tag_mime, tag_data)
if tag.name == 'link':
tag['href'] = encoded_resource
Expand All @@ -241,7 +268,8 @@ def convert_page(page_path: str, parser: str='auto',
soup.html.insert_after(bs4.Comment(
"Generated by HTMLArk {}. Original URL {}".format(datetime.now(),
page_path)))
return str(soup)

return resource_callback('text/html', str(soup).encode()).decode()


def _get_options():
Expand Down