Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Convert <source> tags. #9

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 34 additions & 6 deletions htmlark.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
from urllib.parse import quote
from urllib.parse import urljoin
from urllib.parse import urlparse
from urllib.request import pathname2url
from urllib.request import url2pathname

import bs4
# Import requests if available, dummy it if not
Expand Down Expand Up @@ -41,6 +43,10 @@ def get_available_parsers():
return available


def path2uri(path: str) -> str:
    """Convert a local filesystem path into a ``file:`` URI.

    ``pathname2url`` percent-encodes the path and normalizes OS-specific
    separators (e.g. Windows drive letters/backslashes), and ``urljoin``
    prepends the ``file:`` scheme to produce a well-formed URI.

    Parameters:
        path (str): Local filesystem path (absolute or relative).
    Returns:
        str: A ``file:`` URI referring to *path*.
    """
    return urljoin('file:', pathname2url(path))


def _get_resource(resource_url: str) -> (str, bytes):
"""Download or reads a file (online or local).

Expand All @@ -64,11 +70,18 @@ def _get_resource(resource_url: str) -> (str, bytes):
mimetype = mimetypes.guess_type(resource_url)
else:
raise NameError("HTTP URL found but requests not available")
elif url_parsed.scheme == '':
# '' is local file
with open(resource_url, 'rb') as f:
elif url_parsed.scheme in ['', 'file']:
if url_parsed.scheme == 'file':
# Yank the scheme instead of using url_parsed.path because
# Windows drive letters interfere with parsing.
path = url2pathname(resource_url[len(url_parsed.scheme)+1:])
else:
# '' is local file.
path = resource_url

with open(path, 'rb') as f:
data = f.read()
mimetype, _ = mimetypes.guess_type(resource_url)
mimetype, _ = mimetypes.guess_type(path)
elif url_parsed.scheme == 'data':
raise ValueError("Resource path is a data URI", url_parsed.scheme)
else:
Expand Down Expand Up @@ -101,7 +114,9 @@ def make_data_uri(mimetype: str, data: bytes) -> str:
def convert_page(page_path: str, parser: str='auto',
callback: Callable[[str, str, str], None]=lambda *_: None,
ignore_errors: bool=False, ignore_images: bool=False,
ignore_css: bool=False, ignore_js: bool=False) -> str:
ignore_css: bool=False, ignore_js: bool=False,
ignore_source: bool=False,
resource_callback: Callable[[str, str], str]=lambda m, c: c) -> str:
"""Take an HTML file or URL and outputs new HTML with resources as data URIs.

Parameters:
Expand All @@ -120,10 +135,16 @@ def convert_page(page_path: str, parser: str='auto',
Default: ``False``
ignore_js (bool): If ``True`` do not process ``<script>`` tags.
Default: ``False``
ignore_source (bool): If ``True`` do not process ``<source>`` tags.
Default: ``False``
callback (function): Called before a new resource is processed. Takes
three parameters: message type ('INFO' or 'ERROR'), a string with
the category of the callback (usually the tag related to the
message), and the message data (usually a string to be printed).
resource_callback (function): Callback before a new resource is processed.
Takes two parameters: mime_type and resource contents. Returns the
modified contents for the resource. This can be used to minify
content or compress images, for example.
Returns:
str: The new webpage HTML.
Raises:
Expand Down Expand Up @@ -196,6 +217,11 @@ def convert_page(page_path: str, parser: str='auto',
for script in scripttags:
if 'src' in script.attrs:
tags.append(script)
if not ignore_source:
sourcetags = soup('source')
for source in sourcetags:
if 'src' in source.attrs:
tags.append(source)

# Convert the linked resources
for tag in tags:
Expand Down Expand Up @@ -229,6 +255,7 @@ def convert_page(page_path: str, parser: str='auto',
if not ignore_errors:
raise
else:
tag_data = resource_callback(tag_mime, tag_data)
encoded_resource = make_data_uri(tag_mime, tag_data)
if tag.name == 'link':
tag['href'] = encoded_resource
Expand All @@ -241,7 +268,8 @@ def convert_page(page_path: str, parser: str='auto',
soup.html.insert_after(bs4.Comment(
"Generated by HTMLArk {}. Original URL {}".format(datetime.now(),
page_path)))
return str(soup)

return resource_callback('text/html', str(soup).encode()).decode()


def _get_options():
Expand Down