diff --git a/htmlark.py b/htmlark.py index 02c2918..2cf471e 100755 --- a/htmlark.py +++ b/htmlark.py @@ -11,6 +11,8 @@ from urllib.parse import quote from urllib.parse import urljoin from urllib.parse import urlparse +from urllib.request import pathname2url +from urllib.request import url2pathname import bs4 # Import requests if available, dummy it if not @@ -41,6 +43,10 @@ def get_available_parsers(): return available +def path2uri(path): + return urljoin('file:', pathname2url(path)) + + def _get_resource(resource_url: str) -> (str, bytes): """Download or reads a file (online or local). @@ -64,11 +70,18 @@ def _get_resource(resource_url: str) -> (str, bytes): mimetype = mimetypes.guess_type(resource_url) else: raise NameError("HTTP URL found but requests not available") - elif url_parsed.scheme == '': - # '' is local file - with open(resource_url, 'rb') as f: + elif url_parsed.scheme in ['', 'file']: + if url_parsed.scheme == 'file': + # Yank the scheme instead of using url_parsed.path because + # Windows drive letters interfere with parsing. + path = url2pathname(resource_url[len(url_parsed.scheme)+1:]) + else: + # '' is local file. + path = resource_url + + with open(path, 'rb') as f: data = f.read() - mimetype, _ = mimetypes.guess_type(resource_url) + mimetype, _ = mimetypes.guess_type(path) elif url_parsed.scheme == 'data': raise ValueError("Resource path is a data URI", url_parsed.scheme) else: @@ -101,7 +114,9 @@ def make_data_uri(mimetype: str, data: bytes) -> str: def convert_page(page_path: str, parser: str='auto', callback: Callable[[str, str, str], None]=lambda *_: None, ignore_errors: bool=False, ignore_images: bool=False, - ignore_css: bool=False, ignore_js: bool=False) -> str: + ignore_css: bool=False, ignore_js: bool=False, + ignore_source: bool=False, + resource_callback: Callable[[str, str], str]=lambda m, c: c) -> str: """Take an HTML file or URL and outputs new HTML with resources as data URIs. Parameters: @@ -120,10 +135,16 @@ def convert_page(page_path: str, parser: str='auto', Default: ``False`` ignore_js (bool): If ``True`` do not process ``