Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bugfix/build fix #32

Merged
merged 12 commits into from
Apr 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/pyre.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,12 +33,12 @@ jobs:
security-events: write
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4
with:
submodules: true

- name: Run Pyre
uses: facebook/pyre-action@60697a7858f7cc8470d8cc494a3cf2ad6b06560d
uses: facebook/pyre-action@v0.0.2
with:
# To customize these inputs:
# See https://github.com/facebook/pyre-action#inputs
Expand Down
13 changes: 6 additions & 7 deletions .github/workflows/pythonapp.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,19 +18,18 @@ jobs:

steps:
- uses: actions/checkout@v2
- name: Set up Python 3.9
- name: Set up Python
uses: actions/setup-python@v1
with:
python-version: 3.9
python-version: 3.12.3
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install flake8 mypy pytest pylint \
types-all types-attrs types-dataclasses types-PyYAML types-typed-ast
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
# - name: MyPy types checking
# run: |
# mypy --config-file .mypy.ini
if [ -f requirements_dev.txt ]; then pip install -r requirements_dev.txt; fi
- name: MyPy types checking
run: |
mypy --config-file .mypy.ini markdown_toolset
- name: Lint with pylint
run: |
pylint -rn -sn --rcfile=.pylintrc --fail-on=I --load-plugins=pylint.extensions.docparams markdown_toolset
Expand Down
5 changes: 3 additions & 2 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,8 @@ repos:
"types-requests",
"types-dataclasses>=0.1.3",
"types-PyYAML",
"types-typed-ast>=1.4.1"
"types-typed-ast>=1.4.1",
"types-markdown>=3.6.0.20240316"
]
# exclude: tests(/\w*)*/functional/|tests/input|tests(/.*)+/conftest.py|doc/data/messages|tests(/\w*)*data/

Expand All @@ -110,7 +111,7 @@ repos:
rev: "22.12.0"
hooks:
- id: black
args: ["-l", "120", "--skip-string-normalization"]
args: ["-l", "120", "--skip-string-normalization" ]

# - repo: https://github.com/DanielNoord/pydocstringformatter
# rev: v0.7.2
Expand Down
6 changes: 6 additions & 0 deletions .pyre_configuration
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"source_directories": [
"."
],
"exclude": ["install_git_hooks.py"]
}
141 changes: 88 additions & 53 deletions markdown_tool.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,7 @@
from mimetypes import types_map
from pathlib import Path

from markdown_toolset.article_processor import ArticleProcessor, DeduplicationVariant,\
IN_FORMATS_LIST, OUT_FORMATS_LIST
from markdown_toolset.article_processor import ArticleProcessor, DeduplicationVariant, IN_FORMATS_LIST, OUT_FORMATS_LIST

from markdown_toolset.__version__ import __version__

Expand All @@ -29,41 +28,44 @@ class CustomArgumentDefaultsHelpFormatter(RawDescriptionHelpFormatter):
"""

def _get_help_string(self, action):
help = action.help
help_ = action.help
if '%(default)' not in action.help:
if action.default is not SUPPRESS:
defaulting_nargs = [OPTIONAL, ZERO_OR_MORE]
if action.option_strings or action.nargs in defaulting_nargs:
help += ' (default: %(default)s)'
return help
help_ += ' (default: %(default)s)'
return help_


def main(arguments):
"""
Entrypoint.
"""

logging.basicConfig(format='%(asctime)s %(message)s', datefmt='%d.%m.%Y %H:%M:%S',
level='DEBUG' if arguments.verbose else 'INFO')
logging.basicConfig(
format='%(asctime)s %(message)s', datefmt='%d.%m.%Y %H:%M:%S', level='DEBUG' if arguments.verbose else 'INFO'
)

print(f'Markdown tool version {__version__} started...')

if arguments.process_local_images:
print('--process_local_images is deprecated and will be disabled in the next version!')

processor = ArticleProcessor(article_file_path_or_url=arguments.article_file_path_or_url,
skip_list=arguments.skip_list,
downloading_timeout=arguments.downloading_timeout,
output_format=arguments.output_format,
output_path=getattr(arguments, 'output_path', Path.cwd()),
remove_source=arguments.remove_source,
images_public_path=getattr(arguments, 'images_public_path', ''),
input_formats=arguments.input_format.split('+'),
skip_all_incorrect=arguments.skip_all_incorrect,
download_incorrect_mime=arguments.download_incorrect_mime,
deduplication_type=getattr(DeduplicationVariant, arguments.deduplication_type.upper()),
images_dirname=arguments.images_dirname,
save_hierarchy=arguments.prepend_images_with_path)
processor = ArticleProcessor(
article_file_path_or_url=arguments.article_file_path_or_url,
skip_list=arguments.skip_list,
downloading_timeout=arguments.downloading_timeout,
output_format=arguments.output_format,
output_path=getattr(arguments, 'output_path', Path.cwd()),
remove_source=arguments.remove_source,
images_public_path=getattr(arguments, 'images_public_path', ''),
input_formats=arguments.input_format.split('+'),
skip_all_incorrect=arguments.skip_all_incorrect,
download_incorrect_mime=arguments.download_incorrect_mime,
deduplication_type=getattr(DeduplicationVariant, arguments.deduplication_type.upper()),
images_dirname=arguments.images_dirname,
save_hierarchy=arguments.prepend_images_with_path,
)

processor.process()

Expand All @@ -76,42 +78,75 @@ def main(arguments):
prog='markdown_tool',
epilog='Use tool at your own risk!',
description=f'{__doc__}Version: {__version__}',
formatter_class=CustomArgumentDefaultsHelpFormatter
formatter_class=CustomArgumentDefaultsHelpFormatter,
)
parser.add_argument('article_file_path_or_url', type=str, help='path to the article file in the Markdown format')
parser.add_argument(
'-D',
'--deduplication-type',
choices=[i.name.lower() for i in DeduplicationVariant],
default='disabled',
help='Deduplicate images, using content hash or SHA1(image_name)',
)
parser.add_argument(
'-d',
'--images-dirname',
default='images',
help='Folder in which to download images ' '(possible variables: $article_name, $time, $date, $dt, $base_url)',
)
parser.add_argument(
'-a', '--skip-all-incorrect', default=False, action='store_true', help='skip all incorrect images'
)
parser.add_argument(
'-E',
'--download-incorrect-mime',
default=False,
action='store_true',
help='download "images" with unrecognized MIME type',
)
parser.add_argument(
'-s',
'--skip-list',
default=None,
help='skip URL\'s from the comma-separated list (or file with a leading \'@\')',
)
parser.add_argument('-i', '--input-format', default='md', choices=IN_FORMATS_LIST, help='input format')
parser.add_argument(
'-l', '--process-local-images', default=False, action='store_true', help='[DEPRECATED] Process local images'
)
parser.add_argument(
'-n',
'--replace-image-names',
default=False,
action='store_true',
help='Replace image names, using content hash',
)
parser.add_argument(
'-o', '--output-format', default=OUT_FORMATS_LIST[0], choices=OUT_FORMATS_LIST, help='output format'
)
parser.add_argument(
'-p',
'--images-public-path',
default=SUPPRESS,
help='Public path to the folder of downloaded images '
'(possible variables: $article_name, $time, $date, $dt, $base_url)',
)
parser.add_argument('article_file_path_or_url', type=str,
help='path to the article file in the Markdown format')
parser.add_argument('-D', '--deduplication-type', choices=[i.name.lower() for i in DeduplicationVariant],
default='disabled', help='Deduplicate images, using content hash or SHA1(image_name)')
parser.add_argument('-d', '--images-dirname', default='images',
help='Folder in which to download images '
'(possible variables: $article_name, $time, $date, $dt, $base_url)')
parser.add_argument('-a', '--skip-all-incorrect', default=False, action='store_true',
help='skip all incorrect images')
parser.add_argument('-E', '--download-incorrect-mime', default=False, action='store_true',
help='download "images" with unrecognized MIME type')
parser.add_argument('-s', '--skip-list', default=None,
help='skip URL\'s from the comma-separated list (or file with a leading \'@\')')
parser.add_argument('-i', '--input-format', default='md', choices=IN_FORMATS_LIST,
help='input format')
parser.add_argument('-l', '--process-local-images', default=False, action='store_true',
help='[DEPRECATED] Process local images')
parser.add_argument('-n', '--replace-image-names', default=False, action='store_true',
help='Replace image names, using content hash')
parser.add_argument('-o', '--output-format', default=OUT_FORMATS_LIST[0], choices=OUT_FORMATS_LIST,
help='output format')
parser.add_argument('-p', '--images-public-path', default=SUPPRESS,
help='Public path to the folder of downloaded images '
'(possible variables: $article_name, $time, $date, $dt, $base_url)')
# TODO: Replace this with variables.
parser.add_argument('-P', '--prepend-images-with-path', default=False, action='store_true',
help='Save relative images paths')
parser.add_argument('-R', '--remove-source', default=False, action='store_true',
help='Remove or replace source file')
parser.add_argument('-t', '--downloading-timeout', type=float, default=-1,
help='how many seconds to wait before downloading will be failed')
parser.add_argument(
'-P', '--prepend-images-with-path', default=False, action='store_true', help='Save relative images paths'
)
parser.add_argument(
'-R', '--remove-source', default=False, action='store_true', help='Remove or replace source file'
)
parser.add_argument(
'-t',
'--downloading-timeout',
type=float,
default=-1,
help='how many seconds to wait before downloading will be failed',
)
parser.add_argument('-O', '--output-path', type=str, help='article output file name or path', default=SUPPRESS)
parser.add_argument('--verbose', '-v', default=False, action='store_true',
help='More verbose logging')
parser.add_argument('--verbose', '-v', default=False, action='store_true', help='More verbose logging')
parser.add_argument('--version', action='version', version=f'%(prog)s {__version__}', help='return version number')

args = parser.parse_args()
Expand Down
14 changes: 13 additions & 1 deletion markdown_toolset/string_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import re
import unicodedata
from pathlib import Path
from typing import BinaryIO, Union
from typing import BinaryIO, Union, TextIO, List, Dict


def slugify(value):
Expand Down Expand Up @@ -37,3 +37,15 @@ def compare_files(filename1: Union[Path, str], filename2: Union[Path, str]) -> b
with open(filename1, 'rb') as f1:
with open(filename2, 'rb') as f2:
return is_binary_same(f1, f2)


def replace_strings(replacement_mapping: Dict[str, str], text_stream: TextIO) -> List[str]:
    """
    Apply every substitution from the mapping to each line of the stream.

    :param replacement_mapping: source substring -> replacement value; values are
        converted with ``str()`` and applied in the mapping's iteration order,
        so a later pair sees the result of the earlier ones.
    :param text_stream: iterable of text lines, consumed from its current position.
    :return: list with the processed lines, in the original order.
    """

    def _substitute(text: str) -> str:
        # Replacements are applied one after another to the same line.
        for needle, replacement in replacement_mapping.items():
            text = text.replace(needle, str(replacement))
        return text

    return [_substitute(raw_line) for raw_line in text_stream]
11 changes: 3 additions & 8 deletions markdown_toolset/transformers/html/transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@

__all__ = ['ArticleTransformer']

from ...string_tools import replace_strings


class HTMLImageURLGrabber(HTMLParser, ABC):
def __init__(self):
Expand Down Expand Up @@ -53,15 +55,8 @@ def _read_article(self) -> List[str]:

def _fix_document_urls(self) -> List[str]:
logging.debug('Replacing images urls in the document...')
replacement_mapping = self._replacement_mapping
lines = []
self._article_stream.seek(self._start_pos)
for line in self._article_stream:
for src, target in replacement_mapping.items():
line = line.replace(src, str(target))
lines.append(line)

return lines
return replace_strings(self._replacement_mapping, self._article_stream)

def run(self):
"""
Expand Down
15 changes: 5 additions & 10 deletions markdown_toolset/transformers/md/transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,11 @@
from markdown.treeprocessors import Treeprocessor
from markdown.extensions import Extension

__all__ = ['ArticleTransformer']
from ...image_downloader import ImageLink
from ...string_tools import replace_strings


from markdown_toolset.image_downloader import ImageLink
__all__ = ['ArticleTransformer']


class ImgExtractor(Treeprocessor):
Expand Down Expand Up @@ -82,12 +84,5 @@ def fix_n(n):

def _fix_document_urls(self) -> List[str]:
logging.debug('Replacing images urls in the document...')
replacement_mapping = self._replacement_mapping
lines = []
self._article_stream.seek(self._start_pos)
for line in self._article_stream:
for src, target in replacement_mapping.items():
line = line.replace(src, str(target))
lines.append(line)

return lines
return replace_strings(self._replacement_mapping, self._article_stream)
9 changes: 5 additions & 4 deletions markdown_toolset/www_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@

from typing import Optional
from mimetypes import guess_extension
import os
import re
from urllib.parse import urlparse, urlunparse
import requests
Expand All @@ -23,7 +22,6 @@ def is_url(url: str, allowed_url_prefixes=('http', 'ftp', 'https', 'ftps')) -> b
"""
Check url for prefix match.
"""

l_url = url.lower()
for prefix in set(allowed_url_prefixes):
if l_url.startswith(prefix.lower()):
Expand All @@ -43,6 +41,7 @@ def remove_protocol_prefix(url: str) -> str:
def download_from_url(url: str, timeout: float = None):
"""
Download file from the URL.

:param url: URL to download.
:param timeout: timeout before fail.
:raise OSError: when HTTP status is not 200.
Expand Down Expand Up @@ -91,7 +90,10 @@ def get_filename_from_url(req: requests.Response) -> Optional[str]:

result = file_name[0]

f_name, f_ext = os.path.splitext(result)
f_name, f_ext = (
(name_and_ext := result.rsplit('.', 1)),
(*name_and_ext, None) if len(name_and_ext) == 1 else name_and_ext,
)[1:][0]

if f_name == '':
return None
Expand All @@ -109,7 +111,6 @@ def get_base_url(req: requests.Response) -> Optional[str]:
"""
Get base URL from url.
"""

if req and req.url.find('/'):
return req.url.rsplit('/', 1)[0]

Expand Down
10 changes: 10 additions & 0 deletions requirements_dev.txt
Original file line number Diff line number Diff line change
@@ -1 +1,11 @@
flake8
mypy
pre-commit==2.20.0
pylint
pytest
types-all
types-attrs
types-dataclasses
types-markdown
types-PyYAML
types-typed-ast
Loading