Skip to content

Commit

Permalink
CLI: add partial support for parallel processing (#37)
Browse files Browse the repository at this point in the history
* CLI: add partial support for parallel processing

* add test
  • Loading branch information
adbar authored May 30, 2023
1 parent 1f946ef commit d4ec4de
Show file tree
Hide file tree
Showing 2 changed files with 55 additions and 16 deletions.
68 changes: 52 additions & 16 deletions courlan/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@
import argparse
import sys

from typing import Any, List
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Any, List, Optional, Tuple

from .core import check_url, sample_urls

Expand Down Expand Up @@ -42,6 +43,13 @@ def parse_args(args: Any) -> Any:
group1.add_argument(
"-v", "--verbose", help="increase output verbosity", action="store_true"
)
group1.add_argument(
"-p",
"--parallel",
help="number of parallel threads (not used for sampling)",
type=int,
default=4,
)
group2 = argsparser.add_argument_group("Filtering", "Configure URL filters")
group2.add_argument(
"--strict", help="perform more restrictive tests", action="store_true"
Expand All @@ -68,25 +76,53 @@ def parse_args(args: Any) -> Any:
return argsparser.parse_args()


def _cli_check_url(
    url: str,
    strict: bool = False,
    with_redirects: bool = False,
    language: Optional[str] = None,
    with_nav: bool = False,
) -> Tuple[bool, str]:
    """Check a single URL and report the outcome as a ``(valid, url)`` pair.

    Thin wrapper around ``check_url`` meant to be submitted as a task to the
    executor used by the CLI, so that every task yields a uniform result
    regardless of whether the URL was accepted.

    Args:
        url: The URL (input line) to check.
        strict: Forwarded to ``check_url``.
        with_redirects: Forwarded to ``check_url``.
        language: Forwarded to ``check_url``.
        with_nav: Forwarded to ``check_url``.

    Returns:
        ``(True, checked_url)`` where ``checked_url`` is the first item of the
        value returned by ``check_url``, or ``(False, url)`` with the input
        passed back unchanged when ``check_url`` returns ``None``.
    """
    checked = check_url(
        url,
        strict=strict,
        with_redirects=with_redirects,
        language=language,
        with_nav=with_nav,
    )
    # Rejected input: hand the original string back so the caller can log it.
    if checked is None:
        return (False, url)
    # Accepted input: only the first element of the result is passed on.
    return (True, checked[0])


def process_args(args: Any) -> None:
"""Start processing according to the arguments"""
if not args.sample:
with open(
args.inputfile, "r", encoding="utf-8", errors="ignore"
) as inputfh, open(args.outputfile, "w", encoding="utf-8") as outputfh:
for line in inputfh:
result = check_url(
line,
strict=args.strict,
with_redirects=args.redirects,
language=args.language,
with ThreadPoolExecutor(max_workers=args.parallel) as executor:
with open(
args.inputfile, "r", encoding="utf-8", errors="ignore"
) as inputfh, open(args.outputfile, "w", encoding="utf-8") as outputfh:
futures = (
executor.submit(
_cli_check_url,
line,
strict=args.strict,
with_redirects=args.redirects,
language=args.language,
)
for line in inputfh
)
if result is not None:
outputfh.write(result[0] + "\n")
# proceed with discarded URLs. to be rewritten
elif args.discardedfile is not None:
with open(args.discardedfile, "a", encoding="utf-8") as discardfh:
discardfh.write(line)
for future in as_completed(futures):
valid, url = future.result()
if valid:
outputfh.write(url + "\n")
# proceed with discarded URLs. to be rewritten
elif args.discardedfile is not None:
with open(
args.discardedfile, "a", encoding="utf-8"
) as discardfh:
discardfh.write(url)
else:
urllist: List[str] = []
with open(args.inputfile, "r", encoding="utf-8", errors="ignore") as inputfh:
Expand Down
3 changes: 3 additions & 0 deletions tests/unit_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -856,6 +856,8 @@ def test_cli():
"-v",
"--language",
"en",
"--parallel",
"2",
]
with patch.object(sys, "argv", testargs):
args = cli.parse_args(testargs)
Expand All @@ -864,6 +866,7 @@ def test_cli():
assert args.outputfile == "output.txt"
assert args.verbose is True
assert args.language == "en"
assert args.parallel == 2
assert os.system("courlan --help") == 0 # exit status

# testfile
Expand Down

0 comments on commit d4ec4de

Please sign in to comment.