Skip to content

Commit

Permalink
CLI sampling: consume input in chunks (#49)
Browse files Browse the repository at this point in the history
* CLI sampling: consume input in chunks

* better batches and tests
  • Loading branch information
adbar authored Jul 3, 2023
1 parent 0735668 commit 2699fe6
Show file tree
Hide file tree
Showing 3 changed files with 63 additions and 30 deletions.
37 changes: 28 additions & 9 deletions courlan/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,19 @@
## under GNU GPL v3 license

import argparse
import logging
import sys

from concurrent.futures import ProcessPoolExecutor, as_completed
from itertools import islice
from typing import Any, List, Optional, Tuple
from typing import Any, Iterator, List, Optional, Tuple

from .core import check_url
from .sampling import sample_urls
from .sampling import _make_sample
from .urlstore import UrlStore


LOGGER = logging.getLogger(__name__)


def parse_args(args: Any) -> Any:
Expand Down Expand Up @@ -102,21 +107,35 @@ def _cli_check_urls(
return results


def _batch_lines(inputfile: str) -> Iterator[List[str]]:
"Read input line in batches"
with open(inputfile, "r", encoding="utf-8", errors="ignore") as inputfh:
while True:
batch = list(islice(inputfh, 10**5))
if not batch:
return
yield batch


def _cli_sample(args: Any) -> None:
"Sample URLs on the CLI."
urllist: List[str] = []
if args.verbose:
LOGGER.setLevel(logging.DEBUG)
else:
LOGGER.setLevel(logging.ERROR)

with open(args.inputfile, "r", encoding="utf-8", errors="ignore") as inputfh:
urllist.extend(line.strip() for line in inputfh)
urlstore = UrlStore(
compressed=True, language=None, strict=args.strict, verbose=args.verbose
)
for batch in _batch_lines(args.inputfile):
urlstore.add_urls(batch)

with open(args.outputfile, "w", encoding="utf-8") as outputfh:
for url in sample_urls(
urllist,
for url in _make_sample(
urlstore,
args.samplesize,
exclude_min=args.exclude_min,
exclude_max=args.exclude_max,
strict=args.strict,
verbose=args.verbose,
):
outputfh.write(url + "\n")

Expand Down
46 changes: 26 additions & 20 deletions courlan/sampling.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,28 +14,14 @@
LOGGER = logging.getLogger(__name__)


def sample_urls(
input_urls: List[str],
def _make_sample(
urlstore: UrlStore,
samplesize: int,
exclude_min: Optional[int] = None,
exclude_max: Optional[int] = None,
strict: bool = False,
verbose: bool = False,
exclude_min: Optional[int],
exclude_max: Optional[int],
) -> List[str]:
"""Sample a list of URLs by domain name, optionally using constraints on their number"""
# logging
if verbose:
LOGGER.setLevel(logging.DEBUG)
else:
LOGGER.setLevel(logging.ERROR)
# store
"Iterate through the hosts in store and draw samples."
output_urls = []
use_compression = len(input_urls) > 10**6
urlstore = UrlStore(
compressed=use_compression, language=None, strict=strict, verbose=verbose
)
urlstore.add_urls(sorted(input_urls))
# iterate
for domain in urlstore.urldict: # key=cmp_to_key(locale.strcoll)
urlpaths = [
p.urlpath
Expand Down Expand Up @@ -64,5 +50,25 @@ def sample_urls(
len(mysample),
len(mysample) / len(urlpaths),
)
# return gathered URLs
return output_urls


def sample_urls(
    input_urls: List[str],
    samplesize: int,
    exclude_min: Optional[int] = None,
    exclude_max: Optional[int] = None,
    strict: bool = False,
    verbose: bool = False,
) -> List[str]:
    """Sample a list of URLs by domain name, optionally using constraints on their number.

    The input is loaded into a compressed ``UrlStore`` and the drawing of
    the sample is delegated to ``_make_sample``.
    """
    # verbosity decides how much this module's logger reports
    LOGGER.setLevel(logging.DEBUG if verbose else logging.ERROR)
    # fill a store with the input URLs
    store = UrlStore(compressed=True, language=None, strict=strict, verbose=verbose)
    store.add_urls(input_urls)
    # draw the sample from the store and hand it back
    return _make_sample(store, samplesize, exclude_min, exclude_max)
10 changes: 9 additions & 1 deletion tests/unit_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -920,7 +920,10 @@ def test_cli():
assert os.system("courlan --help") == 0 # exit status

# _cli_check_urls
assert cli._cli_check_urls(["123", "https://example.org"]) == [(False, "123"), (True, "https://example.org")]
assert cli._cli_check_urls(["123", "https://example.org"]) == [
(False, "123"),
(True, "https://example.org"),
]

# testfile
inputfile = os.path.join(RESOURCES_DIR, "input.txt")
Expand Down Expand Up @@ -963,12 +966,17 @@ def test_cli():
with redirect_stdout(f):
cli.process_args(args)
assert len(f.getvalue()) == 0

testargs = ["", "-i", inputfile, "-o", "/tmp/tralala.txt", "--sample"]
with patch.object(sys, "argv", testargs):
args = cli.parse_args(testargs)
with redirect_stdout(f):
cli.process_args(args)
assert len(f.getvalue()) == 0
args.verbose = True
with redirect_stdout(f):
cli.process_args(args)
assert len(f.getvalue()) == 0
# delete temporary output file
try:
os.remove(temp_outputfile)
Expand Down

0 comments on commit 2699fe6

Please sign in to comment.