reviewed sampling and UrlStore efficiency (#31)
* more efficient sampling

* simplify code

* more efficient processing
adbar authored May 15, 2023
1 parent eb23b9b commit 76be6d5
Showing 3 changed files with 17 additions and 20 deletions.
courlan/core.py (11 additions & 18 deletions)
@@ -132,18 +132,13 @@ def sample_urls(
         LOGGER.setLevel(logging.DEBUG)
     else:
         LOGGER.setLevel(logging.ERROR)
-    # deduplicate
-    input_urls = list(dict.fromkeys(input_urls))
-    # validate
-    input_urls = [
-        u
-        for u in input_urls
-        if check_url(u, strict=strict, with_redirects=False) is not None
-    ]
     # store
     output_urls = []
-    urlstore = UrlStore(compressed=False, language=None, strict=strict)
-    urlstore.add_urls(input_urls)
+    use_compression = len(input_urls) > 10**6
+    urlstore = UrlStore(
+        compressed=use_compression, language=None, strict=strict, verbose=verbose
+    )
+    urlstore.add_urls(sorted(input_urls))
     # iterate
     for domain in urlstore.urldict:  # key=cmp_to_key(locale.strcoll)
         urlpaths = [p.urlpath for p in urlstore._load_urls(domain)]
@@ -154,17 +149,15 @@ def sample_urls(
             or exclude_max is not None
             and len(urlpaths) > exclude_max
         ):
-            LOGGER.info("discarded (size): %s\t\turls: %s", domain, len(urlpaths))
-            continue
-        # copy all URLs
-        if len(urlpaths) <= samplesize:
-            output_urls.extend([domain + p for p in urlpaths])
-            LOGGER.info("%s\t\turls: %s", domain, len(urlpaths))
+            LOGGER.warning("discarded (size): %s\t\turls: %s", domain, len(urlpaths))
             continue
         # sample
-        mysample = sorted(sample(urlpaths, k=samplesize))
+        if len(urlpaths) > samplesize:
+            mysample = sorted(sample(urlpaths, k=samplesize))
+        else:
+            mysample = urlpaths
         output_urls.extend([domain + p for p in mysample])
-        LOGGER.info(
+        LOGGER.debug(
             "%s\t\turls: %s\tprop.: %s",
             domain,
             len(mysample),
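In short, sample_urls now hands deduplication and validation off to UrlStore, enables compressed storage only for very large inputs (more than 10**6 URLs), and picks each domain sample with a plain if/else instead of a separate copy-all branch. A minimal usage sketch, assuming the public courlan API at this release; the URLs and sample size are made up for illustration:

from courlan import sample_urls

# a small, partly redundant URL collection; UrlStore deduplicates
# and validates internally, so no pre-filtering is needed here
urls = [
    "https://example.org/page/1",
    "https://example.org/page/2",
    "https://example.org/page/1",  # duplicate, dropped by UrlStore
    "https://example.com/about",
]

# keep at most 2 URL paths per domain; domains falling outside the
# optional exclude_min/exclude_max bounds are skipped with a warning
selection = sample_urls(urls, 2)
print(selection)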
courlan/urlstore.py (1 addition & 1 deletion)
@@ -96,7 +96,7 @@ def _buffer_urls(
         self, data: List[str], visited: bool = False
     ) -> DefaultDict[str, Deque[UrlPathTuple]]:
         inputdict: DefaultDict[str, Deque[UrlPathTuple]] = defaultdict(deque)
-        for url in list(dict.fromkeys(data)):
+        for url in dict.fromkeys(data):
             # segment URL and add to domain dictionary
             try:
                 # validate
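The list() wrapper around dict.fromkeys was redundant: dict.fromkeys already removes duplicates while preserving insertion order, and the resulting dict can be iterated directly without materialising an extra list. A standalone illustration (not courlan code):

urls = [
    "https://example.org/a",
    "https://example.org/b",
    "https://example.org/a",  # duplicate
]

# dict.fromkeys keeps the first occurrence of each key in order;
# iterating the dict yields those keys, so no list() copy is needed
for url in dict.fromkeys(urls):
    print(url)
# https://example.org/a
# https://example.org/b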
courlan/urlutils.py (5 additions & 1 deletion)
@@ -80,7 +80,11 @@ def get_base_url(url: Any) -> str:
     """Strip URL of some of its parts to get base URL.
     Accepts strings and urllib.parse ParseResult objects."""
     parsed_url = _parse(url)
-    return parsed_url._replace(path="", params="", query="", fragment="").geturl()
+    if parsed_url.scheme:
+        scheme = parsed_url.scheme + "://"
+    else:
+        scheme = ""
+    return scheme + parsed_url.netloc
 
 
 def get_host_and_path(url: Any) -> Tuple[str, str]:
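get_base_url now skips the round trip through ParseResult._replace().geturl() and simply concatenates the scheme (when present) with the network location. A rough standalone equivalent using urllib.parse instead of courlan's internal _parse helper:

from urllib.parse import urlparse

def base_url(url: str) -> str:
    # keep only scheme and netloc, mirroring the new get_base_url logic
    parsed = urlparse(url)
    scheme = parsed.scheme + "://" if parsed.scheme else ""
    return scheme + parsed.netloc

print(base_url("https://www.example.org/path/page.html?q=1#top"))
# https://www.example.org
# note: urlparse("example.org/page") puts everything in .path, so a
# schemeless string yields an empty result with plain urlparse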
