more efficient sampling #31

Merged 3 commits on May 15, 2023.
courlan/core.py (29 changes: 11 additions & 18 deletions)

@@ -132,18 +132,13 @@ def sample_urls(
         LOGGER.setLevel(logging.DEBUG)
     else:
         LOGGER.setLevel(logging.ERROR)
-    # deduplicate
-    input_urls = list(dict.fromkeys(input_urls))
-    # validate
-    input_urls = [
-        u
-        for u in input_urls
-        if check_url(u, strict=strict, with_redirects=False) is not None
-    ]
     # store
     output_urls = []
-    urlstore = UrlStore(compressed=False, language=None, strict=strict)
-    urlstore.add_urls(input_urls)
+    use_compression = len(input_urls) > 10**6
+    urlstore = UrlStore(
+        compressed=use_compression, language=None, strict=strict, verbose=verbose
+    )
+    urlstore.add_urls(sorted(input_urls))
     # iterate
     for domain in urlstore.urldict:  # key=cmp_to_key(locale.strcoll)
         urlpaths = [p.urlpath for p in urlstore._load_urls(domain)]
@@ -154,17 +149,15 @@ def sample_urls(
             or exclude_max is not None
             and len(urlpaths) > exclude_max
         ):
-            LOGGER.info("discarded (size): %s\t\turls: %s", domain, len(urlpaths))
-            continue
-        # copy all URLs
-        if len(urlpaths) <= samplesize:
-            output_urls.extend([domain + p for p in urlpaths])
-            LOGGER.info("%s\t\turls: %s", domain, len(urlpaths))
+            LOGGER.warning("discarded (size): %s\t\turls: %s", domain, len(urlpaths))
             continue
         # sample
-        mysample = sorted(sample(urlpaths, k=samplesize))
+        if len(urlpaths) > samplesize:
+            mysample = sorted(sample(urlpaths, k=samplesize))
+        else:
+            mysample = urlpaths
         output_urls.extend([domain + p for p in mysample])
-        LOGGER.info(
+        LOGGER.debug(
             "%s\t\turls: %s\tprop.: %s",
             domain,
             len(mysample),
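Taken together, the core.py changes move deduplication and validation into UrlStore, turn on compression only for very large inputs (more than 10**6 URLs), and skip random sampling for domains that already hold no more paths than the sample size. A minimal usage sketch, assuming the public sample_urls entry point and hypothetical example URLs:

from courlan import sample_urls

# hypothetical input: duplicates and per-domain counts are handled internally
urls = [
    "https://example.org/page1",
    "https://example.org/page2",
    "https://example.org/page1",  # duplicate, dropped during storage
    "https://example.net/about",
]
# at most 2 URLs per domain; smaller domains are passed through unsampled
print(sample_urls(urls, samplesize=2))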
courlan/urlstore.py (2 changes: 1 addition & 1 deletion)

@@ -96,7 +96,7 @@ def _buffer_urls(
         self, data: List[str], visited: bool = False
     ) -> DefaultDict[str, Deque[UrlPathTuple]]:
         inputdict: DefaultDict[str, Deque[UrlPathTuple]] = defaultdict(deque)
-        for url in list(dict.fromkeys(data)):
+        for url in dict.fromkeys(data):
             # segment URL and add to domain dictionary
             try:
                 # validate
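The urlstore.py tweak removes a redundant list() call: dict.fromkeys() already deduplicates while preserving insertion order (guaranteed since Python 3.7), and the resulting dict can be iterated directly, so building an intermediate list only adds memory overhead. A quick illustration with hypothetical data:

data = ["a", "b", "a", "c", "b"]
# iterating the dict directly yields "a", "b", "c" in first-seen order,
# without materializing a second list of the whole input
deduplicated = dict.fromkeys(data)
print(list(deduplicated))  # ['a', 'b', 'c']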
courlan/urlutils.py (6 changes: 5 additions & 1 deletion)

@@ -80,7 +80,11 @@ def get_base_url(url: Any) -> str:
     """Strip URL of some of its parts to get base URL.
     Accepts strings and urllib.parse ParseResult objects."""
     parsed_url = _parse(url)
-    return parsed_url._replace(path="", params="", query="", fragment="").geturl()
+    if parsed_url.scheme:
+        scheme = parsed_url.scheme + "://"
+    else:
+        scheme = ""
+    return scheme + parsed_url.netloc
 
 
 def get_host_and_path(url: Any) -> Tuple[str, str]:
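The urlutils.py change rebuilds the base URL from its scheme and netloc directly rather than emptying the other ParseResult fields and calling geturl(), which avoids an intermediate object and is explicit about scheme-less input. A standalone sketch of the same logic, assuming _parse() boils down to urllib.parse.urlparse for string input (that assumption is mine, taken from the docstring in the diff context):

from urllib.parse import urlparse

def base_url_sketch(url: str) -> str:
    # same logic as the patched get_base_url, using urlparse directly
    parsed_url = urlparse(url)
    scheme = parsed_url.scheme + "://" if parsed_url.scheme else ""
    return scheme + parsed_url.netloc

print(base_url_sketch("https://example.org/path/page.html?q=1#top"))  # https://example.org
print(base_url_sketch("//example.org/path"))  # example.org (no scheme given)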