From 81f30aec97279b1273d37156e298e7d3796f3688 Mon Sep 17 00:00:00 2001 From: Adrien Barbaresi Date: Mon, 15 May 2023 16:52:40 +0200 Subject: [PATCH 1/3] more efficient sampling --- courlan/core.py | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/courlan/core.py b/courlan/core.py index c027913c..becb397a 100644 --- a/courlan/core.py +++ b/courlan/core.py @@ -132,18 +132,13 @@ def sample_urls( LOGGER.setLevel(logging.DEBUG) else: LOGGER.setLevel(logging.ERROR) - # deduplicate - input_urls = list(dict.fromkeys(input_urls)) - # validate - input_urls = [ - u - for u in input_urls - if check_url(u, strict=strict, with_redirects=False) is not None - ] # store output_urls = [] - urlstore = UrlStore(compressed=False, language=None, strict=strict) - urlstore.add_urls(input_urls) + is_compressed = len(input_urls) > 10**6 + urlstore = UrlStore( + compressed=is_compressed, language=None, strict=strict, verbose=verbose + ) + urlstore.add_urls(sorted(input_urls)) # iterate for domain in urlstore.urldict: # key=cmp_to_key(locale.strcoll) urlpaths = [p.urlpath for p in urlstore._load_urls(domain)] @@ -154,7 +149,7 @@ def sample_urls( or exclude_max is not None and len(urlpaths) > exclude_max ): - LOGGER.info("discarded (size): %s\t\turls: %s", domain, len(urlpaths)) + LOGGER.warning("discarded (size): %s\t\turls: %s", domain, len(urlpaths)) continue # copy all URLs if len(urlpaths) <= samplesize: @@ -164,7 +159,7 @@ def sample_urls( # sample mysample = sorted(sample(urlpaths, k=samplesize)) output_urls.extend([domain + p for p in mysample]) - LOGGER.info( + LOGGER.debug( "%s\t\turls: %s\tprop.: %s", domain, len(mysample), From e16fbf5a0d6c35e4b3bf2547c21c33800a8e7139 Mon Sep 17 00:00:00 2001 From: Adrien Barbaresi Date: Mon, 15 May 2023 18:02:21 +0200 Subject: [PATCH 2/3] simplify code --- courlan/core.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/courlan/core.py b/courlan/core.py index becb397a..25a22a27 100644 --- a/courlan/core.py +++ b/courlan/core.py @@ -134,9 +134,9 @@ def sample_urls( LOGGER.setLevel(logging.ERROR) # store output_urls = [] - is_compressed = len(input_urls) > 10**6 + use_compression = len(input_urls) > 10**6 urlstore = UrlStore( - compressed=is_compressed, language=None, strict=strict, verbose=verbose + compressed=use_compression, language=None, strict=strict, verbose=verbose ) urlstore.add_urls(sorted(input_urls)) # iterate @@ -151,13 +151,11 @@ def sample_urls( ): LOGGER.warning("discarded (size): %s\t\turls: %s", domain, len(urlpaths)) continue - # copy all URLs - if len(urlpaths) <= samplesize: - output_urls.extend([domain + p for p in urlpaths]) - LOGGER.info("%s\t\turls: %s", domain, len(urlpaths)) - continue # sample - mysample = sorted(sample(urlpaths, k=samplesize)) + if len(urlpaths) > samplesize: + mysample = sorted(sample(urlpaths, k=samplesize)) + else: + mysample = urlpaths output_urls.extend([domain + p for p in mysample]) LOGGER.debug( "%s\t\turls: %s\tprop.: %s", From 479e1911ca6b84b073550984ef906f1522d0d43d Mon Sep 17 00:00:00 2001 From: Adrien Barbaresi Date: Mon, 15 May 2023 18:20:12 +0200 Subject: [PATCH 3/3] more efficient processing --- courlan/urlstore.py | 2 +- courlan/urlutils.py | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/courlan/urlstore.py b/courlan/urlstore.py index f7e649f0..673c6419 100644 --- a/courlan/urlstore.py +++ b/courlan/urlstore.py @@ -96,7 +96,7 @@ def _buffer_urls( self, data: List[str], visited: bool = False ) -> DefaultDict[str, Deque[UrlPathTuple]]: inputdict: DefaultDict[str, Deque[UrlPathTuple]] = defaultdict(deque) - for url in list(dict.fromkeys(data)): + for url in dict.fromkeys(data): # segment URL and add to domain dictionary try: # validate diff --git a/courlan/urlutils.py b/courlan/urlutils.py index 717325b6..913e07f7 100644 --- a/courlan/urlutils.py +++ b/courlan/urlutils.py @@ -80,7 +80,11 @@ def get_base_url(url: Any) -> str: """Strip URL of some of its parts to get base URL. Accepts strings and urllib.parse ParseResult objects.""" parsed_url = _parse(url) - return parsed_url._replace(path="", params="", query="", fragment="").geturl() + if parsed_url.scheme: + scheme = parsed_url.scheme + "://" + else: + scheme = "" + return scheme + parsed_url.netloc def get_host_and_path(url: Any) -> Tuple[str, str]: