
Commit

docs: review link extraction functions and add them to readme (#115)
* docs: review extract_links() and add to readme

* fix deprecation message

* better docs
adbar authored Sep 3, 2024
1 parent 47e7e59 commit fd646fb
Showing 2 changed files with 28 additions and 6 deletions.
23 changes: 20 additions & 3 deletions README.md
@@ -137,9 +137,8 @@ available in `lang_filter(url, language)`:
```

Define stricter restrictions on the expected content type with
`strict=True`. Also blocks certain platforms and pages types crawlers
should stay away from if they don't target them explicitly and other
black holes where machines get lost.
`strict=True`. This also blocks certain platforms and page types
where machines get lost.

``` python
# strict filtering: blocked as it is a major platform
@@ -158,6 +157,20 @@ black holes where machines get lost.

### Web crawling and URL handling

Link extraction and preprocessing:

``` python
>>> from courlan import extract_links
>>> doc = '<html><body><a href="test/link.html">Link</a></body></html>'
>>> url = "https://example.org"
>>> extract_links(doc, url)
{'https://example.org/test/link.html'}
# other options: external_bool, no_filter, language, strict, redirects, ...
```
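As a hedged sketch (not part of this commit), the options listed in the comment above could be combined like this; exact defaults and behavior may vary across courlan versions:

``` python
# hedged sketch reusing doc and url from the snippet above; option names are
# taken from the comment in the README and may behave differently between versions
external_only = extract_links(doc, url, external_bool=True)  # keep external links only
unfiltered = extract_links(doc, url, no_filter=True)         # bypass most checks
```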

The `filter_links()` function provides additional filters for crawling purposes:
use of robots.txt rules and link prioritization. See `courlan.core` for details.
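A minimal sketch of what a `filter_links()` call could look like, assuming it is importable from the package root (otherwise from `courlan.core`) and returns a tuple of regular and priority links:

``` python
# hedged sketch: check courlan.core for the authoritative signature and the
# robots.txt / navigation-page options
>>> from courlan import filter_links
>>> links, links_priority = filter_links(doc, url=url)
```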

Determine if a link leads to another host:

``` python
@@ -215,6 +228,10 @@ True
True
```
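The example above is partly collapsed in this view; as a hedged illustration, a host comparison with `is_external()` might look like this (keyword options such as suffix handling vary by version):

``` python
# hedged sketch: compares the link host against a reference and returns a boolean
>>> from courlan import is_external
>>> is_external("https://github.com/adbar/courlan", "https://example.org/page")
True
```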

See also the [URL management page](https://trafilatura.readthedocs.io/en/latest/url-management.html)
of the Trafilatura documentation.


### Python helpers

Helper function, scrub and normalize:
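The corresponding snippet is collapsed in this view; as a hedged sketch of such a helper call, assuming `clean_url()` is exported at package level:

``` python
# hedged sketch: scrubbing and normalization steps (lowercasing, default-port
# removal, etc.) depend on the courlan version
>>> from courlan import clean_url
>>> clean_url("HTTPS://WWW.DWDS.DE:80/")
```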
11 changes: 8 additions & 3 deletions courlan/core.py
@@ -5,6 +5,7 @@
# import locale
import logging
import re
import warnings

from typing import List, Optional, Set, Tuple
from urllib.robotparser import RobotFileParser
@@ -149,15 +150,14 @@ def extract_links(
    Args:
        pagecontent: whole page in binary format
        url: full URL of the original page
        base_url: deprecated, legacy only
        external_bool: set to True for external links only, False for
                       internal links only
        no_filter: override settings and bypass checks to return all possible URLs
        language: set target language (ISO 639-1 codes)
        strict: set to True for stricter filtering
        trailing_slash: set to False to trim trailing slashes
        with_nav: set to True to include navigation pages instead of discarding them
        with_redirects: set to True for redirection test (per HTTP HEAD request)
        redirects: set to True for redirection test (per HTTP HEAD request)
        reference: provide a host reference for external/internal evaluation
    Returns:
@@ -166,6 +166,11 @@
    Raises:
        Nothing.
    """
    if base_url:
        warnings.warn(
            "'base_url' will soon be deprecated, use 'url'.", PendingDeprecationWarning
        )

    base_url = base_url or get_base_url(url)
    url = url or base_url
    candidates, validlinks = set(), set()  # type: Set[str], Set[str]
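To make the transition above concrete, a hedged usage sketch (not part of the commit); the keyword names follow the docstring, so treat them as assumptions:

``` python
# hedged sketch: passing the full page URL is the preferred call, while the
# legacy keyword still works but now emits a PendingDeprecationWarning
from courlan import extract_links

doc = '<html><body><a href="test/link.html">Link</a></body></html>'
links = extract_links(pagecontent=doc, url="https://example.org/page.html")
legacy = extract_links(pagecontent=doc, base_url="https://example.org")
```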
@@ -232,7 +237,7 @@ def filter_links(
    strict: bool = False,
    with_nav: bool = True,
) -> Tuple[List[str], List[str]]:
    "Find links in a HTML document, filter them and add them to the data store."
    "Find links in a HTML document, filter and prioritize them for crawling purposes."
    links, links_priority = [], []
    url = url or base_url
    for link in extract_links(
