sopel-irc · dgw · Oct 30, 2019 · Jul 26, 2019 · Aug 18, 2019
diff --git a/docs/source/api.rst b/docs/source/api.rst
@@ -5,9 +5,8 @@ Sopel includes a number of additional functions that are useful for various
 common IRC tasks.
 
 Note that ``sopel.web`` was deprecated in 6.2.0, and is not included in this
-documentation, but is still in use in many modules. It's highly recommended
-that you switch to `requests <http://docs.python-requests.org/en/latest/>`_
-instead.
+documentation; it will be removed completely in Sopel 8. Plugins should use
+`requests <https://github.com/psf/requests>`_ directly.
 
 .. contents::
 
@@ -17,6 +16,12 @@ sopel.tools
 .. automodule:: sopel.tools
    :members:
 
+sopel.tools.web
+---------------
+
+.. automodule:: sopel.tools.web
+   :members:
+
 sopel.tools.time
 ----------------
 .. automodule:: sopel.tools.time

diff --git a/sopel/tools/web.py b/sopel/tools/web.py
@@ -1,9 +1,14 @@
 # coding=utf-8
 """
-*Availability: 7+; replaces ``sopel.web``*
+The ``tools.web`` module contains utility functions for interaction with web
+applications, APIs, or websites in your plugins.
 
-The ``web`` class contains web-related utility functions for interaction with
-web applications, APIs, or websites in your modules.
+.. versionadded:: 7.0
+
+.. note::
+    Some parts of this module will remain accessible through ``sopel.web`` as
+    well until its final removal in Sopel 8. This is for backward
+    compatibility only; please update old code as soon as possible.
 """
 # Copyright © 2008, Sean B. Palmer, inamidst.com
 # Copyright © 2009, Michael Yanovich <yanovich.1@osu.edu>
@@ -44,13 +49,61 @@
 ]
 
 USER_AGENT = 'Sopel/{} (https://sopel.chat)'.format(__version__)
+"""User agent string to be sent with HTTP requests.
+
+Meant to be passed like so::
+
+    import requests
+
+    from sopel.tools import web
+
+    result = requests.get(
+        'https://some.site/api/endpoint',
+        headers={'User-Agent': web.USER_AGENT}
+    )
+
+"""
 DEFAULT_HEADERS = {'User-Agent': USER_AGENT}
+"""Default header dict for use with ``requests`` methods.
+
+Use it like this::
+
+    import requests
+
+    from sopel.tools import web
+
+    result = requests.get(
+        'https://some.site/api/endpoint',
+        headers=web.DEFAULT_HEADERS
+    )
+
+.. important::
+   You should *never* modify this directly in your plugin code. Make a copy
+   and use :py:meth:`~dict.update` if you need to add or change headers::
+
+       from sopel.tools import web
+
+       default_headers = web.DEFAULT_HEADERS.copy()
+       custom_headers = {'Accept': 'text/*'}
+
+       default_headers.update(custom_headers)
+
+"""
 
 
 r_entity = re.compile(r'&([^;\s]+);')
+"""Regular expression to match HTML entities."""
 
 
 def entity(match):
+    """Convert an entity reference to the appropriate character.
+
+    :param match: the regular expression match object for one entity
+        reference, as produced by :py:const:`r_entity`
+    :return str: the Unicode character corresponding to the matched entity
+        name or code, or a fallback representation if the reference cannot be
+        resolved to a character
+    """
     value = match.group(1).lower()
     if value.startswith('#x'):
         return unichr(int(value[2:], 16))
@@ -62,12 +115,27 @@ def entity(match):
 
 
 def decode(html):
+    """Decode HTML entities into Unicode text.
+
+    :param str html: the HTML page or snippet to process
+    :return str: ``html`` with all entity references replaced
+    """
     return r_entity.sub(entity, html)
 
 
 # Identical to urllib2.quote
 def quote(string, safe='/'):
-    """Like urllib2.quote but handles unicode properly."""
+    """Safely encodes a string for use in a URL.
+
+    :param str string: the string to encode
+    :param str safe: a string of characters that should not be quoted;
+                     defaults to ``'/'``
+    :return str: the ``string`` with special characters URL-encoded
+
+    .. note::
+        This is a shim to make writing cross-compatible plugins for both
+        Python 2 and Python 3 easier.
+    """
     if sys.version_info.major < 3:
         if isinstance(string, unicode):
             string = string.encode('utf8')
@@ -78,7 +146,11 @@ def quote(string, safe='/'):
 
 
 def quote_query(string):
-    """Quotes the query parameters."""
+    """Safely encodes a URL's query parameters.
+
+    :param str string: a URL containing query parameters
+    :return str: the input URL with query parameter values URL-encoded
+    """
     parsed = urlparse(string)
     string = string.replace(parsed.query, quote(parsed.query, "/=&"), 1)
     return string
@@ -87,13 +159,15 @@ def quote_query(string):
 # Functions for international domain name magic
 
 def urlencode_non_ascii(b):
+    """Safely encodes non-ASCII characters in a URL."""
     regex = '[\x80-\xFF]'
     if sys.version_info.major > 2:
         regex = b'[\x80-\xFF]'
     return re.sub(regex, lambda c: '%%%02x' % ord(c.group(0)), b)
 
 
 def iri_to_uri(iri):
+    """Converts an IRI to a URI, IDNA-encoding its domain name."""
     parts = urlparse(iri)
     parts_seq = (part.encode('idna') if parti == 1 else urlencode_non_ascii(part.encode('utf-8')) for parti, part in enumerate(parts))
     if sys.version_info.major > 2:
@@ -115,6 +189,20 @@ def iri_to_uri(iri):
 # Functions for URL detection
 
 def trim_url(url):
+    """Removes extra punctuation from URLs found in text.
+
+    :param str url: the raw URL match
+    :return str: the cleaned URL
+
+    This function removes trailing punctuation that looks like it was not
+    intended to be part of the URL:
+
+        * trailing sentence- or clause-ending marks like ``.``, ``;``, etc.
+        * unmatched trailing brackets/braces like ``}``, ``)``, etc.
+
+    It is intended for use with the output of :py:func:`~.search_urls`, which
+    may include trailing punctuation when used on input from chat.
+    """
     # clean trailing sentence- or clause-ending punctuation
     while url[-1] in '.,?!\'":;':
         url = url[:-1]
@@ -128,6 +216,22 @@ def trim_url(url):
 
 
 def search_urls(text, exclusion_char=None, clean=False, schemes=None):
+    """Extracts all URLs in ``text``.
+
+    :param str text: the text to search for URLs
+    :param str exclusion_char: optional character that, if placed before a URL
+        in the ``text``, will exclude it from being extracted
+    :param bool clean: if ``True``, all found URLs are passed through
+        :py:func:`~.trim_url` before being returned; default ``False``
+    :param list schemes: optional list of URL schemes to look for; defaults to
+        ``['http', 'https', 'ftp']``
+    :return: :term:`generator iterator` of all URLs found in ``text``
+
+    To get the URLs as a plain list, use e.g.::
+
+        list(search_urls(text))
+
+    """
     schemes = schemes or ['http', 'https', 'ftp']
     schemes_patterns = '|'.join(re.escape(scheme) for scheme in schemes)
     re_url = r'((?:%s)(?::\/\/\S+))' % schemes_patterns