Skip to content

Commit

Permalink
docs, tools.web: overhaul documentation for new tools.web package
Browse files Browse the repository at this point in the history
Added `tools.web` to "Additional API features" page of docs

Tweaked `tools.web` module header

Wrote docstrings for everything in `tools.web`
  • Loading branch information
dgw committed Jul 27, 2019
1 parent 3b71a04 commit cd9feef
Show file tree
Hide file tree
Showing 2 changed files with 113 additions and 8 deletions.
11 changes: 8 additions & 3 deletions docs/source/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,8 @@ Sopel includes a number of additional functions that are useful for various
common IRC tasks.

Note that ``sopel.web`` was deprecated in 6.2.0, and is not included in this
documentation, but is still in use in many modules. It's highly recommended
that you switch to `requests <http://docs.python-requests.org/en/latest/>`_
instead.
documentation; it will be removed completely in Sopel 8. Plugins should use
`requests <https://github.com/psf/requests>`_ directly.

.. contents::

Expand All @@ -17,6 +16,12 @@ sopel.tools
.. automodule:: sopel.tools
:members:

sopel.tools.web
---------------

.. automodule:: sopel.tools.web
:members:

sopel.tools.time
----------------
.. automodule:: sopel.tools.time
Expand Down
110 changes: 105 additions & 5 deletions sopel/tools/web.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
# coding=utf-8
"""
*Availability: 7+; replaces ``sopel.web``*
The ``tools.web`` package contains utility functions for interaction with web
applications, APIs, or websites in your plugins.
The ``web`` class contains web-related utility functions for interaction with
web applications, APIs, or websites in your modules.
.. versionadded:: 7.0
Some parts of this module will remain accessible through ``sopel.web`` as well
until its final removal in Sopel 8.
"""
# Copyright © 2008, Sean B. Palmer, inamidst.com
# Copyright © 2009, Michael Yanovich <yanovich.1@osu.edu>
Expand Down Expand Up @@ -44,13 +47,62 @@
]

USER_AGENT = 'Sopel/{} (https://sopel.chat)'.format(__version__)
"""User agent string to be sent with HTTP requests.
Meant to be passed like so::
import requests
from sopel.tools import web
result = requests.get(
'https://some.site/api/endpoint',
headers={'User-Agent': web.USER_AGENT}
)
"""
DEFAULT_HEADERS = {'User-Agent': USER_AGENT}
"""Default header dict for use with ``requests`` methods.
Use it like this::
import requests
from sopel.tools import web
result = requests.get(
'https://some.site/api/endpoint',
headers=web.DEFAULT_HEADERS
)
.. important::
You should *never* modify this directly in your plugin code. Make a copy
and use :py:meth:`~dict.update` if you need to add or change headers::
from sopel.tools import web
default_headers = web.DEFAULT_HEADERS.copy()
custom_headers = {'Accept': 'text/*'}
default_headers.update(custom_headers)
"""


r_entity = re.compile(r'&([^;\s]+);')
"""Regular expression to match HTML entities."""


def entity(match):
"""Convert an entity reference to the appropriate character.
:param str match: the entity name or code, as matched by
:py:const:`r_entity`
:return str: the Unicode character corresponding to the given ``match``
string, or a fallback representation if the reference cannot be
resolved to a character
"""
value = match.group(1).lower()
if value.startswith('#x'):
return unichr(int(value[2:], 16))
Expand All @@ -62,12 +114,24 @@ def entity(match):


def decode(html):
    """Replace HTML entity references in ``html`` with Unicode text.

    :param str html: the HTML page or snippet to process
    :return str: ``html`` with every match of :py:const:`r_entity` replaced
                 by the result of calling :py:func:`entity` on that match
    """
    decoded = r_entity.sub(entity, html)
    return decoded


# Identical to urllib2.quote
def quote(string, safe='/'):
"""Like urllib2.quote but handles unicode properly."""
"""Safely encodes a string for use in a URL.
:param str string: the string to quote
:param str safe: a string of characters that should not be quoted
:return str: the ``string`` with special characters URL-encoded
"""
if sys.version_info.major < 3:
if isinstance(string, unicode):
string = string.encode('utf8')
Expand All @@ -78,7 +142,11 @@ def quote(string, safe='/'):


def quote_query(string):
    """Safely encodes a URL's query parameters.

    :param str string: a URL containing query parameters
    :return str: the input URL with its query string URL-encoded (``/``,
                 ``=`` and ``&`` are left as-is)

    Only the query component is modified. If the URL has no query string it
    is returned unchanged.
    """
    parsed = urlparse(string)
    query = parsed.query
    if not query:
        # Nothing to encode; avoids touching URLs without a query string.
        return string

    # The query always follows the first ``?``. Restricting the replacement
    # to that part keeps identical text in the host or path from being
    # rewritten instead of the query itself.
    head, separator, tail = string.partition('?')
    return head + separator + tail.replace(query, quote(query, "/=&"), 1)
Expand All @@ -87,13 +155,15 @@ def quote_query(string):
# Functions for international domain name magic

def urlencode_non_ascii(b):
    """Safely encodes non-ASCII characters in a URL.

    :param bytes b: the encoded URL (or URL component) to process
    :return bytes: ``b`` with every byte in the range ``0x80``-``0xFF``
                   replaced by its percent-encoded form (``%xx``)
    """
    def percent_encode(match):
        escaped = '%%%02x' % ord(match.group(0))
        if sys.version_info.major > 2:
            # On Python 3 the pattern below is ``bytes``, so the replacement
            # must be ``bytes`` too; returning ``str`` makes ``re.sub`` raise
            # ``TypeError`` as soon as a non-ASCII byte is matched.
            return escaped.encode('ascii')
        return escaped

    regex = '[\x80-\xFF]'
    if sys.version_info.major > 2:
        regex = b'[\x80-\xFF]'
    return re.sub(regex, percent_encode, b)


def iri_to_uri(iri):
"""Decodes an internationalized domain name (IDN)."""
parts = urlparse(iri)
parts_seq = (part.encode('idna') if parti == 1 else urlencode_non_ascii(part.encode('utf-8')) for parti, part in enumerate(parts))
if sys.version_info.major > 2:
Expand All @@ -115,6 +185,20 @@ def iri_to_uri(iri):
# Functions for URL detection

def trim_url(url):
"""Removes extra punctuation from URLs found in text.
:param str url: the raw URL match
:return str: the cleaned URL
This function removes trailing punctuation that looks like it was not
intended to be part of the URL:
* trailing sentence- or clause-ending marks like ``.``, ``;``, etc.
* unmatched trailing brackets/braces like ``}``, ``)``, etc.
It is intended for use with the output of :py:func:`~.search_urls`, which
may include trailing punctuation when used on input from chat.
"""
# clean trailing sentence- or clause-ending punctuation
while url[-1] in '.,?!\'":;':
url = url[:-1]
Expand All @@ -128,6 +212,22 @@ def trim_url(url):


def search_urls(text, exclusion_char=None, clean=False, schemes=None):
"""Extracts all URLs in ``text``.
:param str text: the text to search for URLs
:param str exclusion_char: optional character that, if placed before a URL
in the ``text``, will exclude it from being extracted
:param bool clean: if ``True``, all found URLs are passed through
:py:func:`~.trim_url` before being returned; default ``False``
:param list schemes: optional list of URL schemes to look for; defaults to
``['http', 'https', 'ftp']``
:return: :py:term:`generator iterator` of all URLs found in ``text``
To get the URLs as a plain list, use e.g.::
list(search_urls(text))
"""
schemes = schemes or ['http', 'https', 'ftp']
schemes_patterns = '|'.join(re.escape(scheme) for scheme in schemes)
re_url = r'((?:%s)(?::\/\/\S+))' % schemes_patterns
Expand Down

0 comments on commit cd9feef

Please sign in to comment.