Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Document tools.web package #1669

Merged
merged 2 commits into from
Oct 30, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 8 additions & 3 deletions docs/source/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,8 @@ Sopel includes a number of additional functions that are useful for various
common IRC tasks.

Note that ``sopel.web`` was deprecated in 6.2.0, and is not included in this
documentation, but is still in use in many modules. It's highly recommended
that you switch to `requests <http://docs.python-requests.org/en/latest/>`_
instead.
documentation; it will be removed completely in Sopel 8. Plugins should use
`requests <https://github.com/psf/requests>`_ directly.

.. contents::

Expand All @@ -17,6 +16,12 @@ sopel.tools
.. automodule:: sopel.tools
:members:

sopel.tools.web
---------------

.. automodule:: sopel.tools.web
:members:

sopel.tools.time
----------------
.. automodule:: sopel.tools.time
Expand Down
114 changes: 109 additions & 5 deletions sopel/tools/web.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,14 @@
# coding=utf-8
"""
*Availability: 7+; replaces ``sopel.web``*
The ``tools.web`` package contains utility functions for interaction with web
applications, APIs, or websites in your plugins.

The ``web`` class contains web-related utility functions for interaction with
web applications, APIs, or websites in your modules.
.. versionadded:: 7.0

.. note::
Some parts of this module will remain accessible through ``sopel.web`` as
well until its final removal in Sopel 8. This is for backward
compatibility only; please update old code as soon as possible.
"""
# Copyright © 2008, Sean B. Palmer, inamidst.com
# Copyright © 2009, Michael Yanovich <yanovich.1@osu.edu>
Expand Down Expand Up @@ -44,13 +49,61 @@
]

USER_AGENT = 'Sopel/{} (https://sopel.chat)'.format(__version__)
"""User agent string to be sent with HTTP requests.

Meant to be passed like so::

    import requests

    from sopel.tools import web

    result = requests.get(
        'https://some.site/api/endpoint',
        headers={'User-Agent': web.USER_AGENT}
    )

"""
# Ready-made header mapping so callers do not have to build the
# ``User-Agent`` entry themselves.
DEFAULT_HEADERS = {'User-Agent': USER_AGENT}
"""Default header dict for use with ``requests`` methods.

Use it like this::

    import requests

    from sopel.tools import web

    result = requests.get(
        'https://some.site/api/endpoint',
        headers=web.DEFAULT_HEADERS
    )

.. important::
    You should *never* modify this directly in your plugin code. Make a copy
    and use :py:meth:`~dict.update` if you need to add or change headers::

        from sopel.tools import web

        default_headers = web.DEFAULT_HEADERS.copy()
        custom_headers = {'Accept': 'text/*'}

        default_headers.update(custom_headers)

"""


# Group 1 captures whatever sits between ``&`` and the next ``;``, as long as
# it contains neither ``;`` nor whitespace (e.g. ``amp``, ``#38``, ``#x26``).
r_entity = re.compile(r'&([^;\s]+);')
"""Regular expression to match HTML entities."""


def entity(match):
"""Convert an entity reference to the appropriate character.

:param match: the regular expression match object produced by
:py:const:`r_entity`; its first group is the entity name or code
:return str: the Unicode character corresponding to the given ``match``
string, or a fallback representation if the reference cannot be
resolved to a character
"""
value = match.group(1).lower()
if value.startswith('#x'):
return unichr(int(value[2:], 16))
Expand All @@ -62,12 +115,27 @@ def entity(match):


def decode(html):
    """Replace the HTML entity references in ``html`` with Unicode text.

    :param str html: the HTML page or snippet to process
    :return str: ``html`` in which every match of :py:const:`r_entity` has
                 been substituted with the result of :py:func:`entity`
    """
    decoded = r_entity.sub(entity, html)
    return decoded


# Identical to urllib2.quote
def quote(string, safe='/'):
"""Like urllib2.quote but handles unicode properly."""
"""Safely encodes a string for use in a URL.

:param str string: the string to encode
:param str safe: a list of characters that should not be quoted; defaults
to ``'/'``
:return str: the ``string`` with special characters URL-encoded

dgw marked this conversation as resolved.
Show resolved Hide resolved
.. note::
This is a shim to make writing cross-compatible plugins for both
Python 2 and Python 3 easier.
"""
if sys.version_info.major < 3:
if isinstance(string, unicode):
string = string.encode('utf8')
Expand All @@ -78,7 +146,11 @@ def quote(string, safe='/'):


def quote_query(string):
    """Safely encodes a URL's query parameters.

    :param str string: a URL containing query parameters
    :return str: the input URL with query parameter values URL-encoded

    Only the query component is rewritten; ``/``, ``=`` and ``&`` inside it
    are kept as-is. A URL without a query component is returned unchanged.
    """
    parsed = urlparse(string)
    if not parsed.query:
        # nothing to encode
        return string

    # Anchor the replacement on the ``?`` delimiter so that identical text
    # occurring earlier in the URL (e.g. in the path) is not encoded instead
    # of the query itself.
    raw_query = '?' + parsed.query
    quoted_query = '?' + quote(parsed.query, "/=&")
    return string.replace(raw_query, quoted_query, 1)
Expand All @@ -87,13 +159,15 @@ def quote_query(string):
# Functions for international domain name magic

def urlencode_non_ascii(b):
    """Safely encodes non-ASCII characters in a URL.

    :param b: the byte string to process (``bytes`` on Python 3)
    :return: ``b`` with every byte in the range ``0x80``-``0xFF`` replaced by
             its ``%xx`` escape; the result has the same type as ``b``
    """
    is_py3 = sys.version_info.major > 2

    def escape(match):
        escaped = '%%%02x' % ord(match.group(0))
        # With a bytes pattern, ``re.sub`` requires the replacement callable
        # to return bytes too; returning text raises TypeError on Python 3.
        if is_py3:
            return escaped.encode('ascii')
        return escaped

    regex = '[\x80-\xFF]'
    if is_py3:
        regex = b'[\x80-\xFF]'
    return re.sub(regex, escape, b)


def iri_to_uri(iri):
"""Encodes an internationalized URL (IRI) as an ASCII-only URI."""
parts = urlparse(iri)
parts_seq = (part.encode('idna') if parti == 1 else urlencode_non_ascii(part.encode('utf-8')) for parti, part in enumerate(parts))
if sys.version_info.major > 2:
Expand All @@ -115,6 +189,20 @@ def iri_to_uri(iri):
# Functions for URL detection

def trim_url(url):
"""Removes extra punctuation from URLs found in text.

:param str url: the raw URL match
:return str: the cleaned URL

This function removes trailing punctuation that looks like it was not
intended to be part of the URL:

* trailing sentence- or clause-ending marks like ``.``, ``;``, etc.
* unmatched trailing brackets/braces like ``}``, ``)``, etc.

It is intended for use with the output of :py:func:`~.search_urls`, which
may include trailing punctuation when used on input from chat.
"""
# clean trailing sentence- or clause-ending punctuation
while url[-1] in '.,?!\'":;':
url = url[:-1]
Expand All @@ -128,6 +216,22 @@ def trim_url(url):


def search_urls(text, exclusion_char=None, clean=False, schemes=None):
"""Extracts all URLs in ``text``.

:param str text: the text to search for URLs
:param str exclusion_char: optional character that, if placed before a URL
in the ``text``, will exclude it from being extracted
:param bool clean: if ``True``, all found URLs are passed through
:py:func:`~.trim_url` before being returned; default ``False``
:param list schemes: optional list of URL schemes to look for; defaults to
``['http', 'https', 'ftp']``
:return: :term:`generator iterator` of all URLs found in ``text``

To get the URLs as a plain list, use e.g.::

list(search_urls(text))

"""
schemes = schemes or ['http', 'https', 'ftp']
schemes_patterns = '|'.join(re.escape(scheme) for scheme in schemes)
re_url = r'((?:%s)(?::\/\/\S+))' % schemes_patterns
Expand Down