Skip to content

Commit

Permalink
[url] Fix .title command per issue #219
Browse files Browse the repository at this point in the history
Also use callbacks per issue #117
  • Loading branch information
embolalia committed May 18, 2013
1 parent 3e47a1f commit d1acb12
Showing 1 changed file with 169 additions and 130 deletions.
299 changes: 169 additions & 130 deletions willie/modules/url.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""
url.py - Willie URL title module
Copyright 2010-2011, Michael Yanovich, yanovich.net, Kenneth Sham
Copyright 2012 Edward Powell
Copyright 2012-2013 Edward Powell
Licensed under the Eiffel Forum License 2.
http://willie.dftba.net
Expand All @@ -10,17 +10,24 @@
import re
from htmlentitydefs import name2codepoint
import willie.web as web
import urllib2
import unicodedata
import urlparse

url_finder = None
r_entity = re.compile(r'&[A-Za-z0-9#]+;')
INVALID_WEBSITE = 0x01
exclusion_char = '!'
# These are used to clean up the title tag before actually parsing it. Not the
# world's best way to do this, but it'll do for now.
title_tag_data = re.compile('<(/?)title( [^>]+)?>', re.IGNORECASE)
quoted_title = re.compile('[\'"]<title>[\'"]', re.IGNORECASE)
# This is another regex that presumably does something important.
re_dcc = re.compile(r'(?i)dcc\ssend')


def configure(config):
"""
| [url] | example | purpose |
| ---- | ------- | ------- |
| exclude | https?://git\.io/.* | A list of regular expressions for URLs for which the title should not be shown. |
Expand All @@ -29,169 +36,204 @@ def configure(config):
if config.option('Exclude certain URLs from automatic title display', False):
if not config.has_section('url'):
config.add_section('url')
config.add_list('url', 'exclude', 'Enter regular expressions for each URL you would like to exclude.',
config.add_list('url', 'exclude', 'Enter regular expressions for each URL you would like to exclude.',
'Regex:')
config.interactive_add('url', 'exclusion_char',
'Prefix to suppress URL titling', '!')



def setup(willie):
global url_finder, exclusion_char
if willie.config.has_option('url', 'exclude'):
regexes = [re.compile(s) for s in
willie.config.url.get_list(exclude)]
else:
regexes = []


# We're keeping these in their own list, rather than putting then in the
# callbacks list because 1, it's easier to deal with modules that are still
# using this list, and not the newer callbacks list and 2, having a lambda
# just to pass is kinda ugly.
if not willie.memory.contains('url_exclude'):
willie.memory['url_exclude'] = regexes
else:
exclude = willie.memory['url_exclude']
if regexes: exclude.append(regexes)
willie.memory['url_exclude'] = exclude

if regexes:
exclude.append(regexes)
willie.memory['url_exclude'] = regexes

# Ensure that url_callbacks and last_seen_url are in memory
if not willie.memory.contains('url_callbacks'):
willie.memory['url_callbacks'] = {}
if not willie.memory.contains('last_seen_url'):
willie.memory['last_seen_url'] = {}

if willie.config.has_option('url', 'exclusion_char'):
exclusion_char = willie.config.url.exclusion_char
url_finder = re.compile(r'(?u)(%s?(http|https|ftp)(://\S+))' %

url_finder = re.compile(r'(?u)(%s?(?:http|https|ftp)(?:://\S+))' %
(exclusion_char))
# We want the exclusion list to be pre-compiled, since url parsing gets
# called a /lot/, and it's annoying when it's laggy.

def find_title(url):

def title_command(willie, trigger):
"""
Show the title or URL information for the given URL, or the last URL seen
in this channel.
"""
if not trigger.group(2):
if trigger.sender not in willie.memory['last_seen_url']:
return
matched = check_callbacks(willie, trigger,
willie.memory['last_seen_url'][trigger.sender],
True)
if not matched:
urls = [willie.memory['last_seen_url'][trigger.sender]]
else:
urls = re.findall(url_finder, trigger)

results = process_urls(willie, trigger, urls)
title_command.commands = ['title']


def title_auto(willie, trigger):
"""
Automatically show titles for URLs. For shortened URLs/redirects, find
where the URL redirects to and show the title for that (or call a function
from another module to give more information).
"""
if re.match(willie.config.core.prefix + 'title', trigger):
return

urls = re.findall(url_finder, trigger)
results = process_urls(willie, trigger, urls)
willie.memory['last_seen_url'][trigger.sender] = urls[-1]

for result in results[:4]:
message = '[ %s ] - %s' % tuple(result)
if message != trigger:
willie.say(message)
title_auto.rule = '(?u).*(https?://\S+).*'


def process_urls(willie, trigger, urls):
"""
For each URL in the list, ensure that it isn't handled by another module.
If not, find where it redirects to, if anywhere. If that redirected URL
should be handled by another module, dispatch the callback for it.
Return a list of (title, TLD) tuples for each URL which is not handled by
another module.
"""

results = []
for url in urls:
if not url.startswith(exclusion_char):
# Magic stuff to account for international domain names
url = uni_encode(url)
url = uni_decode(url)
url = iri_to_uri(url)
# First, check that the URL we got doesn't match
matched = check_callbacks(willie, trigger, url, False)
if matched:
continue
# Then see if it redirects anywhere
new_url = follow_redirects(url)
if not new_url:
continue
# Then see if the final URL matches anything
matched = check_callbacks(willie, trigger, new_url, new_url != url)
if matched:
continue
# Finally, actually show the URL
title = find_title(url)
if title:
results.append((title, getTLD(url)))
return results


def follow_redirects(url):
"""
This finds the title when provided with a string of a URL.
Follow HTTP 3xx redirects, and return the actual URL. Return None if
there's a problem.
"""
uri = url
try:
connection = urllib2.urlopen(url)
url = connection.geturl() or url
connection.close()
except:
return None
return url


if not uri and hasattr(self, 'last_seen_uri'):
uri = self.last_seen_uri.get(origin.sender)
def check_callbacks(willie, trigger, url, run=True):
"""
Check the given URL against the callbacks list. If it matches, and ``run``
is given as ``True``, run the callback function, otherwise pass. Returns
``True`` if the url matched anything in the callbacks list.
"""
# Check if it matches the exclusion list first
matched = any(regex.search(url) for regex in willie.memory['url_exclude'])
# Then, check if there's anything in the callback list
for regex, function in willie.memory['url_callbacks'].iteritems():
match = regex.search(url)
if match:
if run:
function(willie, trigger, match)
matched = True
return matched

if not re.search('^((https?)|(ftp))://', uri):
uri = 'http://' + uri

if "twitter.com" in uri:
uri = uri.replace('#!', '?_escaped_fragment_=')
def find_title(url):
"""Return the title for the given URL."""
content = web.get(url)
# Some cleanup that I don't really grok, but was in the original, so
# we'll keep it (with the compiled regexes made global) for now.
content = title_tag_data.sub(r'<\1title>', content)
content = quoted_title.sub('', content)

content = web.get(uri)
regex = re.compile('<(/?)title( [^>]+)?>', re.IGNORECASE)
content = regex.sub(r'<\1title>',content)
regex = re.compile('[\'"]<title>[\'"]', re.IGNORECASE)
content = regex.sub('',content)
start = content.find('<title>')
if start == -1: return
end = content.find('</title>', start)
if end == -1: return
content = content[start+7:end]
content = content.strip('\n').rstrip().lstrip()
title = content

if len(title) > 200:
title = title[:200] + '[...]'

def e(m):
entity = m.group()
end = content.find('</title>')
if start == -1 or end == -1:
return
title = content[start + 7:end]
title = title.strip()[:200]

def get_unicode_entity(match):
entity = match.group()
if entity.startswith('&#x'):
cp = int(entity[3:-1],16)
return unichr(cp).encode('utf-8')
cp = int(entity[3:-1], 16)
elif entity.startswith('&#'):
cp = int(entity[2:-1])
return unichr(cp).encode('utf-8')
else:
char = name2codepoint[entity[1:-1]]
return unichr(char).encode('utf-8')

title = r_entity.sub(e, title)

if title:
title = uni_decode(title)
else: title = 'None'
cp = name2codepoint[entity[1:-1]]
return unichr(cp)

title = title.replace('\n', '')
title = title.replace('\r', '')
title = r_entity.sub(get_unicode_entity, title)
title = uni_decode(title)

def remove_spaces(x):
if " " in x:
x = x.replace(" ", " ")
return remove_spaces(x)
else:
return x
title = ' '.join(title.split()) # cleanly remove multiple spaces

title = remove_spaces (title)
# More cryptic regex substitutions. This one looks to be myano's invention.
title = re_dcc.sub('', title)

re_dcc = re.compile(r'(?i)dcc\ssend')
title = re.sub(re_dcc, '', title)
return title or None

if title:
return title

def getTLD (url):
def getTLD(url):
idx = 7
if url.startswith('https://'): idx = 8
elif url.startswith('ftp://'): idx = 6
u = url[idx:]
f = u.find('/')
if f == -1: u = url
else: u = url[0:idx] + u[0:f]
return u

def get_results(willie, text):
a = re.findall(url_finder, text)
display = [ ]
for match in a:
match = match[0]
if (match.startswith(exclusion_char) or
any(pattern.findall(match) for pattern in willie.memory['url_exclude'])):
continue
url = uni_encode(match)
url = uni_decode(url)
url = iriToUri(url)
try:
page_title = find_title(url)
except:
page_title = None # if it can't access the site fail silently
display.append([page_title, url])
return display

def show_title_auto (willie, trigger):
if trigger.startswith('.title '):
return
if len(re.findall("\([\d]+\sfiles\sin\s[\d]+\sdirs\)", trigger)) == 1: return
try:
results = get_results(willie, trigger)
except Exception as e: raise e
if results is None: return

k = 1
for r in results:
if k > 3: break
k += 1

if r[0] is None:
continue
else: r[1] = getTLD(r[1])
message = '[ %s ] - %s' % (r[0], r[1])
if message != trigger:
willie.say(message)
show_title_auto.rule = '(?u).*((http|https)(://\S+)).*'
show_title_auto.priority = 'high'
if url.startswith('https://'):
idx = 8
elif url.startswith('ftp://'):
idx = 6
tld = url[idx:]
slash = tld.find('/')
if slash != -1:
tld = tld[:slash]
return tld

def show_title_demand (willie, trigger):
"""Show the title of a URL"""
#try:
results = get_results(trigger)
#except: return
if results is None: return

for r in results:
if r[0] is None: continue
r[1] = getTLD(r[1])
willie.say('[ %s ] - %s' % (r[0], r[1]))
show_title_demand.commands = ['title']
show_title_demand.priority = 'high'
# Functions for international domain name magic


#Tools formerly in unicode.py

def uni_decode(bytes):
try:
text = bytes.decode('utf-8')
Expand All @@ -218,12 +260,9 @@ def urlEncodeNonAscii(b):
return re.sub('[\x80-\xFF]', lambda c: '%%%02x' % ord(c.group(0)), b)


def iriToUri(iri):
def iri_to_uri(iri):
parts = urlparse.urlparse(iri)
return urlparse.urlunparse(
part.encode('idna') if parti == 1 else urlEncodeNonAscii(part.encode('utf-8'))
for parti, part in enumerate(parts)
)

if __name__ == '__main__':
print __doc__.strip()

0 comments on commit d1acb12

Please sign in to comment.