From b8e3b5f9fa3a56113d9251754b2e2ee2115a7070 Mon Sep 17 00:00:00 2001 From: Maxime Chatelle Date: Sun, 30 Nov 2014 13:19:21 +0100 Subject: [PATCH] Extends regexp to match () and @ (GH#9 and GH#7) --- urlscan/urlscan.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/urlscan/urlscan.py b/urlscan/urlscan.py index 41d8ed2..7bad59a 100644 --- a/urlscan/urlscan.py +++ b/urlscan/urlscan.py @@ -240,8 +240,8 @@ def handle_entityref(self, name): # added above. self.handle_data('&%s;' % name) -urlinternalpattern = r'[{}a-zA-Z/\-_0-9%?&.=:;+,#~]' -urltrailingpattern = r'[{}a-zA-Z/\-_0-9%&=+#]' +urlinternalpattern = r'[{}()@a-zA-Z/\-_0-9%?&.=:;+,#~]' +urltrailingpattern = r'[{}()@a-zA-Z/\-_0-9%&=+#]' httpurlpattern = (r'(?:(https?|file)://' + urlinternalpattern + r'*' + urltrailingpattern + r')') # Used to guess that blah.blah.blah.TLD is a URL.