From 3a510ef7614d864e2b2b5e4694d220b3a696e023 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Anna=20Katrine=20J=C3=B8rgensen?=
Date: Fri, 15 Nov 2019 13:03:58 +0100
Subject: [PATCH 1/8] enabled non-English geosearch, fixed bug in coordinates
 and added revision history

---
 wikipedia/wikipedia.py | 32 ++++++++++++++++++++++++++++----
 1 file changed, 28 insertions(+), 4 deletions(-)

diff --git a/wikipedia/wikipedia.py b/wikipedia/wikipedia.py
index 7ad50e1..5e9858f 100644
--- a/wikipedia/wikipedia.py
+++ b/wikipedia/wikipedia.py
@@ -12,6 +12,7 @@
 from .util import cache, stdout_encode, debug
 import re
 
+# anna
 API_URL = 'http://en.wikipedia.org/w/api.php'
 RATE_LIMIT = False
 RATE_LIMIT_MIN_WAIT = None
@@ -31,7 +32,7 @@ def set_lang(prefix):
   global API_URL
   API_URL = 'http://' + prefix.lower() + '.wikipedia.org/w/api.php'
 
-  for cached_func in (search, suggest, summary):
+  for cached_func in (geosearch, search, suggest, summary):
     cached_func.clear_cache()
 
 
@@ -386,7 +387,7 @@ def __load(self, redirect=True, preload=False):
       request = _wiki_request(query_params)
       html = request['query']['pages'][pageid]['revisions'][0]['*']
 
-      lis = BeautifulSoup(html, 'html.parser').find_all('li')
+      lis = BeautifulSoup(html).find_all('li')
       filtered_lis = [li for li in lis if not 'tocsection' in ''.join(li.get('class', []))]
       may_refer_to = [li.a.get_text() for li in filtered_lis if li.a]
 
@@ -510,6 +511,29 @@ def parent_id(self):
 
     return self._parent_id
 
+  @property
+  def revisions(self):
+    '''
+    All revisions in revision history for a page.
+    '''
+
+    if not getattr(self, '_revision', False):
+      query_params = {
+        'prop': 'extracts|revisions',
+        'rvprop': 'timestamp|user|comment|content',
+        'rvslots': 'main',
+        'rvlimit': 500,
+      }
+      if not getattr(self, 'title', None) is None:
+        query_params['titles'] = self.title
+      else:
+        query_params['pageids'] = self.pageid
+
+      request = _wiki_request(query_params)
+      self._revisions = request['query']['pages'][self.pageid]['revisions']
+
+    return self._revisions
+
   @property
   def summary(self):
     '''
@@ -566,10 +590,10 @@ def coordinates(self):
 
       request = _wiki_request(query_params)
 
-      if 'query' in request:
+      try:
         coordinates = request['query']['pages'][self.pageid]['coordinates']
         self._coordinates = (Decimal(coordinates[0]['lat']), Decimal(coordinates[0]['lon']))
-      else:
+      except KeyError:
         self._coordinates = None
 
     return self._coordinates

From 3e6ed1698d4ca1de4db7913481d70e846253343a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Anna=20Katrine=20J=C3=B8rgensen?=
Date: Fri, 15 Nov 2019 13:12:45 +0100
Subject: [PATCH 2/8] syncing with head

---
 wikipedia/wikipedia.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/wikipedia/wikipedia.py b/wikipedia/wikipedia.py
index 5e9858f..76677f6 100644
--- a/wikipedia/wikipedia.py
+++ b/wikipedia/wikipedia.py
@@ -12,7 +12,6 @@
 from .util import cache, stdout_encode, debug
 import re
 
-# anna
 API_URL = 'http://en.wikipedia.org/w/api.php'
 RATE_LIMIT = False
 RATE_LIMIT_MIN_WAIT = None
@@ -387,7 +386,7 @@ def __load(self, redirect=True, preload=False):
       request = _wiki_request(query_params)
       html = request['query']['pages'][pageid]['revisions'][0]['*']
 
-      lis = BeautifulSoup(html).find_all('li')
+      lis = BeautifulSoup(html, 'html.parser').find_all('li')
       filtered_lis = [li for li in lis if not 'tocsection' in ''.join(li.get('class', []))]
       may_refer_to = [li.a.get_text() for li in filtered_lis if li.a]
 

From b54fa668f1360df17c7ad5f34a82575f0df78810 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Anna=20Katrine=20J=C3=B8rgensen?=
Date: Tue, 19 Nov 2019 11:38:28 +0100
Subject: [PATCH 3/8] fixed bug for references. Not all pages have
 'extlinks', which threw a KeyError.

---
 wikipedia/wikipedia.py | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/wikipedia/wikipedia.py b/wikipedia/wikipedia.py
index 76677f6..3d75cf5 100644
--- a/wikipedia/wikipedia.py
+++ b/wikipedia/wikipedia.py
@@ -420,6 +420,7 @@ def __continued_query(self, query_params):
       for datum in pages.values():  # in python 3.3+: "yield from pages.values()"
         yield datum
     else:
+      print(pages[self.pageid])
       for datum in pages[self.pageid][prop]:
         yield datum
 
@@ -608,14 +609,17 @@ def references(self):
     def add_protocol(url):
       return url if url.startswith('http') else 'http:' + url
 
-    self._references = [
-      add_protocol(link['*'])
-      for link in self.__continued_query({
-        'prop': 'extlinks',
-        'ellimit': 'max'
-      })
-    ]
+    try:
+      self._references = [
+        add_protocol(link['*'])
+        for link in self.__continued_query({
+          'prop': 'extlinks',
+          'ellimit': 'max'
+        })
+      ]
+    except KeyError:
+      self._references = []
 
     return self._references
 
   @property

From 7980e91fbc873a97faf299ac2efa487adabf4a60 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Anna=20Katrine=20J=C3=B8rgensen?=
Date: Tue, 19 Nov 2019 11:40:26 +0100
Subject: [PATCH 4/8] clean up

---
 wikipedia/wikipedia.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/wikipedia/wikipedia.py b/wikipedia/wikipedia.py
index 3d75cf5..2cc6e58 100644
--- a/wikipedia/wikipedia.py
+++ b/wikipedia/wikipedia.py
@@ -420,7 +420,6 @@ def __continued_query(self, query_params):
       for datum in pages.values():  # in python 3.3+: "yield from pages.values()"
         yield datum
     else:
-      print(pages[self.pageid])
       for datum in pages[self.pageid][prop]:
         yield datum
 

From 6308d6536b072ec4b5219898e2a90ab5108e5b2b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Anna=20Katrine=20J=C3=B8rgensen?=
Date: Mon, 3 Feb 2020 15:14:02 +0100
Subject: [PATCH 5/8] improving edit history functionality

---
 wikipedia/wikipedia.py | 31 +++++++++++++++++++++++++------
 1 file changed, 25 insertions(+), 6 deletions(-)

diff --git a/wikipedia/wikipedia.py b/wikipedia/wikipedia.py
index 2cc6e58..a2cd342 100644
--- a/wikipedia/wikipedia.py
+++ b/wikipedia/wikipedia.py
@@ -419,7 +419,7 @@ def __continued_query(self, query_params):
     if 'generator' in query_params:
       for datum in pages.values():  # in python 3.3+: "yield from pages.values()"
         yield datum
-    else:
+    elif prop in pages[self.pageid].keys():
       for datum in pages[self.pageid][prop]:
         yield datum
 
@@ -513,24 +513,43 @@ def parent_id(self):
   @property
   def revisions(self):
     '''
-    All revisions in revision history for a page.
+    Get all revisions in the revision history for a page.
     '''
 
+    today = datetime.now().strftime("%Y-%m-%dT%H:%M:%SZ")
+    print('processing revisions...')
+
     if not getattr(self, '_revision', False):
       query_params = {
-        'prop': 'extracts|revisions',
+        'prop': 'revisions',
         'rvprop': 'timestamp|user|comment|content',
         'rvslots': 'main',
-        'rvlimit': 500,
+        'rvlimit': max,
+        'rvstart': today,
       }
       if not getattr(self, 'title', None) is None:
-        query_params['titles'] = self.title
+        query_params['titles'] = self.title
       else:
-        query_params['pageids'] = self.pageid
+        query_params['pageids'] = self.pageid
 
       request = _wiki_request(query_params)
       self._revisions = request['query']['pages'][self.pageid]['revisions']
+
+      max_revisions = 20000
+      num_revisions = 0
+
+      while True and num_revisions < max_revisions:
+
+        if 'continue' in request:
+          query_params['continue'] = request['continue']['continue']
+          query_params['rvcontinue'] = request['continue']['rvcontinue']
+          request = _wiki_request(query_params)
+          self._revisions = self._revisions + request['query']['pages'][self.pageid]['revisions']
+          num_revisions = len(self._revisions)
+
+        else:
+          break
 
     return self._revisions
 
   @property

From 7b2f5f2791da632aa0238a19b4da7cce0e5ceee4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Anna=20Katrine=20J=C3%B8rgensen?=
Date: Fri, 21 Feb 2020 11:13:27 +0100
Subject: [PATCH 6/8] reworked

---
 wikipedia/wikipedia.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/wikipedia/wikipedia.py b/wikipedia/wikipedia.py
index a2cd342..4672209 100644
--- a/wikipedia/wikipedia.py
+++ b/wikipedia/wikipedia.py
@@ -517,14 +517,14 @@ def revisions(self):
     '''
 
     today = datetime.now().strftime("%Y-%m-%dT%H:%M:%SZ")
-    print('processing revisions...')
+    print('\t* Processing revisions *')
 
     if not getattr(self, '_revision', False):
       query_params = {
         'prop': 'revisions',
         'rvprop': 'timestamp|user|comment|content',
         'rvslots': 'main',
-        'rvlimit': max,
+        'rvlimit': 500000,
         'rvstart': today,
       }
       if not getattr(self, 'title', None) is None:
@@ -533,9 +533,9 @@ def revisions(self):
         query_params['pageids'] = self.pageid
 
       request = _wiki_request(query_params)
+      if "error" in request.keys(): return None
       self._revisions = request['query']['pages'][self.pageid]['revisions']
-
-      max_revisions = 20000
+      max_revisions = 40000
       num_revisions = 0
 
       while True and num_revisions < max_revisions:

From 609583c7b514bf8eb46c3310dd543de08f4a6761 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Anna=20J=C3=B8rgensen?=
Date: Mon, 23 Mar 2020 13:38:11 +0100
Subject: [PATCH 7/8] updating README

---
 README.rst             | 23 ++++++++++-------------
 wikipedia/wikipedia.py |  4 ++--
 2 files changed, 12 insertions(+), 15 deletions(-)

diff --git a/README.rst b/README.rst
index 03b8107..ac5779d 100644
--- a/README.rst
+++ b/README.rst
@@ -14,10 +14,15 @@ Wikipedia
 **Wikipedia** is a Python library that makes it easy to access and parse
 data from Wikipedia.
 
-Search Wikipedia, get article summaries, get data like links and images
-from a page, and more. Wikipedia wraps the `MediaWiki
-API `__ so you can focus on using
-Wikipedia data, not getting it.
+This forked version differs from the `Wikipedia master <https://github.com/goldsmith/Wikipedia>`_ (by @goldsmith) in two ways: it can extract the entire revision history of a Wikipedia page, and its geosearch functionality supports non-English languages.
+
+You can use this library to extract the data, and the `WikiRevParser `_ to parse and organize the information in each revision.
+Without the `WikiRevParser `_, the data for each revision is a noisy string, so parsing it before further processing or analysis is highly recommended.
+
+Besides these two differences, this fork is identical to the `master version <https://github.com/goldsmith/Wikipedia>`_ in installation, testing, and functionality.
+
+"Search Wikipedia, get article summaries, get data like links and images
+from a page, and more. Wikipedia wraps the `MediaWiki API `__ so you can focus on using
+Wikipedia data, not getting it."
 
 .. code:: python
@@ -91,16 +96,8 @@ full details.
 Credits
 -------
 
-- `wiki-api `__ by
-  @richardasaurus for inspiration
-- @nmoroze and @themichaelyang for feedback and suggestions
+- `Wikipedia <https://github.com/goldsmith/Wikipedia>`_ for the initial implementation
 - The `Wikimedia Foundation `__ for giving the world free access to data
-
-
-.. image:: https://d2weczhvl823v0.cloudfront.net/goldsmith/wikipedia/trend.png
-   :alt: Bitdeli badge
-   :target: https://bitdeli.com/free
-

diff --git a/wikipedia/wikipedia.py b/wikipedia/wikipedia.py
index 4672209..8b8ebef 100644
--- a/wikipedia/wikipedia.py
+++ b/wikipedia/wikipedia.py
@@ -1,7 +1,9 @@
 from __future__ import unicode_literals
+import re
 import requests
 import time
+
 from bs4 import BeautifulSoup
 from datetime import datetime, timedelta
 from decimal import Decimal
@@ -10,7 +12,6 @@
   PageError, DisambiguationError, RedirectError, HTTPTimeoutError,
   WikipediaException, ODD_ERROR_MESSAGE)
 from .util import cache, stdout_encode, debug
-import re
 
 API_URL = 'http://en.wikipedia.org/w/api.php'
 RATE_LIMIT = False
@@ -18,7 +19,6 @@
 RATE_LIMIT_LAST_CALL = None
 USER_AGENT = 'wikipedia (https://github.com/goldsmith/Wikipedia/)'
 
-
 def set_lang(prefix):
   '''
   Change the language of the API being requested.

From 4d2ab7c08d9131cfddd2f61f88c00157be6e0f9c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Anna=20J=C3=B8rgensen?=
Date: Mon, 23 Mar 2020 13:38:56 +0100
Subject: [PATCH 8/8] updating README

---
 README.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.rst b/README.rst
index ac5779d..b548a81 100644
--- a/README.rst
+++ b/README.rst
@@ -20,9 +20,9 @@ You can use this library to extract the data, and the `WikiRevParser
 
 Besides these two differences, this fork is identical to the `master version <https://github.com/goldsmith/Wikipedia>`_ in installation, testing, and functionality.
 
-"Search Wikipedia, get article summaries, get data like links and images
+Search Wikipedia, get article summaries, get data like links and images
 from a page, and more. Wikipedia wraps the `MediaWiki API `__ so you can focus on using
-Wikipedia data, not getting it."
+Wikipedia data, not getting it.
 
 .. code:: python
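
Taken together, the series supports the workflow sketched below: switch language, geosearch, and pull the full revision history. This is a minimal usage sketch, assuming the fork is installed under the usual ``wikipedia`` package name; the language code, coordinates, and the slice of revisions printed are illustrative placeholders, not taken from the patches.

.. code:: python

    import wikipedia

    # Patch 1/8 adds geosearch to the cached functions cleared on a language
    # switch, so geosearch results are no longer served from the previous
    # language's cache.
    wikipedia.set_lang('da')

    # Geosearch against the Danish API; the coordinates (roughly central
    # Copenhagen) are placeholders.
    nearby = wikipedia.geosearch(55.676, 12.568)
    page = wikipedia.page(nearby[0])

    # The revisions property (patches 1-6) starts from the current timestamp
    # and follows 'rvcontinue' until the history is exhausted or the
    # 40000-revision cap is reached; each entry is a dict with 'timestamp',
    # 'user', 'comment' and content fields, newest first.
    for rev in page.revisions[:3]:
        print(rev['timestamp'], rev.get('user', ''))

Because ``revisions`` pages through the entire history up front, the first access can be slow and memory-hungry on heavily edited articles; the ``max_revisions`` cap introduced in patch 6 bounds the worst case.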