diff --git a/README.rst b/README.rst index 03b8107..b548a81 100644 --- a/README.rst +++ b/README.rst @@ -14,9 +14,14 @@ Wikipedia **Wikipedia** is a Python library that makes it easy to access and parse data from Wikipedia. +This forked version differs from the `Wikipedia master `_ (by @goldsmith) by enabling extraction of the entire revision history of a Wikipedia page as well as having non-English language support for the geo-search functionality. + +You can use this library to extract the data, and use the `WikiRevParser `_ to parse and organize the information in each revision. Without the `WikiRevParser `_, the data for each revision will be a noisy string, so parsing it before processing or analysing the data is highly recommended. + +Besides these two important differences, this fork is identical to the `master version `_ in installation, testing, and functionality. + Search Wikipedia, get article summaries, get data like links and images -from a page, and more. Wikipedia wraps the `MediaWiki -API `__ so you can focus on using +from a page, and more. Wikipedia wraps the `MediaWiki API `__ so you can focus on using Wikipedia data, not getting it. .. code:: python @@ -91,16 +96,8 @@ full details. Credits ------- -- `wiki-api `__ by - @richardasaurus for inspiration -- @nmoroze and @themichaelyang for feedback and suggestions +- `Wikipedia `_ for initial implementation - The `Wikimedia Foundation `__ for giving the world free access to data - - -.. 
image:: https://d2weczhvl823v0.cloudfront.net/goldsmith/wikipedia/trend.png - :alt: Bitdeli badge - :target: https://bitdeli.com/free - diff --git a/wikipedia/wikipedia.py b/wikipedia/wikipedia.py index 7ad50e1..8b8ebef 100644 --- a/wikipedia/wikipedia.py +++ b/wikipedia/wikipedia.py @@ -1,7 +1,9 @@ from __future__ import unicode_literals +import re import requests import time + from bs4 import BeautifulSoup from datetime import datetime, timedelta from decimal import Decimal @@ -10,7 +12,6 @@ PageError, DisambiguationError, RedirectError, HTTPTimeoutError, WikipediaException, ODD_ERROR_MESSAGE) from .util import cache, stdout_encode, debug -import re API_URL = 'http://en.wikipedia.org/w/api.php' RATE_LIMIT = False @@ -18,7 +19,6 @@ RATE_LIMIT_LAST_CALL = None USER_AGENT = 'wikipedia (https://github.com/goldsmith/Wikipedia/)' - def set_lang(prefix): ''' Change the language of the API being requested. @@ -31,7 +31,7 @@ def set_lang(prefix): global API_URL API_URL = 'http://' + prefix.lower() + '.wikipedia.org/w/api.php' - for cached_func in (search, suggest, summary): + for cached_func in (geosearch, search, suggest, summary): cached_func.clear_cache() @@ -419,7 +419,7 @@ def __continued_query(self, query_params): if 'generator' in query_params: for datum in pages.values(): # in python 3.3+: "yield from pages.values()" yield datum - else: + elif prop in pages[self.pageid].keys(): for datum in pages[self.pageid][prop]: yield datum @@ -510,6 +510,48 @@ def parent_id(self): return self._parent_id + @property + def revisions(self): + ''' + Get all revisions in the revision history for a page. 
+ ''' + + today = datetime.now().strftime("%Y-%m-%dT%H:%M:%SZ") + print('\t* Processing revisions *') + + if not getattr(self, '_revision', False): + query_params = { + 'prop': 'revisions', + 'rvprop': 'timestamp|user|comment|content', + 'rvslots': 'main', + 'rvlimit': 500000, + 'rvstart': today, + } + if not getattr(self, 'title', None) is None: + query_params['titles'] = self.title + else: + query_params['pageids'] = self.pageid + + request = _wiki_request(query_params) + if "error" in request.keys(): return None + self._revisions = request['query']['pages'][self.pageid]['revisions'] + max_revisions = 40000 + num_revisions = 0 + + while True and num_revisions < max_revisions: + + if 'continue' in request: + query_params['continue'] = request['continue']['continue'] + query_params['rvcontinue'] = request['continue']['rvcontinue'] + + request = _wiki_request(query_params) + self._revisions = self._revisions + request['query']['pages'][self.pageid]['revisions'] + num_revisions = len(self._revisions) + + else: + break + return self._revisions + @property def summary(self): ''' @@ -566,10 +608,10 @@ def coordinates(self): request = _wiki_request(query_params) - if 'query' in request: + try: coordinates = request['query']['pages'][self.pageid]['coordinates'] self._coordinates = (Decimal(coordinates[0]['lat']), Decimal(coordinates[0]['lon'])) - else: + except KeyError: self._coordinates = None return self._coordinates @@ -585,14 +627,17 @@ def references(self): def add_protocol(url): return url if url.startswith('http') else 'http:' + url - self._references = [ - add_protocol(link['*']) - for link in self.__continued_query({ - 'prop': 'extlinks', - 'ellimit': 'max' - }) - ] + try: + self._references = [ + add_protocol(link['*']) + for link in self.__continued_query({ + 'prop': 'extlinks', + 'ellimit': 'max' + }) + ] + except KeyError: + self._references = [] return self._references @property