forked from emijrp/wikidata
-
Notifications
You must be signed in to change notification settings - Fork 0
/
wikidatafun.py
115 lines (104 loc) · 4.18 KB
/
wikidatafun.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Copyright (C) 2017-2022 emijrp <emijrp@gmail.com>
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import datetime
import json
import os
import re
import sys
import _thread
import time
import unicodedata
import urllib
import urllib.request
import urllib.parse
def cronstop():
    """Stop the script during weekday daytime hours (currently disabled).

    The unconditional ``return`` below turns this function into a no-op:
    it always returns None and the schedule check after it never runs.
    """
    return
    # Unreachable while the early return above is in place. If re-enabled,
    # this would exit the process Monday to Friday (isoweekday 1-5) whenever
    # the local hour is between 5 and 17 inclusive.
    now = datetime.datetime.now()
    if now.isoweekday() in [1, 2, 3, 4, 5] and 4 < now.hour < 18:
        sys.exit()
def removeAccents(s):
    """Return *s* with combining accent marks removed.

    The string is decomposed with Unicode NFD normalization and every
    character whose category is 'Mn' (nonspacing mark) is dropped.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)
def _browserRequest(url, version):
    """Build a Request for *url* with a Firefox-like User-Agent header."""
    user_agent = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:123.0) Gecko/20100101 Firefox/%d.0' % (version)
    return urllib.request.Request(url, headers={'User-Agent': user_agent})


def _readText(req, timeout):
    """Open *req* and return its body, stripped and decoded as UTF-8."""
    # The context manager closes the response even if read/decode fails.
    with urllib.request.urlopen(req, timeout=timeout) as response:
        return response.read().strip().decode('utf-8')


def getURL(url='', retry=False, timeout=30):
    """Download *url* and return its content as text.

    Parameters:
        url: address to fetch. A malformed URL (for example the empty
            string) raises ValueError when the request is built.
        retry: when true, failed downloads are retried after waiting
            10, 20 and 40 seconds; the wait doubles after each failure
            and retrying stops once it would exceed 60 seconds.
        timeout: socket timeout in seconds for each attempt.

    Returns:
        The response body, stripped of surrounding whitespace and decoded
        as UTF-8, or '' if every attempt failed.
    """
    version = 110
    # Built outside the try block so an invalid URL is reported to the
    # caller instead of being treated as a download failure.
    req = _browserRequest(url, version)
    try:
        return _readText(req, timeout)
    except Exception:
        # Best-effort download: any failure falls back to '' or a retry.
        # KeyboardInterrupt/SystemExit are deliberately not caught.
        if not retry:
            return ''

    sleep = 10  # seconds
    maxsleep = 60
    while sleep <= maxsleep:
        print('Error while retrieving: %s' % (url))
        print('Retry in %s seconds...' % (sleep))
        time.sleep(sleep)
        try:
            # Returning here ends the retry loop as soon as one attempt
            # succeeds.
            return _readText(_browserRequest(url, version), timeout)
        except Exception:
            # Advertise a different browser version on the next attempt
            # and back off exponentially.
            version += 1
            sleep = sleep * 2
    return ''
def isScriptAliveCore(pidfilename=''):
    """Rewrite *pidfilename* with the text 'alive' every 10 seconds, forever.

    This function never returns; isScriptAlive() runs it in a background
    thread.
    """
    while True:
        # Mode 'w' creates the file if it is missing and truncates it
        # otherwise.
        with open(pidfilename, 'w') as heartbeat:
            heartbeat.write('alive')
        time.sleep(10)
def isScriptAlive(filename=''):
    """Exit if '<filename>.alive' exists, otherwise start writing it.

    If the marker file exists, a message is printed, the marker is deleted
    and the process exits via sys.exit(). Otherwise a background thread
    running isScriptAliveCore() is started to keep the marker refreshed,
    and the function returns None.
    """
    marker = '%s.alive' % (filename)
    if not os.path.exists(marker):
        print('Alive file not found. We continue this instance')
        try:
            _thread.start_new_thread(isScriptAliveCore, (marker,))
        except:
            # Any failure to start the heartbeat thread is only reported;
            # the script carries on without a marker file.
            print("Error: unable to start thread")
        return

    print('Script is working, we wont launch another copy. Exiting...')
    # NOTE(review): the marker is removed before exiting, presumably so a
    # stale file left by a dead process does not block later launches (a
    # live instance would recreate it on its next write) - confirm.
    os.remove(marker)
    sys.exit()
def getUserEditCount(user='', site=''):
    """Return the edit count of *user* on the wiki hosted at *site*.

    Parameters:
        user: user name; it is URL-quoted before being sent.
        site: host name of the wiki, inserted as-is into
            'https://<site>/w/api.php'.

    Returns:
        The 'editcount' value of the first entry in the API's
        query/users list, or 0 when either argument is empty, the
        download failed, the reply is not valid JSON, or the reply does
        not contain that field.
    """
    if not (user and site):
        return 0
    editcounturl = 'https://%s/w/api.php?action=query&list=users&ususers=%s&usprop=editcount&format=json' % (site, urllib.parse.quote(user))
    raw = getURL(editcounturl)
    try:
        # getURL() yields '' on failure, which json.loads rejects with a
        # ValueError; a missing key, an empty users list or an unexpected
        # reply shape is treated the same way.
        return json.loads(raw)['query']['users'][0]['editcount']
    except (ValueError, KeyError, IndexError, TypeError):
        return 0
def loadSPARQL(sparql=''):
    """Parse the JSON text of a SPARQL response.

    Parameters:
        sparql: response text to decode.

    Returns:
        The decoded JSON value, or None when *sparql* is empty or cannot
        be decoded. A message is printed in both failure cases.
    """
    if not sparql:
        print('Server return empty file')
        return None
    try:
        return json.loads(sparql)
    except Exception:
        # Parsing stays best-effort, but KeyboardInterrupt and SystemExit
        # are no longer swallowed as a bare except would do.
        print('Error downloading SPARQL? Malformatted JSON? Skiping\n')
        return None
def getAllCountries():
    """Return [label, Q-id] pairs from the Wikidata query service.

    The query selects every item whose P31 is Q6256, asks for English
    labels and orders the results by label.

    Returns:
        A list of two-element lists [label, qid], where qid is the part of
        the item URI after '/entity/'. The list is empty when the download
        failed or the response could not be parsed.
    """
    url = 'https://query.wikidata.org/bigdata/namespace/wdq/sparql?query=SELECT%20%3FitemLabel%20%3Fitem%0AWHERE%20%7B%0A%09%3Fitem%20wdt%3AP31%20wd%3AQ6256.%0A%20%20%20%20SERVICE%20wikibase%3Alabel%20%7B%20bd%3AserviceParam%20wikibase%3Alanguage%20%22en%22%20.%20%7D%0A%7D%0AORDER%20BY%20ASC(%3FitemLabel)'
    url = '%s&format=json' % (url)
    sparql = getURL(url=url)
    json1 = loadSPARQL(sparql=sparql)
    if not json1:
        # loadSPARQL() returns None for an empty or unparsable response;
        # subscripting it would raise TypeError.
        return []
    return [
        [result['itemLabel']['value'], result['item']['value'].split('/entity/')[1]]
        for result in json1['results']['bindings']
    ]