#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Copyright (C) 2019-2024 emijrp <emijrp@gmail.com>
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import datetime
import html
import random
import re
import sys
import time
import urllib
import urllib.parse
import urllib.request
import unicodedata
import pywikibot
from pywikibot import pagegenerators
from wikidatafun import * # local helper module; this script uses its getURL() and loadSPARQL()
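# Configuration: "publishers" maps a short key to a news/library site domain and
# its Wikidata item; "languages" maps a language code to its Wikidata item, the
# death-related keywords used in news searches, and which publishers to query.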
publishers = {
"bne": {"domain": "bne.es", "q": "Q50358336"},
"bnf": {"domain": "bnf.fr", "q": "Q19938912"},
"bbc": {"domain": "bbc.com", "q": "Q747860"},
"ccma": {"domain": "ccma.cat", "q": "Q3323383"},
"rtve": {"domain": "rtve.es", "q": "Q54829"},
}
languages = {
"ca": {"q": "Q7026", "keywords": {"death": ["mor", "mort"]}, "publishers": ["ccma"] },
"en": {"q": "Q1860", "keywords": {"death": ["dies", "passes away"]}, "publishers": ["bbc"] },
"es": {"q": "Q1321", "keywords": {"death": ["fallece", "muere"]}, "publishers": ["rtve"] },
}
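# Strip diacritics by decomposing to NFD and dropping combining marks, so
# accented and unaccented spellings of a name can be matched.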
def removeaccute(s):
return ''.join((c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn'))
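# Add a BNE (Biblioteca Nacional de España) reference to the P31 "instance of
# human" claim when the item has exactly one P950 BNE id starting with "XX"
# and no BNE-based source yet.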
def addHumanRef(repo="", item=""):
global publishers
if not repo or not item:
return
item.get()
today = datetime.date.today()
    year, month, day = today.year, today.month, today.day
itembne = pywikibot.ItemPage(repo, publishers["bne"]["q"])
itembnf = pywikibot.ItemPage(repo, publishers["bnf"]["q"])
bneinref = False
bnfinref = False
if 'P31' in item.claims and len(item.claims['P31']) == 1:
#print(item.claims['P31'][0].getTarget())
sources = item.claims['P31'][0].getSources()
for source in sources:
#print(source)
if ("P248" in source and source["P248"][0].getTarget() == itembne) or \
("P950" in source) or \
("P854" in source and publishers["bne"]["domain"] in source["P854"][0].getTarget()):
bneinref = True
if ("P248" in source and source["P248"][0].getTarget() == itembnf) or \
("P268" in source) or \
("P854" in source and publishers["bnf"]["domain"] in source["P854"][0].getTarget()):
bnfinref = True
else:
print("Not Q5")
return
"""
if bnfinref:
print("Ya tiene BNF en referencia")
else:
print("NO tiene BNF en referencia")
"""
if bneinref:
print("Ya tiene BNE en referencia, skiping")
return
else:
print("NO tiene BNE en referencia")
bneid = ""
if 'P950' in item.claims:
if len(item.claims['P950']) == 1:
bneid = item.claims['P950'][0].getTarget()
print(bneid)
        else:
            print("More than one ID, skipping")
            return
    else:
        print("No BNE id, skipping")
        return
    if not bneid:
        print("No BNE id, skipping")
        return
    if not bneid.startswith("XX"):
        print("BNE ID does not start with XX, skipping")
        return
if itembne and bneid:
claim = item.claims['P31'][0]
refstatedinclaim = pywikibot.Claim(repo, 'P248')
refstatedinclaim.setTarget(itembne)
refretrieveddateclaim = pywikibot.Claim(repo, 'P813')
refretrieveddateclaim.setTarget(pywikibot.WbTime(year=year, month=month, day=day))
refbneidclaim = pywikibot.Claim(repo, 'P950')
refbneidclaim.setTarget(bneid)
claim.addSources([refstatedinclaim, refretrieveddateclaim, refbneidclaim], summary='BOT - Adding 1 reference')
print("Adding BNE reference to P31 claim")
return
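# Source an unsourced P21 (sex or gender) claim with a P887 (based on heuristic)
# reference to Q69652498 ("inferred from given name", per the variable name
# below) when the item's single P735 given name is typed as a male or female
# given name consistent with the stated gender.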
def addGenderRef(repo="", item=""):
if not repo or not item:
return
item.get()
today = datetime.date.today()
    year, month, day = today.year, today.month, today.day
maleitem = pywikibot.ItemPage(repo, "Q6581097")
malegivennameitem = pywikibot.ItemPage(repo, "Q12308941")
femaleitem = pywikibot.ItemPage(repo, "Q6581072")
femalegivennameitem = pywikibot.ItemPage(repo, "Q11879590")
if 'P21' in item.claims and len(item.claims['P21']) == 1:
#print(item.claims['P21'][0].getTarget())
sources = item.claims['P21'][0].getSources()
if not sources:
if 'P735' in item.claims and len(item.claims['P735']) == 1: #given name
givennameitem = item.claims['P735'][0].getTarget()
if not givennameitem: #sometimes no value https://www.wikidata.org/wiki/Q1918629
return
givennameitem.get()
if "P31" in givennameitem.claims and len(givennameitem.claims['P31']) == 1:
inferredfromgivenname = pywikibot.ItemPage(repo, "Q69652498")
if givennameitem.claims["P31"][0].getTarget() == malegivennameitem and item.claims['P21'][0].getTarget() == maleitem:
print("Male given name")
claim = item.claims['P21'][0]
refheuristicclaim = pywikibot.Claim(repo, 'P887')
refheuristicclaim.setTarget(inferredfromgivenname)
claim.addSources([refheuristicclaim], summary='BOT - Adding 1 reference')
print("Adding reference to gender claim")
return
elif givennameitem.claims["P31"][0].getTarget() == femalegivennameitem and item.claims['P21'][0].getTarget() == femaleitem:
print("Female given name")
claim = item.claims['P21'][0]
refheuristicclaim = pywikibot.Claim(repo, 'P887')
refheuristicclaim.setTarget(inferredfromgivenname)
claim.addSources([refheuristicclaim], summary='BOT - Adding 1 reference')
print("Adding reference to gender claim")
return
else:
return
return
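# Query DuckDuckGo's HTML endpoint and return result URLs restricted to the
# given domain; only the first matching result is kept. Scraping this page is
# brittle and will break if the markup changes.
# Example (hypothetical query): ddgSearch(search='muere "X" 2020 rtve.es', domain="rtve.es")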
def ddgSearch(search="", domain=""):
if not search:
return
time.sleep(2)
#search = search + " " + str(random.randint(100, 999))
ddgurl = "https://html.duckduckgo.com/html/?q=" + urllib.parse.quote_plus(search)
print(ddgurl)
raw = getURL(url=ddgurl)
resulturls = []
if '<div class="no-results">' in raw:
print("Sin resultados")
else:
splits = raw.split('<h2 class="result__title">')[1:]
for split in splits:
resulturl = ""
#print(split)
m = re.findall(r'(?im)<a rel="nofollow" class="result__a" href="([^"]+?)">[^<>]*?</a>', split)
if m and len(m) == 1:
resulturl = urllib.parse.unquote_plus(m[0]).split('uddg=')[1].split('&rut=')[0]
if re.search(r"(?im)^https?://(([^/]+)\.)?%s/" % (domain), resulturl):
resulturls.append(resulturl)
print("Found", resulturl)
break #just first result
return resulturls
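# Build a query from the language's death keywords, the person's label, the
# death year and the publisher's domain, fetch each search hit, and delegate
# parsing to the publisher-specific function. Returns a refcandidate dict
# (title/date/text plus publisher/lang/url) or nothing.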
def publisherCore(repo="", itemlabel="", deathdate="", publisher="", lang=""):
global publishers
global languages
if not repo or not itemlabel:
return
if not deathdate or not publisher or not lang:
return
resultpublisher = pywikibot.ItemPage(repo, publishers[publisher]["q"])
search = '%s "%s" %s %s' % (" ".join(languages[lang]["keywords"]["death"]), itemlabel, deathdate.year, publishers[publisher]["domain"])
print(search)
resulturls = ddgSearch(search=search, domain=publishers[publisher]["domain"])
refcandidate = ""
for resulturl in resulturls:
refcandidate = ""
try:
raw = getURL(url=resulturl)
print("Retrieving url from publisher", publisher)
if publisher == "ccma":
refcandidate = publisherCCMA(repo=repo, itemlabel=itemlabel, deathdate=deathdate, raw=raw)
elif publisher == "rtve":
refcandidate = publisherRTVE(repo=repo, itemlabel=itemlabel, deathdate=deathdate, raw=raw)
else:
print("Unknown publisher", publisher)
return
if refcandidate:
refcandidate["publisher"] = publisher
refcandidate["lang"] = lang
refcandidate["url"] = resulturl
if re.search(r"(?im)(%s|%s)" % (itemlabel, removeaccute(itemlabel)), refcandidate["title"]+" "+refcandidate["text"]):
return refcandidate
else:
return
        except:
            print("Error parsing reference")
return refcandidate
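# Parse a CCMA article using the Open Graph <meta> tags shown below; the body
# text is taken from the <article> element.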
def publisherCCMA(repo="", itemlabel="", deathdate="", raw=""):
#<meta property="og:title" content="">
#<meta property="article:published_time" content="2020-02-04T16:09:59+02:00">
resulttitle = unquote(re.findall(r'(?im)<meta property="og:title" content="([^<>]+?)">', raw)[0].strip())
resultdate = re.findall(r'(?im)<meta property="article:published_time" content="([^<>]+?)" */?>', raw)[0].split("T")[0].strip()
resultdate = datetime.datetime(year=int(resultdate.split("-")[0]), month=int(resultdate.split("-")[1]), day=int(resultdate.split("-")[2]))
resulttext = unquote(raw.split('<article')[1].split('</article>')[0])
refcandidate = {"title": resulttitle, "date": resultdate, "text": resulttext}
return refcandidate
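# Parse an RTVE article using the Dublin Core <meta> tags shown below; the body
# text is taken from the "artBody" div.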
def publisherRTVE(repo="", itemlabel="", deathdate="", raw=""):
#<meta name="DC.title" content="Muere el actor Francisco Merino, uno de los grandes secundarios del cine español"/>
#<meta name="DC.date" content="2022-10-09T18:07:00+02:00"/>
resulttitle = unquote(re.findall(r'(?im)<meta name="DC.title" content="([^<>]+?)" */?>', raw)[0].strip())
resultdate = re.findall(r'(?im)<meta name="DC.date" content="([^<>]+?)" */?>', raw)[0].split("T")[0].strip()
resultdate = datetime.datetime(year=int(resultdate.split("-")[0]), month=int(resultdate.split("-")[1]), day=int(resultdate.split("-")[2]))
resulttext = unquote(raw.split('<div class="artBody">')[1].split('<div class="slideH">')[0])
refcandidate = {"title": resulttitle, "date": resultdate, "text": resulttext}
return refcandidate
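# Decode URL percent-encoding and HTML entities in one pass.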
def unquote(s=""):
s = urllib.parse.unquote_plus(s)
s = html.unescape(s)
return s
def searchDeathdateRefCore(repo="", itemlabel="", deathdate="", publisher="", lang=""):
if not repo or not itemlabel or not deathdate or not publisher or not lang:
return
return publisherCore(repo=repo, itemlabel=itemlabel, deathdate=deathdate, publisher=publisher, lang=lang)
def searchDeathdateRef(repo="", item="", itemlabel="", deathdate="", lang=""):
global languages
if not repo or not item:
return
if not itemlabel or not deathdate or not lang:
return
refcandidate = ""
if 'P27' in item.claims and len(item.claims['P27']) == 1: #citizenship
itemcountry = item.claims['P27'][0].getTarget()
itemcountry.get()
if itemcountry.labels["en"] == "Spain" and lang in ["es", "ca", "eu", "gl"]:
for publisher in languages[lang]["publishers"]:
print("Searching ref in", publisher)
refcandidate = searchDeathdateRefCore(repo=repo, itemlabel=itemlabel, deathdate=deathdate, publisher=publisher, lang=lang)
if refcandidate:
break #just 1 ref per lang
else:
return
return refcandidate
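# Return True if any existing source already cites one of this language's
# publishers via a P854 reference URL on that publisher's domain.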
def hasSourceInThisLang(sources="", lang=""):
global publishers
    if not lang: # a lang must always be passed
return True
hassourceinlang = False
if not sources:
return hassourceinlang
for source in sources:
#print(source)
for publisher, props in publishers.items():
if not publisher in languages[lang]["publishers"]:
continue
if 'P854' in source:
if re.search(r"(?im)^https?://(([^/]+)\.)?%s/" % (props["domain"]), source['P854'][0].getTarget()):
print("Ya tiene referencia a", props["domain"])
hassourceinlang = True
return hassourceinlang
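# Attach a full news reference (P1476 title, P123 publisher, P577 publication
# date, P407 language of work, P854 reference URL, P813 retrieved) to the
# P570 date-of-death claim.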
def addDeathdateRefCore(repo="", item="", refcandidate=""):
global languages
global publishers
if not repo or not item or not refcandidate:
return
today = datetime.date.today()
    year, month, day = today.year, today.month, today.day
item.get()
claim = item.claims['P570'][0]
reftitleclaim = pywikibot.Claim(repo, 'P1476')
reftitlemonotext = pywikibot.WbMonolingualText(text=refcandidate["title"], language=refcandidate["lang"])
reftitleclaim.setTarget(reftitlemonotext)
refpublisherclaim = pywikibot.Claim(repo, 'P123')
refpublisherclaim.setTarget(pywikibot.ItemPage(repo, publishers[refcandidate["publisher"]]["q"]))
refpublisheddateclaim = pywikibot.Claim(repo, 'P577')
refpublisheddateclaim.setTarget(pywikibot.WbTime(year=refcandidate["date"].year, month=refcandidate["date"].month, day=refcandidate["date"].day))
reflanguageofworkclaim = pywikibot.Claim(repo, 'P407')
reflanguageofworkclaim.setTarget(pywikibot.ItemPage(repo, languages[refcandidate["lang"]]["q"]))
refreferenceurlclaim = pywikibot.Claim(repo, 'P854')
refreferenceurlclaim.setTarget(refcandidate["url"])
refretrieveddateclaim = pywikibot.Claim(repo, 'P813')
refretrieveddateclaim.setTarget(pywikibot.WbTime(year=year, month=month, day=day))
newsource = [reftitleclaim, refpublisherclaim, refpublisheddateclaim, reflanguageofworkclaim, refreferenceurlclaim, refretrieveddateclaim]
try:
print(newsource)
print("Adding deathdate reference")
claim.addSources(newsource, summary='BOT - Adding 1 reference')
except:
print("Error while saving, skipping")
def addDeathdateRef(repo="", item=""):
global languages
if not repo or not item:
return
item.get()
today = datetime.date.today()
    year, month, day = today.year, today.month, today.day
if ('P569' in item.claims and len(item.claims['P569']) == 1) and \
('P570' in item.claims and len(item.claims['P570']) == 1):
#print(item.claims['P570'][0].getTarget())
for lang in languages.keys():
refcandidate = ""
item.get()
sources = item.claims['P570'][0].getSources()
if hasSourceInThisLang(sources, lang):
print("Ya tiene referencia en idioma:", lang)
continue
            itemlabel = item.labels.get(lang) or item.labels.get("en") or ""
if not itemlabel:
print("Label not found for lang", lang)
continue
birthdate = item.claims['P569'][0].getTarget()
deathdate = item.claims['P570'][0].getTarget()
if itemlabel and deathdate:
print("Buscando ref:", itemlabel, deathdate.year, lang)
refcandidate = searchDeathdateRef(repo=repo, item=item, itemlabel=itemlabel, deathdate=deathdate, lang=lang)
if refcandidate:
#print(refcandidate)
if not re.search(r"(?im)\b(%s)\b" % ("|".join(languages[lang]["keywords"]["death"])), refcandidate["title"]):
print("Not found keywords in title: %s" % (refcandidate["title"]))
return
if not re.search(r"(?im)\b(%s)\b" % (itemlabel), refcandidate["title"]):
print("Not found %s in title: %s" % (itemlabel, refcandidate["title"]))
return
if deathdate.year != refcandidate["date"].year:
print("Death year and news year are different")
return
addDeathdateRefCore(repo=repo, item=item, refcandidate=refcandidate)
return
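# Source an unsourced P27 (citizenship) claim with a P887 (based on heuristic)
# reference to Q91770864 ("inferred from birthplace", per the variable name
# below) when the country (P17) of the birthplace (P19) equals the stated
# citizenship; limited to people born after 1950 to avoid defunct countries.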
def addCitizenshipRef(repo="", item=""):
if not repo or not item:
return
item.get()
if ('P569' in item.claims and len(item.claims['P569']) == 1 and item.claims['P569'][0].getTarget().year >= 1950) and \
('P27' in item.claims and len(item.claims['P27']) == 1) and \
('P19' in item.claims and len(item.claims['P19']) == 1): #P569 birthdate, P27 citizenship, P19 birthplace, 1950 to avoid old countries
#print(item.claims['P19'][0].getTarget())
sources = item.claims['P27'][0].getSources()
if not sources:
birthplace = item.claims['P19'][0].getTarget()
birthplace.get()
if ('P17' in birthplace.claims and len(birthplace.claims['P17']) == 1): #P17 country
birthcountry = birthplace.claims['P17'][0].getTarget()
inferredfrombirthplace = pywikibot.ItemPage(repo, "Q91770864")
if birthcountry == item.claims['P27'][0].getTarget():
claim = item.claims['P27'][0]
refheuristicclaim = pywikibot.Claim(repo, 'P887')
refheuristicclaim.setTarget(inferredfrombirthplace)
claim.addSources([refheuristicclaim], summary='BOT - Adding 1 reference')
print(birthcountry)
print("Adding reference to citizenship claim")
return
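# Source an unsourced P735 (given name) claim with a P887 reference to
# Q97033143 ("inferred from full name", per the variable name below) when the
# English label is exactly two words starting with the given name, and the
# name item is typed as a given name.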
def addGivennameRef(repo="", item=""):
if not repo or not item:
return
item.get()
if ('P735' in item.claims and len(item.claims['P735']) == 1): #P735 given name
#print(item.claims['P735'][0].getTarget())
sources = item.claims['P735'][0].getSources()
if not sources:
givenname = item.claims['P735'][0].getTarget()
givenname.get()
given = pywikibot.ItemPage(repo, "Q202444")
givenmale = pywikibot.ItemPage(repo, "Q12308941")
givenfemale = pywikibot.ItemPage(repo, "Q11879590")
if not 'P31' in givenname.claims or \
(not given in [x.getTarget() for x in givenname.claims['P31']] and \
not givenmale in [x.getTarget() for x in givenname.claims['P31']] and \
not givenfemale in [x.getTarget() for x in givenname.claims['P31']]):
print("Not a given name")
return
if ("en" in givenname.labels and "en" in item.labels and \
len(givenname.labels["en"]) >= 4 and \
len(item.labels["en"].split(" ")) == 2 and item.labels["en"].startswith(givenname.labels["en"]+" ")):
inferredfromfullname = pywikibot.ItemPage(repo, "Q97033143")
claim = item.claims['P735'][0] #P735 given name
refheuristicclaim = pywikibot.Claim(repo, 'P887')
refheuristicclaim.setTarget(inferredfromfullname)
claim.addSources([refheuristicclaim], summary='BOT - Adding 1 reference')
print("Adding reference to given name claim")
return
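# Same heuristic as addGivennameRef, for the P734 (family name) claim: the
# English label must end with the family name and the name item must be typed
# as a family name (Q101352).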
def addFamilynameRef(repo="", item=""):
    if not repo or not item:
        return
    item.get()
    if ('P734' in item.claims and len(item.claims['P734']) == 1): #P734 family name
#print(item.claims['P734'][0].getTarget())
sources = item.claims['P734'][0].getSources()
if not sources:
familyname = item.claims['P734'][0].getTarget()
familyname.get()
family = pywikibot.ItemPage(repo, "Q101352")
if not 'P31' in familyname.claims or \
(not family in [x.getTarget() for x in familyname.claims['P31']]):
print("Not a family name")
return
if ("en" in familyname.labels and "en" in item.labels and \
len(familyname.labels["en"]) >= 4 and \
len(item.labels["en"].split(" ")) == 2 and item.labels["en"].endswith(" "+familyname.labels["en"])):
inferredfromfullname = pywikibot.ItemPage(repo, "Q97033143")
claim = item.claims['P734'][0] #P734 family name
refheuristicclaim = pywikibot.Claim(repo, 'P887')
refheuristicclaim.setTarget(inferredfromfullname)
claim.addSources([refheuristicclaim], summary='BOT - Adding 1 reference')
print("Adding reference to family name claim")
return
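# Main loop: repeatedly sample random humans (Q5) born after 1950 via SPARQL,
# shuffle the results, and try to add references to each item. The first two
# query lists below are kept only as alternatives and are overwritten.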
def main():
site = pywikibot.Site('wikidata', 'wikidata')
repo = site.data_repository()
for i in range(1000):
        queries = [ # BNE-id sample (overridden by the assignments below)
"""
SELECT ?item
WHERE {
SERVICE bd:sample {
?item wdt:P950 ?id. #P950 BNE id
bd:serviceParam bd:sample.limit 10000 .
bd:serviceParam bd:sample.sampleType "RANDOM" .
}
?item wdt:P31 wd:Q5.
}
#random%d
""" % (random.randint(1000000, 9999999))
]
        queries = [ # for testing deathdate news (overridden by the next assignment)
"""
SELECT ?item ?linkcount
WHERE {
?item wdt:P31 wd:Q5.
?item wdt:P27 wd:Q29.
?item wdt:P570 ?deathdate.
FILTER (?deathdate > "2015-01-01"^^xsd:dateTime).
?item wikibase:sitelinks ?linkcount .
}
GROUP BY ?item ?linkcount
HAVING (?linkcount > 20)
ORDER BY DESC(?linkcount)
#random%d
""" % (random.randint(1000000, 9999999))
]
queries = [
"""
SELECT ?item
WHERE {
SERVICE bd:sample {
?item wdt:P31 wd:Q5 .
bd:serviceParam bd:sample.limit 10000 .
bd:serviceParam bd:sample.sampleType "RANDOM" .
}
?item wdt:P31 wd:Q5.
?item wdt:P569 ?birthdate.
FILTER (?birthdate > "1950-01-01"^^xsd:dateTime).
}
#random%d
""" % (random.randint(1000000, 9999999))
]
for query in queries:
time.sleep(1)
url = 'https://query.wikidata.org/bigdata/namespace/wdq/sparql?query=%s' % (urllib.parse.quote(query))
url = '%s&format=json' % (url)
print("Loading...", url)
sparql = getURL(url=url)
json1 = loadSPARQL(sparql=sparql)
qlist = []
for result in json1['results']['bindings']:
q = 'item' in result and result['item']['value'].split('/entity/')[1] or ''
if q:
qlist.append(q)
if not qlist: #empty query result? maybe no more Q
continue
            random.shuffle(qlist) # otherwise every run starts with the same items and must re-walk what was already processed
for q in qlist:
print('\n== %s ==' % (q))
print('https://www.wikidata.org/wiki/%s' % (q))
item = pywikibot.ItemPage(repo, q)
try: #to detect Redirect because .isRedirectPage fails
item.get()
except:
print('Error while .get()')
continue
try:
addHumanRef(repo=repo, item=item) #bne, orcid, google scholar...
#addGenderRef(repo=repo, item=item)
#addCitizenshipRef(repo=repo, item=item)
addGivennameRef(repo=repo, item=item)
addFamilynameRef(repo=repo, item=item)
                    #addDeathdateRef(repo=repo, item=item) # does not yield many results
except:
pass
if __name__ == "__main__":
main()