-
Notifications
You must be signed in to change notification settings - Fork 2
/
functions.py
108 lines (94 loc) · 3.59 KB
/
functions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
# -*- coding: utf-8 -*-
import re
import json
import pywikibot
from pywikibot import page
import cache
import logger
# Module-wide logger built by the project-local `logger` module.
LOG = logger.logger(debug=True) # logger in debug mode
# pywikibot site handles, created once at import time and shared by all
# functions below: French Wiktionary (entries and thesaurus pages),
# French Wikipedia (author articles) and Wikidata.
WIKTIONNAIRE = pywikibot.Site('fr', 'wiktionary')
WIKIPEDIA = pywikibot.Site('fr', 'wikipedia')
# NOTE(review): WIKIDATA is not referenced by any function visible in this
# file; confirm whether other modules import it before removing.
WIKIDATA = pywikibot.Site('wikidata', 'wikidata')
# Regular expressions applied to raw wikitext. They are written as raw
# strings so that the backslash escapes reach the regex engine untouched
# instead of being (invalid) string-literal escape sequences.

# A whole "{{source|...}}" citation template (greedy up to the last "}}"
# on the line).
source = re.compile(r"{{source\|.+}}")
# "{{w|Name" followed by exactly ONE terminating "|" or "}" character.
wAuthor = re.compile(r"{{w\|[^|}]+[|}]")
# "{{nom w pc|First|Last" capturing the two name parts as groups.
nomWAuthor = re.compile(r"{{nom w pc\|([^|}]+)\|([^|}]+)[|}]")
# A plain "[[target]]" wikilink; piped links ("[[a|b]]") do not match.
link = re.compile(r"\[\[[^|\]]+\]\]")

# Wikidata property identifiers collected for each author.
gender = "P21"
nationality = "P27"
birthDate = "P569"
datas = [gender, nationality, birthDate]

# Keys used inside the cache structure.
authors = "authors"
words = u'words'

# Pieces of a thesaurus page title: prefix + <name> + language suffix.
thesaurusPrefix = u"Thésaurus:"
fr = u"/français"
# Shared in-memory cache, loaded once at import time through the
# project-local `cache` module. The functions in this file index it with the
# top-level keys "thesaurus", `words` and `authors`, and mutate it in place.
CACHE = cache.get()
def sources(word):
    """Find sources information for word.

    Looks up the Wiktionary entry for ``word``, extracts every
    ``{{source|...}}`` template from its text and records the authors cited
    in those templates in the shared cache:

    * ``CACHE[words][word]`` is reset to the list of cited authors (one
      entry per citation, so an author may appear several times);
    * ``CACHE[authors][author]`` is created on first sight of an author,
      from ``characteristics(author)``, with an empty ``words`` list;
    * ``word`` is appended to ``CACHE[authors][author][words]``.

    Returns True when the entry exists on Wiktionary, False otherwise (in
    which case the cache is left untouched).
    """
    LOG.debug(word)
    p = page.Page(WIKTIONNAIRE, word)
    # exists is a method: the bare attribute is always truthy, so it has to
    # be called for the missing-page case to be detected.
    if not p.exists():
        return False

    CACHE[words][word] = []
    for template in source.findall(p.text):
        # Authors are linked to wikipedia.
        # A wAuthor match is "{{w|" + name + ONE terminator ("|" or "}"),
        # so strip 4 leading characters and a single trailing one.
        wikiAuthors = [x[4:-1] for x in wAuthor.findall(template)]
        # "{{nom w pc|First|Last" yields (first, last) tuples.
        wikiAuthors += [x[0] + " " + x[1]
                        for x in nomWAuthor.findall(template)]
        for wikiAuthor in wikiAuthors:
            if wikiAuthor not in CACHE[authors]:
                CACHE[authors][wikiAuthor] = characteristics(wikiAuthor)
                CACHE[authors][wikiAuthor][words] = []
            CACHE[words][word].append(wikiAuthor)
            CACHE[authors][wikiAuthor][words].append(word)
    return True
def characteristics(author):
    """Find characteristics for an author on Wikidata.

    Resolves ``author`` as a French Wikipedia article title (following
    redirects), fetches the linked Wikidata item and reads the properties
    listed in ``datas``.

    Returns a dict keyed by property id. The birth-date property maps to
    the year of the claim's target; every other property maps to the ``id``
    of the claim's target. When a property has several claims with a
    target, the last one wins. The dict is empty when the article does not
    exist or none of the properties is set.
    """
    LOG.debug("\t%s", author)
    result = {}
    wikiArticle = page.Page(WIKIPEDIA, author)
    if not wikiArticle.exists():
        return result

    # Follow redirects until the actual article is reached.
    while wikiArticle.isRedirectPage():
        wikiArticle = wikiArticle.getRedirectTarget()

    item = wikiArticle.data_item()
    for data in datas:
        if data not in item.claims:
            continue
        for claim in item.claims[data]:
            target = claim.getTarget()
            if target is None:
                # Claim without a concrete target: nothing to record.
                continue
            # Compare property ids by value, not by object identity.
            if data == birthDate:
                result[data] = target.year
            else:
                result[data] = target.id
    return result
def harvest(thesaurus):
    """Harvest information about a given Thesaurus and save cache.

    Reads the Wiktionary page ``thesaurusPrefix + thesaurus + fr``, runs
    ``sources`` on every plain ``[[word]]`` link found in it and stores the
    words whose entry exists under ``CACHE["thesaurus"][thesaurus]``. The
    cache is then written back with ``cache.save``.

    Nothing is done when the thesaurus is already cached or when its page
    does not exist. Returns None.
    """
    LOG.info("Harvesting: %s", thesaurus)
    # Cheap in-memory test first, so no request is made for cached entries.
    if thesaurus in CACHE["thesaurus"]:
        return

    p = page.Page(WIKTIONNAIRE, thesaurusPrefix + thesaurus + fr)
    # exists is a method: the bare attribute is always truthy.
    if not p.exists():
        return

    LOG.debug(thesaurus.upper())
    CACHE["thesaurus"][thesaurus] = []
    # Strip the surrounding "[[" and "]]" from each matched link.
    wikiWords = [x[2:-2] for x in link.findall(p.text)]
    for wikiWord in wikiWords:
        if sources(wikiWord):
            CACHE["thesaurus"][thesaurus].append(wikiWord)
    cache.save(CACHE)
def analyse(thesaurus):
    """Present results on a given thesaurus from the cache file.

    Returns a dict mapping each property id in ``datas`` to a dict of
    ``value -> number of occurrences``, counted over every author entry of
    every word cached for ``thesaurus``. An empty dict is returned when the
    thesaurus is not in the cache.
    """
    LOG.info("Analysing: %s", thesaurus)
    if thesaurus not in CACHE["thesaurus"]:
        return {}

    # One (initially empty) tally per tracked property.
    result = {prop: {} for prop in datas}
    for entry in CACHE["thesaurus"][thesaurus]:
        for name in CACHE[words][entry]:
            for prop in datas:
                if prop not in CACHE[authors][name]:
                    continue
                value = CACHE[authors][name][prop]
                tally = result[prop]
                tally[value] = tally.get(value, 0) + 1
    return result