Skip to content

Commit

Permalink
Add translations support
Browse files Browse the repository at this point in the history
  • Loading branch information
Surkal committed Oct 11, 2020
1 parent f53960d commit 5302228
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 11 deletions.
5 changes: 5 additions & 0 deletions tests/test_html_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,3 +71,8 @@ def test_subsections(self):
assert len(page.sections_id) == 6
assert len(page.sections_id['#Nom_commun']) == 9
assert '#Synonymes' in page.sections_id['#Nom_commun']

def test_messy_related_ords(self):
# Regression check for get_related_words on a pinned revision
# (oldid=28604039) of the 'merci' entry. The expected dictionaries are
# keyed by section name ('Nom commun 1', 'Interjection') and hold one
# list of related words per section.
page = WiktionnaireParser.from_source('merci', oldid=28604039)
# Only one section of this revision is expected to list synonyms.
assert page.get_related_words('Synonymes') == {'Nom commun 1': ['grâce', 'miséricorde', 'pitié']}
# Derived terms are expected under two different sections.
assert page.get_related_words('Dérivés') == {'Nom commun 1': ['sans merci', 'à la merci de'], 'Interjection': ['Dieu merci', 'grand merci', 'merci beaucoup', 'merci énormément', 'merci infiniment', 'mille mercis', 'non merci', 'remercier', 'remerciement', 'un grand merci']}
54 changes: 43 additions & 11 deletions wiktionnaireparser/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,26 +194,58 @@ def get_etymology(self):

return etym

def get_related_words(self, related_word):
"""
Get related words. Doesn't work for translations.
Possible parameters: Apparentés étymologiques, Dérivés, Synonymes,
Dérivés dans d’autres langues, Traductions, Hyponymes, Hyperonymes,
Variantes orthographiques, Abréviations, Homophones, Méronymes,
Vocabulaire apparenté par le sens, etc.
"""
def get_related_words_ids(self, related_word):
    """Locate the sub-sections titled *related_word* in every section.

    The title is converted to anchor form (spaces become underscores,
    a leading ``#`` is added) and compared against each entry of
    ``self.sections_id``. An optional numeric suffix (``_2``, ``_3``, ...)
    is accepted, so repeated sub-sections are found as well.

    Parameters:
        related_word: sub-section title, e.g. ``'Synonymes'``.

    Returns:
        A dict mapping the displayed name of each parent section to the
        id of its matching sub-section. If one parent section contains
        several matches, the last one wins. Empty when nothing matches.
    """
    # Escape the title so that regex metacharacters in it are taken
    # literally, and require a full match so that e.g. 'Dérivés' does not
    # also select '#Dérivés_dans_d’autres_langues'.
    anchor = re.escape(related_word.replace(' ', '_'))
    pattern = re.compile(r'#%s(?:_\d+)?' % anchor)

    ids = {}
    for key, values in self.sections_id.items():
        name = None
        for value in values:
            if pattern.fullmatch(value) is None:
                continue
            if name is None:
                # Resolve the section's displayed name only when needed.
                name = self._query.find(key).text()
            # Keyed by section name only (not by the section id).
            ids[name] = value
    return ids

def get_related_words(self, related_word):
    """Return the related words listed under *related_word* sub-sections.

    Possible parameters: Apparentés étymologiques, Dérivés, Synonymes,
    Dérivés dans d’autres langues, Hyponymes, Hyperonymes,
    Variantes orthographiques, Abréviations, Homophones, Méronymes,
    Vocabulaire apparenté par le sens, etc.
    For translations, use `get_translations`.

    Parameters:
        related_word: sub-section title, passed to
            ``get_related_words_ids``.

    Returns:
        A dict with the same keys as ``get_related_words_ids`` returns;
        each value is the list of link texts found in that sub-section's
        list. The list is empty when no ``ul`` follows the heading.
    """
    ids = self.get_related_words_ids(related_word)
    related_words = {}
    for key, value in ids.items():
        # Start from the element that wraps the sub-section anchor and
        # walk its following siblings until the word list is reached.
        section = self._query.find(value)[0].getparent()
        while section is not None and section.tag != 'ul':
            if section.tag == 'div' and section.attrib.get('class') == 'boite':
                # Boxed layout: the list is taken from the sibling two
                # positions before the box, three divs deep.
                # NOTE(review): layout assumption kept from the original
                # code; this branch still raises AttributeError if the
                # markup differs.
                section = section.getprevious().getprevious().find('div')
                section = section.find('div').find('div').find('ul')
                break
            section = section.getnext()

        related = []
        if section is not None:
            for item in section:
                link = item.find('a')
                # Only the link text is kept; items without a link are
                # skipped instead of raising on None.
                if link is not None:
                    related.append(link.text_content())
        related_words[key] = related
    return related_words

def get_translations(self, translation_id):
    """Return the translations listed under a translation section.

    Parameters:
        translation_id: selector of the translation heading anchor,
            as accepted by ``self._query.find``.

    Returns:
        A dict mapping each language label (text of the first ``span``
        of a list item) to the list of translation texts found in that
        item. Returns an empty dict when the expected nesting below the
        heading is not present. If a label occurs twice, the later item
        replaces the earlier one.

    Raises:
        IndexError: presumably, when ``translation_id`` matches nothing
            (the lookup result is indexed with ``[0]``).
    """
    heading = self._query.find(translation_id)[0].getparent()

    # Fixed descent from the heading's next sibling down to the first
    # list item. ``None`` means "move to the next sibling"; a string means
    # "first direct child with that tag". Any missing level ends the
    # search with an empty result instead of an AttributeError.
    path = ('div', 'div', None, 'div', 'div', 'ul', 'li')
    line = heading.getnext()
    for step in path:
        if line is None:
            return {}
        line = line.getnext() if step is None else line.find(step)

    result = {}
    while line is not None:
        label = line.find('span')
        # An item without a language label cannot be keyed; skip it
        # rather than abort the whole extraction.
        if label is not None:
            words = []
            # Walk from the first direct link through all following
            # siblings (of any tag), as the original did. Elements with no
            # attributes and 'trad-exposant' markers are ignored.
            node = line.find('a')
            while node is not None:
                if node.attrib and node.attrib.get('class') != 'trad-exposant':
                    words.append(node.text_content())
                node = node.getnext()
            result[label.text_content()] = words
        line = line.getnext()
    return result

0 comments on commit 5302228

Please sign in to comment.