-
Notifications
You must be signed in to change notification settings - Fork 17
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
69534f6
commit 3a19434
Showing
8 changed files
with
331 additions
and
48 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,212 @@ | ||
from enum import Enum | ||
|
||
from .page.grammar import GrammarPage | ||
|
||
# Section titles as they appear on the grammar pages. They are the top-level
# keys of the parsed table data and of the transformation lookup table.

# noun section
NOUN = "Deklination"

# adjective sections
ADJ_STRONG = "Starke Beugung (ohne Artikel)"
ADJ_WEAK = "Schwache Beugung (mit Artikel)"
ADJ_MIXED = "Gemischte Beugung (mit ein, kein, Possessivpronomen u. a.)"
ADJ_COMPARE = "Steigerungsformen"

# verb sections: moods
VERB_INDICATIVE = "Indikativ"
VERB_SUBJUNCTIVE_I = "Konjunktiv I"
VERB_SUBJUNCTIVE_II = "Konjunktiv II"

# verb sections: imperative and non-finite forms
VERB_IMPERATIVE = "Imperativ"
VERB_INFINITIVE_FORMS = "Infinite Formen"
|
||
|
||
class Number(Enum):
    """Grammatical number; each value is the German label used as a lookup key."""

    SINGULAR = "Singular"
    PLURAL = "Plural"
|
||
|
||
class Case(Enum):
    """Grammatical case; each value is the German label used as a lookup key."""

    NOMINATIVE = "Nominativ"
    GENITIVE = "Genitiv"
    DATIVE = "Dativ"
    ACCUSATIVE = "Akkusativ"
|
||
class Gender(Enum):
    """Grammatical gender; each value is the German label used as a lookup key."""

    MASCULINE = "Maskulinum"
    FEMININE = "Femininum"
    NEUTER = "Neutrum"
|
||
class Degree(Enum):
    """Degree of comparison; each value is the German label used as a lookup key."""

    POSITIVE = "Positiv"
    COMPARATIVE = "Komparativ"
    SUPERLATIVE = "Superlativ"
|
||
# TODO: reduce to three categories
class Person(Enum):
    """Person/number combinations, identified by their German pronoun."""

    FIRST_SINGULAR = "ich"
    SECOND_SINGULAR = "du"
    THIRD_SINGULAR = "er/sie/es"
    FIRST_PLURAL = "wir"
    SECOND_PLURAL = "ihr"
    THIRD_PLURAL = "sie"
|
||
class Mood(Enum):
    """Verb mood; each value is the title of the matching verb section."""

    INDICATIVE = VERB_INDICATIVE
    SUBJUNCTIVE_I = VERB_SUBJUNCTIVE_I
    SUBJUNCTIVE_II = VERB_SUBJUNCTIVE_II
|
||
class Tense(Enum):
    """Verb tense; each value is the German label used as a lookup key."""

    PRESENT = "Präsens"
    PAST = "Präteritum"
    PERFECT = "Perfekt"
    PAST_PERFECT = "Plusquamperfekt"
    FUTURE = "Futur I"
    FUTURE_PERFECT = "Futur II"
|
||
# TODO: rely only on number
class ImperativePerson(Enum):
    """Addressee of an imperative; values are the labels used as lookup keys."""

    PERSON_2_SINGULAR = "2. Person Singular [du]"
    PERSON_2_PLURAL = "2. Person Plural [ihr]"
|
||
class InfinitiveForm(Enum):
    """Non-finite verb forms; values are the labels used as lookup keys."""

    INFINITIVE_WITH_ZU = "Infinitiv mit zu"
    PARTICIPLE_I = "Partizip I"
    PARTICIPLE_II = "Partizip II"
|
||
|
||
class KeyChainError(KeyError):
    """KeyError variant that remembers the keys already followed in a nested dict.

    Attributes:
        key: the key whose lookup failed.
        previous_keys: list of the keys that were resolved successfully before
            ``key``, outermost first.
    """

    def __init__(self, key, previous_keys):
        super().__init__(key)
        self.key = key
        # Store a private copy: callers typically pass their own live path
        # list, and a handler that edits ``previous_keys`` must not be able to
        # change that list behind the caller's back.
        self.previous_keys = list(previous_keys)
|
||
class Enumdict:
    """
    A read-only dict wrapper with these properties
    * Enums behave as their raw values when used as keys
      (d["Singular"] == d[Number.SINGULAR])
    * Nested inner dicts are also Enumdicts and store the key path they were
      accessed with
    * Membership tests (``key in d``) and iteration delegate to the wrapped
      dict; iteration yields the raw keys
    """

    def __init__(self, source, key_prefix=None):
        self.source = source
        # Keys followed to reach ``source``; only used for error messages.
        self.key_prefix = list(key_prefix) if key_prefix else []

    @staticmethod
    def _raw(key):
        """Return the value of an Enum member, or ``key`` itself otherwise."""
        return key.value if isinstance(key, Enum) else key

    def __getitem__(self, key):
        """Return the entry for ``key``, wrapping nested dicts in an Enumdict.

        Raises:
            KeyChainError: if ``key`` is missing; the error carries the key
                path that led to this level.
        """
        real_key = self._raw(key)
        try:
            value = self.source[real_key]
        except KeyError:
            raise KeyChainError(real_key, self.key_prefix) from None
        if isinstance(value, dict):
            return Enumdict(value, key_prefix=self.key_prefix + [real_key])
        return value

    def __contains__(self, key):
        # Without this, ``in`` falls back to calling __getitem__ with 0, 1, ...
        # and fails with a KeyChainError instead of answering the question.
        return self._raw(key) in self.source

    def __iter__(self):
        # Same reason as __contains__: avoid the integer-index fallback.
        return iter(self.source)

    def __repr__(self):
        return f"{self.__class__.__name__}({self.source!r})"
|
||
class Inflector:
    """Look up inflected forms in the grammar tables of a parsed page.

    The tables of each page section are converted into nested dicts
    (``transformed``) and wrapped in an Enumdict (``enumraw``), so every
    lookup key may be given either as an Enum member or as its raw string.
    """

    def __init__(self, soup):
        self.page = GrammarPage(soup)
        self.transformed = {
            key: conditional_transform(key, value)
            for key, value in self.page.table_data.items()
        }
        self.enumraw = Enumdict(self.transformed)

    def __repr__(self):
        # Follow the first entry at every level to show one sample form.
        example = self.transformed
        while isinstance(example, dict) and example:
            example = next(iter(example.values()))
        if isinstance(example, dict):
            # No data at all, or the first branch ends in an empty table.
            summary = "Empty"
        else:
            summary = repr(example) + ", ..."
        return f"({self.__class__.__name__}: {summary})"

    def inflect(self, *key_chain):
        """Follow ``key_chain`` through the nested tables and return the result.

        Returns the stored form when the chain reaches a leaf, or an Enumdict
        of the remaining choices when the chain stops earlier.

        Raises:
            ValueError: if a key is missing at its level, or if keys remain
                after a leaf form has already been reached.
        """
        # NOTE(review): ``_`` is not defined in the visible part of this
        # module; presumably it is installed as a builtin through
        # gettext.install() -- confirm before using this class standalone.
        inner = self.enumraw
        for depth, key in enumerate(key_chain):
            if not isinstance(inner, Enumdict):
                # A leaf was reached but keys remain. Indexing further would
                # index into the form itself (TypeError, or a single
                # character for an integer key).
                used = '.'.join(
                    repr(k.value if isinstance(k, Enum) else k)
                    for k in key_chain[:depth]
                )
                err_msg = _("Cannot inflect. {} is already a single form; unexpected extra key {}.")
                raise ValueError(err_msg.format(used, repr(key)))
            try:
                inner = inner[key]
            except KeyChainError as err:
                # ``inner`` is still the level where the lookup failed, so its
                # keys are the valid alternatives.
                keys_str = '.'.join(repr(k) for k in [*err.previous_keys, err.key])
                other_choices = ', '.join(repr(k) for k in inner.source)
                err_msg = _("Cannot inflect. Missing data for: {} . Did you mean {}?")
                raise ValueError(err_msg.format(keys_str, other_choices)) from None
        return inner

    # nouns
    def noun_decline(self, number, case):
        """
        number: "Singular" or "Plural"
        case: "Nominativ", "Genitiv", "Dativ", "Akkusativ"
        """
        return self.inflect(NOUN, number, case)

    # adjectives
    def adjective_decline_strong(self, gender, case):
        """Look up ``gender`` then ``case`` in the strong declension section."""
        return self.inflect(ADJ_STRONG, gender, case)

    def adjective_decline_weak(self, gender, case):
        """Look up ``gender`` then ``case`` in the weak declension section."""
        return self.inflect(ADJ_WEAK, gender, case)

    def adjective_decline_mixed(self, gender, case):
        """Look up ``gender`` then ``case`` in the mixed declension section."""
        return self.inflect(ADJ_MIXED, gender, case)

    def adjective_compare(self, degree):
        """Look up ``degree`` in the comparison section."""
        return self.inflect(ADJ_COMPARE, degree)

    # verbs
    def verb_conjugate(self, mood, tense, person):
        """Look up ``tense`` then ``person`` in the section named by ``mood``."""
        return self.inflect(mood, tense, person)

    def verb_imperative(self, person):
        """Look up ``person`` in the imperative section."""
        return self.inflect(VERB_IMPERATIVE, person)

    def verb_infinitive_forms(self, form):
        """Look up ``form`` in the non-finite forms section."""
        return self.inflect(VERB_INFINITIVE_FORMS, form)
|
||
|
||
def legend_left_transform(structure):
    """Build a dict from the first table only, pairing its lists position-wise.

    The first list of ``structure[0]`` supplies the keys and the second one
    the values.
    """
    first_table = structure[0]
    pairs = zip(*first_table)
    return dict(pairs)
|
||
|
||
def legend_top_transform(structure):
    """Build a dict from the first list of every table.

    Each of those first lists must hold exactly two items, a key followed by
    its value.
    """
    first_lists = [table[0] for table in structure]
    return dict(first_lists)
|
||
|
||
def hidden_title_transform(structure):
    """Turn (legend, content) tables into ``{title: {legend item: content item}}``.

    The first item of each content list is the table's title. The first
    legend item is skipped and the remaining legend and content items are
    paired position-wise.
    """
    return {
        content[0]: dict(zip(legend[1:], content[1:]))
        for legend, content in structure
    }
|
||
|
||
def square_transform(structure):
    """Turn (legend, column) tables into ``{column header: {row label: cell}}``.

    Row labels come from the first table's legend, minus its first item.
    Each column's first item is its header and the rest are its cells.
    """
    _corner, *row_labels = structure[0][0]
    table = {}
    for _legend, column in structure:
        header, *cells = column
        table[header] = dict(zip(row_labels, cells))
    return table
|
||
def conditional_transform(key, structure):
    """Apply the transformation registered for section ``key`` to ``structure``.

    Raises KeyError when ``key`` has no entry in ``table_transformations``.
    """
    return table_transformations[key](structure)
|
||
|
||
# Maps each top-level section title to the function that converts that
# section's raw table_data into nested dicts.
table_transformations = {
    # noun section
    NOUN: hidden_title_transform,
    # verb sections
    VERB_INDICATIVE: hidden_title_transform,
    VERB_SUBJUNCTIVE_I: hidden_title_transform,
    VERB_SUBJUNCTIVE_II: hidden_title_transform,
    VERB_IMPERATIVE: legend_left_transform,
    VERB_INFINITIVE_FORMS: legend_top_transform,
    # adjective sections
    ADJ_COMPARE: legend_top_transform,
    ADJ_STRONG: square_transform,
    ADJ_WEAK: square_transform,
    ADJ_MIXED: square_transform,
}
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
class DudenPage:
    """Thin wrapper around a parsed page that locates its titled divisions."""

    def __init__(self, soup):
        self.soup = soup

    def division(self, title):
        """Return the first ``div.division`` whose ``h2.division__title`` text equals ``title``.

        Raises KeyError(title) when no division matches.
        """
        for candidate in self.soup.find_all("div", class_="division"):
            heading = candidate.find("h2", class_="division__title")
            if not heading:
                # division without a title element
                continue
            if heading.text == title:
                return candidate
        raise KeyError(title)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
import copy | ||
|
||
from .base import DudenPage | ||
|
||
|
||
class GrammarPage:
    """
    Parses pages like
        /deklination/{substantive,adjektive}/{word}
        /konjugation/{word}
    """

    def __init__(self, soup):
        self.page = DudenPage(soup)

    @property
    def table_data(self):
        """Parsed tables of the "Grammatik" division, recomputed on every access."""
        grammar_division = self.page.division("Grammatik")
        return parse_grammar(grammar_division)
|
||
|
||
def parse_grammar(division):
    """Map each section heading (``h3`` text) to its parsed tables.

    ``division.div(...)`` is BeautifulSoup shorthand: it navigates to the
    first ``<div>`` inside ``division`` and searches that tag for the
    ``con-dec__wrapper`` elements.
    """
    wrappers = division.div(class_="con-dec__wrapper")
    return {wrapper.h3.text: parse_igroup(wrapper) for wrapper in wrappers}
|
||
|
||
def parse_igroup(ig):
    """Parse every ``accordion-table`` found under the first ``<div>`` of ``ig``."""
    tables = ig.div(class_="accordion-table")
    return list(map(parse_actable, tables))
|
||
|
||
def parse_actable(actable):
    """Parse every ``<ul>`` inside ``actable`` into a list of item texts."""
    lists = actable.find_all("ul")
    return list(map(parse_ul, lists))
|
||
|
||
def parse_ul(ul):
    """Return the parsed text of every ``<li>`` inside ``ul``."""
    items = ul.find_all("li")
    return list(map(parse_li, items))
|
||
|
||
def parse_li(li):
    """Return the text of ``li`` with its first ``<sup>`` element removed.

    The removal is done on a copy, so the caller's tag is left untouched.
    """
    clone = copy.copy(li)
    try:
        clone.sup.extract()
    except AttributeError:
        # No <sup> present: ``clone.sup`` is None and has no extract().
        pass
    return clone.text
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.