WIP: grammar
radomirbosak committed Sep 26, 2022
1 parent 69534f6 commit 3a19434
Showing 8 changed files with 331 additions and 48 deletions.
10 changes: 8 additions & 2 deletions duden/cli.py
@@ -17,7 +17,7 @@
     print_string_or_list,
     print_tree_of_strings,
 )
-from .search import get, search
+from .search import get, grammar, search


 def display_word(word, args):
@@ -68,7 +68,13 @@ def display_word(word, args):
         for spelling in word.alternative_spellings:
             print(spelling)
     elif args.grammar:
-        display_grammar(word, args.grammar)
+        if not word.grammar_link:
+            print(red("No grammar info for word " + word.name))
+            sys.exit(1)
+
+        data = grammar(word.grammar_link)
+
+        display_grammar(data, args.grammar)
     elif args.export:
         yaml_string = yaml.dump(word.export(), sort_keys=False, allow_unicode=True)
         print(yaml_string, end="")
49 changes: 15 additions & 34 deletions duden/display.py
@@ -7,46 +7,27 @@
 from crayons import blue, white, yellow  # pylint: disable=no-name-in-module


-def display_grammar(word, grammar_args):
+def display_grammar(data, grammar_args):
     """
     Display word grammar forms, corresponds to --grammar switch
     """
-    grammar_struct = word.grammar_raw
-    if not grammar_struct:
-        return
-
-    grammar_tokens = [token.lower() for token in grammar_args.split(",")]
-
-    # filter out grammar forms which do not match provided keys
-    tag_columns = []
-    value_column = []
-    for keys, value in word.grammar_raw:
-        lkeys = {key.lower() for key in keys}
-
-        if not (grammar_args == "ALL" or lkeys.issuperset(grammar_tokens)):
-            continue
-
-        reduced_keys = lkeys.difference(grammar_tokens)
-
-        tag_columns.append(list(reduced_keys))
-        value_column.append(value)
-
-    # determine the width of the table
-    max_keys_count = max(map(len, tag_columns))
-
-    # if provided keys uniquely determine the value(s), display a 1-col table
-    if max_keys_count == 0:
-        display_table([[value] for value in value_column])
-        return
-
-    # otherwise make a nice "| key1 key2 | value |" table
-    table = []
-    for keys, value in zip(tag_columns, value_column):
-        padding = [""] * (max_keys_count - len(keys))
-        row = keys + padding + [blue("|")] + [value]
-        table.append(row)
-
-    display_table(table)
+    import yaml
+
+    # print(grammar_args)
+    if grammar_args == "ALL":
+        grammar_args = ""
+    grammar_tokens = [token.lower() for token in grammar_args.split(",") if token != ""]
+
+    subdata = data
+    for token in grammar_tokens:
+        subdata = {key.lower(): value for key, value in subdata.items()}
+        subdata = subdata[token.lower()]
+
+    if isinstance(subdata, str):
+        print(subdata)
+    else:
+        s = yaml.dump(subdata, indent=2, allow_unicode=True)
+        print(s)


 def display_table(table, cell_spacing=" "):
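The reworked `display_grammar` above no longer builds a table from `word.grammar_raw`; it descends a nested mapping by the comma-separated `--grammar` keys (matched case-insensitively) and YAML-dumps whatever remains. A minimal sketch of that lookup, using invented sample data shaped like a parsed grammar table (not real Duden output):

```python
import yaml

# Invented sample; real data comes from the grammar-page parser added below.
sample = {
    "Deklination": {
        "Singular": {"Nominativ": "das Kind", "Genitiv": "des Kindes"},
        "Plural": {"Nominativ": "die Kinder", "Genitiv": "der Kinder"},
    }
}

subdata = sample
for token in ["deklination", "singular"]:  # e.g. --grammar Deklination,Singular
    # lower-case the keys on each level so the match is case-insensitive
    subdata = {key.lower(): value for key, value in subdata.items()}
    subdata = subdata[token]

print(yaml.dump(subdata, indent=2, allow_unicode=True))
# prints the remaining Singular forms (Nominativ/Genitiv) as YAML
```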
212 changes: 212 additions & 0 deletions duden/inflection.py
@@ -0,0 +1,212 @@
from enum import Enum

from .page.grammar import GrammarPage

NOUN = "Deklination"
ADJ_STRONG = 'Starke Beugung (ohne Artikel)'
ADJ_WEAK = 'Schwache Beugung (mit Artikel)'
ADJ_MIXED = 'Gemischte Beugung (mit ein, kein, Possessivpronomen u. a.)'
ADJ_COMPARE = 'Steigerungsformen'

VERB_INDICATIVE = "Indikativ"
VERB_SUBJUNCTIVE_I = "Konjunktiv I"
VERB_SUBJUNCTIVE_II = "Konjunktiv II"

VERB_IMPERATIVE = "Imperativ"
VERB_INFINITIVE_FORMS = "Infinite Formen"


class Number(Enum):
    SINGULAR = "Singular"
    PLURAL = "Plural"


class Case(Enum):
    NOMINATIVE = "Nominativ"
    GENITIVE = "Genitiv"
    DATIVE = "Dativ"
    ACCUSATIVE = "Akkusativ"

class Gender(Enum):
    MASCULINE = "Maskulinum"
    FEMININE = "Femininum"
    NEUTER = "Neutrum"

class Degree(Enum):
    POSITIVE = "Positiv"
    COMPARATIVE = "Komparativ"
    SUPERLATIVE = "Superlativ"

# TODO: reduce to three categories
class Person(Enum):
    FIRST_SINGULAR = "ich"
    SECOND_SINGULAR = "du"
    THIRD_SINGULAR = "er/sie/es"
    FIRST_PLURAL = "wir"
    SECOND_PLURAL = "ihr"
    THIRD_PLURAL = "sie"

class Mood(Enum):
    INDICATIVE = VERB_INDICATIVE
    SUBJUNCTIVE_I = VERB_SUBJUNCTIVE_I
    SUBJUNCTIVE_II = VERB_SUBJUNCTIVE_II

class Tense(Enum):
    PRESENT = "Präsens"
    PAST = "Präteritum"
    PERFECT = "Perfekt"
    PAST_PERFECT = "Plusquamperfekt"
    FUTURE = "Futur I"
    FUTURE_PERFECT = "Futur II"

# TODO: rely only on number
class ImperativePerson(Enum):
    PERSON_2_SINGULAR = '2. Person Singular [du]'
    PERSON_2_PLURAL = '2. Person Plural [ihr]'

class InfinitiveForm(Enum):
    INFINITIVE_WITH_ZU = 'Infinitiv mit zu'
    PARTICIPLE_I = 'Partizip I'
    PARTICIPLE_II = 'Partizip II'


class KeyChainError(KeyError):
    """Variant of KeyError which stores previously accessed keys of a nested dict"""
    def __init__(self, key, previous_keys):
        super().__init__(key)
        self.key = key
        self.previous_keys = previous_keys

class Enumdict:
    """
    A dict wrapper with these properties
    * Enums behave as their raw values when used as keys (d["Singular"] == d[Number.SINGULAR])
    * Nested inner dicts are also Enumdicts and store the key path they were accessed with
    """
    def __init__(self, source, key_prefix=None):
        self.source = source
        self.key_prefix = key_prefix or []  # used for error messages

    def __getitem__(self, key):
        real_key = key.value if isinstance(key, Enum) else key
        try:
            value = self.source[real_key]
        except KeyError:
            raise KeyChainError(real_key, self.key_prefix) from None
        new_prefix = self.key_prefix + [real_key]
        return Enumdict(value, key_prefix=new_prefix) if isinstance(value, dict) else value

    def __repr__(self):
        return f"{self.__class__.__name__}({repr(self.source)})"

class Inflector:

    def __init__(self, soup):
        self.page = GrammarPage(soup)
        self.transformed = {
            key: conditional_transform(key, value)
            for key, value in self.page.table_data.items()
        }
        self.enumraw = Enumdict(self.transformed)

    def __repr__(self):
        if not self.transformed:
            example = "Empty"
        else:
            example = self.transformed
            while isinstance(example, dict):
                example = list(example.values())[0]
            example = repr(example) + ", ..."
        return f"({self.__class__.__name__}: {example})"

    def inflect(self, *key_chain):
        inner = self.enumraw
        try:
            for key in key_chain:
                inner = inner[key]
        except KeyChainError as err:
            keys_str = '.'.join(repr(key) for key in (err.previous_keys + [err.key]))
            other_choices = ', '.join(repr(key) for key in inner.source.keys())
            err_msg = _("Cannot inflect. Missing data for: {} . Did you mean {}?")
            raise ValueError(err_msg.format(keys_str, other_choices)) from None
        return inner

    # nouns
    def noun_decline(self, number, case):
        """
        number: "Singular" or "Plural"
        case: "Nominativ", "Genitiv", "Dativ", "Akkusativ"
        """
        return self.inflect(NOUN, number, case)

    # adjectives
    def adjective_decline_strong(self, gender, case):
        return self.inflect(ADJ_STRONG, gender, case)

    def adjective_decline_weak(self, gender, case):
        return self.inflect(ADJ_WEAK, gender, case)

    def adjective_decline_mixed(self, gender, case):
        return self.inflect(ADJ_MIXED, gender, case)

    def adjective_compare(self, degree):
        return self.inflect(ADJ_COMPARE, degree)

    # verbs
    def verb_conjugate(self, mood, tense, person):
        return self.inflect(mood, tense, person)

    def verb_imperative(self, person):
        return self.inflect(VERB_IMPERATIVE, person)

    def verb_infinitive_forms(self, form):
        return self.inflect(VERB_INFINITIVE_FORMS, form)


def legend_left_transform(structure):
    return dict(zip(*structure[0]))


def legend_top_transform(structure):
    return dict(s[0] for s in structure)


def hidden_title_transform(structure):
    resmap = {}
    for legend, content in structure:
        name = content[0]
        mapping = dict(zip(legend[1:], content[1:]))
        resmap[name] = mapping
    return resmap


def square_transform(structure):
    corner, *left_legend = structure[0][0]
    res = {}
    for _, column in structure:
        top_key, *values = column
        res[top_key] = dict(zip(left_legend, values))
    return res

def conditional_transform(key, structure):
    transformation = table_transformations[key]
    return transformation(structure)


# how individual top-level section table_data must be transformed
table_transformations = {
    # noun
    NOUN: hidden_title_transform,
    # verb
    VERB_INDICATIVE: hidden_title_transform,
    VERB_SUBJUNCTIVE_I: hidden_title_transform,
    VERB_SUBJUNCTIVE_II: hidden_title_transform,
    VERB_IMPERATIVE: legend_left_transform,
    VERB_INFINITIVE_FORMS: legend_top_transform,
    # adjective sections
    ADJ_COMPARE: legend_top_transform,
    ADJ_STRONG: square_transform,
    ADJ_WEAK: square_transform,
    ADJ_MIXED: square_transform,
}
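A rough usage sketch for `Enumdict` (not part of the commit; the nested data here is invented, real data comes from `Inflector.transformed`):

```python
from duden.inflection import Case, Enumdict, Number

data = Enumdict({"Singular": {"Nominativ": "das Kind"}})

# Enum members and their raw string values are interchangeable as keys
assert data["Singular"]["Nominativ"] == data[Number.SINGULAR][Case.NOMINATIVE]

# a failed lookup raises KeyChainError, which remembers the path walked so far
try:
    data[Number.SINGULAR][Case.GENITIVE]
except KeyError as err:
    print(err.previous_keys, err.key)  # ['Singular'] Genitiv
```

And a guess at the intermediate `structure` shape that `square_transform` expects, a list of `[legend, column]` pairs as produced by the grammar-page parser below; the values are invented:

```python
from duden.inflection import square_transform

structure = [
    [["", "Nominativ", "Genitiv"], ["Maskulinum", "starker Mann", "starken Mannes"]],
    [["", "Nominativ", "Genitiv"], ["Femininum", "starke Frau", "starker Frau"]],
]
print(square_transform(structure))
# {'Maskulinum': {'Nominativ': 'starker Mann', 'Genitiv': 'starken Mannes'},
#  'Femininum': {'Nominativ': 'starke Frau', 'Genitiv': 'starker Frau'}}
```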
Empty file added duden/page/__init__.py
10 changes: 10 additions & 0 deletions duden/page/base.py
@@ -0,0 +1,10 @@
class DudenPage:
    def __init__(self, soup):
        self.soup = soup

    def division(self, title):
        for division in self.soup.find_all("div", class_="division"):
            div_title = division.find("h2", class_="division__title")
            if div_title and div_title.text == title:
                return division
        raise KeyError(title)
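`DudenPage.division` scans the page for a `division` block with a matching title. A small sketch on hand-written markup (invented here; only the div/h2 class names match what the lookup expects):

```python
import bs4
from duden.page.base import DudenPage

html = """
<div class="division"><h2 class="division__title">Grammatik</h2><p>...</p></div>
<div class="division"><h2 class="division__title">Synonyme</h2></div>
"""
page = DudenPage(bs4.BeautifulSoup(html, "html.parser"))

grammatik = page.division("Grammatik")  # first <div class="division"> whose <h2> text matches
# page.division("Herkunft") would raise KeyError('Herkunft')
```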
46 changes: 46 additions & 0 deletions duden/page/grammar.py
@@ -0,0 +1,46 @@
import copy

from .base import DudenPage


class GrammarPage:
    """
    Parses pages like
    /deklination/{substantive,adjektive}/{word}
    /konjugation/{word}
    """
    def __init__(self, soup):
        self.page = DudenPage(soup)
        # self.soup = soup

    @property
    def table_data(self):
        return parse_grammar(self.page.division("Grammatik"))


def parse_grammar(division):
    return {
        ig.h3.text: parse_igroup(ig) for ig in division.div(class_="con-dec__wrapper")
    }


def parse_igroup(ig):
    return [parse_actable(actable) for actable in ig.div(class_="accordion-table")]


def parse_actable(actable):
    return [parse_ul(ul) for ul in actable.find_all("ul")]


def parse_ul(ul):
    return [parse_li(li) for li in ul.find_all("li")]


def parse_li(li):
    li = copy.copy(li)
    try:
        li.sup.extract()
    except AttributeError:
        pass
    return li.text
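`parse_li` copies the tag so that stripping a `<sup>` footnote marker does not mutate the parsed tree. For example (markup invented):

```python
import bs4
from duden.page.grammar import parse_li

li = bs4.BeautifulSoup("<li>des Kindes<sup>1</sup></li>", "html.parser").li
print(parse_li(li))  # "des Kindes" - footnote marker removed
print(li.text)       # "des Kindes1" - the original tag is left untouched
```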
14 changes: 14 additions & 0 deletions duden/search.py
@@ -12,10 +12,12 @@
 from xdg.BaseDirectory import xdg_cache_home

 from .common import clear_text
+from .inflection import Inflector
 from .word import DudenWord

 URL_FORM = "https://www.duden.de/rechtschreibung/{word}"
 SEARCH_URL_FORM = "https://www.duden.de/suchen/dudenonline/{word}"
+GRAMMAR_BASE = "https://www.duden.de/{urlpart}"
 DEFAULT_TIMEOUT = 10


@@ -155,3 +157,15 @@ def search(word, exact=True, return_words=True, cache=True):
     if not return_words:
         return urlnames
     return [get(urlname, cache=cache) for urlname in urlnames]
+
+
+@cached_response(prefix="grammar-")
+def request_grammar(urlpart):
+    url = GRAMMAR_BASE.format(urlpart=urlpart)
+    return requests.get(url, timeout=DEFAULT_TIMEOUT).text
+
+
+def grammar(urlpart):
+    response_text = request_grammar(urlpart)
+    soup = bs4.BeautifulSoup(response_text, "html.parser")
+    return Inflector(soup)
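Putting the new pieces together, roughly what the CLI path above does (a sketch, assuming a word whose entry links to a declension or conjugation page via `grammar_link`, as used in cli.py; the exact inflection keys depend on the parsed tables):

```python
from duden.inflection import Case, Number
from duden.search import get, grammar

word = get("Kind")
if word.grammar_link:
    inflector = grammar(word.grammar_link)  # fetches and parses the grammar page (cached)
    print(inflector.noun_decline(Number.SINGULAR, Case.GENITIVE))
```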
