Commit b52df25
reviewed decomposition and affix search logic
adbar committed on Aug 6, 2021 (parent: 4839201)
Showing 2 changed files with 48 additions and 32 deletions.
63 changes: 38 additions & 25 deletions simplemma/simplemma.py
@@ -146,44 +146,52 @@ def _greedy_search(candidate, datadict, steps=2, distance=4):
 
 
 def _decompose(token, datadict, affixlen=0):
-    candidate = None
+    candidate, plan_b = None, None
     for count, prefix in enumerate(token, start=1): # AFFIXLEN, MINCOMPLEN
         if len(token[:-count]) < MINCOMPLEN:
             break
         length = count + affixlen
         if len(token[:-length]) == 0:
             continue
-        part1, part2 = _simple_search(token[:-length], datadict), \
-                       _simple_search(token[-count:].capitalize(), datadict)
-        if part1 is not None:
+        part1, part1_aff, part2 = token[:-count], token[:-length], token[-count:]
+        lempart1, lempart2 = _simple_search(part1, datadict), \
+                             _simple_search(part2.capitalize(), datadict)
+        if lempart1 is not None:
             #print(part1, part2, affixlen, count)
             # maybe an affix? discard it
             if affixlen == 0 and count <= AFFIXLEN:
-                candidate = part1
+                candidate = lempart1
                 break
-        elif part2 is not None:
-            newcandidate = _greedy_search(part2, datadict)
+        elif lempart2 is not None:
+            # candidate must be shorter
+            newcandidate = _greedy_search(part2.capitalize(), datadict) # lempart2?
             # shorten the second known part of the token
-            if newcandidate and len(newcandidate) < len(token[-count:]):
-                candidate = ''.join([token[:-count], newcandidate.lower()])
+            if newcandidate and len(newcandidate) < len(part2):
+                candidate = ''.join([part1, newcandidate.lower()])
             else:
-                newcandidate = _greedy_search(part2.capitalize(), datadict)
-                # shorten the second known part of the token
-                if newcandidate and len(newcandidate) < len(token[-count:]):
-                    candidate = ''.join([token[:-count], newcandidate.lower()])
-                # try without capitalizing
-                else:
-                    newcandidate = _simple_search(token[-count:], datadict)
-                    if newcandidate and len(newcandidate) < len(token[-count:]):
-                        candidate = ''.join([token[:-count], newcandidate.lower()])
+                # backup
+                newcandidate = _greedy_search(part2, datadict)
+                # shorten the second known part of the token
+                if newcandidate and len(newcandidate) < len(part2):
+                    candidate = ''.join([part1, newcandidate.lower()])
             # backup: equal length or further candidates accepted
             if candidate is None:
                 # try without capitalizing
-                if newcandidate and len(newcandidate) <= len(token[-count:]):
-                    candidate = ''.join([token[:-count], newcandidate.lower()])
-                # even greedier: if candidate is not None:
+                newcandidate = _simple_search(part2, datadict)
+                if newcandidate and len(newcandidate) <= len(part2):
+                    candidate = ''.join([part1, newcandidate.lower()])
+                # even greedier
+                else:
+                    # with capital letter
+                    #print(part1, part2, affixlen, count, newcandidate)
+                    if len(lempart2) < len(part2) + AFFIXLEN:
+                        plan_b = ''.join([part1, lempart2.lower()])
+                        #print(part1, part2, affixlen, count, newcandidate, planb)
+                    elif newcandidate and len(newcandidate) < len(part2) + AFFIXLEN:
+                        plan_b = ''.join([part1, newcandidate.lower()])
+                        #print(part1, part2, affixlen, count, newcandidate, planb)
             break
-    return candidate
+    return candidate, plan_b
 
 
 def _dehyphen(token, datadict, greedy):
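Note — a toy run of the reworked helper, for orientation; this is a hypothetical sketch, not part of the commit. The dictionary entry is invented, and it assumes the module constant AFFIXLEN is at least 1 and that _simple_search returns the stored lemma on an exact match.

# hypothetical sketch: _decompose now returns a pair (candidate, plan_b)
from simplemma.simplemma import _decompose  # module-private helper

toy_dict = {'Atomdeal': 'Atomdeal'}  # invented wordform -> lemma entry
candidate, plan_b = _decompose('Atomdeals', toy_dict)
# at count=1, part1 = 'Atomdeal' is found in the dictionary and
# count <= AFFIXLEN, so the trailing 's' is discarded as an affix:
# candidate == 'Atomdeal', plan_b stays None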
@@ -210,11 +218,16 @@ def _dehyphen(token, datadict, greedy):
 
 
 def _affix_search(wordform, datadict):
-    candidate = None
+    candidate, plan_b = None, None
     for l in range(0, AFFIXLEN+1):
-        candidate = _decompose(wordform, datadict, affixlen=l)
+        candidate, bufferstring = _decompose(wordform, datadict, affixlen=l)
         if candidate is not None:
             break
+        if bufferstring is not None:
+            plan_b = bufferstring
+    # exceptionally accept a longer solution
+    if candidate is None and plan_b is not None:
+        candidate = plan_b
     return candidate
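Note — the plan_b channel relaxes the acceptance rule: a reconstruction may now be slightly longer than the raw tail, as long as it stays under len(part2) + AFFIXLEN, and it is only consulted after every affix length has failed to yield a regular candidate. A self-contained sketch of that rule, with AFFIXLEN = 2 assumed for illustration:

AFFIXLEN = 2  # assumed value for this sketch, not read from the library

def accept(part1, part2, tail_lemma):
    # regular path: the lemma must be strictly shorter than the tail
    if len(tail_lemma) < len(part2):
        return part1 + tail_lemma.lower()
    # plan_b path: equal-length or slightly longer lemmas, exceptionally accepted
    if len(tail_lemma) < len(part2) + AFFIXLEN:
        return part1 + tail_lemma.lower()
    return None

print(accept('herunter', 'fährt', 'Fahren'))  # 'herunterfahren', via plan_b only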


@@ -249,7 +262,7 @@ def _return_lemma(token, datadict, greedy=True, lang=None):
         if newcandidate is not None:
             candidate = newcandidate
     # stop here in some cases
-    if len(token) <= 9 or greedy is False:
+    if len(token) <= 8 or greedy is False:
         return candidate
     # greedy subword decomposition: suffix/affix search
     if candidate is None:
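Note — the early-exit threshold in _return_lemma drops from 9 to 8 characters, so 9-letter tokens now reach the greedy suffix/affix search. A quick check (token choices are illustrative):

for token in ('Webseite', 'Atomdeals'):
    print(token, len(token), 'early exit' if len(token) <= 8 else 'decompose')
# Webseite 8 early exit   -> unchanged behaviour
# Atomdeals 9 decompose   -> previously returned early, now decomposed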
17 changes: 10 additions & 7 deletions tests/test_simplemma.py
@@ -96,28 +96,31 @@ def test_subwords():
     assert simplemma.lemmatize('Spitzenposten', mydata, greedy=True) == 'Spitzenposten'
     assert simplemma.lemmatize('I-Pace', mydata, greedy=True) == 'I-Pace'
     assert simplemma.lemmatize('PCR-Bestätigungstests', mydata, greedy=True) == 'PCR-Bestätigungstest'
-    assert simplemma.lemmatize('Bürgerschaftsabgeordneter', mydata, greedy=True) == 'Bürgerschaftsabgeordneter'
+    assert simplemma.lemmatize('standortübergreifend', mydata, greedy=True) == 'standortübergreifend'
     assert simplemma.lemmatize('obamamäßigsten', mydata, greedy=True) == 'obamamäßig'
     assert simplemma.lemmatize('obamaartigere', mydata, greedy=True) == 'obamaartig'
     assert simplemma.lemmatize('durchgestyltes', mydata, greedy=True) == 'durchgestylt'
     assert simplemma.lemmatize('durchgeknallte', mydata, greedy=True) == 'durchgeknallt'
-    assert simplemma.lemmatize('herunterfährt', mydata, greedy=True) == 'herunterfahren'
     assert simplemma.lemmatize('Atomdeals', mydata, greedy=True) == 'Atomdeal'
-    #assert simplemma.lemmatize('Abholservices', mydata, greedy=True) == 'Abholservice'
+    assert simplemma.lemmatize('Bürgerschaftsabgeordneter', mydata, greedy=True) == 'Bürgerschaftsabgeordnete'
     #assert simplemma.lemmatize('beständigsten', mydata, greedy=True) == 'beständig'
     #assert simplemma.lemmatize('Funktionärsebene', mydata, greedy=True) == 'Funktionärsebene'
+    #assert simplemma.lemmatize('herunterfährt', mydata, greedy=True) == 'herunterfahren'
+    #assert simplemma.lemmatize('Abholservices', mydata, greedy=True) == 'Abholservice'
     #assert simplemma.lemmatize('zweitstärkster', mydata, greedy=True) == 'zweitstärkste'
     #assert simplemma.lemmatize('Pharmagrößen', mydata, greedy=True) == 'Pharmagroßer'
     #assert simplemma.lemmatize('Anspruchsberechtigten', mydata, greedy=True) == 'Anspruchsberechtigter'
     #assert simplemma.lemmatize('Lichtbild-Ausweis', mydata, greedy=True) == 'Lichtbildausweis'
+    #Spargelstange/stangen, Kapuzenpullis, strafbewehrte/strafbewehrt, fälschungssicheren
 
 
 def test_tokenizer():
     # tokenization and chaining
-    # problem here
-    assert simplemma.simple_tokenizer('200er-Inzidenz 1.000er-Inzidenz St.-Martini-Gemeinde') == ['200er-Inzidenz', '1.000er-Inzidenz', 'St.-Martini-Gemeinde']
-    assert simplemma.simple_tokenizer('360-Grad-Panorama @sebastiankurz 2,5-Zimmer-Wohnung') == ['360-Grad-Panorama', '@sebastiankurz', '2,5-Zimmer-Wohnung']
-    assert simplemma.simple_tokenizer('Covid-19, Covid19, Covid-19-Pandemie') == ['Covid-19', ',', 'Covid19', ',', 'Covid-19-Pandemie']
+    # problem here: WDR5-„Morgenecho“
+    assert simplemma.simple_tokenizer('200er-Inzidenz 1.000er-Inzidenz 5%-Hürde 5-%-Hürde FFP2-Masken St.-Martini-Gemeinde, Lebens-, Liebes- und Arbeitsbedingungen') == ['200er-Inzidenz', '1.000er-Inzidenz', '5%-Hürde', '5-%-Hürde', 'FFP2-Masken', 'St.-Martini-Gemeinde', ',', 'Lebens-', ',', 'Liebes-', 'und', 'Arbeitsbedingungen']
+    assert simplemma.simple_tokenizer('360-Grad-Panorama @sebastiankurz 2,5-Zimmer-Wohnung 1,2-butylketoaldehyde') == ['360-Grad-Panorama', '@sebastiankurz', '2,5-Zimmer-Wohnung', '1,2-butylketoaldehyde']
+    assert simplemma.simple_tokenizer('Covid-19, Covid19, Covid-19-Pandemie https://example.org/covid-test') == ['Covid-19', ',', 'Covid19', ',', 'Covid-19-Pandemie', 'https://example.org/covid-test']
+    assert simplemma.simple_tokenizer('Test 4:1-Auswärtssieg 2,5€ §52, for $5') == ['Test', '4:1-Auswärtssieg', '2,5€', '§52', ',', 'for', '$5']
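Note — a hypothetical session showing how these tests exercise the public API; it assumes the mydata fixture is built with load_data('de'), the library's data-loading entry point at the time:

import simplemma

mydata = simplemma.load_data('de')  # assumption: German data, as in the tests
print(simplemma.lemmatize('Atomdeals', mydata, greedy=True))  # 'Atomdeal'
print(simplemma.simple_tokenizer('Covid-19, Covid19'))
# ['Covid-19', ',', 'Covid19']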

