Skip to content

Commit

Permalink
removed redundant morphemizer (#273)
Browse files Browse the repository at this point in the history
The 'SSS + Punctuation' morphemizer is redundant now that the 'ignore custom characters' preprocess option has been added.
  • Loading branch information
mortii committed Dec 14, 2024
1 parent cf3f930 commit c4fc68b
Show file tree
Hide file tree
Showing 7 changed files with 18 additions and 58 deletions.
2 changes: 1 addition & 1 deletion ankimorphs/ankimorphs_globals.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
"""

# Semantic Versioning https://semver.org/
__version__ = "3.3.0"
__version__ = "4.0.0"

DEV_MODE: bool = False

Expand Down
39 changes: 0 additions & 39 deletions ankimorphs/morphemizers/morphemizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
from . import jieba_wrapper, mecab_wrapper, spacy_wrapper

space_char_regex = re.compile(" ")
space_and_punctuation_pattern = re.compile(r"\w+(?:[-']\w+)*", re.UNICODE)

####################################################################################################
# Base Class
Expand Down Expand Up @@ -55,7 +54,6 @@ def get_all_morphemizers() -> list[Morphemizer]:
# therefore always included since nothing has to be installed
morphemizers = [
SimpleSpaceMorphemizer(),
SimpleSpaceAndPunctuationMorphemizer(),
]

_mecab = MecabMorphemizer()
Expand Down Expand Up @@ -120,43 +118,6 @@ def get_description(self) -> str:
return "AnkiMorphs: Simple Space Splitter"


####################################################################################################
# Simple Space and Punctuation Morphemizer
####################################################################################################


class SimpleSpaceAndPunctuationMorphemizer(Morphemizer):
"""
Extension of the SSS morphemizer that targets english and french, and should work
on words like:
- "mother-in-law"
- "quelqu'un"
"""

def _get_morphemes_from_expr(self, expression: str) -> list[Morpheme]:
# Regex:
# The '\w' character matches alphanumeric and underscore characters
#
# To also match words that have multiple hyphens or apostrophes, we add
# the optional group: '([-']\w+)*'
#
# re.findall() treats groups in a special way:
# "If one or more capturing groups are present in the pattern, return
# a list of groups; this will be a list of tuples if the pattern
# has more than one group."
# We don't want this to happen, we want a pure list of matches. To prevent
# this we prepend '?:' to make the group non-capturing.

word_list = [
word.lower()
for word in re.findall(space_and_punctuation_pattern, expression)
]
return [Morpheme(lemma=word, inflection=word) for word in word_list]

def get_description(self) -> str:
return "AnkiMorphs: SSS + Punctuations"


####################################################################################################
# spaCy Morphemizer
####################################################################################################
Expand Down
Binary file modified docs/src/img/morphemizer-selection.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
18 changes: 5 additions & 13 deletions docs/src/user_guide/setup/settings/note-filter.md
Original file line number Diff line number Diff line change
Expand Up @@ -95,22 +95,14 @@ how to add morphemizers.
![morphemizer-selection.png](../../../img/morphemizer-selection.png)


AnkiMorphs comes bundled with two morphemizers: [Simple Space Splitter](#simple-space-splitter) and
[SSS + Punctuation](#simple-space-splitter--punctuation). These morphemizers are very basic and do not perform any
linguistic analysis, meaning they won't provide accurate [lemmas](../../glossary.md#lemma). Therefore, you should only
use them if no other morphemizers are available for your particular language.

### Simple Space Splitter
As the name suggests, this morphemizer just splits words based on whitespace.
As the name suggests, this morphemizer just splits words based on whitespace and does not perform any
linguistic analysis, meaning it won't provide accurate [lemmas](../../glossary.md#lemma). You should only
use it if no other morphemizers are available for your particular target language.

### Simple Space Splitter + Punctuation
This morphemizer extends the [Simple Space Splitter](#simple-space-splitter) to preserve words containing hyphens (-)
and apostrophes ('), ensuring that words like these are not split apart:
- `mother-in-law`
- `quelqu'un`
If you use this morphemizer, punctuation and other unwanted characters will likely be included in the morphs. To fix this,
you can specify custom characters to ignore in [the preprocess settings](preprocess.md).

> **Note**: This morphemizer may not work correctly for some languages, such as Arabic. If you encounter this issue,
> try using the Simple Space Splitter instead.

## Morph Priority

Expand Down
6 changes: 5 additions & 1 deletion test/fake_configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
default_config_filter[FilterKeys.NOTE_TYPE] = "Basic"
default_config_filter[FilterKeys.FIELD] = "Front"
default_config_filter[FilterKeys.MORPHEMIZER_DESCRIPTION] = (
"AnkiMorphs: SSS + Punctuations"
"AnkiMorphs: Simple Space Splitter"
)
default_config_filter[FilterKeys.EXTRA_ALL_MORPHS] = True
default_config_filter[FilterKeys.EXTRA_ALL_MORPHS_COUNT] = True
Expand Down Expand Up @@ -65,6 +65,10 @@
# fmt: off
config_known_morphs_enabled = copy.deepcopy(default_config_dict)
config_known_morphs_enabled[ConfigKeys.READ_KNOWN_MORPHS_FOLDER] = True
config_known_morphs_enabled[ConfigKeys.PREPROCESS_IGNORE_CUSTOM_CHARACTERS] = True
config_known_morphs_enabled[ConfigKeys.PREPROCESS_CUSTOM_CHARACTERS_TO_IGNORE] = (
",.!"
)
# fmt: on


Expand Down
4 changes: 3 additions & 1 deletion test/tests/recalc_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@

import pytest

from ankimorphs import ankimorphs_config
from ankimorphs import ankimorphs_config, text_preprocessing
from ankimorphs import ankimorphs_globals as am_globals
from ankimorphs.ankimorphs_config import RawConfigFilterKeys
from ankimorphs.exceptions import (
Expand Down Expand Up @@ -192,6 +192,8 @@ def test_recalc( # pylint:disable=too-many-locals
if fake_environment_fixture is None:
pytest.xfail()

text_preprocessing.update_translation_table() # updates custom characters to ignore

actual_collection = fake_environment_fixture.mock_mw.col
expected_collection = fake_environment_fixture.expected_collection

Expand Down
7 changes: 4 additions & 3 deletions test/tests/space_morphemizer_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,8 @@ def _fake_environment_fixture() -> Iterator[None]:
"morphemizer_description, sentence, correct_morphs",
[
(
"AnkiMorphs: SSS + Punctuations",
"Tu es quelqu'un de bien.", # french test
"AnkiMorphs: Simple Space Splitter",
"Tu es quelqu'un de bien", # french test
{
Morpheme("tu", "tu"),
Morpheme("es", "es"),
Expand All @@ -37,7 +37,7 @@ def _fake_environment_fixture() -> Iterator[None]:
},
),
(
"AnkiMorphs: SSS + Punctuations",
"AnkiMorphs: Simple Space Splitter",
"My mother-in-law is wonderful", # english test
{
Morpheme("my", "my"),
Expand Down Expand Up @@ -72,4 +72,5 @@ def test_simple_space_splitters(
assert len(extracted_morphs) == len(correct_morphs)

for morph in extracted_morphs:
# print(f"morph: {morph.inflection}")
assert morph in correct_morphs

0 comments on commit c4fc68b

Please sign in to comment.