removed redundant morphemizer (#273)

The 'SSS + Punctuation' morphemizer is redundant now that the 'ignore custom characters' preprocess option has been added.
mortii · Dec 14, 2024 · c4fc68b · c4fc68b
1 parent cf3f930
commit c4fc68b
Show file tree

Hide file tree

Showing 7 changed files with 18 additions and 58 deletions.
diff --git a/ankimorphs/ankimorphs_globals.py b/ankimorphs/ankimorphs_globals.py
@@ -5,7 +5,7 @@
 """
 
 # Semantic Versioning https://semver.org/
-__version__ = "3.3.0"
+__version__ = "4.0.0"
 
 DEV_MODE: bool = False
 

diff --git a/ankimorphs/morphemizers/morphemizer.py b/ankimorphs/morphemizers/morphemizer.py
@@ -7,7 +7,6 @@
 from . import jieba_wrapper, mecab_wrapper, spacy_wrapper
 
 space_char_regex = re.compile(" ")
-space_and_punctuation_pattern = re.compile(r"\w+(?:[-']\w+)*", re.UNICODE)
 
 ####################################################################################################
 # Base Class
@@ -55,7 +54,6 @@ def get_all_morphemizers() -> list[Morphemizer]:
         # therefore always included since nothing has to be installed
         morphemizers = [
             SimpleSpaceMorphemizer(),
-            SimpleSpaceAndPunctuationMorphemizer(),
         ]
 
         _mecab = MecabMorphemizer()
@@ -120,43 +118,6 @@ def get_description(self) -> str:
         return "AnkiMorphs: Simple Space Splitter"
 
 
-####################################################################################################
-# Simple Space and Punctuation Morphemizer
-####################################################################################################
-
-
-class SimpleSpaceAndPunctuationMorphemizer(Morphemizer):
-    """
-    Extension of the SSS morphemizer that targets english and french, and should work
-    on words like:
-        - "mother-in-law"
-        - "quelqu'un"
-    """
-
-    def _get_morphemes_from_expr(self, expression: str) -> list[Morpheme]:
-        # Regex:
-        # The '\w' character matches alphanumeric and underscore characters
-        #
-        # To also match words that have multiple hyphens or apostrophes, we add
-        # the optional group: '([-']\w+)*'
-        #
-        # re.findall() treats groups in a special way:
-        #   "If one or more capturing groups are present in the pattern, return
-        #    a list of groups; this will be a list of tuples if the pattern
-        #    has more than one group."
-        # We don't want this to happen, we want a pure list of matches. To prevent
-        # this we prepend '?:' to make the group non-capturing.
-
-        word_list = [
-            word.lower()
-            for word in re.findall(space_and_punctuation_pattern, expression)
-        ]
-        return [Morpheme(lemma=word, inflection=word) for word in word_list]
-
-    def get_description(self) -> str:
-        return "AnkiMorphs: SSS + Punctuations"
-
-
 ####################################################################################################
 # spaCy Morphemizer
 ####################################################################################################

diff --git a/docs/src/img/morphemizer-selection.png b/docs/src/img/morphemizer-selection.png
diff --git a/docs/src/user_guide/setup/settings/note-filter.md b/docs/src/user_guide/setup/settings/note-filter.md
@@ -95,22 +95,14 @@ how to add morphemizers.
 ![morphemizer-selection.png](../../../img/morphemizer-selection.png)
 
 
-AnkiMorphs comes bundled with two morphemizers: [Simple Space Splitter](#simple-space-splitter) and
-[SSS + Punctuation](#simple-space-splitter--punctuation). These morphemizers are very basic and do not perform any
-linguistic analysis, meaning they won't provide accurate [lemmas](../../glossary.md#lemma). Therefore, you should only
-use them if no other morphemizers are available for your particular language.
-
 ### Simple Space Splitter
-As the name suggests, this morphemizer just splits words based on whitespace.
+As the name suggests, this morphemizer just splits words based on whitespace and does not perform any
+linguistic analysis, meaning it won't provide accurate [lemmas](../../glossary.md#lemma). You should only
+use this if no other morphemizers are available for your particular target language.
 
-### Simple Space Splitter + Punctuation
-This morphemizer extends the [Simple Space Splitter](#simple-space-splitter) to preserve words containing hyphens (-)
-and apostrophes ('), ensuring that words like these are not split apart:
-- `mother-in-law`
-- `quelqu'un`
+If you use this morphemizer, punctuation and other unwanted characters will likely be included in the morphs. To fix this,
+you can specify custom characters to ignore in [the preprocess settings](preprocess.md).
 
-> **Note**: This morphemizer may not work correctly for some languages, such as Arabic. If you encounter this issue,
-> try using the Simple Space Splitter instead.
 
 ## Morph Priority
 

diff --git a/test/fake_configs.py b/test/fake_configs.py
@@ -18,7 +18,7 @@
 default_config_filter[FilterKeys.NOTE_TYPE] = "Basic"
 default_config_filter[FilterKeys.FIELD] = "Front"
 default_config_filter[FilterKeys.MORPHEMIZER_DESCRIPTION] = (
-    "AnkiMorphs: SSS + Punctuations"
+    "AnkiMorphs: Simple Space Splitter"
 )
 default_config_filter[FilterKeys.EXTRA_ALL_MORPHS] = True
 default_config_filter[FilterKeys.EXTRA_ALL_MORPHS_COUNT] = True
@@ -65,6 +65,10 @@
 # fmt: off
 config_known_morphs_enabled = copy.deepcopy(default_config_dict)
 config_known_morphs_enabled[ConfigKeys.READ_KNOWN_MORPHS_FOLDER] = True
+config_known_morphs_enabled[ConfigKeys.PREPROCESS_IGNORE_CUSTOM_CHARACTERS] = True
+config_known_morphs_enabled[ConfigKeys.PREPROCESS_CUSTOM_CHARACTERS_TO_IGNORE] = (
+    ",.!"
+)
 # fmt: on
 
 

diff --git a/test/tests/recalc_test.py b/test/tests/recalc_test.py
@@ -27,7 +27,7 @@
 
 import pytest
 
-from ankimorphs import ankimorphs_config
+from ankimorphs import ankimorphs_config, text_preprocessing
 from ankimorphs import ankimorphs_globals as am_globals
 from ankimorphs.ankimorphs_config import RawConfigFilterKeys
 from ankimorphs.exceptions import (
@@ -192,6 +192,8 @@ def test_recalc(  # pylint:disable=too-many-locals
     if fake_environment_fixture is None:
         pytest.xfail()
 
+    text_preprocessing.update_translation_table()  # updates custom characters to ignore
+
     actual_collection = fake_environment_fixture.mock_mw.col
     expected_collection = fake_environment_fixture.expected_collection
 

diff --git a/test/tests/space_morphemizer_test.py b/test/tests/space_morphemizer_test.py
@@ -26,8 +26,8 @@ def _fake_environment_fixture() -> Iterator[None]:
     "morphemizer_description, sentence, correct_morphs",
     [
         (
-            "AnkiMorphs: SSS + Punctuations",
-            "Tu es quelqu'un de bien.",  # french test
+            "AnkiMorphs: Simple Space Splitter",
+            "Tu es quelqu'un de bien",  # french test
             {
                 Morpheme("tu", "tu"),
                 Morpheme("es", "es"),
@@ -37,7 +37,7 @@ def _fake_environment_fixture() -> Iterator[None]:
             },
         ),
         (
-            "AnkiMorphs: SSS + Punctuations",
+            "AnkiMorphs: Simple Space Splitter",
             "My mother-in-law is wonderful",  # english test
             {
                 Morpheme("my", "my"),
@@ -72,4 +72,5 @@ def test_simple_space_splitters(
     assert len(extracted_morphs) == len(correct_morphs)
 
     for morph in extracted_morphs:
+        # print(f"morph: {morph.inflection}")
         assert morph in correct_morphs