Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ENH] Score Documents - enable matching n-grams #935

Merged
merged 1 commit into from
Feb 3, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/widgets/score-documents.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ Scores documents based on word appearance.
4. If *Send Automatically*, changes are communicated automatically. Alternatively press *Send*.
5. Filter documents based on the document title in the first column. Below is the table with the document titles in the first column and scores in other columns.

**Note**: Score Documents will apply preprocessing from the input Corpus to words before scoring.

Example
-------

Expand Down
5 changes: 3 additions & 2 deletions orangecontrib/text/corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ def _check_arrays(*arrays):

class Corpus(Table):
"""Internal class for storing a corpus."""
NGRAMS_SEPARATOR = " "

def __new__(cls, *args, **kwargs):
if args and isinstance(args[0], Domain) or "domain" in kwargs:
Expand Down Expand Up @@ -447,7 +448,7 @@ def pos_tags(self):
def pos_tags(self, pos_tags):
self._pos_tags = pos_tags

def ngrams_iterator(self, join_with=' ', include_postags=False):
def ngrams_iterator(self, join_with=NGRAMS_SEPARATOR, include_postags=False):
if self.pos_tags is None:
include_postags = False

Expand All @@ -471,7 +472,7 @@ def ngrams_iterator(self, join_with=' ', include_postags=False):
@property
def ngrams(self):
"""generator: Ngram representations of documents."""
return self.ngrams_iterator(join_with=' ')
return self.ngrams_iterator(join_with=self.NGRAMS_SEPARATOR)

def copy(self):
"""Return a copy of the table."""
Expand Down
35 changes: 18 additions & 17 deletions orangecontrib/text/widgets/owscoredocuments.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
QRadioButton,
QTableView,
)

from Orange.data import ContinuousVariable, Domain, StringVariable, Table
from Orange.data.util import get_unique_names
from Orange.util import wrap_callback
Expand All @@ -33,33 +34,34 @@
from sklearn.metrics.pairwise import cosine_similarity

from orangecontrib.text import Corpus
from orangecontrib.text.preprocess import BaseNormalizer, BaseTransformer
from orangecontrib.text.preprocess import BaseNormalizer, NGrams, BaseTokenFilter
from orangecontrib.text.vectorization.document_embedder import (
LANGS_TO_ISO,
DocumentEmbedder,
)
from orangecontrib.text.widgets.utils import enum2int
from orangecontrib.text.widgets.utils.words import create_words_table


def _word_frequency(corpus: Corpus, words: List[str], callback: Callable) -> np.ndarray:
    """Count occurrences of each word/ngram in every document of the corpus.

    Matching is done against the corpus's ngram representation, so multi-word
    terms (e.g. "lorem ipsum") can be scored when the corpus was preprocessed
    with an NGrams preprocessor.

    Returns an (n_documents, n_words) array of raw counts.
    """
    n_docs = len(corpus)
    scores = []
    for doc_idx, doc_ngrams in enumerate(corpus.ngrams):
        frequencies = Counter(doc_ngrams)
        scores.append([frequencies.get(word, 0) for word in words])
        # progress is reported per processed document
        callback((doc_idx + 1) / n_docs)
    return np.array(scores)


def _word_appearance(
    corpus: Corpus, words: List[str], callback: Callable
) -> np.ndarray:
    """Binary indicator of whether each word/ngram occurs in each document.

    Like `_word_frequency`, matching uses the corpus's ngram representation.

    Returns an (n_documents, n_words) float array of 0.0/1.0 values.
    """
    n_docs = len(corpus)
    rows = []
    for doc_idx, doc_ngrams in enumerate(corpus.ngrams):
        # set membership makes the per-word lookup O(1)
        present = set(doc_ngrams)
        rows.append([word in present for word in words])
        callback((doc_idx + 1) / n_docs)
    return np.array(rows).astype(float)


Expand Down Expand Up @@ -123,10 +125,9 @@ def _preprocess_words(
) -> List[str]:
"""
Corpus's tokens can be preprocessed. Since they will not match correctly
with words preprocessors that change words (e.g. normalization) must
be applied to words too.
with words, same preprocessors that preprocess words in corpus
(e.g. normalization) must be applied to words too.
"""
# workaround to preprocess words
# TODO: currently preprocessors work only on corpus, when there will be more
# cases like this think about implementation of preprocessors for a list
# of strings
Expand All @@ -137,16 +138,16 @@ def _preprocess_words(
metas=np.array([[w] for w in words]),
text_features=[words_feature],
)
# only transformers and normalizers preprocess on the word level
pps = [
pp
for pp in corpus.used_preprocessor.preprocessors
if isinstance(pp, (BaseTransformer, BaseNormalizer))
]
# apply all corpus preprocessors except Filter and NGrams, which change terms
# filter removes words from the term, and NGrams split the term in grams.
# If a user decided to score with a particular term, he meant this term
# and not derivations of it
pps = corpus.used_preprocessor.preprocessors
for i, pp in enumerate(pps):
words_c = pp(words_c)
callback((i + 1) / len(pps))
return [w[0] for w in words_c.tokens if len(w)]
if not isinstance(pp, (BaseTokenFilter, NGrams)):
words_c = pp(words_c)
callback((i + 1) / len(pps))
return [Corpus.NGRAMS_SEPARATOR.join(ngs) for ngs in words_c.tokens if len(ngs)]


def _run(
Expand Down
50 changes: 50 additions & 0 deletions orangecontrib/text/widgets/tests/test_owscoredocuments.py
Original file line number Diff line number Diff line change
Expand Up @@ -291,6 +291,7 @@ def test_preprocess_words(self):
preprocess.StripAccentsTransformer(),
preprocess.UrlRemover(),
preprocess.HtmlTransformer(),
preprocess.RegexpTokenizer()
]
for p in pp_list:
corpus = p(corpus)
Expand Down Expand Up @@ -452,6 +453,55 @@ def test_titles_no_newline(self):
"The Little Match-Seller test", self.widget.view.model().index(0, 0).data()
)

def test_n_grams(self):
    """Score Documents matches multi-word terms against corpus ngrams."""
    texts = [
        "Lorem ipsum dolor sit ipsum consectetur adipiscing elit dolor sit eu",
        "Sed eu sollicitudin velit lorem.",
        "lorem ipsum eu",
    ]
    terms = ["lorem ipsum", "dolor sit", "eu", "sed eu"]

    # Case 1: pure bi-grams (ngram range 2-2).
    corpus = self.create_corpus(texts)
    preprocessors = [
        preprocess.LowercaseTransformer(),
        preprocess.RegexpTokenizer(),
        # drops "sed" from the corpus tokens; filters are skipped when
        # the scoring words themselves are preprocessed
        preprocess.RegexpFilter("sed"),
        # applied to the corpus only; ignored for the scoring words
        preprocess.NGrams(ngrams_range=(2, 2)),
    ]
    for preprocessor in preprocessors:
        corpus = preprocessor(corpus)
    words = create_words_table(terms)
    # word preprocessing must leave the terms intact (no Filter/NGrams applied)
    self.assertListEqual(
        terms,
        _preprocess_words(corpus, terms, dummy_callback),
    )
    self.send_signal(self.widget.Inputs.corpus, corpus)
    self.send_signal(self.widget.Inputs.words, words)
    self.widget.controls.word_appearance.click()
    self.wait_until_finished()
    # doc 1: "lorem ipsum" once, "dolor sit" twice; "eu" is not a bi-gram here
    # doc 2: no term matches
    # doc 3: only "lorem ipsum" matches
    self.assertListEqual([x[1] for x in self.widget.model], [3 / 4, 0, 1 / 4])
    self.assertListEqual([x[2] for x in self.widget.model], [2 / 4, 0, 1 / 4])

    # Case 2: uni- and bi-grams (ngram range 1-2); "sed" is not filtered now
    corpus = self.create_corpus(texts)
    for preprocessor in [preprocess.NGrams(ngrams_range=(1, 2))]:
        corpus = preprocessor(corpus)
    self.send_signal(self.widget.Inputs.corpus, corpus)
    self.wait_until_finished()
    # doc 1: 1 "lorem ipsum", 2 "dolor sit", 1 "eu"
    # doc 2: 0 "lorem ipsum", 0 "dolor sit", 1 "eu"
    # doc 3: 1 "lorem ipsum", 0 "dolor sit", 1 "eu"
    self.assertListEqual([x[1] for x in self.widget.model], [4 / 4, 2 / 4, 2 / 4])
    self.assertListEqual([x[2] for x in self.widget.model], [3 / 4, 2 / 4, 2 / 4])


# Allow running this test module directly, outside a test runner.
if __name__ == "__main__":
    unittest.main()