From a873fe0188d0acfb5e46207270699c3426816601 Mon Sep 17 00:00:00 2001
From: PrimozGodec
Date: Fri, 3 Feb 2023 11:00:08 +0100
Subject: [PATCH] Score Documents - enable matching n-grams
---
doc/widgets/score-documents.md | 2 ++
orangecontrib/text/corpus.py | 5 +--
.../text/widgets/owscoredocuments.py | 36 ++++++++++---------
.../widgets/tests/test_owscoredocuments.py | 36 +++++++++++++++++++
4 files changed, 60 insertions(+), 19 deletions(-)
diff --git a/doc/widgets/score-documents.md b/doc/widgets/score-documents.md
index 14e71f1cc..89f669a53 100644
--- a/doc/widgets/score-documents.md
+++ b/doc/widgets/score-documents.md
@@ -29,6 +29,8 @@ Scores documents based on word appearance.
4. If *Send Automatically*, changes are communicated automatically. Alternatively press *Send*.
5. Filter documents based on the document title in the first column. Below is the table with the document titles in the first column and scores in other columns.
+**Note**: Score Documents will apply preprocessing from the input Corpus to words before scoring.
+
Example
-------
diff --git a/orangecontrib/text/corpus.py b/orangecontrib/text/corpus.py
index bf1388c86..e28526165 100644
--- a/orangecontrib/text/corpus.py
+++ b/orangecontrib/text/corpus.py
@@ -55,6 +55,7 @@ def _check_arrays(*arrays):
class Corpus(Table):
"""Internal class for storing a corpus."""
+ NGRAMS_SEPARATOR = " "
def __new__(cls, *args, **kwargs):
if args and isinstance(args[0], Domain) or "domain" in kwargs:
@@ -447,7 +448,7 @@ def pos_tags(self):
def pos_tags(self, pos_tags):
self._pos_tags = pos_tags
- def ngrams_iterator(self, join_with=' ', include_postags=False):
+ def ngrams_iterator(self, join_with=NGRAMS_SEPARATOR, include_postags=False):
if self.pos_tags is None:
include_postags = False
@@ -471,7 +472,7 @@ def ngrams_iterator(self, join_with=' ', include_postags=False):
@property
def ngrams(self):
"""generator: Ngram representations of documents."""
- return self.ngrams_iterator(join_with=' ')
+ return self.ngrams_iterator(join_with=self.NGRAMS_SEPARATOR)
def copy(self):
"""Return a copy of the table."""
diff --git a/orangecontrib/text/widgets/owscoredocuments.py b/orangecontrib/text/widgets/owscoredocuments.py
index dc5e62a6e..558fa78c2 100644
--- a/orangecontrib/text/widgets/owscoredocuments.py
+++ b/orangecontrib/text/widgets/owscoredocuments.py
@@ -1,6 +1,7 @@
import re
from collections import Counter
from inspect import signature
+from tracemalloc import BaseFilter
from typing import Callable, List, Tuple, Union
import numpy as np
@@ -19,6 +20,7 @@
QRadioButton,
QTableView,
)
+
from Orange.data import ContinuousVariable, Domain, StringVariable, Table
from Orange.data.util import get_unique_names
from Orange.util import wrap_callback
@@ -33,7 +35,7 @@
from sklearn.metrics.pairwise import cosine_similarity
from orangecontrib.text import Corpus
-from orangecontrib.text.preprocess import BaseNormalizer, BaseTransformer
+from orangecontrib.text.preprocess import BaseNormalizer, NGrams
from orangecontrib.text.vectorization.document_embedder import (
LANGS_TO_ISO,
DocumentEmbedder,
@@ -41,13 +43,14 @@
from orangecontrib.text.widgets.utils import enum2int
from orangecontrib.text.widgets.utils.words import create_words_table
+
def _word_frequency(corpus: Corpus, words: List[str], callback: Callable) -> np.ndarray:
res = []
- tokens = corpus.tokens
+ tokens = corpus.ngrams
for i, t in enumerate(tokens):
counts = Counter(t)
res.append([counts.get(w, 0) for w in words])
- callback((i + 1) / len(tokens))
+ callback((i + 1) / len(corpus))
return np.array(res)
@@ -55,11 +58,11 @@ def _word_appearance(
corpus: Corpus, words: List[str], callback: Callable
) -> np.ndarray:
res = []
- tokens = corpus.tokens
+ tokens = corpus.ngrams
for i, t in enumerate(tokens):
t = set(t)
res.append([w in t for w in words])
- callback((i + 1) / len(tokens))
+ callback((i + 1) / len(corpus))
return np.array(res).astype(float)
@@ -123,10 +126,9 @@ def _preprocess_words(
) -> List[str]:
"""
Corpus's tokens can be preprocessed. Since they will not match correctly
- with words preprocessors that change words (e.g. normalization) must
- be applied to words too.
+ with words, the same preprocessors that preprocess words in the corpus
+ (e.g. normalization) must be applied to words too.
"""
- # workaround to preprocess words
# TODO: currently preprocessors work only on corpus, when there will be more
# cases like this think about implementation of preprocessors for a list
# of strings
@@ -137,16 +139,16 @@ def _preprocess_words(
metas=np.array([[w] for w in words]),
text_features=[words_feature],
)
- # only transformers and normalizers preprocess on the word level
- pps = [
- pp
- for pp in corpus.used_preprocessor.preprocessors
- if isinstance(pp, (BaseTransformer, BaseNormalizer))
- ]
+ # apply all corpus preprocessors except Filter and NGrams, which change terms:
+ # Filter removes words from a term and NGrams splits a term into n-grams, while
+ # a user who scores with a particular term means that term, not derivations.
+ # NOTE(review): BaseFilter is imported from tracemalloc in this file - verify
+ pps = corpus.used_preprocessor.preprocessors
for i, pp in enumerate(pps):
- words_c = pp(words_c)
- callback((i + 1) / len(pps))
- return [w[0] for w in words_c.tokens if len(w)]
+ if not isinstance(pp, (BaseFilter, NGrams)):
+ words_c = pp(words_c)
+ callback((i + 1) / len(pps))
+ return [Corpus.NGRAMS_SEPARATOR.join(ngs) for ngs in words_c.tokens if len(ngs)]
def _run(
diff --git a/orangecontrib/text/widgets/tests/test_owscoredocuments.py b/orangecontrib/text/widgets/tests/test_owscoredocuments.py
index acf3e145d..db018e3e1 100644
--- a/orangecontrib/text/widgets/tests/test_owscoredocuments.py
+++ b/orangecontrib/text/widgets/tests/test_owscoredocuments.py
@@ -291,6 +291,7 @@ def test_preprocess_words(self):
preprocess.StripAccentsTransformer(),
preprocess.UrlRemover(),
preprocess.HtmlTransformer(),
+ preprocess.RegexpTokenizer()
]
for p in pp_list:
corpus = p(corpus)
@@ -452,6 +453,41 @@ def test_titles_no_newline(self):
"The Little Match-Seller test", self.widget.view.model().index(0, 0).data()
)
+ def test_n_grams(self):
+ texts = [
+ "Lorem ipsum dolor sit ipsum consectetur adipiscing elit dolor sit eu",
+ "Sed eu sollicitudin velit lorem.",
+ "lorem ipsum eu",
+ ]
+ # try n-gram range 2, 2
+ corpus = self.create_corpus(texts)
+ pp_list = [preprocess.NGrams(ngrams_range=(2, 2))]
+ for p in pp_list:
+ corpus = p(corpus)
+ words = create_words_table(["lorem ipsum", "dolor sit", "eu"])
+ self.send_signal(self.widget.Inputs.corpus, corpus)
+ self.send_signal(self.widget.Inputs.words, words)
+ self.widget.controls.word_appearance.click()
+ self.wait_until_finished()
+ # first text: 1 lorem ipsum, 2 dolor sit, eu not in corpus's bi-grams
+ # second text: none of them present
+ # third text: only lorem ipsum
+ self.assertListEqual([x[1] for x in self.widget.model], [1, 0, 1 / 3])
+ self.assertListEqual([x[2] for x in self.widget.model], [2 / 3, 0, 1 / 3])
+
+ # try n-gram range 1, 2
+ corpus = self.create_corpus(texts)
+ pp_list = [preprocess.NGrams(ngrams_range=(1, 2))]
+ for p in pp_list:
+ corpus = p(corpus)
+ self.send_signal(self.widget.Inputs.corpus, corpus)
+ self.wait_until_finished()
+ # first: 1 lorem ipsum, 2 dolor sit, 1 eu
+ # second: 0 lorem ipsum, 0 dolor sit, 1 eu
+ # third: 1 lorem ipsum, 0 dolor sit, 1 eu
+ self.assertListEqual([x[1] for x in self.widget.model], [4 / 3, 1 / 3, 2 / 3])
+ self.assertListEqual([x[2] for x in self.widget.model], [3 / 3, 1 / 3, 2 / 3])
+
if __name__ == "__main__":
unittest.main()