From a873fe0188d0acfb5e46207270699c3426816601 Mon Sep 17 00:00:00 2001 From: PrimozGodec Date: Fri, 3 Feb 2023 11:00:08 +0100 Subject: [PATCH] Score Documents - enable matching n-grams --- doc/widgets/score-documents.md | 2 ++ orangecontrib/text/corpus.py | 5 +-- .../text/widgets/owscoredocuments.py | 36 ++++++++++--------- .../widgets/tests/test_owscoredocuments.py | 36 +++++++++++++++++++ 4 files changed, 60 insertions(+), 19 deletions(-) diff --git a/doc/widgets/score-documents.md b/doc/widgets/score-documents.md index 14e71f1cc..89f669a53 100644 --- a/doc/widgets/score-documents.md +++ b/doc/widgets/score-documents.md @@ -29,6 +29,8 @@ Scores documents based on word appearance. 4. If *Send Automatically*, changes are communicated automatically. Alternatively press *Send*. 5. Filter documents based on the document title in the first column. Below is the table with the document titles in the first column and scores in other columns. +**Note**: Score Documents will apply preprocessing from the input Corpus to words before scoring. 
+ Example ------- diff --git a/orangecontrib/text/corpus.py b/orangecontrib/text/corpus.py index bf1388c86..e28526165 100644 --- a/orangecontrib/text/corpus.py +++ b/orangecontrib/text/corpus.py @@ -55,6 +55,7 @@ def _check_arrays(*arrays): class Corpus(Table): """Internal class for storing a corpus.""" + NGRAMS_SEPARATOR = " " def __new__(cls, *args, **kwargs): if args and isinstance(args[0], Domain) or "domain" in kwargs: @@ -447,7 +448,7 @@ def pos_tags(self): def pos_tags(self, pos_tags): self._pos_tags = pos_tags - def ngrams_iterator(self, join_with=' ', include_postags=False): + def ngrams_iterator(self, join_with=NGRAMS_SEPARATOR, include_postags=False): if self.pos_tags is None: include_postags = False @@ -471,7 +472,7 @@ def ngrams_iterator(self, join_with=' ', include_postags=False): @property def ngrams(self): """generator: Ngram representations of documents.""" - return self.ngrams_iterator(join_with=' ') + return self.ngrams_iterator(join_with=self.NGRAMS_SEPARATOR) def copy(self): """Return a copy of the table.""" diff --git a/orangecontrib/text/widgets/owscoredocuments.py b/orangecontrib/text/widgets/owscoredocuments.py index dc5e62a6e..558fa78c2 100644 --- a/orangecontrib/text/widgets/owscoredocuments.py +++ b/orangecontrib/text/widgets/owscoredocuments.py @@ -1,6 +1,7 @@ import re from collections import Counter from inspect import signature +from tracemalloc import BaseFilter from typing import Callable, List, Tuple, Union import numpy as np @@ -19,6 +20,7 @@ QRadioButton, QTableView, ) + from Orange.data import ContinuousVariable, Domain, StringVariable, Table from Orange.data.util import get_unique_names from Orange.util import wrap_callback @@ -33,7 +35,7 @@ from sklearn.metrics.pairwise import cosine_similarity from orangecontrib.text import Corpus -from orangecontrib.text.preprocess import BaseNormalizer, BaseTransformer +from orangecontrib.text.preprocess import BaseNormalizer, BaseTokenFilter, NGrams from orangecontrib.text.vectorization.document_embedder 
import ( LANGS_TO_ISO, DocumentEmbedder, @@ -41,13 +43,14 @@ from orangecontrib.text.widgets.utils import enum2int from orangecontrib.text.widgets.utils.words import create_words_table + def _word_frequency(corpus: Corpus, words: List[str], callback: Callable) -> np.ndarray: res = [] - tokens = corpus.tokens + tokens = corpus.ngrams for i, t in enumerate(tokens): counts = Counter(t) res.append([counts.get(w, 0) for w in words]) - callback((i + 1) / len(tokens)) + callback((i + 1) / len(corpus)) return np.array(res) @@ -55,11 +58,11 @@ def _word_appearance( corpus: Corpus, words: List[str], callback: Callable ) -> np.ndarray: res = [] - tokens = corpus.tokens + tokens = corpus.ngrams for i, t in enumerate(tokens): t = set(t) res.append([w in t for w in words]) - callback((i + 1) / len(tokens)) + callback((i + 1) / len(corpus)) return np.array(res).astype(float) @@ -123,10 +126,9 @@ def _preprocess_words( ) -> List[str]: """ Corpus's tokens can be preprocessed. Since they will not match correctly - with words preprocessors that change words (e.g. normalization) must - be applied to words too. + with words, same preprocessors that preprocess words in corpus + (e.g. normalization) must be applied to words too. """ - # workaround to preprocess words # TODO: currently preprocessors work only on corpus, when there will be more # cases like this think about implementation of preprocessors for a list # of strings @@ -137,16 +139,16 @@ def _preprocess_words( metas=np.array([[w] for w in words]), text_features=[words_feature], ) - # only transformers and normalizers preprocess on the word level - pps = [ - pp - for pp in corpus.used_preprocessor.preprocessors - if isinstance(pp, (BaseTransformer, BaseNormalizer)) - ] + # apply all corpus preprocessors except Filter and NGrams, which change terms + # filter removes words from the term, and NGrams split the term in grams. 
+ # If a user decided to score with a particular term, they meant this term + # and not derivations of it + pps = corpus.used_preprocessor.preprocessors for i, pp in enumerate(pps): - words_c = pp(words_c) - callback((i + 1) / len(pps)) - return [w[0] for w in words_c.tokens if len(w)] + if not isinstance(pp, (BaseTokenFilter, NGrams)): + words_c = pp(words_c) + callback((i + 1) / len(pps)) + return [Corpus.NGRAMS_SEPARATOR.join(ngs) for ngs in words_c.tokens if len(ngs)] def _run( diff --git a/orangecontrib/text/widgets/tests/test_owscoredocuments.py b/orangecontrib/text/widgets/tests/test_owscoredocuments.py index acf3e145d..db018e3e1 100644 --- a/orangecontrib/text/widgets/tests/test_owscoredocuments.py +++ b/orangecontrib/text/widgets/tests/test_owscoredocuments.py @@ -291,6 +291,7 @@ def test_preprocess_words(self): preprocess.StripAccentsTransformer(), preprocess.UrlRemover(), preprocess.HtmlTransformer(), + preprocess.RegexpTokenizer() ] for p in pp_list: corpus = p(corpus) @@ -452,6 +453,41 @@ def test_titles_no_newline(self): "The Little Match-Seller test", self.widget.view.model().index(0, 0).data() ) + def test_n_grams(self): + texts = [ + "Lorem ipsum dolor sit ipsum consectetur adipiscing elit dolor sit eu", + "Sed eu sollicitudin velit lorem.", + "lorem ipsum eu", + ] + # try n-gram range 2, 2 + corpus = self.create_corpus(texts) + pp_list = [preprocess.NGrams(ngrams_range=(2, 2))] + for p in pp_list: + corpus = p(corpus) + words = create_words_table(["lorem ipsum", "dolor sit", "eu"]) + self.send_signal(self.widget.Inputs.corpus, corpus) + self.send_signal(self.widget.Inputs.words, words) + self.widget.controls.word_appearance.click() + self.wait_until_finished() + # first text: 1 lorem ipsum, 2 dolor sit, eu not in corpus's bi-grams + # second text: none of them present + # third text: only lorem ipsum + self.assertListEqual([x[1] for x in self.widget.model], [1, 0, 1 / 3]) + self.assertListEqual([x[2] for x in self.widget.model], [2 / 3, 0, 1 / 3]) + + # 
try n-gram range 1, 2 + corpus = self.create_corpus(texts) + pp_list = [preprocess.NGrams(ngrams_range=(1, 2))] + for p in pp_list: + corpus = p(corpus) + self.send_signal(self.widget.Inputs.corpus, corpus) + self.wait_until_finished() + # first: 1 lorem ipsum, 2 dolor sit, 1 eu + # second: 0 lorem ipsum, 0 dolor sit, 1 eu + # third: 1 lorem ipsum, 0 dolor sit, 1 eu + self.assertListEqual([x[1] for x in self.widget.model], [4 / 3, 1 / 3, 2 / 3]) + self.assertListEqual([x[2] for x in self.widget.model], [3 / 3, 1 / 3, 2 / 3]) + if __name__ == "__main__": unittest.main()