Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ENH] Score Documents - enable matching n-grams #935

Merged
merged 1 commit into from
Feb 3, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/widgets/score-documents.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ Scores documents based on word appearance.
4. If *Send Automatically*, changes are communicated automatically. Alternatively press *Send*.
5. Filter documents based on the document title in the first column. Below is the table with the document titles in the first column and scores in other columns.

**Note**: Score Documents will apply preprocessing from the input Corpus to words before scoring.

Example
-------

Expand Down
5 changes: 3 additions & 2 deletions orangecontrib/text/corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ def _check_arrays(*arrays):

class Corpus(Table):
"""Internal class for storing a corpus."""
NGRAMS_SEPARATOR = " "

def __new__(cls, *args, **kwargs):
if args and isinstance(args[0], Domain) or "domain" in kwargs:
Expand Down Expand Up @@ -447,7 +448,7 @@ def pos_tags(self):
def pos_tags(self, pos_tags):
self._pos_tags = pos_tags

def ngrams_iterator(self, join_with=' ', include_postags=False):
def ngrams_iterator(self, join_with=NGRAMS_SEPARATOR, include_postags=False):
if self.pos_tags is None:
include_postags = False

Expand All @@ -471,7 +472,7 @@ def ngrams_iterator(self, join_with=' ', include_postags=False):
@property
def ngrams(self):
"""generator: Ngram representations of documents."""
return self.ngrams_iterator(join_with=' ')
return self.ngrams_iterator(join_with=self.NGRAMS_SEPARATOR)

def copy(self):
"""Return a copy of the table."""
Expand Down
35 changes: 18 additions & 17 deletions orangecontrib/text/widgets/owscoredocuments.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
QRadioButton,
QTableView,
)

from Orange.data import ContinuousVariable, Domain, StringVariable, Table
from Orange.data.util import get_unique_names
from Orange.util import wrap_callback
Expand All @@ -33,33 +34,34 @@
from sklearn.metrics.pairwise import cosine_similarity

from orangecontrib.text import Corpus
from orangecontrib.text.preprocess import BaseNormalizer, BaseTransformer
from orangecontrib.text.preprocess import BaseNormalizer, NGrams, BaseTokenFilter
from orangecontrib.text.vectorization.document_embedder import (
LANGS_TO_ISO,
DocumentEmbedder,
)
from orangecontrib.text.widgets.utils import enum2int
from orangecontrib.text.widgets.utils.words import create_words_table


def _word_frequency(corpus: Corpus, words: List[str], callback: Callable) -> np.ndarray:
    """Count occurrences of each word/ngram in every document of the corpus.

    Matching is done against the corpus's ngram representation, so multi-word
    terms (e.g. "lorem ipsum") can be scored when the corpus was preprocessed
    with an NGrams preprocessor.

    Returns an (n_documents, n_words) array of raw counts.
    """
    n_docs = len(corpus)
    scores = []
    for doc_idx, doc_ngrams in enumerate(corpus.ngrams):
        frequencies = Counter(doc_ngrams)
        scores.append([frequencies.get(word, 0) for word in words])
        # progress is reported per processed document
        callback((doc_idx + 1) / n_docs)
    return np.array(scores)


def _word_appearance(
    corpus: Corpus, words: List[str], callback: Callable
) -> np.ndarray:
    """Binary indicator of whether each word/ngram occurs in each document.

    Like `_word_frequency`, matching uses the corpus's ngram representation.

    Returns an (n_documents, n_words) float array of 0.0/1.0 values.
    """
    n_docs = len(corpus)
    rows = []
    for doc_idx, doc_ngrams in enumerate(corpus.ngrams):
        # set membership makes the per-word lookup O(1)
        present = set(doc_ngrams)
        rows.append([word in present for word in words])
        callback((doc_idx + 1) / n_docs)
    return np.array(rows).astype(float)


Expand Down Expand Up @@ -123,10 +125,9 @@ def _preprocess_words(
) -> List[str]:
"""
Corpus's tokens can be preprocessed. Since they will not match correctly
with words preprocessors that change words (e.g. normalization) must
be applied to words too.
with words, same preprocessors that preprocess words in corpus
(e.g. normalization) must be applied to words too.
"""
# workaround to preprocess words
# TODO: currently preprocessors work only on corpus, when there will be more
# cases like this think about implementation of preprocessors for a list
# of strings
Expand All @@ -137,16 +138,16 @@ def _preprocess_words(
metas=np.array([[w] for w in words]),
text_features=[words_feature],
)
# only transformers and normalizers preprocess on the word level
pps = [
pp
for pp in corpus.used_preprocessor.preprocessors
if isinstance(pp, (BaseTransformer, BaseNormalizer))
]
# apply all corpus preprocessors except Filter and NGrams, which change terms
# filter removes words from the term, and NGrams split the term in grams.
# If a user decided to score with a particular term, he meant this term
# and not derivations of it
pps = corpus.used_preprocessor.preprocessors
for i, pp in enumerate(pps):
words_c = pp(words_c)
callback((i + 1) / len(pps))
return [w[0] for w in words_c.tokens if len(w)]
if not isinstance(pp, (BaseTokenFilter, NGrams)):
words_c = pp(words_c)
callback((i + 1) / len(pps))
return [Corpus.NGRAMS_SEPARATOR.join(ngs) for ngs in words_c.tokens if len(ngs)]


def _run(
Expand Down
50 changes: 50 additions & 0 deletions orangecontrib/text/widgets/tests/test_owscoredocuments.py
Original file line number Diff line number Diff line change
Expand Up @@ -291,6 +291,7 @@ def test_preprocess_words(self):
preprocess.StripAccentsTransformer(),
preprocess.UrlRemover(),
preprocess.HtmlTransformer(),
preprocess.RegexpTokenizer()
]
for p in pp_list:
corpus = p(corpus)
Expand Down Expand Up @@ -452,6 +453,55 @@ def test_titles_no_newline(self):
"The Little Match-Seller test", self.widget.view.model().index(0, 0).data()
)

def test_n_grams(self):
    """Score Documents matches multi-word terms against corpus ngrams."""
    texts = [
        "Lorem ipsum dolor sit ipsum consectetur adipiscing elit dolor sit eu",
        "Sed eu sollicitudin velit lorem.",
        "lorem ipsum eu",
    ]
    terms = ["lorem ipsum", "dolor sit", "eu", "sed eu"]

    # Case 1: pure bi-grams (ngram range 2-2).
    corpus = self.create_corpus(texts)
    preprocessors = [
        preprocess.LowercaseTransformer(),
        preprocess.RegexpTokenizer(),
        # drops "sed" from the corpus tokens; filters are skipped when
        # the scoring words themselves are preprocessed
        preprocess.RegexpFilter("sed"),
        # applied to the corpus only; ignored for the scoring words
        preprocess.NGrams(ngrams_range=(2, 2)),
    ]
    for preprocessor in preprocessors:
        corpus = preprocessor(corpus)
    words = create_words_table(terms)
    # word preprocessing must leave the terms intact (no Filter/NGrams applied)
    self.assertListEqual(
        terms,
        _preprocess_words(corpus, terms, dummy_callback),
    )
    self.send_signal(self.widget.Inputs.corpus, corpus)
    self.send_signal(self.widget.Inputs.words, words)
    self.widget.controls.word_appearance.click()
    self.wait_until_finished()
    # doc 1: "lorem ipsum" once, "dolor sit" twice; "eu" is not a bi-gram here
    # doc 2: no term matches
    # doc 3: only "lorem ipsum" matches
    self.assertListEqual([x[1] for x in self.widget.model], [3 / 4, 0, 1 / 4])
    self.assertListEqual([x[2] for x in self.widget.model], [2 / 4, 0, 1 / 4])

    # Case 2: uni- and bi-grams (ngram range 1-2); "sed" is not filtered now
    corpus = self.create_corpus(texts)
    for preprocessor in [preprocess.NGrams(ngrams_range=(1, 2))]:
        corpus = preprocessor(corpus)
    self.send_signal(self.widget.Inputs.corpus, corpus)
    self.wait_until_finished()
    # doc 1: 1 "lorem ipsum", 2 "dolor sit", 1 "eu"
    # doc 2: 0 "lorem ipsum", 0 "dolor sit", 1 "eu"
    # doc 3: 1 "lorem ipsum", 0 "dolor sit", 1 "eu"
    self.assertListEqual([x[1] for x in self.widget.model], [4 / 4, 2 / 4, 2 / 4])
    self.assertListEqual([x[2] for x in self.widget.model], [3 / 4, 2 / 4, 2 / 4])


# Allow running this test module directly, outside a test runner.
if __name__ == "__main__":
    unittest.main()