Skip to content

Commit

Permalink
Score Documents - enable matching n-grams
Browse files Browse the repository at this point in the history
  • Loading branch information
PrimozGodec committed Feb 3, 2023
1 parent f644a27 commit a873fe0
Show file tree
Hide file tree
Showing 4 changed files with 60 additions and 19 deletions.
2 changes: 2 additions & 0 deletions doc/widgets/score-documents.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ Scores documents based on word appearance.
4. If *Send Automatically*, changes are communicated automatically. Alternatively press *Send*.
5. Filter documents based on the document title in the first column. Below is the table with the document titles in the first column and scores in other columns.

**Note**: Score Documents will apply preprocessing from the input Corpus to words before scoring.

Example
-------

Expand Down
5 changes: 3 additions & 2 deletions orangecontrib/text/corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ def _check_arrays(*arrays):

class Corpus(Table):
"""Internal class for storing a corpus."""
NGRAMS_SEPARATOR = " "

def __new__(cls, *args, **kwargs):
if args and isinstance(args[0], Domain) or "domain" in kwargs:
Expand Down Expand Up @@ -447,7 +448,7 @@ def pos_tags(self):
def pos_tags(self, pos_tags):
self._pos_tags = pos_tags

def ngrams_iterator(self, join_with=' ', include_postags=False):
def ngrams_iterator(self, join_with=NGRAMS_SEPARATOR, include_postags=False):
if self.pos_tags is None:
include_postags = False

Expand All @@ -471,7 +472,7 @@ def ngrams_iterator(self, join_with=' ', include_postags=False):
@property
def ngrams(self):
"""generator: Ngram representations of documents."""
return self.ngrams_iterator(join_with=' ')
return self.ngrams_iterator(join_with=self.NGRAMS_SEPARATOR)

def copy(self):
"""Return a copy of the table."""
Expand Down
36 changes: 19 additions & 17 deletions orangecontrib/text/widgets/owscoredocuments.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import re
from collections import Counter
from inspect import signature
from tracemalloc import BaseFilter
from typing import Callable, List, Tuple, Union

import numpy as np
Expand All @@ -19,6 +20,7 @@
QRadioButton,
QTableView,
)

from Orange.data import ContinuousVariable, Domain, StringVariable, Table
from Orange.data.util import get_unique_names
from Orange.util import wrap_callback
Expand All @@ -33,33 +35,34 @@
from sklearn.metrics.pairwise import cosine_similarity

from orangecontrib.text import Corpus
from orangecontrib.text.preprocess import BaseNormalizer, BaseTransformer
from orangecontrib.text.preprocess import BaseNormalizer, NGrams
from orangecontrib.text.vectorization.document_embedder import (
LANGS_TO_ISO,
DocumentEmbedder,
)
from orangecontrib.text.widgets.utils import enum2int
from orangecontrib.text.widgets.utils.words import create_words_table


def _word_frequency(corpus: Corpus, words: List[str], callback: Callable) -> np.ndarray:
res = []
tokens = corpus.tokens
tokens = corpus.ngrams
for i, t in enumerate(tokens):
counts = Counter(t)
res.append([counts.get(w, 0) for w in words])
callback((i + 1) / len(tokens))
callback((i + 1) / len(corpus))
return np.array(res)


def _word_appearance(
corpus: Corpus, words: List[str], callback: Callable
) -> np.ndarray:
res = []
tokens = corpus.tokens
tokens = corpus.ngrams
for i, t in enumerate(tokens):
t = set(t)
res.append([w in t for w in words])
callback((i + 1) / len(tokens))
callback((i + 1) / len(corpus))
return np.array(res).astype(float)


Expand Down Expand Up @@ -123,10 +126,9 @@ def _preprocess_words(
) -> List[str]:
"""
Corpus's tokens can be preprocessed. Since they will not match correctly
with words preprocessors that change words (e.g. normalization) must
be applied to words too.
with words, same preprocessors that preprocess words in corpus
(e.g. normalization) must be applied to words too.
"""
# workaround to preprocess words
# TODO: currently preprocessors work only on corpus, when there will be more
# cases like this think about implementation of preprocessors for a list
# of strings
Expand All @@ -137,16 +139,16 @@ def _preprocess_words(
metas=np.array([[w] for w in words]),
text_features=[words_feature],
)
# only transformers and normalizers preprocess on the word level
pps = [
pp
for pp in corpus.used_preprocessor.preprocessors
if isinstance(pp, (BaseTransformer, BaseNormalizer))
]
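# Apply all corpus preprocessors except filters and NGrams, which change the
# terms themselves: a filter removes words from a term, and NGrams splits a
# term into n-grams. If a user decided to score with a particular term, they
# meant exactly that term and not derivations of it.
# NOTE(review): BaseFilter used in the isinstance check below is imported from
# tracemalloc in this file, so text token filters may not actually be skipped;
# confirm the intended filter base class from orangecontrib.text.preprocess.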
pps = corpus.used_preprocessor.preprocessors
for i, pp in enumerate(pps):
words_c = pp(words_c)
callback((i + 1) / len(pps))
return [w[0] for w in words_c.tokens if len(w)]
if not isinstance(pp, (BaseFilter, NGrams)):
words_c = pp(words_c)
callback((i + 1) / len(pps))
return [Corpus.NGRAMS_SEPARATOR.join(ngs) for ngs in words_c.tokens if len(ngs)]


def _run(
Expand Down
36 changes: 36 additions & 0 deletions orangecontrib/text/widgets/tests/test_owscoredocuments.py
Original file line number Diff line number Diff line change
Expand Up @@ -291,6 +291,7 @@ def test_preprocess_words(self):
preprocess.StripAccentsTransformer(),
preprocess.UrlRemover(),
preprocess.HtmlTransformer(),
preprocess.RegexpTokenizer()
]
for p in pp_list:
corpus = p(corpus)
Expand Down Expand Up @@ -452,6 +453,41 @@ def test_titles_no_newline(self):
"The Little Match-Seller test", self.widget.view.model().index(0, 0).data()
)

def test_n_grams(self):
texts = [
"Lorem ipsum dolor sit ipsum consectetur adipiscing elit dolor sit eu",
"Sed eu sollicitudin velit lorem.",
"lorem ipsum eu",
]
# try n-gram range 2, 2
corpus = self.create_corpus(texts)
pp_list = [preprocess.NGrams(ngrams_range=(2, 2))]
for p in pp_list:
corpus = p(corpus)
words = create_words_table(["lorem ipsum", "dolor sit", "eu"])
self.send_signal(self.widget.Inputs.corpus, corpus)
self.send_signal(self.widget.Inputs.words, words)
self.widget.controls.word_appearance.click()
self.wait_until_finished()
# first text: 1 lorem ipsum, 2 dolor sit, eu not in corpus's bi-grams
# second text: none of them present
# third text: only lorem ipsum
self.assertListEqual([x[1] for x in self.widget.model], [1, 0, 1 / 3])
self.assertListEqual([x[2] for x in self.widget.model], [2 / 3, 0, 1 / 3])

# try n-gram range 1, 2
corpus = self.create_corpus(texts)
pp_list = [preprocess.NGrams(ngrams_range=(1, 2))]
for p in pp_list:
corpus = p(corpus)
self.send_signal(self.widget.Inputs.corpus, corpus)
self.wait_until_finished()
# first: 1 lorem ipsum, 2 dolor sit, 1 eu
# second: 0 lorem ipsum, 0 dolor sit, 1 eu
# third: 1 lorem ipsum, 0 dolor sit, 1 eu
self.assertListEqual([x[1] for x in self.widget.model], [4 / 3, 1 / 3, 2 / 3])
self.assertListEqual([x[2] for x in self.widget.model], [3 / 3, 1 / 3, 2 / 3])


if __name__ == "__main__":
unittest.main()

0 comments on commit a873fe0

Please sign in to comment.