Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor bm25 to include model parametrization (cont.) #2722

Merged
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 64 additions & 8 deletions gensim/summarization/bm25.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,14 +69,36 @@ class BM25(object):
List of document lengths.
"""

def __init__(self, corpus):
def __init__(self, corpus, k1=PARAM_K1, b=PARAM_B, epsilon=EPSILON):
"""
Parameters
----------
corpus : list of list of str
Given corpus.
k1 : float
Constant used for influencing the term frequency saturation. Once saturation is reached, additional
occurrences of the term add significantly less to the score. According to [1]_, experiments suggest
that 1.2 < k1 < 2 yields reasonably good results, although the optimal value depends on factors such as
the type of documents or queries.
b : float
Constant used for influencing the effect of document length relative to the average document length.
When b is larger, a document's length (compared to the average) has a greater effect on its score. According to
[1]_, experiments suggest that 0.5 < b < 0.8 yields reasonably good results, although the optimal value
depends on factors such as the type of documents or queries.
epsilon : float
Constant used as floor value for the idf of a term in the corpus. When epsilon is positive, it restricts
negative idf values. A negative idf implies that adding a very common term to a document penalizes the overall
score (with 'very common' meaning that it is present in more than half of the documents). That can be
undesirable, as it means that an identical document would score less than an almost identical one (obtained by
removing the term in question). Increasing epsilon above 0 raises the bar for how rare a word has to be (among
different documents) to receive an extra score.

"""

self.k1 = k1
self.b = b
self.epsilon = epsilon

self.corpus_size = 0
self.avgdl = 0
self.doc_freqs = []
Expand Down Expand Up @@ -126,7 +148,7 @@ def _initialize(self, corpus):
' unintuitive results.'.format(self.corpus_size)
)

eps = EPSILON * self.average_idf
eps = self.epsilon * self.average_idf
for word in negative_idfs:
self.idf[word] = eps

Expand All @@ -151,8 +173,8 @@ def get_score(self, document, index):
for word in document:
if word not in doc_freqs:
continue
score += (self.idf[word] * doc_freqs[word] * (PARAM_K1 + 1)
/ (doc_freqs[word] + PARAM_K1 * (1 - PARAM_B + PARAM_B * self.doc_len[index] / self.avgdl)))
score += (self.idf[word] * doc_freqs[word] * (self.k1 + 1)
Witiko marked this conversation as resolved.
Show resolved Hide resolved
/ (doc_freqs[word] + self.k1 * (1 - self.b + self.b * self.doc_len[index] / self.avgdl)))
return score

def get_scores(self, document):
Expand Down Expand Up @@ -236,7 +258,7 @@ def _get_scores(bm25, document):
return bm25.get_scores(document)


def iter_bm25_bow(corpus, n_jobs=1):
def iter_bm25_bow(corpus, n_jobs=1, k1=PARAM_K1, b=PARAM_B, epsilon=EPSILON):
"""Yield BM25 scores (weights) of documents in corpus.
Each document has to be weighted with every document in given corpus.

Expand All @@ -246,6 +268,23 @@ def iter_bm25_bow(corpus, n_jobs=1):
Corpus of documents.
n_jobs : int
The number of processes to use for computing bm25.
k1 : float
Constant used for influencing the term frequency saturation. Once saturation is reached, additional
occurrences of the term add significantly less to the score. According to [1]_, experiments suggest
that 1.2 < k1 < 2 yields reasonably good results, although the optimal value depends on factors such as
the type of documents or queries.
b : float
Constant used for influencing the effect of document length relative to the average document length.
When b is larger, a document's length (compared to the average) has a greater effect on its score. According to
[1]_, experiments suggest that 0.5 < b < 0.8 yields reasonably good results, although the optimal value
depends on factors such as the type of documents or queries.
epsilon : float
Constant used as floor value for the idf of a term in the corpus. When epsilon is positive, it restricts
negative idf values. A negative idf implies that adding a very common term to a document penalizes the overall
score (with 'very common' meaning that it is present in more than half of the documents). That can be
undesirable, as it means that an identical document would score less than an almost identical one (obtained by
removing the term in question). Increasing epsilon above 0 raises the bar for how rare a word has to be (among
different documents) to receive an extra score.

Yields
-------
Expand All @@ -265,7 +304,7 @@ def iter_bm25_bow(corpus, n_jobs=1):
>>> result = iter_bm25_weights(corpus, n_jobs=-1)

"""
bm25 = BM25(corpus)
bm25 = BM25(corpus, k1, b, epsilon)

n_processes = effective_n_jobs(n_jobs)
if n_processes == 1:
Expand All @@ -282,7 +321,7 @@ def iter_bm25_bow(corpus, n_jobs=1):
pool.join()


def get_bm25_weights(corpus, n_jobs=1):
def get_bm25_weights(corpus, n_jobs=1, k1=PARAM_K1, b=PARAM_B, epsilon=EPSILON):
"""Returns BM25 scores (weights) of documents in corpus.
Each document has to be weighted with every document in given corpus.

Expand All @@ -292,6 +331,23 @@ def get_bm25_weights(corpus, n_jobs=1):
Corpus of documents.
n_jobs : int
The number of processes to use for computing bm25.
k1 : float
Constant used for influencing the term frequency saturation. Once saturation is reached, additional
occurrences of the term add significantly less to the score. According to [1]_, experiments suggest
that 1.2 < k1 < 2 yields reasonably good results, although the optimal value depends on factors such as
the type of documents or queries.
b : float
Constant used for influencing the effect of document length relative to the average document length.
When b is larger, a document's length (compared to the average) has a greater effect on its score. According to
[1]_, experiments suggest that 0.5 < b < 0.8 yields reasonably good results, although the optimal value
depends on factors such as the type of documents or queries.
epsilon : float
Constant used as floor value for the idf of a term in the corpus. When epsilon is positive, it restricts
negative idf values. A negative idf implies that adding a very common term to a document penalizes the overall
score (with 'very common' meaning that it is present in more than half of the documents). That can be
undesirable, as it means that an identical document would score less than an almost identical one (obtained by
removing the term in question). Increasing epsilon above 0 raises the bar for how rare a word has to be (among
different documents) to receive an extra score.

Returns
-------
Expand All @@ -311,7 +367,7 @@ def get_bm25_weights(corpus, n_jobs=1):
>>> result = get_bm25_weights(corpus, n_jobs=-1)

"""
bm25 = BM25(corpus)
bm25 = BM25(corpus, k1, b, epsilon)

n_processes = effective_n_jobs(n_jobs)
if n_processes == 1:
Expand Down
72 changes: 71 additions & 1 deletion gensim/test/test_BM25.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
import logging
import unittest

from gensim.summarization.bm25 import get_bm25_weights
from gensim.summarization.bm25 import get_bm25_weights, iter_bm25_bow, BM25
from gensim.test.utils import common_texts


Expand Down Expand Up @@ -62,6 +62,76 @@ def test_multiprocessing(self):
self.assertAlmostEqual(weights1, weights3)
self.assertAlmostEqual(weights2, weights3)

def test_k1(self):
    """Changing the k1 parameter should give consistent results.

    The same pair of k1 values is passed to ``BM25``, ``iter_bm25_bow`` and
    ``get_bm25_weights``; in each case the result obtained with the smaller
    k1 must compare lower than the result obtained with the larger k1.
    """
    corpus = common_texts
    index = 0
    doc = corpus[index]
    first_k1 = 1.0
    second_k1 = 2.0

    # assertLess (rather than assertTrue(a < b)) reports both operands when the check fails.
    first_bm25 = BM25(corpus, k1=first_k1)
    second_bm25 = BM25(corpus, k1=second_k1)
    self.assertLess(first_bm25.get_score(doc, index), second_bm25.get_score(doc, index))

    # Only the first yielded item is inspected; dict() makes it addressable by `index`.
    first_iter = iter_bm25_bow(corpus, k1=first_k1)
    second_iter = iter_bm25_bow(corpus, k1=second_k1)
    self.assertLess(dict(next(iter(first_iter)))[index], dict(next(iter(second_iter)))[index])

    # Compares the entries stored at `index` in each returned collection.
    first_weights = get_bm25_weights(corpus, k1=first_k1)
    second_weights = get_bm25_weights(corpus, k1=second_k1)
    self.assertLess(first_weights[index], second_weights[index])

def test_b(self):
    """Changing the b parameter should give consistent results.

    The same pair of b values is passed to ``BM25``, ``iter_bm25_bow`` and
    ``get_bm25_weights``; in each case the result obtained with the smaller
    b must compare lower than the result obtained with the larger b.
    """
    corpus = common_texts
    index = 0
    doc = corpus[index]
    first_b = 1.0
    second_b = 2.0

    # assertLess (rather than assertTrue(a < b)) reports both operands when the check fails.
    first_bm25 = BM25(corpus, b=first_b)
    second_bm25 = BM25(corpus, b=second_b)
    self.assertLess(first_bm25.get_score(doc, index), second_bm25.get_score(doc, index))

    # Only the first yielded item is inspected; dict() makes it addressable by `index`.
    first_iter = iter_bm25_bow(corpus, b=first_b)
    second_iter = iter_bm25_bow(corpus, b=second_b)
    self.assertLess(dict(next(iter(first_iter)))[index], dict(next(iter(second_iter)))[index])

    # Compares the entries stored at `index` in each returned collection.
    first_weights = get_bm25_weights(corpus, b=first_b)
    second_weights = get_bm25_weights(corpus, b=second_b)
    self.assertLess(first_weights[index], second_weights[index])

def test_epsilon(self):
    """Changing the epsilon parameter should give consistent results.

    A default ``BM25`` model is built first to find the words whose idf is
    negative, and the first document containing such a word is selected.
    The same pair of epsilon values is then passed to ``BM25``,
    ``iter_bm25_bow`` and ``get_bm25_weights``; in each case the result
    obtained with the smaller epsilon must compare greater than the result
    obtained with the larger epsilon.
    """
    corpus = [['cat', 'dog', 'mouse'], ['cat', 'lion'], ['cat', 'lion']]
    first_epsilon = 1.0
    second_epsilon = 2.0
    bm25 = BM25(corpus)
    words_with_negative_idfs = {
        word
        for word, idf in bm25.idf.items()
        if idf < 0
    }
    # Pick the first document that shares at least one word with the negative-idf set.
    index, doc = next(
        (index, document)
        for index, document
        in enumerate(corpus)
        if words_with_negative_idfs & set(document)
    )

    # assertGreater (rather than assertTrue(a > b)) reports both operands when the check fails.
    first_bm25 = BM25(corpus, epsilon=first_epsilon)
    second_bm25 = BM25(corpus, epsilon=second_epsilon)
    self.assertGreater(first_bm25.get_score(doc, index), second_bm25.get_score(doc, index))

    # Only the first yielded item is inspected; dict() makes it addressable by `index`.
    first_iter = iter_bm25_bow(corpus, epsilon=first_epsilon)
    second_iter = iter_bm25_bow(corpus, epsilon=second_epsilon)
    self.assertGreater(dict(next(iter(first_iter)))[index], dict(next(iter(second_iter)))[index])

    # Compares the entries stored at `index` in each returned collection.
    first_weights = get_bm25_weights(corpus, epsilon=first_epsilon)
    second_weights = get_bm25_weights(corpus, epsilon=second_epsilon)
    self.assertGreater(first_weights[index], second_weights[index])


if __name__ == '__main__':
logging.basicConfig(level=logging.DEBUG)
Expand Down