From 53ec11fee12ad201e560a1215dcba74d22dfdd43 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=ADt=20Novotn=C3=BD?= Date: Fri, 1 Apr 2022 01:00:45 +0200 Subject: [PATCH] Document gensim.models.bm25 --- .../core/run_topics_and_transformations.ipynb | 4 +- .../core/run_topics_and_transformations.py | 14 ++ .../run_topics_and_transformations.py.md5 | 2 +- .../core/run_topics_and_transformations.rst | 14 ++ .../core/run_topics_and_transformations.py | 14 ++ gensim/models/bm25model.py | 218 ++++++++++++++++-- gensim/test/test_bm25model.py | 16 +- 7 files changed, 248 insertions(+), 34 deletions(-) diff --git a/docs/src/auto_examples/core/run_topics_and_transformations.ipynb b/docs/src/auto_examples/core/run_topics_and_transformations.ipynb index c5a5fbb709..0f8db2830b 100644 --- a/docs/src/auto_examples/core/run_topics_and_transformations.ipynb +++ b/docs/src/auto_examples/core/run_topics_and_transformations.ipynb @@ -177,7 +177,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The next question might be: just how exactly similar are those documents to each other?\nIs there a way to formalize the similarity, so that for a given input document, we can\norder some other set of documents according to their similarity? Similarity queries\nare covered in the next tutorial (`sphx_glr_auto_examples_core_run_similarity_queries.py`).\n\n\nAvailable transformations\n--------------------------\n\nGensim implements several popular Vector Space Model algorithms:\n\n* `Term Frequency * Inverse Document Frequency, Tf-Idf `_\n expects a bag-of-words (integer values) training corpus during initialization.\n During transformation, it will take a vector and return another vector of the\n same dimensionality, except that features which were rare in the training corpus\n will have their value increased.\n It therefore converts integer-valued vectors into real-valued ones, while leaving\n the number of dimensions intact. It can also optionally normalize the resulting\n vectors to (Euclidean) unit length.\n\n .. sourcecode:: pycon\n\n model = models.TfidfModel(corpus, normalize=True)\n\n* `Latent Semantic Indexing, LSI (or sometimes LSA) `_\n transforms documents from either bag-of-words or (preferrably) TfIdf-weighted space into\n a latent space of a lower dimensionality. For the toy corpus above we used only\n 2 latent dimensions, but on real corpora, target dimensionality of 200--500 is recommended\n as a \"golden standard\" [1]_.\n\n .. sourcecode:: pycon\n\n model = models.LsiModel(tfidf_corpus, id2word=dictionary, num_topics=300)\n\n LSI training is unique in that we can continue \"training\" at any point, simply\n by providing more training documents. This is done by incremental updates to\n the underlying model, in a process called `online training`. Because of this feature, the\n input document stream may even be infinite -- just keep feeding LSI new documents\n as they arrive, while using the computed transformation model as read-only in the meanwhile!\n\n .. sourcecode:: pycon\n\n model.add_documents(another_tfidf_corpus) # now LSI has been trained on tfidf_corpus + another_tfidf_corpus\n lsi_vec = model[tfidf_vec] # convert some new document into the LSI space, without affecting the model\n\n model.add_documents(more_documents) # tfidf_corpus + another_tfidf_corpus + more_documents\n lsi_vec = model[tfidf_vec]\n\n See the :mod:`gensim.models.lsimodel` documentation for details on how to make\n LSI gradually \"forget\" old observations in infinite streams. 
If you want to get dirty,\n there are also parameters you can tweak that affect speed vs. memory footprint vs. numerical\n precision of the LSI algorithm.\n\n `gensim` uses a novel online incremental streamed distributed training algorithm (quite a mouthful!),\n which I published in [5]_. `gensim` also executes a stochastic multi-pass algorithm\n from Halko et al. [4]_ internally, to accelerate in-core part\n of the computations.\n See also `wiki` for further speed-ups by distributing the computation across\n a cluster of computers.\n\n* `Random Projections, RP `_ aim to\n reduce vector space dimensionality. This is a very efficient (both memory- and\n CPU-friendly) approach to approximating TfIdf distances between documents, by throwing in a little randomness.\n Recommended target dimensionality is again in the hundreds/thousands, depending on your dataset.\n\n .. sourcecode:: pycon\n\n model = models.RpModel(tfidf_corpus, num_topics=500)\n\n* `Latent Dirichlet Allocation, LDA `_\n is yet another transformation from bag-of-words counts into a topic space of lower\n dimensionality. LDA is a probabilistic extension of LSA (also called multinomial PCA),\n so LDA's topics can be interpreted as probability distributions over words. These distributions are,\n just like with LSA, inferred automatically from a training corpus. Documents\n are in turn interpreted as a (soft) mixture of these topics (again, just like with LSA).\n\n .. sourcecode:: pycon\n\n model = models.LdaModel(corpus, id2word=dictionary, num_topics=100)\n\n `gensim` uses a fast implementation of online LDA parameter estimation based on [2]_,\n modified to run in `distributed mode ` on a cluster of computers.\n\n* `Hierarchical Dirichlet Process, HDP `_\n is a non-parametric bayesian method (note the missing number of requested topics):\n\n .. sourcecode:: pycon\n\n model = models.HdpModel(corpus, id2word=dictionary)\n\n `gensim` uses a fast, online implementation based on [3]_.\n The HDP model is a new addition to `gensim`, and still rough around its academic edges -- use with care.\n\nAdding new :abbr:`VSM (Vector Space Model)` transformations (such as different weighting schemes) is rather trivial;\nsee the `apiref` or directly the `Python code `_\nfor more info and examples.\n\nIt is worth repeating that these are all unique, **incremental** implementations,\nwhich do not require the whole training corpus to be present in main memory all at once.\nWith memory taken care of, I am now improving `distributed`,\nto improve CPU efficiency, too.\nIf you feel you could contribute by testing, providing use-cases or code, see the `Gensim Developer guide `__.\n\nWhat Next?\n----------\n\nContinue on to the next tutorial on `sphx_glr_auto_examples_core_run_similarity_queries.py`.\n\nReferences\n----------\n\n.. [1] Bradford. 2008. An empirical study of required dimensionality for large-scale latent semantic indexing applications.\n\n.. [2] Hoffman, Blei, Bach. 2010. Online learning for Latent Dirichlet Allocation.\n\n.. [3] Wang, Paisley, Blei. 2011. Online variational inference for the hierarchical Dirichlet process.\n\n.. [4] Halko, Martinsson, Tropp. 2009. Finding structure with randomness.\n\n.. [5] \u0158eh\u016f\u0159ek. 2011. Subspace tracking for Latent Semantic Analysis.\n\n" + "The next question might be: just how exactly similar are those documents to each other?\nIs there a way to formalize the similarity, so that for a given input document, we can\norder some other set of documents according to their similarity? 
Similarity queries\nare covered in the next tutorial (`sphx_glr_auto_examples_core_run_similarity_queries.py`).\n\n\nAvailable transformations\n--------------------------\n\nGensim implements several popular Vector Space Model algorithms:\n\n* `Term Frequency * Inverse Document Frequency, Tf-Idf `_\n expects a bag-of-words (integer values) training corpus during initialization.\n During transformation, it will take a vector and return another vector of the\n same dimensionality, except that features which were rare in the training corpus\n will have their value increased.\n It therefore converts integer-valued vectors into real-valued ones, while leaving\n the number of dimensions intact. It can also optionally normalize the resulting\n vectors to (Euclidean) unit length.\n\n .. sourcecode:: pycon\n\n model = models.TfidfModel(corpus, normalize=True)\n\n* `Okapi Best Matching, Okapi BM25 `_\n expects a bag-of-words (integer values) training corpus during initialization.\n During transformation, it will take a vector and return another vector of the\n same dimensionality, except that features which were rare in the training corpus\n will have their value increased. It therefore converts integer-valued\n vectors into real-valued ones, while leaving the number of dimensions intact.\n\n Okapi BM25 is the standard ranking function used by search engines to estimate\n the relevance of documents to a given search query.\n\n .. sourcecode:: pycon\n\n model = models.OkapiBM25Model(corpus)\n\n* `Latent Semantic Indexing, LSI (or sometimes LSA) `_\n transforms documents from either bag-of-words or (preferrably) TfIdf-weighted space into\n a latent space of a lower dimensionality. For the toy corpus above we used only\n 2 latent dimensions, but on real corpora, target dimensionality of 200--500 is recommended\n as a \"golden standard\" [1]_.\n\n .. sourcecode:: pycon\n\n model = models.LsiModel(tfidf_corpus, id2word=dictionary, num_topics=300)\n\n LSI training is unique in that we can continue \"training\" at any point, simply\n by providing more training documents. This is done by incremental updates to\n the underlying model, in a process called `online training`. Because of this feature, the\n input document stream may even be infinite -- just keep feeding LSI new documents\n as they arrive, while using the computed transformation model as read-only in the meanwhile!\n\n .. sourcecode:: pycon\n\n model.add_documents(another_tfidf_corpus) # now LSI has been trained on tfidf_corpus + another_tfidf_corpus\n lsi_vec = model[tfidf_vec] # convert some new document into the LSI space, without affecting the model\n\n model.add_documents(more_documents) # tfidf_corpus + another_tfidf_corpus + more_documents\n lsi_vec = model[tfidf_vec]\n\n See the :mod:`gensim.models.lsimodel` documentation for details on how to make\n LSI gradually \"forget\" old observations in infinite streams. If you want to get dirty,\n there are also parameters you can tweak that affect speed vs. memory footprint vs. numerical\n precision of the LSI algorithm.\n\n `gensim` uses a novel online incremental streamed distributed training algorithm (quite a mouthful!),\n which I published in [5]_. `gensim` also executes a stochastic multi-pass algorithm\n from Halko et al. [4]_ internally, to accelerate in-core part\n of the computations.\n See also `wiki` for further speed-ups by distributing the computation across\n a cluster of computers.\n\n* `Random Projections, RP `_ aim to\n reduce vector space dimensionality. 
This is a very efficient (both memory- and\n CPU-friendly) approach to approximating TfIdf distances between documents, by throwing in a little randomness.\n Recommended target dimensionality is again in the hundreds/thousands, depending on your dataset.\n\n .. sourcecode:: pycon\n\n model = models.RpModel(tfidf_corpus, num_topics=500)\n\n* `Latent Dirichlet Allocation, LDA `_\n is yet another transformation from bag-of-words counts into a topic space of lower\n dimensionality. LDA is a probabilistic extension of LSA (also called multinomial PCA),\n so LDA's topics can be interpreted as probability distributions over words. These distributions are,\n just like with LSA, inferred automatically from a training corpus. Documents\n are in turn interpreted as a (soft) mixture of these topics (again, just like with LSA).\n\n .. sourcecode:: pycon\n\n model = models.LdaModel(corpus, id2word=dictionary, num_topics=100)\n\n `gensim` uses a fast implementation of online LDA parameter estimation based on [2]_,\n modified to run in `distributed mode ` on a cluster of computers.\n\n* `Hierarchical Dirichlet Process, HDP `_\n is a non-parametric bayesian method (note the missing number of requested topics):\n\n .. sourcecode:: pycon\n\n model = models.HdpModel(corpus, id2word=dictionary)\n\n `gensim` uses a fast, online implementation based on [3]_.\n The HDP model is a new addition to `gensim`, and still rough around its academic edges -- use with care.\n\nAdding new :abbr:`VSM (Vector Space Model)` transformations (such as different weighting schemes) is rather trivial;\nsee the `apiref` or directly the `Python code `_\nfor more info and examples.\n\nIt is worth repeating that these are all unique, **incremental** implementations,\nwhich do not require the whole training corpus to be present in main memory all at once.\nWith memory taken care of, I am now improving `distributed`,\nto improve CPU efficiency, too.\nIf you feel you could contribute by testing, providing use-cases or code, see the `Gensim Developer guide `__.\n\nWhat Next?\n----------\n\nContinue on to the next tutorial on `sphx_glr_auto_examples_core_run_similarity_queries.py`.\n\nReferences\n----------\n\n.. [1] Bradford. 2008. An empirical study of required dimensionality for large-scale latent semantic indexing applications.\n\n.. [2] Hoffman, Blei, Bach. 2010. Online learning for Latent Dirichlet Allocation.\n\n.. [3] Wang, Paisley, Blei. 2011. Online variational inference for the hierarchical Dirichlet process.\n\n.. [4] Halko, Martinsson, Tropp. 2009. Finding structure with randomness.\n\n.. [5] \u0158eh\u016f\u0159ek. 2011. Subspace tracking for Latent Semantic Analysis.\n\n" ] }, { @@ -213,4 +213,4 @@ }, "nbformat": 4, "nbformat_minor": 0 -} \ No newline at end of file +} diff --git a/docs/src/auto_examples/core/run_topics_and_transformations.py b/docs/src/auto_examples/core/run_topics_and_transformations.py index 605584084d..45888505e0 100644 --- a/docs/src/auto_examples/core/run_topics_and_transformations.py +++ b/docs/src/auto_examples/core/run_topics_and_transformations.py @@ -188,6 +188,20 @@ # # model = models.TfidfModel(corpus, normalize=True) # +# * `Okapi Best Matching, Okapi BM25 `_ +# expects a bag-of-words (integer values) training corpus during initialization. +# During transformation, it will take a vector and return another vector of the +# same dimensionality, except that features which were rare in the training corpus +# will have their value increased. 
It therefore converts integer-valued +# vectors into real-valued ones, while leaving the number of dimensions intact. +# +# Okapi BM25 is the standard ranking function used by search engines to estimate +# the relevance of documents to a given search query. +# +# .. sourcecode:: pycon +# +# model = models.OkapiBM25Model(corpus) +# # * `Latent Semantic Indexing, LSI (or sometimes LSA) `_ # transforms documents from either bag-of-words or (preferrably) TfIdf-weighted space into # a latent space of a lower dimensionality. For the toy corpus above we used only diff --git a/docs/src/auto_examples/core/run_topics_and_transformations.py.md5 b/docs/src/auto_examples/core/run_topics_and_transformations.py.md5 index 4ea3bee39d..cc92a5ab9b 100644 --- a/docs/src/auto_examples/core/run_topics_and_transformations.py.md5 +++ b/docs/src/auto_examples/core/run_topics_and_transformations.py.md5 @@ -1 +1 @@ -f49c3821bbacdeefdf3945d5dcb5ad01 \ No newline at end of file +226db24f9e807e4bbd2a6ef280a75510 diff --git a/docs/src/auto_examples/core/run_topics_and_transformations.rst b/docs/src/auto_examples/core/run_topics_and_transformations.rst index a5056ee4e3..097ea445c5 100644 --- a/docs/src/auto_examples/core/run_topics_and_transformations.rst +++ b/docs/src/auto_examples/core/run_topics_and_transformations.rst @@ -334,6 +334,20 @@ Gensim implements several popular Vector Space Model algorithms: model = models.TfidfModel(corpus, normalize=True) +* `Okapi Best Matching, Okapi BM25 `_ + expects a bag-of-words (integer values) training corpus during initialization. + During transformation, it will take a vector and return another vector of the + same dimensionality, except that features which were rare in the training corpus + will have their value increased. It therefore converts integer-valued + vectors into real-valued ones, while leaving the number of dimensions intact. + + Okapi BM25 is the standard ranking function used by search engines to estimate + the relevance of documents to a given search query. + + .. sourcecode:: pycon + + model = models.OkapiBM25Model(corpus) + * `Latent Semantic Indexing, LSI (or sometimes LSA) `_ transforms documents from either bag-of-words or (preferrably) TfIdf-weighted space into a latent space of a lower dimensionality. For the toy corpus above we used only diff --git a/docs/src/gallery/core/run_topics_and_transformations.py b/docs/src/gallery/core/run_topics_and_transformations.py index 605584084d..45888505e0 100644 --- a/docs/src/gallery/core/run_topics_and_transformations.py +++ b/docs/src/gallery/core/run_topics_and_transformations.py @@ -188,6 +188,20 @@ # # model = models.TfidfModel(corpus, normalize=True) # +# * `Okapi Best Matching, Okapi BM25 `_ +# expects a bag-of-words (integer values) training corpus during initialization. +# During transformation, it will take a vector and return another vector of the +# same dimensionality, except that features which were rare in the training corpus +# will have their value increased. It therefore converts integer-valued +# vectors into real-valued ones, while leaving the number of dimensions intact. +# +# Okapi BM25 is the standard ranking function used by search engines to estimate +# the relevance of documents to a given search query. +# +# .. sourcecode:: pycon +# +# model = models.OkapiBM25Model(corpus) +# # * `Latent Semantic Indexing, LSI (or sometimes LSA) `_ # transforms documents from either bag-of-words or (preferrably) TfIdf-weighted space into # a latent space of a lower dimensionality. 
For the toy corpus above we used only diff --git a/gensim/models/bm25model.py b/gensim/models/bm25model.py index 65c2de470f..8890aaefe8 100644 --- a/gensim/models/bm25model.py +++ b/gensim/models/bm25model.py @@ -4,6 +4,11 @@ """This module implements functionality related to the `Okapi Best Matching `_ class of bag-of-words vector space models. +Robertson and Zaragoza [1]_ describe the original algorithm and its modifications. + +.. [1] Robertson S., Zaragoza H. (2009). `The Probabilistic Relevance Framework: BM25 and + Beyond, `_. + """ from abc import ABCMeta, abstractmethod @@ -19,41 +24,150 @@ class BM25ABC(interfaces.TransformationABC, metaclass=ABCMeta): + """Objects of this abstract class realize the transformation from a word-document co-occurrence + matrix (integer counts) into a BM25 matrix (positive floats). Concrete subclasses of this + abstract class implement different BM25 scoring functions. + + """ def __init__(self, corpus=None, dictionary=None): + r"""Pre-compute the average document length and the inverse document frequencies, + which will be used to weight the term frequencies of the documents. + + Parameters + ---------- + corpus : iterable of iterable of (int, int) or None, optional + An input corpus, which will be used to compute the average document length and the + inverse document frequencies. If None, then `dictionary` will be used to compute the + statistics. If both `corpus` and `dictionary` are None, the statistics will be left + uninitialized. Default is None. + dictionary : :class:`~gensim.corpora.Dictionary` or None, optional + An input dictionary, which will be used to compute the average document length and the + inverse document frequencies. If None, then `corpus` will be used to compute the + statistics. If both `corpus` and `dictionary` are None, the statistics will be left + uninitialized. Default is None. + + Attributes + ---------- + avgdl : float + The average length of a document. + idfs : dict of (int, float) + A mapping from term ids to inverse document frequencies. + + """ self.avgdl, self.idfs = None, None if dictionary: if corpus: logger.warning("constructor received both corpus and dictionary; ignoring the corpus") - self.initialize_from_dictionary(dictionary) + num_tokens = sum(dictionary.cfs.values()) + self.avgdl = num_tokens / dictionary.num_docs + self.idfs = self.precompute_idfs(dictionary.dfs, dictionary.num_docs) elif corpus: - self.initialize_from_corpus(corpus) + dfs = defaultdict(int) + num_tokens = 0 + num_docs = 0 + for bow in corpus: + num_tokens += sum(frequency for _, frequency in bow) # document length in tokens, consistent with the dictionary branch + for term_id in set(term_id for term_id, _ in bow): + dfs[term_id] += 1 + num_docs += 1 + self.avgdl = num_tokens / num_docs + self.idfs = self.precompute_idfs(dfs, num_docs) else: pass - def initialize_from_dictionary(self, dictionary): - num_tokens = sum(dictionary.cfs.values()) - self.avgdl = num_tokens / dictionary.num_docs - self.idfs = self.precompute_idfs(dictionary.dfs, dictionary.num_docs) - - def initialize_from_corpus(self, corpus): - dfs = defaultdict(lambda: 0) - num_tokens = 0 - num_docs = 0 - for bow in corpus: - num_tokens += len(bow) - for term_id in set(term_id for term_id, _ in bow): - dfs[term_id] += 1 - num_docs += 1 - self.avgdl = num_tokens / num_docs - self.idfs = self.precompute_idfs(dfs, num_docs) - @abstractmethod def precompute_idfs(self, dfs, num_docs): + """Precompute the inverse document frequencies, which will be used to weight the term frequencies + of the documents.
+ + Parameters + ---------- + dfs : dict of (int, int) + A mapping from term ids to document frequencies. + num_docs : int + The total number of documents in the training corpus. + + Returns + ------- + idfs : dict of (int, float) + A mapping from term ids to inverse document frequencies. + + """ pass class OkapiBM25Model(BM25ABC): + """The original Okapi BM25 scoring function of Robertson et al. [2]_. + + Examples + -------- + .. sourcecode:: pycon + + >>> from gensim.models import OkapiBM25Model + >>> from gensim.corpora import Dictionary + >>> from gensim.corpora.textcorpus import TextCorpus + >>> from gensim.test.utils import datapath + >>> + >>> dataset = TextCorpus(datapath('testcorpus.txt')) + >>> dct = Dictionary(dataset.get_texts()) # fit dictionary + >>> corpus = [dct.doc2bow(line) for line in dataset.get_texts()] # convert corpus to BoW format + >>> + >>> model = OkapiBM25Model(corpus) # fit model + >>> vector = model[corpus[0]] # apply model to the first corpus document + + References + ---------- + .. [2] Robertson S. E., Walker S., Jones S., Hancock-Beaulieu M. M., Gatford M. (1995). + `Okapi at TREC-3 `_. + *NIST Special Publication 500-225*. + + """ def __init__(self, corpus=None, dictionary=None, k1=1.5, b=0.75, epsilon=0.25): + r"""Pre-compute the average document length and the inverse document frequencies, + which will be used to weight the term frequencies of the documents. + + Parameters + ---------- + corpus : iterable of iterable of (int, int) or None, optional + An input corpus, which will be used to compute the average document length and the + inverse document frequencies. If None, then `dictionary` will be used to compute + the statistics. If both `corpus` and `dictionary` are None, the statistics will be left + uninitialized. Default is None. + dictionary : :class:`~gensim.corpora.Dictionary` or None, optional + An input dictionary, which will be used to compute the average document length and the + inverse document frequencies. If None, then `corpus` will be used to compute the + statistics. If both `corpus` and `dictionary` are None, the statistics will be left + uninitialized. Default is None. + k1 : float + A positive tuning parameter that determines the impact of the term frequency on its BM25 + weight. Singhal [3]_ suggests setting `k1` between 1.0 and 2.0. Default is 1.5. + b : float + A tuning parameter between 0.0 and 1.0 that determines the document-length + normalization: 1.0 corresponds to full length normalization, while 0.0 corresponds to + no length normalization. Singhal [3]_ suggests setting `b` to 0.75, which is the default. + epsilon : float + A positive tuning parameter that lower-bounds the inverse document frequency of a term. + Default is 0.25. + + Attributes + ---------- + k1 : float + A positive tuning parameter that determines the impact of the term frequency on its BM25 + weight. Singhal [3]_ suggests setting `k1` between 1.0 and 2.0. Default is 1.5. + b : float + A tuning parameter between 0.0 and 1.0 that determines the document-length + normalization: 1.0 corresponds to full length normalization, while 0.0 corresponds to + no length normalization. Singhal [3]_ suggests setting `b` to 0.75, which is the default. + epsilon : float + A positive tuning parameter that lower-bounds the inverse document frequency of a term. + Default is 0.25. + + References + ---------- + .. [3] Singhal, A. (2001). `Modern information retrieval: A brief overview + `_. *IEEE Data Eng. Bull.*, 24(4), 35–43.
+ + """ self.k1, self.b, self.epsilon = k1, b, epsilon super().__init__(corpus, dictionary) @@ -102,7 +216,73 @@ def __getitem__(self, bow): class BM25PlusModel(BM25ABC): + """The BM25+ scoring function of Lv and Zhai [4]_. + + Examples + -------- + .. sourcecode:: pycon + + >>> from gensim.models import BM25PlusModel + >>> from gensim.corpora import Dictionary + >>> from gensim.corpora.textcorpus import TextCorpus + >>> from gensim.test.utils import datapath + >>> + >>> dataset = TextCorpus(datapath('testcorpus.txt')) + >>> dct = Dictionary(dataset.get_texts()) # fit dictionary + >>> corpus = [dct.doc2bow(line) for line in dataset.get_texts()] # convert corpus to BoW format + >>> + >>> model = BM25PlusModel(corpus) # fit model + >>> vector = model[corpus[0]] # apply model to the first corpus document + + References + ---------- + .. [4] Lv Y., Zhai C. (2011). + `Lower-bounding term frequency normalization `_. + In *Proceedings of the 20th ACM International Conference on Information and Knowledge + Management (CIKM '11)*, 7–16. + + """ def __init__(self, corpus=None, dictionary=None, k1=1.5, b=0.75, delta=1.0): + r"""Pre-compute the average document length and the inverse document frequencies, + which will be used to weight the term frequencies of the documents. + + Parameters + ---------- + corpus : iterable of iterable of (int, int) or None, optional + An input corpus, which will be used to compute the average document length and the + inverse document frequencies. If None, then `dictionary` will be used to compute + the statistics. If both `corpus` and `dictionary` are None, the statistics will be left + uninitialized. Default is None. + dictionary : :class:`~gensim.corpora.Dictionary` or None, optional + An input dictionary, which will be used to compute the average document length and the + inverse document frequencies. If None, then `corpus` will be used to compute the + statistics. If both `corpus` and `dictionary` are None, the statistics will be left + uninitialized. Default is None. + k1 : float + A positive tuning parameter that determines the impact of the term frequency on its BM25 + weight. Singhal [3]_ suggests setting `k1` between 1.0 and 2.0. Default is 1.5. + b : float + A tuning parameter between 0.0 and 1.0 that determines the document-length + normalization: 1.0 corresponds to full length normalization, while 0.0 corresponds to + no length normalization. Singhal [3]_ suggests setting `b` to 0.75, which is the default. + delta : float + A tuning parameter that lower-bounds the weight of a term. Lv and Zhai [4]_ suggest + setting `delta` to 1.0, which is the default. + + Attributes + ---------- + k1 : float + A positive tuning parameter that determines the impact of the term frequency on its BM25 + weight. Singhal [3]_ suggests setting `k1` between 1.0 and 2.0. Default is 1.5. + b : float + A tuning parameter between 0.0 and 1.0 that determines the document-length + normalization: 1.0 corresponds to full length normalization, while 0.0 corresponds to + no length normalization. Singhal [3]_ suggests setting `b` to 0.75, which is the default. + delta : float + A tuning parameter that lower-bounds the weight of a term. Lv and Zhai [4]_ suggest + setting `delta` to 1.0, which is the default. + + References + ---------- + .. [3] Singhal, A. (2001). `Modern information retrieval: A brief overview + `_. *IEEE Data Eng. Bull.*, 24(4), 35–43. + .. [4] Lv Y., Zhai C. (2011). `Lower-bounding term frequency normalization `_. + In *Proceedings of the 20th ACM International Conference on Information and Knowledge + Management (CIKM '11)*, 7–16.
+ + """ self.k1, self.b, self.delta = k1, b, delta super().__init__(corpus, dictionary) diff --git a/gensim/test/test_bm25model.py b/gensim/test/test_bm25model.py index bd1da736eb..561360c49d 100644 --- a/gensim/test/test_bm25model.py +++ b/gensim/test/test_bm25model.py @@ -48,18 +48,10 @@ def get_idf(word): frequency = sum(map(lambda document: word in document, self.documents)) return math.log(len(self.documents) - frequency + 0.5) - math.log(frequency + 0.5) - dog_idf = get_idf('dog') - cat_idf = get_idf('cat') - mouse_idf = get_idf('mouse') - lion_idf = get_idf('lion') - - average_idf = (dog_idf + cat_idf + mouse_idf + lion_idf) / len(self.dictionary) - eps = self.epsilon * average_idf - - self.expected_dog_idf = dog_idf if dog_idf > 0 else eps - self.expected_cat_idf = cat_idf if cat_idf > 0 else eps - self.expected_mouse_idf = mouse_idf if mouse_idf > 0 else eps - self.expected_lion_idf = lion_idf if lion_idf > 0 else eps + self.expected_dog_idf = get_idf('dog') + self.expected_cat_idf = get_idf('cat') + self.expected_mouse_idf = get_idf('mouse') + self.expected_lion_idf = get_idf('lion') def test_idfs_from_corpus(self): corpus = list(map(self.dictionary.doc2bow, self.documents))
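The hunks above document `precompute_idfs` and `__getitem__` but do not show their bodies, so here is a minimal sketch of the textbook Okapi BM25 weighting for orientation. The idf expression matches the one exercised by the updated `test_bm25model.py`; `okapi_idf`, `okapi_term_weight`, and their arguments are illustrative names rather than identifiers from this patch, and the `epsilon` lower-bounding of idfs (see the `epsilon` parameter above) is omitted for brevity.

.. sourcecode:: python

    import math

    def okapi_idf(df, num_docs):
        # Robertson/Sparck Jones inverse document frequency, the expression
        # asserted by the updated tests: log((num_docs - df + 0.5) / (df + 0.5)).
        return math.log(num_docs - df + 0.5) - math.log(df + 0.5)

    def okapi_term_weight(tf, doc_len, avgdl, idf, k1=1.5, b=0.75):
        # Okapi BM25 weight of a single term in a document: the idf is scaled by
        # a saturating function of the term frequency tf, normalized by the ratio
        # of the document length doc_len to the average document length avgdl.
        return idf * (tf * (k1 + 1)) / (tf + k1 * (1 - b + b * doc_len / avgdl))

    # For example, a term with document frequency 1 in a corpus of 10 documents,
    # occurring twice in a document of average length:
    weight = okapi_term_weight(tf=2, doc_len=15, avgdl=15.0, idf=okapi_idf(df=1, num_docs=10))

`BM25PlusModel` differs by adding `delta` to the saturated term-frequency factor before scaling by the idf, which lower-bounds the weight of every term that occurs in a document (Lv and Zhai, 2011).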