From b30888342e454a58ea000edc8b985a0b7ac5451b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=ADt=20Novotn=C3=BD?= Date: Fri, 7 Aug 2020 15:01:58 +0200 Subject: [PATCH] Reduce memory use of the term similarity matrix constructor, deprecate the positive_definite parameter, and extend normalization capabilities of the inner_product method (#2783) * Deprecate SparseTermSimilarityMatrix's positive_definite parameter * Reference paper on efficient implementation of soft cosine similarity * Add example with Annoy indexer to SparseTermSimilarityMatrix * Add example of obtaining word embeddings from SparseTermSimilarityMatrix * Reduce space complexity of SparseTermSimilarityMatrix construction Build matrix using arrays and bitfields rather than DOK sparse format This work is based on the following blog post by @maciejkula: https://maciejkula.github.io/2015/02/22/incremental-construction-of-sparse-matrices/ * Fix a typo in the soft cosine similarity Jupyter notebook * Add human-readable string representation for TermSimilarityIndex * Avoid sparse term similarity matrix computation when nonzero_limit <= 0 * Extend normalization in the inner_product method Support the `maintain` vector normalization scheme. Support separate vector normalization schemes for queries and documents. * Remove a note in the docstring of SparseTermSimilarityMatrix * Rerun continuous integration tests * Use ==/!= to compare constant literals * Add human-readable string representation for TermSimilarityIndex (cont.) * Prod flake8 with a coding style violation in a docstring * Collapse two lambdas into one internal function * Revert "Prod flake8 with a coding style violation in a docstring" This reverts commit 6557b849732b314570ea9d5132f1731d964e2fe6. 
* Avoid str.format() * Slice SparseTermSimilarityMatrix.inner_product tests by input types * Remove similarity_type_code local variable * Remove starting underscore from local function name * Save indentation level and define populate_buffers function * Extract SparseTermSimilarityMatrix constructor body to _create_source * Extract NON_NEGATIVE_NORM_ASSERTION_MESSAGE to a module-level constant * Extract cell assignment logic to cell_full local function * Split variable swapping into three separate statements * Extract normalization from the body of SparseTermSimilarityMatrix.inner_product * Wrap overlong line * Add test_inner_product_zerovector_zerovector and test_inner_product_zerovector_vector tests * Further split test_inner_product into 63 test cases * Raise ValueError when dictionary is empty --- docs/notebooks/soft_cosine_tutorial.ipynb | 4 +- gensim/similarities/docsim.py | 2 +- gensim/similarities/termsim.py | 527 +++++++++++++------- gensim/test/test_similarities.py | 562 ++++++++++++++++++++-- 4 files changed, 862 insertions(+), 233 deletions(-) diff --git a/docs/notebooks/soft_cosine_tutorial.ipynb b/docs/notebooks/soft_cosine_tutorial.ipynb index 358c80ed02..aadbecf6a5 100644 --- a/docs/notebooks/soft_cosine_tutorial.ipynb +++ b/docs/notebooks/soft_cosine_tutorial.ipynb @@ -225,7 +225,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Number of documents: 3\n", + "Number of documents: 2274338\n", "CPU times: user 2min 1s, sys: 1.9 s, total: 2min 3s\n", "Wall time: 2min 56s\n" ] @@ -259,7 +259,7 @@ " [preprocess(relcomment[\"RelCText\"]) for relcomment in thread[\"RelComments\"]])\n", " for thread in api.load(\"semeval-2016-2017-task3-subtaskA-unannotated\")]))\n", "\n", - "print(\"Number of documents: %d\" % len(documents))" + "print(\"Number of documents: %d\" % len(corpus))" ] }, { diff --git a/gensim/similarities/docsim.py b/gensim/similarities/docsim.py index daba706eb1..b014952499 100755 --- a/gensim/similarities/docsim.py +++ 
b/gensim/similarities/docsim.py @@ -978,7 +978,7 @@ def get_similarities(self, query): is_corpus, query = utils.is_corpus(query) if not is_corpus and isinstance(query, numpy.ndarray): query = [self.corpus[i] for i in query] # convert document indexes to actual documents - result = self.similarity_matrix.inner_product(query, self.corpus, normalized=True) + result = self.similarity_matrix.inner_product(query, self.corpus, normalized=(True, True)) if scipy.sparse.issparse(result): return numpy.asarray(result.todense()) diff --git a/gensim/similarities/termsim.py b/gensim/similarities/termsim.py index c0e61e1490..3dcd4c6ae6 100644 --- a/gensim/similarities/termsim.py +++ b/gensim/similarities/termsim.py @@ -8,11 +8,14 @@ This module provides classes that deal with term similarities. """ +from array import array from itertools import chain import logging from math import sqrt +import warnings import numpy as np +from six.moves import range from scipy import sparse from gensim.matutils import corpus2csc @@ -20,6 +23,10 @@ logger = logging.getLogger(__name__) +NON_NEGATIVE_NORM_ASSERTION_MESSAGE = u"sparse documents must not contain any explicit " \ + u"zero entries and the similarity matrix S must satisfy x^T * S * x >= 0 for any " \ + u"nonzero bag-of-words vector x." + class TermSimilarityIndex(SaveLoad): """ @@ -52,6 +59,10 @@ def most_similar(self, term, topn=10): """ raise NotImplementedError + def __str__(self): + members = ', '.join('%s=%s' % pair for pair in vars(self).items()) + return '%s(%s)' % (self.__class__.__name__, members) + class UniformTermSimilarityIndex(TermSimilarityIndex): """ @@ -86,33 +97,6 @@ def most_similar(self, t1, topn=10): yield (t2, self.term_similarity) -def _shortest_uint_dtype(max_value): - """Get the shortest unsingned integer data-type required for representing values up to a given - maximum value. - - Returns the shortest unsingned integer data-type required for representing values up to a given - maximum value. 
- - Parameters - ---------- - max_value : int - The maximum value we wish to represent. - - Returns - ------- - data-type - The shortest unsigned integer data-type required for representing values up to a given - maximum value. - """ - if max_value < 2**8: - return np.uint8 - elif max_value < 2**16: - return np.uint16 - elif max_value < 2**32: - return np.uint32 - return np.uint64 - - class WordEmbeddingSimilarityIndex(TermSimilarityIndex): """ Use objects of this class to: @@ -156,32 +140,288 @@ def most_similar(self, t1, topn=10): yield (t2, similarity**self.exponent) +def _shortest_uint_dtype(max_value): + """Get the shortest unsigned integer data-type required for representing values up to a given + maximum value. + + Returns the shortest unsigned integer data-type required for representing values up to a given + maximum value. + + Parameters + ---------- + max_value : int + The maximum value we wish to represent. + + Returns + ------- + data-type + The shortest unsigned integer data-type required for representing values up to a given + maximum value. + """ + if max_value < 2**8: + return np.uint8 + elif max_value < 2**16: + return np.uint16 + elif max_value < 2**32: + return np.uint32 + return np.uint64 + + +def _create_source(index, dictionary, tfidf, symmetric, dominant, nonzero_limit, dtype): + """Build a sparse term similarity matrix using a term similarity index. + + Returns + ------- + matrix : :class:`scipy.sparse.coo_matrix` + The sparse term similarity matrix. 
+ + """ + assert isinstance(index, TermSimilarityIndex) + assert dictionary is not None + matrix_order = len(dictionary) + + if matrix_order == 0: + raise ValueError('Dictionary provided to SparseTermSimilarityMatrix must not be empty') + + logger.info("constructing a sparse term similarity matrix using %s", index) + + if nonzero_limit is None: + nonzero_limit = matrix_order + + def tfidf_sort_key(term_index): + if isinstance(term_index, tuple): + term_index, *_ = term_index + term_idf = tfidf.idfs[term_index] + return (-term_idf, term_index) + + if tfidf is None: + logger.info("iterating over columns in dictionary order") + columns = sorted(dictionary.keys()) + else: + assert max(tfidf.idfs) == matrix_order - 1 + logger.info("iterating over columns in tf-idf order") + columns = sorted(tfidf.idfs.keys(), key=tfidf_sort_key) + + nonzero_counter_dtype = _shortest_uint_dtype(nonzero_limit) + + column_nonzero = np.array([0] * matrix_order, dtype=nonzero_counter_dtype) + if dominant: + column_sum = np.zeros(matrix_order, dtype=dtype) + if symmetric: + assigned_cells = set() + row_buffer = array('Q') + column_buffer = array('Q') + if dtype is np.float16 or dtype is np.float32: + data_buffer = array('f') + elif dtype is np.float64: + data_buffer = array('d') + else: + raise ValueError('Dtype %s is unsupported, use numpy.float16, float32, or float64.' 
% dtype) + + def cell_full(t1_index, t2_index, similarity): + if dominant and column_sum[t1_index] + abs(similarity) >= 1.0: + return True # after adding the similarity, the matrix would cease to be strongly diagonally dominant + assert column_nonzero[t1_index] <= nonzero_limit + if column_nonzero[t1_index] == nonzero_limit: + return True # after adding the similarity, the column would contain more than nonzero_limit elements + if symmetric and (t1_index, t2_index) in assigned_cells: + return True # a similarity has already been assigned to this cell + return False + + def populate_buffers(t1_index, t2_index, similarity): + column_buffer.append(t1_index) + row_buffer.append(t2_index) + data_buffer.append(similarity) + column_nonzero[t1_index] += 1 + if symmetric: + assigned_cells.add((t1_index, t2_index)) + if dominant: + column_sum[t1_index] += abs(similarity) + + try: + from tqdm import tqdm as progress_bar + except ImportError: + def progress_bar(iterable): + return iterable + + for column_number, t1_index in enumerate(progress_bar(columns)): + column_buffer.append(column_number) + row_buffer.append(column_number) + data_buffer.append(1.0) + + if nonzero_limit <= 0: + continue + + t1 = dictionary[t1_index] + num_nonzero = column_nonzero[t1_index] + num_rows = nonzero_limit - num_nonzero + most_similar = [ + (dictionary.token2id[term], similarity) + for term, similarity in index.most_similar(t1, topn=num_rows) + if term in dictionary.token2id + ] if num_rows > 0 else [] + + if tfidf is None: + rows = sorted(most_similar) + else: + rows = sorted(most_similar, key=tfidf_sort_key) + + for t2_index, similarity in rows: + if cell_full(t1_index, t2_index, similarity): + continue + if not symmetric: + populate_buffers(t1_index, t2_index, similarity) + elif not cell_full(t2_index, t1_index, similarity): + populate_buffers(t1_index, t2_index, similarity) + populate_buffers(t2_index, t1_index, similarity) + + data_buffer = np.frombuffer(data_buffer, dtype=dtype) + 
row_buffer = np.frombuffer(row_buffer, dtype=np.uint64) + column_buffer = np.frombuffer(column_buffer, dtype=np.uint64) + matrix = sparse.coo_matrix((data_buffer, (row_buffer, column_buffer)), shape=(matrix_order, matrix_order)) + + logger.info( + "constructed a sparse term similarity matrix with %0.06f%% density", + 100.0 * matrix.getnnz() / matrix_order**2, + ) + + return matrix + + +def _normalize_dense_vector(vector, matrix, normalization): + """Normalize a dense vector after a change of basis. + + Parameters + ---------- + vector : 1xN ndarray + A dense vector. + matrix : NxN ndarray + A change-of-basis matrix. + normalization : {True, False, 'maintain'} + Whether the vector will be L2-normalized (True; corresponds to the soft + cosine measure), maintain its L2-norm during the change of basis + ('maintain'; corresponds to query expansion with partial membership), + or kept as-is (False; corresponds to query expansion). + + Returns + ------- + vector : ndarray + The normalized dense vector. + + """ + if not normalization: + return vector + + vector_norm = vector.T.dot(matrix).dot(vector)[0, 0] + assert vector_norm >= 0.0, NON_NEGATIVE_NORM_ASSERTION_MESSAGE + if normalization == 'maintain' and vector_norm > 0.0: + vector_norm /= vector.T.dot(vector) + vector_norm = sqrt(vector_norm) + + normalized_vector = vector + if vector_norm > 0.0: + normalized_vector /= vector_norm + + return normalized_vector + + +def _normalize_dense_corpus(corpus, matrix, normalization): + """Normalize a dense corpus after a change of basis. + + Parameters + ---------- + corpus : MxN ndarray + A dense corpus. + matrix : NxN ndarray + A change-of-basis matrix. + normalization : {True, False, 'maintain'} + Whether the vector will be L2-normalized (True; corresponds to the soft + cosine measure), maintain its L2-norm during the change of basis + ('maintain'; corresponds to query expansion with partial membership), + or kept as-is (False; corresponds to query expansion). 
+ + Returns + ------- + normalized_corpus : ndarray + The normalized dense corpus. + + """ + if not normalization: + return corpus + + # use the following equality: np.diag(A.T.dot(B).dot(A)) == A.T.dot(B).multiply(A.T).sum(axis=1).T + corpus_norm = np.multiply(corpus.T.dot(matrix), corpus.T).sum(axis=1).T + assert corpus_norm.min() >= 0.0, NON_NEGATIVE_NORM_ASSERTION_MESSAGE + if normalization == 'maintain': + corpus_norm /= np.multiply(corpus.T, corpus.T).sum(axis=1).T + corpus_norm = np.sqrt(corpus_norm) + + normalized_corpus = np.multiply(corpus, 1.0 / corpus_norm) + normalized_corpus = np.nan_to_num(normalized_corpus) # account for division by zero + return normalized_corpus + + +def _normalize_sparse_corpus(corpus, matrix, normalization): + """Normalize a sparse corpus after a change of basis. + + Parameters + ---------- + corpus : MxN :class:`scipy.sparse.csc_matrix` + A sparse corpus. + matrix : NxN :class:`scipy.sparse.csc_matrix` + A change-of-basis matrix. + normalization : {True, False, 'maintain'} + Whether the vector will be L2-normalized (True; corresponds to the soft + cosine measure), maintain its L2-norm during the change of basis + ('maintain'; corresponds to query expansion with partial membership), + or kept as-is (False; corresponds to query expansion). + + Returns + ------- + normalized_corpus : :class:`scipy.sparse.csc_matrix` + The normalized sparse corpus. 
+ + """ + if not normalization: + return corpus + + # use the following equality: np.diag(A.T.dot(B).dot(A)) == A.T.dot(B).multiply(A.T).sum(axis=1).T + corpus_norm = corpus.T.dot(matrix).multiply(corpus.T).sum(axis=1).T + assert corpus_norm.min() >= 0.0, NON_NEGATIVE_NORM_ASSERTION_MESSAGE + if normalization == 'maintain': + corpus_norm /= corpus.T.multiply(corpus.T).sum(axis=1).T + corpus_norm = np.sqrt(corpus_norm) + + normalized_corpus = corpus.multiply(sparse.csr_matrix(1.0 / corpus_norm)) + normalized_corpus[normalized_corpus == np.inf] = 0 # account for division by zero + return normalized_corpus + + class SparseTermSimilarityMatrix(SaveLoad): """ Builds a sparse term similarity matrix using a term similarity index. - Notes - ----- - Building a DOK matrix, and converting it to a CSC matrix carries a significant memory overhead. - Future work should switch to building arrays of rows, columns, and non-zero elements and - directly passing these arrays to the CSC matrix constructor without copying. 
- Examples -------- >>> from gensim.test.utils import common_texts >>> from gensim.corpora import Dictionary >>> from gensim.models import Word2Vec, WordEmbeddingSimilarityIndex >>> from gensim.similarities import SoftCosineSimilarity, SparseTermSimilarityMatrix + >>> from gensim.similarities.index import AnnoyIndexer + >>> from scikits.sparse.cholmod import cholesky >>> >>> model = Word2Vec(common_texts, size=20, min_count=1) # train word-vectors - >>> termsim_index = WordEmbeddingSimilarityIndex(model.wv) + >>> annoy = AnnoyIndexer(model, num_trees=2) # use annoy for faster word similarity lookups + >>> termsim_index = WordEmbeddingSimilarityIndex(model.wv, kwargs={'indexer': annoy}) >>> dictionary = Dictionary(common_texts) >>> bow_corpus = [dictionary.doc2bow(document) for document in common_texts] - >>> similarity_matrix = SparseTermSimilarityMatrix(termsim_index, dictionary) # construct similarity matrix + >>> similarity_matrix = SparseTermSimilarityMatrix(termsim_index, dictionary, symmetric=True, dominant=True) >>> docsim_index = SoftCosineSimilarity(bow_corpus, similarity_matrix, num_best=10) >>> >>> query = 'graph trees computer'.split() # make a query >>> sims = docsim_index[dictionary.doc2bow(query)] # calculate similarity of query to each doc from bow_corpus + >>> + >>> word_embeddings = cholesky(similarity_matrix.matrix).L() # obtain word embeddings from similarity matrix Check out `Tutorial Notebook `_ @@ -192,125 +432,64 @@ class SparseTermSimilarityMatrix(SaveLoad): source : :class:`~gensim.similarities.termsim.TermSimilarityIndex` or :class:`scipy.sparse.spmatrix` The source of the term similarity. Either a term similarity index that will be used for building the term similarity matrix, or an existing sparse term similarity matrix that will - be encapsulated and stored in the matrix attribute. + be encapsulated and stored in the matrix attribute. When a matrix is specified as the + source, any other parameters will be ignored. 
dictionary : :class:`~gensim.corpora.dictionary.Dictionary` or None, optional A dictionary that specifies a mapping between terms and the indices of rows and columns - of the resulting term similarity matrix. The dictionary may only be `None` when `source` is + of the resulting term similarity matrix. The dictionary may only be None when source is a :class:`scipy.sparse.spmatrix`. tfidf : :class:`gensim.models.tfidfmodel.TfidfModel` or None, optional A model that specifies the relative importance of the terms in the dictionary. The columns of the term similarity matrix will be build in a decreasing order of importance of terms, or in the order of term identifiers if None. symmetric : bool, optional - Whether the symmetry of the term similarity matrix will be enforced. This parameter only has - an effect when `source` is a :class:`scipy.sparse.spmatrix`. Positive definiteness is a - necessary precondition if you later wish to derive a change-of-basis matrix from the term - similarity matrix using Cholesky factorization. - positive_definite: bool, optional - Whether the positive definiteness of the term similarity matrix will be enforced through - strict column diagonal dominance. Positive definiteness is a necessary precondition if you - later wish to derive a change-of-basis matrix from the term similarity matrix using Cholesky - factorization. + Whether the symmetry of the term similarity matrix will be enforced. Symmetry is a necessary + precondition for positive definiteness, which is necessary if you later wish to derive a + unique change-of-basis matrix from the term similarity matrix using Cholesky factorization. + Setting symmetric to False will significantly reduce memory usage during matrix construction. + dominant: bool, optional + Whether the strict column diagonal dominance of the term similarity matrix will be enforced. 
+ Strict diagonal dominance and symmetry are sufficient preconditions for positive + definiteness, which is necessary if you later wish to derive a change-of-basis matrix from + the term similarity matrix using Cholesky factorization. nonzero_limit : int or None, optional The maximum number of non-zero elements outside the diagonal in a single column of the sparse term similarity matrix. If None, then no limit will be imposed. dtype : numpy.dtype, optional - Data-type of the sparse term similarity matrix. + The data type of the sparse term similarity matrix. + positive_definite: bool or None, optional + A deprecated alias for dominant. Attributes ---------- matrix : :class:`scipy.sparse.csc_matrix` The encapsulated sparse term similarity matrix. - """ - PROGRESS_MESSAGE_PERIOD = 1000 # how many columns are processed between progress messages - def __init__(self, source, dictionary=None, tfidf=None, symmetric=True, positive_definite=False, nonzero_limit=100, - dtype=np.float32): - if sparse.issparse(source): - self.matrix = source.tocsc() # encapsulate the passed sparse matrix - return + Raises + ------ + ValueError + If `dictionary` is empty. 
+ + """ + def __init__(self, source, dictionary=None, tfidf=None, symmetric=True, dominant=False, + nonzero_limit=100, dtype=np.float32, positive_definite=None): - index = source - assert isinstance(index, TermSimilarityIndex) - assert dictionary is not None - matrix_order = len(dictionary) + if positive_definite is not None: + warnings.warn( + 'Parameter positive_definite will be removed in 4.0.0, use dominant instead', + category=DeprecationWarning, + ) + dominant = positive_definite - logger.info("constructing a sparse term similarity matrix using %s", index) + if not sparse.issparse(source): + index = source + args = (index, dictionary, tfidf, symmetric, dominant, nonzero_limit, dtype) + source = _create_source(*args) + assert sparse.issparse(source) - if nonzero_limit is None: - nonzero_limit = matrix_order + self.matrix = source.tocsc() - if tfidf is None: - logger.info("iterating over columns in dictionary order") - columns = sorted(dictionary.keys()) - else: - assert max(tfidf.idfs) == matrix_order - 1 - logger.info("iterating over columns in tf-idf order") - columns = [ - term_index for term_index, _ - in sorted( - tfidf.idfs.items(), - key=lambda x: (lambda term_index, term_idf: (term_idf, -term_index))(*x), reverse=True)] - - column_nonzero = np.array([0] * matrix_order, dtype=_shortest_uint_dtype(nonzero_limit)) - column_sum = np.zeros(matrix_order, dtype=dtype) - matrix = sparse.identity(matrix_order, dtype=dtype, format="dok") - - for column_number, t1_index in enumerate(columns): - if column_number % self.PROGRESS_MESSAGE_PERIOD == 0: - logger.info( - "PROGRESS: at %.02f%% columns (%d / %d, %.06f%% density, " - "%.06f%% projected density)", - 100.0 * (column_number + 1) / matrix_order, column_number + 1, matrix_order, - 100.0 * matrix.getnnz() / matrix_order**2, - 100.0 * np.clip( - (1.0 * (matrix.getnnz() - matrix_order) / matrix_order**2) - * (1.0 * matrix_order / (column_number + 1)) - + (1.0 / matrix_order), # add density correspoding to the main 
diagonal - 0.0, 1.0)) - - t1 = dictionary[t1_index] - num_nonzero = column_nonzero[t1_index] - num_rows = nonzero_limit - num_nonzero - most_similar = [ - (dictionary.token2id[term], similarity) - for term, similarity in index.most_similar(t1, topn=num_rows) - if term in dictionary.token2id - ] if num_rows > 0 else [] - - if tfidf is None: - rows = sorted(most_similar) - else: - rows = sorted( - most_similar, - key=lambda x: (lambda term_index, _: (tfidf.idfs[term_index], -term_index))(*x), reverse=True) - - for row_number, (t2_index, similarity) in zip(range(num_rows), rows): - if positive_definite and column_sum[t1_index] + abs(similarity) >= 1.0: - break - if symmetric: - if column_nonzero[t2_index] < nonzero_limit \ - and (not positive_definite or column_sum[t2_index] + abs(similarity) < 1.0) \ - and not (t1_index, t2_index) in matrix: - matrix[t1_index, t2_index] = similarity - column_nonzero[t1_index] += 1 - column_sum[t1_index] += abs(similarity) - matrix[t2_index, t1_index] = similarity - column_nonzero[t2_index] += 1 - column_sum[t2_index] += abs(similarity) - else: - matrix[t1_index, t2_index] = similarity - column_sum[t1_index] += abs(similarity) - - logger.info( - "constructed a sparse term similarity matrix with %0.06f%% density", - 100.0 * matrix.getnnz() / matrix_order**2) - - matrix = matrix.T - assert sparse.issparse(matrix) - self.__init__(matrix) - - def inner_product(self, X, Y, normalized=False): + def inner_product(self, X, Y, normalized=(False, False)): """Get the inner product(s) between real vectors / corpora X and Y. Return the inner product(s) between real vectors / corpora vec1 and vec2 expressed in a @@ -323,10 +502,11 @@ def inner_product(self, X, Y, normalized=False): A query vector / corpus in the sparse bag-of-words format. vec2 : list of (int, float) or iterable of list of (int, float) A document vector / corpus in the sparse bag-of-words format. - normalized : bool, optional - Whether the inner product should be L2-normalized. 
The normalized inner product - corresponds to the Soft Cosine Measure (SCM). SCM is a number between <-1.0, 1.0>, - where higher is more similar. + normalized : tuple of {True, False, 'maintain'}, optional + First/second value specifies whether the query/document vectors in the inner product + will be L2-normalized (True; corresponds to the soft cosine measure), maintain their + L2-norm during change of basis ('maintain'; corresponds to query expansion with partial + membership), or kept as-is (False; corresponds to query expansion; default). Returns ------- @@ -336,14 +516,35 @@ def inner_product(self, X, Y, normalized=False): References ---------- The soft cosine measure was perhaps first described by [sidorovetal14]_. + Further notes on the efficient implementation of the soft cosine measure are described by + [novotny18]_. .. [sidorovetal14] Grigori Sidorov et al., "Soft Similarity and Soft Cosine Measure: Similarity of Features in Vector Space Model", 2014, http://www.cys.cic.ipn.mx/ojs/index.php/CyS/article/view/2043/1921. + .. [novotny18] Vít Novotný, "Implementation Notes for the Soft Cosine Measure", 2018, + http://dx.doi.org/10.1145/3269206.3269317. 
+ """ if not X or not Y: return self.matrix.dtype.type(0.0) + if normalized in (True, False): + warnings.warn( + 'Boolean parameter normalized will be removed in 4.0.0, use ' + 'normalized=(%s, %s) instead of normalized=%s' % tuple([normalized] * 3), + category=DeprecationWarning, + ) + normalized = (normalized, normalized) + + normalized_X, normalized_Y = normalized + valid_normalized_values = (True, False, 'maintain') + + if normalized_X not in valid_normalized_values: + raise ValueError('{} is not a valid value of normalize'.format(normalized_X)) + if normalized_Y not in valid_normalized_values: + raise ValueError('{} is not a valid value of normalize'.format(normalized_Y)) + is_corpus_X, X = is_corpus(X) is_corpus_Y, Y = is_corpus(Y) @@ -356,24 +557,19 @@ def inner_product(self, X, Y, normalized=False): Y = np.array([Y[i] if i in Y else 0 for i in word_indices], dtype=dtype) matrix = self.matrix[word_indices[:, None], word_indices].todense() + X = _normalize_dense_vector(X, matrix, normalized_X) + Y = _normalize_dense_vector(Y, matrix, normalized_Y) result = X.T.dot(matrix).dot(Y) - if normalized: - X_norm = X.T.dot(matrix).dot(X)[0, 0] - Y_norm = Y.T.dot(matrix).dot(Y)[0, 0] - - assert \ - X_norm > 0.0 and Y_norm > 0.0, \ - u"sparse documents must not contain any explicit zero entries and the similarity matrix S " \ - u"must satisfy x^T * S * x > 0 for any nonzero bag-of-words vector x." 
- - result /= sqrt(X_norm) * sqrt(Y_norm) + if normalized_X is True and normalized_Y is True: result = np.clip(result, -1.0, 1.0) return result[0, 0] elif not is_corpus_X or not is_corpus_Y: if is_corpus_X and not is_corpus_Y: - is_corpus_X, X, is_corpus_Y, Y = is_corpus_Y, Y, is_corpus_X, X # make Y the corpus + X, Y = Y, X # make Y the corpus + is_corpus_X, is_corpus_Y = is_corpus_Y, is_corpus_X + normalized_X, normalized_Y = normalized_Y, normalized_X transposed = True else: transposed = False @@ -387,23 +583,12 @@ def inner_product(self, X, Y, normalized=False): X = np.array([X[i] if i in X else 0 for i in word_indices], dtype=dtype) Y = corpus2csc(Y, num_terms=self.matrix.shape[0], dtype=dtype)[word_indices, :].todense() matrix = self.matrix[word_indices[:, None], word_indices].todense() - if normalized: - # use the following equality: np.diag(A.T.dot(B).dot(A)) == A.T.dot(B).multiply(A.T).sum(axis=1).T - X_norm = np.multiply(X.T.dot(matrix), X.T).sum(axis=1).T - Y_norm = np.multiply(Y.T.dot(matrix), Y.T).sum(axis=1).T - - assert \ - X_norm.min() > 0.0 and Y_norm.min() >= 0.0, \ - u"sparse documents must not contain any explicit zero entries and the similarity matrix S " \ - u"must satisfy x^T * S * x > 0 for any nonzero bag-of-words vector x." 
- - X = np.multiply(X, 1 / np.sqrt(X_norm)).T - Y = np.multiply(Y, 1 / np.sqrt(Y_norm)) - Y = np.nan_to_num(Y) # Account for division by zero when Y_norm.min() == 0.0 - result = X.T.dot(matrix).dot(Y) + X = _normalize_dense_vector(X, matrix, normalized_X) + Y = _normalize_dense_corpus(Y, matrix, normalized_Y) + result = X.dot(matrix).dot(Y) - if normalized: + if normalized_X is True and normalized_Y is True: result = np.clip(result, -1.0, 1.0) if transposed: @@ -416,23 +601,11 @@ def inner_product(self, X, Y, normalized=False): Y = corpus2csc(Y if is_corpus_Y else [Y], num_terms=self.matrix.shape[0], dtype=dtype) matrix = self.matrix - if normalized: - # use the following equality: np.diag(A.T.dot(B).dot(A)) == A.T.dot(B).multiply(A.T).sum(axis=1).T - X_norm = X.T.dot(matrix).multiply(X.T).sum(axis=1).T - Y_norm = Y.T.dot(matrix).multiply(Y.T).sum(axis=1).T - - assert \ - X_norm.min() > 0.0 and Y_norm.min() >= 0.0, \ - u"sparse documents must not contain any explicit zero entries and the similarity matrix S " \ - u"must satisfy x^T * S * x > 0 for any nonzero bag-of-words vector x." 
- - X = X.multiply(sparse.csr_matrix(1 / np.sqrt(X_norm))) - Y = Y.multiply(sparse.csr_matrix(1 / np.sqrt(Y_norm))) - Y[Y == np.inf] = 0 # Account for division by zero when Y_norm.min() == 0.0 - + X = _normalize_sparse_corpus(X, matrix, normalized_X) + Y = _normalize_sparse_corpus(Y, matrix, normalized_Y) result = X.T.dot(matrix).dot(Y) - if normalized: + if normalized_X is True and normalized_Y is True: result.data = np.clip(result.data, -1.0, 1.0) return result diff --git a/gensim/test/test_similarities.py b/gensim/test/test_similarities.py index fbd8f53ade..6a898f1a67 100644 --- a/gensim/test/test_similarities.py +++ b/gensim/test/test_similarities.py @@ -876,7 +876,16 @@ def setUp(self): [u"government", u"denied", u"holiday", u"slowing", u"hollingworth"]] self.dictionary = Dictionary(self.documents) self.tfidf = TfidfModel(dictionary=self.dictionary) + zero_index = UniformTermSimilarityIndex(self.dictionary, term_similarity=0.0) self.index = UniformTermSimilarityIndex(self.dictionary, term_similarity=0.5) + self.identity_matrix = SparseTermSimilarityMatrix(zero_index, self.dictionary) + self.uniform_matrix = SparseTermSimilarityMatrix(self.index, self.dictionary) + self.vec1 = self.dictionary.doc2bow([u"government", u"government", u"denied"]) + self.vec2 = self.dictionary.doc2bow([u"government", u"holiday"]) + + def test_empty_dictionary(self): + with self.assertRaises(ValueError): + SparseTermSimilarityMatrix(self.index, []) def test_type(self): """Test the type of the produced matrix.""" @@ -942,6 +951,29 @@ def test_symmetric(self): [0.0, 0.0, 0.0, 0.0, 1.0]]) self.assertTrue(numpy.all(expected_matrix == matrix)) + def test_dominant(self): + """Test the dominant parameter of the matrix constructor.""" + negative_index = UniformTermSimilarityIndex(self.dictionary, term_similarity=-0.5) + matrix = SparseTermSimilarityMatrix( + negative_index, self.dictionary, nonzero_limit=2).matrix.todense() + expected_matrix = numpy.array([ + [1.0, -.5, -.5, 0.0, 0.0], + 
[-.5, 1.0, 0.0, -.5, 0.0], + [-.5, 0.0, 1.0, 0.0, 0.0], + [0.0, -.5, 0.0, 1.0, 0.0], + [0.0, 0.0, 0.0, 0.0, 1.0]]) + self.assertTrue(numpy.all(expected_matrix == matrix)) + + matrix = SparseTermSimilarityMatrix( + negative_index, self.dictionary, nonzero_limit=2, dominant=True).matrix.todense() + expected_matrix = numpy.array([ + [1.0, -.5, 0.0, 0.0, 0.0], + [-.5, 1.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 1.0, 0.0, 0.0], + [0.0, 0.0, 0.0, 1.0, 0.0], + [0.0, 0.0, 0.0, 0.0, 1.0]]) + self.assertTrue(numpy.all(expected_matrix == matrix)) + def test_positive_definite(self): """Test the positive_definite parameter of the matrix constructor.""" negative_index = UniformTermSimilarityIndex(self.dictionary, term_similarity=-0.5) @@ -1004,109 +1036,533 @@ def test_encapsulation(self): self.assertTrue(isinstance(matrix, scipy.sparse.csc_matrix)) self.assertTrue(numpy.all(matrix.todense() == expected_matrix)) - def test_inner_product(self): - """Test the inner product.""" + def test_inner_product_zerovector_zerovector_default(self): + """Test the inner product between two zero vectors with the default normalization.""" - matrix = SparseTermSimilarityMatrix( - UniformTermSimilarityIndex(self.dictionary, term_similarity=0.5), self.dictionary) + self.assertEqual(0.0, self.uniform_matrix.inner_product([], [])) + + def test_inner_product_zerovector_zerovector_false_maintain(self): + """Test the inner product between two zero vectors with the (False, 'maintain') normalization.""" + + self.assertEqual(0.0, self.uniform_matrix.inner_product([], [], normalized=(False, 'maintain'))) + + def test_inner_product_zerovector_zerovector_false_true(self): + """Test the inner product between two zero vectors with the (False, True) normalization.""" + + self.assertEqual(0.0, self.uniform_matrix.inner_product([], [], normalized=(False, True))) + + def test_inner_product_zerovector_zerovector_maintain_false(self): + """Test the inner product between two zero vectors with the ('maintain', False) 
normalization.""" + + self.assertEqual(0.0, self.uniform_matrix.inner_product([], [], normalized=('maintain', False))) + + def test_inner_product_zerovector_zerovector_maintain_maintain(self): + """Test the inner product between two zero vectors with the ('maintain', 'maintain') normalization.""" + + self.assertEqual(0.0, self.uniform_matrix.inner_product([], [], normalized=('maintain', 'maintain'))) + + def test_inner_product_zerovector_zerovector_maintain_true(self): + """Test the inner product between two zero vectors with the ('maintain', True) normalization.""" + + self.assertEqual(0.0, self.uniform_matrix.inner_product([], [], normalized=('maintain', True))) + + def test_inner_product_zerovector_zerovector_true_false(self): + """Test the inner product between two zero vectors with the (True, False) normalization.""" + + self.assertEqual(0.0, self.uniform_matrix.inner_product([], [], normalized=(True, False))) + + def test_inner_product_zerovector_zerovector_true_maintain(self): + """Test the inner product between two zero vectors with the (True, 'maintain') normalization.""" + + self.assertEqual(0.0, self.uniform_matrix.inner_product([], [], normalized=(True, 'maintain'))) + + def test_inner_product_zerovector_zerovector_true_true(self): + """Test the inner product between two zero vectors with the (True, True) normalization.""" + + self.assertEqual(0.0, self.uniform_matrix.inner_product([], [], normalized=(True, True))) + + def test_inner_product_zerovector_vector_default(self): + """Test the inner product between a zero vector and a vector with the default normalization.""" + + self.assertEqual(0.0, self.uniform_matrix.inner_product([], self.vec2)) + + def test_inner_product_zerovector_vector_false_maintain(self): + """Test the inner product between a zero vector and a vector with the (False, 'maintain') normalization.""" + + self.assertEqual(0.0, self.uniform_matrix.inner_product([], self.vec2, normalized=(False, 'maintain'))) + + def 
test_inner_product_zerovector_vector_false_true(self): + """Test the inner product between a zero vector and a vector with the (False, True) normalization.""" + + self.assertEqual(0.0, self.uniform_matrix.inner_product([], self.vec2, normalized=(False, True))) + + def test_inner_product_zerovector_vector_maintain_false(self): + """Test the inner product between a zero vector and a vector with the ('maintain', False) normalization.""" + + self.assertEqual(0.0, self.uniform_matrix.inner_product([], self.vec2, normalized=('maintain', False))) + + def test_inner_product_zerovector_vector_maintain_maintain(self): + """Test the inner product between a zero vector and a vector with the ('maintain', 'maintain') normalization.""" + + self.assertEqual(0.0, self.uniform_matrix.inner_product([], self.vec2, normalized=('maintain', 'maintain'))) + + def test_inner_product_zerovector_vector_maintain_true(self): + """Test the inner product between a zero vector and a vector with the ('maintain', True) normalization.""" + + self.assertEqual(0.0, self.uniform_matrix.inner_product([], self.vec2, normalized=('maintain', True))) + + def test_inner_product_zerovector_vector_true_false(self): + """Test the inner product between a zero vector and a vector with the (True, False) normalization.""" + + self.assertEqual(0.0, self.uniform_matrix.inner_product([], self.vec2, normalized=(True, False))) - # check zero vectors work as expected - vec1 = self.dictionary.doc2bow([u"government", u"government", u"denied"]) - vec2 = self.dictionary.doc2bow([u"government", u"holiday"]) + def test_inner_product_zerovector_vector_true_maintain(self): + """Test the inner product between a zero vector and a vector with the (True, 'maintain') normalization.""" - self.assertEqual(0.0, matrix.inner_product([], vec2)) - self.assertEqual(0.0, matrix.inner_product(vec1, [])) - self.assertEqual(0.0, matrix.inner_product([], [])) + self.assertEqual(0.0, self.uniform_matrix.inner_product([], self.vec2, 
normalized=(True, 'maintain'))) - self.assertEqual(0.0, matrix.inner_product([], vec2, normalized=True)) - self.assertEqual(0.0, matrix.inner_product(vec1, [], normalized=True)) - self.assertEqual(0.0, matrix.inner_product([], [], normalized=True)) + def test_inner_product_zerovector_vector_true_true(self): + """Test the inner product between a zero vector and a vector with the (True, True) normalization.""" + + self.assertEqual(0.0, self.uniform_matrix.inner_product([], self.vec2, normalized=(True, True))) + + def test_inner_product_vector_zerovector_default(self): + """Test the inner product between a vector and a zero vector with the default normalization.""" + + self.assertEqual(0.0, self.uniform_matrix.inner_product(self.vec1, [])) + + def test_inner_product_vector_zerovector_false_maintain(self): + """Test the inner product between a vector and a zero vector with the (False, 'maintain') normalization.""" + + self.assertEqual(0.0, self.uniform_matrix.inner_product(self.vec1, [], normalized=(False, 'maintain'))) + + def test_inner_product_vector_zerovector_false_true(self): + """Test the inner product between a vector and a zero vector with the (False, True) normalization.""" + + self.assertEqual(0.0, self.uniform_matrix.inner_product(self.vec1, [], normalized=(False, True))) + + def test_inner_product_vector_zerovector_maintain_false(self): + """Test the inner product between a vector and a zero vector with the ('maintain', False) normalization.""" + + self.assertEqual(0.0, self.uniform_matrix.inner_product(self.vec1, [], normalized=('maintain', False))) + + def test_inner_product_vector_zerovector_maintain_maintain(self): + """Test the inner product between a vector and a zero vector with the ('maintain', 'maintain') normalization.""" + + self.assertEqual(0.0, self.uniform_matrix.inner_product(self.vec1, [], normalized=('maintain', 'maintain'))) + + def test_inner_product_vector_zerovector_maintain_true(self): + """Test the inner product between a vector and 
a zero vector with the ('maintain', True) normalization.""" + + self.assertEqual(0.0, self.uniform_matrix.inner_product(self.vec1, [], normalized=('maintain', True))) + + def test_inner_product_vector_zerovector_true_false(self): + """Test the inner product between a vector and a zero vector with the (True, False) normalization.""" + + self.assertEqual(0.0, self.uniform_matrix.inner_product(self.vec1, [], normalized=(True, False))) + + def test_inner_product_vector_zerovector_true_maintain(self): + """Test the inner product between a vector and a zero vector with the (True, 'maintain') normalization.""" + + self.assertEqual(0.0, self.uniform_matrix.inner_product(self.vec1, [], normalized=(True, 'maintain'))) + + def test_inner_product_vector_zerovector_true_true(self): + """Test the inner product between a vector and a zero vector with the (True, True) normalization.""" + + self.assertEqual(0.0, self.uniform_matrix.inner_product(self.vec1, [], normalized=(True, True))) + + def test_inner_product_vector_vector_default(self): + """Test the inner product between two vectors with the default normalization.""" - # check that real-world vectors work as expected - vec1 = self.dictionary.doc2bow([u"government", u"government", u"denied"]) - vec2 = self.dictionary.doc2bow([u"government", u"holiday"]) expected_result = 0.0 expected_result += 2 * 1.0 * 1 # government * s_{ij} * government expected_result += 2 * 0.5 * 1 # government * s_{ij} * holiday expected_result += 1 * 0.5 * 1 # denied * s_{ij} * government expected_result += 1 * 0.5 * 1 # denied * s_{ij} * holiday - result = matrix.inner_product(vec1, vec2) + result = self.uniform_matrix.inner_product(self.vec1, self.vec2) self.assertAlmostEqual(expected_result, result, places=5) - vec1 = self.dictionary.doc2bow([u"government", u"government", u"denied"]) - vec2 = self.dictionary.doc2bow([u"government", u"holiday"]) - expected_result = matrix.inner_product(vec1, vec2) - expected_result /= 
math.sqrt(matrix.inner_product(vec1, vec1)) - expected_result /= math.sqrt(matrix.inner_product(vec2, vec2)) - result = matrix.inner_product(vec1, vec2, normalized=True) + def test_inner_product_vector_vector_false_maintain(self): + """Test the inner product between two vectors with the (False, 'maintain') normalization.""" + + expected_result = self.uniform_matrix.inner_product(self.vec1, self.vec2) + expected_result /= math.sqrt(self.uniform_matrix.inner_product(self.vec2, self.vec2)) + expected_result *= math.sqrt(self.identity_matrix.inner_product(self.vec2, self.vec2)) + result = self.uniform_matrix.inner_product(self.vec1, self.vec2, normalized=(False, 'maintain')) + self.assertAlmostEqual(expected_result, result, places=5) + + def test_inner_product_vector_vector_false_true(self): + """Test the inner product between two vectors with the (False, True) normalization.""" + + expected_result = self.uniform_matrix.inner_product(self.vec1, self.vec2) + expected_result /= math.sqrt(self.uniform_matrix.inner_product(self.vec2, self.vec2)) + result = self.uniform_matrix.inner_product(self.vec1, self.vec2, normalized=(False, True)) self.assertAlmostEqual(expected_result, result, places=5) - # check that real-world (vector, corpus) pairs work as expected - vec1 = self.dictionary.doc2bow([u"government", u"government", u"denied"]) - vec2 = self.dictionary.doc2bow([u"government", u"holiday"]) + def test_inner_product_vector_vector_maintain_false(self): + """Test the inner product between two vectors with the ('maintain', False) normalization.""" + + expected_result = self.uniform_matrix.inner_product(self.vec1, self.vec2) + expected_result /= math.sqrt(self.uniform_matrix.inner_product(self.vec1, self.vec1)) + expected_result *= math.sqrt(self.identity_matrix.inner_product(self.vec1, self.vec1)) + result = self.uniform_matrix.inner_product(self.vec1, self.vec2, normalized=('maintain', False)) + self.assertAlmostEqual(expected_result, result, places=5) + + def 
test_inner_product_vector_vector_maintain_maintain(self): + """Test the inner product between two vectors with the ('maintain', 'maintain') normalization.""" + + expected_result = self.uniform_matrix.inner_product(self.vec1, self.vec2) + expected_result /= math.sqrt(self.uniform_matrix.inner_product(self.vec1, self.vec1)) + expected_result *= math.sqrt(self.identity_matrix.inner_product(self.vec1, self.vec1)) + expected_result /= math.sqrt(self.uniform_matrix.inner_product(self.vec2, self.vec2)) + expected_result *= math.sqrt(self.identity_matrix.inner_product(self.vec2, self.vec2)) + result = self.uniform_matrix.inner_product(self.vec1, self.vec2, normalized=('maintain', 'maintain')) + self.assertAlmostEqual(expected_result, result, places=5) + + def test_inner_product_vector_vector_maintain_true(self): + """Test the inner product between two vectors with the ('maintain', True) normalization.""" + + expected_result = self.uniform_matrix.inner_product(self.vec1, self.vec2) + expected_result /= math.sqrt(self.uniform_matrix.inner_product(self.vec1, self.vec1)) + expected_result *= math.sqrt(self.identity_matrix.inner_product(self.vec1, self.vec1)) + expected_result /= math.sqrt(self.uniform_matrix.inner_product(self.vec2, self.vec2)) + result = self.uniform_matrix.inner_product(self.vec1, self.vec2, normalized=('maintain', True)) + self.assertAlmostEqual(expected_result, result, places=5) + + def test_inner_product_vector_vector_true_false(self): + """Test the inner product between two vectors with the (True, False) normalization.""" + + expected_result = self.uniform_matrix.inner_product(self.vec1, self.vec2) + expected_result /= math.sqrt(self.uniform_matrix.inner_product(self.vec1, self.vec1)) + result = self.uniform_matrix.inner_product(self.vec1, self.vec2, normalized=(True, False)) + self.assertAlmostEqual(expected_result, result, places=5) + + def test_inner_product_vector_vector_true_maintain(self): + """Test the inner product between two vectors with the 
(True, 'maintain') normalization.""" + + expected_result = self.uniform_matrix.inner_product(self.vec1, self.vec2) + expected_result /= math.sqrt(self.uniform_matrix.inner_product(self.vec1, self.vec1)) + expected_result /= math.sqrt(self.uniform_matrix.inner_product(self.vec2, self.vec2)) + expected_result *= math.sqrt(self.identity_matrix.inner_product(self.vec2, self.vec2)) + result = self.uniform_matrix.inner_product(self.vec1, self.vec2, normalized=(True, 'maintain')) + self.assertAlmostEqual(expected_result, result, places=5) + + def test_inner_product_vector_vector_true_true(self): + """Test the inner product between two vectors with the (True, True) normalization.""" + + expected_result = self.uniform_matrix.inner_product(self.vec1, self.vec2) + expected_result /= math.sqrt(self.uniform_matrix.inner_product(self.vec1, self.vec1)) + expected_result /= math.sqrt(self.uniform_matrix.inner_product(self.vec2, self.vec2)) + result = self.uniform_matrix.inner_product(self.vec1, self.vec2, normalized=(True, True)) + self.assertAlmostEqual(expected_result, result, places=5) + + def test_inner_product_vector_corpus_default(self): + """Test the inner product between a vector and a corpus with the default normalization.""" + expected_result = 0.0 expected_result += 2 * 1.0 * 1 # government * s_{ij} * government expected_result += 2 * 0.5 * 1 # government * s_{ij} * holiday expected_result += 1 * 0.5 * 1 # denied * s_{ij} * government expected_result += 1 * 0.5 * 1 # denied * s_{ij} * holiday expected_result = numpy.full((1, 2), expected_result) - result = matrix.inner_product(vec1, [vec2] * 2) + result = self.uniform_matrix.inner_product(self.vec1, [self.vec2] * 2) + self.assertTrue(isinstance(result, numpy.ndarray)) + self.assertTrue(numpy.allclose(expected_result, result)) + + def test_inner_product_vector_corpus_false_maintain(self): + """Test the inner product between a vector and a corpus with the (False, 'maintain') normalization.""" + + expected_result = 
self.uniform_matrix.inner_product(self.vec1, self.vec2) + expected_result /= math.sqrt(self.uniform_matrix.inner_product(self.vec2, self.vec2)) + expected_result *= math.sqrt(self.identity_matrix.inner_product(self.vec2, self.vec2)) + expected_result = numpy.full((1, 2), expected_result) + result = self.uniform_matrix.inner_product(self.vec1, [self.vec2] * 2, normalized=(False, 'maintain')) self.assertTrue(isinstance(result, numpy.ndarray)) self.assertTrue(numpy.allclose(expected_result, result)) - vec1 = self.dictionary.doc2bow([u"government", u"government", u"denied"]) - vec2 = self.dictionary.doc2bow([u"government", u"holiday"]) - expected_result = matrix.inner_product(vec1, vec2) - expected_result /= math.sqrt(matrix.inner_product(vec1, vec1)) - expected_result /= math.sqrt(matrix.inner_product(vec2, vec2)) + def test_inner_product_vector_corpus_false_true(self): + """Test the inner product between a vector and a corpus with the (False, True) normalization.""" + + expected_result = self.uniform_matrix.inner_product(self.vec1, self.vec2) + expected_result /= math.sqrt(self.uniform_matrix.inner_product(self.vec2, self.vec2)) expected_result = numpy.full((1, 2), expected_result) - result = matrix.inner_product(vec1, [vec2] * 2, normalized=True) + result = self.uniform_matrix.inner_product(self.vec1, [self.vec2] * 2, normalized=(False, True)) self.assertTrue(isinstance(result, numpy.ndarray)) self.assertTrue(numpy.allclose(expected_result, result)) - # check that real-world (corpus, vector) pairs work as expected - vec1 = self.dictionary.doc2bow([u"government", u"government", u"denied"]) - vec2 = self.dictionary.doc2bow([u"government", u"holiday"]) + def test_inner_product_vector_corpus_maintain_false(self): + """Test the inner product between a vector and a corpus with the ('maintain', False) normalization.""" + + expected_result = self.uniform_matrix.inner_product(self.vec1, self.vec2) + expected_result /= math.sqrt(self.uniform_matrix.inner_product(self.vec1, 
self.vec1)) + expected_result *= math.sqrt(self.identity_matrix.inner_product(self.vec1, self.vec1)) + expected_result = numpy.full((1, 2), expected_result) + result = self.uniform_matrix.inner_product(self.vec1, [self.vec2] * 2, normalized=('maintain', False)) + self.assertTrue(isinstance(result, numpy.ndarray)) + self.assertTrue(numpy.allclose(expected_result, result)) + + def test_inner_product_vector_corpus_maintain_maintain(self): + """Test the inner product between a vector and a corpus with the ('maintain', 'maintain') normalization.""" + + expected_result = self.uniform_matrix.inner_product(self.vec1, self.vec2) + expected_result /= math.sqrt(self.uniform_matrix.inner_product(self.vec1, self.vec1)) + expected_result *= math.sqrt(self.identity_matrix.inner_product(self.vec1, self.vec1)) + expected_result /= math.sqrt(self.uniform_matrix.inner_product(self.vec2, self.vec2)) + expected_result *= math.sqrt(self.identity_matrix.inner_product(self.vec2, self.vec2)) + expected_result = numpy.full((1, 2), expected_result) + result = self.uniform_matrix.inner_product(self.vec1, [self.vec2] * 2, normalized=('maintain', 'maintain')) + self.assertTrue(isinstance(result, numpy.ndarray)) + self.assertTrue(numpy.allclose(expected_result, result)) + + def test_inner_product_vector_corpus_maintain_true(self): + """Test the inner product between a vector and a corpus with the ('maintain', True) normalization.""" + + expected_result = self.uniform_matrix.inner_product(self.vec1, self.vec2) + expected_result /= math.sqrt(self.uniform_matrix.inner_product(self.vec1, self.vec1)) + expected_result *= math.sqrt(self.identity_matrix.inner_product(self.vec1, self.vec1)) + expected_result /= math.sqrt(self.uniform_matrix.inner_product(self.vec2, self.vec2)) + expected_result = numpy.full((1, 2), expected_result) + result = self.uniform_matrix.inner_product(self.vec1, [self.vec2] * 2, normalized=('maintain', True)) + self.assertTrue(isinstance(result, numpy.ndarray)) + 
self.assertTrue(numpy.allclose(expected_result, result)) + + def test_inner_product_vector_corpus_true_false(self): + """Test the inner product between a vector and a corpus with the (True, False) normalization.""" + + expected_result = self.uniform_matrix.inner_product(self.vec1, self.vec2) + expected_result /= math.sqrt(self.uniform_matrix.inner_product(self.vec1, self.vec1)) + expected_result = numpy.full((1, 2), expected_result) + result = self.uniform_matrix.inner_product(self.vec1, [self.vec2] * 2, normalized=(True, False)) + self.assertTrue(isinstance(result, numpy.ndarray)) + self.assertTrue(numpy.allclose(expected_result, result)) + + def test_inner_product_vector_corpus_true_maintain(self): + """Test the inner product between a vector and a corpus with the (True, 'maintain') normalization.""" + + expected_result = self.uniform_matrix.inner_product(self.vec1, self.vec2) + expected_result /= math.sqrt(self.uniform_matrix.inner_product(self.vec1, self.vec1)) + expected_result /= math.sqrt(self.uniform_matrix.inner_product(self.vec2, self.vec2)) + expected_result *= math.sqrt(self.identity_matrix.inner_product(self.vec2, self.vec2)) + expected_result = numpy.full((1, 2), expected_result) + result = self.uniform_matrix.inner_product(self.vec1, [self.vec2] * 2, normalized=(True, 'maintain')) + self.assertTrue(isinstance(result, numpy.ndarray)) + self.assertTrue(numpy.allclose(expected_result, result)) + + def test_inner_product_vector_corpus_true_true(self): + """Test the inner product between a vector and a corpus with the (True, True) normalization.""" + + expected_result = self.uniform_matrix.inner_product(self.vec1, self.vec2) + expected_result /= math.sqrt(self.uniform_matrix.inner_product(self.vec1, self.vec1)) + expected_result /= math.sqrt(self.uniform_matrix.inner_product(self.vec2, self.vec2)) + expected_result = numpy.full((1, 2), expected_result) + result = self.uniform_matrix.inner_product(self.vec1, [self.vec2] * 2, normalized=(True, True)) + 
self.assertTrue(isinstance(result, numpy.ndarray)) + self.assertTrue(numpy.allclose(expected_result, result)) + + def test_inner_product_corpus_vector_default(self): + """Test the inner product between a corpus and a vector with the default normalization.""" + expected_result = 0.0 expected_result += 2 * 1.0 * 1 # government * s_{ij} * government expected_result += 2 * 0.5 * 1 # government * s_{ij} * holiday expected_result += 1 * 0.5 * 1 # denied * s_{ij} * government expected_result += 1 * 0.5 * 1 # denied * s_{ij} * holiday expected_result = numpy.full((3, 1), expected_result) - result = matrix.inner_product([vec1] * 3, vec2) + result = self.uniform_matrix.inner_product([self.vec1] * 3, self.vec2) + self.assertTrue(isinstance(result, numpy.ndarray)) + self.assertTrue(numpy.allclose(expected_result, result)) + + def test_inner_product_corpus_vector_false_maintain(self): + """Test the inner product between a corpus and a vector with the (False, 'maintain') normalization.""" + + expected_result = self.uniform_matrix.inner_product(self.vec1, self.vec2) + expected_result /= math.sqrt(self.uniform_matrix.inner_product(self.vec2, self.vec2)) + expected_result *= math.sqrt(self.identity_matrix.inner_product(self.vec2, self.vec2)) + expected_result = numpy.full((3, 1), expected_result) + result = self.uniform_matrix.inner_product([self.vec1] * 3, self.vec2, normalized=(False, 'maintain')) + self.assertTrue(isinstance(result, numpy.ndarray)) + self.assertTrue(numpy.allclose(expected_result, result)) + + def test_inner_product_corpus_vector_false_true(self): + """Test the inner product between a corpus and a vector with the (False, True) normalization.""" + + expected_result = self.uniform_matrix.inner_product(self.vec1, self.vec2) + expected_result /= math.sqrt(self.uniform_matrix.inner_product(self.vec2, self.vec2)) + expected_result = numpy.full((3, 1), expected_result) + result = self.uniform_matrix.inner_product([self.vec1] * 3, self.vec2, normalized=(False, True)) + 
self.assertTrue(isinstance(result, numpy.ndarray)) + self.assertTrue(numpy.allclose(expected_result, result)) + + def test_inner_product_corpus_vector_maintain_false(self): + """Test the inner product between a corpus and a vector with the ('maintain', False) normalization.""" + + expected_result = self.uniform_matrix.inner_product(self.vec1, self.vec2) + expected_result /= math.sqrt(self.uniform_matrix.inner_product(self.vec1, self.vec1)) + expected_result *= math.sqrt(self.identity_matrix.inner_product(self.vec1, self.vec1)) + expected_result = numpy.full((3, 1), expected_result) + result = self.uniform_matrix.inner_product([self.vec1] * 3, self.vec2, normalized=('maintain', False)) + self.assertTrue(isinstance(result, numpy.ndarray)) + self.assertTrue(numpy.allclose(expected_result, result)) + + def test_inner_product_corpus_vector_maintain_maintain(self): + """Test the inner product between a corpus and a vector with the ('maintain', 'maintain') normalization.""" + + expected_result = self.uniform_matrix.inner_product(self.vec1, self.vec2) + expected_result /= math.sqrt(self.uniform_matrix.inner_product(self.vec1, self.vec1)) + expected_result *= math.sqrt(self.identity_matrix.inner_product(self.vec1, self.vec1)) + expected_result /= math.sqrt(self.uniform_matrix.inner_product(self.vec2, self.vec2)) + expected_result *= math.sqrt(self.identity_matrix.inner_product(self.vec2, self.vec2)) + expected_result = numpy.full((3, 1), expected_result) + result = self.uniform_matrix.inner_product([self.vec1] * 3, self.vec2, normalized=('maintain', 'maintain')) + self.assertTrue(isinstance(result, numpy.ndarray)) + self.assertTrue(numpy.allclose(expected_result, result)) + + def test_inner_product_corpus_vector_maintain_true(self): + """Test the inner product between a corpus and a vector with the ('maintain', True) normalization.""" + + expected_result = self.uniform_matrix.inner_product(self.vec1, self.vec2) + expected_result /= 
math.sqrt(self.uniform_matrix.inner_product(self.vec1, self.vec1)) + expected_result *= math.sqrt(self.identity_matrix.inner_product(self.vec1, self.vec1)) + expected_result /= math.sqrt(self.uniform_matrix.inner_product(self.vec2, self.vec2)) + expected_result = numpy.full((3, 1), expected_result) + result = self.uniform_matrix.inner_product([self.vec1] * 3, self.vec2, normalized=('maintain', True)) + self.assertTrue(isinstance(result, numpy.ndarray)) + self.assertTrue(numpy.allclose(expected_result, result)) + + def test_inner_product_corpus_vector_true_false(self): + """Test the inner product between a corpus and a vector with the (True, False) normalization.""" + + expected_result = self.uniform_matrix.inner_product(self.vec1, self.vec2) + expected_result /= math.sqrt(self.uniform_matrix.inner_product(self.vec1, self.vec1)) + expected_result = numpy.full((3, 1), expected_result) + result = self.uniform_matrix.inner_product([self.vec1] * 3, self.vec2, normalized=(True, False)) + self.assertTrue(isinstance(result, numpy.ndarray)) + self.assertTrue(numpy.allclose(expected_result, result)) + + def test_inner_product_corpus_vector_true_maintain(self): + """Test the inner product between a corpus and a vector with the (True, 'maintain') normalization.""" + + expected_result = self.uniform_matrix.inner_product(self.vec1, self.vec2) + expected_result /= math.sqrt(self.uniform_matrix.inner_product(self.vec1, self.vec1)) + expected_result /= math.sqrt(self.uniform_matrix.inner_product(self.vec2, self.vec2)) + expected_result *= math.sqrt(self.identity_matrix.inner_product(self.vec2, self.vec2)) + expected_result = numpy.full((3, 1), expected_result) + result = self.uniform_matrix.inner_product([self.vec1] * 3, self.vec2, normalized=(True, 'maintain')) self.assertTrue(isinstance(result, numpy.ndarray)) self.assertTrue(numpy.allclose(expected_result, result)) - vec1 = self.dictionary.doc2bow([u"government", u"government", u"denied"]) - vec2 = 
self.dictionary.doc2bow([u"government", u"holiday"]) - expected_result = matrix.inner_product(vec1, vec2) - expected_result /= math.sqrt(matrix.inner_product(vec1, vec1)) - expected_result /= math.sqrt(matrix.inner_product(vec2, vec2)) + def test_inner_product_corpus_vector_true_true(self): + """Test the inner product between a corpus and a vector with the (True, True) normalization.""" + + expected_result = self.uniform_matrix.inner_product(self.vec1, self.vec2) + expected_result /= math.sqrt(self.uniform_matrix.inner_product(self.vec1, self.vec1)) + expected_result /= math.sqrt(self.uniform_matrix.inner_product(self.vec2, self.vec2)) expected_result = numpy.full((3, 1), expected_result) - result = matrix.inner_product([vec1] * 3, vec2, normalized=True) + result = self.uniform_matrix.inner_product([self.vec1] * 3, self.vec2, normalized=(True, True)) self.assertTrue(isinstance(result, numpy.ndarray)) self.assertTrue(numpy.allclose(expected_result, result)) - # check that real-world corpora work as expected - vec1 = self.dictionary.doc2bow([u"government", u"government", u"denied"]) - vec2 = self.dictionary.doc2bow([u"government", u"holiday"]) + def test_inner_product_corpus_corpus_default(self): + """Test the inner product between two corpora with the default normalization.""" + expected_result = 0.0 expected_result += 2 * 1.0 * 1 # government * s_{ij} * government expected_result += 2 * 0.5 * 1 # government * s_{ij} * holiday expected_result += 1 * 0.5 * 1 # denied * s_{ij} * government expected_result += 1 * 0.5 * 1 # denied * s_{ij} * holiday expected_result = numpy.full((3, 2), expected_result) - result = matrix.inner_product([vec1] * 3, [vec2] * 2) + result = self.uniform_matrix.inner_product([self.vec1] * 3, [self.vec2] * 2) + self.assertTrue(isinstance(result, scipy.sparse.csr_matrix)) + self.assertTrue(numpy.allclose(expected_result, result.todense())) + + def test_inner_product_corpus_corpus_false_maintain(self): + """Test the inner product between two 
corpora with the (False, 'maintain') normalization.""" + + expected_result = self.uniform_matrix.inner_product(self.vec1, self.vec2) + expected_result /= math.sqrt(self.uniform_matrix.inner_product(self.vec2, self.vec2)) + expected_result *= math.sqrt(self.identity_matrix.inner_product(self.vec2, self.vec2)) + expected_result = numpy.full((3, 2), expected_result) + result = self.uniform_matrix.inner_product([self.vec1] * 3, [self.vec2] * 2, normalized=(False, 'maintain')) self.assertTrue(isinstance(result, scipy.sparse.csr_matrix)) self.assertTrue(numpy.allclose(expected_result, result.todense())) - vec1 = self.dictionary.doc2bow([u"government", u"government", u"denied"]) - vec2 = self.dictionary.doc2bow([u"government", u"holiday"]) - expected_result = matrix.inner_product(vec1, vec2) - expected_result /= math.sqrt(matrix.inner_product(vec1, vec1)) - expected_result /= math.sqrt(matrix.inner_product(vec2, vec2)) + def test_inner_product_corpus_corpus_false_true(self): + """Test the inner product between two corpora with the (False, True) normalization.""" + + expected_result = self.uniform_matrix.inner_product(self.vec1, self.vec2) + expected_result /= math.sqrt(self.uniform_matrix.inner_product(self.vec2, self.vec2)) + expected_result = numpy.full((3, 2), expected_result) + result = self.uniform_matrix.inner_product([self.vec1] * 3, [self.vec2] * 2, normalized=(False, True)) + self.assertTrue(isinstance(result, scipy.sparse.csr_matrix)) + self.assertTrue(numpy.allclose(expected_result, result.todense())) + + def test_inner_product_corpus_corpus_maintain_false(self): + """Test the inner product between two corpora with the ('maintain', False) normalization.""" + + expected_result = self.uniform_matrix.inner_product(self.vec1, self.vec2) + expected_result /= math.sqrt(self.uniform_matrix.inner_product(self.vec1, self.vec1)) + expected_result *= math.sqrt(self.identity_matrix.inner_product(self.vec1, self.vec1)) + expected_result = numpy.full((3, 2), expected_result) + result =
self.uniform_matrix.inner_product([self.vec1] * 3, [self.vec2] * 2, normalized=('maintain', False)) + self.assertTrue(isinstance(result, scipy.sparse.csr_matrix)) + self.assertTrue(numpy.allclose(expected_result, result.todense())) + + def test_inner_product_corpus_corpus_maintain_maintain(self): + """Test the inner product between two corpora with the ('maintain', 'maintain') normalization.""" + + expected_result = self.uniform_matrix.inner_product(self.vec1, self.vec2) + expected_result /= math.sqrt(self.uniform_matrix.inner_product(self.vec1, self.vec1)) + expected_result *= math.sqrt(self.identity_matrix.inner_product(self.vec1, self.vec1)) + expected_result /= math.sqrt(self.uniform_matrix.inner_product(self.vec2, self.vec2)) + expected_result *= math.sqrt(self.identity_matrix.inner_product(self.vec2, self.vec2)) + expected_result = numpy.full((3, 2), expected_result) + result = self.uniform_matrix.inner_product([self.vec1] * 3, [self.vec2] * 2, + normalized=('maintain', 'maintain')) + self.assertTrue(isinstance(result, scipy.sparse.csr_matrix)) + self.assertTrue(numpy.allclose(expected_result, result.todense())) + + def test_inner_product_corpus_corpus_maintain_true(self): + """Test the inner product between two corpora with the ('maintain', True) normalization.""" + + expected_result = self.uniform_matrix.inner_product(self.vec1, self.vec2) + expected_result /= math.sqrt(self.uniform_matrix.inner_product(self.vec1, self.vec1)) + expected_result *= math.sqrt(self.identity_matrix.inner_product(self.vec1, self.vec1)) + expected_result /= math.sqrt(self.uniform_matrix.inner_product(self.vec2, self.vec2)) + expected_result = numpy.full((3, 2), expected_result) + result = self.uniform_matrix.inner_product([self.vec1] * 3, [self.vec2] * 2, normalized=('maintain', True)) + self.assertTrue(isinstance(result, scipy.sparse.csr_matrix)) + self.assertTrue(numpy.allclose(expected_result, result.todense())) + + def test_inner_product_corpus_corpus_true_false(self): + """Test the inner product between
two corpora with the (True, False) normalization.""" + + expected_result = self.uniform_matrix.inner_product(self.vec1, self.vec2) + expected_result /= math.sqrt(self.uniform_matrix.inner_product(self.vec1, self.vec1)) + expected_result = numpy.full((3, 2), expected_result) + result = self.uniform_matrix.inner_product([self.vec1] * 3, [self.vec2] * 2, normalized=(True, False)) + self.assertTrue(isinstance(result, scipy.sparse.csr_matrix)) + self.assertTrue(numpy.allclose(expected_result, result.todense())) + + def test_inner_product_corpus_corpus_true_maintain(self): + """Test the inner product between two corpora with the (True, 'maintain') normalization.""" + + expected_result = self.uniform_matrix.inner_product(self.vec1, self.vec2) + expected_result /= math.sqrt(self.uniform_matrix.inner_product(self.vec1, self.vec1)) + expected_result /= math.sqrt(self.uniform_matrix.inner_product(self.vec2, self.vec2)) + expected_result *= math.sqrt(self.identity_matrix.inner_product(self.vec2, self.vec2)) + expected_result = numpy.full((3, 2), expected_result) + result = self.uniform_matrix.inner_product([self.vec1] * 3, [self.vec2] * 2, normalized=(True, 'maintain')) + self.assertTrue(isinstance(result, scipy.sparse.csr_matrix)) + self.assertTrue(numpy.allclose(expected_result, result.todense())) + + def test_inner_product_corpus_corpus_true_true(self): + """Test the inner product between two corpora with the (True, True) normalization.""" + + expected_result = self.uniform_matrix.inner_product(self.vec1, self.vec2) + expected_result /= math.sqrt(self.uniform_matrix.inner_product(self.vec1, self.vec1)) + expected_result /= math.sqrt(self.uniform_matrix.inner_product(self.vec2, self.vec2)) expected_result = numpy.full((3, 2), expected_result) - result = matrix.inner_product([vec1] * 3, [vec2] * 2, normalized=True) + result = self.uniform_matrix.inner_product([self.vec1] * 3, [self.vec2] * 2, normalized=(True, True)) self.assertTrue(isinstance(result, scipy.sparse.csr_matrix))
self.assertTrue(numpy.allclose(expected_result, result.todense()))