-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathrand_svd.py
67 lines (57 loc) · 2.13 KB
/
rand_svd.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
''' Randomised SVD for scaled M2
'''
import numpy as np
import scipy.linalg
from cumulants import moment1, prod_m2_x
def rand_svd(docs, alpha0, k, docs_m1=None, n_iter=1, n_partitions=1):
''' Randomised SVD in local mode
Perform Randomised SVD on scaled M2.
PARAMETERS
-----------
docs : n_docs-by-vocab_size array or csr_matrix
Entire collection of word count vectors.
alpha0 : float
Sum of Dirichlet prior parameter.
k : int
Rank for the truncated SVD, >= 1.
docs_m1: length-vocab_size array, optional
M1 of the entire collection of word count vectors.
n_iter: int, optional
Number of iterations for the Krylov method, >= 0, 1 by default.
n_partitions: int, optional
Number of partitions, >= 1, 1 by default.
RETURNS
-----------
eigval : length-k array
Top k eigenvalues of scaled M2.
eigvec : vocab_size-by-k array
Top k eigenvectors of scaled M2.
'''
# pylint: disable=too-many-arguments
n_docs, vocab_size = docs.shape
assert n_docs >= 1 and vocab_size >= 1
if docs_m1 is not None:
assert docs_m1.ndim == 1 and vocab_size == docs_m1.shape[0]
assert alpha0 > 0
assert k >= 1
assert n_iter >= 0
assert n_partitions >= 1
# Augment slightly k for better convergence
k_aug = np.min([k + 5, vocab_size])
# Gaussian test matrix
test_x = np.random.randn(vocab_size, k_aug)
# Krylov method
if docs_m1 is None:
docs_m1 = moment1(docs, n_partitions=n_partitions)
for _ in range(2 * n_iter + 1):
prod_test = prod_m2_x(docs, test_x, alpha0,
docs_m1=docs_m1, n_partitions=n_partitions)
test_x, _ = scipy.linalg.qr(prod_test, mode='economic')
# X^T M2 M2 X = Q S Q^T
# If M2 M2 = X Q S Q^T X^T, then the above holds,
# where X is an orthonormal test matrix.
prod_test = prod_m2_x(docs, test_x, alpha0,
n_partitions=n_partitions)
prod_test *= alpha0 * (alpha0 + 1)
svd_q, svd_s, _ = scipy.linalg.svd(prod_test.T.dot(prod_test))
return np.sqrt(svd_s)[:k], test_x.dot(svd_q)[:, :k]