from __future__ import absolute_import, division, print_function
import warnings
from typing import Optional
from typing_extensions import Literal
import numpy as np
from odin.utils.crypto import md5_checksum
from odin.utils.mpi import MPI, cpu_count
from sklearn.decomposition import PCA
_cached_values = {}
__all__ = ['fast_tsne']
# ===========================================================================
# auto-select best TSNE
# ===========================================================================
def _create_key(framework, kwargs, md5):
  key = dict(kwargs)
  del key['verbose']
  key['md5'] = md5
  return framework + str(sorted(key.items(), key=lambda x: x[0]))
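# Illustrative only: a cache key concatenates the backend name with the
# sorted (parameter, value) pairs plus the md5 of the input, e.g.
#   "sklearn[('angle', 0.5), ('early_exaggeration', 12.0), ..., ('md5', '9e107d9d...')]"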
def fast_tsne(
    *X,
    n_components: int = 2,
    max_samples: Optional[int] = None,
    perplexity: float = 30.0,
    early_exaggeration: float = 12.0,
    learning_rate: float = 200.0,
    n_iter: int = 1000,
    n_iter_without_progress: int = 300,
    exaggeration_iter: int = 250,
    perplexity_max_iter: int = 100,
    min_grad_norm: float = 1e-7,
    method: str = 'barnes_hut',
    metric: str = "euclidean",
    init: str = "random",
    angle: float = 0.5,
    n_jobs: Optional[int] = 4,
    merge_inputs: bool = True,
    pca_preprocessing: bool = True,
    return_model: bool = False,
    random_state: int = 1,
    verbose: int = 0,
    framework: Literal['auto', 'sklearn', 'cuml'] = 'auto',
):
""" t-Stochastic Nearest Neighbors.
If the algorithm take unexpected long time for running, lower the
`exaggeration_iter`, or reduce the amount of samples by downsampling
the dataset.
Parameters
----------
n_components : int, optional (default: 2)
Dimension of the embedded space.
max_samples : {int, None}
if given, downsampling the data to given number of sample
perplexity : float, optional (default: 30)
The perplexity is related to the number of nearest neighbors that
is used in other manifold learning algorithms. Larger datasets
usually require a larger perplexity. Consider selecting a value
between 5 and 50. The choice is not extremely critical since t-SNE
is quite insensitive to this parameter.
early_exaggeration : float, optional (default: 8.0)
Controls how tight natural clusters in the original space are in
the embedded space and how much space will be between them. For
larger values, the space between natural clusters will be larger
in the embedded space. Again, the choice of this parameter is not
very critical. If the cost function increases during initial
optimization, the early exaggeration factor or the learning rate
might be too high.
learning_rate : float, optional (default: 200.0)
The learning rate for t-SNE is usually in the range [10.0, 1000.0]. If
the learning rate is too high, the data may look like a 'ball' with any
point approximately equidistant from its nearest neighbours. If the
learning rate is too low, most points may look compressed in a dense
cloud with few outliers. If the cost function gets stuck in a bad local
minimum increasing the learning rate may help.
n_iter : int, optional (default: 1000)
Maximum number of iterations for the optimization. Should be at
least 250.
n_iter_without_progress : int, optional (default: 300)
Maximum number of iterations without progress before we abort the
optimization, used after 250 initial iterations with early
exaggeration. Note that progress is only checked every 50 iterations so
this value is rounded to the next multiple of 50.
perplexity_max_iter : int, (default 100)
The number of epochs the best gaussian bands are found for.
exaggeration_iter : int, (default 250)
To promote the growth of clusters, set this higher.
min_grad_norm : float, optional (default: 1e-7)
If the gradient norm is below this threshold, the optimization will
be stopped.
metric : string or callable, optional
The metric to use when calculating distance between instances in a
feature array. If metric is a string, it must be one of the options
allowed by scipy.spatial.distance.pdist for its metric parameter, or
a metric listed in pairwise.PAIRWISE_DISTANCE_FUNCTIONS.
If metric is "precomputed", X is assumed to be a distance matrix.
Alternatively, if metric is a callable function, it is called on each
pair of instances (rows) and the resulting value recorded. The callable
should take two arrays from X as input and return a value indicating
the distance between them. The default is "euclidean" which is
interpreted as squared euclidean distance.
init : string or numpy array, optional (default: "random")
Initialization of embedding. Possible options are 'random', 'pca',
and a numpy array of shape (n_samples, n_components).
PCA initialization cannot be used with precomputed distances and is
usually more globally stable than random initialization.
verbose : int, optional (default: 0)
Verbosity level, a number from 0 to 6.
random_state : int, RandomState instance or None, optional (default: None)
If int, random_state is the seed used by the random number generator;
If RandomState instance, random_state is the random number generator;
If None, the random number generator is the RandomState instance used
by `np.random`. Note that different initializations might result in
different local minima of the cost function.
method : string (default: 'barnes_hut')
By default the gradient calculation algorithm uses Barnes-Hut
approximation running in O(NlogN) time. method='exact'
will run on the slower, but exact, algorithm in O(N^2) time. The
exact algorithm should be used when nearest-neighbor errors need
to be better than 3%. However, the exact method cannot scale to
millions of examples.
angle : float (default: 0.5)
Only used if method='barnes_hut'
This is the trade-off between speed and accuracy for Barnes-Hut T-SNE.
'angle' is the angular size (referred to as theta in [3]) of a distant
node as measured from a point. If this size is below 'angle' then it is
used as a summary node of all points contained within it.
This method is not very sensitive to changes in this parameter
in the range of 0.2 - 0.8. Angle less than 0.2 has quickly increasing
computation time and angle greater 0.8 has quickly increasing error.
return_model : a Boolean, if `True`,
return the trained t-SNE model
merge_inputs : a Boolean, if `True`,
merge all arrays into a single array
for training t-SNE.
"""
  assert len(X) > 0, "No input is given!"
  if isinstance(X[0], (tuple, list)):
    X = X[0]
  if not all(isinstance(x, np.ndarray) for x in X):
    raise ValueError("`X` can only be a numpy.ndarray or a list of numpy.ndarray")
  # ====== kwargs for creating the t-SNE class ====== #
  kwargs = dict(locals())
  del kwargs['X']
  kwargs.pop('merge_inputs')
  kwargs.pop('return_model')
  kwargs.pop('max_samples')
  kwargs.pop('framework')
  kwargs.pop('pca_preprocessing')
  # ====== downsampling ====== #
  if max_samples is not None:
    max_samples = int(max_samples)
    assert max_samples > 0
    new_X = []
    rand = random_state if isinstance(random_state, np.random.RandomState) else \
        np.random.RandomState(seed=random_state)
    for x in X:
      if x.shape[0] > max_samples:
        ids = rand.permutation(x.shape[0])[:max_samples]
        x = x[ids]
      new_X.append(x)
    X = new_X
  # ====== import the fastest available t-SNE implementation ====== #
  tsne_version = None
  if framework != 'sklearn':
    try:
      from cuml.manifold import TSNE
      tsne_version = 'cuda'
    except ImportError:
      warnings.warn("Could not import cuML; install RAPIDS cuML for "
                    "GPU-accelerated t-SNE")
      try:
        from MulticoreTSNE import MulticoreTSNE as TSNE
        tsne_version = 'multicore'
      except ImportError:
        warnings.warn(
            "Could not import MulticoreTSNE; pip install "
            "git+https://github.com/DmitryUlyanov/Multicore-TSNE.git")
  if tsne_version is None:
    from sklearn.manifold import TSNE
    tsne_version = 'sklearn'
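  # at this point `TSNE` is bound to the first backend found, in order of
  # preference: cuML (GPU) -> MulticoreTSNE (multi-core CPU) -> scikit-learn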
  # ====== modify kwargs to match the chosen implementation ====== #
  if tsne_version == 'cuda':
    del kwargs['n_jobs']
  elif tsne_version == 'multicore':
    del kwargs['perplexity_max_iter']
    del kwargs['exaggeration_iter']
  else:  # sklearn
    del kwargs['n_jobs']
    del kwargs['perplexity_max_iter']
    del kwargs['exaggeration_iter']
  # ====== getting cached values ====== #
  results = []
  X_new = []
  X_size = []
  if merge_inputs:
    X_size = [x.shape[0] for x in X]
    x = np.vstack(X) if len(X) > 1 else X[0]
    md5 = md5_checksum(x)
    key = _create_key(tsne_version, kwargs, md5)
    if key in _cached_values:
      results.append((0, _cached_values[key]))
    else:
      X_new.append((0, md5, x))
  else:
    for i, x in enumerate(X):
      md5 = md5_checksum(x)
      key = _create_key(tsne_version, kwargs, md5)
      if key in _cached_values:
        results.append((i, _cached_values[key]))
      else:
        X_new.append((i, md5, x))
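  # from here on: `X_new` holds (input-index, md5, array) jobs that still
  # need embedding, while `results` holds (input-index, embedding) pairs
  # that were served from the cache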
  # ====== perform t-SNE ====== #
  def apply_tsne(j):
    idx, md5, x = j
    if pca_preprocessing:
      x = PCA(n_components=None, random_state=random_state).fit_transform(x)
    tsne = TSNE(**kwargs)
    return (idx, md5, tsne.fit_transform(x), tsne if return_model else None)

  model = []
  if len(X_new) > 0:
    if len(X_new) == 1 or tsne_version in ('cuda', 'multicore'):
      # a single job, or a GPU/multicore backend: run sequentially, no MPI
      for job in X_new:
        idx, md5, y, m = apply_tsne(job)
        results.append((idx, y))
        _cached_values[_create_key(tsne_version, kwargs, md5)] = y
        model.append(m)
    else:
      # several independent jobs on the sklearn backend: run them in parallel
      mpi = MPI(jobs=X_new,
                func=apply_tsne,
                batch=1,
                ncpu=min(len(X_new), cpu_count() - 1))
      for idx, md5, y, m in mpi:
        results.append((idx, y))
        _cached_values[_create_key(tsne_version, kwargs, md5)] = y
        model.append(m)
  # ====== return and clean ====== #
  if merge_inputs and len(X_size) > 1:
    # split the merged embedding back into one embedding per input array
    indices = [0] + np.cumsum(X_size).tolist()
    results = [results[0][1][s:e] for s, e in zip(indices, indices[1:])]
  else:
    results = sorted(results, key=lambda a: a[0])
    results = [r[1] for r in results]
  results = results[0] if len(results) == 1 else results
  if return_model:
    return results, model
  del model
  return results
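# ===========================================================================
# Usage sketch
# ===========================================================================
# A minimal, illustrative example of calling `fast_tsne`; the random data,
# shapes, and variable names below are assumptions, not part of the library.
# It requires at least the scikit-learn fallback backend to be installed.
if __name__ == '__main__':
  x = np.random.RandomState(1).randn(500, 20)
  # single input: returns one embedding of shape (500, 2)
  z = fast_tsne(x, n_components=2, perplexity=30.0, framework='sklearn')
  print(z.shape)
  # two inputs embedded separately (merge_inputs=False): returns two arrays
  z1, z2 = fast_tsne(x[:200], x[200:], merge_inputs=False,
                     framework='sklearn')
  print(z1.shape, z2.shape)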