From 53fb776b4406e9d34a9d2c4ddc5597018ce4b3e6 Mon Sep 17 00:00:00 2001 From: Jesse Krijthe Date: Mon, 27 Nov 2023 13:35:53 +0100 Subject: [PATCH] Fixes an unescaped equation in the documentation and an incorrect type used in 2 print statements. --- DESCRIPTION | 1 + R/Rtsne.R | 2 +- man/Rtsne.Rd | 2 +- src/Rtsne.cpp | 4 ++-- 4 files changed, 5 insertions(+), 4 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index b18ea4c..66f8da8 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -10,6 +10,7 @@ Description: An R wrapper around the fast T-distributed Stochastic Neighbor Embedding implementation by Van der Maaten (see for more information on the original implementation). License: file LICENSE URL: https://github.com/jkrijthe/Rtsne +Encoding: UTF-8 Imports: Rcpp (>= 0.11.0), stats diff --git a/R/Rtsne.R b/R/Rtsne.R index bf018af..6a1d10b 100644 --- a/R/Rtsne.R +++ b/R/Rtsne.R @@ -2,7 +2,7 @@ #' #' Wrapper for the C++ implementation of Barnes-Hut t-Distributed Stochastic Neighbor Embedding. t-SNE is a method for constructing a low dimensional embedding of high-dimensional data, distances or similarities. Exact t-SNE can be computed by setting theta=0.0. #' -#' Given a distance matrix \eqn{D} between input objects (which by default, is the euclidean distances between two objects), we calculate a similarity score in the original space p_ij. \deqn{ p_{j | i} = \frac{\exp(-\|D_{ij}\|^2 / 2 \sigma_i^2)}{\sum_{k \neq i} \exp(-\|D_{ij}\|^2 / 2 \sigma_i^2)} } which is then symmetrized using: \deqn{ p_{i j}=\frac{p_{j|i} + p_{i|j}}{2n}.} The \eqn{\sigma} for each object is chosen in such a way that the perplexity of p_{j|i} has a value that is close to the user defined perplexity. This value effectively controls how many nearest neighbours are taken into account when constructing the embedding in the low-dimensional space. 
+#' Given a distance matrix \eqn{D} between input objects (which by default, is the euclidean distances between two objects), we calculate a similarity score in the original space: \deqn{ p_{j | i} = \frac{\exp(-\|D_{ij}\|^2 / 2 \sigma_i^2)}{\sum_{k \neq i} \exp(-\|D_{ij}\|^2 / 2 \sigma_i^2)} } which is then symmetrized using: \deqn{ p_{i j}=\frac{p_{j|i} + p_{i|j}}{2n}.} The \eqn{\sigma} for each object is chosen in such a way that the perplexity of \eqn{p_{j|i}} has a value that is close to the user defined perplexity. This value effectively controls how many nearest neighbours are taken into account when constructing the embedding in the low-dimensional space. #' For the low-dimensional space we use the Cauchy distribution (t-distribution with one degree of freedom) as the distribution of the distances to neighbouring objects: #' \deqn{ q_{i j} = \frac{(1+ \| y_i-y_j\|^2)^{-1}}{\sum_{k \neq l} 1+ \| y_k-y_l\|^2)^{-1}}.} #' By changing the location of the objects y in the embedding to minimize the Kullback-Leibler divergence between these two distributions \eqn{ q_{i j}} and \eqn{ p_{i j}}, we create a map that focusses on small-scale structure, due to the asymmetry of the KL-divergence. The t-distribution is chosen to avoid the crowding problem: in the original high dimensional space, there are potentially many equidistant objects with moderate distance from a particular object, more than can be accounted for in the low dimensional representation. The t-distribution makes sure that these objects are more spread out in the new representation. diff --git a/man/Rtsne.Rd b/man/Rtsne.Rd index be04dcc..a4916e0 100644 --- a/man/Rtsne.Rd +++ b/man/Rtsne.Rd @@ -130,7 +130,7 @@ List with the following elements: Wrapper for the C++ implementation of Barnes-Hut t-Distributed Stochastic Neighbor Embedding. t-SNE is a method for constructing a low dimensional embedding of high-dimensional data, distances or similarities. Exact t-SNE can be computed by setting theta=0.0. 
} \details{ -Given a distance matrix \eqn{D} between input objects (which by default, is the euclidean distances between two objects), we calculate a similarity score in the original space p_ij. \deqn{ p_{j | i} = \frac{\exp(-\|D_{ij}\|^2 / 2 \sigma_i^2)}{\sum_{k \neq i} \exp(-\|D_{ij}\|^2 / 2 \sigma_i^2)} } which is then symmetrized using: \deqn{ p_{i j}=\frac{p_{j|i} + p_{i|j}}{2n}.} The \eqn{\sigma} for each object is chosen in such a way that the perplexity of p_{j|i} has a value that is close to the user defined perplexity. This value effectively controls how many nearest neighbours are taken into account when constructing the embedding in the low-dimensional space. +Given a distance matrix \eqn{D} between input objects (which by default, is the euclidean distances between two objects), we calculate a similarity score in the original space: \deqn{ p_{j | i} = \frac{\exp(-\|D_{ij}\|^2 / 2 \sigma_i^2)}{\sum_{k \neq i} \exp(-\|D_{ij}\|^2 / 2 \sigma_i^2)} } which is then symmetrized using: \deqn{ p_{i j}=\frac{p_{j|i} + p_{i|j}}{2n}.} The \eqn{\sigma} for each object is chosen in such a way that the perplexity of \eqn{p_{j|i}} has a value that is close to the user defined perplexity. This value effectively controls how many nearest neighbours are taken into account when constructing the embedding in the low-dimensional space. For the low-dimensional space we use the Cauchy distribution (t-distribution with one degree of freedom) as the distribution of the distances to neighbouring objects: \deqn{ q_{i j} = \frac{(1+ \| y_i-y_j\|^2)^{-1}}{\sum_{k \neq l} 1+ \| y_k-y_l\|^2)^{-1}}.} By changing the location of the objects y in the embedding to minimize the Kullback-Leibler divergence between these two distributions \eqn{ q_{i j}} and \eqn{ p_{i j}}, we create a map that focusses on small-scale structure, due to the asymmetry of the KL-divergence. 
The t-distribution is chosen to avoid the crowding problem: in the original high dimensional space, there are potentially many equidistant objects with moderate distance from a particular object, more than can be accounted for in the low dimensional representation. The t-distribution makes sure that these objects are more spread out in the new representation. diff --git a/src/Rtsne.cpp b/src/Rtsne.cpp index 6b143f0..ade07d4 100644 --- a/src/Rtsne.cpp +++ b/src/Rtsne.cpp @@ -18,7 +18,7 @@ Rcpp::List Rtsne_cpp(NumericMatrix X, int no_dims, double perplexity, size_t N = X.ncol(), D = X.nrow(); double * data=X.begin(); - if (verbose) Rprintf("Read the %i x %i data matrix successfully!\n", N, D); + if (verbose) Rprintf("Read the %zu x %zu data matrix successfully!\n", N, D); std::vector Y(N * no_dims), costs(N), itercosts(static_cast(std::ceil(max_iter/50.0))); // Providing user-supplied solution. @@ -60,7 +60,7 @@ Rcpp::List Rtsne_nn_cpp(IntegerMatrix nn_dex, NumericMatrix nn_dist, double eta, double exaggeration_factor, unsigned int num_threads) { size_t N = nn_dex.ncol(), K=nn_dex.nrow(); // transposed - columns are points, rows are neighbors. - if (verbose) Rprintf("Read the NN results for %i points successfully!\n", N); + if (verbose) Rprintf("Read the NN results for %zu points successfully!\n", N); std::vector Y(N * no_dims), costs(N), itercosts(static_cast(std::ceil(max_iter/50.0))); // Providing user-supplied solution.