From 1750fc7420592dcedb48d8e630d9e51a850cebdb Mon Sep 17 00:00:00 2001 From: Arlo Godfrey Date: Tue, 26 Sep 2023 19:07:41 -0500 Subject: [PATCH 1/5] Implemented IUmapDataPoint, cleanup IUmapDistanceParameter. This was abstracting away each separate vector in the embedding, rather than abstracting away the vector array float[] --- UMAP/DistanceCalculation.cs | 2 +- UMAP/IUmapDataPoint.cs | 13 +++++++ UMAP/IUmapDistance.cs | 11 ------ UMAP/NNDescent.cs | 6 ++-- UMAP/SIMD.cs | 23 ++++++------ UMAP/Tree.cs | 19 +++++----- UMAP/Umap.cs | 70 ++++++++++++++++++++----------------- 7 files changed, 73 insertions(+), 71 deletions(-) create mode 100644 UMAP/IUmapDataPoint.cs delete mode 100644 UMAP/IUmapDistance.cs diff --git a/UMAP/DistanceCalculation.cs b/UMAP/DistanceCalculation.cs index 2f1abd4..e48fc83 100644 --- a/UMAP/DistanceCalculation.cs +++ b/UMAP/DistanceCalculation.cs @@ -1,4 +1,4 @@ namespace UMAP { - public delegate float DistanceCalculation(IUmapDistanceParameter[] x, IUmapDistanceParameter[] y); + public delegate float DistanceCalculation(T x, T y) where T : IUmapDataPoint; } \ No newline at end of file diff --git a/UMAP/IUmapDataPoint.cs b/UMAP/IUmapDataPoint.cs new file mode 100644 index 0000000..8765e60 --- /dev/null +++ b/UMAP/IUmapDataPoint.cs @@ -0,0 +1,13 @@ +namespace UMAP +{ + /// + /// Represents a single data point to be processed by . + /// + public interface IUmapDataPoint + { + /// + /// The data being operated on. + /// + public float[] Data { get; } + } +} \ No newline at end of file diff --git a/UMAP/IUmapDistance.cs b/UMAP/IUmapDistance.cs deleted file mode 100644 index 7e2d16e..0000000 --- a/UMAP/IUmapDistance.cs +++ /dev/null @@ -1,11 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Text; - -namespace UMAP -{ - public interface IUmapDistanceParameter - { - float EmbeddingVectorValue { get; set; } - } -} diff --git a/UMAP/NNDescent.cs b/UMAP/NNDescent.cs index 5ec378e..03ebe1c 100644 --- a/UMAP/NNDescent.cs +++ b/UMAP/NNDescent.cs @@ -4,10 +4,10 @@ namespace UMAP { - internal static class NNDescent + internal static class NNDescent where T : IUmapDataPoint { public delegate (int[][] indices, float[][] weights) NNDescentFn( - IUmapDistanceParameter[][] data, + T[] data, int[][] leafArray, int nNeighbors, int nIters = 10, @@ -15,7 +15,7 @@ public delegate (int[][] indices, float[][] weights) NNDescentFn( float delta = 0.001f, float rho = 0.5f, bool rpTreeInit = true, - Action startingIteration = null + Action? startingIteration = null ); /// diff --git a/UMAP/SIMD.cs b/UMAP/SIMD.cs index 2564a17..139bc60 100644 --- a/UMAP/SIMD.cs +++ b/UMAP/SIMD.cs @@ -13,7 +13,7 @@ internal static class SIMD private static readonly int _vs4 = 4 * Vector.Count; [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static float Magnitude(ref IUmapDistanceParameter[] vec) => (float)Math.Sqrt(DotProduct(ref vec, ref vec)); + public static float Magnitude(ref float[] vec) => (float)Math.Sqrt(DotProduct(ref vec, ref vec)); [MethodImpl(MethodImplOptions.AggressiveInlining)] public static float Euclidean(ref float[] lhs, ref float[] rhs) @@ -179,20 +179,17 @@ public static void Multiply(ref float[] lhs, float f) } [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static float DotProduct(ref IUmapDistanceParameter[] lhs, ref IUmapDistanceParameter[] rhs) + public static float DotProduct(ref float[] lhs, ref float[] rhs) { - - var lhsArray = lhs.Select(x => x.EmbeddingVectorValue).ToArray(); - var rhsArray = rhs.Select(x=>x.EmbeddingVectorValue).ToArray(); var result = 0f; var count = lhs.Length; var offset = 0; while (count >= _vs4) { - result += Vector.Dot(new Vector(lhsArray, offset), new Vector(rhsArray, offset)); - result += Vector.Dot(new Vector(lhsArray, offset + _vs1), new Vector(rhsArray, offset + _vs1)); - result += Vector.Dot(new Vector(lhsArray, offset + _vs2), new Vector(rhsArray, offset + _vs2)); - result += Vector.Dot(new Vector(lhsArray, offset + _vs3), new Vector(rhsArray, offset + _vs3)); + result += Vector.Dot(new Vector(lhs, offset), new Vector(rhs, offset)); + result += Vector.Dot(new Vector(lhs, offset + _vs1), new Vector(rhs, offset + _vs1)); + result += Vector.Dot(new Vector(lhs, offset + _vs2), new Vector(rhs, offset + _vs2)); + result += Vector.Dot(new Vector(lhs, offset + _vs3), new Vector(rhs, offset + _vs3)); if (count == _vs4) { return result; @@ -203,8 +200,8 @@ public static float DotProduct(ref IUmapDistanceParameter[] lhs, ref IUmapDis } if (count >= _vs2) { - result += Vector.Dot(new Vector(lhsArray, offset), new Vector(rhsArray, offset)); - result += Vector.Dot(new Vector(lhsArray, offset + _vs1), new Vector(rhsArray, offset + _vs1)); + result += Vector.Dot(new Vector(lhs, offset), new Vector(rhs, offset)); + result += Vector.Dot(new Vector(lhs, offset + _vs1), new Vector(rhs, offset + _vs1)); if (count == _vs2) { return result; @@ -215,7 +212,7 @@ public static float DotProduct(ref IUmapDistanceParameter[] lhs, ref IUmapDis } if (count >= _vs1) { - result += Vector.Dot(new Vector(lhsArray, offset), new Vector(rhsArray, offset)); + result += Vector.Dot(new Vector(lhs, offset), new Vector(rhs, offset)); if (count == _vs1) { return result; @@ -228,7 +225,7 @@ public static float DotProduct(ref IUmapDistanceParameter[] lhs, ref IUmapDis { while (count > 0) { - result += lhsArray[offset] * rhsArray[offset]; + result += lhs[offset] * rhs[offset]; offset++; count--; } } diff --git a/UMAP/Tree.cs b/UMAP/Tree.cs index d78439e..1601264 100644 --- a/UMAP/Tree.cs +++ b/UMAP/Tree.cs @@ -4,18 +4,18 @@ namespace UMAP { - internal static class Tree + internal static class Tree where T : IUmapDataPoint { /// /// Construct a random projection tree based on ``data`` with leaves of size at most ``leafSize`` /// - public static RandomProjectionTreeNode MakeTree(IUmapDistanceParameter[][] data, int leafSize, int n, IProvideRandomValues random) + public static RandomProjectionTreeNode MakeTree(T[] data, int leafSize, int n, IProvideRandomValues random) { var indices = Enumerable.Range(0, data.Length).ToArray(); return MakeEuclideanTree(data, indices, leafSize, n, random); } - private static RandomProjectionTreeNode MakeEuclideanTree(IUmapDistanceParameter[][] data, int[] indices, int leafSize, int q, IProvideRandomValues random) + private static RandomProjectionTreeNode MakeEuclideanTree(T[] data, int[] indices, int leafSize, int q, IProvideRandomValues random) { if (indices.Length > leafSize) { @@ -50,9 +50,10 @@ public static FlatTree FlattenTree(RandomProjectionTreeNode tree, int leafSize) /// the basis for a random projection tree, which simply uses this splitting recursively. This particular split uses euclidean distance to determine the hyperplane and which side each data /// sample falls on. /// - private static (int[] indicesLeft, int[] indicesRight, float[] hyperplaneVector, float hyperplaneOffset) EuclideanRandomProjectionSplit(IUmapDistanceParameter[][] data, int[] indices, IProvideRandomValues random) + private static (int[] indicesLeft, int[] indicesRight, float[] hyperplaneVector, float hyperplaneOffset) EuclideanRandomProjectionSplit(T[] data, int[] indices, IProvideRandomValues random) { - var dim = data[0].Length; + var vectorData = data.Select(x => x.Data).ToArray(); + var dim = vectorData[0].Length; // Select two random points, set the hyperplane between them var leftIndex = random.Next(0, indices.Length); @@ -67,10 +68,8 @@ private static (int[] indicesLeft, int[] indicesRight, float[] hyperplaneVector, var hyperplaneVector = new float[dim]; for (var i = 0; i < hyperplaneVector.Length; i++) { - var leftVectorValue = data[left][i].EmbeddingVectorValue; - var rightVectorValue = data[right][i].EmbeddingVectorValue; - hyperplaneVector[i] = leftVectorValue - rightVectorValue; - hyperplaneOffset -= (hyperplaneVector[i] * (leftVectorValue + rightVectorValue)) / 2; + hyperplaneVector[i] = vectorData[left][i] - vectorData[right][i]; + hyperplaneOffset -= (hyperplaneVector[i] * (vectorData[left][i] + vectorData[right][i])) / 2; } // For each point compute the margin (project into normal vector) @@ -83,7 +82,7 @@ private static (int[] indicesLeft, int[] indicesRight, float[] hyperplaneVector, var margin = hyperplaneOffset; for (var d = 0; d < dim; d++) { - margin += hyperplaneVector[d] * data[indices[i]][d].EmbeddingVectorValue; + margin += hyperplaneVector[d] * vectorData[indices[i]][d]; } if (margin == 0) diff --git a/UMAP/Umap.cs b/UMAP/Umap.cs index 288e446..bdacae0 100644 --- a/UMAP/Umap.cs +++ b/UMAP/Umap.cs @@ -6,7 +6,7 @@ namespace UMAP { - public sealed class Umap + public sealed class Umap where T : IUmapDataPoint { private const float SMOOTH_K_TOLERANCE = 1e-5f; private const float MIN_K_DIST_SCALE = 1e-3f; @@ -26,12 +26,12 @@ public sealed class Umap private readonly ProgressReporter _progressReporter; // KNN state (can be precomputed and supplied via initializeFit) - private int[][] _knnIndices = null; - private float[][] _knnDistances = null; + private int[][]? _knnIndices = null; + private float[][]? _knnDistances = null; // Internal graph connectivity representation - private SparseMatrix _graph = null; - private IUmapDistanceParameter[][] _x = null; + private SparseMatrix? _graph = null; + private T[]? _x = null; private bool _isInitialized = false; private Tree.FlatTree[] _rpForest = new Tree.FlatTree[0]; @@ -69,7 +69,7 @@ public Umap( /// Initializes fit by computing KNN and a fuzzy simplicial set, as well as initializing the projected embeddings. Sets the optimization state ahead of optimization steps. /// Returns the number of epochs to be used for the SGD optimization. /// - public int InitializeFit(IUmapDistanceParameter[][] x) + public int InitializeFit(T[] x) { // We don't need to reinitialize if we've already initialized for this data if ((_x == x) && _isInitialized) @@ -149,7 +149,7 @@ private int GetNEpochs() /// /// Compute the ``nNeighbors`` nearest points for each data point in ``X`` - this may be exact, but more likely is approximated via nearest neighbor descent. /// - internal (int[][] knnIndices, float[][] knnDistances) NearestNeighbors(IUmapDistanceParameter[][] x, ProgressReporter progressReporter) + internal (int[][] knnIndices, float[][] knnDistances) NearestNeighbors(T[] x, ProgressReporter progressReporter) { var metricNNDescent = NNDescent.MakeNNDescent(_distanceFn, _random); progressReporter(0.05f); @@ -169,8 +169,6 @@ private int GetNEpochs() progressReporter(0.45f); var nnDescendProgressReporter = ScaleProgressReporter(progressReporter, 0.5f, 1); - var organizedDataList = new List<(float left, float right)>(); - return metricNNDescent(x, leafArray, _nNeighbors, nIters, startingIteration: (i, max) => nnDescendProgressReporter((float)i / max)); // Handle python3 rounding down from 0.5 discrpancy @@ -182,7 +180,7 @@ private int GetNEpochs() /// to the data. This is done by locally approximating geodesic distance at each point, creating a fuzzy simplicial set for each such point, and then combining all the local fuzzy /// simplicial sets into a global one via a fuzzy union. /// - private SparseMatrix FuzzySimplicialSet(IUmapDistanceParameter[][] x, int nNeighbors, float setOpMixRatio, ProgressReporter progressReporter) + private SparseMatrix FuzzySimplicialSet(T[] x, int nNeighbors, float setOpMixRatio, ProgressReporter progressReporter) { var knnIndices = _knnIndices ?? new int[0][]; var knnDistances = _knnDistances ?? new float[0][]; @@ -629,42 +627,48 @@ private static ProgressReporter ScaleProgressReporter(ProgressReporter progressR return progress => progressReporter((range * progress) + start); } - public static class DistanceFunctions + public static class DistanceFunctions where T : IUmapDataPoint { - public static float Cosine(IUmapDistanceParameter[] lhs, IUmapDistanceParameter[] rhs) + public static float Cosine(T lhs, T rhs) { - return 1 - (SIMD.DotProduct(ref lhs, ref rhs) / (SIMD.Magnitude(ref lhs) * SIMD.Magnitude(ref rhs))); + var lhsVal = lhs.Data; + var rhsVal = rhs.Data; + return 1 - (SIMD.DotProduct(ref lhsVal, ref rhsVal) / (SIMD.Magnitude(ref lhsVal) * SIMD.Magnitude(ref rhsVal))); } - public static float CosineForNormalizedVectors(IUmapDistanceParameter[] lhs, IUmapDistanceParameter[] rhs) + public static float CosineForNormalizedVectors(T lhs, T rhs) { - return 1 - SIMD.DotProduct(ref lhs, ref rhs); + var lhsVal = lhs.Data; + var rhsVal = rhs.Data; + return 1 - SIMD.DotProduct(ref lhsVal, ref rhsVal); } - public static float Euclidean(float[] lhs, float[] rhs) + public static float Euclidean(T lhs, T rhs) { - return (float)Math.Sqrt(SIMD.Euclidean(ref lhs, ref rhs)); // TODO: Replace with netcore3 MathF class when the framework is available + var lhsVal = lhs.Data; + var rhsVal = rhs.Data; + return (float)Math.Sqrt(SIMD.Euclidean(ref lhsVal, ref rhsVal)); // TODO: Replace with netcore3 MathF class when the framework is available } } private sealed class OptimizationState { - public int CurrentEpoch = 0; - public int[] Head = new int[0]; - public int[] Tail = new int[0]; - public float[] EpochsPerSample = new float[0]; - public float[] EpochOfNextSample = new float[0]; - public float[] EpochOfNextNegativeSample= new float[0]; - public float[] EpochsPerNegativeSample = new float[0]; - public bool MoveOther = true; - public float InitialAlpha = 1; - public float Alpha = 1; - public float Gamma = 1; - public float A = 1.5769434603113077f; - public float B = 0.8950608779109733f; - public int Dim = 2; - public int NEpochs = 500; - public int NVertices = 0; + public int CurrentEpoch = 0; + public int[] Head = new int[0]; + public int[] Tail = new int[0]; + public float[] EpochsPerSample = new float[0]; + public float[] EpochOfNextSample = new float[0]; + public float[] EpochOfNextNegativeSample = new float[0]; + public float[] EpochsPerNegativeSample = new float[0]; + public bool MoveOther = true; + public float InitialAlpha = 1; + public float Alpha = 1; + public float Gamma = 1; + public float A = 1.5769434603113077f; + public float B = 0.8950608779109733f; + public int Dim = 2; + public int NEpochs = 500; + public int NVertices = 0; [MethodImpl(MethodImplOptions.AggressiveInlining)] public float GetDistanceFactor(float distSquared) => 1f / ((0.001f + distSquared) * (float)(A * Math.Pow(distSquared, B) + 1)); From e8322427beee92d088e2ff2d8b2a13b8bddc2e33 Mon Sep 17 00:00:00 2001 From: Arlo Godfrey Date: Tue, 26 Sep 2023 20:02:31 -0500 Subject: [PATCH 2/5] Remove public modifier (C# 8+ only) --- UMAP/IUmapDataPoint.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/UMAP/IUmapDataPoint.cs b/UMAP/IUmapDataPoint.cs index 8765e60..91f088c 100644 --- a/UMAP/IUmapDataPoint.cs +++ b/UMAP/IUmapDataPoint.cs @@ -8,6 +8,6 @@ public interface IUmapDataPoint /// /// The data being operated on. /// - public float[] Data { get; } + float[] Data { get; } } } \ No newline at end of file From 1c28b9f33eb7171996c5c80fd1856b3c3a47c20e Mon Sep 17 00:00:00 2001 From: Arlo Godfrey Date: Tue, 26 Sep 2023 20:02:43 -0500 Subject: [PATCH 3/5] Fixed conflicting generic name --- UMAP/Umap.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/UMAP/Umap.cs b/UMAP/Umap.cs index bdacae0..c5a8103 100644 --- a/UMAP/Umap.cs +++ b/UMAP/Umap.cs @@ -381,7 +381,7 @@ private static (int[] rows, int[] cols, float[] vals) ComputeMembershipStrengths return (head.ToArray(), tail.ToArray(), MakeEpochsPerSample(weights.ToArray(), nEpochs)); } - private void ShuffleTogether(List list, List other, List weights) + private void ShuffleTogether(List list, List other, List weights) { int n = list.Count; if (other.Count != n) { throw new Exception(); } @@ -389,7 +389,7 @@ private void ShuffleTogether(List list, List other, List w { n--; int k = _random.Next(0, n + 1); - T value = list[k]; + T1 value = list[k]; list[k] = list[n]; list[n] = value; From aa8fe99b2eca66a9e8c4ddb1bb1d25b3f6f64e37 Mon Sep 17 00:00:00 2001 From: Arlo Godfrey Date: Tue, 26 Sep 2023 20:05:14 -0500 Subject: [PATCH 4/5] Reverted nullable annotations (C# 8+ only) --- UMAP/NNDescent.cs | 2 +- UMAP/Umap.cs | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/UMAP/NNDescent.cs b/UMAP/NNDescent.cs index 03ebe1c..978af97 100644 --- a/UMAP/NNDescent.cs +++ b/UMAP/NNDescent.cs @@ -15,7 +15,7 @@ public delegate (int[][] indices, float[][] weights) NNDescentFn( float delta = 0.001f, float rho = 0.5f, bool rpTreeInit = true, - Action? startingIteration = null + Action startingIteration = null ); /// diff --git a/UMAP/Umap.cs b/UMAP/Umap.cs index c5a8103..5498d28 100644 --- a/UMAP/Umap.cs +++ b/UMAP/Umap.cs @@ -1,4 +1,4 @@ -using System; +using System; using System.Collections.Generic; using System.Linq; using System.Runtime.CompilerServices; @@ -26,12 +26,12 @@ public sealed class Umap where T : IUmapDataPoint private readonly ProgressReporter _progressReporter; // KNN state (can be precomputed and supplied via initializeFit) - private int[][]? _knnIndices = null; - private float[][]? _knnDistances = null; + private int[][] _knnIndices = null; + private float[][] _knnDistances = null; // Internal graph connectivity representation - private SparseMatrix? _graph = null; - private T[]? _x = null; + private SparseMatrix _graph = null; + private T[] _x = null; private bool _isInitialized = false; private Tree.FlatTree[] _rpForest = new Tree.FlatTree[0]; @@ -57,7 +57,7 @@ public Umap( throw new ArgumentOutOfRangeException(nameof(customNumberOfEpochs), "if non-null then must be a positive value"); } - _distanceFn = distance ?? DistanceFunctions.Cosine; + _distanceFn = distance ?? DistanceFunctions.Cosine; _random = random ?? DefaultRandomGenerator.Instance; _nNeighbors = numberOfNeighbors; _optimizationState = new OptimizationState { Dim = dimensions }; From 99767f137d57ad508d28f0957f358b971d67c482 Mon Sep 17 00:00:00 2001 From: Arlo Godfrey Date: Tue, 26 Sep 2023 20:05:49 -0500 Subject: [PATCH 5/5] Removed generic from DistanceFunctions, now inherits from Umap --- UMAP/Umap.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/UMAP/Umap.cs b/UMAP/Umap.cs index 5498d28..a05a2d2 100644 --- a/UMAP/Umap.cs +++ b/UMAP/Umap.cs @@ -1,4 +1,4 @@ -using System; +using System; using System.Collections.Generic; using System.Linq; using System.Runtime.CompilerServices; @@ -627,7 +627,7 @@ private static ProgressReporter ScaleProgressReporter(ProgressReporter progressR return progress => progressReporter((range * progress) + start); } - public static class DistanceFunctions where T : IUmapDataPoint + public static class DistanceFunctions { public static float Cosine(T lhs, T rhs) {