Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implemented IUmapDataPoint, cleanup IUmapDistanceParameter. #1

Merged
merged 5 commits into from
Sep 27, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion UMAP/DistanceCalculation.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
namespace UMAP
{
public delegate float DistanceCalculation<T>(IUmapDistanceParameter<T>[] x, IUmapDistanceParameter<T>[] y);
public delegate float DistanceCalculation<T>(T x, T y) where T : IUmapDataPoint;
}
13 changes: 13 additions & 0 deletions UMAP/IUmapDataPoint.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
namespace UMAP
{
/// <summary>
/// Represents a single data point to be processed by <see cref="Umap{T}"/>.
/// </summary>
/// <remarks>
/// <see cref="Umap{T}"/> and the <c>DistanceCalculation</c> delegate both constrain their type parameter
/// to this interface, so the same element type is used for the input array and for the arguments
/// handed to the distance function.
/// </remarks>
public interface IUmapDataPoint
{
/// <summary>
/// The data being operated on: the vector of float values for this point.
/// </summary>
/// <remarks>
/// The built-in distance functions (Cosine, CosineForNormalizedVectors, Euclidean) read this array
/// from both operands. The random projection tree split takes its dimensionality from the first
/// point's array and indexes every other point's array up to that length, so all points passed
/// together are expected to expose arrays of equal length.
/// </remarks>
float[] Data { get; }
}
}
11 changes: 0 additions & 11 deletions UMAP/IUmapDistance.cs

This file was deleted.

4 changes: 2 additions & 2 deletions UMAP/NNDescent.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@

namespace UMAP
{
internal static class NNDescent<T>
internal static class NNDescent<T> where T : IUmapDataPoint
{
public delegate (int[][] indices, float[][] weights) NNDescentFn(
IUmapDistanceParameter<T>[][] data,
T[] data,
int[][] leafArray,
int nNeighbors,
int nIters = 10,
Expand Down
23 changes: 10 additions & 13 deletions UMAP/SIMD.cs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ internal static class SIMD<T>
private static readonly int _vs4 = 4 * Vector<float>.Count;

[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static float Magnitude(ref IUmapDistanceParameter<T>[] vec) => (float)Math.Sqrt(DotProduct(ref vec, ref vec));
public static float Magnitude(ref float[] vec) => (float)Math.Sqrt(DotProduct(ref vec, ref vec));

[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static float Euclidean(ref float[] lhs, ref float[] rhs)
Expand Down Expand Up @@ -179,20 +179,17 @@ public static void Multiply(ref float[] lhs, float f)
}

[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static float DotProduct(ref IUmapDistanceParameter<T>[] lhs, ref IUmapDistanceParameter<T>[] rhs)
public static float DotProduct(ref float[] lhs, ref float[] rhs)
{

var lhsArray = lhs.Select(x => x.EmbeddingVectorValue).ToArray();
var rhsArray = rhs.Select(x=>x.EmbeddingVectorValue).ToArray();
var result = 0f;
var count = lhs.Length;
var offset = 0;
while (count >= _vs4)
{
result += Vector.Dot(new Vector<float>(lhsArray, offset), new Vector<float>(rhsArray, offset));
result += Vector.Dot(new Vector<float>(lhsArray, offset + _vs1), new Vector<float>(rhsArray, offset + _vs1));
result += Vector.Dot(new Vector<float>(lhsArray, offset + _vs2), new Vector<float>(rhsArray, offset + _vs2));
result += Vector.Dot(new Vector<float>(lhsArray, offset + _vs3), new Vector<float>(rhsArray, offset + _vs3));
result += Vector.Dot(new Vector<float>(lhs, offset), new Vector<float>(rhs, offset));
result += Vector.Dot(new Vector<float>(lhs, offset + _vs1), new Vector<float>(rhs, offset + _vs1));
result += Vector.Dot(new Vector<float>(lhs, offset + _vs2), new Vector<float>(rhs, offset + _vs2));
result += Vector.Dot(new Vector<float>(lhs, offset + _vs3), new Vector<float>(rhs, offset + _vs3));
if (count == _vs4)
{
return result;
Expand All @@ -203,8 +200,8 @@ public static float DotProduct(ref IUmapDistanceParameter<T>[] lhs, ref IUmapDis
}
if (count >= _vs2)
{
result += Vector.Dot(new Vector<float>(lhsArray, offset), new Vector<float>(rhsArray, offset));
result += Vector.Dot(new Vector<float>(lhsArray, offset + _vs1), new Vector<float>(rhsArray, offset + _vs1));
result += Vector.Dot(new Vector<float>(lhs, offset), new Vector<float>(rhs, offset));
result += Vector.Dot(new Vector<float>(lhs, offset + _vs1), new Vector<float>(rhs, offset + _vs1));
if (count == _vs2)
{
return result;
Expand All @@ -215,7 +212,7 @@ public static float DotProduct(ref IUmapDistanceParameter<T>[] lhs, ref IUmapDis
}
if (count >= _vs1)
{
result += Vector.Dot(new Vector<float>(lhsArray, offset), new Vector<float>(rhsArray, offset));
result += Vector.Dot(new Vector<float>(lhs, offset), new Vector<float>(rhs, offset));
if (count == _vs1)
{
return result;
Expand All @@ -228,7 +225,7 @@ public static float DotProduct(ref IUmapDistanceParameter<T>[] lhs, ref IUmapDis
{
while (count > 0)
{
result += lhsArray[offset] * rhsArray[offset];
result += lhs[offset] * rhs[offset];
offset++; count--;
}
}
Expand Down
19 changes: 9 additions & 10 deletions UMAP/Tree.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,18 @@

namespace UMAP
{
internal static class Tree<T>
internal static class Tree<T> where T : IUmapDataPoint
{
/// <summary>
/// Construct a random projection tree based on ``data`` with leaves of size at most ``leafSize``
/// </summary>
public static RandomProjectionTreeNode MakeTree(IUmapDistanceParameter<T>[][] data, int leafSize, int n, IProvideRandomValues random)
public static RandomProjectionTreeNode MakeTree(T[] data, int leafSize, int n, IProvideRandomValues random)
{
var indices = Enumerable.Range(0, data.Length).ToArray();
return MakeEuclideanTree(data, indices, leafSize, n, random);
}

private static RandomProjectionTreeNode MakeEuclideanTree(IUmapDistanceParameter<T>[][] data, int[] indices, int leafSize, int q, IProvideRandomValues random)
private static RandomProjectionTreeNode MakeEuclideanTree(T[] data, int[] indices, int leafSize, int q, IProvideRandomValues random)
{
if (indices.Length > leafSize)
{
Expand Down Expand Up @@ -50,9 +50,10 @@ public static FlatTree FlattenTree(RandomProjectionTreeNode tree, int leafSize)
/// the basis for a random projection tree, which simply uses this splitting recursively. This particular split uses euclidean distance to determine the hyperplane and which side each data
/// sample falls on.
/// </summary>
private static (int[] indicesLeft, int[] indicesRight, float[] hyperplaneVector, float hyperplaneOffset) EuclideanRandomProjectionSplit(IUmapDistanceParameter<T>[][] data, int[] indices, IProvideRandomValues random)
private static (int[] indicesLeft, int[] indicesRight, float[] hyperplaneVector, float hyperplaneOffset) EuclideanRandomProjectionSplit(T[] data, int[] indices, IProvideRandomValues random)
{
var dim = data[0].Length;
var vectorData = data.Select(x => x.Data).ToArray();
var dim = vectorData[0].Length;

// Select two random points, set the hyperplane between them
var leftIndex = random.Next(0, indices.Length);
Expand All @@ -67,10 +68,8 @@ private static (int[] indicesLeft, int[] indicesRight, float[] hyperplaneVector,
var hyperplaneVector = new float[dim];
for (var i = 0; i < hyperplaneVector.Length; i++)
{
var leftVectorValue = data[left][i].EmbeddingVectorValue;
var rightVectorValue = data[right][i].EmbeddingVectorValue;
hyperplaneVector[i] = leftVectorValue - rightVectorValue;
hyperplaneOffset -= (hyperplaneVector[i] * (leftVectorValue + rightVectorValue)) / 2;
hyperplaneVector[i] = vectorData[left][i] - vectorData[right][i];
hyperplaneOffset -= (hyperplaneVector[i] * (vectorData[left][i] + vectorData[right][i])) / 2;
}

// For each point compute the margin (project into normal vector)
Expand All @@ -83,7 +82,7 @@ private static (int[] indicesLeft, int[] indicesRight, float[] hyperplaneVector,
var margin = hyperplaneOffset;
for (var d = 0; d < dim; d++)
{
margin += hyperplaneVector[d] * data[indices[i]][d].EmbeddingVectorValue;
margin += hyperplaneVector[d] * vectorData[indices[i]][d];
}

if (margin == 0)
Expand Down
70 changes: 37 additions & 33 deletions UMAP/Umap.cs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

namespace UMAP
{
public sealed class Umap<T>
public sealed class Umap<T> where T : IUmapDataPoint
{
private const float SMOOTH_K_TOLERANCE = 1e-5f;
private const float MIN_K_DIST_SCALE = 1e-3f;
Expand All @@ -31,7 +31,7 @@ public sealed class Umap<T>

// Internal graph connectivity representation
private SparseMatrix _graph = null;
private IUmapDistanceParameter<T>[][] _x = null;
private T[] _x = null;
private bool _isInitialized = false;
private Tree<T>.FlatTree[] _rpForest = new Tree<T>.FlatTree[0];

Expand All @@ -57,7 +57,7 @@ public Umap(
throw new ArgumentOutOfRangeException(nameof(customNumberOfEpochs), "if non-null then must be a positive value");
}

_distanceFn = distance ?? DistanceFunctions<T>.Cosine;
_distanceFn = distance ?? DistanceFunctions.Cosine;
_random = random ?? DefaultRandomGenerator.Instance;
_nNeighbors = numberOfNeighbors;
_optimizationState = new OptimizationState { Dim = dimensions };
Expand All @@ -69,7 +69,7 @@ public Umap(
/// Initializes fit by computing KNN and a fuzzy simplicial set, as well as initializing the projected embeddings. Sets the optimization state ahead of optimization steps.
/// Returns the number of epochs to be used for the SGD optimization.
/// </summary>
public int InitializeFit(IUmapDistanceParameter<T>[][] x)
public int InitializeFit(T[] x)
{
// We don't need to reinitialize if we've already initialized for this data
if ((_x == x) && _isInitialized)
Expand Down Expand Up @@ -149,7 +149,7 @@ private int GetNEpochs()
/// <summary>
/// Compute the ``nNeighbors`` nearest points for each data point in ``X`` - this may be exact, but more likely is approximated via nearest neighbor descent.
/// </summary>
internal (int[][] knnIndices, float[][] knnDistances) NearestNeighbors(IUmapDistanceParameter<T>[][] x, ProgressReporter progressReporter)
internal (int[][] knnIndices, float[][] knnDistances) NearestNeighbors(T[] x, ProgressReporter progressReporter)
{
var metricNNDescent = NNDescent<T>.MakeNNDescent(_distanceFn, _random);
progressReporter(0.05f);
Expand All @@ -169,8 +169,6 @@ private int GetNEpochs()
progressReporter(0.45f);
var nnDescendProgressReporter = ScaleProgressReporter(progressReporter, 0.5f, 1);

var organizedDataList = new List<(float left, float right)>();

return metricNNDescent(x, leafArray, _nNeighbors, nIters, startingIteration: (i, max) => nnDescendProgressReporter((float)i / max));

// Handle python3 rounding down from 0.5 discrepancy
Expand All @@ -182,7 +180,7 @@ private int GetNEpochs()
/// to the data. This is done by locally approximating geodesic distance at each point, creating a fuzzy simplicial set for each such point, and then combining all the local fuzzy
/// simplicial sets into a global one via a fuzzy union.
/// </summary>
private SparseMatrix FuzzySimplicialSet(IUmapDistanceParameter<T>[][] x, int nNeighbors, float setOpMixRatio, ProgressReporter progressReporter)
private SparseMatrix FuzzySimplicialSet(T[] x, int nNeighbors, float setOpMixRatio, ProgressReporter progressReporter)
{
var knnIndices = _knnIndices ?? new int[0][];
var knnDistances = _knnDistances ?? new float[0][];
Expand Down Expand Up @@ -383,15 +381,15 @@ private static (int[] rows, int[] cols, float[] vals) ComputeMembershipStrengths
return (head.ToArray(), tail.ToArray(), MakeEpochsPerSample(weights.ToArray(), nEpochs));
}

private void ShuffleTogether<T, T2, T3>(List<T> list, List<T2> other, List<T3> weights)
private void ShuffleTogether<T1, T2, T3>(List<T1> list, List<T2> other, List<T3> weights)
{
int n = list.Count;
if (other.Count != n) { throw new Exception(); }
while (n > 1)
{
n--;
int k = _random.Next(0, n + 1);
T value = list[k];
T1 value = list[k];
list[k] = list[n];
list[n] = value;

Expand Down Expand Up @@ -629,42 +627,48 @@ private static ProgressReporter ScaleProgressReporter(ProgressReporter progressR
return progress => progressReporter((range * progress) + start);
}

public static class DistanceFunctions<T>
public static class DistanceFunctions
{
public static float Cosine(IUmapDistanceParameter<T>[] lhs, IUmapDistanceParameter<T>[] rhs)
public static float Cosine(T lhs, T rhs)
{
return 1 - (SIMD<T>.DotProduct(ref lhs, ref rhs) / (SIMD<T>.Magnitude(ref lhs) * SIMD<T>.Magnitude(ref rhs)));
var lhsVal = lhs.Data;
var rhsVal = rhs.Data;
return 1 - (SIMD<T>.DotProduct(ref lhsVal, ref rhsVal) / (SIMD<T>.Magnitude(ref lhsVal) * SIMD<T>.Magnitude(ref rhsVal)));
}

public static float CosineForNormalizedVectors(IUmapDistanceParameter<T>[] lhs, IUmapDistanceParameter<T>[] rhs)
public static float CosineForNormalizedVectors(T lhs, T rhs)
{
return 1 - SIMD<T>.DotProduct(ref lhs, ref rhs);
var lhsVal = lhs.Data;
var rhsVal = rhs.Data;
return 1 - SIMD<T>.DotProduct(ref lhsVal, ref rhsVal);
}

public static float Euclidean(float[] lhs, float[] rhs)
public static float Euclidean(T lhs, T rhs)
{
return (float)Math.Sqrt(SIMD<T>.Euclidean(ref lhs, ref rhs)); // TODO: Replace with netcore3 MathF class when the framework is available
var lhsVal = lhs.Data;
var rhsVal = rhs.Data;
return (float)Math.Sqrt(SIMD<T>.Euclidean(ref lhsVal, ref rhsVal)); // TODO: Replace with netcore3 MathF class when the framework is available
}
}

private sealed class OptimizationState
{
public int CurrentEpoch = 0;
public int[] Head = new int[0];
public int[] Tail = new int[0];
public float[] EpochsPerSample = new float[0];
public float[] EpochOfNextSample = new float[0];
public float[] EpochOfNextNegativeSample= new float[0];
public float[] EpochsPerNegativeSample = new float[0];
public bool MoveOther = true;
public float InitialAlpha = 1;
public float Alpha = 1;
public float Gamma = 1;
public float A = 1.5769434603113077f;
public float B = 0.8950608779109733f;
public int Dim = 2;
public int NEpochs = 500;
public int NVertices = 0;
public int CurrentEpoch = 0;
public int[] Head = new int[0];
public int[] Tail = new int[0];
public float[] EpochsPerSample = new float[0];
public float[] EpochOfNextSample = new float[0];
public float[] EpochOfNextNegativeSample = new float[0];
public float[] EpochsPerNegativeSample = new float[0];
public bool MoveOther = true;
public float InitialAlpha = 1;
public float Alpha = 1;
public float Gamma = 1;
public float A = 1.5769434603113077f;
public float B = 0.8950608779109733f;
public int Dim = 2;
public int NEpochs = 500;
public int NVertices = 0;

[MethodImpl(MethodImplOptions.AggressiveInlining)]
public float GetDistanceFactor(float distSquared) => 1f / ((0.001f + distSquared) * (float)(A * Math.Pow(distSquared, B) + 1));
Expand Down