ml-explore · davidkoski · Jul 15, 2024 · Jul 15, 2024 · davidkoski · Jul 15, 2024
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -11,17 +11,9 @@ endif()
 FetchContent_Declare(
   mlx-c
   GIT_REPOSITORY "https://github.com/ml-explore/mlx-c.git"
-  GIT_TAG "v0.0.8")
+  GIT_TAG "v0.0.9")
 FetchContent_MakeAvailable(mlx-c)
 
-# TEMPORARY OVERRIDE -- 0.0.8 depends on v0.14.0 but we need v0.15.2 for iOS /
-# float16 issues
-FetchContent_Declare(
-  mlx
-  GIT_REPOSITORY "https://github.com/ml-explore/mlx.git"
-  GIT_TAG v0.15.2)
-FetchContent_MakeAvailable(mlx)
-
 # swift-numerics
 set(swift_numerics_patch git apply
                          ${CMAKE_CURRENT_SOURCE_DIR}/cmake/swift-numerics.patch)

diff --git a/Package.swift b/Package.swift
@@ -105,6 +105,10 @@ let package = Package(
                 "mlx/mlx/distributed/mpi",
                 "mlx/mlx/distributed/ops.cpp",
                 "mlx/mlx/distributed/primitives.cpp",
+
+                // the mlx-c side of distributed
+                "include/mlx/c/distributed.cpp",
+                "include/mlx/c/distributed_group.cpp",
             ],
 
             cSettings: [

diff --git a/Source/Cmlx/mlx b/Source/Cmlx/mlx
diff --git a/Source/Cmlx/mlx-c b/Source/Cmlx/mlx-c
diff --git a/Source/Cmlx/mlx-generated/hadamard.cpp b/Source/Cmlx/mlx-generated/hadamard.cpp
@@ -0,0 +1,128 @@
+namespace mlx::core::metal {
+
+const char* hadamard() {
+  return R"preamble(
+
+using namespace metal;
+template <short R>
+METAL_FUNC void radix_func(thread float* x) {
+  constexpr short logR = __builtin_ctz(R);
+  short h = 1;
+#pragma clang loop unroll(full)
+  for (short s = 0; s < logR; s++) {
+#pragma clang loop unroll(full)
+    for (short i = 0; i < R / 2; i++) {
+      short k = i & (h - 1);
+      short j = ((i - k) << 1) + k;
+      float a = x[j];
+      float b = x[j + h];
+      x[j] = a + b;
+      x[j + h] = a - b;
+    }
+    h <<= 1;
+  }
+}
+template <typename T, int N, int max_radix, int read_width>
+[[kernel]] void hadamard_n(
+    const device T* in [[buffer(0)]],
+    device T* out [[buffer(1)]],
+    constant const float& scale,
+    uint3 elem [[thread_position_in_grid]],
+    uint3 grid [[threads_per_grid]]) {
+  constexpr short num_threads = N / max_radix;
+  constexpr short logN = __builtin_ctz(N);
+  constexpr short logR = __builtin_ctz(max_radix);
+  constexpr short num_steps = logN / logR;
+  constexpr short logFinal = logN % logR;
+  constexpr short final_radix = 1 << (logFinal);
+  int batch_idx = elem.x * N;
+  short i = elem.y;
+  threadgroup T buf[N];
+#pragma clang loop unroll(full)
+  for (short j = 0; j < max_radix / read_width; j++) {
+    short index = j * read_width * num_threads + i * read_width;
+#pragma clang loop unroll(full)
+    for (short r = 0; r < read_width; r++) {
+      buf[index + r] = in[batch_idx + index + r];
+    }
+  }
+  threadgroup_barrier(mem_flags::mem_threadgroup);
+  float x[max_radix];
+  short h = 1;
+#pragma clang loop unroll(full)
+  for (short s = 0; s < num_steps; s++) {
+    short k = i & (h - 1);
+    short j = ((i - k) << logR) + k;
+#pragma clang loop unroll(full)
+    for (short r = 0; r < max_radix; r++) {
+      x[r] = buf[j + h * r];
+    }
+    radix_func<max_radix>(x);
+#pragma clang loop unroll(full)
+    for (short r = 0; r < max_radix; r++) {
+      buf[j + h * r] = x[r];
+    }
+    h <<= logR;
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+  }
+  if (final_radix > 1) {
+#pragma clang loop unroll(full)
+    for (int t = 0; t < max_radix / final_radix; t++) {
+      short index = i + t * num_threads;
+      short k = index & (h - 1);
+      short j = ((index - k) << logFinal) + k;
+#pragma clang loop unroll(full)
+      for (short r = 0; r < final_radix; r++) {
+        x[r] = buf[j + h * r];
+      }
+      radix_func<final_radix>(x);
+#pragma clang loop unroll(full)
+      for (short r = 0; r < final_radix; r++) {
+        buf[j + h * r] = x[r];
+      }
+    }
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+  }
+#pragma clang loop unroll(full)
+  for (short j = 0; j < max_radix / read_width; j++) {
+    short index = j * read_width * num_threads + i * read_width;
+#pragma clang loop unroll(full)
+    for (short r = 0; r < read_width; r++) {
+      out[batch_idx + index + r] = buf[index + r] * scale;
+    }
+  }
+}
+template <typename T, int N, int M, int read_width>
+[[kernel]] void hadamard_m(
+    const device T* in [[buffer(0)]],
+    device T* out [[buffer(1)]],
+    constant const float& scale,
+    uint3 elem [[thread_position_in_grid]],
+    uint3 grid [[threads_per_grid]]) {
+  int index = elem.x * grid.y + elem.y;
+  short i = index % (N / read_width);
+  int batch_idx = index / (N / read_width) * M * N;
+  float x[read_width][M];
+#pragma clang loop unroll(full)
+  for (short c = 0; c < M; c++) {
+#pragma clang loop unroll(full)
+    for (short r = 0; r < read_width; r++) {
+      x[r][c] = in[batch_idx + c * N + i * read_width + r];
+    }
+  }
+#pragma clang loop unroll(full)
+  for (short r = 0; r < read_width; r++) {
+    hadamard_radix_m(x[r]);
+  }
+#pragma clang loop unroll(full)
+  for (short c = 0; c < M; c++) {
+#pragma clang loop unroll(full)
+    for (short r = 0; r < read_width; r++) {
+      out[batch_idx + c * N + i * read_width + r] = x[r][c] * scale;
+    }
+  }
+}
+)preamble";
+}
+
+} // namespace mlx::core::metal
diff --git a/Source/MLX/Documentation.docc/free-functions.md b/Source/MLX/Documentation.docc/free-functions.md
@@ -222,3 +222,4 @@ operations as methods for convenience.
 
 - ``diag(_:k:stream:)``
 - ``diagonal(_:offset:axis1:axis2:stream:)``
+- ``view(_:dtype:stream:)``
diff --git a/Source/MLX/MLXArray+Ops.swift b/Source/MLX/MLXArray+Ops.swift
@@ -2658,4 +2658,20 @@ extension MLXArray {
         MLXArray(mlx_var_all(ctx, keepDims, ddof.int32, stream.ctx))
     }
 
+    /// View the array as a different type.
+    ///
+    /// The output array's shape will change along the last axis if the input
+    /// array's type and the output array's type do not have the same size.
+    ///
+    /// Note: the view op does not imply that the input and output arrays share
+    /// their underlying data. The view only guarantees that the binary
+    /// representation of each element (or group of elements) is the same.
+    ///
+    /// - Parameters:
+    ///   - dtype: type to change to
+    ///   - stream: stream or device to evaluate on
+    /// - Returns: array with the new type
+    public func view(dtype: DType, stream: StreamOrDevice = .default) -> MLXArray {
+        MLXArray(mlx_view(ctx, dtype.cmlxDtype, stream.ctx))
+    }
 }
diff --git a/Source/MLX/Ops+Array.swift b/Source/MLX/Ops+Array.swift
@@ -1715,3 +1715,22 @@ public func variance(
 ) -> MLXArray {
     MLXArray(mlx_var_all(array.ctx, keepDims, ddof.int32, stream.ctx))
 }
+
+/// View the array as a different type.
+///
+/// The output array's shape will change along the last axis if the input
+/// array's type and the output array's type do not have the same size.
+///
+/// Note: the view op does not imply that the input and output arrays share
+/// their underlying data. The view only guarantees that the binary
+/// representation of each element (or group of elements) is the same.
+///
+/// - Parameters:
+///   - dtype: type to change to
+///   - stream: stream or device to evaluate on
+///
+/// ### See Also
+/// - ``MLXArray/view(dtype:stream:)``
+public func view(_ array: MLXArray, dtype: DType, stream: StreamOrDevice = .default) -> MLXArray {
+    MLXArray(mlx_view(array.ctx, dtype.cmlxDtype, stream.ctx))
+}
diff --git a/tools/update-mlx.sh b/tools/update-mlx.sh
@@ -18,7 +18,8 @@ cmake ../Source/Cmlx/mlx -DMLX_METAL_JIT=ON -DMACOS_VERSION=14.0
 
 # NOTE:
 # until mlx supports overriding the METAL_VERSION you will need to edit
-# Source/Cmlx/mlx/mlx/backend/metal/CMakeLists.txt and manually set the METAL_VERSION.
+# Source/Cmlx/mlx/mlx/backend/metal/CMakeLists.txt and manually set the METAL_VERSION
+# to "3.0"
 #
 # Also Plugins/PrepareMetalShaders/main.swift kernels needs to be in sync.
 
@@ -34,6 +35,7 @@ make \
     fft \
     gather \
     gemm \
+    hadamard \
     quantized \
     reduce \
     reduce_utils \
+1 −1		CMakeLists.txt
+70 −0		benchmarks/python/hadamard_bench.py
+1 −0		docs/src/python/ops.rst
+1 −0		docs/src/python/transforms.rst
+5 −1		mlx/array.cpp
+17 −24		mlx/backend/accelerate/primitives.cpp
+1 −0		mlx/backend/common/CMakeLists.txt
+1 −1		mlx/backend/common/common.cpp
+75 −43		mlx/backend/common/copy.cpp
+2 −1		mlx/backend/common/default_primitives.cpp
+107 −0		mlx/backend/common/hadamard.cpp
+105 −0		mlx/backend/common/hadamard.h
+11 −1		mlx/backend/common/primitives.cpp
+9 −0		mlx/backend/common/utils.h
+2 −0		mlx/backend/metal/CMakeLists.txt
+1 −0		mlx/backend/metal/fft.cpp
+203 −0		mlx/backend/metal/hadamard.cpp
+1 −0		mlx/backend/metal/jit/includes.h
+167 −0		mlx/backend/metal/kernels/hadamard.h
+13 −2		mlx/backend/metal/primitives.cpp
+0 −11		mlx/backend/metal/utils.h
+2 −1		mlx/backend/no_cpu/primitives.cpp
+2 −1		mlx/backend/no_metal/primitives.cpp
+9 −8		mlx/fast.cpp
+13 −1		mlx/ops.cpp
+6 −0		mlx/ops.h
+97 −9		mlx/primitives.cpp
+52 −11		mlx/primitives.h
+6 −4		mlx/random.cpp
+79 −7		mlx/transforms.cpp
+27 −2		mlx/transforms.h
+16 −0		mlx/transforms_impl.h
+12 −0		mlx/utils.h
+3 −3		python/mlx/nn/layers/base.py
+1 −1		python/src/fast.cpp
+29 −0		python/src/ops.cpp
+451 −2		python/src/transforms.cpp
+13 −0		python/tests/test_array.py
+84 −0		python/tests/test_autograd.py
+113 −0		python/tests/test_ops.py
+1 −1		setup.py
+5 −2		CMakeLists.txt
+1 −1		docs/src/conf.py
+5 −0		docs/src/distributed_group.rst
+5 −0		docs/src/distributed_ops.rst
+2 −0		docs/src/index.rst
+45 −43		mlx/c/array.cpp
+35 −29		mlx/c/closure.cpp
+4 −3		mlx/c/compile.cpp
+1 −0		mlx/c/compile.h
+6 −5		mlx/c/device.cpp
+30 −0		mlx/c/distributed.cpp
+36 −0		mlx/c/distributed.h
+34 −0		mlx/c/distributed_group.cpp
+54 −0		mlx/c/distributed_group.h
+48 −0		mlx/c/error.cpp
+41 −0		mlx/c/error.h
+5 −4		mlx/c/fast.cpp
+1 −0		mlx/c/fast.h
+13 −12		mlx/c/fft.cpp
+1 −0		mlx/c/fft.h
+5 −5		mlx/c/future.cpp
+9 −8		mlx/c/io.cpp
+1 −0		mlx/c/io.h
+5 −6		mlx/c/ioutils.cpp
+8 −7		mlx/c/linalg.cpp
+1 −0		mlx/c/linalg.h
+20 −12		mlx/c/map.cpp
+5 −4		mlx/c/metal.cpp
+1 −0		mlx/c/metal.h
+189 −183		mlx/c/ops.cpp
+2 −0		mlx/c/ops.h
+17 −0		mlx/c/private/distributed_group.h
+42 −14		mlx/c/private/utils.h
+16 −15		mlx/c/random.cpp
+1 −0		mlx/c/random.h
+12 −13		mlx/c/stream.cpp
+3 −2		mlx/c/string.cpp
+7 −6		mlx/c/transforms.cpp
+1 −0		mlx/c/transforms.h
+8 −4		mlx/c/transforms_impl.cpp
+2 −0		mlx/c/transforms_impl.h
+30 −18		python/c.py
+4 −1		python/generator.py
Original file line number	Diff line number	Diff line change
Expand Up		@@ -222,3 +222,4 @@ operations as methods for convenience.

		- ``diag(_:k:stream:)``
		- ``diagonal(_:offset:axis1:axis2:stream:)``
		- ``view(_:dtype:stream:)``