From 41b52d25a5efb79344857b402c6017042dfc20a6 Mon Sep 17 00:00:00 2001
From: IgorSusmelj
Date: Sun, 11 Aug 2024 14:58:21 +0200
Subject: [PATCH 1/3] Add transpose function. Reuse shape for row_slice.

---
 rustynum-rs/src/num_array/num_array.rs | 58 ++++++++++++++++++++++++--
 1 file changed, 55 insertions(+), 3 deletions(-)

diff --git a/rustynum-rs/src/num_array/num_array.rs b/rustynum-rs/src/num_array/num_array.rs
index 6fe47eb..a85bb8c 100644
--- a/rustynum-rs/src/num_array/num_array.rs
+++ b/rustynum-rs/src/num_array/num_array.rs
@@ -195,6 +195,29 @@ where
         }
     }
 
+    /// Returns the transpose of a 2D matrix as a new row-major `NumArray`.
+    ///
+    /// # Returns
+    /// A new `NumArray` instance that is the transpose of the original matrix.
+    pub fn transpose(&self) -> Self {
+        assert!(
+            self.shape.len() == 2,
+            "Transpose is only valid for 2D matrices."
+        );
+
+        let rows = self.shape[0];
+        let cols = self.shape[1];
+        let mut transposed_data = vec![T::default(); rows * cols];
+
+        for i in 0..rows {
+            for j in 0..cols {
+                transposed_data[j * rows + i] = self.data[i * cols + j];
+            }
+        }
+
+        NumArray::new_with_shape(transposed_data, vec![cols, rows])
+    }
+
     /// Retrieves a reference to the underlying data vector.
     ///
     /// # Returns
@@ -617,9 +640,10 @@
     /// println!("Row slice: {:?}", row_slice);
     /// ```
     pub fn row_slice(&self, row: usize) -> &[T] {
-        assert_eq!(self.shape().len(), 2, "Only 2D arrays are supported.");
-        let start = row * self.shape()[1];
-        let end = start + self.shape()[1];
+        let shape = self.shape();
+        assert_eq!(shape.len(), 2, "Only 2D arrays are supported.");
+        let start = row * shape[1];
+        let end = start + shape[1];
         &self.data[start..end]
     }
 
@@ -829,6 +853,34 @@ mod tests {
         assert_eq!(ones_array.get_data(), &[1.0, 1.0, 1.0, 1.0, 1.0, 1.0]);
     }
 
+    #[test]
+    fn test_matrix_transpose() {
+        let matrix = NumArray32::new_with_shape(
+            vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0],
+            vec![3, 3],
+        );
+
+        let transposed = matrix.transpose();
+
+        let expected_data = vec![1.0, 4.0, 7.0, 2.0, 5.0, 8.0, 3.0, 6.0, 9.0];
+
+        assert_eq!(transposed.shape(), &[3, 3]);
+        assert_eq!(transposed.get_data(), &expected_data);
+    }
+
+    #[test]
+    fn test_non_square_matrix_transpose() {
+        let matrix =
+            NumArray32::new_with_shape(vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0], vec![2, 4]);
+
+        let transposed = matrix.transpose();
+
+        let expected_data = vec![1.0, 5.0, 2.0, 6.0, 3.0, 7.0, 4.0, 8.0];
+
+        assert_eq!(transposed.shape(), &[4, 2]);
+        assert_eq!(transposed.get_data(), &expected_data);
+    }
+
     #[test]
     fn test_reshape_successfully() {
         let array = NumArray32::new(vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0]);

From 0a157a8e400cba8575d29214fed921bbd140da3e Mon Sep 17 00:00:00 2001
From: IgorSusmelj
Date: Sun, 11 Aug 2024 15:00:09 +0200
Subject: [PATCH 2/3] Use FMA instructions for multiply-add in dot product.
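
The accumulator update in the SIMD loop now uses StdFloat::mul_add,
which fuses the multiply and the add into a single operation with a
single rounding step: `sum = a_simd.mul_add(b_simd, sum)` computes
`a_simd * b_simd + sum` per lane.

A minimal sketch of the semantics, not part of this patch (assumes a
nightly toolchain with the portable_simd feature; the splatted values
are illustrative only):

    #![feature(portable_simd)]
    use std::simd::{f32x16, num::SimdFloat, StdFloat};

    fn main() {
        let x = f32x16::splat(2.0);
        let y = f32x16::splat(3.0);
        let acc = f32x16::splat(1.0);
        // mul_add computes x * y + acc in every lane, with one rounding.
        let fused = x.mul_add(y, acc); // 2.0 * 3.0 + 1.0 = 7.0 per lane
        assert_eq!(fused.reduce_sum(), 7.0 * 16.0);
    }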
---
 rustynum-rs/src/simd_ops/mod.rs | 30 ++++++++++++++----------------
 1 file changed, 14 insertions(+), 16 deletions(-)

diff --git a/rustynum-rs/src/simd_ops/mod.rs b/rustynum-rs/src/simd_ops/mod.rs
index 10194e2..25e4656 100644
--- a/rustynum-rs/src/simd_ops/mod.rs
+++ b/rustynum-rs/src/simd_ops/mod.rs
@@ -1,6 +1,7 @@
 use std::simd::f32x16;
 use std::simd::f64x8;
 use std::simd::num::SimdFloat;
+use std::simd::StdFloat;
 
 const LANES_32: usize = 16;
 const LANES_64: usize = 8;
@@ -16,13 +17,13 @@ impl SimdOps for f32x16 {
     fn dot_product(a: &[f32], b: &[f32]) -> f32 {
         assert_eq!(a.len(), b.len());
         let len = a.len();
         let mut sum = f32x16::splat(0.0);
         let chunks = len / LANES_32;
 
         for i in 0..chunks {
             let a_simd = f32x16::from_slice(&a[i * LANES_32..]);
             let b_simd = f32x16::from_slice(&b[i * LANES_32..]);
-            sum += a_simd * b_simd;
+            sum = a_simd.mul_add(b_simd, sum);
         }
 
         let mut scalar_sum = sum.reduce_sum();
@@ -87,25 +88,22 @@
 
 impl SimdOps for f64x8 {
     fn dot_product(a: &[f64], b: &[f64]) -> f64 {
         assert_eq!(a.len(), b.len());
+        let len = a.len();
+        let mut sum = f64x8::splat(0.0);
 
-        let (a_extra, a_chunks) = a.as_rchunks();
-        let (b_extra, b_chunks) = b.as_rchunks();
-
-        // These are always true, but for emphasis:
-        assert_eq!(a_chunks.len(), b_chunks.len());
-        assert_eq!(a_extra.len(), b_extra.len());
-
-        let mut sums = [0.0; LANES_64];
-        for ((x, y), d) in std::iter::zip(a_extra, b_extra).zip(&mut sums) {
-            *d = x * y;
+        let chunks = len / LANES_64;
+        for i in 0..chunks {
+            let a_simd = f64x8::from_slice(&a[i * LANES_64..]);
+            let b_simd = f64x8::from_slice(&b[i * LANES_64..]);
+            sum = a_simd.mul_add(b_simd, sum);
         }
-        let mut sums = f64x8::from_array(sums);
-        std::iter::zip(a_chunks, b_chunks).for_each(|(x, y)| {
-            sums += f64x8::from_array(*x) * f64x8::from_array(*y);
-        });
+        let mut scalar_sum = sum.reduce_sum();
+        for i in (chunks * LANES_64)..len {
+            scalar_sum += a[i] * b[i];
+        }
 
-        sums.reduce_sum()
+        scalar_sum
     }
 
     fn sum(a: &[f64]) -> f64 {

From b7e2a19daad3670ac637ad0059f2efbb5fe87523 Mon Sep 17 00:00:00 2001
From: IgorSusmelj
Date: Sun, 11 Aug 2024 15:03:24 +0200
Subject: [PATCH 3/3] Use matrix transpose before matrix-matrix multiplication.
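
In row-major storage the j-th column of rhs is strided (stride = cols),
so the old column_slice had to gather it element by element into a
fresh owned Vec for every column. Transposing rhs once up front makes
each original column a contiguous row, which row_slice can hand to the
SIMD dot product as a plain slice without copying.

A rough sketch of the layout change (hypothetical 2x3 values, not taken
from the code):

    // Row-major rhs, shape [2, 3]:   data = [a, b, c, d, e, f]
    // Column 1 is [b, e] -> indices 1 and 4, stride 3 (strided gather).
    //
    // After transpose, shape [3, 2]: data = [a, d, b, e, c, f]
    // Column 1 of the original is now row 1 -> indices 2..4, contiguous.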
---
 rustynum-rs/src/num_array/linalg.rs | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/rustynum-rs/src/num_array/linalg.rs b/rustynum-rs/src/num_array/linalg.rs
index e35b7fa..184b910 100644
--- a/rustynum-rs/src/num_array/linalg.rs
+++ b/rustynum-rs/src/num_array/linalg.rs
@@ -42,30 +42,35 @@ where
     );
 
     let rows = lhs.shape()[0];
-    let cols = if rhs.shape().len() > 1 {
-        rhs.shape()[1]
+    let rhs_is_matrix = rhs.shape().len() > 1;
+    let cols = if rhs_is_matrix { rhs.shape()[1] } else { 1 };
+
+    // Transpose the right-hand side once so each of its columns becomes a contiguous row.
+    let rhs_transposed = if rhs_is_matrix {
+        rhs.transpose()
     } else {
-        1
+        rhs.clone()
     };
 
-    let mut result = if rhs.shape().len() > 1 {
+    let mut result = if rhs_is_matrix {
        NumArray::new_with_shape(vec![T::default(); rows * cols], vec![rows, cols])
     } else {
        NumArray::new(vec![T::default(); rows])
     };
 
     for j in 0..cols {
-        let rhs_col = if rhs.shape().len() > 1 {
-            rhs.column_slice(j)
+        let rhs_col: &[T] = if rhs_is_matrix {
+            rhs_transposed.row_slice(j) // Use row_slice since rhs is transposed
         } else {
-            rhs.get_data().to_vec()
+            rhs.get_data() // A vector is already one contiguous column
         };
+
         for i in 0..rows {
             let lhs_row = lhs.row_slice(i);
             let sum = Ops::dot_product(lhs_row, &rhs_col);
 
-            if rhs.shape().len() > 1 {
+            if rhs_is_matrix {
                 result.set(&[i, j], sum);
             } else {
                 result.set(&[i], sum);