Introduce `AlignedVec`, a capacity-bounded vector whose visible elements start
on a 32-byte boundary, and switch the x86 real-FFT coefficient tables to it.

NOTE(review): this patch was recovered from a whitespace-collapsed copy in
which `<...>` generic arguments had been stripped. Line breaks, generic
parameters, and context-line indentation are reconstructed. Hunk line counts
agree with the hunk headers, but confirm the context whitespace against the
target tree before applying.

diff --git a/EngineCore/src/yfft/src/aligned.rs b/EngineCore/src/yfft/src/aligned.rs
new file mode 100644
index 00000000..f7e9dd48
--- /dev/null
+++ b/EngineCore/src/yfft/src/aligned.rs
@@ -0,0 +1,74 @@
+//
+// Copyright 2018 yvt, all rights reserved.
+//
+// This source code is a part of Nightingales.
+//
+use std::{
+    fmt,
+    mem::size_of,
+    ops::{Deref, DerefMut},
+};
+
+/// The alignment value guaranteed by `AlignedVec`.
+const ALIGN: usize = 32;
+
+fn ptr_lsbs(x: usize) -> usize {
+    x & (ALIGN - 1)
+}
+
+/// Provides a subset of `Vec`'s interface while providing a minimum alignment
+/// guarantee that is convenient for SIMD operations.
+pub struct AlignedVec<T> {
+    storage: Vec<T>,
+    offset: usize,
+}
+
+impl<T: Default> AlignedVec<T> {
+    pub fn with_capacity(i: usize) -> Self {
+        debug_assert!(size_of::<T>() <= ALIGN);
+        debug_assert!(ALIGN % size_of::<T>() == 0);
+
+        let mut storage: Vec<T> = Vec::with_capacity(i + ALIGN / size_of::<T>() - 1);
+        let mut offset = 0;
+
+        // Increase the padding until the storage is aligned
+        while ptr_lsbs(storage.as_ptr().wrapping_add(offset) as _) != 0 {
+            storage.push(T::default());
+            offset += 1;
+
+            debug_assert!(offset < ALIGN / size_of::<T>());
+        }
+
+        Self { storage, offset }
+    }
+
+    pub fn push(&mut self, x: T) {
+        if self.storage.len() >= self.storage.capacity() {
+            panic!("collection is full");
+        }
+        self.storage.push(x);
+    }
+}
+
+impl<T: fmt::Debug> fmt::Debug for AlignedVec<T> {
+    fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
+        fmt.debug_struct("AlignedVec")
+            .field("offset", &self.offset)
+            .field("entries", &&self[..])
+            .finish()
+    }
+}
+
+impl<T> Deref for AlignedVec<T> {
+    type Target = [T];
+
+    fn deref(&self) -> &Self::Target {
+        &self.storage[self.offset..]
+    }
+}
+
+impl<T> DerefMut for AlignedVec<T> {
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        &mut self.storage[self.offset..]
+    }
+}
diff --git a/EngineCore/src/yfft/src/kernel/x86/x86avxf32realfft.rs b/EngineCore/src/yfft/src/kernel/x86/x86avxf32realfft.rs
index 085ec7a4..5fc13e08 100644
--- a/EngineCore/src/yfft/src/kernel/x86/x86avxf32realfft.rs
+++ b/EngineCore/src/yfft/src/kernel/x86/x86avxf32realfft.rs
@@ -12,6 +12,7 @@ use std::f32;
 use std::mem;
 use std::ptr::{read_unaligned, write_unaligned};
 
+use aligned::AlignedVec;
 use simdutils::{avx_f32x8_bitxor, avx_f32x8_complex_mul_riri};
 use Num;
 
@@ -39,7 +40,7 @@ where
 #[derive(Debug)]
 struct AvxF32RealFFTPrePostProcessKernel {
     len: usize,
-    table: [Vec<f32>; 2],
+    table: [AlignedVec<f32>; 2],
     inverse: bool,
 }
 
diff --git a/EngineCore/src/yfft/src/kernel/x86/x86sse1realfft.rs b/EngineCore/src/yfft/src/kernel/x86/x86sse1realfft.rs
index 5d544817..a1a3083e 100644
--- a/EngineCore/src/yfft/src/kernel/x86/x86sse1realfft.rs
+++ b/EngineCore/src/yfft/src/kernel/x86/x86sse1realfft.rs
@@ -12,6 +12,7 @@ use std::f32;
 use std::mem;
 use std::ptr::{read_unaligned, write_unaligned};
 
+use aligned::AlignedVec;
 use simdutils::{f32x4_bitxor, f32x4_complex_mul_rrii};
 use {mul_pos_i, Complex, Num};
 
@@ -34,10 +35,10 @@ where
     })
 }
 
-pub(super) fn new_real_fft_coef_table(len: usize, inverse: bool) -> [Vec<f32>; 2] {
+pub(super) fn new_real_fft_coef_table(len: usize, inverse: bool) -> [AlignedVec<f32>; 2] {
     assert!(len % 2 == 0);
-    let mut table_a = Vec::with_capacity(len);
-    let mut table_b = Vec::with_capacity(len);
+    let mut table_a = AlignedVec::with_capacity(len);
+    let mut table_b = AlignedVec::with_capacity(len);
     for i in 0..(len / 2) {
         let c = Complex::new(0f32, (i as f32) * -f32::consts::PI / (len / 2) as f32).exp();
 
@@ -61,7 +62,7 @@ pub(super) fn new_real_fft_coef_table(len: usize, inverse: bool) -> [Vec<f32>; 2
 #[derive(Debug)]
 struct SseRealFFTPrePostProcessKernel {
     len: usize,
-    table: [Vec<f32>; 2],
+    table: [AlignedVec<f32>; 2],
     inverse: bool,
 }
 
diff --git a/EngineCore/src/yfft/src/kernel/x86/x86sse3f32realfft.rs b/EngineCore/src/yfft/src/kernel/x86/x86sse3f32realfft.rs
index c4c96277..b857f38a 100644
--- a/EngineCore/src/yfft/src/kernel/x86/x86sse3f32realfft.rs
+++ b/EngineCore/src/yfft/src/kernel/x86/x86sse3f32realfft.rs
@@ -12,6 +12,7 @@ use std::f32;
 use std::mem;
 use std::ptr::{read_unaligned, write_unaligned};
 
+use aligned::AlignedVec;
 use simdutils::{f32x4_bitxor, sse3_f32x4_complex_mul_riri};
 use Num;
 
@@ -39,7 +40,7 @@ where
 #[derive(Debug)]
 struct Sse3F32RealFFTPrePostProcessKernel {
     len: usize,
-    table: [Vec<f32>; 2],
+    table: [AlignedVec<f32>; 2],
     inverse: bool,
 }
 
diff --git a/EngineCore/src/yfft/src/lib.rs b/EngineCore/src/yfft/src/lib.rs
index f3bd6aa1..4538660f 100644
--- a/EngineCore/src/yfft/src/lib.rs
+++ b/EngineCore/src/yfft/src/lib.rs
@@ -42,6 +42,7 @@ use num_complex::Complex;
 
 #[macro_use]
 mod simdutils;
+mod aligned;
 mod env;
 mod kernel;
 mod setup;