Skip to content

Commit

Permalink
Merge pull request #1807 from odin-lang/simd-dev
Browse files Browse the repository at this point in the history
Generic #simd type and intrinsics
  • Loading branch information
gingerBill authored May 31, 2022
2 parents a6c779b + 516f664 commit a1f15c2
Show file tree
Hide file tree
Showing 43 changed files with 5,432 additions and 364 deletions.
99 changes: 91 additions & 8 deletions core/intrinsics/intrinsics.odin
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,14 @@ package intrinsics
is_package_imported :: proc(package_name: string) -> bool ---

// Types
simd_vector :: proc($N: int, $T: typeid) -> type/#simd[N]T
soa_struct :: proc($N: int, $T: typeid) -> type/#soa[N]T

// Volatile
volatile_load :: proc(dst: ^$T) -> T ---
volatile_store :: proc(dst: ^$T, val: T) -> T ---
volatile_store :: proc(dst: ^$T, val: T) ---

non_temporal_load :: proc(dst: ^$T) -> T ---
non_temporal_store :: proc(dst: ^$T, val: T) ---

// Trapping
debug_trap :: proc() ---
Expand All @@ -23,18 +25,20 @@ alloca :: proc(size, align: int) -> [^]u8 ---
cpu_relax :: proc() ---
read_cycle_counter :: proc() -> i64 ---

count_ones :: proc(x: $T) -> T where type_is_integer(T) ---
count_zeros :: proc(x: $T) -> T where type_is_integer(T) ---
count_trailing_zeros :: proc(x: $T) -> T where type_is_integer(T) ---
count_leading_zeros :: proc(x: $T) -> T where type_is_integer(T) ---
reverse_bits :: proc(x: $T) -> T where type_is_integer(T) ---
count_ones :: proc(x: $T) -> T where type_is_integer(T) || type_is_simd_vector(T) ---
count_zeros :: proc(x: $T) -> T where type_is_integer(T) || type_is_simd_vector(T) ---
count_trailing_zeros :: proc(x: $T) -> T where type_is_integer(T) || type_is_simd_vector(T) ---
count_leading_zeros :: proc(x: $T) -> T where type_is_integer(T) || type_is_simd_vector(T) ---
reverse_bits :: proc(x: $T) -> T where type_is_integer(T) || type_is_simd_vector(T) ---
byte_swap :: proc(x: $T) -> T where type_is_integer(T) || type_is_float(T) ---

overflow_add :: proc(lhs, rhs: $T) -> (T, bool) #optional_ok ---
overflow_sub :: proc(lhs, rhs: $T) -> (T, bool) #optional_ok ---
overflow_mul :: proc(lhs, rhs: $T) -> (T, bool) #optional_ok ---

sqrt :: proc(x: $T) -> T where type_is_float(T) ---
sqrt :: proc(x: $T) -> T where type_is_float(T) || (type_is_simd_vector(T) && type_is_float(type_elem_type(T))) ---

fused_mul_add :: proc(a, b, c: $T) -> T where type_is_float(T) || (type_is_simd_vector(T) && type_is_float(type_elem_type(T))) ---

mem_copy :: proc(dst, src: rawptr, len: int) ---
mem_copy_non_overlapping :: proc(dst, src: rawptr, len: int) ---
Expand Down Expand Up @@ -186,6 +190,81 @@ type_hasher_proc :: proc($T: typeid) -> (hasher: proc "contextless" (data: rawpt

constant_utf16_cstring :: proc($literal: string) -> [^]u16 ---

// SIMD related
simd_add :: proc(a, b: #simd[N]T) -> #simd[N]T ---
simd_sub :: proc(a, b: #simd[N]T) -> #simd[N]T ---
simd_mul :: proc(a, b: #simd[N]T) -> #simd[N]T ---
simd_div :: proc(a, b: #simd[N]T) -> #simd[N]T ---
simd_rem :: proc(a, b: #simd[N]T) -> #simd[N]T ---

// Keeps Odin's Behaviour
// (x << y) if y <= mask else 0
simd_shl :: proc(a: #simd[N]T, b: #simd[N]Unsigned_Integer) -> #simd[N]T ---
simd_shr :: proc(a: #simd[N]T, b: #simd[N]Unsigned_Integer) -> #simd[N]T ---

// Similar to C's Behaviour
// x << (y & mask)
simd_shl_masked :: proc(a: #simd[N]T, b: #simd[N]Unsigned_Integer) -> #simd[N]T ---
simd_shr_masked :: proc(a: #simd[N]T, b: #simd[N]Unsigned_Integer) -> #simd[N]T ---

simd_add_sat :: proc(a, b: #simd[N]T) -> #simd[N]T ---
simd_sub_sat :: proc(a, b: #simd[N]T) -> #simd[N]T ---

simd_and :: proc(a, b: #simd[N]T) -> #simd[N]T ---
simd_or :: proc(a, b: #simd[N]T) -> #simd[N]T ---
simd_xor :: proc(a, b: #simd[N]T) -> #simd[N]T ---
simd_and_not :: proc(a, b: #simd[N]T) -> #simd[N]T ---

simd_neg :: proc(a: #simd[N]T) -> #simd[N]T ---

simd_abs :: proc(a: #simd[N]T) -> #simd[N]T ---

simd_min :: proc(a, b: #simd[N]T) -> #simd[N]T ---
simd_max :: proc(a, b: #simd[N]T) -> #simd[N]T ---
simd_clamp :: proc(v, min, max: #simd[N]T) -> #simd[N]T ---

// Return an unsigned integer of the same size as the input type
// NOT A BOOLEAN
// element-wise:
// false => 0x00...00
// true => 0xff...ff
simd_lanes_eq :: proc(a, b: #simd[N]T) -> #simd[N]Integer ---
simd_lanes_ne :: proc(a, b: #simd[N]T) -> #simd[N]Integer ---
simd_lanes_lt :: proc(a, b: #simd[N]T) -> #simd[N]Integer ---
simd_lanes_le :: proc(a, b: #simd[N]T) -> #simd[N]Integer ---
simd_lanes_gt :: proc(a, b: #simd[N]T) -> #simd[N]Integer ---
simd_lanes_ge :: proc(a, b: #simd[N]T) -> #simd[N]Integer ---

simd_extract :: proc(a: #simd[N]T, idx: uint) -> T ---
simd_replace :: proc(a: #simd[N]T, idx: uint, elem: T) -> #simd[N]T ---

simd_reduce_add_ordered :: proc(a: #simd[N]T) -> T ---
simd_reduce_mul_ordered :: proc(a: #simd[N]T) -> T ---
simd_reduce_min :: proc(a: #simd[N]T) -> T ---
simd_reduce_max :: proc(a: #simd[N]T) -> T ---
simd_reduce_and :: proc(a: #simd[N]T) -> T ---
simd_reduce_or :: proc(a: #simd[N]T) -> T ---
simd_reduce_xor :: proc(a: #simd[N]T) -> T ---

simd_shuffle :: proc(a, b: #simd[N]T, indices: ..int) -> #simd[len(indices)]T ---
simd_select :: proc(cond: #simd[N]boolean_or_integer, true, false: #simd[N]T) -> #simd[N]T ---

// Lane-wise operations
simd_ceil :: proc(a: #simd[N]any_float) -> #simd[N]any_float ---
simd_floor :: proc(a: #simd[N]any_float) -> #simd[N]any_float ---
simd_trunc :: proc(a: #simd[N]any_float) -> #simd[N]any_float ---
// rounding to the nearest integral value; if two values are equally near, rounds to the even one
simd_nearest :: proc(a: #simd[N]any_float) -> #simd[N]any_float ---

simd_to_bits :: proc(v: #simd[N]T) -> #simd[N]Integer where size_of(T) == size_of(Integer), type_is_unsigned(Integer) ---

// Reverses the order of the lanes.
// Equivalent to a swizzle with descending indices, e.g. for 4 lanes: swizzle(a, 3, 2, 1, 0)
// NOTE(review): core/simd aliases this as `intrinsics.simd_lanes_reverse`, which does not match the name declared here - confirm which name the compiler exposes.
simd_reverse :: proc(a: #simd[N]T) -> #simd[N]T ---

// Lane rotations by a compile-time constant `offset`.
// NOTE(review): core/simd aliases these as `intrinsics.simd_lanes_rotate_left` / `intrinsics.simd_lanes_rotate_right` - confirm which names the compiler exposes.
simd_rotate_left :: proc(a: #simd[N]T, $offset: int) -> #simd[N]T ---
simd_rotate_right :: proc(a: #simd[N]T, $offset: int) -> #simd[N]T ---


// WASM targets only
wasm_memory_grow :: proc(index, delta: uintptr) -> int ---
wasm_memory_size :: proc(index: uintptr) -> int ---
Expand All @@ -199,6 +278,10 @@ wasm_memory_size :: proc(index: uintptr) -> int ---
wasm_memory_atomic_wait32 :: proc(ptr: ^u32, expected: u32, timeout_ns: i64) -> u32 ---
wasm_memory_atomic_notify32 :: proc(ptr: ^u32, waiters: u32) -> (waiters_woken_up: u32) ---

// x86 Targets (i386, amd64)
// The parameter and result names mirror the x86 registers involved.
// NOTE(review): presumably `ax`/`cx` are loaded into EAX/ECX before the instruction executes - confirm against the compiler implementation.
// x86_cpuid: returns four 32-bit values named after EAX, EBX, ECX and EDX.
x86_cpuid :: proc(ax, cx: u32) -> (eax, ebx, ecx, edx: u32) ---
// x86_xgetbv: returns two 32-bit values named after EAX and EDX.
x86_xgetbv :: proc(cx: u32) -> (eax, edx: u32) ---


// Darwin targets only
objc_object :: struct{}
Expand Down
1 change: 1 addition & 0 deletions core/mem/raw.odin
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ make_any :: proc "contextless" (data: rawptr, id: typeid) -> any {
}

raw_array_data :: runtime.raw_array_data
raw_simd_data :: runtime.raw_simd_data
raw_string_data :: runtime.raw_string_data
raw_slice_data :: runtime.raw_slice_data
raw_dynamic_array_data :: runtime.raw_dynamic_array_data
Expand Down
6 changes: 5 additions & 1 deletion core/runtime/core_builtin.odin
Original file line number Diff line number Diff line change
Expand Up @@ -604,6 +604,10 @@ raw_array_data :: proc "contextless" (a: $P/^($T/[$N]$E)) -> [^]E {
return ([^]E)(a)
}
// raw_simd_data reinterprets a pointer to a #simd vector as a multi-pointer
// to its element type.
@builtin
raw_simd_data :: proc "contextless" (a: $P/^($T/#simd[$N]$E)) -> [^]E {
	return cast([^]E)a
}
@builtin
raw_slice_data :: proc "contextless" (s: $S/[]$E) -> [^]E {
ptr := (transmute(Raw_Slice)s).data
return ([^]E)(ptr)
Expand All @@ -619,7 +623,7 @@ raw_string_data :: proc "contextless" (s: $S/string) -> [^]u8 {
}

@builtin
raw_data :: proc{raw_array_data, raw_slice_data, raw_dynamic_array_data, raw_string_data}
raw_data :: proc{raw_array_data, raw_slice_data, raw_dynamic_array_data, raw_string_data, raw_simd_data}



Expand Down
188 changes: 188 additions & 0 deletions core/simd/simd.odin
Original file line number Diff line number Diff line change
@@ -0,0 +1,188 @@
package simd

import "core:builtin"
import "core:intrinsics"

// 128-bit vector aliases
u8x16 :: #simd[16]u8
i8x16 :: #simd[16]i8
u16x8 :: #simd[8]u16
i16x8 :: #simd[8]i16
u32x4 :: #simd[4]u32
i32x4 :: #simd[4]i32
u64x2 :: #simd[2]u64
i64x2 :: #simd[2]i64
f32x4 :: #simd[4]f32
f64x2 :: #simd[2]f64

boolx16 :: #simd[16]bool
b8x16 :: #simd[16]b8
b16x8 :: #simd[8]b16
b32x4 :: #simd[4]b32
b64x2 :: #simd[2]b64

// 256-bit vector aliases
u8x32 :: #simd[32]u8
i8x32 :: #simd[32]i8
u16x16 :: #simd[16]u16
i16x16 :: #simd[16]i16
u32x8 :: #simd[8]u32
i32x8 :: #simd[8]i32
u64x4 :: #simd[4]u64
i64x4 :: #simd[4]i64
f32x8 :: #simd[8]f32
f64x4 :: #simd[4]f64

boolx32 :: #simd[32]bool
b8x32 :: #simd[32]b8
b16x16 :: #simd[16]b16
b32x8 :: #simd[8]b32
b64x4 :: #simd[4]b64

// 512-bit vector aliases
u8x64 :: #simd[64]u8
i8x64 :: #simd[64]i8
u16x32 :: #simd[32]u16
i16x32 :: #simd[32]i16
u32x16 :: #simd[16]u32
i32x16 :: #simd[16]i32
u64x8 :: #simd[8]u64
i64x8 :: #simd[8]i64
f32x16 :: #simd[16]f32
f64x8 :: #simd[8]f64

boolx64 :: #simd[64]bool
b8x64 :: #simd[64]b8
b16x32 :: #simd[32]b16
b32x16 :: #simd[16]b32
b64x8 :: #simd[8]b64


add :: intrinsics.simd_add
sub :: intrinsics.simd_sub
mul :: intrinsics.simd_mul
div :: intrinsics.simd_div
rem :: intrinsics.simd_rem // integers only

// Keeps Odin's Behaviour
// (x << y) if y <= mask else 0
shl :: intrinsics.simd_shl
shr :: intrinsics.simd_shr

// Similar to C's Behaviour
// x << (y & mask)
shl_masked :: intrinsics.simd_shl_masked
shr_masked :: intrinsics.simd_shr_masked

// Saturation Arithmetic
add_sat :: intrinsics.simd_add_sat
sub_sat :: intrinsics.simd_sub_sat

and :: intrinsics.simd_and
or :: intrinsics.simd_or
xor :: intrinsics.simd_xor
and_not :: intrinsics.simd_and_not

neg :: intrinsics.simd_neg

abs :: intrinsics.simd_abs

min :: intrinsics.simd_min
max :: intrinsics.simd_max
clamp :: intrinsics.simd_clamp

// Return an unsigned integer of the same size as the input type
// NOT A BOOLEAN
// element-wise:
// false => 0x00...00
// true => 0xff...ff
lanes_eq :: intrinsics.simd_lanes_eq
lanes_ne :: intrinsics.simd_lanes_ne
lanes_lt :: intrinsics.simd_lanes_lt
lanes_le :: intrinsics.simd_lanes_le
lanes_gt :: intrinsics.simd_lanes_gt
lanes_ge :: intrinsics.simd_lanes_ge

// extract :: proc(a: #simd[N]T, idx: uint) -> T
extract :: intrinsics.simd_extract
// replace :: proc(a: #simd[N]T, idx: uint, elem: T) -> #simd[N]T
replace :: intrinsics.simd_replace

reduce_add_ordered :: intrinsics.simd_reduce_add_ordered
reduce_mul_ordered :: intrinsics.simd_reduce_mul_ordered
reduce_min :: intrinsics.simd_reduce_min
reduce_max :: intrinsics.simd_reduce_max
reduce_and :: intrinsics.simd_reduce_and
reduce_or :: intrinsics.simd_reduce_or
reduce_xor :: intrinsics.simd_reduce_xor

// swizzle :: proc(a: #simd[N]T, indices: ..int) -> #simd[len(indices)]T
swizzle :: builtin.swizzle

// shuffle :: proc(a, b: #simd[N]T, indices: #simd[max 2*N]u32) -> #simd[len(indices)]T
shuffle :: intrinsics.simd_shuffle

// select :: proc(cond: #simd[N]boolean_or_integer, true, false: #simd[N]T) -> #simd[N]T
select :: intrinsics.simd_select


sqrt :: intrinsics.sqrt
ceil :: intrinsics.simd_ceil
floor :: intrinsics.simd_floor
trunc :: intrinsics.simd_trunc
nearest :: intrinsics.simd_nearest

to_bits :: intrinsics.simd_to_bits

lanes_reverse :: intrinsics.simd_lanes_reverse

lanes_rotate_left :: intrinsics.simd_lanes_rotate_left
lanes_rotate_right :: intrinsics.simd_lanes_rotate_right

count_ones :: intrinsics.count_ones
count_zeros :: intrinsics.count_zeros
count_trailing_zeros :: intrinsics.count_trailing_zeros
count_leading_zeros :: intrinsics.count_leading_zeros
reverse_bits :: intrinsics.reverse_bits

fused_mul_add :: intrinsics.fused_mul_add
fma :: intrinsics.fused_mul_add

// to_array_ptr converts a pointer to a #simd vector into a pointer to a
// fixed array with the same lane count and element type.
// The result is the same pointer value, only retyped; no data is copied.
to_array_ptr :: #force_inline proc "contextless" (v: ^#simd[$LANES]$E) -> ^[LANES]E {
	return cast(^[LANES]E)v
}
// to_array reinterprets the bits of a #simd vector as a fixed array with the
// same lane count and element type, and returns that array by value.
to_array :: #force_inline proc "contextless" (v: #simd[$LANES]$E) -> [LANES]E {
	lanes := transmute([LANES]E)v
	return lanes
}
// from_array reinterprets the bits of a fixed array as a #simd vector with
// the same lane count and element type.
from_array :: #force_inline proc "contextless" (v: $A/[$LANES]$E) -> #simd[LANES]E {
	vector := transmute(#simd[LANES]E)v
	return vector
}

// from_slice builds a #simd vector of type T from the first LANES elements
// of `slice`.
//
// Inputs:
// - T:     the #simd vector type to produce (fixes LANES and the element type E)
// - slice: source elements; only the first LANES are read
//
// Asserts that `slice` holds at least LANES elements. Any extra elements are
// ignored.
from_slice :: proc($T: typeid/#simd[$LANES]$E, slice: []E) -> T {
	assert(len(slice) >= LANES, "slice length must be at least the number of lanes")
	array: [LANES]E
	// The assertion above already guarantees i < len(slice), so the per-element
	// bounds checks are disabled for the copy loop.
	#no_bounds_check for i in 0..<LANES {
		array[i] = slice[i]
	}
	return transmute(T)array
}

// bit_not returns the bitwise complement of an integer vector.
// It XORs `v` with T(~E(0)), i.e. an all-ones E value converted to the
// vector type T.
bit_not :: #force_inline proc "contextless" (v: $T/#simd[$LANES]$E) -> T where intrinsics.type_is_integer(E) {
	all_ones := T(~E(0))
	return xor(v, all_ones)
}

// copysign combines the magnitude bits of `v` with the sign bits of `sign`
// for float vectors.
// The bit pattern of -0.0 is used as the mask that separates the two parts.
copysign :: #force_inline proc "contextless" (v, sign: $T/#simd[$LANES]$E) -> T where intrinsics.type_is_float(E) {
	sign_mask := to_bits(T(-0.0))
	// Everything except the masked bits comes from `v` ...
	magnitude_bits := to_bits(v) &~ sign_mask
	// ... and only the masked bits come from `sign`.
	sign_bits := to_bits(sign) & sign_mask
	return transmute(T)(sign_bits|magnitude_bits)
}

// signum returns, per lane, 1 carrying the sign of `v` (via copysign).
// Lanes that compare unequal to themselves (NaN) are passed through from `v`
// unchanged.
signum :: #force_inline proc "contextless" (v: $T/#simd[$LANES]$E) -> T where intrinsics.type_is_float(E) {
	// A lane is only unequal to itself when it holds NaN.
	nan_lanes := lanes_ne(v, v)
	signed_one := copysign(T(1), v)
	return select(nan_lanes, v, signed_one)
}

// recip returns the reciprocal of a float vector, computed as T(1) / v.
recip :: #force_inline proc "contextless" (v: $T/#simd[$LANES]$E) -> T where intrinsics.type_is_float(E) {
	one := T(1)
	return one / v
}
24 changes: 24 additions & 0 deletions core/simd/x86/abm.odin
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
//+build i386, amd64
package simd_x86

import "core:intrinsics"

// _lzcnt_u32 returns the number of leading zero bits in a 32-bit value.
// Compiled with the "lzcnt" target feature enabled.
@(require_results, enable_target_feature="lzcnt")
_lzcnt_u32 :: #force_inline proc "c" (x: u32) -> u32 {
	leading_zeros := intrinsics.count_leading_zeros(x)
	return leading_zeros
}
// _popcnt32 returns the number of set bits in a 32-bit value, converted to i32.
// Compiled with the "popcnt" target feature enabled.
@(require_results, enable_target_feature="popcnt")
_popcnt32 :: #force_inline proc "c" (x: u32) -> i32 {
	set_bits := intrinsics.count_ones(x)
	return i32(set_bits)
}

// The 64-bit variants are only declared when targeting amd64.
when ODIN_ARCH == .amd64 {
	// _lzcnt_u64 returns the number of leading zero bits in a 64-bit value.
	@(require_results, enable_target_feature="lzcnt")
	_lzcnt_u64 :: #force_inline proc "c" (x: u64) -> u64 {
		leading_zeros := intrinsics.count_leading_zeros(x)
		return leading_zeros
	}
	// _popcnt64 returns the number of set bits in a 64-bit value, converted to i32.
	@(require_results, enable_target_feature="popcnt")
	_popcnt64 :: #force_inline proc "c" (x: u64) -> i32 {
		set_bits := intrinsics.count_ones(x)
		return i32(set_bits)
	}
}
Loading

0 comments on commit a1f15c2

Please sign in to comment.