diff --git a/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netcore/TensorPrimitives.PopCount.cs b/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netcore/TensorPrimitives.PopCount.cs
index da45bd720b129..1ea141baac4d5 100644
--- a/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netcore/TensorPrimitives.PopCount.cs
+++ b/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/netcore/TensorPrimitives.PopCount.cs
@@ -52,20 +52,40 @@ public static void PopCount<T>(ReadOnlySpan<T> x, Span<T> destination)
                 // This relies on 64-bit shifts for sizeof(T) == 8, and such shifts aren't accelerated on today's hardware.
                 // Alternative approaches, such as doing two 32-bit operations and combining them were observed to not
                 // provide any meaningful speedup over scalar. So for now, we don't vectorize when sizeof(T) == 8.
-                sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4;
+                (sizeof(T) is 1 or 2 or 4) || (AdvSimd.IsSupported && sizeof(T) == 8);
 
             public static T Invoke(T x) => T.PopCount(x);
 
             [MethodImpl(MethodImplOptions.AggressiveInlining)]
             public static Vector128<T> Invoke(Vector128<T> x)
             {
-                if (sizeof(T) == 1)
+                if (AdvSimd.IsSupported)
                 {
-                    if (AdvSimd.IsSupported)
+                    Vector128<byte> cnt = AdvSimd.PopCount(x.AsByte());
+
+                    if (sizeof(T) == 1)
+                    {
+                        return cnt.As<byte, T>();
+                    }
+
+                    if (sizeof(T) == 2)
+                    {
+                        return AdvSimd.AddPairwiseWidening(cnt).As<ushort, T>();
+                    }
+
+                    if (sizeof(T) == 4)
                     {
-                        return AdvSimd.PopCount(x.AsByte()).As<byte, T>();
+                        return AdvSimd.AddPairwiseWidening(AdvSimd.AddPairwiseWidening(cnt)).As<uint, T>();
                     }
 
+                    if (sizeof(T) == 8)
+                    {
+                        return AdvSimd.AddPairwiseWidening(AdvSimd.AddPairwiseWidening(AdvSimd.AddPairwiseWidening(cnt))).As<ulong, T>();
+                    }
+                }
+
+                if (sizeof(T) == 1)
+                {
                     if (PackedSimd.IsSupported)
                     {
                         return PackedSimd.PopCount(x.AsByte()).As<byte, T>();
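
For readers unfamiliar with the AArch64 trick above: `AdvSimd.PopCount` only produces per-byte counts, and each `AdvSimd.AddPairwiseWidening` call (the `UADDLP` instruction) sums adjacent lanes into the next wider element type, so one, two, or three widening steps turn byte counts into per-`ushort`, per-`uint`, or per-`ulong` counts. The standalone sketch below is not part of the diff; the class name and test values are made up for illustration. It demonstrates the 64-bit case and checks it against scalar `ulong.PopCount`, and it only exercises the vector path on Arm hardware where `AdvSimd.IsSupported` is true.

```csharp
// Minimal sketch (assumed names/values, not from the PR): verify that three
// AddPairwiseWidening steps over per-byte popcounts reproduce ulong.PopCount.
using System;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.Arm;

internal static class PopCountWideningSketch
{
    private static void Main()
    {
        if (!AdvSimd.IsSupported)
        {
            Console.WriteLine("AdvSimd is not supported on this machine; nothing to demonstrate.");
            return;
        }

        // Two arbitrary 64-bit lanes.
        Vector128<ulong> x = Vector128.Create(0xF0F0_F0F0_F0F0_F0F0ul, 0x0123_4567_89AB_CDEFul);

        // Per-byte population counts: 16 byte lanes, each in the range 0..8.
        Vector128<byte> cnt = AdvSimd.PopCount(x.AsByte());

        // byte -> ushort -> uint -> ulong: each widening pairwise add sums adjacent
        // lanes, accumulating the 8 byte counts belonging to each ulong lane.
        Vector128<ulong> vectorCounts =
            AdvSimd.AddPairwiseWidening(
                AdvSimd.AddPairwiseWidening(
                    AdvSimd.AddPairwiseWidening(cnt)));

        Console.WriteLine($"vector: {vectorCounts.GetElement(0)}, {vectorCounts.GetElement(1)}");
        Console.WriteLine($"scalar: {ulong.PopCount(x.GetElement(0))}, {ulong.PopCount(x.GetElement(1))}");
    }
}
```

Because each widening step is a single pairwise-add instruction, the 64-bit case sidesteps the 64-bit shifts that made the original software-fallback approach unattractive, which is why the `Vectorizable` condition in the diff can now include `sizeof(T) == 8` when AdvSimd is available.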