Skip to content

Commit

Permalink
ff/baby_bear.hpp: optimize recip_* helpers.
Browse files Browse the repository at this point in the history
While it makes only marginal difference in individual inversion, ~5%,
the difference is significant in batch inversion, +50% for N=8.
  • Loading branch information
dot-asm committed Jul 16, 2024
1 parent 348b385 commit 95fe0c0
Showing 1 changed file with 69 additions and 1 deletion.
70 changes: 69 additions & 1 deletion ff/baby_bear.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -485,21 +485,84 @@ class __align__(16) bb31_4_t {
{ c[0] -= b; return *this; }

private:
// don't bother with breaking these down, 1/x dominates.
inline bb31_t recip_b0(bb31_t beta) const
# if 0
{ return c[0]*c[0] - beta*(c[1]*bb31_t{u[3]<<1} - c[2]*c[2]); }
# else
{
union { uint64_t wl; uint32_t w[2]; };

wl = u[1] * (uint64_t)(u[3]<<1);
wl += u[2] * (uint64_t)(MOD-u[2]);
wl += (w[0] * M) * (uint64_t)MOD; final_sub(w[1]);
wl = w[1] * (uint64_t)(MOD-BETA);
wl += u[0] * (uint64_t)u[0];
wl += (w[0] * M) * (uint64_t)MOD;

return bb31_t{final_sub(w[1])};
}
# endif

inline bb31_t recip_b2(bb31_t beta) const
# if 0
{ return c[0]*bb31_t{u[2]<<1} - c[1]*c[1] - beta*(c[3]*c[3]); }
# else
{
union { uint64_t wl; uint32_t w[2]; };

wl = u[3] * (uint64_t)u[3];
wl += (w[0] * M) * (uint64_t)MOD; final_sub(w[1]);
wl = w[1] * (uint64_t)(MOD-BETA);
wl += u[1] * (uint64_t)(MOD-u[1]); final_sub(w[1]);
wl += u[0] * (uint64_t)(u[2]<<1);
wl += (w[0] * M) * (uint64_t)MOD;

return bb31_t{final_sub(w[1])};
}
# endif

inline bb31_4_t recip_ret(bb31_t b0, bb31_t b2, bb31_t beta) const
{
bb31_4_t ret;
# if 0
bb31_t beta_b2 = beta*b2;

ret[0] = c[0]*b0 - c[2]*beta_b2;
ret[1] = c[3]*beta_b2 - c[1]*b0;
ret[2] = c[2]*b0 - c[0]*b2;
ret[3] = c[1]*b2 - c[3]*b0;
# else
union { uint64_t wl; uint32_t w[2]; };

wl = b2[0] * (uint64_t)BETA;
wl += (w[0] * M) * (uint64_t)MOD; //final_sub(w[1]);

uint32_t beta_b2 = w[1];

// ret[0] = c[0]*b0 - c[2]*beta_b2;
wl = u[0] * (uint64_t)b0[0];
wl += (MOD-u[2]) * (uint64_t)beta_b2;
wl += (w[0] * M) * (uint64_t)MOD;
ret.u[0] = final_sub(w[1]);

// ret[1] = c[3]*beta_b2 - c[1]b0;
wl = u[3] * (uint64_t)beta_b2;
wl += (MOD-u[1]) * (uint64_t)b0[0];
wl += (w[0] * M) * (uint64_t)MOD;
ret.u[1] = final_sub(w[1]);

// ret[2] = c[2]*b0 - c[0]*b2;
wl = u[2] * (uint64_t)b0[0];
wl += (MOD-u[0]) * (uint64_t)b2[0];
wl += (w[0] * M) * (uint64_t)MOD;
ret.u[2] = final_sub(w[1]);

// ret[3] = c[1]*b2 - c[3]*b0;
wl = u[1] * (uint64_t)b2[0];
wl += (MOD-u[3]) * (uint64_t)b0[0];
wl += (w[0] * M) * (uint64_t)MOD;
ret.u[3] = final_sub(w[1]);
# endif
return ret;
}

Expand All @@ -511,7 +574,12 @@ class __align__(16) bb31_4_t {
bb31_t b0 = recip_b0(beta);
bb31_t b2 = recip_b2(beta);

# if 0 // inefficient code generated by at least 12.5?
bb31_t inv = 1/(b0*b0 - beta*b2*b2);
# else
bb31_t inv = b0*b0 - beta*b2*b2;
inv = 1/inv;
# endif

b0 *= inv;
b2 *= inv;
Expand Down

0 comments on commit 95fe0c0

Please sign in to comment.