/*
 * Copyright Supranational LLC
 * Licensed under the Apache License, Version 2.0, see LICENSE for details.
 * SPDX-License-Identifier: Apache-2.0
 */

#if LIMB_T_BITS==32
typedef unsigned long long llimb_t;
#endif

#if !defined(__STDC_VERSION__) || __STDC_VERSION__<199901 \
                               || defined(__STDC_NO_VLA__)
# error "unsupported compiler"
#endif

#if defined(__clang__)
# pragma GCC diagnostic ignored "-Wstatic-in-inline"
#endif

static void mul_mont_n(limb_t ret[], const limb_t a[], const limb_t b[],
                       const limb_t p[], limb_t n0, size_t n)
{
    llimb_t limbx;
    limb_t mask, borrow, mx, hi, tmp[n+1], carry;
    size_t i, j;

    for (mx=b[0], hi=0, i=0; i<n; i++) {
        limbx = (mx * (llimb_t)a[i]) + hi;
        tmp[i] = (limb_t)limbx;
        hi = (limb_t)(limbx >> LIMB_T_BITS);
    }
    mx = n0*tmp[0];
    tmp[i] = hi;

    for (carry=0, j=0; ; ) {
        limbx = (mx * (llimb_t)p[0]) + tmp[0];
        hi = (limb_t)(limbx >> LIMB_T_BITS);
        for (i=1; i<n; i++) {
            limbx = (mx * (llimb_t)p[i] + hi) + tmp[i];
            tmp[i-1] = (limb_t)limbx;
            hi = (limb_t)(limbx >> LIMB_T_BITS);
        }
        limbx = tmp[i] + (hi + (llimb_t)carry);
        tmp[i-1] = (limb_t)limbx;
        carry = (limb_t)(limbx >> LIMB_T_BITS);

        if (++j==n)
            break;

        for (mx=b[j], hi=0, i=0; i<n; i++) {
            limbx = (mx * (llimb_t)a[i] + hi) + tmp[i];
            tmp[i] = (limb_t)limbx;
            hi = (limb_t)(limbx >> LIMB_T_BITS);
        }
        mx = n0*tmp[0];
        limbx = hi + (llimb_t)carry;
        tmp[i] = (limb_t)limbx;
        carry = (limb_t)(limbx >> LIMB_T_BITS);
    }

    for (borrow=0, i=0; i<n; i++) {
        limbx = tmp[i] - (p[i] + (llimb_t)borrow);
        ret[i] = (limb_t)limbx;
        borrow = (limb_t)(limbx >> LIMB_T_BITS) & 1;
    }

    mask = carry - borrow;

    for(i=0; i<n; i++)
        ret[i] = (ret[i] & ~mask) | (tmp[i] & mask);
}

#define MUL_MONT_IMPL(bits) \
inline void mul_mont_##bits(vec##bits ret, const vec##bits a, \
                            const vec##bits b, const vec##bits p, limb_t n0) \
{   mul_mont_n(ret, a, b, p, n0, NLIMBS(bits));   } \
\
inline void sqr_mont_##bits(vec##bits ret, const vec##bits a, \
                            const vec##bits p, limb_t n0) \
{   mul_mont_n(ret, a, a, p, n0, NLIMBS(bits));   }

MUL_MONT_IMPL(256)
MUL_MONT_IMPL(384)

static void add_mod_n(limb_t ret[], const limb_t a[], const limb_t b[],
                      const limb_t p[], size_t n)
{
    llimb_t limbx;
    limb_t mask, carry, borrow, tmp[n];
    size_t i;

    for (carry=0, i=0; i<n; i++) {
        limbx = a[i] + (b[i] + (llimb_t)carry);
        tmp[i] = (limb_t)limbx;
        carry = (limb_t)(limbx >> LIMB_T_BITS);
    }

    for (borrow=0, i=0; i<n; i++) {
        limbx = tmp[i] - (p[i] + (llimb_t)borrow);
        ret[i] = (limb_t)limbx;
        borrow = (limb_t)(limbx >> LIMB_T_BITS) & 1;
    }

    mask = carry - borrow;

    for(i=0; i<n; i++)
        ret[i] = (ret[i] & ~mask) | (tmp[i] & mask);
}

#define ADD_MOD_IMPL(bits) \
inline void add_mod_##bits(vec##bits ret, const vec##bits a, \
                           const vec##bits b, const vec##bits p) \
{   add_mod_n(ret, a, b, p, NLIMBS(bits));   }

ADD_MOD_IMPL(256)
ADD_MOD_IMPL(384)

static void sub_mod_n(limb_t ret[], const limb_t a[], const limb_t b[],
                      const limb_t p[], size_t n)
{
    llimb_t limbx;
    limb_t mask, carry, borrow;
    size_t i;

    for (borrow=0, i=0; i<n; i++) {
        limbx = a[i] - (b[i] + (llimb_t)borrow);
        ret[i] = (limb_t)limbx;
        borrow = (limb_t)(limbx >> LIMB_T_BITS) & 1;
    }

    mask = 0 - borrow;

    for (carry=0, i=0; i<n; i++) {
        limbx = ret[i] + ((p[i] & mask) + (llimb_t)carry);
        ret[i] = (limb_t)limbx;
        carry = (limb_t)(limbx >> LIMB_T_BITS);
    }
}

#define SUB_MOD_IMPL(bits) \
inline void sub_mod_##bits(vec##bits ret, const vec##bits a, \
                           const vec##bits b, const vec##bits p) \
{   sub_mod_n(ret, a, b, p, NLIMBS(bits));   }

SUB_MOD_IMPL(256)
SUB_MOD_IMPL(384)

static void mul_by_3_mod_n(limb_t ret[], const limb_t a[], const limb_t p[],
                           size_t n)
{
    llimb_t limbx;
    limb_t mask, carry, borrow, tmp[n], two_a[n];
    size_t i;

    for (carry=0, i=0; i<n; i++) {
        two_a[i] = (a[i] << 1) | carry;
        carry = a[i] >> (LIMB_T_BITS-1);
    }

    for (borrow=0, i=0; i<n; i++) {
        limbx = two_a[i] - (p[i] + (llimb_t)borrow);
        tmp[i] = (limb_t)limbx;
        borrow = (limb_t)(limbx >> LIMB_T_BITS) & 1;
    }

    mask = carry - borrow;

    for(i=0; i<n; i++)
        two_a[i] = (two_a[i] & ~mask) | (tmp[i] & mask);

    for (carry=0, i=0; i<n; i++) {
        limbx = a[i] + (two_a[i] + (llimb_t)carry);
        tmp[i] = (limb_t)limbx;
        carry = (limb_t)(limbx >> LIMB_T_BITS);
    }

    for (borrow=0, i=0; i<n; i++) {
        limbx = tmp[i] - (p[i] + (llimb_t)borrow);
        ret[i] = (limb_t)limbx;
        borrow = (limb_t)(limbx >> LIMB_T_BITS) & 1;
    }

    mask = carry - borrow;

    for(i=0; i<n; i++)
        ret[i] = (ret[i] & ~mask) | (tmp[i] & mask);
}

#define MUL_BY_3_MOD_IMPL(bits) \
inline void mul_by_3_mod_##bits(vec##bits ret, const vec##bits a, \
                                const vec##bits p) \
{   mul_by_3_mod_n(ret, a, p, NLIMBS(bits));   }

MUL_BY_3_MOD_IMPL(256)
MUL_BY_3_MOD_IMPL(384)

static void lshift_mod_n(limb_t ret[], const limb_t a[], size_t count,
                         const limb_t p[], size_t n)
{
    llimb_t limbx;
    limb_t mask, carry, borrow, tmp[n];
    size_t i;

    while (count--) {
        for (carry=0, i=0; i<n; i++) {
            tmp[i] = (a[i] << 1) | carry;
            carry = a[i] >> (LIMB_T_BITS-1);
        }

        for (borrow=0, i=0; i<n; i++) {
            limbx = tmp[i] - (p[i] + (llimb_t)borrow);
            ret[i] = (limb_t)limbx;
            borrow = (limb_t)(limbx >> LIMB_T_BITS) & 1;
        }

        mask = carry - borrow;

        for(i=0; i<n; i++)
            ret[i] = (ret[i] & ~mask) | (tmp[i] & mask);

        a = ret;
    }
}

#define LSHIFT_MOD_IMPL(bits) \
inline void lshift_mod_##bits(vec##bits ret, const vec##bits a, size_t count, \
                              const vec##bits p) \
{   lshift_mod_n(ret, a, count, p, NLIMBS(bits));   }

LSHIFT_MOD_IMPL(256)
LSHIFT_MOD_IMPL(384)

static void cneg_mod_n(limb_t ret[], const limb_t a[], bool_t flag,
                       const limb_t p[], size_t n)
{
    llimb_t limbx;
    limb_t borrow, mask, tmp[n];
    size_t i;

    for (borrow=0, i=0; i<n; i++) {
        limbx = p[i] - (a[i] + (llimb_t)borrow);
        tmp[i] = (limb_t)limbx;
        borrow = (limb_t)(limbx >> LIMB_T_BITS) & 1;
    }

    flag &= vec_is_zero(a, sizeof(tmp)) ^ 1;
    mask = (limb_t)0 - flag;

    for(i=0; i<n; i++)
        ret[i] = (a[i] & ~mask) | (tmp[i] & mask);
}

#define CNEG_MOD_IMPL(bits) \
inline void cneg_mod_##bits(vec##bits ret, const vec##bits a, bool_t flag, \
                            const vec##bits p) \
{   cneg_mod_n(ret, a, flag, p, NLIMBS(bits));   }

CNEG_MOD_IMPL(256)
CNEG_MOD_IMPL(384)

static limb_t check_mod_n(const byte a[], const limb_t p[], size_t n)
{
    llimb_t limbx;
    limb_t borrow, ai, acc;
    size_t i, j;

    for (acc=borrow=0, i=0; i<n; i++) {
        /* gather one little-endian limb from the byte string */
        for (ai=0, j=0; j<sizeof(limb_t); j++)
            ai |= (limb_t)(*a++) << (8*j);
        acc |= ai;
        limbx = ai - (p[i] + (llimb_t)borrow);
        borrow = (limb_t)(limbx >> LIMB_T_BITS) & 1;
    }

    return borrow & (is_zero(acc) ^ 1);
}

#define CHECK_MOD_IMPL(bits) \
inline limb_t check_mod_##bits(const pow##bits a, const vec##bits p) \
{   return check_mod_n(a, p, NLIMBS(bits));   }

CHECK_MOD_IMPL(256)

static limb_t add_n_check_mod_n(byte ret[], const byte a[], const byte b[],
                                const limb_t p[], size_t n)
{
    limb_t ret_[n], a_[n], b_[n], zero;

    limbs_from_le_bytes(a_, a, sizeof(a_));
    limbs_from_le_bytes(b_, b, sizeof(b_));

    add_mod_n(ret_, a_, b_, p, n);
    zero = vec_is_zero(ret_, sizeof(ret_));

    le_bytes_from_limbs(ret, ret_, sizeof(ret_));

    return zero^1;
}

#define ADD_N_CHECK_MOD_IMPL(bits) \
inline limb_t add_n_check_mod_##bits(pow##bits ret, const pow##bits a, \
                                     const pow##bits b, const vec##bits p) \
{   return add_n_check_mod_n(ret, a, b, p, NLIMBS(bits));   }

ADD_N_CHECK_MOD_IMPL(256)

static limb_t sub_n_check_mod_n(byte ret[], const byte a[], const byte b[],
                                const limb_t p[], size_t n)
{
    limb_t ret_[n], a_[n], b_[n], zero;

    limbs_from_le_bytes(a_, a, sizeof(a_));
    limbs_from_le_bytes(b_, b, sizeof(b_));

    sub_mod_n(ret_, a_, b_, p, n);
    zero = vec_is_zero(ret_, sizeof(ret_));

    le_bytes_from_limbs(ret, ret_, sizeof(ret_));

    return zero^1;
}

#define SUB_N_CHECK_MOD_IMPL(bits) \
inline limb_t sub_n_check_mod_##bits(pow##bits ret, const pow##bits a, \
                                     const pow##bits b, const vec##bits p) \
{   return sub_n_check_mod_n(ret, a, b, p, NLIMBS(bits));   }

SUB_N_CHECK_MOD_IMPL(256)
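/*
 * A note on the recurring "mask = carry - borrow" select above (a sketch,
 * not library code): after the trial subtraction of |p|, exactly three
 * (carry, borrow) outcomes are possible. (0,0): the sum was >= p, keep the
 * difference; (1,1): the sum overflowed the limbs but subtracting p fixed
 * that, keep the difference; (0,1): the sum was < p, so the mask becomes
 * all-ones and the pre-subtraction value is kept. The hypothetical helper
 * below spells the select out for two limbs; it is compiled out and only
 * illustrates the technique.
 */
#if 0
static void example_ct_select_2(limb_t out[2], const limb_t diff[2],
                                const limb_t sum[2],
                                limb_t carry, limb_t borrow)
{
    limb_t mask = carry - borrow;   /* all-ones iff the sum was < p */

    out[0] = (diff[0] & ~mask) | (sum[0] & mask);
    out[1] = (diff[1] & ~mask) | (sum[1] & mask);
}
#endif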
static void from_mont_n(limb_t ret[], const limb_t a[], const limb_t p[],
                        limb_t n0, size_t n)
{
    llimb_t limbx;
    limb_t mask, borrow, mx, hi, tmp[n];
    size_t i, j;

    for (j=0; j<n; j++) {
        mx = n0*a[0];
        limbx = (mx * (llimb_t)p[0]) + a[0];
        hi = (limb_t)(limbx >> LIMB_T_BITS);
        for (i=1; i<n; i++) {
            limbx = (mx * (llimb_t)p[i] + hi) + a[i];
            tmp[i-1] = (limb_t)limbx;
            hi = (limb_t)(limbx >> LIMB_T_BITS);
        }
        tmp[i-1] = hi;
        a = tmp;
    }

    /* this is needed only if input can be non-fully-reduced */
    for (borrow=0, i=0; i<n; i++) {
        limbx = tmp[i] - (p[i] + (llimb_t)borrow);
        ret[i] = (limb_t)limbx;
        borrow = (limb_t)(limbx >> LIMB_T_BITS) & 1;
    }

    mask = 0 - borrow;

    for(i=0; i<n; i++)
        ret[i] = (ret[i] & ~mask) | (tmp[i] & mask);
}

#define FROM_MONT_IMPL(bits) \
inline void from_mont_##bits(vec##bits ret, const vec##bits a, \
                             const vec##bits p, limb_t n0) \
{   from_mont_n(ret, a, p, n0, NLIMBS(bits));   }

FROM_MONT_IMPL(256)
FROM_MONT_IMPL(384)

static void redc_mont_n(limb_t ret[], const limb_t a[], const limb_t p[],
                        limb_t n0, size_t n)
{
    llimb_t limbx;
    limb_t mask, carry, borrow, mx, hi, tmp[n];
    const limb_t *b = a;
    size_t i, j;

    for (j=0; j<n; j++) {
        mx = n0*b[0];
        limbx = (mx * (llimb_t)p[0]) + b[0];
        hi = (limb_t)(limbx >> LIMB_T_BITS);
        for (i=1; i<n; i++) {
            limbx = (mx * (llimb_t)p[i] + hi) + b[i];
            tmp[i-1] = (limb_t)limbx;
            hi = (limb_t)(limbx >> LIMB_T_BITS);
        }
        tmp[i-1] = hi;
        b = tmp;
    }

    for (carry=0, i=0; i<n; i++) {
        limbx = a[n+i] + (tmp[i] + (llimb_t)carry);
        tmp[i] = (limb_t)limbx;
        carry = (limb_t)(limbx >> LIMB_T_BITS);
    }

    for (borrow=0, i=0; i<n; i++) {
        limbx = tmp[i] - (p[i] + (llimb_t)borrow);
        ret[i] = (limb_t)limbx;
        borrow = (limb_t)(limbx >> LIMB_T_BITS) & 1;
    }

    mask = carry - borrow;

    for(i=0; i<n; i++)
        ret[i] = (ret[i] & ~mask) | (tmp[i] & mask);
}

#define REDC_MONT_IMPL(bits, bits2) \
inline void redc_mont_##bits(vec##bits ret, const vec##bits2 a, \
                             const vec##bits p, limb_t n0) \
{   redc_mont_n(ret, a, p, n0, NLIMBS(bits));   }

REDC_MONT_IMPL(256, 512)
REDC_MONT_IMPL(384, 768)

static void rshift_mod_n(limb_t ret[], const limb_t a[], size_t count,
                         const limb_t p[], size_t n)
{
    llimb_t limbx;
    limb_t mask, carry, limb, next;
    size_t i;

    while (count--) {
        /* add |p| back if |a| is odd, so the halving stays exact */
        mask = 0 - (a[0] & 1);
        for (carry=0, i=0; i<n; i++) {
            limbx = a[i] + ((p[i] & mask) + (llimb_t)carry);
            ret[i] = (limb_t)limbx;
            carry = (limb_t)(limbx >> LIMB_T_BITS);
        }

        for (next=ret[0], i=0; i<n-1; i++) {
            limb = next >> 1;
            next = ret[i+1];
            ret[i] = limb | next << (LIMB_T_BITS-1);
        }
        ret[i] = next >> 1 | carry << (LIMB_T_BITS-1);

        a = ret;
    }
}

#define RSHIFT_MOD_IMPL(bits) \
inline void rshift_mod_##bits(vec##bits ret, const vec##bits a, size_t count, \
                              const vec##bits p) \
{   rshift_mod_n(ret, a, count, p, NLIMBS(bits));   }

RSHIFT_MOD_IMPL(256)
RSHIFT_MOD_IMPL(384)

#define DIV_BY_2_MOD_IMPL(bits) \
inline void div_by_2_mod_##bits(vec##bits ret, const vec##bits a, \
                                const vec##bits p) \
{   rshift_mod_n(ret, a, 1, p, NLIMBS(bits));   }

DIV_BY_2_MOD_IMPL(384)

static limb_t sgn0_pty_mod_n(const limb_t a[], const limb_t p[], size_t n)
{
    llimb_t limbx;
    limb_t carry, borrow, ret, tmp[n];
    size_t i;

    ret = a[0] & 1; /* parity */

    for (carry=0, i=0; i<n; i++) {
        tmp[i] = (a[i] << 1) | carry;
        carry = a[i] >> (LIMB_T_BITS-1);
    }

    for (borrow=0, i=0; i<n; i++) {
        limbx = tmp[i] - (p[i] + (llimb_t)borrow);
        borrow = (limb_t)(limbx >> LIMB_T_BITS) & 1;
    }

    ret |= ((carry - borrow) & 2) ^ 2;

    return ret;
}

inline limb_t sgn0_pty_mod_384(const vec384 a, const vec384 p)
{   return sgn0_pty_mod_n(a, p, NLIMBS(384));   }

inline limb_t sgn0_pty_mont_384(const vec384 a, const vec384 p, limb_t n0)
{
    vec384 tmp;

    from_mont_n(tmp, a, p, n0, NLIMBS(384));

    return sgn0_pty_mod_n(tmp, p, NLIMBS(384));
}

inline limb_t sgn0_pty_mod_384x(const vec384x a, const vec384 p)
{
    limb_t re, im, sign, prty;

    re = sgn0_pty_mod_n(a[0], p, NLIMBS(384));
    im = sgn0_pty_mod_n(a[1], p, NLIMBS(384));

    /* a->im!=0 ? sgn0(a->im) : sgn0(a->re) */
    sign = (limb_t)0 - vec_is_zero(a[1], sizeof(vec384));
    sign = (re & sign) | (im & ~sign);

    /* a->re==0 ? prty(a->im) : prty(a->re) */
    prty = (limb_t)0 - vec_is_zero(a[0], sizeof(vec384));
    prty = (im & prty) | (re & ~prty);

    return (sign & 2) | (prty & 1);
}

inline limb_t sgn0_pty_mont_384x(const vec384x a, const vec384 p, limb_t n0)
{
    vec384x tmp;

    from_mont_n(tmp[0], a[0], p, n0, NLIMBS(384));
    from_mont_n(tmp[1], a[1], p, n0, NLIMBS(384));

    return sgn0_pty_mod_384x(tmp, p);
}

void mul_mont_384x(vec384x ret, const vec384x a, const vec384x b,
                   const vec384 p, limb_t n0)
{
    vec384 aa, bb, cc;

    add_mod_n(aa, a[0], a[1], p, NLIMBS(384));
    add_mod_n(bb, b[0], b[1], p, NLIMBS(384));
    mul_mont_n(bb, bb, aa, p, n0, NLIMBS(384));
    mul_mont_n(aa, a[0], b[0], p, n0, NLIMBS(384));
    mul_mont_n(cc, a[1], b[1], p, n0, NLIMBS(384));
    sub_mod_n(ret[0], aa, cc, p, NLIMBS(384));
    sub_mod_n(ret[1], bb, aa, p, NLIMBS(384));
    sub_mod_n(ret[1], ret[1], cc, p, NLIMBS(384));
}
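/*
 * mul_mont_384x above multiplies in Fp2 = Fp[i]/(i^2+1) with only three
 * base-field multiplications via the Karatsuba-style identity
 *   (a0 + a1*i)*(b0 + b1*i)
 *     = (a0*b0 - a1*b1) + ((a0+a1)*(b0+b1) - a0*b0 - a1*b1)*i.
 * The compiled-out sketch below restates the identity on plain integers;
 * example_fp2_mul_int is a hypothetical name, not a library entry point.
 */
#if 0
static void example_fp2_mul_int(long long ret[2], const long long a[2],
                                const long long b[2])
{
    long long t  = (a[0] + a[1]) * (b[0] + b[1]);
    long long r0 = a[0] * b[0];
    long long r1 = a[1] * b[1];

    ret[0] = r0 - r1;       /* real part: a0*b0 - a1*b1 */
    ret[1] = t - r0 - r1;   /* imaginary part: a0*b1 + a1*b0 */
}
#endif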
/*
 * mul_mont_n without final conditional subtraction, which implies
 * that modulus is one bit short, which in turn means that there are
 * no carries to handle between iterations...
 */
static void mul_mont_nonred_n(limb_t ret[], const limb_t a[], const limb_t b[],
                              const limb_t p[], limb_t n0, size_t n)
{
    llimb_t limbx;
    limb_t mx, hi, tmp[n+1];
    size_t i, j;

    for (mx=b[0], hi=0, i=0; i<n; i++) {
        limbx = (mx * (llimb_t)a[i]) + hi;
        tmp[i] = (limb_t)limbx;
        hi = (limb_t)(limbx >> LIMB_T_BITS);
    }
    mx = n0*tmp[0];
    tmp[i] = hi;

    for (j=0; ; ) {
        limbx = (mx * (llimb_t)p[0]) + tmp[0];
        hi = (limb_t)(limbx >> LIMB_T_BITS);
        for (i=1; i<n; i++) {
            limbx = (mx * (llimb_t)p[i] + hi) + tmp[i];
            tmp[i-1] = (limb_t)limbx;
            hi = (limb_t)(limbx >> LIMB_T_BITS);
        }
        tmp[i-1] = tmp[i] + hi;

        if (++j==n)
            break;

        for (mx=b[j], hi=0, i=0; i<n; i++) {
            limbx = (mx * (llimb_t)a[i] + hi) + tmp[i];
            tmp[i] = (limb_t)limbx;
            hi = (limb_t)(limbx >> LIMB_T_BITS);
        }
        mx = n0*tmp[0];
        tmp[i] = hi;
    }

    vec_copy(ret, tmp, sizeof(tmp)-sizeof(limb_t));
}

void sqr_n_mul_mont_383(vec384 ret, const vec384 a, size_t count,
                        const vec384 p, limb_t n0, const vec384 b)
{
    while(count--) {
        mul_mont_nonred_n(ret, a, a, p, n0, NLIMBS(384));
        a = ret;
    }
    mul_mont_n(ret, ret, b, p, n0, NLIMBS(384));
}

void sqr_mont_382x(vec384x ret, const vec384x a,
                   const vec384 p, limb_t n0)
{
    llimb_t limbx;
    limb_t mask, carry, borrow;
    size_t i;
    vec384 t0, t1;

    /* "add_mod_n(t0, a[0], a[1], p, NLIMBS(384));" */
    for (carry=0, i=0; i<NLIMBS(384); i++) {
        limbx = a[0][i] + (a[1][i] + (llimb_t)carry);
        t0[i] = (limb_t)limbx;
        carry = (limb_t)(limbx >> LIMB_T_BITS);
    }

    /* "sub_mod_n(t1, a[0], a[1], p, NLIMBS(384));" */
    for (borrow=0, i=0; i<NLIMBS(384); i++) {
        limbx = a[0][i] - (a[1][i] + (llimb_t)borrow);
        t1[i] = (limb_t)limbx;
        borrow = (limb_t)(limbx >> LIMB_T_BITS) & 1;
    }
    mask = 0 - borrow;

    /* "mul_mont_n(ret[1], a[0], a[1], p, n0, NLIMBS(384));" */
    mul_mont_nonred_n(ret[1], a[0], a[1], p, n0, NLIMBS(384));

    /* "add_mod_n(ret[1], ret[1], ret[1], p, NLIMBS(384));" */
    for (carry=0, i=0; i<NLIMBS(384); i++) {
        limb_t a_i = ret[1][i];

        ret[1][i] = a_i<<1 | carry;
        carry = a_i>>(LIMB_T_BITS-1);
    }

    /* "mul_mont_n(ret[0], t0, t1, p, n0, NLIMBS(384));" */
    mul_mont_nonred_n(ret[0], t0, t1, p, n0, NLIMBS(384));

    /* account for t1's sign... */
    for (borrow=0, i=0; i<NLIMBS(384); i++) {
        limbx = ret[0][i] - ((t0[i] & mask) + (llimb_t)borrow);
        ret[0][i] = (limb_t)limbx;
        borrow = (limb_t)(limbx >> LIMB_T_BITS) & 1;
    }
    mask = 0 - borrow;
    for (carry=0, i=0; i<NLIMBS(384); i++) {
        limbx = ret[0][i] + ((p[i] & mask) + (llimb_t)carry);
        ret[0][i] = (limb_t)limbx;
        carry = (limb_t)(limbx >> LIMB_T_BITS);
    }
}

#define MSB(x) ((x) >> (LIMB_T_BITS-1))

static size_t num_bits(limb_t l)
{
    limb_t x, mask;
    size_t bits = is_zero(l) ^ 1;

    if (sizeof(limb_t) == 8) {
        x = l >> (32 & (8*sizeof(limb_t)-1));
        mask = 0 - MSB(0 - x);
        bits += 32 & mask;
        l ^= (x ^ l) & mask;
    }

    x = l >> 16;
    mask = 0 - MSB(0 - x);
    bits += 16 & mask;
    l ^= (x ^ l) & mask;

    x = l >> 8;
    mask = 0 - MSB(0 - x);
    bits += 8 & mask;
    l ^= (x ^ l) & mask;

    x = l >> 4;
    mask = 0 - MSB(0 - x);
    bits += 4 & mask;
    l ^= (x ^ l) & mask;

    x = l >> 2;
    mask = 0 - MSB(0 - x);
    bits += 2 & mask;
    l ^= (x ^ l) & mask;

    bits += l >> 1;

    return bits;
}

#if defined(__clang_major__) && __clang_major__>7
__attribute__((optnone))
#endif
static limb_t lshift_2(limb_t hi, limb_t lo, size_t l)
{
    size_t r = LIMB_T_BITS - l;
    limb_t mask = 0 - (is_zero(l)^1);
    return (hi << (l&(LIMB_T_BITS-1))) | ((lo & mask) >> (r&(LIMB_T_BITS-1)));
}
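/*
 * For reference, num_bits above is a branchless, constant-time version of
 * the obvious bit-length computation. The compiled-out, variable-time
 * equivalent below (a sketch; example_num_bits_vartime is a hypothetical
 * name) may help when auditing it: both return the index of the highest
 * set bit plus one, and 0 for l==0.
 */
#if 0
static size_t example_num_bits_vartime(limb_t l)
{
    size_t bits = 0;

    while (l) {
        bits++;
        l >>= 1;
    }
    return bits;
}
#endif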
/*
 * https://eprint.iacr.org/2020/972 with 'k' being LIMB_T_BITS-1.
 */
static void ab_approximation_n(limb_t a_[2], const limb_t a[],
                               limb_t b_[2], const limb_t b[], size_t n)
{
    limb_t a_hi, a_lo, b_hi, b_lo, mask;
    size_t i;

    i = n-1;
    a_hi = a[i],    a_lo = a[i-1];
    b_hi = b[i],    b_lo = b[i-1];
    for (i--; --i;) {
        mask = 0 - is_zero(a_hi | b_hi);
        a_hi = ((a_lo ^ a_hi) & mask) ^ a_hi;
        b_hi = ((b_lo ^ b_hi) & mask) ^ b_hi;
        a_lo = ((a[i] ^ a_lo) & mask) ^ a_lo;
        b_lo = ((b[i] ^ b_lo) & mask) ^ b_lo;
    }
    i = LIMB_T_BITS - num_bits(a_hi | b_hi);
    /* |i| can be LIMB_T_BITS if all a[2..]|b[2..] were zeros */

    a_[0] = a[0], a_[1] = lshift_2(a_hi, a_lo, i);
    b_[0] = b[0], b_[1] = lshift_2(b_hi, b_lo, i);
}

typedef struct { limb_t f0, g0, f1, g1; } factors;

static void inner_loop_n(factors *fg, const limb_t a_[2], const limb_t b_[2],
                         size_t n)
{
    llimb_t limbx;
    limb_t f0 = 1, g0 = 0, f1 = 0, g1 = 1;
    limb_t a_lo, a_hi, b_lo, b_hi, t_lo, t_hi, odd, borrow, xorm;

    a_lo = a_[0], a_hi = a_[1];
    b_lo = b_[0], b_hi = b_[1];

    while(n--) {
        odd = 0 - (a_lo&1);

        /* a_ -= b_ if a_ is odd */
        t_lo = a_lo, t_hi = a_hi;
        limbx = a_lo - (llimb_t)(b_lo & odd);
        a_lo = (limb_t)limbx;
        borrow = (limb_t)(limbx >> LIMB_T_BITS) & 1;
        limbx = a_hi - ((llimb_t)(b_hi & odd) + borrow);
        a_hi = (limb_t)limbx;
        borrow = (limb_t)(limbx >> LIMB_T_BITS);

        /* negate a_-b_ if it borrowed */
        a_lo ^= borrow;
        a_hi ^= borrow;
        limbx = a_lo + (llimb_t)(borrow & 1);
        a_lo = (limb_t)limbx;
        a_hi += (limb_t)(limbx >> LIMB_T_BITS) & 1;

        /* b_=a_ if a_-b_ borrowed */
        b_lo = ((t_lo ^ b_lo) & borrow) ^ b_lo;
        b_hi = ((t_hi ^ b_hi) & borrow) ^ b_hi;

        /* exchange f0 and f1 if a_-b_ borrowed */
        xorm = (f0 ^ f1) & borrow;
        f0 ^= xorm;
        f1 ^= xorm;

        /* exchange g0 and g1 if a_-b_ borrowed */
        xorm = (g0 ^ g1) & borrow;
        g0 ^= xorm;
        g1 ^= xorm;

        /* subtract if a_ was odd */
        f0 -= f1 & odd;
        g0 -= g1 & odd;

        f1 <<= 1;
        g1 <<= 1;
        a_lo >>= 1; a_lo |= a_hi << (LIMB_T_BITS-1);
        a_hi >>= 1;
    }

    fg->f0 = f0, fg->g0 = g0, fg->f1 = f1, fg->g1 = g1;
}

static limb_t cneg_n(limb_t ret[], const limb_t a[], limb_t neg, size_t n)
{
    llimb_t limbx = 0;
    limb_t carry;
    size_t i;

    for (carry=neg&1, i=0; i<n; i++) {
        limbx = (llimb_t)(a[i] ^ neg) + carry;
        ret[i] = (limb_t)limbx;
        carry = (limb_t)(limbx >> LIMB_T_BITS);
    }

    return 0 - MSB((limb_t)limbx);
}

static limb_t add_n(limb_t ret[], const limb_t a[], limb_t b[], size_t n)
{
    llimb_t limbx;
    limb_t carry;
    size_t i;

    for (carry=0, i=0; i<n; i++) {
        limbx = a[i] + (b[i] + (llimb_t)carry);
        ret[i] = (limb_t)limbx;
        carry = (limb_t)(limbx >> LIMB_T_BITS);
    }

    return carry;
}

static limb_t umul_n(limb_t ret[], const limb_t a[], limb_t b, size_t n)
{
    llimb_t limbx;
    limb_t hi;
    size_t i;

    for (hi=0, i=0; i<n; i++) {
        limbx = (b * (llimb_t)a[i]) + hi;
        ret[i] = (limb_t)limbx;
        hi = (limb_t)(limbx >> LIMB_T_BITS);
    }

    return hi;
}

static limb_t smul_n_shift_n(limb_t ret[], const limb_t a[], limb_t *f_,
                                           const limb_t b[], limb_t *g_,
                                           size_t n)
{
    limb_t a_[n+1], b_[n+1], f, g, neg, carry, hi;
    size_t i;

    /* |a|*|f_| */
    f = *f_;
    neg = 0 - MSB(f);
    f = (f ^ neg) - neg;            /* ensure |f| is positive */
    (void)cneg_n(a_, a, neg, n);
    hi = umul_n(a_, a_, f, n);
    a_[n] = hi - (f & neg);

    /* |b|*|g_| */
    g = *g_;
    neg = 0 - MSB(g);
    g = (g ^ neg) - neg;            /* ensure |g| is positive */
    (void)cneg_n(b_, b, neg, n);
    hi = umul_n(b_, b_, g, n);
    b_[n] = hi - (g & neg);

    /* |a|*|f_| + |b|*|g_| */
    (void)add_n(a_, a_, b_, n+1);

    /* (|a|*|f_| + |b|*|g_|) >> k */
    for (carry=a_[0], i=0; i<n; i++) {
        hi = carry >> (LIMB_T_BITS-2);
        carry = a_[i+1];
        ret[i] = hi | (carry << 2);
    }

    /* ensure result is non-negative, fix up |f_| and |g_| accordingly */
    neg = 0 - MSB(carry);
    *f_ = (*f_ ^ neg) - neg;
    *g_ = (*g_ ^ neg) - neg;
    (void)cneg_n(ret, ret, neg, n);

    return neg;
}

static limb_t smul_2n(limb_t ret[], const limb_t u[], limb_t f,
                                    const limb_t v[], limb_t g, size_t n)
{
    limb_t u_[n], v_[n], neg, hi;

    /* |u|*|f_| */
    neg = 0 - MSB(f);
    f = (f ^ neg) - neg;            /* ensure |f| is positive */
    neg = cneg_n(u_, u, neg, n);
    hi = umul_n(u_, u_, f, n) - (f&neg);

    /* |v|*|g_| */
    neg = 0 - MSB(g);
    g = (g ^ neg) - neg;            /* ensure |g| is positive */
    neg = cneg_n(v_, v, neg, n);
    hi += umul_n(v_, v_, g, n) - (g&neg);

    /* |u|*|f_| + |v|*|g_| */
    hi += add_n(ret, u_, v_, n);

    return hi;
}
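/*
 * What inner_loop_n computes, restated in variable time on plain signed
 * integers (a compiled-out sketch; example_divsteps is a hypothetical
 * name). After k iterations the returned coefficients satisfy
 *   f0*a_in + g0*b_in == a_out * 2^k,
 *   f1*a_in + g1*b_in == b_out * 2^k,
 * which is why smul_n_shift_n divides the updated |a| and |b| by 2^k
 * (with k = LIMB_T_BITS-2), and why the f1/g1 pair accumulates the
 * factor that is ultimately applied to |u| and |v|.
 */
#if 0
static void example_divsteps(long long *a, long long *b,
                             long long fg[4], size_t k)
{
    long long f0 = 1, g0 = 0, f1 = 0, g1 = 1, t;

    while (k--) {
        if (*a & 1) {
            if (*a < *b) {          /* keep the subtraction non-negative */
                t = *a;  *a = *b;  *b = t;
                t = f0;  f0 = f1;  f1 = t;
                t = g0;  g0 = g1;  g1 = t;
            }
            *a -= *b;  f0 -= f1;  g0 -= g1;
        }
        *a >>= 1;  f1 <<= 1;  g1 <<= 1;
    }
    fg[0] = f0, fg[1] = g0, fg[2] = f1, fg[3] = g1;
}
#endif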
static void ct_inverse_mod_n(limb_t ret[], const limb_t inp[],
                             const limb_t mod[], const limb_t modx[], size_t n)
{
    llimb_t limbx;
    limb_t a[n], b[n], u[2*n], v[2*n], t[2*n];
    limb_t a_[2], b_[2], sign, carry, top;
    factors fg;
    size_t i;

    vec_copy(a, inp, sizeof(a));
    vec_copy(b, mod, sizeof(b));
    vec_zero(u, sizeof(u)); u[0] = 1;
    vec_zero(v, sizeof(v));

    for (i=0; i<(2*n*LIMB_T_BITS)/(LIMB_T_BITS-2); i++) {
        ab_approximation_n(a_, a, b_, b, n);
        inner_loop_n(&fg, a_, b_, LIMB_T_BITS-2);
        (void)smul_n_shift_n(t, a, &fg.f0, b, &fg.g0, n);
        (void)smul_n_shift_n(b, a, &fg.f1, b, &fg.g1, n);
        vec_copy(a, t, sizeof(a));
        smul_2n(t, u, fg.f0, v, fg.g0, 2*n);
        smul_2n(v, u, fg.f1, v, fg.g1, 2*n);
        vec_copy(u, t, sizeof(u));
    }

    inner_loop_n(&fg, a, b, (2*n*LIMB_T_BITS)%(LIMB_T_BITS-2));
    top = smul_2n(ret, u, fg.f1, v, fg.g1, 2*n);

    sign = 0 - MSB(top);    /* top is 1, 0 or -1 */
    for (carry=0, i=0; i<n; i++) {
        limbx = ret[n+i] + ((modx[i] & sign) + (llimb_t)carry);
        ret[n+i] = (limb_t)limbx;
        carry = (limb_t)(limbx >> LIMB_T_BITS);
    }
    top += carry;
    sign = 0 - top;         /* top is 1, 0 or -1 */
    top |= sign;
    for (i=0; i<n; i++)
        a[i] = modx[i] & top;
    (void)cneg_n(a, a, 0 - MSB(sign), n);
    add_n(ret+n, ret+n, a, n);
}

#define CT_INVERSE_MOD_IMPL(bits, bits2) \
inline void ct_inverse_mod_##bits(vec##bits2 ret, const vec##bits inp, \
                                  const vec##bits mod, const vec##bits modx) \
{   ct_inverse_mod_n(ret, inp, mod, modx, NLIMBS(bits));   }

CT_INVERSE_MOD_IMPL(256, 512)
CT_INVERSE_MOD_IMPL(384, 768)

/*
 * Copy of inner_loop_n above, extended to accumulate the Legendre
 * symbol's sign flips in |L|.
 */
static limb_t legendre_loop_n(limb_t L, factors *fg, const limb_t a_[2],
                              const limb_t b_[2], size_t n)
{
    llimb_t limbx;
    limb_t f0 = 1, g0 = 0, f1 = 0, g1 = 1;
    limb_t a_lo, a_hi, b_lo, b_hi, t_lo, t_hi, odd, borrow, xorm;

    a_lo = a_[0], a_hi = a_[1];
    b_lo = b_[0], b_hi = b_[1];

    while(n--) {
        odd = 0 - (a_lo&1);

        /* a_ -= b_ if a_ is odd */
        t_lo = a_lo, t_hi = a_hi;
        limbx = a_lo - (llimb_t)(b_lo & odd);
        a_lo = (limb_t)limbx;
        borrow = (limb_t)(limbx >> LIMB_T_BITS) & 1;
        limbx = a_hi - ((llimb_t)(b_hi & odd) + borrow);
        a_hi = (limb_t)limbx;
        borrow = (limb_t)(limbx >> LIMB_T_BITS);

        L += ((t_lo & b_lo) >> 1) & borrow;

        /* negate a_-b_ if it borrowed */
        a_lo ^= borrow;
        a_hi ^= borrow;
        limbx = a_lo + (llimb_t)(borrow & 1);
        a_lo = (limb_t)limbx;
        a_hi += (limb_t)(limbx >> LIMB_T_BITS) & 1;

        /* b_=a_ if a_-b_ borrowed */
        b_lo = ((t_lo ^ b_lo) & borrow) ^ b_lo;
        b_hi = ((t_hi ^ b_hi) & borrow) ^ b_hi;

        /* exchange f0 and f1 if a_-b_ borrowed */
        xorm = (f0 ^ f1) & borrow;
        f0 ^= xorm;
        f1 ^= xorm;

        /* exchange g0 and g1 if a_-b_ borrowed */
        xorm = (g0 ^ g1) & borrow;
        g0 ^= xorm;
        g1 ^= xorm;

        /* subtract if a_ was odd */
        f0 -= f1 & odd;
        g0 -= g1 & odd;

        f1 <<= 1;
        g1 <<= 1;
        a_lo >>= 1; a_lo |= a_hi << (LIMB_T_BITS-1);
        a_hi >>= 1;

        L += (b_lo + 2) >> 2;
    }

    fg->f0 = f0, fg->g0 = g0, fg->f1 = f1, fg->g1 = g1;

    return L;
}

static bool_t ct_is_sqr_mod_n(const limb_t inp[], const limb_t mod[], size_t n)
{
    limb_t a[n], b[n], t[n];
    limb_t a_[2], b_[2], neg, L = 0;
    factors fg;
    size_t i;

    vec_copy(a, inp, sizeof(a));
    vec_copy(b, mod, sizeof(b));

    for (i=0; i<(2*n*LIMB_T_BITS)/(LIMB_T_BITS-2); i++) {
        ab_approximation_n(a_, a, b_, b, n);
        L = legendre_loop_n(L, &fg, a_, b_, LIMB_T_BITS-2);
        neg = smul_n_shift_n(t, a, &fg.f0, b, &fg.g0, n);
        (void)smul_n_shift_n(b, a, &fg.f1, b, &fg.g1, n);
        vec_copy(a, t, sizeof(a));
        L += (b[0] >> 1) & neg;
    }

    L = legendre_loop_n(L, &fg, a, b, (2*n*LIMB_T_BITS)%(LIMB_T_BITS-2));

    return (L & 1) ^ 1;
}

#define CT_IS_SQR_MOD_IMPL(bits) \
inline bool_t ct_is_square_mod_##bits(const vec##bits inp, \
                                      const vec##bits mod) \
{   return ct_is_sqr_mod_n(inp, mod, NLIMBS(bits));   }

CT_IS_SQR_MOD_IMPL(384)
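/*
 * A variable-time cross-check for ct_is_sqr_mod_n on single-limb values
 * (a compiled-out sketch; example_legendre_u64 is a hypothetical name and
 * unsigned __int128 assumes a GCC/clang-style compiler): by Euler's
 * criterion, for an odd prime p, a non-zero a is a square mod p iff
 * a^((p-1)/2) == 1 (mod p).
 */
#if 0
static int example_legendre_u64(unsigned long long a, unsigned long long p)
{
    unsigned long long r = 1, e = (p - 1) / 2;

    a %= p;
    while (e) {     /* square-and-multiply */
        if (e & 1)
            r = (unsigned long long)((unsigned __int128)r * a % p);
        a = (unsigned long long)((unsigned __int128)a * a % p);
        e >>= 1;
    }
    return r <= 1 ? (int)r : -1;    /* r == p-1 means -1 */
}
#endif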
/*
 * |div_top| points at two most significant limbs of the dividend, |d_hi|
 * and |d_lo| are two most significant limbs of the divisor. If divisor
 * is only one limb, it is to be passed in |d_hi| with zero in |d_lo|.
 * The divisor is required to be "bitwise left-aligned," and dividend's
 * top limbs to be not larger than the divisor's. The latter limitation
 * can be problematic in the first iteration of multi-precision division,
 * where in the most general case the condition would have to be "smaller."
 * The subroutine considers four limbs, two of which are "overlapping,"
 * hence the name... Another way to look at it is to think of the pair
 * of the dividend's limbs being suffixed with a zero:
 *   +-------+-------+-------+
 * R |       |       |   0   |
 *   +-------+-------+-------+
 *           +-------+-------+
 *         D |       |       |
 *           +-------+-------+
 */
limb_t div_3_limbs(const limb_t div_top[2], limb_t d_lo, limb_t d_hi)
{
    llimb_t Rx;
    limb_t r_lo = div_top[0], r_hi = div_top[1];
    limb_t Q = 0, mask, borrow, rx;
    size_t i;

    for (i = 0; i < LIMB_T_BITS; i++) {
        /* "borrow, Rx = R - D" */
        Rx = (llimb_t)r_lo - d_lo;
        rx = (limb_t)Rx;
        borrow = (limb_t)(Rx >> LIMB_T_BITS) & 1;
        Rx = r_hi - (d_hi + (llimb_t)borrow);
        borrow = (limb_t)(Rx >> LIMB_T_BITS);

        /* "if (R >= D) R -= D" */
        r_lo = ((r_lo ^ rx) & borrow) ^ rx;
        rx = (limb_t)Rx;
        r_hi = ((r_hi ^ rx) & borrow) ^ rx;

        Q <<= 1;
        Q |= ~borrow & 1;

        /* "D >>= 1" */
        d_lo >>= 1;
        d_lo |= d_hi << (LIMB_T_BITS - 1);
        d_hi >>= 1;
    }

    mask = 0 - MSB(Q);  /* does it overflow? */

    /* "borrow, Rx = R - D" */
    Rx = (llimb_t)r_lo - d_lo;
    rx = (limb_t)Rx;
    borrow = (limb_t)(Rx >> LIMB_T_BITS) & 1;
    Rx = r_hi - (d_hi + (llimb_t)borrow);
    borrow = (limb_t)(Rx >> LIMB_T_BITS) & 1;

    Q <<= 1;
    Q |= borrow ^ 1;

    return (Q | mask);
}

static limb_t quot_rem_n(limb_t *div_rem, const limb_t *divisor,
                         limb_t quotient, size_t n)
{
    llimb_t limbx;
    limb_t tmp[n+1], carry, mask, borrow;
    size_t i;

    /* divisor*quotient */
    for (carry=0, i=0; i<n; i++) {
        limbx = (quotient * (llimb_t)divisor[i]) + carry;
        tmp[i] = (limb_t)limbx;
        carry = (limb_t)(limbx >> LIMB_T_BITS);
    }
    tmp[i] = carry;

    /* remainder = dividend - divisor*quotient */
    for (borrow=0, i=0; i<=n; i++) {
        limbx = div_rem[i] - (tmp[i] + (llimb_t)borrow);
        tmp[i] = (limb_t)limbx;
        borrow = (limb_t)(limbx >> LIMB_T_BITS) & 1;
    }

    mask = 0 - borrow;

    /* if quotient was off by one, add divisor to the remainder */
    for (carry=0, i=0; i<n; i++) {
        limbx = tmp[i] + ((divisor[i] & mask) + (llimb_t)carry);
        div_rem[i] = (limb_t)limbx;
        carry = (limb_t)(limbx >> LIMB_T_BITS) & 1;
    }

    return (div_rem[i] = quotient + mask);
}

inline limb_t quot_rem_128(limb_t *div_rem, const limb_t *divisor,
                           limb_t quotient)
{   return quot_rem_n(div_rem, divisor, quotient, NLIMBS(128));   }

inline limb_t quot_rem_64(limb_t *div_rem, const limb_t *divisor,
                          limb_t quotient)
{   return quot_rem_n(div_rem, divisor, quotient, NLIMBS(64));   }
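/*
 * A sketch of how the two helpers above are meant to cooperate (compiled
 * out; example_div_step is a hypothetical name): one iteration of
 * schoolbook division of an (n+1)-limb dividend by an n-limb, bitwise
 * left-aligned divisor, for n >= 2 and under the preconditions stated in
 * the comment before div_3_limbs. div_3_limbs guesses the quotient limb
 * from the top limbs, and quot_rem_n multiplies back, subtracts and
 * corrects the possible off-by-one, leaving the remainder in
 * div_rem[0..n-1] and the adjusted quotient limb in div_rem[n].
 */
#if 0
static limb_t example_div_step(limb_t div_rem[/* n+1 */],
                               const limb_t divisor[], size_t n)
{
    limb_t quotient = div_3_limbs(div_rem + n - 1,
                                  divisor[n-2], divisor[n-1]);

    return quot_rem_n(div_rem, divisor, quotient, n);
}
#endif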
/*
 * Unlock reference implementations in vect.c
 */
#define mul_by_8_mod_384 mul_by_8_mod_384
#define mul_by_8_mod_384x mul_by_8_mod_384x
#define mul_by_3_mod_384x mul_by_3_mod_384x
#define mul_by_1_plus_i_mod_384x mul_by_1_plus_i_mod_384x
#define add_mod_384x add_mod_384x
#define sub_mod_384x sub_mod_384x
#define lshift_mod_384x lshift_mod_384x
#define sqr_mont_384x sqr_mont_384x

inline void vec_prefetch(const void *ptr, size_t len)
{   (void)ptr; (void)len;   }

/*
 * SHA-256
 */
#define ROTR(x,n)   ((x)>>n | (x)<<(32-n))
#define Sigma0(x)   (ROTR((x),2) ^ ROTR((x),13) ^ ROTR((x),22))
#define Sigma1(x)   (ROTR((x),6) ^ ROTR((x),11) ^ ROTR((x),25))
#define sigma0(x)   (ROTR((x),7) ^ ROTR((x),18) ^ ((x)>>3))
#define sigma1(x)   (ROTR((x),17) ^ ROTR((x),19) ^ ((x)>>10))
#define Ch(x,y,z)   (((x) & (y)) ^ ((~(x)) & (z)))
#define Maj(x,y,z)  (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))

void blst_sha256_block_data_order(unsigned int *v, const void *inp,
                                  size_t blocks)
{
    static const unsigned int K256[64] = {
        0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
        0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
        0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
        0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
        0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
        0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
        0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
        0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
        0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
        0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
        0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
        0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
        0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
        0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
        0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
        0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
    };
    unsigned int X[16], l, a, b, c, d, e, f, g, h, s0, s1, T1, T2;
    const unsigned char *data = inp;
    size_t round;

    a = v[0]; b = v[1]; c = v[2]; d = v[3];
    e = v[4]; f = v[5]; g = v[6]; h = v[7];

    while (blocks--) {
        for (round = 0; round < 16; round++) {
            l = (unsigned int)data[0] << 24;
            l |= (unsigned int)data[1] << 16;
            l |= (unsigned int)data[2] << 8;
            l |= (unsigned int)data[3];
            data += 4;
            T1 = X[round] = l;
            T1 += h + Sigma1(e) + Ch(e, f, g) + K256[round];
            T2 = Sigma0(a) + Maj(a, b, c);
            h = g; g = f; f = e;
            e = d + T1;
            d = c; c = b; b = a;
            a = T1 + T2;
        }

        for (; round < 64; round++) {
            s0 = X[(round + 1) & 0x0f];
            s0 = sigma0(s0);
            s1 = X[(round + 14) & 0x0f];
            s1 = sigma1(s1);

            T1 = X[round & 0xf] += s0 + s1 + X[(round + 9) & 0xf];
            T1 += h + Sigma1(e) + Ch(e, f, g) + K256[round];
            T2 = Sigma0(a) + Maj(a, b, c);
            h = g; g = f; f = e;
            e = d + T1;
            d = c; c = b; b = a;
            a = T1 + T2;
        }

        a += v[0]; v[0] = a;
        b += v[1]; v[1] = b;
        c += v[2]; v[2] = c;
        d += v[3]; v[3] = d;
        e += v[4]; v[4] = e;
        f += v[5]; v[5] = f;
        g += v[6]; v[6] = g;
        h += v[7]; v[7] = h;
    }
}
#undef ROTR
#undef Sigma0
#undef Sigma1
#undef sigma0
#undef sigma1
#undef Ch
#undef Maj

void blst_sha256_hcopy(unsigned int dst[8], const unsigned int src[8])
{
    size_t i;

    for (i=0; i<8; i++)
        dst[i] = src[i];
}

void blst_sha256_emit(unsigned char md[32], const unsigned int h[8])
{
    size_t i;

    for (i=0; i<8; i++, md+=4) {
        unsigned int h_i = h[i];
        md[0] = (unsigned char)(h_i >> 24);
        md[1] = (unsigned char)(h_i >> 16);
        md[2] = (unsigned char)(h_i >> 8);
        md[3] = (unsigned char)h_i;
    }
}

void blst_sha256_bcopy(void *dst_, const void *src_, size_t len)
{
    unsigned char *dst = dst_;
    const unsigned char *src = src_;
    size_t i;

    for (i=0; i<len; i++)
        dst[i] = src[i];
}
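/*
 * Compiled-out usage sketch for the SHA-256 primitives above
 * (example_sha256_oneshot is a hypothetical name): hash a message of
 * fewer than 56 bytes, i.e. one that fits a single padded block. For
 * "abc" this yields the familiar
 * ba7816bf8f01cfea414140de5dae2223b00361a396177a9cb410ff61f20015ad.
 */
#if 0
static void example_sha256_oneshot(unsigned char md[32],
                                   const unsigned char *msg, size_t len)
{
    /* FIPS 180-4 initial hash value */
    unsigned int h[8] = { 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
                          0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 };
    unsigned char block[64] = {0};

    blst_sha256_bcopy(block, msg, len);
    block[len] = 0x80;                      /* mandatory padding delimiter */
    block[63] = (unsigned char)(len << 3);  /* big-endian bit count...     */
    block[62] = (unsigned char)(len >> 5);  /* ...fits 2 bytes for len<56  */

    blst_sha256_block_data_order(h, block, 1);
    blst_sha256_emit(md, h);
}
#endif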