/*
 * Copyright Supranational LLC
 * Licensed under the Apache License, Version 2.0, see LICENSE for details.
 * SPDX-License-Identifier: Apache-2.0
 */

#include "fields.h"
#include "point.h"

/*
 * Infinite point among inputs would be devastating. Shall we change it?
 */
#define POINTS_TO_AFFINE_IMPL(prefix, ptype, bits, field) \
static void ptype##s_to_affine(ptype##_affine dst[], \
                               const ptype *const points[], size_t npoints) \
{ \
    size_t i; \
    vec##bits *acc, ZZ, ZZZ; \
    const ptype *point = NULL; \
    const size_t stride = sizeof(ptype)==sizeof(POINTonE1) ? 1536 : 768; \
\
    while (npoints) { \
        const ptype *p, *const *walkback; \
        size_t delta = stride<npoints ? stride : npoints; \
\
        point = *points ? *points++ : point+1; \
        acc = (vec##bits *)dst; \
        vec_copy(acc++, point->Z, sizeof(vec##bits)); \
        for (i = 1; i < delta; i++, acc++) \
            point = *points ? *points++ : point+1, \
            mul_##field(acc[0], acc[-1], point->Z); \
\
        --acc; reciprocal_##field(acc[0], acc[0]); \
\
        walkback = points-1, p = point, --delta, dst += delta; \
        for (i = 0; i < delta; i++, acc--, dst--) { \
            mul_##field(acc[-1], acc[-1], acc[0]);  /* 1/Z         */\
            sqr_##field(ZZ, acc[-1]);               /* 1/Z^2       */\
            mul_##field(ZZZ, ZZ, acc[-1]);          /* 1/Z^3       */\
            mul_##field(acc[-1], p->Z, acc[0]); \
            mul_##field(dst->X, p->X, ZZ);          /* X = X'/Z^2  */\
            mul_##field(dst->Y, p->Y, ZZZ);         /* Y = Y'/Z^3  */\
            p = (p == *walkback) ? *--walkback : p-1; \
        } \
        sqr_##field(ZZ, acc[0]);                    /* 1/Z^2       */\
        mul_##field(ZZZ, ZZ, acc[0]);               /* 1/Z^3       */\
        mul_##field(dst->X, p->X, ZZ);              /* X = X'/Z^2  */\
        mul_##field(dst->Y, p->Y, ZZZ);             /* Y = Y'/Z^3  */\
        ++delta, dst += delta, npoints -= delta; \
    } \
} \
\
void prefix##s_to_affine(ptype##_affine dst[], const ptype *const points[], \
                         size_t npoints) \
{ ptype##s_to_affine(dst, points, npoints); }

POINTS_TO_AFFINE_IMPL(blst_p1, POINTonE1, 384, fp)
POINTS_TO_AFFINE_IMPL(blst_p2, POINTonE2, 384x, fp2)

/*
 * This is a two-step multi-scalar multiplication procedure. First, given
 * a set of points you pre-compute a table for a chosen windowing factor
 * [expressed in bits with a value between 2 and 14], and then you pass
 * this table to the actual multiplication procedure along with scalars.
 * The idea is that the pre-computed table will be reused multiple times,
 * in which case the multiplication runs faster than the Pippenger
 * algorithm implementation below for up to ~16K points at wbits=8,
 * naturally at the expense of a multi-megabyte table. One can trade even
 * more memory for performance, but each wbits increment doubles the
 * memory requirement, so at some point it gets prohibitively large...
 * For reference, without reusing the table it's faster than the Pippenger
 * algorithm for up to ~32 points [with wbits=5]... A usage sketch follows
 * the SCRATCH_SZ definition below.
 */

#define SCRATCH_SZ(ptype) (sizeof(ptype)==sizeof(POINTonE1) ? 8192 : 4096)
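/*
 * A minimal usage sketch of the two-step procedure described above, kept
 * out of compilation. |demo_mult_wbits| and its flat input arrays are
 * hypothetical application-side names; the blst_p1s_mult_wbits_* entry
 * points are the ones emitted by the macros below, normally reached
 * through their public blst.h declarations.
 */
#if 0
#include <stdlib.h>

static void demo_mult_wbits(POINTonE1 *ret,
                            const POINTonE1_affine flat_points[],
                            const byte flat_scalars[], /* 32 bytes each */
                            size_t npoints)
{
    const size_t wbits = 8, nbits = 255;   /* 255-bit BLS12-381 scalars */
    /* a NULL second entry means "continue sequentially from the first" */
    const POINTonE1_affine *points[2] = { flat_points, NULL };
    const byte *scalars[2] = { flat_scalars, NULL };
    POINTonE1_affine *table =
        malloc(blst_p1s_mult_wbits_precompute_sizeof(wbits, npoints));

    /* step 1: build the table once, reuse it for many scalar sets */
    blst_p1s_mult_wbits_precompute(table, wbits, points, npoints);

    /* step 2: NULL scratch lets the implementation alloca its own buffer */
    blst_p1s_mult_wbits(ret, table, wbits, npoints, scalars, nbits, NULL);

    free(table);
}
#endif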
#define PRECOMPUTE_WBITS_IMPL(prefix, ptype, bits, field, one) \
static void ptype##_precompute_row_wbits(ptype row[], size_t wbits, \
                                         const ptype##_affine *point) \
{ \
    size_t i, j, n = (size_t)1 << (wbits-1); \
                                            /* row[-1] is implicit infinity */\
    vec_copy(&row[0], point, sizeof(*point));           /* row[0]=p*1     */\
    vec_copy(&row[0].Z, one, sizeof(row[0].Z)); \
    ptype##_double(&row[1], &row[0]);                   /* row[1]=p*(1+1) */\
    for (i = 2, j = 1; i < n; i += 2, j++) \
        ptype##_add_affine(&row[i], &row[i-1], point),  /* row[2]=p*(2+1) */\
        ptype##_double(&row[i+1], &row[j]);             /* row[3]=p*(2+2) */\
}                                                       /* row[4] ...     */\
\
static void ptype##s_to_affine_row_wbits(ptype##_affine dst[], ptype src[], \
                                         size_t wbits, size_t npoints) \
{ \
    size_t total = npoints << (wbits-1); \
    size_t nwin = (size_t)1 << (wbits-1); \
    size_t i, j; \
    vec##bits *acc, ZZ, ZZZ; \
\
    src += total; \
    acc = (vec##bits *)src; \
    vec_copy(acc++, one, sizeof(vec##bits)); \
    for (i = 0; i < npoints; i++) \
        for (j = nwin; --src, --j; acc++) \
            mul_##field(acc[0], acc[-1], src->Z); \
\
    --acc; reciprocal_##field(acc[0], acc[0]); \
\
    for (i = 0; i < npoints; i++) { \
        vec_copy(dst++, src++, sizeof(ptype##_affine)); \
        for (j = 1; j < nwin; j++, acc--, src++, dst++) { \
            mul_##field(acc[-1], acc[-1], acc[0]);      /* 1/Z         */\
            sqr_##field(ZZ, acc[-1]);                   /* 1/Z^2       */\
            mul_##field(ZZZ, ZZ, acc[-1]);              /* 1/Z^3       */\
            mul_##field(acc[-1], src->Z, acc[0]); \
            mul_##field(dst->X, src->X, ZZ);            /* X = X'/Z^2  */\
            mul_##field(dst->Y, src->Y, ZZZ);           /* Y = Y'/Z^3  */\
        } \
    } \
} \
\
/* flat |points[n]| can be placed at the end of |table[n<<(wbits-1)]| */\
static void ptype##s_precompute_wbits(ptype##_affine table[], size_t wbits, \
                                      const ptype##_affine *const points[], \
                                      size_t npoints) \
{ \
    size_t total = npoints << (wbits-1); \
    size_t nwin = (size_t)1 << (wbits-1); \
    size_t nmin = wbits>9 ? (size_t)1 : (size_t)1 << (9-wbits); \
    size_t i, top = 0; \
    ptype *rows, *row; \
    const ptype##_affine *point = NULL; \
    size_t stride = ((512*1024)/sizeof(ptype##_affine)) >> wbits; \
    if (stride == 0) stride = 1; \
\
    while (npoints >= nmin) { \
        size_t limit = total - npoints; \
\
        if (top + (stride << wbits) > limit) { \
            stride = (limit - top) >> wbits; \
            if (stride == 0) break; \
        } \
        rows = row = (ptype *)(&table[top]); \
        for (i = 0; i < stride; i++, row += nwin) \
            point = *points ? *points++ : point+1, \
            ptype##_precompute_row_wbits(row, wbits, point); \
        ptype##s_to_affine_row_wbits(&table[top], rows, wbits, stride); \
        top += stride << (wbits-1); \
        npoints -= stride; \
    } \
    rows = row = alloca(2*sizeof(ptype##_affine) * npoints * nwin); \
    for (i = 0; i < npoints; i++, row += nwin) \
        point = *points ? *points++ : point+1, \
        ptype##_precompute_row_wbits(row, wbits, point); \
    ptype##s_to_affine_row_wbits(&table[top], rows, wbits, npoints); \
} \
\
size_t prefix##s_mult_wbits_precompute_sizeof(size_t wbits, size_t npoints) \
{ return (sizeof(ptype##_affine)*npoints) << (wbits-1); } \
void prefix##s_mult_wbits_precompute(ptype##_affine table[], size_t wbits, \
                                     const ptype##_affine *const points[], \
                                     size_t npoints) \
{ ptype##s_precompute_wbits(table, wbits, points, npoints); }
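/*
 * Sketch of how the precomputed table is expected to be laid out, assuming
 * it is read back the way ptype##s_mult_wbits below walks it: point-major
 * in input order, 2^(wbits-1) multiples per point, with entry j of point
 * i's row holding (j+1)*P_i. |demo_table_entry| is a hypothetical helper,
 * not part of the library.
 */
#if 0
static const POINTonE1_affine *demo_table_entry(const POINTonE1_affine table[],
                                                size_t wbits,
                                                size_t i, size_t j)
{
    size_t nwin = (size_t)1 << (wbits-1);   /* multiples 1*P .. nwin*P */

    return &table[i*nwin + j];              /* (j+1)*P_i in affine form */
}
#endif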
#define POINTS_MULT_WBITS_IMPL(prefix, ptype, bits, field, one) \
static void ptype##_gather_booth_wbits(ptype *p, const ptype##_affine row[], \
                                       size_t wbits, limb_t booth_idx) \
{ \
    bool_t booth_sign = (booth_idx >> wbits) & 1; \
    bool_t idx_is_zero; \
    static const ptype##_affine infinity = { 0 }; \
\
    booth_idx &= ((limb_t)1 << wbits) - 1; \
    idx_is_zero = is_zero(booth_idx); \
    booth_idx -= 1 ^ idx_is_zero; \
    vec_select(p, &infinity, &row[booth_idx], sizeof(row[0]), idx_is_zero); \
    ptype##_cneg(p, booth_sign); \
} \
\
static void ptype##s_mult_wbits(ptype *ret, const ptype##_affine table[], \
                                size_t wbits, size_t npoints, \
                                const byte *const scalars[], size_t nbits, \
                                ptype scratch[]) \
{ \
    limb_t wmask, wval; \
    size_t i, j, z, nbytes, window, nwin = (size_t)1 << (wbits-1); \
    const byte *scalar, *const *scalar_s = scalars; \
    const ptype##_affine *row = table; \
\
    size_t scratch_sz = SCRATCH_SZ(ptype); \
    if (scratch == NULL) { \
        scratch_sz /= 4;    /* limit to 288K */ \
        scratch_sz = scratch_sz < npoints ? scratch_sz : npoints; \
        scratch = alloca(sizeof(ptype) * scratch_sz); \
    } \
\
    nbytes = (nbits + 7)/8; /* convert |nbits| to bytes */ \
    scalar = *scalar_s++; \
\
    /* top excess bits modulo target window size */ \
    window = nbits % wbits; /* yes, it may be zero */ \
    wmask = ((limb_t)1 << (window + 1)) - 1; \
\
    nbits -= window; \
    z = is_zero(nbits); \
    wval = (get_wval_limb(scalar, nbits - (z^1), wbits + (z^1)) << z) & wmask; \
    wval = booth_encode(wval, wbits); \
    ptype##_gather_booth_wbits(&scratch[0], row, wbits, wval); \
    row += nwin; \
\
    i = 1; vec_zero(ret, sizeof(*ret)); \
    while (nbits > 0) { \
        for (j = i; i < npoints; i++, j++, row += nwin) { \
            if (j == scratch_sz) \
                ptype##s_accumulate(ret, scratch, j), j = 0; \
            scalar = *scalar_s ? *scalar_s++ : scalar+nbytes; \
            wval = get_wval_limb(scalar, nbits - 1, window + 1) & wmask; \
            wval = booth_encode(wval, wbits); \
            ptype##_gather_booth_wbits(&scratch[j], row, wbits, wval); \
        } \
        ptype##s_accumulate(ret, scratch, j); \
\
        for (j = 0; j < wbits; j++) \
            ptype##_double(ret, ret); \
\
        window = wbits; \
        wmask = ((limb_t)1 << (window + 1)) - 1; \
        nbits -= window; \
        i = 0; row = table; scalar_s = scalars; \
    } \
\
    for (j = i; i < npoints; i++, j++, row += nwin) { \
        if (j == scratch_sz) \
            ptype##s_accumulate(ret, scratch, j), j = 0; \
        scalar = *scalar_s ? *scalar_s++ : scalar+nbytes; \
        wval = (get_wval_limb(scalar, 0, wbits) << 1) & wmask; \
        wval = booth_encode(wval, wbits); \
        ptype##_gather_booth_wbits(&scratch[j], row, wbits, wval); \
    } \
    ptype##s_accumulate(ret, scratch, j); \
} \
\
size_t prefix##s_mult_wbits_scratch_sizeof(size_t npoints) \
{ \
    const size_t scratch_sz = SCRATCH_SZ(ptype); \
    return sizeof(ptype) * (npoints < scratch_sz ? npoints : scratch_sz); \
} \
void prefix##s_mult_wbits(ptype *ret, const ptype##_affine table[], \
                          size_t wbits, size_t npoints, \
                          const byte *const scalars[], size_t nbits, \
                          ptype scratch[]) \
{ ptype##s_mult_wbits(ret, table, wbits, npoints, scalars, nbits, scratch); }

PRECOMPUTE_WBITS_IMPL(blst_p1, POINTonE1, 384, fp, BLS12_381_Rx.p)
POINTS_MULT_WBITS_IMPL(blst_p1, POINTonE1, 384, fp, BLS12_381_Rx.p)

PRECOMPUTE_WBITS_IMPL(blst_p2, POINTonE2, 384x, fp2, BLS12_381_Rx.p2)
POINTS_MULT_WBITS_IMPL(blst_p2, POINTonE2, 384x, fp2, BLS12_381_Rx.p2)

/*
 * Pippenger algorithm implementation, the fastest option for larger
 * numbers of points...
 */
static size_t pippenger_window_size(size_t npoints)
{
    size_t wbits;

    for (wbits=0; npoints>>=1; wbits++) ;

    return wbits>12 ? wbits-3 : (wbits>4 ? wbits-2 : (wbits ? 2 : 1));
}
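/*
 * For orientation, the window selection above works out to: 32 points ->
 * 3-bit windows, 1024 -> 8, 2^20 -> 17, i.e. roughly log2(npoints) minus
 * a small correction, never less than 1.
 */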
#define DECLARE_PRIVATE_POINTXYZZ(ptype, bits) \
typedef struct { vec##bits X,Y,ZZZ,ZZ; } ptype##xyzz;

#define POINTS_MULT_PIPPENGER_IMPL(prefix, ptype) \
static void ptype##_integrate_buckets(ptype *out, ptype##xyzz buckets[], \
                                      size_t wbits) \
{ \
    ptype##xyzz ret[1], acc[1]; \
    size_t n = (size_t)1 << wbits; \
\
    /* Calculate sum of x[i-1]*i for i=1 through 1<<|wbits|. */\
    vec_copy(acc, &buckets[--n], sizeof(acc)); \
    vec_copy(ret, &buckets[n], sizeof(ret)); \
    vec_zero(&buckets[n], sizeof(buckets[n])); \
    while (n--) { \
        ptype##xyzz_dadd(acc, acc, &buckets[n]); \
        ptype##xyzz_dadd(ret, ret, acc); \
        vec_zero(&buckets[n], sizeof(buckets[n])); \
    } \
    ptype##xyzz_to_Jacobian(out, ret); \
} \
\
static void ptype##_bucket(ptype##xyzz buckets[], limb_t booth_idx, \
                           size_t wbits, const ptype##_affine *p) \
{ \
    bool_t booth_sign = (booth_idx >> wbits) & 1; \
\
    booth_idx &= (1<<wbits) - 1; \
    if (booth_idx--) \
        ptype##xyzz_dadd_affine(&buckets[booth_idx], &buckets[booth_idx], \
                                p, booth_sign); \
} \
\
void prefix##s_tile_pippenger(ptype *ret, \
                              const ptype##_affine *const points[], \
                              size_t npoints, \
                              const byte *const scalars[], size_t nbits, \
                              ptype##xyzz scratch[], \
                              size_t bit0, size_t window) \
{ \
    size_t wbits, cbits; \
\
    if (bit0 + window > nbits) wbits = nbits - bit0, cbits = wbits + 1; \
    else                       wbits = cbits = window; \
    ptype##s_tile_pippenger(ret, points, npoints, scalars, nbits, scratch, \
                            bit0, wbits, cbits); \
} \
void prefix##s_mult_pippenger(ptype *ret, \
                              const ptype##_affine *const points[], \
                              size_t npoints, \
                              const byte *const scalars[], size_t nbits, \
                              ptype##xyzz scratch[]) \
{ ptype##s_mult_pippenger(ret, points, npoints, scalars, nbits, scratch, 0); }

DECLARE_PRIVATE_POINTXYZZ(POINTonE1, 384)
POINTXYZZ_TO_JACOBIAN_IMPL(POINTonE1, 384, fp)
POINTXYZZ_DADD_IMPL(POINTonE1, 384, fp)
POINTXYZZ_DADD_AFFINE_IMPL(POINTonE1, 384, fp, BLS12_381_Rx.p)
POINTS_MULT_PIPPENGER_IMPL(blst_p1, POINTonE1)

DECLARE_PRIVATE_POINTXYZZ(POINTonE2, 384x)
POINTXYZZ_TO_JACOBIAN_IMPL(POINTonE2, 384x, fp2)
POINTXYZZ_DADD_IMPL(POINTonE2, 384x, fp2)
POINTXYZZ_DADD_AFFINE_IMPL(POINTonE2, 384x, fp2, BLS12_381_Rx.p2)
POINTS_MULT_PIPPENGER_IMPL(blst_p2, POINTonE2)
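/*
 * A minimal usage sketch for the Pippenger path, kept out of compilation.
 * |demo_mult_pippenger| and its flat input arrays are hypothetical
 * application-side names; blst_p1s_mult_pippenger is emitted by the
 * instantiation above, while blst_p1s_mult_pippenger_scratch_sizeof is
 * assumed from the public blst.h interface rather than defined here.
 */
#if 0
#include <stdlib.h>

static void demo_mult_pippenger(POINTonE1 *ret,
                                const POINTonE1_affine flat_points[],
                                const byte flat_scalars[], /* 32 bytes each */
                                size_t npoints)
{
    const size_t nbits = 255;   /* 255-bit BLS12-381 scalars */
    /* a NULL second entry means "continue sequentially from the first" */
    const POINTonE1_affine *points[2] = { flat_points, NULL };
    const byte *scalars[2] = { flat_scalars, NULL };
    POINTonE1xyzz *scratch =
        malloc(blst_p1s_mult_pippenger_scratch_sizeof(npoints));

    blst_p1s_mult_pippenger(ret, points, npoints, scalars, nbits, scratch);

    free(scratch);
}
#endif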