#include "fastavxbase64.h" #include #include /** * This code borrows from Wojciech Mula's library at * https://github.com/WojciechMula/base64simd (published under BSD) * as well as code from Alfred Klomp's library https://github.com/aklomp/base64 (published under BSD) * */ /** * Note : Hardware such as Knights Landing might do poorly with this AVX2 code since it relies on shuffles. Alternatives might be faster. */ static inline __m256i enc_reshuffle(const __m256i input) { // translation from SSE into AVX2 of procedure // https://github.com/WojciechMula/base64simd/blob/master/encode/unpack_bigendian.cpp const __m256i in = _mm256_shuffle_epi8(input, _mm256_set_epi8( 10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1, 14, 15, 13, 14, 11, 12, 10, 11, 8, 9, 7, 8, 5, 6, 4, 5 )); const __m256i t0 = _mm256_and_si256(in, _mm256_set1_epi32(0x0fc0fc00)); const __m256i t1 = _mm256_mulhi_epu16(t0, _mm256_set1_epi32(0x04000040)); const __m256i t2 = _mm256_and_si256(in, _mm256_set1_epi32(0x003f03f0)); const __m256i t3 = _mm256_mullo_epi16(t2, _mm256_set1_epi32(0x01000010)); return _mm256_or_si256(t1, t3); } static inline __m256i enc_translate(const __m256i in) { const __m256i lut = _mm256_setr_epi8( 65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0, 65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0); __m256i indices = _mm256_subs_epu8(in, _mm256_set1_epi8(51)); __m256i mask = _mm256_cmpgt_epi8((in), _mm256_set1_epi8(25)); indices = _mm256_sub_epi8(indices, mask); __m256i out = _mm256_add_epi8(in, _mm256_shuffle_epi8(lut, indices)); return out; } static inline __m256i dec_reshuffle(__m256i in) { // inlined procedure pack_madd from https://github.com/WojciechMula/base64simd/blob/master/decode/pack.avx2.cpp // The only difference is that elements are reversed, // only the multiplication constants were changed. const __m256i merge_ab_and_bc = _mm256_maddubs_epi16(in, _mm256_set1_epi32(0x01400140)); //_mm256_maddubs_epi16 is likely expensive __m256i out = _mm256_madd_epi16(merge_ab_and_bc, _mm256_set1_epi32(0x00011000)); // end of inlined // Pack bytes together within 32-bit words, discarding words 3 and 7: out = _mm256_shuffle_epi8(out, _mm256_setr_epi8( 2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1, 2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1 )); // the call to _mm256_permutevar8x32_epi32 could be replaced by a call to _mm256_storeu2_m128i but it is doubtful that it would help return _mm256_permutevar8x32_epi32( out, _mm256_setr_epi32(0, 1, 2, 4, 5, 6, -1, -1)); } size_t fast_avx2_base64_encode(char* dest, const char* str, size_t len) { const char* const dest_orig = dest; if(len >= 32 - 4) { // first load is masked __m256i inputvector = _mm256_maskload_epi32((int const*)(str - 4), _mm256_set_epi32( 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x00000000 // we do not load the first 4 bytes )); ////////// // Intel docs: Faults occur only due to mask-bit required memory accesses that caused the faults. // Faults will not occur due to referencing any memory location if the corresponding mask bit for //that memory location is 0. For example, no faults will be detected if the mask bits are all zero. 
    ////////////
    while (true) {
      inputvector = enc_reshuffle(inputvector);
      inputvector = enc_translate(inputvector);
      _mm256_storeu_si256((__m256i *)dest, inputvector);
      str += 24;
      dest += 32;
      len -= 24;
      if (len >= 32) {
        inputvector = _mm256_loadu_si256((__m256i *)(str - 4)); // no need for a mask here
        // we could do a masked load as long as len >= 24
      } else {
        break;
      }
    }
  }
  size_t scalarret = chromium_base64_encode(dest, str, len);
  if (scalarret == MODP_B64_ERROR) return MODP_B64_ERROR;
  return (dest - dest_orig) + scalarret;
}

size_t fast_avx2_base64_decode(char *out, const char *src, size_t srclen) {
  char *out_orig = out;
  while (srclen >= 45) {

    // The input consists of six character sets in the Base64 alphabet,
    // which we need to map back to the 6-bit values they represent.
    // There are three ranges, two singles, and then there's the rest.
    //
    //  #  From       To         Add  Characters
    //  1  [43]       [62]       +19  +
    //  2  [47]       [63]       +16  /
    //  3  [48..57]   [52..61]    +4  0..9
    //  4  [65..90]   [0..25]    -65  A..Z
    //  5  [97..122]  [26..51]   -71  a..z
    // (6) Everything else => invalid input

    __m256i str = _mm256_loadu_si256((__m256i *)src);

    // code by @aqrit from
    // https://github.com/WojciechMula/base64simd/issues/3#issuecomment-271137490
    // translated into AVX2
    const __m256i lut_lo = _mm256_setr_epi8(
        0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
        0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A,
        0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
        0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A);

    const __m256i lut_hi = _mm256_setr_epi8(
        0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
        0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
        0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
        0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10);

    const __m256i lut_roll = _mm256_setr_epi8(
        0, 16, 19, 4, -65, -65, -71, -71, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 16, 19, 4, -65, -65, -71, -71, 0, 0, 0, 0, 0, 0, 0, 0);

    const __m256i mask_2F = _mm256_set1_epi8(0x2f);

    // lookup
    __m256i hi_nibbles = _mm256_srli_epi32(str, 4);
    __m256i lo_nibbles = _mm256_and_si256(str, mask_2F);

    const __m256i lo = _mm256_shuffle_epi8(lut_lo, lo_nibbles);
    const __m256i eq_2F = _mm256_cmpeq_epi8(str, mask_2F);

    hi_nibbles = _mm256_and_si256(hi_nibbles, mask_2F);
    const __m256i hi = _mm256_shuffle_epi8(lut_hi, hi_nibbles);
    const __m256i roll = _mm256_shuffle_epi8(lut_roll, _mm256_add_epi8(eq_2F, hi_nibbles));

    if (!_mm256_testz_si256(lo, hi)) {
      break;
    }

    str = _mm256_add_epi8(str, roll);
    // end of copied function

    srclen -= 32;
    src += 32;
    // end of inlined function

    // Reshuffle the input to packed 12-byte output format:
    str = dec_reshuffle(str);
    _mm256_storeu_si256((__m256i *)out, str);
    out += 24;
  }
  size_t scalarret = chromium_base64_decode(out, src, srclen);
  if (scalarret == MODP_B64_ERROR) return MODP_B64_ERROR;
  return (out - out_orig) + scalarret;
}
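
/*
 * A minimal usage sketch, not part of the original library code. It assumes
 * this file is linked against the companion chromium/modp_b64 scalar codec
 * that supplies chromium_base64_encode, chromium_base64_decode and
 * MODP_B64_ERROR (as used above). The FAST_AVX2_BASE64_EXAMPLE guard is a
 * hypothetical macro so that the library build itself is unaffected. Buffer
 * sizes follow the usual base64 arithmetic: n input bytes encode to at most
 * ((n + 2) / 3) * 4 characters.
 */
#ifdef FAST_AVX2_BASE64_EXAMPLE
#include <stdio.h>
#include <string.h>

int main(void) {
  const char *msg = "The quick brown fox jumps over the lazy dog";
  char encoded[128]; // enough for ((43 + 2) / 3) * 4 = 60 characters, with slack
  char decoded[128];

  size_t elen = fast_avx2_base64_encode(encoded, msg, strlen(msg));
  if (elen == MODP_B64_ERROR) return 1;

  size_t dlen = fast_avx2_base64_decode(decoded, encoded, elen);
  if (dlen == MODP_B64_ERROR) return 1;

  printf("encoded: %.*s\n", (int)elen, encoded);
  printf("round trip %s\n",
         (dlen == strlen(msg) && memcmp(decoded, msg, dlen) == 0) ? "ok" : "failed");
  return 0;
}
#endif // FAST_AVX2_BASE64_EXAMPLE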