/* reedsolomon.c - SIMD-optimized Galois-field multiplication routines
 *
 * Copyright (c) 2015, 2016 Nicolas Trangez
 * Copyright (c) 2015 Klaus Post
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE
 */

#if HAVE_CONFIG_H
# include "config.h"
#endif

#include <stdint.h>
#include <string.h>

#if defined(__SSE2__) && __SSE2__ && defined(HAVE_EMMINTRIN_H) && HAVE_EMMINTRIN_H
# define USE_SSE2 1
# undef VECTOR_SIZE
# define VECTOR_SIZE 16
# include <emmintrin.h>
#else
# define USE_SSE2 0
#endif

#if defined(__SSSE3__) && __SSSE3__ && defined(HAVE_TMMINTRIN_H) && HAVE_TMMINTRIN_H
# define USE_SSSE3 1
# undef VECTOR_SIZE
# define VECTOR_SIZE 16
# include <tmmintrin.h>
#else
# define USE_SSSE3 0
#endif

#if defined(__AVX2__) && __AVX2__ && defined(HAVE_IMMINTRIN_H) && HAVE_IMMINTRIN_H
# define USE_AVX2 1
# undef VECTOR_SIZE
# define VECTOR_SIZE 32
# include <immintrin.h>
#else
# define USE_AVX2 0
#endif

#if ((defined(__ARM_NEON__) && __ARM_NEON__) \
     || (defined(__ARM_NEON) && __ARM_NEON) \
     || (defined(__aarch64__) && __aarch64__)) \
    && defined(HAVE_ARM_NEON_H) && HAVE_ARM_NEON_H
# define USE_ARM_NEON 1
# undef VECTOR_SIZE
# define VECTOR_SIZE 16
# include <arm_neon.h>
#else
# define USE_ARM_NEON 0
#endif

#if defined(__ALTIVEC__) && __ALTIVEC__ && defined(HAVE_ALTIVEC_H) && HAVE_ALTIVEC_H
# define USE_ALTIVEC 1
# undef VECTOR_SIZE
# define VECTOR_SIZE 16
# include <altivec.h>
#else
# define USE_ALTIVEC 0
#endif

#ifndef VECTOR_SIZE
/* 'Generic' code */
# define VECTOR_SIZE 16
#endif

#if VECTOR_SIZE <= RS_ASSUMED_ALIGNMENT
/* Assert RS_ASSUMED_ALIGNMENT is a multiple of VECTOR_SIZE, as expected */
static __attribute__((unused)) int
_rs_assert_assumed_alignment_is_multiple_of_vector_size[
        RS_ASSUMED_ALIGNMENT % VECTOR_SIZE == 0 ? 1 : -1];
# define USE_ALIGNED_ACCESS 1
# define ALIGNED_ACCESS
# define UNALIGNED_ACCESS __attribute__((unused))
#else
# define USE_ALIGNED_ACCESS 0
# define ALIGNED_ACCESS __attribute__((unused))
# define UNALIGNED_ACCESS
#endif
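/* The array declaration above is a compile-time assertion in C89-compatible
 * form: when RS_ASSUMED_ALIGNMENT is a multiple of VECTOR_SIZE the array has
 * size 1 and is otherwise unused; if not, its size is -1 and compilation
 * fails. A minimal sketch of the same trick behind a helper macro (the macro
 * name is illustrative only, it is not part of this file):
 *
 *     #define RS_STATIC_ASSERT(cond, name) \
 *         static __attribute__((unused)) int name[(cond) ? 1 : -1]
 *
 *     RS_STATIC_ASSERT(RS_ASSUMED_ALIGNMENT % VECTOR_SIZE == 0,
 *                      _rs_alignment_check);
 *
 * USE_ALIGNED_ACCESS then decides whether the aligned load/store variants
 * further down may be used, which presumes callers hand in buffers aligned
 * to RS_ASSUMED_ALIGNMENT bytes.
 */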
#include "reedsolomon.h"

#if defined(HAVE_FUNC_ATTRIBUTE_HOT) && HAVE_FUNC_ATTRIBUTE_HOT
# define HOT_FUNCTION __attribute__((hot))
#else
# define HOT_FUNCTION
#endif

#if defined(HAVE_FUNC_ATTRIBUTE_CONST) && HAVE_FUNC_ATTRIBUTE_CONST
# define CONST_FUNCTION __attribute__((const))
#else
# define CONST_FUNCTION
#endif

#if defined(HAVE_FUNC_ATTRIBUTE_ALWAYS_INLINE) && HAVE_FUNC_ATTRIBUTE_ALWAYS_INLINE
# define ALWAYS_INLINE inline __attribute__((always_inline))
#else
# define ALWAYS_INLINE inline
#endif

#if defined(HAVE_FUNC_ATTRIBUTE_FORCE_ALIGN_ARG_POINTER) && HAVE_FUNC_ATTRIBUTE_FORCE_ALIGN_ARG_POINTER
# define FORCE_ALIGN_ARG_POINTER __attribute__((force_align_arg_pointer))
#else
# define FORCE_ALIGN_ARG_POINTER
#endif

#define CONCAT_HELPER(a, b) a ## b
#define CONCAT(a, b) CONCAT_HELPER(a, b)

typedef uint8_t v16u8v __attribute__((vector_size(16)));
typedef uint64_t v2u64v __attribute__((vector_size(16)));

#define T(t, n) t n[VSIZE / 8 / sizeof(t)]
#define T1(t, n) t n

#define VSIZE 128
typedef union {
    T(uint8_t, u8);
    T(uint64_t, u64);
#if USE_SSE2
    T1(__m128i, m128i);
#endif
#if USE_ARM_NEON
    T1(uint8x16_t, uint8x16);
    T1(uint8x8x2_t, uint8x8x2);
#endif
#if USE_ALTIVEC
    T1(__vector uint8_t, uint8x16);
    T1(__vector uint64_t, uint64x2);
#endif
    T1(v16u8v, v16u8);
    T1(v2u64v, v2u64);
} v128 __attribute__((aligned(16)));
#undef VSIZE

#define VSIZE 256
typedef union {
    T(uint8_t, u8);
#if USE_AVX2
    __m256i m256i;
#endif
} v256 __attribute__((aligned(32)));
#undef VSIZE

#undef T
#undef T1

#if VECTOR_SIZE == 16
typedef v128 v;
#elif VECTOR_SIZE == 32
typedef v256 v;
#else
# error Unsupported VECTOR_SIZE
#endif

static ALWAYS_INLINE UNALIGNED_ACCESS v128 loadu_v128(const uint8_t *in) {
#if USE_SSE2
    const v128 result = { .m128i = _mm_loadu_si128((const __m128i *)in) };
#else
    v128 result;
    memcpy(&result.u64, in, sizeof(result.u64));
#endif
    return result;
}

static ALWAYS_INLINE UNALIGNED_ACCESS v loadu_v(const uint8_t *in) {
#if USE_AVX2
    const v256 result = { .m256i = _mm256_loadu_si256((const __m256i *)in) };
#else
    const v128 result = loadu_v128(in);
#endif
    return result;
}

static ALWAYS_INLINE ALIGNED_ACCESS v load_v(const uint8_t *in) {
#if USE_AVX2
    const v256 result = { .m256i = _mm256_load_si256((const __m256i *)in) };
#elif USE_SSE2
    const v128 result = { .m128i = _mm_load_si128((const __m128i *)in) };
#elif USE_ARM_NEON
    const v128 result = { .uint8x16 = vld1q_u8(in) };
#elif USE_ALTIVEC
    const v128 result = { .uint8x16 = vec_ld(0, in) };
#else
    const v128 result = loadu_v128(in);
#endif
    return result;
}

static ALWAYS_INLINE CONST_FUNCTION v set1_epi8_v(const uint8_t c) {
#if USE_AVX2
    const v256 result = { .m256i = _mm256_set1_epi8(c) };
#elif USE_SSE2
    const v128 result = { .m128i = _mm_set1_epi8(c) };
#elif USE_ARM_NEON
    const v128 result = { .uint8x16 = vdupq_n_u8(c) };
#elif USE_ALTIVEC
    const v128 result = { .uint8x16 =
        { c, c, c, c, c, c, c, c, c, c, c, c, c, c, c, c } };
#else
    uint64_t c2 = c,
             tmp = (c2 << (7 * 8)) | (c2 << (6 * 8))
                 | (c2 << (5 * 8)) | (c2 << (4 * 8))
                 | (c2 << (3 * 8)) | (c2 << (2 * 8))
                 | (c2 << (1 * 8)) | (c2 << (0 * 8));
    const v128 result = { .u64 = { tmp, tmp } };
#endif
    return result;
}
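/* The value this file mostly broadcasts is the nibble mask: set1_epi8_v(0x0f)
 * fills every byte lane with 0x0f, so in the generic fallback both 64-bit
 * lanes become 0x0f0f0f0f0f0f0f0f. The same scalar construction, as a
 * minimal sketch outside of any SIMD type (broadcast8 is an illustrative
 * name, not part of this file):
 *
 *     uint64_t broadcast8(uint8_t c) {
 *         uint64_t x = c;
 *         return x | (x << 8) | (x << 16) | (x << 24)
 *              | (x << 32) | (x << 40) | (x << 48) | (x << 56);
 *     }
 */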
static ALWAYS_INLINE CONST_FUNCTION v srli_epi64_v(const v in, const unsigned int n) {
#if USE_AVX2
    const v256 result = { .m256i = _mm256_srli_epi64(in.m256i, n) };
#elif USE_SSE2
    const v128 result = { .m128i = _mm_srli_epi64(in.m128i, n) };
#elif USE_ARM_NEON
    const v128 result = { .uint8x16 = vshrq_n_u8(in.uint8x16, n) };
#elif USE_ALTIVEC
# if RS_HAVE_VEC_VSRD
    const v128 shift = { .v2u64 = { n, n } },
               result = { .uint64x2 = vec_vsrd(in.v2u64, shift.v2u64) };
# else
    const v128 result = { .v2u64 = in.v2u64 >> n };
# endif
#else
    const v128 result = { .u64 = { in.u64[0] >> n, in.u64[1] >> n } };
#endif
    return result;
}

static ALWAYS_INLINE CONST_FUNCTION v and_v(const v a, const v b) {
#if USE_AVX2
    const v256 result = { .m256i = _mm256_and_si256(a.m256i, b.m256i) };
#elif USE_SSE2
    const v128 result = { .m128i = _mm_and_si128(a.m128i, b.m128i) };
#elif USE_ARM_NEON
    const v128 result = { .uint8x16 = vandq_u8(a.uint8x16, b.uint8x16) };
#elif USE_ALTIVEC
    const v128 result = { .uint8x16 = vec_and(a.uint8x16, b.uint8x16) };
#else
    const v128 result = { .v2u64 = a.v2u64 & b.v2u64 };
#endif
    return result;
}

static ALWAYS_INLINE CONST_FUNCTION v xor_v(const v a, const v b) {
#if USE_AVX2
    const v256 result = { .m256i = _mm256_xor_si256(a.m256i, b.m256i) };
#elif USE_SSE2
    const v128 result = { .m128i = _mm_xor_si128(a.m128i, b.m128i) };
#elif USE_ARM_NEON
    const v128 result = { .uint8x16 = veorq_u8(a.uint8x16, b.uint8x16) };
#elif USE_ALTIVEC
    const v128 result = { .uint8x16 = vec_xor(a.uint8x16, b.uint8x16) };
#else
    const v128 result = { .v2u64 = a.v2u64 ^ b.v2u64 };
#endif
    return result;
}

static ALWAYS_INLINE CONST_FUNCTION v shuffle_epi8_v(const v vec, const v mask) {
#if USE_AVX2
    const v256 result = { .m256i = _mm256_shuffle_epi8(vec.m256i, mask.m256i) };
#elif USE_SSSE3
    const v128 result = { .m128i = _mm_shuffle_epi8(vec.m128i, mask.m128i) };
#elif USE_ARM_NEON
# if defined(RS_HAVE_VQTBL1Q_U8) && RS_HAVE_VQTBL1Q_U8
    const v128 result = { .uint8x16 = vqtbl1q_u8(vec.uint8x16, mask.uint8x16) };
# else
    /* There's no NEON instruction mapping 1-to-1 to _mm_shuffle_epi8, but
     * this should have the same result... */
    const v128 result = { .uint8x16 =
        vcombine_u8(vtbl2_u8(vec.uint8x8x2, vget_low_u8(mask.uint8x16)),
                    vtbl2_u8(vec.uint8x8x2, vget_high_u8(mask.uint8x16))) };
# endif
#elif USE_ALTIVEC
    const v128 zeros = set1_epi8_v(0),
               result = { .uint8x16 =
                   vec_perm(vec.uint8x16, zeros.uint8x16, mask.uint8x16) };
#elif defined(RS_HAVE_BUILTIN_SHUFFLE) && RS_HAVE_BUILTIN_SHUFFLE
    const v16u8v zeros = { 0, 0, 0, 0, 0, 0, 0, 0,
                           0, 0, 0, 0, 0, 0, 0, 0 };
    const v128 result = { .v16u8 = __builtin_shuffle(vec.v16u8, zeros, mask.v16u8) };
#else
    v128 result = { .u64 = { 0, 0 } };
# define DO_BYTE(i) \
    result.u8[i] = mask.u8[i] & 0x80 ? 0 : vec.u8[mask.u8[i] & 0x0F];
    DO_BYTE( 0); DO_BYTE( 1); DO_BYTE( 2); DO_BYTE( 3);
    DO_BYTE( 4); DO_BYTE( 5); DO_BYTE( 6); DO_BYTE( 7);
    DO_BYTE( 8); DO_BYTE( 9); DO_BYTE(10); DO_BYTE(11);
    DO_BYTE(12); DO_BYTE(13); DO_BYTE(14); DO_BYTE(15);
#endif
    return result;
}
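/* All shuffle_epi8_v backends are meant to implement the PSHUFB rule spelled
 * out by the generic DO_BYTE fallback above: for each byte lane i,
 *
 *     result[i] = (mask[i] & 0x80) ? 0 : vec[mask[i] & 0x0f]
 *
 * i.e. a 16-entry table lookup, with zero written for masked-out lanes. The
 * NEON, AltiVec and __builtin_shuffle variants treat out-of-range indices
 * differently from PSHUFB, but the only masks this file ever passes in are
 * nibble values in 0..15 (see reedsolomon_gal_mul_v below), so in practice
 * every backend reduces to the same plain table lookup.
 */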
static ALWAYS_INLINE UNALIGNED_ACCESS void storeu_v(uint8_t *out, const v vec) {
#if USE_AVX2
    _mm256_storeu_si256((__m256i *)out, vec.m256i);
#elif USE_SSE2
    _mm_storeu_si128((__m128i *)out, vec.m128i);
#else
    memcpy(out, &vec.u64, sizeof(vec.u64));
#endif
}

static ALWAYS_INLINE ALIGNED_ACCESS void store_v(uint8_t *out, const v vec) {
#if USE_AVX2
    _mm256_store_si256((__m256i *)out, vec.m256i);
#elif USE_SSE2
    _mm_store_si128((__m128i *)out, vec.m128i);
#elif USE_ARM_NEON
    vst1q_u8(out, vec.uint8x16);
#elif USE_ALTIVEC
    vec_st(vec.uint8x16, 0, out);
#else
    storeu_v(out, vec);
#endif
}

static ALWAYS_INLINE CONST_FUNCTION v replicate_v128_v(const v128 vec) {
#if USE_AVX2
    const v256 result = { .m256i = _mm256_broadcastsi128_si256(vec.m128i) };
#else
    const v128 result = vec;
#endif
    return result;
}

//+build !noasm !appengine

// Copyright 2015, Klaus Post, see LICENSE for details.

// Based on http://www.snia.org/sites/default/files2/SDC2013/presentations/NewThinking/EthanMiller_Screaming_Fast_Galois_Field%20Arithmetic_SIMD%20Instructions.pdf
// and http://jerasure.org/jerasure/gf-complete/tree/master

/*
// func galMulSSSE3Xor(low, high, in, out []byte)
TEXT ·galMulSSSE3Xor(SB), 7, $0
    MOVQ    low+0(FP), SI      // SI: &low
    MOVQ    high+24(FP), DX    // DX: &high
    MOVOU   (SI), X6           // X6 low
    MOVOU   (DX), X7           // X7: high
    MOVQ    $15, BX            // BX: low mask
    MOVQ    BX, X8
    PXOR    X5, X5
    MOVQ    in+48(FP), SI      // R11: &in
    MOVQ    in_len+56(FP), R9  // R9: len(in)
    MOVQ    out+72(FP), DX     // DX: &out
    PSHUFB  X5, X8             // X8: lomask (unpacked)
    SHRQ    $4, R9             // len(in) / 16
    CMPQ    R9, $0
    JEQ     done_xor

loopback_xor:
    MOVOU   (SI), X0           // in[x]
    MOVOU   (DX), X4           // out[x]
    MOVOU   X0, X1             // in[x]
    MOVOU   X6, X2             // low copy
    MOVOU   X7, X3             // high copy
    PSRLQ   $4, X1             // X1: high input
    PAND    X8, X0             // X0: low input
    PAND    X8, X1             // X0: high input
    PSHUFB  X0, X2             // X2: mul low part
    PSHUFB  X1, X3             // X3: mul high part
    PXOR    X2, X3             // X3: Result
    PXOR    X4, X3             // X3: Result xor existing out
    MOVOU   X3, (DX)           // Store
    ADDQ    $16, SI            // in+=16
    ADDQ    $16, DX            // out+=16
    SUBQ    $1, R9
    JNZ     loopback_xor

done_xor:
    RET

// func galMulSSSE3(low, high, in, out []byte)
TEXT ·galMulSSSE3(SB), 7, $0
    MOVQ    low+0(FP), SI      // SI: &low
    MOVQ    high+24(FP), DX    // DX: &high
    MOVOU   (SI), X6           // X6 low
    MOVOU   (DX), X7           // X7: high
    MOVQ    $15, BX            // BX: low mask
    MOVQ    BX, X8
    PXOR    X5, X5
    MOVQ    in+48(FP), SI      // R11: &in
    MOVQ    in_len+56(FP), R9  // R9: len(in)
    MOVQ    out+72(FP), DX     // DX: &out
    PSHUFB  X5, X8             // X8: lomask (unpacked)
    SHRQ    $4, R9             // len(in) / 16
    CMPQ    R9, $0
    JEQ     done

loopback:
    MOVOU   (SI), X0           // in[x]
    MOVOU   X0, X1             // in[x]
    MOVOU   X6, X2             // low copy
    MOVOU   X7, X3             // high copy
    PSRLQ   $4, X1             // X1: high input
    PAND    X8, X0             // X0: low input
    PAND    X8, X1             // X0: high input
    PSHUFB  X0, X2             // X2: mul low part
    PSHUFB  X1, X3             // X3: mul high part
    PXOR    X2, X3             // X3: Result
    MOVOU   X3, (DX)           // Store
    ADDQ    $16, SI            // in+=16
    ADDQ    $16, DX            // out+=16
    SUBQ    $1, R9
    JNZ     loopback

done:
    RET
*/

static ALWAYS_INLINE v reedsolomon_gal_mul_v(
        const v low_mask_unpacked,
        const v low_vector,
        const v high_vector,
        v (*modifier)(const v new, const v old),
        const v in_x,
        const v old) {
    const v low_input = and_v(in_x, low_mask_unpacked),
            in_x_shifted = srli_epi64_v(in_x, 4),
            high_input = and_v(in_x_shifted, low_mask_unpacked),
            mul_low_part = shuffle_epi8_v(low_vector, low_input),
            mul_high_part = shuffle_epi8_v(high_vector, high_input),
            new = xor_v(mul_low_part, mul_high_part),
            result = modifier(new, old);
    return result;
}
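/* reedsolomon_gal_mul_v is the C counterpart of the SSSE3 kernel quoted
 * above: to multiply every byte of `in` by one constant c in GF(2^8), the
 * caller supplies two 16-byte lookup tables, `low` holding c * i and `high`
 * holding c * (i << 4) for i = 0..15. Because multiplication distributes
 * over XOR, c * x == low[x & 0x0f] ^ high[x >> 4], which is exactly what the
 * two shuffles plus the final XOR compute, one whole vector at a time.
 *
 * A minimal scalar sketch of the same step, assuming the tables are already
 * filled in by the caller (table construction lives outside this file, and
 * gal_mul_byte is only an illustrative name):
 *
 *     static uint8_t gal_mul_byte(const uint8_t low[16],
 *                                 const uint8_t high[16],
 *                                 uint8_t x) {
 *         return low[x & 0x0f] ^ high[x >> 4];
 *     }
 *
 * The `modifier` callback decides what happens to the previous contents of
 * `out`: `noop` overwrites, `xor_v` accumulates (the *_xor entry point).
 */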
static ALWAYS_INLINE PROTO_RETURN reedsolomon_gal_mul_impl(
        PROTO_ARGS,
        v (*modifier)(const v new, const v old)) {
    const v low_mask_unpacked = set1_epi8_v(0x0f);
    const v128 low_vector128 = loadu_v128(low),
               high_vector128 = loadu_v128(high);
    const v low_vector = replicate_v128_v(low_vector128),
            high_vector = replicate_v128_v(high_vector128);
    size_t done = 0;

#if USE_ALIGNED_ACCESS
# define LOAD(addr) load_v(addr)
# define STORE(addr, vec) store_v(addr, vec)
#else
# define LOAD(addr) loadu_v(addr)
# define STORE(addr, vec) storeu_v(addr, vec)
#endif

#if RS_HAVE_CLANG_LOOP_UNROLL
# pragma clang loop unroll(enable)
#endif
    for(size_t x = 0; x < len / sizeof(v); x++) {
        const v in_x = LOAD(&in[done]),
                old = LOAD(&out[done]),
                result = reedsolomon_gal_mul_v(
                        low_mask_unpacked, low_vector, high_vector,
                        modifier, in_x, old);
        STORE(&out[done], result);

        done += sizeof(v);
    }

    return done;
}

static ALWAYS_INLINE CONST_FUNCTION v noop(const v new, const v old __attribute__((__unused__))) {
    return new;
}

#ifdef HOT
HOT_FUNCTION
#endif
FORCE_ALIGN_ARG_POINTER
PROTO(CONCAT(reedsolomon_gal_mul_, TARGET)) {
    return reedsolomon_gal_mul_impl(low, high, in, out, len, noop);
}

#ifdef HOT
HOT_FUNCTION
#endif
FORCE_ALIGN_ARG_POINTER
PROTO(CONCAT(reedsolomon_gal_mul_xor_, TARGET)) {
    return reedsolomon_gal_mul_impl(low, high, in, out, len, xor_v);
}
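/* Via PROTO, PROTO_ARGS and TARGET (all supplied by reedsolomon.h and the
 * build system, not visible in this file), the two definitions above expand
 * into one overwriting and one accumulating entry point per compiled target.
 * Both return `done`, i.e. `len` rounded down to a whole number of vectors;
 * the caller is expected to handle any remaining tail bytes itself. A rough
 * usage sketch, assuming a build where TARGET is `avx2` and PROTO_ARGS names
 * the parameters as they are used above (low, high, in, out, len):
 *
 *     size_t done = reedsolomon_gal_mul_avx2(low, high, in, out, len);
 *     for (size_t i = done; i < len; i++)
 *         out[i] = low[in[i] & 0x0f] ^ high[in[i] >> 4];
 *
 * where low and high are the 16-byte multiplication tables described above.
 */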