#include <stdint.h>
#include <stddef.h>

#include "reedsolomon.h"

#if defined(__SSE2__) || defined(__SSSE3__) || defined(__AVX2__)
/* Intel intrinsics (_mm_*, _mm256_*) used by the SIMD code paths below */
# include <immintrin.h>
#endif

#if HAVE_FUNC_ATTRIBUTE_HOT
# define HOT_FUNCTION __attribute__((hot))
#else
# define HOT_FUNCTION
#endif

#if HAVE_FUNC_ATTRIBUTE_CONST
# define CONST_FUNCTION __attribute__((const))
#else
# define CONST_FUNCTION
#endif

#if HAVE_FUNC_ATTRIBUTE_ALWAYS_INLINE
# define ALWAYS_INLINE inline __attribute__((always_inline))
#else
# define ALWAYS_INLINE inline
#endif

#define CONCAT_HELPER(a, b) a ## b
#define CONCAT(a, b) CONCAT_HELPER(a, b)

/* Work around GCC not performing the required stack alignment, which causes
 * crashes of the SSE2 implementation (and most likely others, when the stars
 * are not aligned) on Windows when calling `set1_epi8_v`: when storing the
 * result, the target memory is not 16-byte aligned, causing a GPF/segfault.
 *
 * References:
 * - http://www.peterstock.co.uk/games/mingw_sse/
 * - http://eigen.tuxfamily.org/dox/group__TopicWrongStackAlignment.html
 * - https://gcc.gnu.org/bugzilla/show_bug.cgi?id=40838
 */
#if !defined(__clang__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 6))
# warning You have an old C compiler that generates incorrect SIMD allocation code. Using slow work-around.
# define STACK_ALIGN(t, n) \
    char _tmp ## n [2 * sizeof(t)]; \
    t *n = (t *)(((uintptr_t)&_tmp ## n + (sizeof(t) - 1)) & ~(uintptr_t)0x0F)
#else
# define STACK_ALIGN(t, n) \
    t n_tmp; \
    t *n = &n_tmp
#endif

typedef uint8_t v16u8v __attribute__((vector_size(16)));
typedef uint64_t v2u64v __attribute__((vector_size(16)));

#define T(t, n) t n[128 / 8 / sizeof(t)]
#define T1(t, n) t n
typedef union {
    T(uint8_t, u8);
    T(uint64_t, u64);
#if __SSE2__
    T1(__m128i, m128i);
#endif
    T1(v16u8v, v16u8);
    T1(v2u64v, v2u64);
} v128 __attribute__((aligned(16)));
# undef T
# undef T1

#ifdef __AVX2__
typedef __m256i v;
#else
typedef v128 v;
#endif

static ALWAYS_INLINE v128 loadu_v128(const uint8_t *in) {
    STACK_ALIGN(v128, result);

#if defined(__SSE2__)
    result->m128i = _mm_loadu_si128((const __m128i *)in);
#else
    const uint64_t *in64 = (const uint64_t *)in;
    result->u64[0] = in64[0];
    result->u64[1] = in64[1];
#endif

    return *result;
}

static ALWAYS_INLINE v loadu_v(const uint8_t *in) {
    STACK_ALIGN(v, result);

#if defined(__AVX2__)
    *result = _mm256_loadu_si256((const v *)in);
#else
    *result = loadu_v128(in);
#endif

    return *result;
}

static ALWAYS_INLINE CONST_FUNCTION v set1_epi8_v(const uint8_t c) {
    STACK_ALIGN(v, result);

#if defined(__AVX2__)
    *result = _mm256_set1_epi8(c);
#elif defined(__SSE2__)
    result->m128i = _mm_set1_epi8(c);
#else
    uint64_t c2 = c,
             tmp = (c2 << (7 * 8)) | (c2 << (6 * 8)) |
                   (c2 << (5 * 8)) | (c2 << (4 * 8)) |
                   (c2 << (3 * 8)) | (c2 << (2 * 8)) |
                   (c2 << (1 * 8)) | (c2 << (0 * 8));
    result->u64[0] = tmp;
    result->u64[1] = tmp;
#endif

    return *result;
}

static ALWAYS_INLINE CONST_FUNCTION v srli_epi64_v(const v in, const unsigned int n) {
    STACK_ALIGN(v, result);

#if defined(__AVX2__)
    *result = _mm256_srli_epi64(in, n);
#elif defined(__SSE2__)
    result->m128i = _mm_srli_epi64(in.m128i, n);
#else
    result->u64[0] = in.u64[0] >> n;
    result->u64[1] = in.u64[1] >> n;
#endif

    return *result;
}

static ALWAYS_INLINE CONST_FUNCTION v and_v(const v a, const v b) {
    STACK_ALIGN(v, result);

#if defined(__AVX2__)
    *result = _mm256_and_si256(a, b);
#elif defined(__SSE2__)
    result->m128i = _mm_and_si128(a.m128i, b.m128i);
#else
    result->v2u64 = a.v2u64 & b.v2u64;
#endif

    return *result;
}
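/* A note on how the kernels below extract nibbles with these wrappers: there
 * is no per-byte shift in SSE2/AVX2, so the high nibble of every byte is
 * obtained by shifting whole 64-bit lanes right by 4 (srli_epi64_v) and then
 * masking each byte with a broadcast 0x0f (and_v with set1_epi8_v(0x0f)).
 * The lane-wide shift drags the low four bits of the neighbouring, more
 * significant byte into the upper half of each byte, but the 0x0f mask
 * discards exactly those bits.  Worked example for a single byte 0xA7:
 *
 *     low nibble  = 0xA7 & 0x0f        = 0x07
 *     high nibble = (0xA7 >> 4) & 0x0f = 0x0A
 *
 * These two values are the table indices fed to shuffle_epi8_v below.
 */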
static ALWAYS_INLINE CONST_FUNCTION v xor_v(const v a, const v b) {
    STACK_ALIGN(v, result);

#if defined(__AVX2__)
    *result = _mm256_xor_si256(a, b);
#elif defined(__SSE2__)
    result->m128i = _mm_xor_si128(a.m128i, b.m128i);
#else
    result->v2u64 = a.v2u64 ^ b.v2u64;
#endif

    return *result;
}

static ALWAYS_INLINE CONST_FUNCTION v shuffle_epi8_v(const v vec, const v mask) {
    STACK_ALIGN(v, result);

#if defined(__AVX2__)
    *result = _mm256_shuffle_epi8(vec, mask);
#elif defined(__SSSE3__)
    result->m128i = _mm_shuffle_epi8(vec.m128i, mask.m128i);
#else
    result->u64[0] = 0;
    result->u64[1] = 0;

# define DO_BYTE(i) \
    result->u8[i] = mask.u8[i] & 0x80 ? 0 : vec.u8[mask.u8[i] & 0x0F];

    DO_BYTE( 0); DO_BYTE( 1); DO_BYTE( 2); DO_BYTE( 3);
    DO_BYTE( 4); DO_BYTE( 5); DO_BYTE( 6); DO_BYTE( 7);
    DO_BYTE( 8); DO_BYTE( 9); DO_BYTE(10); DO_BYTE(11);
    DO_BYTE(12); DO_BYTE(13); DO_BYTE(14); DO_BYTE(15);

# undef DO_BYTE
#endif

    return *result;
}

static ALWAYS_INLINE void storeu_v(uint8_t *out, const v vec) {
#if defined(__AVX2__)
    _mm256_storeu_si256((__m256i *)out, vec);
#elif defined(__SSE2__)
    _mm_storeu_si128((__m128i *)out, vec.m128i);
#else
    uint64_t *out64 = (uint64_t *)out;
    out64[0] = vec.u64[0];
    out64[1] = vec.u64[1];
#endif
}

static ALWAYS_INLINE CONST_FUNCTION v replicate_v128_v(const v128 vec) {
#if defined(__AVX2__)
    return _mm256_broadcastsi128_si256(vec.m128i);
#else
    return vec;
#endif
}

//+build !noasm !appengine

// Copyright 2015, Klaus Post, see LICENSE for details.

// Based on http://www.snia.org/sites/default/files2/SDC2013/presentations/NewThinking/EthanMiller_Screaming_Fast_Galois_Field%20Arithmetic_SIMD%20Instructions.pdf
// and http://jerasure.org/jerasure/gf-complete/tree/master

/*
// func galMulSSSE3(low, high, in, out []byte)
TEXT ·galMulSSSE3(SB), 7, $0
    MOVQ    low+0(FP), SI      // SI: &low
    MOVQ    high+24(FP), DX    // DX: &high
    MOVOU   (SI), X6           // X6: low
    MOVOU   (DX), X7           // X7: high
    MOVQ    $15, BX            // BX: low mask
    MOVQ    BX, X8
    PXOR    X5, X5
    MOVQ    in+48(FP), SI      // SI: &in
    MOVQ    in_len+56(FP), R9  // R9: len(in)
    MOVQ    out+72(FP), DX     // DX: &out
    PSHUFB  X5, X8             // X8: lomask (unpacked)
    SHRQ    $4, R9             // len(in) / 16
    CMPQ    R9, $0
    JEQ     done

loopback:
    MOVOU   (SI), X0           // in[x]
    MOVOU   X0, X1             // in[x]
    MOVOU   X6, X2             // low copy
    MOVOU   X7, X3             // high copy
    PSRLQ   $4, X1             // X1: high input
    PAND    X8, X0             // X0: low input
    PAND    X8, X1             // X1: high input
    PSHUFB  X0, X2             // X2: mul low part
    PSHUFB  X1, X3             // X3: mul high part
    PXOR    X2, X3             // X3: Result
    MOVOU   X3, (DX)           // Store
    ADDQ    $16, SI            // in += 16
    ADDQ    $16, DX            // out += 16
    SUBQ    $1, R9
    JNZ     loopback

done:
    RET
*/

#ifdef HOT
HOT_FUNCTION
#endif
PROTO(CONCAT(reedsolomon_gal_mul_, TARGET)) {
    const v low_mask_unpacked = set1_epi8_v(0x0f);

    const v128 low_vector128 = loadu_v128(low),
               high_vector128 = loadu_v128(high);
    const v low_vector = replicate_v128_v(low_vector128),
            high_vector = replicate_v128_v(high_vector128);

    const uint8_t *curr_in = in;
    uint8_t *curr_out = out;

    v in_x, high_input, low_input, mul_low_part, mul_high_part, result;

    size_t x = len / sizeof(v);

    while(x > 0) {
        in_x = loadu_v(curr_in);

        /* Split every input byte into its low and high nibble */
        low_input = and_v(in_x, low_mask_unpacked);
        high_input = and_v(srli_epi64_v(in_x, 4), low_mask_unpacked);

        /* Table-lookup the two partial products and combine them */
        mul_low_part = shuffle_epi8_v(low_vector, low_input);
        mul_high_part = shuffle_epi8_v(high_vector, high_input);
        result = xor_v(mul_low_part, mul_high_part);

        storeu_v(curr_out, result);

        curr_in += sizeof(v);
        curr_out += sizeof(v);
        x--;
    }

    /* Number of input bytes processed (whole vectors only) */
    return (curr_in - in);
}
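/* For reference, a scalar sketch of what the vectorised loop above computes
 * (kept as a comment, not compiled).  It assumes `low` and `high` point to the
 * usual 16-entry nibble tables for a fixed coefficient c -- low[i] = c * i and
 * high[i] = c * (i << 4) in GF(2^8) -- as described in the SNIA presentation
 * referenced above; the helper name is hypothetical.  Unlike this sketch, the
 * SIMD kernel only handles whole vectors and returns the number of bytes it
 * actually processed.
 *
 *     static void gal_mul_scalar(const uint8_t low[16], const uint8_t high[16],
 *                                const uint8_t *in, uint8_t *out, size_t len) {
 *         for (size_t i = 0; i < len; i++)
 *             out[i] = low[in[i] & 0x0f] ^ high[in[i] >> 4];
 *     }
 */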
/*
// func galMulSSSE3Xor(low, high, in, out []byte)
TEXT ·galMulSSSE3Xor(SB), 7, $0
    MOVQ    low+0(FP), SI      // SI: &low
    MOVQ    high+24(FP), DX    // DX: &high
    MOVOU   (SI), X6           // X6: low
    MOVOU   (DX), X7           // X7: high
    MOVQ    $15, BX            // BX: low mask
    MOVQ    BX, X8
    PXOR    X5, X5
    MOVQ    in+48(FP), SI      // SI: &in
    MOVQ    in_len+56(FP), R9  // R9: len(in)
    MOVQ    out+72(FP), DX     // DX: &out
    PSHUFB  X5, X8             // X8: lomask (unpacked)
    SHRQ    $4, R9             // len(in) / 16
    CMPQ    R9, $0
    JEQ     done_xor

loopback_xor:
    MOVOU   (SI), X0           // in[x]
    MOVOU   (DX), X4           // out[x]
    MOVOU   X0, X1             // in[x]
    MOVOU   X6, X2             // low copy
    MOVOU   X7, X3             // high copy
    PSRLQ   $4, X1             // X1: high input
    PAND    X8, X0             // X0: low input
    PAND    X8, X1             // X1: high input
    PSHUFB  X0, X2             // X2: mul low part
    PSHUFB  X1, X3             // X3: mul high part
    PXOR    X2, X3             // X3: Result
    PXOR    X4, X3             // X3: Result xor existing out
    MOVOU   X3, (DX)           // Store
    ADDQ    $16, SI            // in += 16
    ADDQ    $16, DX            // out += 16
    SUBQ    $1, R9
    JNZ     loopback_xor

done_xor:
    RET
*/

#ifdef HOT
HOT_FUNCTION
#endif
PROTO(CONCAT(reedsolomon_gal_mul_xor_, TARGET)) {
    const v low_mask_unpacked = set1_epi8_v(0x0f);

    const v128 low_vector128 = loadu_v128(low),
               high_vector128 = loadu_v128(high);
    const v low_vector = replicate_v128_v(low_vector128),
            high_vector = replicate_v128_v(high_vector128);

    const uint8_t *curr_in = in;
    uint8_t *curr_out = out;

    v in_x, high_input, low_input, mul_low_part, mul_high_part, result;

    size_t x = len / sizeof(v);

    while(x > 0) {
        in_x = loadu_v(curr_in);

        /* Split every input byte into its low and high nibble */
        low_input = and_v(in_x, low_mask_unpacked);
        high_input = and_v(srli_epi64_v(in_x, 4), low_mask_unpacked);

        /* Table-lookup the two partial products, combine them, and xor the
         * result into the bytes already present in the output */
        mul_low_part = shuffle_epi8_v(low_vector, low_input);
        mul_high_part = shuffle_epi8_v(high_vector, high_input);
        result = xor_v(loadu_v(curr_out), xor_v(mul_low_part, mul_high_part));

        storeu_v(curr_out, result);

        curr_in += sizeof(v);
        curr_out += sizeof(v);
        x--;
    }

    /* Number of input bytes processed (whole vectors only) */
    return (curr_in - in);
}
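/* Usage sketch (comment only; the names and the exact PROTO signature are
 * assumptions, not part of this file).  The plain and xor kernels are meant to
 * be combined when computing a parity shard from k data shards: the plain
 * variant initialises the output with c_0 * shard_0, and the xor variant
 * accumulates each further c_j * shard_j on top of it, since addition in
 * GF(2^8) is xor.  Assuming PROTO expands to something along the lines of
 * `size_t f(const uint8_t *low, const uint8_t *high, const uint8_t *in,
 * uint8_t *out, size_t len)`:
 *
 *     gal_mul(low_tab[0], high_tab[0], shard[0], parity, len);
 *     for (size_t j = 1; j < k; j++)
 *         gal_mul_xor(low_tab[j], high_tab[j], shard[j], parity, len);
 *
 * where gal_mul / gal_mul_xor stand for the two functions defined above and
 * low_tab[j] / high_tab[j] are the nibble tables of coefficient c_j.
 */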