#include <stdint.h>
#include <stddef.h>
#include <limits.h>

/* Taken from:
 *
 *
 */
static inline uint32_t rotr32(uint32_t n, unsigned int c) {
    const unsigned int mask = (CHAR_BIT * sizeof(n) - 1);
    c &= mask;  /* avoid undef behaviour with NDEBUG. 0 overhead for most types / compilers */
    return (n >> c) | (n << ((-c) & mask));
}

/* - `mask` is the 4-byte mask to apply to the source. It is stored in the
 *   host's native byte ordering.
 * - `mask_offset` is the initial offset into the mask. It is specified in
 *   bytes and should be between 0 and 3 (inclusive). This is necessary when
 *   we are dealing with multiple chunks.
 * - `src` is the source pointer.
 * - `len` is the size of the source (and destination) in bytes.
 * - `dst` is the destination.
 */
void _hs_mask_chunk(
        uint32_t mask,
        int mask_offset,
        uint8_t *src,
        size_t len,
        uint8_t *dst) {

    const uint8_t *src_end = src + len;

    /* We have two fast paths: one for `x86_64` and one for `i386`
     * architectures. In these fast paths, we mask 8 (or 4) bytes at a time.
     *
     * Note that we use unaligned loads and stores (allowed on these
     * architectures). This makes the code much easier to write, since we
     * don't need to guarantee that `src` and `dst` have the same alignment.
     *
     * It only causes a minor slowdown, around 5% on my machine (TM).
     */
#if defined(__x86_64__)
    /* Set up the 64-bit mask. */
    uint64_t mask64;
    mask64 = (uint64_t)(rotr32(mask, 8 * mask_offset));
    mask64 |= (mask64 << 32);

    /* Take the fast road. */
    while (src < src_end - 7) {
        *(uint64_t *)dst = *(uint64_t *)src ^ mask64;
        src += 8;
        dst += 8;
    }
#elif defined(__i386__)
    /* Set up the 32-bit mask. */
    uint32_t mask32;
    mask32 = (uint32_t)(rotr32(mask, 8 * mask_offset));

    /* Take the fast road. */
    while (src < src_end - 3) {
        *(uint32_t *)dst = *(uint32_t *)src ^ mask32;
        src += 4;
        dst += 4;
    }
#endif

    /* This is the slow path which also handles the unaligned suffix. */
    uint8_t *mask_ptr = (uint8_t *)&mask;
    while (src != src_end) {
        *dst = *src ^ *(mask_ptr + mask_offset);
        src++;
        dst++;
        mask_offset = (mask_offset + 1) & 0x3;
    }
}
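
/* A minimal, hypothetical usage sketch, not part of the original file: it
 * masks a 10-byte payload in two chunks, carrying the mask offset from the
 * first chunk into the second so the key stream continues across the chunk
 * boundary. The payload and mask value are made up for illustration.
 * Compile with -DMASK_CHUNK_EXAMPLE to build this driver.
 */
#ifdef MASK_CHUNK_EXAMPLE
#include <stdio.h>

int main(void) {
    uint8_t src[10] = { 'H', 'e', 'l', 'l', 'o', 'W', 'o', 'r', 'l', 'd' };
    uint8_t dst[10];
    uint32_t mask = 0x12345678u;  /* arbitrary example mask (assumption) */

    /* Mask the first 6 bytes as one chunk, then the remaining 4 bytes as a
     * second chunk. The second call starts at mask offset (0 + 6) & 0x3 == 2,
     * which is where the first chunk left off. */
    _hs_mask_chunk(mask, 0, src, 6, dst);
    _hs_mask_chunk(mask, (0 + 6) & 0x3, src + 6, 4, dst + 6);

    for (size_t i = 0; i < sizeof(dst); i++) {
        printf("%02x ", (unsigned)dst[i]);
    }
    printf("\n");
    return 0;
}
#endif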