/** * @file constant_time.h * @copyright * Copyright (c) 2014 Cryptography Research, Inc. \n * Released under the MIT License. See LICENSE.txt for license information. * @author Mike Hamburg * * @brief Constant-time routines. */ #ifndef __CONSTANT_TIME_H__ #define __CONSTANT_TIME_H__ 1 #include "word.h" #include /* * Constant-time operations on hopefully-compile-time-sized memory * regions. Needed for flexibility / demagication: not all fields * have sizes which are multiples of the vector width, necessitating * a change from the Ed448 versions. * * These routines would be much simpler to define at the byte level, * but if not vectorized they would be a significant fraction of the * runtime. Eg on NEON-less ARM, constant_time_lookup is like 15% of * signing time, vs 6% on Haswell with its fancy AVX2 vectors. * * If the compiler could do a good job of autovectorizing the code, * we could just leave it with the byte definition. But that's unlikely * on most deployed compilers, especially if you consider that pcmpeq[size] * is much faster than moving a scalar to the vector unit (which is what * a naive autovectorizer will do with constant_time_lookup on Intel). * * Instead, we're putting our trust in the loop unroller and unswitcher. */ /** * Unaligned big (vector?) register. */ typedef struct { big_register_t unaligned; } __attribute__((packed)) unaligned_br_t; /** * Unaligned word register, for architectures where that matters. */ typedef struct { word_t unaligned; } __attribute__((packed)) unaligned_word_t; /** * @brief Constant-time conditional swap. * * If doswap, then swap elem_bytes between *a and *b. * * *a and *b must not alias. Also, they must be at least as aligned * as their sizes, if the CPU cares about that sort of thing. */ static __inline__ void __attribute__((unused,always_inline)) constant_time_cond_swap ( void *__restrict__ a_, void *__restrict__ b_, word_t elem_bytes, mask_t doswap ) { word_t k; unsigned char *a = (unsigned char *)a_; unsigned char *b = (unsigned char *)b_; big_register_t br_mask = br_set_to_mask(doswap); for (k=0; k<=elem_bytes-sizeof(big_register_t); k+=sizeof(big_register_t)) { if (elem_bytes % sizeof(big_register_t)) { /* unaligned */ big_register_t xor = ((unaligned_br_t*)(&a[k]))->unaligned ^ ((unaligned_br_t*)(&b[k]))->unaligned; xor &= br_mask; ((unaligned_br_t*)(&a[k]))->unaligned ^= xor; ((unaligned_br_t*)(&b[k]))->unaligned ^= xor; } else { /* aligned */ big_register_t xor = *((big_register_t*)(&a[k])) ^ *((big_register_t*)(&b[k])); xor &= br_mask; *((big_register_t*)(&a[k])) ^= xor; *((big_register_t*)(&b[k])) ^= xor; } } if (elem_bytes % sizeof(big_register_t) >= sizeof(word_t)) { for (; k<=elem_bytes-sizeof(word_t); k+=sizeof(word_t)) { if (elem_bytes % sizeof(word_t)) { /* unaligned */ word_t xor = ((unaligned_word_t*)(&a[k]))->unaligned ^ ((unaligned_word_t*)(&b[k]))->unaligned; xor &= doswap; ((unaligned_word_t*)(&a[k]))->unaligned ^= xor; ((unaligned_word_t*)(&b[k]))->unaligned ^= xor; } else { /* aligned */ word_t xor = *((word_t*)(&a[k])) ^ *((word_t*)(&b[k])); xor &= doswap; *((word_t*)(&a[k])) ^= xor; *((word_t*)(&b[k])) ^= xor; } } } if (elem_bytes % sizeof(word_t)) { for (; kunaligned |= br_mask & ((const unaligned_br_t*)(&table[k+j*elem_bytes]))->unaligned; } else { /* aligned */ *(big_register_t *)(out+k) |= br_mask & *(const big_register_t*)(&table[k+j*elem_bytes]); } } word_t mask = word_is_zero(idx^j); if (elem_bytes % sizeof(big_register_t) >= sizeof(word_t)) { for (; k<=elem_bytes-sizeof(word_t); k+=sizeof(word_t)) { if (elem_bytes % sizeof(word_t)) { /* input unaligned, output aligned */ *(word_t *)(out+k) |= mask & ((const unaligned_word_t*)(&table[k+j*elem_bytes]))->unaligned; } else { /* aligned */ *(word_t *)(out+k) |= mask & *(const word_t*)(&table[k+j*elem_bytes]); } } } if (elem_bytes % sizeof(word_t)) { for (; kunaligned = ( ((unaligned_br_t*)(&table[k+j*elem_bytes]))->unaligned & ~br_mask ) | ( ((const unaligned_br_t *)(in+k))->unaligned & br_mask ); } else { /* aligned */ *(big_register_t*)(&table[k+j*elem_bytes]) = ( *(big_register_t*)(&table[k+j*elem_bytes]) & ~br_mask ) | ( *(const big_register_t *)(in+k) & br_mask ); } } word_t mask = word_is_zero(idx^j); if (elem_bytes % sizeof(big_register_t) >= sizeof(word_t)) { for (; k<=elem_bytes-sizeof(word_t); k+=sizeof(word_t)) { if (elem_bytes % sizeof(word_t)) { /* output unaligned, input aligned */ ((unaligned_word_t*)(&table[k+j*elem_bytes]))->unaligned = ( ((unaligned_word_t*)(&table[k+j*elem_bytes]))->unaligned & ~mask ) | ( *(const word_t *)(in+k) & mask ); } else { /* aligned */ *(word_t*)(&table[k+j*elem_bytes]) = ( *(word_t*)(&table[k+j*elem_bytes]) & ~mask ) | ( *(const word_t *)(in+k) & mask ); } } } if (elem_bytes % sizeof(word_t)) { for (; kunaligned = br_mask & ((const unaligned_br_t*)(&b[k]))->unaligned; } else { /* aligned */ *(big_register_t *)(a+k) = br_mask & *(const big_register_t*)(&b[k]); } } if (elem_bytes % sizeof(big_register_t) >= sizeof(word_t)) { for (; k<=elem_bytes-sizeof(word_t); k+=sizeof(word_t)) { if (elem_bytes % sizeof(word_t)) { /* unaligned */ ((unaligned_word_t*)(&a[k]))->unaligned = mask & ((const unaligned_word_t*)(&b[k]))->unaligned; } else { /* aligned */ *(word_t *)(a+k) = mask & *(const word_t*)(&b[k]); } } } if (elem_bytes % sizeof(word_t)) { for (; kunaligned = ( br_mask & ((const unaligned_br_t*)(&bTrue [k]))->unaligned) | (~br_mask & ((const unaligned_br_t*)(&bFalse[k]))->unaligned); } else { /* aligned */ *(big_register_t *)(a+k) = ( br_mask & *(const big_register_t*)(&bTrue [k])) | (~br_mask & *(const big_register_t*)(&bFalse[k])); } } if (elem_bytes % sizeof(big_register_t) >= sizeof(word_t)) { for (; k<=elem_bytes-sizeof(word_t); k+=sizeof(word_t)) { if (alignment_bytes % sizeof(word_t)) { /* unaligned */ ((unaligned_word_t*)(&a[k]))->unaligned = ( mask & ((const unaligned_word_t*)(&bTrue [k]))->unaligned) | (~mask & ((const unaligned_word_t*)(&bFalse[k]))->unaligned); } else { /* aligned */ *(word_t *)(a+k) = ( mask & *(const word_t*)(&bTrue [k])) | (~mask & *(const word_t*)(&bFalse[k])); } } } if (elem_bytes % sizeof(word_t)) { for (; k