/*------------------------------------------------------------------------- * * arch-x86.h * Atomic operations considerations specific to intel x86 * * Note that we actually require a 486 upwards because the 386 doesn't have * support for xadd and cmpxchg. Given that the 386 isn't supported anywhere * anymore that's not much of restriction luckily. * * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * NOTES: * * src/include/port/atomics/arch-x86.h * *------------------------------------------------------------------------- */ /* * Both 32 and 64 bit x86 do not allow loads to be reordered with other loads, * or stores to be reordered with other stores, but a load can be performed * before a subsequent store. * * Technically, some x86-ish chips support uncached memory access and/or * special instructions that are weakly ordered. In those cases we'd need * the read and write barriers to be lfence and sfence. But since we don't * do those things, a compiler barrier should be enough. * * "lock; addl" has worked for longer than "mfence". It's also rumored to be * faster in many scenarios */ #if defined(__INTEL_COMPILER) #define pg_memory_barrier_impl() _mm_mfence() #elif defined(__GNUC__) && (defined(__i386__) || defined(__i386)) #define pg_memory_barrier_impl() \ __asm__ __volatile__ ("lock; addl $0,0(%%esp)" : : : "memory", "cc") #elif defined(__GNUC__) && defined(__x86_64__) #define pg_memory_barrier_impl() \ __asm__ __volatile__ ("lock; addl $0,0(%%rsp)" : : : "memory", "cc") #endif #define pg_read_barrier_impl() pg_compiler_barrier_impl() #define pg_write_barrier_impl() pg_compiler_barrier_impl() /* * Provide implementation for atomics using inline assembly on x86 gcc. It's * nice to support older gcc's and the compare/exchange implementation here is * actually more efficient than the * __sync variant. */ #if defined(HAVE_ATOMICS) #if defined(__GNUC__) && !defined(__INTEL_COMPILER) #define PG_HAVE_ATOMIC_FLAG_SUPPORT typedef struct pg_atomic_flag { volatile char value; } pg_atomic_flag; #define PG_HAVE_ATOMIC_U32_SUPPORT typedef struct pg_atomic_uint32 { volatile uint32 value; } pg_atomic_uint32; /* * It's too complicated to write inline asm for 64bit types on 32bit and the * 486 can't do it. */ #ifdef __x86_64__ #define PG_HAVE_ATOMIC_U64_SUPPORT typedef struct pg_atomic_uint64 { /* alignment guaranteed due to being on a 64bit platform */ volatile uint64 value; } pg_atomic_uint64; #endif #endif /* defined(HAVE_ATOMICS) */ #endif /* defined(__GNUC__) && !defined(__INTEL_COMPILER) */ #if defined(PG_USE_INLINE) || defined(ATOMICS_INCLUDE_DEFINITIONS) #if !defined(PG_HAVE_SPIN_DELAY) /* * This sequence is equivalent to the PAUSE instruction ("rep" is * ignored by old IA32 processors if the following instruction is * not a string operation); the IA-32 Architecture Software * Developer's Manual, Vol. 3, Section 7.7.2 describes why using * PAUSE in the inner loop of a spin lock is necessary for good * performance: * * The PAUSE instruction improves the performance of IA-32 * processors supporting Hyper-Threading Technology when * executing spin-wait loops and other routines where one * thread is accessing a shared lock or semaphore in a tight * polling loop. When executing a spin-wait loop, the * processor can suffer a severe performance penalty when * exiting the loop because it detects a possible memory order * violation and flushes the core processor's pipeline. The * PAUSE instruction provides a hint to the processor that the * code sequence is a spin-wait loop. The processor uses this * hint to avoid the memory order violation and prevent the * pipeline flush. In addition, the PAUSE instruction * de-pipelines the spin-wait loop to prevent it from * consuming execution resources excessively. */ #if defined(__INTEL_COMPILER) #define PG_HAVE_SPIN_DELAY static inline pg_spin_delay_impl(void) { _mm_pause(); } #elif defined(__GNUC__) #define PG_HAVE_SPIN_DELAY static __inline__ void pg_spin_delay_impl(void) { __asm__ __volatile__( " rep; nop \n"); } #elif defined(WIN32_ONLY_COMPILER) && defined(__x86_64__) #define PG_HAVE_SPIN_DELAY static __forceinline void pg_spin_delay_impl(void) { _mm_pause(); } #elif defined(WIN32_ONLY_COMPILER) #define PG_HAVE_SPIN_DELAY static __forceinline void pg_spin_delay_impl(void) { /* See comment for gcc code. Same code, MASM syntax */ __asm rep nop; } #endif #endif /* !defined(PG_HAVE_SPIN_DELAY) */ #if defined(HAVE_ATOMICS) /* inline assembly implementation for gcc */ #if defined(__GNUC__) && !defined(__INTEL_COMPILER) #define PG_HAVE_ATOMIC_TEST_SET_FLAG static inline bool pg_atomic_test_set_flag_impl(volatile pg_atomic_flag *ptr) { register char _res = 1; __asm__ __volatile__( " lock \n" " xchgb %0,%1 \n" : "+q"(_res), "+m"(ptr->value) : : "memory"); return _res == 0; } #define PG_HAVE_ATOMIC_CLEAR_FLAG static inline void pg_atomic_clear_flag_impl(volatile pg_atomic_flag *ptr) { /* * On a TSO architecture like x86 it's sufficient to use a compiler * barrier to achieve release semantics. */ __asm__ __volatile__("" ::: "memory"); ptr->value = 0; } #define PG_HAVE_ATOMIC_COMPARE_EXCHANGE_U32 static inline bool pg_atomic_compare_exchange_u32_impl(volatile pg_atomic_uint32 *ptr, uint32 *expected, uint32 newval) { char ret; /* * Perform cmpxchg and use the zero flag which it implicitly sets when * equal to measure the success. */ __asm__ __volatile__( " lock \n" " cmpxchgl %4,%5 \n" " setz %2 \n" : "=a" (*expected), "=m"(ptr->value), "=q" (ret) : "a" (*expected), "r" (newval), "m"(ptr->value) : "memory", "cc"); return (bool) ret; } #define PG_HAVE_ATOMIC_FETCH_ADD_U32 static inline uint32 pg_atomic_fetch_add_u32_impl(volatile pg_atomic_uint32 *ptr, int32 add_) { uint32 res; __asm__ __volatile__( " lock \n" " xaddl %0,%1 \n" : "=q"(res), "=m"(ptr->value) : "0" (add_), "m"(ptr->value) : "memory", "cc"); return res; } #ifdef __x86_64__ #define PG_HAVE_ATOMIC_COMPARE_EXCHANGE_U64 static inline bool pg_atomic_compare_exchange_u64_impl(volatile pg_atomic_uint64 *ptr, uint64 *expected, uint64 newval) { char ret; /* * Perform cmpxchg and use the zero flag which it implicitly sets when * equal to measure the success. */ __asm__ __volatile__( " lock \n" " cmpxchgq %4,%5 \n" " setz %2 \n" : "=a" (*expected), "=m"(ptr->value), "=q" (ret) : "a" (*expected), "r" (newval), "m"(ptr->value) : "memory", "cc"); return (bool) ret; } #define PG_HAVE_ATOMIC_FETCH_ADD_U64 static inline uint64 pg_atomic_fetch_add_u64_impl(volatile pg_atomic_uint64 *ptr, int64 add_) { uint64 res; __asm__ __volatile__( " lock \n" " xaddq %0,%1 \n" : "=q"(res), "=m"(ptr->value) : "0" (add_), "m"(ptr->value) : "memory", "cc"); return res; } #endif /* __x86_64__ */ #endif /* defined(__GNUC__) && !defined(__INTEL_COMPILER) */ #endif /* HAVE_ATOMICS */ #endif /* defined(PG_USE_INLINE) || defined(ATOMICS_INCLUDE_DEFINITIONS) */