/*
 * Self-test for carry-less multiplication (PCLMULQDQ / VPCLMULQDQ builtins).
 *
 * UINT_SIZE is defined before including simd.h; presumably it selects
 * 8-byte unsigned vector elements for vec_t - confirm against simd.h.
 * ENTRY(), B(), touch(), vec_t, vdi_t, vqi_t, VEC_SIZE and ELEM_COUNT are
 * not defined in this file and are assumed to come from simd.h.
 */
#define UINT_SIZE 8

#include "simd.h"
ENTRY(clmul_test);

/*
 * eq(x, y):           true when all elements of x and y compare equal.
 * lane_shr_unit(x):   palignr-based helper used as the "upper half" input
 *                     of the lane shifts further down; presumably yields
 *                     each 128-bit lane shifted right by 64 bits with zero
 *                     fill - confirm against the palignr builtin semantics.
 */
#ifdef __AVX512F__ /* AVX512BW may get enabled only below */
/* Comparison mask value with one bit set per vector element. */
# define ALL_TRUE (~0ULL >> (64 - ELEM_COUNT))
# define eq(x, y) (B(pcmpeqq, _mask, (vdi_t)(x), (vdi_t)(y), -1) == ALL_TRUE)
/*
 * The mask constant has the low 8 of every 16 bits set, truncated to
 * VEC_SIZE bits; the remaining positions presumably take the zero vector
 * passed as the merge source.
 */
# define lane_shr_unit(x) \
    ((vec_t)B(palignr, _mask, (vdi_t)(x), (vdi_t)(x), 64, (vdi_t){}, \
              0x00ff00ff00ff00ffULL & (~0ULL >> (64 - VEC_SIZE))))
#else
/* Reduce a vector comparison result to a scalar "all elements true". */
# if defined(__AVX2__) && VEC_SIZE == 32
# define to_bool(cmp) B(ptestc, , cmp, (vdi_t){} == 0)
# else
# define to_bool(cmp) (__builtin_ia32_pmovmskb128(cmp) == 0xffff)
# endif
# define eq(x, y) to_bool((x) == (y))
# define lane_shr_unit(x) ((vec_t)B(palignr, , (vdi_t){}, (vdi_t)(x), 64))
#endif

/* Invoke the named carry-less multiply builtin with immediate c. */
#define CLMUL(op, x, y, c) (vec_t)(__builtin_ia32_ ## op((vdi_t)(x), (vdi_t)(y), c))

/* Pick the multiply and double-shift builtins matching the vector size. */
#if VEC_SIZE == 16
# define clmul(x, y, c) CLMUL(pclmulqdq128, x, y, c)
# define vpshrd __builtin_ia32_vpshrd_v2di
#elif VEC_SIZE == 32
# define clmul(x, y, c) CLMUL(vpclmulqdq_v4di, x, y, c)
# define vpshrd __builtin_ia32_vpshrd_v4di
#elif VEC_SIZE == 64
# define clmul(x, y, c) CLMUL(vpclmulqdq_v8di, x, y, c)
# define vpshrd __builtin_ia32_vpshrd_v8di
#endif

/*
 * Shorthands for the four immediate values.  In the name suffix the first
 * letter refers to the quadword (l = low, h = high) taken from x and the
 * second to the one taken from y; bit 0 of the immediate selects the
 * former and bit 4 the latter.
 */
#define clmul_ll(x, y) clmul(x, y, 0x00)
#define clmul_hl(x, y) clmul(x, y, 0x01)
#define clmul_lh(x, y) clmul(x, y, 0x10)
#define clmul_hh(x, y) clmul(x, y, 0x11)

/*
 * lane_shr_i(x, n) / lane_shr_v(x, n): shift x right by n bits, feeding in
 * bits from lane_shr_unit(x) from the top.  Callers in this file use
 * 0 <= n < 128.  Where available the _i form uses the immediate-count
 * double shift (VPSHRD) and the _v form the variable-count one (VPSHRDV);
 * otherwise both share a generic implementation.
 */
#if defined(__AVX512VBMI2__)
# pragma GCC target ( "avx512bw" )
# define lane_shr_i(x, n) ({ \
    vec_t h_ = lane_shr_unit(x); \
    touch(h_); \
    (n) < 64 ? (vec_t)vpshrd((vdi_t)(x), (vdi_t)(h_), n) : h_ >> ((n) - 64); \
})
# define lane_shr_v(x, n) ({ \
    vec_t t_ = (x), h_ = lane_shr_unit(x); \
    typeof(t_[0]) n_ = (n); \
    if ( (n) < 64 ) \
        /* gcc does not support embedded broadcast */ \
        asm ( "vpshrdvq %2%{1to%c3%}, %1, %0" \
              : "+v" (t_) : "v" (h_), "m" (n_), "i" (ELEM_COUNT) ); \
    else \
        t_ = h_ >> ((n) - 64); \
    t_; \
})
#else
# define lane_shr_i lane_shr_v
/*
 * For n == 0 t_ is x itself, so that OR-ing in t_ << 0 leaves x unchanged.
 * For 0 < n < 64 the low n bits of t_ are moved to the top of each element
 * by the left shift of (64 - n), expressed as (-n & 0x3f).
 */
# define lane_shr_v(x, n) ({ \
    vec_t t_ = (n) > 0 ? lane_shr_unit(x) : (x); \
    (n) < 64 ? ((x) >> (n)) | (t_ << (-(n) & 0x3f)) \
             : t_ >> ((n) - 64); \
})
#endif

/*
 * Exercise the carry-less multiply builtins on a sequence of byte patterns.
 *
 * Returns 0 when every check passes, otherwise the source line number
 * (__LINE__) of the first failing check.
 *
 * NOTE(review): touch() is defined elsewhere; it looks like a compiler
 * barrier keeping the operations from being folded or merged, so the calls
 * and their order should be left alone - confirm against simd.h.
 */
int clmul_test(void)
{
    unsigned int i;
    vec_t src;
    vqi_t raw = {};

    /* Initial pattern: element i of raw holds i (element 0 stays zero). */
    for ( i = 1; i < VEC_SIZE; ++i )
        raw[i] = i;
    src = (vec_t)raw;

    /* Each iteration advances src by VEC_SIZE times 0x0101010101010101. */
    for ( i = 0; i < 256; i += VEC_SIZE )
    {
        vec_t x = {}, y, z, lo, hi;
        unsigned int j;

        /* Multiplying by zero has to produce zero. */
        touch(x);
        y = clmul_ll(src, x);
        touch(x);

        if ( !eq(y, x) ) return __LINE__;

        /*
         * Multiplying by one has to reproduce the other factor: with the
         * even elements of x set to 1, the "ll" product gives the even
         * elements of src and the "lh" product the odd ones, which get
         * merged into y before comparing against src.
         */
        for ( j = 0; j < ELEM_COUNT; j += 2 )
            x[j] = 1;

        touch(src);
        y = clmul_ll(x, src);
        touch(src);
        z = clmul_lh(x, src);
        touch(src);

        for ( j = 0; j < ELEM_COUNT; j += 2 )
            y[j + 1] = z[j];

        if ( !eq(y, src) ) return __LINE__;

        /*
         * Besides the obvious property of the low and high half products
         * being the same either direction, the "square" of a number has the
         * property of simply being the original bit pattern with a zero bit
         * inserted between any two bits. This is what the code below checks.
         */

        x = src;
        touch(src);
        y = clmul_lh(x, src);
        touch(src);
        z = clmul_hl(x, src);

        if ( !eq(y, z) ) return __LINE__;

        /* lo / hi: squares of the even / odd elements of src. */
        touch(src);
        y = lo = clmul_ll(x, src);
        touch(src);
        z = hi = clmul_hh(x, src);
        touch(src);

        for ( j = 0; j < 64; ++j )
        {
            /*
             * Shift the squares right by 2 * j in one go (variable count)
             * and compare with y / z, which are shifted by 2 once per
             * iteration; this also cross-checks the two shift helpers.
             */
            vec_t l = lane_shr_v(lo, 2 * j);
            vec_t h = lane_shr_v(hi, 2 * j);
            unsigned int n;

            if ( !eq(l, y) ) return __LINE__;
            if ( !eq(h, z) ) return __LINE__;

            x = src >> j;

            /*
             * Bit j of each source element has to show up at bit 2 * j of
             * its square, with bit 2 * j + 1 being zero.
             */
            for ( n = 0; n < ELEM_COUNT; n += 2 )
            {
                if ( (x[n + 0] & 1) != (l[n] & 3) ) return __LINE__;
                if ( (x[n + 1] & 1) != (h[n] & 3) ) return __LINE__;
            }

            touch(y);
            y = lane_shr_i(y, 2);
            touch(z);
            z = lane_shr_i(z, 2);
        }

        src += 0x0101010101010101ULL * VEC_SIZE;
    }

    return 0;
}