#include

#if defined(__i386__) && VEC_SIZE == 16
# define ENTRY(name) \
asm ( "\t.text\n" \
      "\t.globl _start\n" \
      "_start:\n" \
      "\tpush %ebp\n" \
      "\tmov %esp,%ebp\n" \
      "\tand $~0xf,%esp\n" \
      "\tcall " #name "\n" \
      "\tleave\n" \
      "\tret" )
#else
# define ENTRY(name) \
asm ( "\t.text\n" \
      "\t.globl _start\n" \
      "_start:\n" \
      "\tjmp " #name )
#endif

typedef
#if defined(INT_SIZE)
# define ELEM_SIZE INT_SIZE
signed int
# if INT_SIZE == 1
#  define MODE QI
# elif INT_SIZE == 2
#  define MODE HI
# elif INT_SIZE == 4
#  define MODE SI
# elif INT_SIZE == 8
#  define MODE DI
# endif
#elif defined(UINT_SIZE)
# define ELEM_SIZE UINT_SIZE
unsigned int
# if UINT_SIZE == 1
#  define MODE QI
# elif UINT_SIZE == 2
#  define MODE HI
# elif UINT_SIZE == 4
#  define MODE SI
# elif UINT_SIZE == 8
#  define MODE DI
# endif
#elif defined(FLOAT_SIZE)
float
# define ELEM_SIZE FLOAT_SIZE
# if FLOAT_SIZE == 4
#  define MODE SF
# elif FLOAT_SIZE == 8
#  define MODE DF
# endif
#endif
#ifndef VEC_SIZE
# define VEC_SIZE ELEM_SIZE
#endif
__attribute__((mode(MODE), vector_size(VEC_SIZE))) vec_t;

#define ELEM_COUNT (VEC_SIZE / ELEM_SIZE)

typedef unsigned int __attribute__((mode(QI), vector_size(VEC_SIZE))) byte_vec_t;

/* Various builtins want plain char / int / long long vector types ... */
typedef char __attribute__((vector_size(VEC_SIZE))) vqi_t;
typedef short __attribute__((vector_size(VEC_SIZE))) vhi_t;
typedef int __attribute__((vector_size(VEC_SIZE))) vsi_t;
#if VEC_SIZE >= 8
typedef long long __attribute__((vector_size(VEC_SIZE))) vdi_t;
typedef double __attribute__((vector_size(VEC_SIZE))) vdf_t;
#endif

#if ELEM_SIZE == 1
typedef vqi_t vint_t;
#elif ELEM_SIZE == 2
typedef vhi_t vint_t;
#elif ELEM_SIZE == 4
typedef vsi_t vint_t;
#elif ELEM_SIZE == 8
typedef vdi_t vint_t;
#endif

#if VEC_SIZE >= 16

# if ELEM_COUNT >= 2
#  if VEC_SIZE > 32
#   define HALF_SIZE (VEC_SIZE / 2)
#  else
#   define HALF_SIZE 16
#  endif
typedef typeof((vec_t){}[0]) __attribute__((vector_size(HALF_SIZE))) half_t;
typedef char __attribute__((vector_size(HALF_SIZE))) vqi_half_t;
typedef short __attribute__((vector_size(HALF_SIZE))) vhi_half_t;
typedef int __attribute__((vector_size(HALF_SIZE))) vsi_half_t;
typedef long long __attribute__((vector_size(HALF_SIZE))) vdi_half_t;
typedef float __attribute__((vector_size(HALF_SIZE))) vsf_half_t;
# endif

# if ELEM_COUNT >= 4
#  if VEC_SIZE > 64
#   define QUARTER_SIZE (VEC_SIZE / 4)
#  else
#   define QUARTER_SIZE 16
#  endif
typedef typeof((vec_t){}[0]) __attribute__((vector_size(QUARTER_SIZE))) quarter_t;
typedef char __attribute__((vector_size(QUARTER_SIZE))) vqi_quarter_t;
typedef short __attribute__((vector_size(QUARTER_SIZE))) vhi_quarter_t;
typedef int __attribute__((vector_size(QUARTER_SIZE))) vsi_quarter_t;
typedef long long __attribute__((vector_size(QUARTER_SIZE))) vdi_quarter_t;
# endif

# if ELEM_COUNT >= 8
#  if VEC_SIZE > 128
#   define EIGHTH_SIZE (VEC_SIZE / 8)
#  else
#   define EIGHTH_SIZE 16
#  endif
typedef typeof((vec_t){}[0]) __attribute__((vector_size(EIGHTH_SIZE))) eighth_t;
typedef char __attribute__((vector_size(EIGHTH_SIZE))) vqi_eighth_t;
typedef short __attribute__((vector_size(EIGHTH_SIZE))) vhi_eighth_t;
typedef int __attribute__((vector_size(EIGHTH_SIZE))) vsi_eighth_t;
typedef long long __attribute__((vector_size(EIGHTH_SIZE))) vdi_eighth_t;
# endif

# define DECL_PAIR(w) \
typedef w ## _t pair_t; \
typedef vsi_ ## w ## _t vsi_pair_t; \
typedef vdi_ ## w ## _t vdi_pair_t
# define DECL_QUARTET(w) \
typedef w ## _t quartet_t; \
typedef vsi_ ## w ## _t vsi_quartet_t; \
typedef vdi_ ## w ## _t vdi_quartet_t
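/*
 * For illustration: with ELEM_COUNT == 8 the DECL_PAIR(quarter) invocation
 * below expands to
 *
 *     typedef quarter_t pair_t;
 *     typedef vsi_quarter_t vsi_pair_t;
 *     typedef vdi_quarter_t vdi_pair_t;
 *
 * mapping the generic pair_t / quartet_t / octet_t names onto the suitable
 * fraction-of-a-vector type for the current element count.  Note that
 * HALF_SIZE / QUARTER_SIZE / EIGHTH_SIZE above never drop below 16 bytes,
 * presumably because the builtins these types get used with don't accept
 * vectors narrower than that.
 */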
# define DECL_OCTET(w) \
typedef w ## _t octet_t; \
typedef vsi_ ## w ## _t vsi_octet_t; \
typedef vdi_ ## w ## _t vdi_octet_t

# if ELEM_COUNT == 4
DECL_PAIR(half);
# elif ELEM_COUNT == 8
DECL_PAIR(quarter);
DECL_QUARTET(half);
# elif ELEM_COUNT == 16
DECL_PAIR(eighth);
DECL_QUARTET(quarter);
DECL_OCTET(half);
# endif

# undef DECL_OCTET
# undef DECL_QUARTET
# undef DECL_PAIR

#endif

/*
 * B(n, s, a...) forms the name of the GCC builtin matching the current
 * vector width: e.g. with VEC_SIZE == 32, B(pabsd, , x) expands to
 * __builtin_ia32_pabsd256(x).  B_() is for the few builtins named without
 * a width infix, while BR()/BR_() additionally append a constant
 * rounding-mode operand (4, i.e. _MM_FROUND_CUR_DIRECTION) to the 512-bit
 * forms.
 */
#if VEC_SIZE == 16
# define B(n, s, a...) __builtin_ia32_ ## n ## 128 ## s(a)
# define B_(n, s, a...) __builtin_ia32_ ## n ## s(a)
#elif VEC_SIZE == 32
# define B(n, s, a...) __builtin_ia32_ ## n ## 256 ## s(a)
#elif VEC_SIZE == 64
# define B(n, s, a...) __builtin_ia32_ ## n ## 512 ## s(a)
# define BR(n, s, a...) __builtin_ia32_ ## n ## 512 ## s(a, 4)
#endif
#ifndef B_
# define B_ B
#endif
#ifndef BR
# define BR B
# define BR_ B_
#endif
#ifndef BR_
# define BR_ BR
#endif

#ifdef __AVX512F__

/* Sadly there are a few exceptions to the general naming rules. */
# define __builtin_ia32_broadcastf32x4_512_mask __builtin_ia32_broadcastf32x4_512
# define __builtin_ia32_broadcasti32x4_512_mask __builtin_ia32_broadcasti32x4_512
# define __builtin_ia32_exp2pd512_mask __builtin_ia32_exp2pd_mask
# define __builtin_ia32_exp2ps512_mask __builtin_ia32_exp2ps_mask
# define __builtin_ia32_insertf32x4_512_mask __builtin_ia32_insertf32x4_mask
# define __builtin_ia32_insertf32x8_512_mask __builtin_ia32_insertf32x8_mask
# define __builtin_ia32_insertf64x4_512_mask __builtin_ia32_insertf64x4_mask
# define __builtin_ia32_inserti32x4_512_mask __builtin_ia32_inserti32x4_mask
# define __builtin_ia32_inserti32x8_512_mask __builtin_ia32_inserti32x8_mask
# define __builtin_ia32_inserti64x4_512_mask __builtin_ia32_inserti64x4_mask
# define __builtin_ia32_rcp28pd512_mask __builtin_ia32_rcp28pd_mask
# define __builtin_ia32_rcp28ps512_mask __builtin_ia32_rcp28ps_mask
# define __builtin_ia32_rndscalepd_512_mask __builtin_ia32_rndscalepd_mask
# define __builtin_ia32_rndscaleps_512_mask __builtin_ia32_rndscaleps_mask
# define __builtin_ia32_rsqrt28pd512_mask __builtin_ia32_rsqrt28pd_mask
# define __builtin_ia32_rsqrt28ps512_mask __builtin_ia32_rsqrt28ps_mask
# define __builtin_ia32_shuf_f32x4_512_mask __builtin_ia32_shuf_f32x4_mask
# define __builtin_ia32_shuf_f64x2_512_mask __builtin_ia32_shuf_f64x2_mask
# define __builtin_ia32_shuf_i32x4_512_mask __builtin_ia32_shuf_i32x4_mask
# define __builtin_ia32_shuf_i64x2_512_mask __builtin_ia32_shuf_i64x2_mask

# if VEC_SIZE > ELEM_SIZE && (defined(VEC_MAX) ? VEC_MAX : VEC_SIZE) < 64
#  pragma GCC target ( "avx512vl" )
# endif

/* Rename a mnemonic: make "v<insn><old>" assemble as "v<insn><new>". */
# define REN(insn, old, new) \
    asm ( ".macro v" #insn #old " o:vararg \n\t" \
          "v" #insn #new " \\o \n\t" \
          ".endm" )

/*
 * The original plan was to effect use of EVEX encodings for scalar as well as
 * 128- and 256-bit insn variants by restricting the compiler to use (on 64-bit
 * only of course) XMM16-XMM31 only. All sorts of compiler errors result when
 * doing this with gcc 8.2. Therefore resort to injecting {evex} prefixes,
 * which has the benefit of also working for 32-bit. Granted, there is a lot of
 * escaping to get right here.
 */
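/*
 * For illustration of what the "override" assembler macro below achieves:
 * once OVR(addps) has been emitted, a subsequent
 *
 *     vaddps (%eax), %xmm1, %xmm2
 *
 * no longer assembles directly, but invokes the just-defined macro of the
 * same name, which re-emits the instruction as
 *
 *     {evex} vaddps (%eax), %xmm1, %xmm2
 *
 * forcing gas to choose the EVEX encoding.  The \(\) sequences are the
 * escaping referred to above: they keep the inner macro bodies and
 * arguments from being substituted too early.
 */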
*/ asm ( ".macro override insn \n\t" ".macro $\\insn o:vararg \n\t" ".purgem \\insn \n\t" "{evex} \\insn \\(\\)o \n\t" ".macro \\insn o:vararg \n\t" "$\\insn \\(\\(\\))o \n\t" ".endm \n\t" ".endm \n\t" ".macro \\insn o:vararg \n\t" "$\\insn \\(\\)o \n\t" ".endm \n\t" ".endm" ); # define OVR(n) asm ( "override v" #n ) # define OVR_SFP(n) OVR(n ## sd); OVR(n ## ss) # ifdef __AVX512VL__ # ifdef __AVX512BW__ # define OVR_BW(n) OVR(p ## n ## b); OVR(p ## n ## w) # else # define OVR_BW(n) # endif # define OVR_DQ(n) OVR(p ## n ## d); OVR(p ## n ## q) # define OVR_VFP(n) OVR(n ## pd); OVR(n ## ps) # else # define OVR_BW(n) # define OVR_DQ(n) # define OVR_VFP(n) # endif # define OVR_FMA(n, w) OVR_ ## w(n ## 132); OVR_ ## w(n ## 213); \ OVR_ ## w(n ## 231) # define OVR_FP(n) OVR_VFP(n); OVR_SFP(n) # define OVR_INT(n) OVR_BW(n); OVR_DQ(n) OVR_INT(broadcast); OVR_SFP(broadcast); OVR_SFP(comi); OVR_VFP(cvtdq2); OVR_INT(abs); OVR_FP(add); OVR_INT(add); OVR_BW(adds); OVR_BW(addus); OVR_BW(avg); OVR_FP(div); OVR(extractps); OVR_FMA(fmadd, FP); OVR_FMA(fmaddsub, VFP); OVR_FMA(fmsub, FP); OVR_FMA(fmsubadd, VFP); OVR_FMA(fnmadd, FP); OVR_FMA(fnmsub, FP); OVR(insertps); OVR_FP(max); OVR_INT(maxs); OVR_INT(maxu); OVR_FP(min); OVR_INT(mins); OVR_INT(minu); OVR(movd); OVR(movq); OVR_SFP(mov); OVR_VFP(mova); OVR(movhlps); OVR(movhpd); OVR(movhps); OVR(movlhps); OVR(movlpd); OVR(movlps); OVR_VFP(movnt); OVR_VFP(movu); OVR_FP(mul); OVR_VFP(perm); OVR_VFP(permil); OVR_VFP(shuf); OVR_INT(sll); OVR_DQ(sllv); OVR_FP(sqrt); OVR_INT(sra); OVR_DQ(srav); OVR_INT(srl); OVR_DQ(srlv); OVR_FP(sub); OVR_INT(sub); OVR_BW(subs); OVR_BW(subus); OVR_SFP(ucomi); OVR_VFP(unpckh); OVR_VFP(unpckl); # ifdef __AVX512VL__ # if ELEM_SIZE == 8 && defined(__AVX512DQ__) REN(extract, f128, f64x2); REN(extract, i128, i64x2); REN(insert, f128, f64x2); REN(insert, i128, i64x2); # else REN(extract, f128, f32x4); REN(extract, i128, i32x4); REN(insert, f128, f32x4); REN(insert, i128, i32x4); # endif # if ELEM_SIZE == 8 REN(movdqa, , 64); REN(movdqu, , 64); REN(pand, , q); REN(pandn, , q); REN(por, , q); REN(pxor, , q); # else # if ELEM_SIZE == 1 && defined(__AVX512BW__) REN(movdq, a, u8); REN(movdqu, , 8); # elif ELEM_SIZE == 2 && defined(__AVX512BW__) REN(movdq, a, u16); REN(movdqu, , 16); # else REN(movdqa, , 32); REN(movdqu, , 32); # endif REN(pand, , d); REN(pandn, , d); REN(por, , d); REN(pxor, , d); # endif OVR(aesdec); OVR(aesdeclast); OVR(aesenc); OVR(aesenclast); OVR(cvtpd2dqx); OVR(cvtpd2dqy); OVR(cvtpd2psx); OVR(cvtpd2psy); OVR(cvtph2ps); OVR(cvtps2dq); OVR(cvtps2pd); OVR(cvtps2ph); OVR(cvtsd2ss); OVR(cvtsd2si); OVR(cvtsd2sil); OVR(cvtsd2siq); OVR(cvtsi2sd); OVR(cvtsi2sdl); OVR(cvtsi2sdq); OVR(cvtsi2ss); OVR(cvtsi2ssl); OVR(cvtsi2ssq); OVR(cvtss2sd); OVR(cvtss2si); OVR(cvtss2sil); OVR(cvtss2siq); OVR(cvttpd2dqx); OVR(cvttpd2dqy); OVR(cvttps2dq); OVR(cvttsd2si); OVR(cvttsd2sil); OVR(cvttsd2siq); OVR(cvttss2si); OVR(cvttss2sil); OVR(cvttss2siq); OVR(gf2p8mulb); OVR(movddup); OVR(movntdq); OVR(movntdqa); OVR(movshdup); OVR(movsldup); OVR(pclmulqdq); OVR(permd); OVR(permq); OVR(pmovsxbd); OVR(pmovsxbq); OVR(pmovsxdq); OVR(pmovsxwd); OVR(pmovsxwq); OVR(pmovzxbd); OVR(pmovzxbq); OVR(pmovzxdq); OVR(pmovzxwd); OVR(pmovzxwq); OVR(pmulld); OVR(pmuldq); OVR(pmuludq); OVR(pshufd); OVR(punpckhdq); OVR(punpckhqdq); OVR(punpckldq); OVR(punpcklqdq); # endif # ifdef __AVX512BW__ OVR(pextrb); OVR(pextrw); OVR(pinsrb); OVR(pinsrw); # ifdef __AVX512VL__ OVR(packssdw); OVR(packsswb); OVR(packusdw); OVR(packuswb); OVR(palignr); OVR(pmaddubsw); OVR(pmaddwd); 
OVR(pmovsxbw);
OVR(pmovzxbw);
OVR(pmulhrsw);
OVR(pmulhuw);
OVR(pmulhw);
OVR(pmullw);
OVR(psadbw);
OVR(pshufb);
OVR(pshufhw);
OVR(pshuflw);
OVR(pslldq);
OVR(psrldq);
OVR(punpckhbw);
OVR(punpckhwd);
OVR(punpcklbw);
OVR(punpcklwd);
#  endif
# endif

# ifdef __AVX512DQ__
OVR_VFP(and);
OVR_VFP(andn);
OVR_VFP(or);
OVR(pextrd);
OVR(pextrq);
OVR(pinsrd);
OVR(pinsrq);
#  ifdef __AVX512VL__
OVR(pmullq);
#  endif
OVR_VFP(xor);
# endif

# undef OVR_VFP
# undef OVR_SFP
# undef OVR_INT
# undef OVR_FP
# undef OVR_FMA
# undef OVR_DQ
# undef OVR_BW
# undef OVR

#endif /* __AVX512F__ */

/*
 * Suppress value propagation by the compiler, preventing unwanted
 * optimization. This at once makes the compiler use memory operands
 * more often, which for our purposes is the more interesting case.
 */
#define touch(var) asm volatile ( "" : "+m" (var) )

static inline vec_t undef(void)
{
    vec_t v = v; /* deliberately self-initialized: any value will do */

    return v;
}
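/*
 * Illustrative (hypothetical) use of the helpers above:
 *
 *     ENTRY(simd_test);    -- emit _start, transferring control to simd_test()
 *     ...
 *     vec_t x = undef();   -- initial contents don't matter
 *     touch(x);            -- compiler must assume x was modified in memory
 *     y += x;              -- so x is now likely read via a memory operand
 */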