1 #include <stdbool.h>
2 
/*
 * Entry point of the test blob: emits the global symbol _start into .text
 * and transfers control to simd_test().
 *
 * Note that VEC_SIZE is evaluated here ahead of the fallback definition
 * further down in this file, so the first branch can only be selected when
 * VEC_SIZE is supplied from outside (e.g. on the compiler command line).
 */
asm (
    "\t.text\n"
    "\t.globl _start\n"
    "_start:\n"
#if defined(__i386__) && VEC_SIZE == 16
    /*
     * Set up a frame, round %esp down to a 16-byte boundary, and call the
     * test; "leave" restores the original stack pointer before returning.
     * Presumably needed so that 16-byte vectors placed on the stack are
     * suitably aligned - TODO confirm.
     */
    "\tpush %ebp\n"
    "\tmov %esp,%ebp\n"
    "\tand $~0xf,%esp\n"
    "\tcall simd_test\n"
    "\tleave\n"
    "\tret"
#else
    /* No stack adjustment: jump, so simd_test() returns to _start's caller. */
    "\tjmp simd_test"
#endif
    );
18 
/*
 * vec_t - the vector type under test.
 *
 * The element type is chosen by whichever of INT_SIZE, UINT_SIZE, or
 * FLOAT_SIZE (an element size in bytes) is defined; they are checked in that
 * order. The typedef is assembled piecewise: the selected branch supplies the
 * base type keyword and defines ELEM_SIZE as well as the machine mode (MODE)
 * that fixes the element width. VEC_SIZE (total size in bytes) defaults to
 * ELEM_SIZE, i.e. a single-element vector, unless it is already defined.
 *
 * NOTE(review): if none of the three selectors is defined, MODE and ELEM_SIZE
 * remain undefined and the typedef has no base type - presumably the build
 * always passes exactly one of them; confirm against the build rules.
 */
typedef
#if defined(INT_SIZE)
# define ELEM_SIZE INT_SIZE
signed int
# if INT_SIZE == 1
#  define MODE QI
# elif INT_SIZE == 2
#  define MODE HI
# elif INT_SIZE == 4
#  define MODE SI
# elif INT_SIZE == 8
#  define MODE DI
# endif
#elif defined(UINT_SIZE)
# define ELEM_SIZE UINT_SIZE
unsigned int
# if UINT_SIZE == 1
#  define MODE QI
# elif UINT_SIZE == 2
#  define MODE HI
# elif UINT_SIZE == 4
#  define MODE SI
# elif UINT_SIZE == 8
#  define MODE DI
# endif
#elif defined(FLOAT_SIZE)
float
# define ELEM_SIZE FLOAT_SIZE
# if FLOAT_SIZE == 4
#  define MODE SF
# elif FLOAT_SIZE == 8
#  define MODE DF
# endif
#endif
#ifndef VEC_SIZE
# define VEC_SIZE ELEM_SIZE
#endif
__attribute__((mode(MODE), vector_size(VEC_SIZE))) vec_t;
57 
/* Number of elements in a vec_t. */
#define ELEM_COUNT (VEC_SIZE / ELEM_SIZE)

/* Byte-granular (unsigned) vector of the same overall size as vec_t. */
typedef unsigned int __attribute__((mode(QI), vector_size(VEC_SIZE))) byte_vec_t;

/* Various builtins want plain char / int / long long vector types ... */
typedef char __attribute__((vector_size(VEC_SIZE))) vqi_t;
typedef short __attribute__((vector_size(VEC_SIZE))) vhi_t;
typedef int __attribute__((vector_size(VEC_SIZE))) vsi_t;
/* The long long variant is only declared for vectors of at least 8 bytes. */
#if VEC_SIZE >= 8
typedef long long __attribute__((vector_size(VEC_SIZE))) vdi_t;
#endif
69 
/*
 * to_bool(cmp) - collapse the result of a vector comparison into a scalar
 * that is non-zero only if the comparison held for the whole vector.
 *
 * Where a suitable ISA extension is enabled, a builtin is used so that the
 * corresponding instruction gets exercised: the movmsk flavours compare the
 * gathered mask against the "all bits set" value for the given element
 * count, while the ptest/vtest flavours test against an all-ones vector
 * (produced by comparing a zero vector with 0). Configurations not covered
 * here leave to_bool undefined at this point.
 */
#if VEC_SIZE == 8 && defined(__SSE__)
# define to_bool(cmp) (__builtin_ia32_pmovmskb(cmp) == 0xff)
#elif VEC_SIZE == 16
# if defined(__AVX__) && defined(FLOAT_SIZE)
#  if ELEM_SIZE == 4
#   define to_bool(cmp) __builtin_ia32_vtestcps(cmp, (vec_t){} == 0)
#  elif ELEM_SIZE == 8
#   define to_bool(cmp) __builtin_ia32_vtestcpd(cmp, (vec_t){} == 0)
#  endif
# elif defined(__SSE4_1__)
#  define to_bool(cmp) __builtin_ia32_ptestc128(cmp, (vdi_t){} == 0)
# elif defined(__SSE__) && ELEM_SIZE == 4
#  define to_bool(cmp) (__builtin_ia32_movmskps(cmp) == 0xf)
# elif defined(__SSE2__)
#  if ELEM_SIZE == 8
#   define to_bool(cmp) (__builtin_ia32_movmskpd(cmp) == 3)
#  else
#   define to_bool(cmp) (__builtin_ia32_pmovmskb128(cmp) == 0xffff)
#  endif
# endif
#elif VEC_SIZE == 32
# if defined(__AVX__) && ELEM_SIZE == 4
#  define to_bool(cmp) (__builtin_ia32_movmskps256(cmp) == 0xff)
# elif defined(__AVX__) && ELEM_SIZE == 8
#  define to_bool(cmp) (__builtin_ia32_movmskpd256(cmp) == 0xf)
# endif
#endif
97 
98 #ifndef to_bool
_to_bool(byte_vec_t bv)99 static inline bool _to_bool(byte_vec_t bv)
100 {
101     unsigned int i;
102 
103     for ( i = 0; i < VEC_SIZE; ++i )
104         if ( bv[i] != 0xff )
105             return false;
106 
107     return true;
108 }
109 # define to_bool(cmp) _to_bool((byte_vec_t)(cmp))
110 #endif
111 
/*
 * to_int(x) - for floating point vectors, convert the elements to integer
 * and back again, yielding a vector of the original floating point type.
 * The single-element case uses a plain C cast of element 0; the 16- and
 * 32-byte cases chain the respective conversion builtins. Left undefined
 * for configurations not covered below.
 */
#if VEC_SIZE == FLOAT_SIZE
# define to_int(x) ((vec_t){ (int)(x)[0] })
#elif VEC_SIZE == 16 && defined(__SSE2__)
# if FLOAT_SIZE == 4
#  define to_int(x) __builtin_ia32_cvtdq2ps(__builtin_ia32_cvtps2dq(x))
# elif FLOAT_SIZE == 8
#  define to_int(x) __builtin_ia32_cvtdq2pd(__builtin_ia32_cvtpd2dq(x))
# endif
#elif VEC_SIZE == 32 && defined(__AVX__)
# if FLOAT_SIZE == 4
#  define to_int(x) __builtin_ia32_cvtdq2ps256(__builtin_ia32_cvtps2dq256(x))
# elif FLOAT_SIZE == 8
#  define to_int(x) __builtin_ia32_cvtdq2pd256(__builtin_ia32_cvtpd2dq256(x))
# endif
#endif

/*
 * scalar_1op(x, op) - single-element vectors only: run the assembly template
 * "op" with x as a memory input ([in]) and a 16-byte vector register as the
 * early-clobber output ([out]), then return element 0 of the output wrapped
 * in a vec_t.
 */
#if VEC_SIZE == FLOAT_SIZE
# define scalar_1op(x, op) ({ \
    typeof((x)[0]) __attribute__((vector_size(16))) r_; \
    asm ( op : [out] "=&x" (r_) : [in] "m" (x) ); \
    (vec_t){ r_[0] }; \
})
#endif
135 
/*
 * Floating point operation wrappers (broadcast, interleave_{hi,lo}, max,
 * min, recip, rsqrt, sqrt, swap, swap2), selected by element size, vector
 * size, and the enabled ISA extensions. Each test in simd_test() is guarded
 * by #ifdef on the respective macro, so only what is defined here is run.
 *
 * Points worth noting:
 *  - swap2() is not self-contained: it references a variable named "inv"
 *    which must exist in the scope where the macro is expanded.
 *  - For double precision, recip() and rsqrt() are built by converting to
 *    single precision, applying the single precision operation, and
 *    converting back.
 *  - The single-element variants are implemented via scalar_1op().
 */
#if FLOAT_SIZE == 4 && defined(__SSE__)
# if VEC_SIZE == 32 && defined(__AVX__)
#  define broadcast(x) ({ float t_ = (x); __builtin_ia32_vbroadcastss256(&t_); })
#  define max(x, y) __builtin_ia32_maxps256(x, y)
#  define min(x, y) __builtin_ia32_minps256(x, y)
#  define recip(x) __builtin_ia32_rcpps256(x)
#  define rsqrt(x) __builtin_ia32_rsqrtps256(x)
#  define sqrt(x) __builtin_ia32_sqrtps256(x)
#  define swap(x) ({ \
    vec_t t_ = __builtin_ia32_vpermilps256(x, 0b00011011); \
    __builtin_ia32_vperm2f128_ps256(t_, t_, 0b00000001); \
})
#  define swap2(x) ({ \
    vec_t t_ = __builtin_ia32_vpermilvarps256(x, __builtin_ia32_cvtps2dq256(inv) - 1); \
    __builtin_ia32_vperm2f128_ps256(t_, t_, 0b00000001); \
})
# elif VEC_SIZE == 16
#  ifdef __AVX__
#   define broadcast(x) ({ float t_ = (x); __builtin_ia32_vbroadcastss(&t_); })
#  endif
#  define interleave_hi(x, y) __builtin_ia32_unpckhps(x, y)
#  define interleave_lo(x, y) __builtin_ia32_unpcklps(x, y)
#  define max(x, y) __builtin_ia32_maxps(x, y)
#  define min(x, y) __builtin_ia32_minps(x, y)
#  define recip(x) __builtin_ia32_rcpps(x)
#  define rsqrt(x) __builtin_ia32_rsqrtps(x)
#  define sqrt(x) __builtin_ia32_sqrtps(x)
#  define swap(x) __builtin_ia32_shufps(x, x, 0b00011011)
#  ifdef __AVX__
#   define swap2(x) __builtin_ia32_vpermilvarps(x, __builtin_ia32_cvtps2dq(inv) - 1)
#  endif
# elif VEC_SIZE == 4
#  define recip(x) scalar_1op(x, "rcpss %[in], %[out]")
#  define rsqrt(x) scalar_1op(x, "rsqrtss %[in], %[out]")
#  define sqrt(x) scalar_1op(x, "sqrtss %[in], %[out]")
# endif
#elif FLOAT_SIZE == 8 && defined(__SSE2__)
# if VEC_SIZE == 32 && defined(__AVX__)
#  define broadcast(x) ({ double t_ = (x); __builtin_ia32_vbroadcastsd256(&t_); })
#  define max(x, y) __builtin_ia32_maxpd256(x, y)
#  define min(x, y) __builtin_ia32_minpd256(x, y)
#  define recip(x) ({ \
    float __attribute__((vector_size(16))) t_ = __builtin_ia32_cvtpd2ps256(x); \
    t_ = __builtin_ia32_vextractf128_ps256( \
             __builtin_ia32_rcpps256( \
                 __builtin_ia32_vbroadcastf128_ps256(&t_)), 0); \
    __builtin_ia32_cvtps2pd256(t_); \
})
#  define rsqrt(x) ({ \
    float __attribute__((vector_size(16))) t1_ = __builtin_ia32_cvtpd2ps256(x); \
    float __attribute__((vector_size(32))) t2_ = __builtin_ia32_vinsertf128_ps256((typeof(t2_)){}, t1_, 0); \
    t2_ = __builtin_ia32_vinsertf128_ps256(t2_, t1_, 1); \
    t1_ = __builtin_ia32_vextractf128_ps256(__builtin_ia32_rsqrtps256(t2_), 0); \
    __builtin_ia32_cvtps2pd256(t1_); \
})
#  define sqrt(x) __builtin_ia32_sqrtpd256(x)
#  define swap(x) ({ \
    vec_t t_ = __builtin_ia32_vpermilpd256(x, 0b00000101); \
    __builtin_ia32_vperm2f128_pd256(t_, t_, 0b00000001); \
})
# elif VEC_SIZE == 16
#  define interleave_hi(x, y) __builtin_ia32_unpckhpd(x, y)
#  define interleave_lo(x, y) __builtin_ia32_unpcklpd(x, y)
#  define max(x, y) __builtin_ia32_maxpd(x, y)
#  define min(x, y) __builtin_ia32_minpd(x, y)
#  define recip(x) __builtin_ia32_cvtps2pd(__builtin_ia32_rcpps(__builtin_ia32_cvtpd2ps(x)))
#  define rsqrt(x) __builtin_ia32_cvtps2pd(__builtin_ia32_rsqrtps(__builtin_ia32_cvtpd2ps(x)))
#  define sqrt(x) __builtin_ia32_sqrtpd(x)
#  define swap(x) __builtin_ia32_shufpd(x, x, 0b01)
#  ifdef __AVX__
#   define swap2(x) __builtin_ia32_vpermilvarpd(x, __builtin_ia32_pmovsxdq128( \
                                                       __builtin_ia32_cvtpd2dq(inv) - 1) << 1)
#  endif
# elif VEC_SIZE == 8
#  define recip(x) scalar_1op(x, "cvtsd2ss %[in], %[out]; rcpss %[out], %[out]; cvtss2sd %[out], %[out]")
#  define rsqrt(x) scalar_1op(x, "cvtsd2ss %[in], %[out]; rsqrtss %[out], %[out]; cvtss2sd %[out], %[out]")
#  define sqrt(x) scalar_1op(x, "sqrtsd %[in], %[out]")
# endif
#endif
/*
 * 16-byte vectors with SSE2: integer wrappers (interleave_{hi,lo}, swap,
 * max, min, mul_hi, mul_full), keyed by element size and signedness. The
 * builtins want the plain vqi_t / vhi_t / vsi_t / vdi_t types, hence the
 * casts on arguments and results where vec_t may differ from those.
 *
 * select(d, x, y, m) is defined here for every element type: it performs
 * two masked byte stores to d - x under mask m, then y under the inverted
 * mask. Further down it may be replaced by a blend based variant.
 */
#if VEC_SIZE == 16 && defined(__SSE2__)
# if INT_SIZE == 1 || UINT_SIZE == 1
#  define interleave_hi(x, y) ((vec_t)__builtin_ia32_punpckhbw128((vqi_t)(x), (vqi_t)(y)))
#  define interleave_lo(x, y) ((vec_t)__builtin_ia32_punpcklbw128((vqi_t)(x), (vqi_t)(y)))
# elif INT_SIZE == 2 || UINT_SIZE == 2
#  define interleave_hi(x, y) ((vec_t)__builtin_ia32_punpckhwd128((vhi_t)(x), (vhi_t)(y)))
#  define interleave_lo(x, y) ((vec_t)__builtin_ia32_punpcklwd128((vhi_t)(x), (vhi_t)(y)))
#  define swap(x) ((vec_t)__builtin_ia32_pshufd( \
                   (vsi_t)__builtin_ia32_pshufhw( \
                          __builtin_ia32_pshuflw((vhi_t)(x), 0b00011011), 0b00011011), 0b01001110))
# elif INT_SIZE == 4 || UINT_SIZE == 4
#  define interleave_hi(x, y) ((vec_t)__builtin_ia32_punpckhdq128((vsi_t)(x), (vsi_t)(y)))
#  define interleave_lo(x, y) ((vec_t)__builtin_ia32_punpckldq128((vsi_t)(x), (vsi_t)(y)))
#  define swap(x) ((vec_t)__builtin_ia32_pshufd((vsi_t)(x), 0b00011011))
# elif INT_SIZE == 8 || UINT_SIZE == 8
#  define interleave_hi(x, y) ((vec_t)__builtin_ia32_punpckhqdq128((vdi_t)(x), (vdi_t)(y)))
#  define interleave_lo(x, y) ((vec_t)__builtin_ia32_punpcklqdq128((vdi_t)(x), (vdi_t)(y)))
#  define swap(x) ((vec_t)__builtin_ia32_pshufd((vsi_t)(x), 0b01001110))
# endif
# if UINT_SIZE == 1
#  define max(x, y) ((vec_t)__builtin_ia32_pmaxub128((vqi_t)(x), (vqi_t)(y)))
#  define min(x, y) ((vec_t)__builtin_ia32_pminub128((vqi_t)(x), (vqi_t)(y)))
# elif INT_SIZE == 2
#  define max(x, y) __builtin_ia32_pmaxsw128(x, y)
#  define min(x, y) __builtin_ia32_pminsw128(x, y)
#  define mul_hi(x, y) __builtin_ia32_pmulhw128(x, y)
# elif UINT_SIZE == 2
#  define mul_hi(x, y) ((vec_t)__builtin_ia32_pmulhuw128((vhi_t)(x), (vhi_t)(y)))
# elif UINT_SIZE == 4
#  define mul_full(x, y) ((vec_t)__builtin_ia32_pmuludq128((vsi_t)(x), (vsi_t)(y)))
# endif
# define select(d, x, y, m) ({ \
    void *d_ = (d); \
    vqi_t m_ = (vqi_t)(m); \
    __builtin_ia32_maskmovdqu((vqi_t)(x),  m_, d_); \
    __builtin_ia32_maskmovdqu((vqi_t)(y), ~m_, d_); \
})
#endif
/*
 * Floating point addsub / dup_hi / dup_lo / hadd / hsub wrappers: SSE3 for
 * 16-byte vectors, AVX for 32-byte ones.
 *
 * The 16-byte double precision dup_lo() uses inline assembly with element 0
 * of x as a memory operand instead of a builtin.
 *
 * For 32-byte vectors hadd() / hsub() re-order the elements of the builtin's
 * result. Presumably this is to present the same element order as a
 * full-width horizontal operation would, given that the 256-bit instructions
 * operate on each 128-bit half separately - TODO confirm against the
 * instruction reference.
 */
#if VEC_SIZE == 16 && defined(__SSE3__)
# if FLOAT_SIZE == 4
#  define addsub(x, y) __builtin_ia32_addsubps(x, y)
#  define dup_hi(x) __builtin_ia32_movshdup(x)
#  define dup_lo(x) __builtin_ia32_movsldup(x)
#  define hadd(x, y) __builtin_ia32_haddps(x, y)
#  define hsub(x, y) __builtin_ia32_hsubps(x, y)
# elif FLOAT_SIZE == 8
#  define addsub(x, y) __builtin_ia32_addsubpd(x, y)
#  define dup_lo(x) ({ \
    double __attribute__((vector_size(16))) r_; \
    asm ( "movddup %1,%0" : "=x" (r_) : "m" ((x)[0]) ); \
    r_; \
})
#  define hadd(x, y) __builtin_ia32_haddpd(x, y)
#  define hsub(x, y) __builtin_ia32_hsubpd(x, y)
# endif
#elif VEC_SIZE == 32 && defined(__AVX__)
# if FLOAT_SIZE == 4
#  define addsub(x, y) __builtin_ia32_addsubps256(x, y)
#  define dup_hi(x) __builtin_ia32_movshdup256(x)
#  define dup_lo(x) __builtin_ia32_movsldup256(x)
#  define hadd(x, y) ({ \
        vec_t t_ = __builtin_ia32_haddps256(x, y); \
        (vec_t){t_[0], t_[1], t_[4], t_[5], t_[2], t_[3], t_[6], t_[7]}; \
})
#  define hsub(x, y) ({ \
        vec_t t_ = __builtin_ia32_hsubps256(x, y); \
        (vec_t){t_[0], t_[1], t_[4], t_[5], t_[2], t_[3], t_[6], t_[7]}; \
})
# elif FLOAT_SIZE == 8
#  define addsub(x, y) __builtin_ia32_addsubpd256(x, y)
#  define dup_lo(x) __builtin_ia32_movddup256(x)
#  define hadd(x, y) ({ \
        vec_t t_ = __builtin_ia32_haddpd256(x, y); \
        (vec_t){t_[0], t_[2], t_[1], t_[3]}; \
})
#  define hsub(x, y) ({ \
        vec_t t_ = __builtin_ia32_hsubpd256(x, y); \
        (vec_t){t_[0], t_[2], t_[1], t_[3]}; \
})
# endif
#endif
/*
 * 16-byte vectors with SSSE3: abs (signed types only), copysignz, hadd,
 * hsub, and rotr for integer elements.
 *
 *  - rotr(x, n) passes x as both inputs of the palignr builtin; n is scaled
 *    by the element width in bits, i.e. n counts elements.
 *  - The byte-element swap() is not self-contained: it references a variable
 *    named "inv" which must exist in the scope where the macro is expanded.
 */
#if VEC_SIZE == 16 && defined(__SSSE3__)
# if INT_SIZE == 1
#  define abs(x) ((vec_t)__builtin_ia32_pabsb128((vqi_t)(x)))
# elif INT_SIZE == 2
#  define abs(x) __builtin_ia32_pabsw128(x)
# elif INT_SIZE == 4
#  define abs(x) __builtin_ia32_pabsd128(x)
# endif
# if INT_SIZE == 1 || UINT_SIZE == 1
#  define copysignz(x, y) ((vec_t)__builtin_ia32_psignb128((vqi_t)(x), (vqi_t)(y)))
#  define swap(x) ((vec_t)__builtin_ia32_pshufb128((vqi_t)(x), (vqi_t)(inv - 1)))
#  define rotr(x, n) ((vec_t)__builtin_ia32_palignr128((vdi_t)(x), (vdi_t)(x), (n) * 8))
# elif INT_SIZE == 2 || UINT_SIZE == 2
#  define copysignz(x, y) ((vec_t)__builtin_ia32_psignw128((vhi_t)(x), (vhi_t)(y)))
#  define hadd(x, y) ((vec_t)__builtin_ia32_phaddw128((vhi_t)(x), (vhi_t)(y)))
#  define hsub(x, y) ((vec_t)__builtin_ia32_phsubw128((vhi_t)(x), (vhi_t)(y)))
#  define rotr(x, n) ((vec_t)__builtin_ia32_palignr128((vdi_t)(x), (vdi_t)(x), (n) * 16))
# elif INT_SIZE == 4 || UINT_SIZE == 4
#  define copysignz(x, y) ((vec_t)__builtin_ia32_psignd128((vsi_t)(x), (vsi_t)(y)))
#  define hadd(x, y) ((vec_t)__builtin_ia32_phaddd128((vsi_t)(x), (vsi_t)(y)))
#  define hsub(x, y) ((vec_t)__builtin_ia32_phsubd128((vsi_t)(x), (vsi_t)(y)))
#  define rotr(x, n) ((vec_t)__builtin_ia32_palignr128((vdi_t)(x), (vdi_t)(x), (n) * 32))
# elif INT_SIZE == 8 || UINT_SIZE == 8
#  define rotr(x, n) ((vec_t)__builtin_ia32_palignr128((vdi_t)(x), (vdi_t)(x), (n) * 64))
# endif
#endif
/*
 * 16-byte vectors with SSE4.1:
 *  - max / min for the integer types not handled by the SSE2 section,
 *    mul_full for signed 4-byte elements, and widen1/2/3 (sign extending for
 *    signed, zero extending for unsigned element types);
 *  - select(): the masked-store based definition from the SSE2 section is
 *    dropped in favour of a variable blend, which assigns the result to *d;
 *  - dot_product() and trunc() for floating point types;
 *  - mix(x, y): an immediate-controlled blend. The masks are chosen such
 *    that even-numbered elements come from x and odd-numbered ones from y
 *    for every element size.
 */
#if VEC_SIZE == 16 && defined(__SSE4_1__)
# if INT_SIZE == 1
#  define max(x, y) ((vec_t)__builtin_ia32_pmaxsb128((vqi_t)(x), (vqi_t)(y)))
#  define min(x, y) ((vec_t)__builtin_ia32_pminsb128((vqi_t)(x), (vqi_t)(y)))
#  define widen1(x) ((vec_t)__builtin_ia32_pmovsxbw128((vqi_t)(x)))
#  define widen2(x) ((vec_t)__builtin_ia32_pmovsxbd128((vqi_t)(x)))
#  define widen3(x) ((vec_t)__builtin_ia32_pmovsxbq128((vqi_t)(x)))
# elif INT_SIZE == 2
#  define widen1(x) ((vec_t)__builtin_ia32_pmovsxwd128(x))
#  define widen2(x) ((vec_t)__builtin_ia32_pmovsxwq128(x))
# elif INT_SIZE == 4
#  define max(x, y) __builtin_ia32_pmaxsd128(x, y)
#  define min(x, y) __builtin_ia32_pminsd128(x, y)
#  define mul_full(x, y) ((vec_t)__builtin_ia32_pmuldq128(x, y))
#  define widen1(x) ((vec_t)__builtin_ia32_pmovsxdq128(x))
# elif UINT_SIZE == 1
#  define widen1(x) ((vec_t)__builtin_ia32_pmovzxbw128((vqi_t)(x)))
#  define widen2(x) ((vec_t)__builtin_ia32_pmovzxbd128((vqi_t)(x)))
#  define widen3(x) ((vec_t)__builtin_ia32_pmovzxbq128((vqi_t)(x)))
# elif UINT_SIZE == 2
#  define max(x, y) ((vec_t)__builtin_ia32_pmaxuw128((vhi_t)(x), (vhi_t)(y)))
#  define min(x, y) ((vec_t)__builtin_ia32_pminuw128((vhi_t)(x), (vhi_t)(y)))
#  define widen1(x) ((vec_t)__builtin_ia32_pmovzxwd128((vhi_t)(x)))
#  define widen2(x) ((vec_t)__builtin_ia32_pmovzxwq128((vhi_t)(x)))
# elif UINT_SIZE == 4
#  define max(x, y) ((vec_t)__builtin_ia32_pmaxud128((vsi_t)(x), (vsi_t)(y)))
#  define min(x, y) ((vec_t)__builtin_ia32_pminud128((vsi_t)(x), (vsi_t)(y)))
#  define widen1(x) ((vec_t)__builtin_ia32_pmovzxdq128((vsi_t)(x)))
# endif
# undef select
# if defined(INT_SIZE) || defined(UINT_SIZE)
#  define select(d, x, y, m) \
    (*(d) = (vec_t)__builtin_ia32_pblendvb128((vqi_t)(y), (vqi_t)(x), (vqi_t)(m)))
# elif FLOAT_SIZE == 4
#  define dot_product(x, y) __builtin_ia32_dpps(x, y, 0b11110001)
#  define select(d, x, y, m) (*(d) = __builtin_ia32_blendvps(y, x, m))
#  define trunc(x) __builtin_ia32_roundps(x, 0b1011)
# elif FLOAT_SIZE == 8
#  define dot_product(x, y) __builtin_ia32_dppd(x, y, 0b00110001)
#  define select(d, x, y, m) (*(d) = __builtin_ia32_blendvpd(y, x, m))
#  define trunc(x) __builtin_ia32_roundpd(x, 0b1011)
# endif
# if INT_SIZE == 2 || UINT_SIZE == 2
#  define mix(x, y) ((vec_t)__builtin_ia32_pblendw128((vhi_t)(x), (vhi_t)(y), 0b10101010))
# elif INT_SIZE == 4 || UINT_SIZE == 4
#  define mix(x, y) ((vec_t)__builtin_ia32_pblendw128((vhi_t)(x), (vhi_t)(y), 0b11001100))
# elif INT_SIZE == 8 || UINT_SIZE == 8
#  define mix(x, y) ((vec_t)__builtin_ia32_pblendw128((vhi_t)(x), (vhi_t)(y), 0b11110000))
# elif FLOAT_SIZE == 4
#  define mix(x, y) __builtin_ia32_blendps(x, y, 0b1010)
# elif FLOAT_SIZE == 8
#  define mix(x, y) __builtin_ia32_blendpd(x, y, 0b10)
# endif
#endif
/*
 * 32-byte floating point vectors with AVX: dot_product (single precision
 * only), mix, select, select2, and trunc.
 *
 *  - dot_product() adds elements 0 and 4 of the builtin's result and returns
 *    the sum in element 0 of an otherwise zero vector.
 *  - select() assigns a variable blend of y and x (controlled by m) to *d.
 *  - select2() yields the same kind of result via a masked load of x into *d
 *    followed by a masked store of y using the inverted mask. Since it takes
 *    the address of x, x has to be an lvalue.
 */
#if VEC_SIZE == 32 && defined(__AVX__)
# if FLOAT_SIZE == 4
#  define dot_product(x, y) ({ \
    vec_t t_ = __builtin_ia32_dpps256(x, y, 0b11110001); \
    (vec_t){t_[0] + t_[4]}; \
})
#  define mix(x, y) __builtin_ia32_blendps256(x, y, 0b10101010)
#  define select(d, x, y, m) (*(d) = __builtin_ia32_blendvps256(y, x, m))
#  define select2(d, x, y, m) ({ \
    vsi_t m_ = (vsi_t)(m); \
    *(d) = __builtin_ia32_maskloadps256(&(x),  m_); \
    __builtin_ia32_maskstoreps256(d, ~m_, y); \
})
#  define trunc(x) __builtin_ia32_roundps256(x, 0b1011)
# elif FLOAT_SIZE == 8
#  define mix(x, y) __builtin_ia32_blendpd256(x, y, 0b1010)
#  define select(d, x, y, m) (*(d) = __builtin_ia32_blendvpd256(y, x, m))
#  define select2(d, x, y, m) ({ \
    vdi_t m_ = (vdi_t)(m); \
    *(d) = __builtin_ia32_maskloadpd256(&(x),  m_); \
    __builtin_ia32_maskstorepd256(d, ~m_, y); \
})
#  define trunc(x) __builtin_ia32_roundpd256(x, 0b1011)
# endif
#endif
/*
 * Single-element floating point vectors: max() and min() are open-coded on
 * element 0 (each argument is evaluated exactly once into a local), and,
 * with SSE4.1, trunc() is implemented via the scalar round instruction with
 * x as a memory operand.
 *
 * The macro argument is parenthesized inside typeof() as well, so that
 * argument expressions other than plain identifiers get indexed as a whole.
 */
#if VEC_SIZE == FLOAT_SIZE
# define max(x, y) ((vec_t){({ typeof((x)[0]) x_ = (x)[0], y_ = (y)[0]; x_ > y_ ? x_ : y_; })})
# define min(x, y) ((vec_t){({ typeof((x)[0]) x_ = (x)[0], y_ = (y)[0]; x_ < y_ ? x_ : y_; })})
# ifdef __SSE4_1__
#  if FLOAT_SIZE == 4
#   define trunc(x) ({ \
    float __attribute__((vector_size(16))) r_; \
    asm ( "roundss $0b1011,%1,%0" : "=x" (r_) : "m" (x) ); \
    (vec_t){ r_[0] }; \
})
#  elif FLOAT_SIZE == 8
#   define trunc(x) ({ \
    double __attribute__((vector_size(16))) r_; \
    asm ( "roundsd $0b1011,%1,%0" : "=x" (r_) : "m" (x) ); \
    (vec_t){ r_[0] }; \
})
#  endif
# endif
#endif
420 
/*
 * Suppress value propagation by the compiler, preventing unwanted
 * optimization. At the same time this makes the compiler use memory
 * operands more often, which for our purposes is the more interesting case.
 *
 * The empty asm declares var as a read-write memory operand, so the compiler
 * has to assume that its value may have changed afterwards.
 */
#define touch(var) asm volatile ( "" : "+m" (var) )
427 
/*
 * Exercise the vector operations available for vec_t in the current
 * configuration and validate every result.
 *
 * Returns 0 if all checks pass; otherwise the source line number (__LINE__)
 * of the first failing check.
 *
 * Test data, for element index i:
 *   src[i] = i + 1
 *   inv[i] = ELEM_COUNT - i        (hence src + inv == ELEM_COUNT + 1)
 *   alt[i] = 1 / -1 for even / odd i (signed and floating point types), or
 *            all ones / 0 for even / odd i (unsigned types)
 *   sh[i]  = per-element shift count: starts at one less than the element
 *            width in bits and drops by one whenever i + 1 is a power of two
 *
 * The touch() invocations are deliberate; see the comment at its definition.
 */
int simd_test(void)
{
    unsigned int i, j;
    vec_t x, y, z, src, inv, alt, sh;

    for ( i = 0, j = ELEM_SIZE << 3; i < ELEM_COUNT; ++i )
    {
        src[i] = i + 1;
        inv[i] = ELEM_COUNT - i;
#ifdef UINT_SIZE
        alt[i] = -!(i & 1);
#else
        alt[i] = i & 1 ? -1 : 1;
#endif
        if ( !(i & (i + 1)) )
            --j;
        sh[i] = j;
    }

    /* Plain copy, addition, and subtraction. */
    touch(src);
    x = src;
    touch(x);
    if ( !to_bool(x == src) ) return __LINE__;

    touch(src);
    y = x + src;
    touch(src);
    touch(y);
    if ( !to_bool(y == 2 * src) ) return __LINE__;

    touch(src);
    z = y -= src;
    touch(z);
    if ( !to_bool(x == z) ) return __LINE__;

#if defined(UINT_SIZE)

    /* Bitwise ops: with x == y == z on entry, (x | inv) & ~(y & inv) == z ^ inv. */
    touch(inv);
    x |= inv;
    touch(inv);
    y &= inv;
    touch(inv);
    z ^= inv;
    touch(inv);
    touch(x);
    if ( !to_bool((x & ~y) == z) ) return __LINE__;

#elif ELEM_SIZE > 1 || VEC_SIZE <= 8

    /* Multiplication: src^2 - inv^2 == (src + inv) * (src - inv). */
    touch(src);
    x *= src;
    y = inv * inv;
    touch(src);
    z = src + inv;
    touch(inv);
    z *= (src - inv);
    if ( !to_bool(x - y == z) ) return __LINE__;

#endif

#if defined(FLOAT_SIZE)

    /* With alt being +/-1, multiplying and dividing by it must agree. */
    x = src * alt;
    touch(alt);
    y = src / alt;
    if ( !to_bool(x == y) ) return __LINE__;
    touch(alt);
    touch(src);
    if ( !to_bool(x * -alt == -src) ) return __LINE__;

# if defined(recip) && defined(to_int)

    /* Results are passed through to_int() before being compared. */
    touch(src);
    x = recip(src);
    touch(src);
    touch(x);
    if ( !to_bool(to_int(recip(x)) == src) ) return __LINE__;

#  ifdef rsqrt
    x = src * src;
    touch(x);
    y = rsqrt(x);
    touch(y);
    if ( !to_bool(to_int(recip(y)) == src) ) return __LINE__;
    touch(src);
    if ( !to_bool(to_int(y) == to_int(recip(src))) ) return __LINE__;
#  endif

# endif

# ifdef sqrt
    x = src * src;
    touch(x);
    if ( !to_bool(sqrt(x) == src) ) return __LINE__;
# endif

# ifdef trunc
    /* 1 / src is 1 for element 0 and a proper fraction for all others. */
    x = 1 / src;
    y = (vec_t){ 1 };
    touch(x);
    z = trunc(x);
    if ( !to_bool(y == z) ) return __LINE__;
# endif

#else

# if ELEM_SIZE > 1

    /* src * inv, checked against a sequence built up by plain additions. */
    touch(inv);
    x = src * inv;
    touch(inv);
    y[ELEM_COUNT - 1] = y[0] = j = ELEM_COUNT;
    for ( i = 1; i < ELEM_COUNT / 2; ++i )
        y[ELEM_COUNT - i - 1] = y[i] = y[i - 1] + (j -= 2);
    if ( !to_bool(x == y) ) return __LINE__;

#  ifdef mul_hi
    touch(alt);
    x = mul_hi(src, alt);
    touch(alt);
#   ifdef INT_SIZE
    if ( !to_bool(x == (alt < 0)) ) return __LINE__;
#   else
    if ( !to_bool(x == (src & alt) + alt) ) return __LINE__;
#   endif
#  endif

#  ifdef mul_full
    x = src ^ alt;
    touch(inv);
    y = mul_full(x, inv);
    touch(inv);
    /* Reference: full-width products of the even elements, split in two. */
    for ( i = 0; i < ELEM_COUNT; i += 2 )
    {
        unsigned long long res = x[i] * 1ULL * inv[i];

        z[i] = res;
        z[i + 1] = res >> (ELEM_SIZE << 3);
    }
    if ( !to_bool(y == z) ) return __LINE__;
#  endif

    /* Shifts by constant counts. */
    z = src;
#  ifdef INT_SIZE
    z *= alt;
#  endif
    touch(z);
    x = z << 3;
    touch(z);
    y = z << 2;
    touch(z);
    if ( !to_bool(x == y + y) ) return __LINE__;

    touch(x);
    z = x >> 2;
    touch(x);
    if ( !to_bool(y == z + z) ) return __LINE__;

    z = src;
#  ifdef INT_SIZE
    z *= alt;
#  endif
    /*
     * Note that despite the touch()-es here there doesn't appear to be a way
     * to make the compiler use a memory operand for the shift instruction (at
     * least without resorting to built-ins).
     */
    j = 3;
    touch(j);
    x = z << j;
    touch(j);
    j = 2;
    touch(j);
    y = z << j;
    touch(j);
    if ( !to_bool(x == y + y) ) return __LINE__;

    z = x >> j;
    touch(j);
    if ( !to_bool(y == z + z) ) return __LINE__;

# endif

# if ELEM_SIZE == 2 || defined(__SSE4_1__)
    /*
     * Even when there are no instructions with varying shift counts per
     * field, the code turns out to be a nice exercise for pextr/pinsr.
     */
    z = src;
#  ifdef INT_SIZE
    z *= alt;
#  endif
    /*
     * Zap elements for which the shift count is not positive (and hence the
     * decrement below would yield a negative count).
     */
    z &= (sh > 0);
    touch(sh);
    x = z << sh;
    touch(sh);
    --sh;
    touch(sh);
    y = z << sh;
    touch(sh);
    if ( !to_bool(x == y + y) ) return __LINE__;

# endif

#endif

#if defined(max) && defined(min)
    /* min + max of two values must equal their sum. */
# ifdef UINT_SIZE
    touch(inv);
    x = min(src, inv);
    touch(inv);
    y = max(src, inv);
    touch(inv);
    if ( !to_bool(x + y == src + inv) ) return __LINE__;
# else
    x = src * alt;
    y = inv * alt;
    touch(y);
    z = max(x, y);
    touch(y);
    y = min(x, y);
    touch(y);
    if ( !to_bool((y + z) * alt == src + inv) ) return __LINE__;
# endif
#endif

#ifdef abs
    x = src * alt;
    touch(x);
    if ( !to_bool(abs(x) == src) ) return __LINE__;
#endif

#ifdef copysignz
    touch(alt);
    if ( !to_bool(copysignz((vec_t){} + 1, alt) == alt) ) return __LINE__;
#endif

    /* Reversing the element order of src must produce inv. */
#ifdef swap
    touch(src);
    if ( !to_bool(swap(src) == inv) ) return __LINE__;
#endif

#ifdef swap2
    touch(src);
    if ( !to_bool(swap2(src) == inv) ) return __LINE__;
#endif

#if defined(broadcast)
    if ( !to_bool(broadcast(ELEM_COUNT + 1) == src + inv) ) return __LINE__;
#endif

#if defined(interleave_lo) && defined(interleave_hi)
    touch(src);
    x = interleave_lo(inv, src);
    touch(src);
    y = interleave_hi(inv, src);
    touch(src);
    /* x - y alternates in sign; undo that via alt before comparing. */
# ifdef UINT_SIZE
    z = ((x - y) ^ ~alt) - ~alt;
# else
    z = (x - y) * alt;
# endif
    if ( !to_bool(z == ELEM_COUNT / 2) ) return __LINE__;
#endif

#if defined(INT_SIZE) && defined(widen1) && defined(interleave_lo)

    /* Sign extension: reference built by interleaving with the sign mask. */
    x = src * alt;
    y = interleave_lo(x, alt < 0);
    touch(x);
    z = widen1(x);
    touch(x);
    if ( !to_bool(z == y) ) return __LINE__;

# ifdef widen2
    y = interleave_lo(alt < 0, alt < 0);
    y = interleave_lo(z, y);
    touch(x);
    z = widen2(x);
    touch(x);
    if ( !to_bool(z == y) ) return __LINE__;

#  ifdef widen3
    y = interleave_lo(alt < 0, alt < 0);
    y = interleave_lo(y, y);
    y = interleave_lo(z, y);
    touch(x);
    z = widen3(x);
    touch(x);
    if ( !to_bool(z == y) ) return __LINE__;
#  endif
# endif

#endif

#if defined(UINT_SIZE) && defined(interleave_lo)

    /* Zero extension: reference built by interleaving with zero. */
    y = interleave_lo(src, (vec_t){});
    z = interleave_lo(y, (vec_t){});

# ifdef widen1
    touch(src);
    x = widen1(src);
    touch(src);
    if ( !to_bool(x == y) ) return __LINE__;
# endif

# ifdef widen2
    touch(src);
    x = widen2(src);
    touch(src);
    if ( !to_bool(x == z) ) return __LINE__;
# endif

# ifdef widen3
    touch(src);
    x = widen3(src);
    touch(src);
    if ( !to_bool(x == interleave_lo(z, (vec_t){})) ) return __LINE__;
# endif

#endif

#ifdef dup_lo
    touch(src);
    x = dup_lo(src);
    touch(src);
    if ( !to_bool(x - src == (alt - 1) / 2) ) return __LINE__;
#endif

#ifdef dup_hi
    touch(src);
    x = dup_hi(src);
    touch(src);
    if ( !to_bool(x - src == (alt + 1) / 2) ) return __LINE__;
#endif

    /* Reference for select / select2 / mix: src in even, inv in odd slots. */
    for ( i = 0; i < ELEM_COUNT; ++i )
        y[i] = (i & 1 ? inv : src)[i];

#ifdef select
# ifdef UINT_SIZE
    select(&z, src, inv, alt);
# else
    select(&z, src, inv, alt > 0);
# endif
    if ( !to_bool(z == y) ) return __LINE__;
#endif

#ifdef select2
# ifdef UINT_SIZE
    select2(&z, src, inv, alt);
# else
    select2(&z, src, inv, alt > 0);
# endif
    if ( !to_bool(z == y) ) return __LINE__;
#endif

#ifdef mix
    touch(src);
    touch(inv);
    x = mix(src, inv);
    if ( !to_bool(x == y) ) return __LINE__;

# ifdef addsub
    touch(src);
    touch(inv);
    x = addsub(src, inv);
    touch(src);
    touch(inv);
    y = mix(src - inv, src + inv);
    if ( !to_bool(x == y) ) return __LINE__;
# endif
#endif

#ifdef rotr
    x = rotr(src, 1);
    y = (src & (ELEM_COUNT - 1)) + 1;
    if ( !to_bool(x == y) ) return __LINE__;
#endif

#ifdef dot_product
    touch(src);
    touch(inv);
    x = dot_product(src, inv);
    if ( !to_bool(x == (vec_t){ (ELEM_COUNT * (ELEM_COUNT + 1) *
                                 (ELEM_COUNT + 2)) / 6 }) ) return __LINE__;
#endif

#ifdef hadd
    /* Repeated horizontal adds leave the sum of src in the last element. */
    x = src;
    for ( i = ELEM_COUNT; i >>= 1; )
    {
        touch(x);
        x = hadd((vec_t){}, x);
    }
    if ( x[ELEM_COUNT - 1] != (ELEM_COUNT * (ELEM_COUNT + 1)) / 2 ) return __LINE__;

# ifdef hsub
    touch(src);
    touch(inv);
    x = hsub(src, inv);
    for ( i = ELEM_COUNT; i >>= 1; )
        x = hadd(x, (vec_t){});
    if ( !to_bool(x == 0) ) return __LINE__;
# endif
#endif


    return 0;
}
843