#include <stdbool.h>

3 asm (
4 "\t.text\n"
5 "\t.globl _start\n"
6 "_start:\n"
7 #if defined(__i386__) && VEC_SIZE == 16
8 "\tpush %ebp\n"
9 "\tmov %esp,%ebp\n"
10 "\tand $~0xf,%esp\n"
11 "\tcall simd_test\n"
12 "\tleave\n"
13 "\tret"
14 #else
15 "\tjmp simd_test"
16 #endif
17 );
18
19 typedef
20 #if defined(INT_SIZE)
21 # define ELEM_SIZE INT_SIZE
22 signed int
23 # if INT_SIZE == 1
24 # define MODE QI
25 # elif INT_SIZE == 2
26 # define MODE HI
27 # elif INT_SIZE == 4
28 # define MODE SI
29 # elif INT_SIZE == 8
30 # define MODE DI
31 # endif
32 #elif defined(UINT_SIZE)
33 # define ELEM_SIZE UINT_SIZE
34 unsigned int
35 # if UINT_SIZE == 1
36 # define MODE QI
37 # elif UINT_SIZE == 2
38 # define MODE HI
39 # elif UINT_SIZE == 4
40 # define MODE SI
41 # elif UINT_SIZE == 8
42 # define MODE DI
43 # endif
44 #elif defined(FLOAT_SIZE)
45 float
46 # define ELEM_SIZE FLOAT_SIZE
47 # if FLOAT_SIZE == 4
48 # define MODE SF
49 # elif FLOAT_SIZE == 8
50 # define MODE DF
51 # endif
52 #endif
53 #ifndef VEC_SIZE
54 # define VEC_SIZE ELEM_SIZE
55 #endif
56 __attribute__((mode(MODE), vector_size(VEC_SIZE))) vec_t;
57
58 #define ELEM_COUNT (VEC_SIZE / ELEM_SIZE)
59
60 typedef unsigned int __attribute__((mode(QI), vector_size(VEC_SIZE))) byte_vec_t;
61
62 /* Various builtins want plain char / int / long long vector types ... */
63 typedef char __attribute__((vector_size(VEC_SIZE))) vqi_t;
64 typedef short __attribute__((vector_size(VEC_SIZE))) vhi_t;
65 typedef int __attribute__((vector_size(VEC_SIZE))) vsi_t;
66 #if VEC_SIZE >= 8
67 typedef long long __attribute__((vector_size(VEC_SIZE))) vdi_t;
68 #endif
69
/*
 * to_bool(cmp) collapses the result of a vector comparison into a single
 * truth value, which is true only if the comparison held for every element.
 * Where the enabled instruction set offers a suitable builtin (mask
 * extraction or a "test" operation against an all-ones vector) that one is
 * used; configurations not covered here are left without a definition at
 * this point.
 */
#if VEC_SIZE == 8 && defined(__SSE__)
# define to_bool(cmp) (__builtin_ia32_pmovmskb(cmp) == 0xff)
#elif VEC_SIZE == 16
# if defined(__AVX__) && defined(FLOAT_SIZE)
#  if ELEM_SIZE == 4
#   define to_bool(cmp) __builtin_ia32_vtestcps(cmp, (vec_t){} == 0)
#  elif ELEM_SIZE == 8
#   define to_bool(cmp) __builtin_ia32_vtestcpd(cmp, (vec_t){} == 0)
#  endif
# elif defined(__SSE4_1__)
#  define to_bool(cmp) __builtin_ia32_ptestc128(cmp, (vdi_t){} == 0)
# elif defined(__SSE__) && ELEM_SIZE == 4
#  define to_bool(cmp) (__builtin_ia32_movmskps(cmp) == 0xf)
# elif defined(__SSE2__)
#  if ELEM_SIZE == 8
#   define to_bool(cmp) (__builtin_ia32_movmskpd(cmp) == 3)
#  else
#   define to_bool(cmp) (__builtin_ia32_pmovmskb128(cmp) == 0xffff)
#  endif
# endif
#elif VEC_SIZE == 32
# if defined(__AVX__) && ELEM_SIZE == 4
#  define to_bool(cmp) (__builtin_ia32_movmskps256(cmp) == 0xff)
# elif defined(__AVX__) && ELEM_SIZE == 8
#  define to_bool(cmp) (__builtin_ia32_movmskpd256(cmp) == 0xf)
# endif
#endif

98 #ifndef to_bool
_to_bool(byte_vec_t bv)99 static inline bool _to_bool(byte_vec_t bv)
100 {
101 unsigned int i;
102
103 for ( i = 0; i < VEC_SIZE; ++i )
104 if ( bv[i] != 0xff )
105 return false;
106
107 return true;
108 }
109 # define to_bool(cmp) _to_bool((byte_vec_t)(cmp))
110 #endif
111
/*
 * to_int(x) converts the floating point element(s) of x to integer and back
 * again, yielding a vec_t holding integral values.  The single element case
 * uses a C cast, the vector cases go through the packed conversion builtins.
 * Only defined for floating point configurations which have a suitable
 * conversion available.
 */
#if VEC_SIZE == FLOAT_SIZE
# define to_int(x) ((vec_t){ (int)(x)[0] })
#elif VEC_SIZE == 16 && defined(__SSE2__)
# if FLOAT_SIZE == 4
#  define to_int(x) __builtin_ia32_cvtdq2ps(__builtin_ia32_cvtps2dq(x))
# elif FLOAT_SIZE == 8
#  define to_int(x) __builtin_ia32_cvtdq2pd(__builtin_ia32_cvtpd2dq(x))
# endif
#elif VEC_SIZE == 32 && defined(__AVX__)
# if FLOAT_SIZE == 4
#  define to_int(x) __builtin_ia32_cvtdq2ps256(__builtin_ia32_cvtps2dq256(x))
# elif FLOAT_SIZE == 8
#  define to_int(x) __builtin_ia32_cvtdq2pd256(__builtin_ia32_cvtpd2dq256(x))
# endif
#endif

/*
 * scalar_1op(x, op) applies the instruction sequence given by the string op
 * to the single-element vector x.  The sequence reads its input through the
 * memory operand %[in] and leaves its result in the register operand %[out]
 * (an early-clobber 16-byte vector register), of which element 0 forms the
 * resulting vec_t.
 */
#if VEC_SIZE == FLOAT_SIZE
# define scalar_1op(x, op) ({ \
    typeof((x)[0]) __attribute__((vector_size(16))) r_; \
    asm ( op : [out] "=&x" (r_) : [in] "m" (x) ); \
    (vec_t){ r_[0] }; \
})
#endif

/*
 * Floating point helpers (SSE / SSE2 / AVX), selected by element and vector
 * size.  Macros left undefined for a given configuration cause the
 * respective checks in simd_test() to be skipped.
 *
 * Note that the swap2() variants reference the variable "inv" of the scope
 * they get expanded in.
 */
#if FLOAT_SIZE == 4 && defined(__SSE__)
# if VEC_SIZE == 32 && defined(__AVX__)
#  define broadcast(x) ({ float t_ = (x); __builtin_ia32_vbroadcastss256(&t_); })
#  define max(x, y) __builtin_ia32_maxps256(x, y)
#  define min(x, y) __builtin_ia32_minps256(x, y)
#  define recip(x) __builtin_ia32_rcpps256(x)
#  define rsqrt(x) __builtin_ia32_rsqrtps256(x)
#  define sqrt(x) __builtin_ia32_sqrtps256(x)
/* Permute within each 128-bit half first, then exchange the two halves. */
#  define swap(x) ({ \
    vec_t t_ = __builtin_ia32_vpermilps256(x, 0b00011011); \
    __builtin_ia32_vperm2f128_ps256(t_, t_, 0b00000001); \
})
/* As swap(), but with the permute control taken from a vector (inv - 1). */
#  define swap2(x) ({ \
    vec_t t_ = __builtin_ia32_vpermilvarps256(x, __builtin_ia32_cvtps2dq256(inv) - 1); \
    __builtin_ia32_vperm2f128_ps256(t_, t_, 0b00000001); \
})
# elif VEC_SIZE == 16
#  ifdef __AVX__
#   define broadcast(x) ({ float t_ = (x); __builtin_ia32_vbroadcastss(&t_); })
#  endif
#  define interleave_hi(x, y) __builtin_ia32_unpckhps(x, y)
#  define interleave_lo(x, y) __builtin_ia32_unpcklps(x, y)
#  define max(x, y) __builtin_ia32_maxps(x, y)
#  define min(x, y) __builtin_ia32_minps(x, y)
#  define recip(x) __builtin_ia32_rcpps(x)
#  define rsqrt(x) __builtin_ia32_rsqrtps(x)
#  define sqrt(x) __builtin_ia32_sqrtps(x)
#  define swap(x) __builtin_ia32_shufps(x, x, 0b00011011)
#  ifdef __AVX__
#   define swap2(x) __builtin_ia32_vpermilvarps(x, __builtin_ia32_cvtps2dq(inv) - 1)
#  endif
# elif VEC_SIZE == 4
#  define recip(x) scalar_1op(x, "rcpss %[in], %[out]")
#  define rsqrt(x) scalar_1op(x, "rsqrtss %[in], %[out]")
#  define sqrt(x) scalar_1op(x, "sqrtss %[in], %[out]")
# endif
#elif FLOAT_SIZE == 8 && defined(__SSE2__)
# if VEC_SIZE == 32 && defined(__AVX__)
#  define broadcast(x) ({ double t_ = (x); __builtin_ia32_vbroadcastsd256(&t_); })
#  define max(x, y) __builtin_ia32_maxpd256(x, y)
#  define min(x, y) __builtin_ia32_minpd256(x, y)
/*
 * Double precision recip() / rsqrt() are built from the single precision
 * operations: convert to float, apply the operation, convert back.
 */
#  define recip(x) ({ \
    float __attribute__((vector_size(16))) t_ = __builtin_ia32_cvtpd2ps256(x); \
    t_ = __builtin_ia32_vextractf128_ps256( \
             __builtin_ia32_rcpps256( \
                 __builtin_ia32_vbroadcastf128_ps256(&t_)), 0); \
    __builtin_ia32_cvtps2pd256(t_); \
})
#  define rsqrt(x) ({ \
    float __attribute__((vector_size(16))) t1_ = __builtin_ia32_cvtpd2ps256(x); \
    float __attribute__((vector_size(32))) t2_ = \
        __builtin_ia32_vinsertf128_ps256((typeof(t2_)){}, t1_, 0); \
    t2_ = __builtin_ia32_vinsertf128_ps256(t2_, t1_, 1); \
    t1_ = __builtin_ia32_vextractf128_ps256(__builtin_ia32_rsqrtps256(t2_), 0); \
    __builtin_ia32_cvtps2pd256(t1_); \
})
#  define sqrt(x) __builtin_ia32_sqrtpd256(x)
/* Permute within each 128-bit half first, then exchange the two halves. */
#  define swap(x) ({ \
    vec_t t_ = __builtin_ia32_vpermilpd256(x, 0b00000101); \
    __builtin_ia32_vperm2f128_pd256(t_, t_, 0b00000001); \
})
# elif VEC_SIZE == 16
#  define interleave_hi(x, y) __builtin_ia32_unpckhpd(x, y)
#  define interleave_lo(x, y) __builtin_ia32_unpcklpd(x, y)
#  define max(x, y) __builtin_ia32_maxpd(x, y)
#  define min(x, y) __builtin_ia32_minpd(x, y)
/* As above: go through single precision for recip() / rsqrt(). */
#  define recip(x) __builtin_ia32_cvtps2pd(__builtin_ia32_rcpps(__builtin_ia32_cvtpd2ps(x)))
#  define rsqrt(x) __builtin_ia32_cvtps2pd(__builtin_ia32_rsqrtps(__builtin_ia32_cvtpd2ps(x)))
#  define sqrt(x) __builtin_ia32_sqrtpd(x)
#  define swap(x) __builtin_ia32_shufpd(x, x, 0b01)
#  ifdef __AVX__
#   define swap2(x) __builtin_ia32_vpermilvarpd(x, __builtin_ia32_pmovsxdq128( \
    __builtin_ia32_cvtpd2dq(inv) - 1) << 1)
#  endif
# elif VEC_SIZE == 8
#  define recip(x) scalar_1op(x, "cvtsd2ss %[in], %[out]; rcpss %[out], %[out]; cvtss2sd %[out], %[out]")
#  define rsqrt(x) scalar_1op(x, "cvtsd2ss %[in], %[out]; rsqrtss %[out], %[out]; cvtss2sd %[out], %[out]")
#  define sqrt(x) scalar_1op(x, "sqrtsd %[in], %[out]")
# endif
#endif
/*
 * SSE2 helpers for 16-byte vectors.  Arguments and results are cast to and
 * from the plain vector types the builtins expect where necessary.
 */
#if VEC_SIZE == 16 && defined(__SSE2__)
# if INT_SIZE == 1 || UINT_SIZE == 1
#  define interleave_hi(x, y) ((vec_t)__builtin_ia32_punpckhbw128((vqi_t)(x), (vqi_t)(y)))
#  define interleave_lo(x, y) ((vec_t)__builtin_ia32_punpcklbw128((vqi_t)(x), (vqi_t)(y)))
# elif INT_SIZE == 2 || UINT_SIZE == 2
#  define interleave_hi(x, y) ((vec_t)__builtin_ia32_punpckhwd128((vhi_t)(x), (vhi_t)(y)))
#  define interleave_lo(x, y) ((vec_t)__builtin_ia32_punpcklwd128((vhi_t)(x), (vhi_t)(y)))
/* Shuffle the low four and high four words, then exchange the 8-byte halves. */
#  define swap(x) ((vec_t)__builtin_ia32_pshufd( \
    (vsi_t)__builtin_ia32_pshufhw( \
        __builtin_ia32_pshuflw((vhi_t)(x), 0b00011011), 0b00011011), 0b01001110))
# elif INT_SIZE == 4 || UINT_SIZE == 4
#  define interleave_hi(x, y) ((vec_t)__builtin_ia32_punpckhdq128((vsi_t)(x), (vsi_t)(y)))
#  define interleave_lo(x, y) ((vec_t)__builtin_ia32_punpckldq128((vsi_t)(x), (vsi_t)(y)))
#  define swap(x) ((vec_t)__builtin_ia32_pshufd((vsi_t)(x), 0b00011011))
# elif INT_SIZE == 8 || UINT_SIZE == 8
#  define interleave_hi(x, y) ((vec_t)__builtin_ia32_punpckhqdq128((vdi_t)(x), (vdi_t)(y)))
#  define interleave_lo(x, y) ((vec_t)__builtin_ia32_punpcklqdq128((vdi_t)(x), (vdi_t)(y)))
#  define swap(x) ((vec_t)__builtin_ia32_pshufd((vsi_t)(x), 0b01001110))
# endif
# if UINT_SIZE == 1
#  define max(x, y) ((vec_t)__builtin_ia32_pmaxub128((vqi_t)(x), (vqi_t)(y)))
#  define min(x, y) ((vec_t)__builtin_ia32_pminub128((vqi_t)(x), (vqi_t)(y)))
# elif INT_SIZE == 2
#  define max(x, y) __builtin_ia32_pmaxsw128(x, y)
#  define min(x, y) __builtin_ia32_pminsw128(x, y)
#  define mul_hi(x, y) __builtin_ia32_pmulhw128(x, y)
# elif UINT_SIZE == 2
#  define mul_hi(x, y) ((vec_t)__builtin_ia32_pmulhuw128((vhi_t)(x), (vhi_t)(y)))
# elif UINT_SIZE == 4
#  define mul_full(x, y) ((vec_t)__builtin_ia32_pmuludq128((vsi_t)(x), (vsi_t)(y)))
# endif
/*
 * select(d, x, y, m): two masked stores to *d - x under mask m, then y under
 * the inverted mask.
 */
# define select(d, x, y, m) ({ \
    void *d_ = (d); \
    vqi_t m_ = (vqi_t)(m); \
    __builtin_ia32_maskmovdqu((vqi_t)(x), m_, d_); \
    __builtin_ia32_maskmovdqu((vqi_t)(y), ~m_, d_); \
})
#endif
/*
 * Add/subtract, element duplication, and horizontal add/subtract helpers for
 * floating point vectors (SSE3 for 16-byte vectors, AVX for 32-byte ones).
 */
#if VEC_SIZE == 16 && defined(__SSE3__)
# if FLOAT_SIZE == 4
#  define addsub(x, y) __builtin_ia32_addsubps(x, y)
#  define dup_hi(x) __builtin_ia32_movshdup(x)
#  define dup_lo(x) __builtin_ia32_movsldup(x)
#  define hadd(x, y) __builtin_ia32_haddps(x, y)
#  define hsub(x, y) __builtin_ia32_hsubps(x, y)
# elif FLOAT_SIZE == 8
#  define addsub(x, y) __builtin_ia32_addsubpd(x, y)
/* Done via inline assembly, with element 0 of x as a memory operand. */
#  define dup_lo(x) ({ \
    double __attribute__((vector_size(16))) r_; \
    asm ( "movddup %1,%0" : "=x" (r_) : "m" ((x)[0]) ); \
    r_; \
})
#  define hadd(x, y) __builtin_ia32_haddpd(x, y)
#  define hsub(x, y) __builtin_ia32_hsubpd(x, y)
# endif
#elif VEC_SIZE == 32 && defined(__AVX__)
# if FLOAT_SIZE == 4
#  define addsub(x, y) __builtin_ia32_addsubps256(x, y)
#  define dup_hi(x) __builtin_ia32_movshdup256(x)
#  define dup_lo(x) __builtin_ia32_movsldup256(x)
/*
 * For the 256-bit hadd() / hsub() the builtin's result gets its two middle
 * element groups exchanged before being handed back.
 */
#  define hadd(x, y) ({ \
    vec_t t_ = __builtin_ia32_haddps256(x, y); \
    (vec_t){t_[0], t_[1], t_[4], t_[5], t_[2], t_[3], t_[6], t_[7]}; \
})
#  define hsub(x, y) ({ \
    vec_t t_ = __builtin_ia32_hsubps256(x, y); \
    (vec_t){t_[0], t_[1], t_[4], t_[5], t_[2], t_[3], t_[6], t_[7]}; \
})
# elif FLOAT_SIZE == 8
#  define addsub(x, y) __builtin_ia32_addsubpd256(x, y)
#  define dup_lo(x) __builtin_ia32_movddup256(x)
#  define hadd(x, y) ({ \
    vec_t t_ = __builtin_ia32_haddpd256(x, y); \
    (vec_t){t_[0], t_[2], t_[1], t_[3]}; \
})
#  define hsub(x, y) ({ \
    vec_t t_ = __builtin_ia32_hsubpd256(x, y); \
    (vec_t){t_[0], t_[2], t_[1], t_[3]}; \
})
# endif
#endif
/*
 * SSSE3 helpers for 16-byte integer vectors.
 *
 * The byte-element swap() references the variable "inv" of the scope it gets
 * expanded in.  rotr() takes its count in elements and scales it to the bit
 * count passed to the builtin.
 */
#if VEC_SIZE == 16 && defined(__SSSE3__)
# if INT_SIZE == 1
#  define abs(x) ((vec_t)__builtin_ia32_pabsb128((vqi_t)(x)))
# elif INT_SIZE == 2
#  define abs(x) __builtin_ia32_pabsw128(x)
# elif INT_SIZE == 4
#  define abs(x) __builtin_ia32_pabsd128(x)
# endif
# if INT_SIZE == 1 || UINT_SIZE == 1
#  define copysignz(x, y) ((vec_t)__builtin_ia32_psignb128((vqi_t)(x), (vqi_t)(y)))
#  define swap(x) ((vec_t)__builtin_ia32_pshufb128((vqi_t)(x), (vqi_t)(inv - 1)))
#  define rotr(x, n) ((vec_t)__builtin_ia32_palignr128((vdi_t)(x), (vdi_t)(x), (n) * 8))
# elif INT_SIZE == 2 || UINT_SIZE == 2
#  define copysignz(x, y) ((vec_t)__builtin_ia32_psignw128((vhi_t)(x), (vhi_t)(y)))
#  define hadd(x, y) ((vec_t)__builtin_ia32_phaddw128((vhi_t)(x), (vhi_t)(y)))
#  define hsub(x, y) ((vec_t)__builtin_ia32_phsubw128((vhi_t)(x), (vhi_t)(y)))
#  define rotr(x, n) ((vec_t)__builtin_ia32_palignr128((vdi_t)(x), (vdi_t)(x), (n) * 16))
# elif INT_SIZE == 4 || UINT_SIZE == 4
#  define copysignz(x, y) ((vec_t)__builtin_ia32_psignd128((vsi_t)(x), (vsi_t)(y)))
#  define hadd(x, y) ((vec_t)__builtin_ia32_phaddd128((vsi_t)(x), (vsi_t)(y)))
#  define hsub(x, y) ((vec_t)__builtin_ia32_phsubd128((vsi_t)(x), (vsi_t)(y)))
#  define rotr(x, n) ((vec_t)__builtin_ia32_palignr128((vdi_t)(x), (vdi_t)(x), (n) * 32))
# elif INT_SIZE == 8 || UINT_SIZE == 8
#  define rotr(x, n) ((vec_t)__builtin_ia32_palignr128((vdi_t)(x), (vdi_t)(x), (n) * 64))
# endif
#endif
/*
 * SSE4.1 helpers for 16-byte vectors: further max() / min() flavors, sign-
 * and zero-extending widen*(), blend based select() and mix(), as well as
 * dot_product() and trunc() for floating point.
 */
#if VEC_SIZE == 16 && defined(__SSE4_1__)
# if INT_SIZE == 1
#  define max(x, y) ((vec_t)__builtin_ia32_pmaxsb128((vqi_t)(x), (vqi_t)(y)))
#  define min(x, y) ((vec_t)__builtin_ia32_pminsb128((vqi_t)(x), (vqi_t)(y)))
#  define widen1(x) ((vec_t)__builtin_ia32_pmovsxbw128((vqi_t)(x)))
#  define widen2(x) ((vec_t)__builtin_ia32_pmovsxbd128((vqi_t)(x)))
#  define widen3(x) ((vec_t)__builtin_ia32_pmovsxbq128((vqi_t)(x)))
# elif INT_SIZE == 2
#  define widen1(x) ((vec_t)__builtin_ia32_pmovsxwd128(x))
#  define widen2(x) ((vec_t)__builtin_ia32_pmovsxwq128(x))
# elif INT_SIZE == 4
#  define max(x, y) __builtin_ia32_pmaxsd128(x, y)
#  define min(x, y) __builtin_ia32_pminsd128(x, y)
#  define mul_full(x, y) ((vec_t)__builtin_ia32_pmuldq128(x, y))
#  define widen1(x) ((vec_t)__builtin_ia32_pmovsxdq128(x))
# elif UINT_SIZE == 1
#  define widen1(x) ((vec_t)__builtin_ia32_pmovzxbw128((vqi_t)(x)))
#  define widen2(x) ((vec_t)__builtin_ia32_pmovzxbd128((vqi_t)(x)))
#  define widen3(x) ((vec_t)__builtin_ia32_pmovzxbq128((vqi_t)(x)))
# elif UINT_SIZE == 2
#  define max(x, y) ((vec_t)__builtin_ia32_pmaxuw128((vhi_t)(x), (vhi_t)(y)))
#  define min(x, y) ((vec_t)__builtin_ia32_pminuw128((vhi_t)(x), (vhi_t)(y)))
#  define widen1(x) ((vec_t)__builtin_ia32_pmovzxwd128((vhi_t)(x)))
#  define widen2(x) ((vec_t)__builtin_ia32_pmovzxwq128((vhi_t)(x)))
# elif UINT_SIZE == 4
#  define max(x, y) ((vec_t)__builtin_ia32_pmaxud128((vsi_t)(x), (vsi_t)(y)))
#  define min(x, y) ((vec_t)__builtin_ia32_pminud128((vsi_t)(x), (vsi_t)(y)))
#  define widen1(x) ((vec_t)__builtin_ia32_pmovzxdq128((vsi_t)(x)))
# endif
/* Replace any earlier select() by a variable blend based one. */
# undef select
# if defined(INT_SIZE) || defined(UINT_SIZE)
#  define select(d, x, y, m) \
    (*(d) = (vec_t)__builtin_ia32_pblendvb128((vqi_t)(y), (vqi_t)(x), (vqi_t)(m)))
# elif FLOAT_SIZE == 4
#  define dot_product(x, y) __builtin_ia32_dpps(x, y, 0b11110001)
#  define select(d, x, y, m) (*(d) = __builtin_ia32_blendvps(y, x, m))
#  define trunc(x) __builtin_ia32_roundps(x, 0b1011)
# elif FLOAT_SIZE == 8
#  define dot_product(x, y) __builtin_ia32_dppd(x, y, 0b00110001)
#  define select(d, x, y, m) (*(d) = __builtin_ia32_blendvpd(y, x, m))
#  define trunc(x) __builtin_ia32_roundpd(x, 0b1011)
# endif
/* mix(x, y): even numbered elements from x, odd numbered ones from y. */
# if INT_SIZE == 2 || UINT_SIZE == 2
#  define mix(x, y) ((vec_t)__builtin_ia32_pblendw128((vhi_t)(x), (vhi_t)(y), 0b10101010))
# elif INT_SIZE == 4 || UINT_SIZE == 4
#  define mix(x, y) ((vec_t)__builtin_ia32_pblendw128((vhi_t)(x), (vhi_t)(y), 0b11001100))
# elif INT_SIZE == 8 || UINT_SIZE == 8
#  define mix(x, y) ((vec_t)__builtin_ia32_pblendw128((vhi_t)(x), (vhi_t)(y), 0b11110000))
# elif FLOAT_SIZE == 4
#  define mix(x, y) __builtin_ia32_blendps(x, y, 0b1010)
# elif FLOAT_SIZE == 8
#  define mix(x, y) __builtin_ia32_blendpd(x, y, 0b10)
# endif
#endif
/*
 * AVX helpers for 32-byte floating point vectors.
 *
 * select2() is an alternative to select() built from a masked load of x
 * followed by a masked store of y under the inverted mask.
 */
#if VEC_SIZE == 32 && defined(__AVX__)
# if FLOAT_SIZE == 4
/* The builtin's results in elements 0 and 4 get summed up. */
#  define dot_product(x, y) ({ \
    vec_t t_ = __builtin_ia32_dpps256(x, y, 0b11110001); \
    (vec_t){t_[0] + t_[4]}; \
})
#  define mix(x, y) __builtin_ia32_blendps256(x, y, 0b10101010)
#  define select(d, x, y, m) (*(d) = __builtin_ia32_blendvps256(y, x, m))
#  define select2(d, x, y, m) ({ \
    vsi_t m_ = (vsi_t)(m); \
    *(d) = __builtin_ia32_maskloadps256(&(x), m_); \
    __builtin_ia32_maskstoreps256(d, ~m_, y); \
})
#  define trunc(x) __builtin_ia32_roundps256(x, 0b1011)
# elif FLOAT_SIZE == 8
#  define mix(x, y) __builtin_ia32_blendpd256(x, y, 0b1010)
#  define select(d, x, y, m) (*(d) = __builtin_ia32_blendvpd256(y, x, m))
#  define select2(d, x, y, m) ({ \
    vdi_t m_ = (vdi_t)(m); \
    *(d) = __builtin_ia32_maskloadpd256(&(x), m_); \
    __builtin_ia32_maskstorepd256(d, ~m_, y); \
})
#  define trunc(x) __builtin_ia32_roundpd256(x, 0b1011)
# endif
#endif
/*
 * Helpers for the single element floating point case: max() / min() are
 * open-coded on element 0 (each argument being evaluated just once), and
 * trunc() uses the scalar SSE4.1 rounding instructions via inline assembly.
 */
#if VEC_SIZE == FLOAT_SIZE
# define max(x, y) ((vec_t){({ typeof((x)[0]) x_ = (x)[0], y_ = (y)[0]; x_ > y_ ? x_ : y_; })})
# define min(x, y) ((vec_t){({ typeof((x)[0]) x_ = (x)[0], y_ = (y)[0]; x_ < y_ ? x_ : y_; })})
# ifdef __SSE4_1__
#  if FLOAT_SIZE == 4
#   define trunc(x) ({ \
    float __attribute__((vector_size(16))) r_; \
    asm ( "roundss $0b1011,%1,%0" : "=x" (r_) : "m" (x) ); \
    (vec_t){ r_[0] }; \
})
#  elif FLOAT_SIZE == 8
#   define trunc(x) ({ \
    double __attribute__((vector_size(16))) r_; \
    asm ( "roundsd $0b1011,%1,%0" : "=x" (r_) : "m" (x) ); \
    (vec_t){ r_[0] }; \
})
#  endif
# endif
#endif

/*
 * Suppress value propagation by the compiler, preventing unwanted
 * optimization. At the same time this makes the compiler use memory
 * operands more often, which for our purposes is the more interesting case.
 *
 * The (empty) assembly statement claims to both read and write "var" in
 * memory, so the compiler can't assume anything about its value afterwards.
 * The alternate keyword spellings are used so that the macro also works when
 * compiling in a strict ISO mode (-std=c99 / -std=c11), where plain "asm" is
 * not recognized.
 */
#define touch(var) __asm__ __volatile__ ( "" : "+m" (var) )

simd_test(void)428 int simd_test(void)
429 {
430 unsigned int i, j;
431 vec_t x, y, z, src, inv, alt, sh;
432
433 for ( i = 0, j = ELEM_SIZE << 3; i < ELEM_COUNT; ++i )
434 {
435 src[i] = i + 1;
436 inv[i] = ELEM_COUNT - i;
437 #ifdef UINT_SIZE
438 alt[i] = -!(i & 1);
439 #else
440 alt[i] = i & 1 ? -1 : 1;
441 #endif
442 if ( !(i & (i + 1)) )
443 --j;
444 sh[i] = j;
445 }
446
447 touch(src);
448 x = src;
449 touch(x);
450 if ( !to_bool(x == src) ) return __LINE__;
451
452 touch(src);
453 y = x + src;
454 touch(src);
455 touch(y);
456 if ( !to_bool(y == 2 * src) ) return __LINE__;
457
458 touch(src);
459 z = y -= src;
460 touch(z);
461 if ( !to_bool(x == z) ) return __LINE__;
462
463 #if defined(UINT_SIZE)
464
465 touch(inv);
466 x |= inv;
467 touch(inv);
468 y &= inv;
469 touch(inv);
470 z ^= inv;
471 touch(inv);
472 touch(x);
473 if ( !to_bool((x & ~y) == z) ) return __LINE__;
474
475 #elif ELEM_SIZE > 1 || VEC_SIZE <= 8
476
477 touch(src);
478 x *= src;
479 y = inv * inv;
480 touch(src);
481 z = src + inv;
482 touch(inv);
483 z *= (src - inv);
484 if ( !to_bool(x - y == z) ) return __LINE__;
485
486 #endif
487
488 #if defined(FLOAT_SIZE)
489
490 x = src * alt;
491 touch(alt);
492 y = src / alt;
493 if ( !to_bool(x == y) ) return __LINE__;
494 touch(alt);
495 touch(src);
496 if ( !to_bool(x * -alt == -src) ) return __LINE__;
497
498 # if defined(recip) && defined(to_int)
499
500 touch(src);
501 x = recip(src);
502 touch(src);
503 touch(x);
504 if ( !to_bool(to_int(recip(x)) == src) ) return __LINE__;
505
506 # ifdef rsqrt
507 x = src * src;
508 touch(x);
509 y = rsqrt(x);
510 touch(y);
511 if ( !to_bool(to_int(recip(y)) == src) ) return __LINE__;
512 touch(src);
513 if ( !to_bool(to_int(y) == to_int(recip(src))) ) return __LINE__;
514 # endif
515
516 # endif
517
518 # ifdef sqrt
519 x = src * src;
520 touch(x);
521 if ( !to_bool(sqrt(x) == src) ) return __LINE__;
522 # endif
523
524 # ifdef trunc
525 x = 1 / src;
526 y = (vec_t){ 1 };
527 touch(x);
528 z = trunc(x);
529 if ( !to_bool(y == z) ) return __LINE__;
530 # endif
531
532 #else
533
534 # if ELEM_SIZE > 1
535
536 touch(inv);
537 x = src * inv;
538 touch(inv);
539 y[ELEM_COUNT - 1] = y[0] = j = ELEM_COUNT;
540 for ( i = 1; i < ELEM_COUNT / 2; ++i )
541 y[ELEM_COUNT - i - 1] = y[i] = y[i - 1] + (j -= 2);
542 if ( !to_bool(x == y) ) return __LINE__;
543
544 # ifdef mul_hi
545 touch(alt);
546 x = mul_hi(src, alt);
547 touch(alt);
548 # ifdef INT_SIZE
549 if ( !to_bool(x == (alt < 0)) ) return __LINE__;
550 # else
551 if ( !to_bool(x == (src & alt) + alt) ) return __LINE__;
552 # endif
553 # endif
554
555 # ifdef mul_full
556 x = src ^ alt;
557 touch(inv);
558 y = mul_full(x, inv);
559 touch(inv);
560 for ( i = 0; i < ELEM_COUNT; i += 2 )
561 {
562 unsigned long long res = x[i] * 1ULL * inv[i];
563
564 z[i] = res;
565 z[i + 1] = res >> (ELEM_SIZE << 3);
566 }
567 if ( !to_bool(y == z) ) return __LINE__;
568 # endif
569
570 z = src;
571 # ifdef INT_SIZE
572 z *= alt;
573 # endif
574 touch(z);
575 x = z << 3;
576 touch(z);
577 y = z << 2;
578 touch(z);
579 if ( !to_bool(x == y + y) ) return __LINE__;
580
581 touch(x);
582 z = x >> 2;
583 touch(x);
584 if ( !to_bool(y == z + z) ) return __LINE__;
585
586 z = src;
587 # ifdef INT_SIZE
588 z *= alt;
589 # endif
590 /*
591 * Note that despite the touch()-es here there doesn't appear to be a way
592 * to make the compiler use a memory operand for the shift instruction (at
593 * least without resorting to built-ins).
594 */
595 j = 3;
596 touch(j);
597 x = z << j;
598 touch(j);
599 j = 2;
600 touch(j);
601 y = z << j;
602 touch(j);
603 if ( !to_bool(x == y + y) ) return __LINE__;
604
605 z = x >> j;
606 touch(j);
607 if ( !to_bool(y == z + z) ) return __LINE__;
608
609 # endif
610
611 # if ELEM_SIZE == 2 || defined(__SSE4_1__)
612 /*
613 * Even when there are no instructions with varying shift counts per
614 * field, the code turns out to be a nice exercise for pextr/pinsr.
615 */
616 z = src;
617 # ifdef INT_SIZE
618 z *= alt;
619 # endif
620 /*
621 * Zap elements for which the shift count is negative (and the hence the
622 * decrement below would yield a negative count.
623 */
624 z &= (sh > 0);
625 touch(sh);
626 x = z << sh;
627 touch(sh);
628 --sh;
629 touch(sh);
630 y = z << sh;
631 touch(sh);
632 if ( !to_bool(x == y + y) ) return __LINE__;
633
634 # endif
635
636 #endif
637
638 #if defined(max) && defined(min)
639 # ifdef UINT_SIZE
640 touch(inv);
641 x = min(src, inv);
642 touch(inv);
643 y = max(src, inv);
644 touch(inv);
645 if ( !to_bool(x + y == src + inv) ) return __LINE__;
646 # else
647 x = src * alt;
648 y = inv * alt;
649 touch(y);
650 z = max(x, y);
651 touch(y);
652 y = min(x, y);
653 touch(y);
654 if ( !to_bool((y + z) * alt == src + inv) ) return __LINE__;
655 # endif
656 #endif
657
658 #ifdef abs
659 x = src * alt;
660 touch(x);
661 if ( !to_bool(abs(x) == src) ) return __LINE__;
662 #endif
663
664 #ifdef copysignz
665 touch(alt);
666 if ( !to_bool(copysignz((vec_t){} + 1, alt) == alt) ) return __LINE__;
667 #endif
668
669 #ifdef swap
670 touch(src);
671 if ( !to_bool(swap(src) == inv) ) return __LINE__;
672 #endif
673
674 #ifdef swap2
675 touch(src);
676 if ( !to_bool(swap2(src) == inv) ) return __LINE__;
677 #endif
678
679 #if defined(broadcast)
680 if ( !to_bool(broadcast(ELEM_COUNT + 1) == src + inv) ) return __LINE__;
681 #endif
682
683 #if defined(interleave_lo) && defined(interleave_hi)
684 touch(src);
685 x = interleave_lo(inv, src);
686 touch(src);
687 y = interleave_hi(inv, src);
688 touch(src);
689 # ifdef UINT_SIZE
690 z = ((x - y) ^ ~alt) - ~alt;
691 # else
692 z = (x - y) * alt;
693 # endif
694 if ( !to_bool(z == ELEM_COUNT / 2) ) return __LINE__;
695 #endif
696
697 #if defined(INT_SIZE) && defined(widen1) && defined(interleave_lo)
698
699 x = src * alt;
700 y = interleave_lo(x, alt < 0);
701 touch(x);
702 z = widen1(x);
703 touch(x);
704 if ( !to_bool(z == y) ) return __LINE__;
705
706 # ifdef widen2
707 y = interleave_lo(alt < 0, alt < 0);
708 y = interleave_lo(z, y);
709 touch(x);
710 z = widen2(x);
711 touch(x);
712 if ( !to_bool(z == y) ) return __LINE__;
713
714 # ifdef widen3
715 y = interleave_lo(alt < 0, alt < 0);
716 y = interleave_lo(y, y);
717 y = interleave_lo(z, y);
718 touch(x);
719 z = widen3(x);
720 touch(x);
721 if ( !to_bool(z == y) ) return __LINE__;
722 # endif
723 # endif
724
725 #endif
726
727 #if defined(UINT_SIZE) && defined(interleave_lo)
728
729 y = interleave_lo(src, (vec_t){});
730 z = interleave_lo(y, (vec_t){});
731
732 # ifdef widen1
733 touch(src);
734 x = widen1(src);
735 touch(src);
736 if ( !to_bool(x == y) ) return __LINE__;
737 # endif
738
739 # ifdef widen2
740 touch(src);
741 x = widen2(src);
742 touch(src);
743 if ( !to_bool(x == z) ) return __LINE__;
744 # endif
745
746 # ifdef widen3
747 touch(src);
748 x = widen3(src);
749 touch(src);
750 if ( !to_bool(x == interleave_lo(z, (vec_t){})) ) return __LINE__;
751 # endif
752
753 #endif
754
755 #ifdef dup_lo
756 touch(src);
757 x = dup_lo(src);
758 touch(src);
759 if ( !to_bool(x - src == (alt - 1) / 2) ) return __LINE__;
760 #endif
761
762 #ifdef dup_hi
763 touch(src);
764 x = dup_hi(src);
765 touch(src);
766 if ( !to_bool(x - src == (alt + 1) / 2) ) return __LINE__;
767 #endif
768
769 for ( i = 0; i < ELEM_COUNT; ++i )
770 y[i] = (i & 1 ? inv : src)[i];
771
772 #ifdef select
773 # ifdef UINT_SIZE
774 select(&z, src, inv, alt);
775 # else
776 select(&z, src, inv, alt > 0);
777 # endif
778 if ( !to_bool(z == y) ) return __LINE__;
779 #endif
780
781 #ifdef select2
782 # ifdef UINT_SIZE
783 select2(&z, src, inv, alt);
784 # else
785 select2(&z, src, inv, alt > 0);
786 # endif
787 if ( !to_bool(z == y) ) return __LINE__;
788 #endif
789
790 #ifdef mix
791 touch(src);
792 touch(inv);
793 x = mix(src, inv);
794 if ( !to_bool(x == y) ) return __LINE__;
795
796 # ifdef addsub
797 touch(src);
798 touch(inv);
799 x = addsub(src, inv);
800 touch(src);
801 touch(inv);
802 y = mix(src - inv, src + inv);
803 if ( !to_bool(x == y) ) return __LINE__;
804 # endif
805 #endif
806
807 #ifdef rotr
808 x = rotr(src, 1);
809 y = (src & (ELEM_COUNT - 1)) + 1;
810 if ( !to_bool(x == y) ) return __LINE__;
811 #endif
812
813 #ifdef dot_product
814 touch(src);
815 touch(inv);
816 x = dot_product(src, inv);
817 if ( !to_bool(x == (vec_t){ (ELEM_COUNT * (ELEM_COUNT + 1) *
818 (ELEM_COUNT + 2)) / 6 }) ) return __LINE__;
819 #endif
820
821 #ifdef hadd
822 x = src;
823 for ( i = ELEM_COUNT; i >>= 1; )
824 {
825 touch(x);
826 x = hadd((vec_t){}, x);
827 }
828 if ( x[ELEM_COUNT - 1] != (ELEM_COUNT * (ELEM_COUNT + 1)) / 2 ) return __LINE__;
829
830 # ifdef hsub
831 touch(src);
832 touch(inv);
833 x = hsub(src, inv);
834 for ( i = ELEM_COUNT; i >>= 1; )
835 x = hadd(x, (vec_t){});
836 if ( !to_bool(x == 0) ) return __LINE__;
837 # endif
838 #endif
839
840
841 return 0;
842 }
843