1 /* strspn with SSE4.2 intrinsics
2    Copyright (C) 2009-2021 Free Software Foundation, Inc.
3    This file is part of the GNU C Library.
4 
5    The GNU C Library is free software; you can redistribute it and/or
6    modify it under the terms of the GNU Lesser General Public
7    License as published by the Free Software Foundation; either
8    version 2.1 of the License, or (at your option) any later version.
9 
10    The GNU C Library is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13    Lesser General Public License for more details.
14 
15    You should have received a copy of the GNU Lesser General Public
16    License along with the GNU C Library; if not, see
17    <https://www.gnu.org/licenses/>.  */
18 
19 #include <nmmintrin.h>
20 #include <string.h>
21 #include "varshift.h"
22 
/* We use 0x12:
	_SIDD_SBYTE_OPS
	| _SIDD_CMP_EQUAL_ANY
	| _SIDD_NEGATIVE_POLARITY
	| _SIDD_LEAST_SIGNIFICANT
   on pcmpistri to compare xmm/mem128

   0 1 2 3 4 5 6 7 8 9 A B C D E F
   X X X X X X X X X X X X X X X X

   against xmm

   0 1 2 3 4 5 6 7 8 9 A B C D E F
   A A A A A A A A A A A A A A A A

   to find out whether the first 16-byte data element contains any
   non-A byte and, if so, the offset of the first such byte.  There are
   2 cases:

   1. The first 16-byte data element has a non-A byte, including
      EOS, at the offset X.
   2. The first 16-byte data element is valid and doesn't have a non-A
      byte.

   Here is the table of ECX, CFlag, ZFlag and SFlag for the 2 cases:

   case		ECX	CFlag	ZFlag	SFlag
    1		 X	  1	 0/1	  0
    2		16	  0	  0	  0

   We exit from the loop for case 1.  */
53 
54 extern size_t __strspn_sse2 (const char *, const char *) attribute_hidden;
55 
56 
57 size_t
58 __attribute__ ((section (".text.sse4.2")))
__strspn_sse42(const char * s,const char * a)59 __strspn_sse42 (const char *s, const char *a)
60 {
61   if (*a == 0)
62     return 0;
63 
64   const char *aligned;
65   __m128i mask;
66   int offset = (int) ((size_t) a & 15);
67   if (offset != 0)
68     {
69       /* Load masks.  */
70       aligned = (const char *) ((size_t) a & -16L);
71       __m128i mask0 = _mm_load_si128 ((__m128i *) aligned);
72 
73       mask = __m128i_shift_right (mask0, offset);
74 
75       /* Find where the NULL terminator is.  */
76       int length = _mm_cmpistri (mask, mask, 0x3a);
77       if (length == 16 - offset)
78 	{
79 	  /* There is no NULL terminator.  */
80 	  __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16));
81 	  int index = _mm_cmpistri (mask1, mask1, 0x3a);
82 	  length += index;
83 
84 	  /* Don't use SSE4.2 if the length of A > 16.  */
85 	  if (length > 16)
86 	    return __strspn_sse2 (s, a);
87 
88 	  if (index != 0)
89 	    {
90 	      /* Combine mask0 and mask1.  We could play games with
91 		 palignr, but frankly this data should be in L1 now
92 		 so do the merge via an unaligned load.  */
93 	      mask = _mm_loadu_si128 ((__m128i *) a);
94 	    }
95 	}
96     }
97   else
98     {
99       /* A is aligned.  */
100       mask = _mm_load_si128 ((__m128i *) a);
101 
102       /* Find where the NULL terminator is.  */
103       int length = _mm_cmpistri (mask, mask, 0x3a);
104       if (length == 16)
105 	{
106 	  /* There is no NULL terminator.  Don't use SSE4.2 if the length
107 	     of A > 16.  */
108 	  if (a[16] != 0)
109 	    return __strspn_sse2 (s, a);
110 	}
111     }
112 
113   offset = (int) ((size_t) s & 15);
114   if (offset != 0)
115     {
116       /* Check partial string.  */
117       aligned = (const char *) ((size_t) s & -16L);
118       __m128i value = _mm_load_si128 ((__m128i *) aligned);
119 
120       value = __m128i_shift_right (value, offset);
121 
122       int length = _mm_cmpistri (mask, value, 0x12);
123       /* No need to check CFlag since it is always 1.  */
124       if (length < 16 - offset)
125 	return length;
126       /* Find where the NULL terminator is.  */
127       int index = _mm_cmpistri (value, value, 0x3a);
128       if (index < 16 - offset)
129 	return length;
130       aligned += 16;
131     }
132   else
133     aligned = s;
134 
135   while (1)
136     {
137       __m128i value = _mm_load_si128 ((__m128i *) aligned);
138       int index = _mm_cmpistri (mask, value, 0x12);
139       int cflag = _mm_cmpistrc (mask, value, 0x12);
140       if (cflag)
141 	return (size_t) (aligned + index - s);
142       aligned += 16;
143     }
144 }
145