1 /* strcspn with SSE4.2 intrinsics
2    Copyright (C) 2009-2021 Free Software Foundation, Inc.
3    This file is part of the GNU C Library.
4 
5    The GNU C Library is free software; you can redistribute it and/or
6    modify it under the terms of the GNU Lesser General Public
7    License as published by the Free Software Foundation; either
8    version 2.1 of the License, or (at your option) any later version.
9 
10    The GNU C Library is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13    Lesser General Public License for more details.
14 
15    You should have received a copy of the GNU Lesser General Public
16    License along with the GNU C Library; if not, see
17    <https://www.gnu.org/licenses/>.  */
18 
19 #include <nmmintrin.h>
20 #include <string.h>
21 #include "varshift.h"
22 
23 /* We use 0x2:
24 	_SIDD_SBYTE_OPS
25 	| _SIDD_CMP_EQUAL_ANY
26 	| _SIDD_POSITIVE_POLARITY
27 	| _SIDD_LEAST_SIGNIFICANT
28    on pcmpistri to compare xmm/mem128
29 
30    0 1 2 3 4 5 6 7 8 9 A B C D E F
31    X X X X X X X X X X X X X X X X
32 
33    against xmm
34 
35    0 1 2 3 4 5 6 7 8 9 A B C D E F
36    A A A A A A A A A A A A A A A A
37 
38    to find out if the first 16byte data element has any byte A and
39    the offset of the first byte.  There are 3 cases:
40 
41    1. The first 16byte data element has the byte A at the offset X.
42    2. The first 16byte data element has EOS and doesn't have the byte A.
43    3. The first 16byte data element is valid and doesn't have the byte A.
44 
   Here is the table of ECX, CFlag, ZFlag and SFlag for the 3 cases:

    case	ECX	CFlag	ZFlag	SFlag
    1		 X	  1	 0/1	  0
    2		16	  0	  1	  0
    3		16	  0	  0	  0
50 
51    We exit from the loop for cases 1 and 2 with jbe which branches
52    when either CFlag or ZFlag is 1.  If CFlag == 1, ECX has the offset
53    X for case 1.  */
54 
55 #ifndef STRCSPN_SSE2
56 # define STRCSPN_SSE2 __strcspn_sse2
57 # define STRCSPN_SSE42 __strcspn_sse42
58 #endif
59 
60 #ifdef USE_AS_STRPBRK
61 # define RETURN(val1, val2) return val1
62 #else
63 # define RETURN(val1, val2) return val2
64 #endif
65 
66 extern
67 #ifdef USE_AS_STRPBRK
68 char *
69 #else
70 size_t
71 #endif
72 STRCSPN_SSE2 (const char *, const char *) attribute_hidden;
73 
74 
75 #ifdef USE_AS_STRPBRK
76 char *
77 #else
78 size_t
79 #endif
80 __attribute__ ((section (".text.sse4.2")))
STRCSPN_SSE42(const char * s,const char * a)81 STRCSPN_SSE42 (const char *s, const char *a)
82 {
83   if (*a == 0)
84     RETURN (NULL, strlen (s));
85 
86   const char *aligned;
87   __m128i mask;
88   int offset = (int) ((size_t) a & 15);
89   if (offset != 0)
90     {
91       /* Load masks.  */
92       aligned = (const char *) ((size_t) a & -16L);
93       __m128i mask0 = _mm_load_si128 ((__m128i *) aligned);
94 
95       mask = __m128i_shift_right (mask0, offset);
96 
97       /* Find where the NULL terminator is.  */
98       int length = _mm_cmpistri (mask, mask, 0x3a);
99       if (length == 16 - offset)
100 	{
101 	  /* There is no NULL terminator.  */
102 	  __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16));
103 	  int index = _mm_cmpistri (mask1, mask1, 0x3a);
104 	  length += index;
105 
106 	  /* Don't use SSE4.2 if the length of A > 16.  */
107 	  if (length > 16)
108 	    return STRCSPN_SSE2 (s, a);
109 
110 	  if (index != 0)
111 	    {
112 	      /* Combine mask0 and mask1.  We could play games with
113 		 palignr, but frankly this data should be in L1 now
114 		 so do the merge via an unaligned load.  */
115 	      mask = _mm_loadu_si128 ((__m128i *) a);
116 	    }
117 	}
118     }
119   else
120     {
121       /* A is aligned.  */
122       mask = _mm_load_si128 ((__m128i *) a);
123 
124       /* Find where the NULL terminator is.  */
125       int length = _mm_cmpistri (mask, mask, 0x3a);
126       if (length == 16)
127 	{
128 	  /* There is no NULL terminator.  Don't use SSE4.2 if the length
129 	     of A > 16.  */
130 	  if (a[16] != 0)
131 	    return STRCSPN_SSE2 (s, a);
132 	}
133     }
134 
135   offset = (int) ((size_t) s & 15);
136   if (offset != 0)
137     {
138       /* Check partial string.  */
139       aligned = (const char *) ((size_t) s & -16L);
140       __m128i value = _mm_load_si128 ((__m128i *) aligned);
141 
142       value = __m128i_shift_right (value, offset);
143 
144       int length = _mm_cmpistri (mask, value, 0x2);
145       /* No need to check ZFlag since ZFlag is always 1.  */
146       int cflag = _mm_cmpistrc (mask, value, 0x2);
147       if (cflag)
148 	RETURN ((char *) (s + length), length);
149       /* Find where the NULL terminator is.  */
150       int index = _mm_cmpistri (value, value, 0x3a);
151       if (index < 16 - offset)
152 	RETURN (NULL, index);
153       aligned += 16;
154     }
155   else
156     aligned = s;
157 
158   while (1)
159     {
160       __m128i value = _mm_load_si128 ((__m128i *) aligned);
161       int index = _mm_cmpistri (mask, value, 0x2);
162       int cflag = _mm_cmpistrc (mask, value, 0x2);
163       int zflag = _mm_cmpistrz (mask, value, 0x2);
164       if (cflag)
165 	RETURN ((char *) (aligned + index), (size_t) (aligned + index - s));
166       if (zflag)
167 	RETURN (NULL,
168 		/* Find where the NULL terminator is.  */
169 		(size_t) (aligned + _mm_cmpistri (value, value, 0x3a) - s));
170       aligned += 16;
171     }
172 }
173