1/* Find character CH in a NUL terminated string. 2 Highly optimized version for ix85, x>=5. 3 Copyright (C) 1995-2021 Free Software Foundation, Inc. 4 This file is part of the GNU C Library. 5 6 The GNU C Library is free software; you can redistribute it and/or 7 modify it under the terms of the GNU Lesser General Public 8 License as published by the Free Software Foundation; either 9 version 2.1 of the License, or (at your option) any later version. 10 11 The GNU C Library is distributed in the hope that it will be useful, 12 but WITHOUT ANY WARRANTY; without even the implied warranty of 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 Lesser General Public License for more details. 15 16 You should have received a copy of the GNU Lesser General Public 17 License along with the GNU C Library; if not, see 18 <https://www.gnu.org/licenses/>. */ 19 20#include <sysdep.h> 21#include "asm-syntax.h" 22 23/* This version is especially optimized for the i586 (and following?) 24 processors. This is mainly done by using the two pipelines. The 25 version optimized for i486 is weak in this aspect because to get 26 as much parallelism we have to execute some *more* instructions. 27 28 The code below is structured to reflect the pairing of the instructions 29 as *I think* it is. I have no processor data book to verify this. 30 If you find something you think is incorrect let me know. */ 31 32 33/* The magic value which is used throughout in the whole code. */ 34#define magic 0xfefefeff 35 36#define PARMS 4+16 /* space for 4 saved regs */ 37#define RTN PARMS 38#define STR RTN 39#define CHR STR+4 40 41 .text 42ENTRY (strchr) 43 44 pushl %edi /* Save callee-safe registers. */ 45 cfi_adjust_cfa_offset (-4) 46 pushl %esi 47 cfi_adjust_cfa_offset (-4) 48 49 pushl %ebx 50 cfi_adjust_cfa_offset (-4) 51 pushl %ebp 52 cfi_adjust_cfa_offset (-4) 53 54 movl STR(%esp), %eax 55 movl CHR(%esp), %edx 56 57 movl %eax, %edi /* duplicate string pointer for later */ 58 cfi_rel_offset (edi, 12) 59 xorl %ecx, %ecx /* clear %ecx */ 60 61 /* At the moment %edx contains C. What we need for the 62 algorithm is C in all bytes of the dword. Avoid 63 operations on 16 bit words because these require an 64 prefix byte (and one more cycle). */ 65 movb %dl, %dh /* now it is 0|0|c|c */ 66 movb %dl, %cl /* we construct the lower half in %ecx */ 67 68 shll $16, %edx /* now %edx is c|c|0|0 */ 69 movb %cl, %ch /* now %ecx is 0|0|c|c */ 70 71 orl %ecx, %edx /* and finally c|c|c|c */ 72 andl $3, %edi /* mask alignment bits */ 73 74 jz L(11) /* alignment is 0 => start loop */ 75 76 movb %dl, %cl /* 0 is needed below */ 77 jp L(0) /* exactly two bits set */ 78 79 xorb (%eax), %cl /* is byte the one we are looking for? */ 80 jz L(out) /* yes => return pointer */ 81 82 xorb %dl, %cl /* load single byte and test for NUL */ 83 je L(3) /* yes => return NULL */ 84 85 movb 1(%eax), %cl /* load single byte */ 86 incl %eax 87 88 cmpb %cl, %dl /* is byte == C? */ 89 je L(out) /* aligned => return pointer */ 90 91 cmpb $0, %cl /* is byte NUL? */ 92 je L(3) /* yes => return NULL */ 93 94 incl %eax 95 decl %edi 96 97 jne L(11) 98 99L(0): movb (%eax), %cl /* load single byte */ 100 101 cmpb %cl, %dl /* is byte == C? */ 102 je L(out) /* aligned => return pointer */ 103 104 cmpb $0, %cl /* is byte NUL? */ 105 je L(3) /* yes => return NULL */ 106 107 incl %eax /* increment pointer */ 108 109 cfi_rel_offset (esi, 8) 110 cfi_rel_offset (ebx, 4) 111 cfi_rel_offset (ebp, 0) 112 113 /* The following code is the preparation for the loop. The 114 four instruction up to `L1' will not be executed in the loop 115 because the same code is found at the end of the loop, but 116 there it is executed in parallel with other instructions. */ 117L(11): movl (%eax), %ecx 118 movl $magic, %ebp 119 120 movl $magic, %edi 121 addl %ecx, %ebp 122 123 /* The main loop: it looks complex and indeed it is. I would 124 love to say `it was hard to write, so it should he hard to 125 read' but I will give some more hints. To fully understand 126 this code you should first take a look at the i486 version. 127 The basic algorithm is the same, but here the code organized 128 in a way which permits to use both pipelines all the time. 129 130 I tried to make it a bit more understandable by indenting 131 the code according to stage in the algorithm. It goes as 132 follows: 133 check for 0 in 1st word 134 check for C in 1st word 135 check for 0 in 2nd word 136 check for C in 2nd word 137 check for 0 in 3rd word 138 check for C in 3rd word 139 check for 0 in 4th word 140 check for C in 4th word 141 142 Please note that doing the test for NUL before the test for 143 C allows us to overlap the test for 0 in the next word with 144 the test for C. */ 145 146L(1): xorl %ecx, %ebp /* (word^magic) */ 147 addl %ecx, %edi /* add magic word */ 148 149 leal 4(%eax), %eax /* increment pointer */ 150 jnc L(4) /* previous addl caused overflow? */ 151 152 movl %ecx, %ebx /* duplicate original word */ 153 orl $magic, %ebp /* (word^magic)|magic */ 154 155 addl $1, %ebp /* (word^magic)|magic == 0xffffffff? */ 156 jne L(4) /* yes => we found word with NUL */ 157 158 movl $magic, %esi /* load magic value */ 159 xorl %edx, %ebx /* clear words which are C */ 160 161 movl (%eax), %ecx 162 addl %ebx, %esi /* (word+magic) */ 163 164 movl $magic, %edi 165 jnc L(5) /* previous addl caused overflow? */ 166 167 movl %edi, %ebp 168 xorl %ebx, %esi /* (word+magic)^word */ 169 170 addl %ecx, %ebp 171 orl $magic, %esi /* ((word+magic)^word)|magic */ 172 173 addl $1, %esi /* ((word+magic)^word)|magic==0xf..f?*/ 174 jne L(5) /* yes => we found word with C */ 175 176 xorl %ecx, %ebp 177 addl %ecx, %edi 178 179 leal 4(%eax), %eax 180 jnc L(4) 181 182 movl %ecx, %ebx 183 orl $magic, %ebp 184 185 addl $1, %ebp 186 jne L(4) 187 188 movl $magic, %esi 189 xorl %edx, %ebx 190 191 movl (%eax), %ecx 192 addl %ebx, %esi 193 194 movl $magic, %edi 195 jnc L(5) 196 197 movl %edi, %ebp 198 xorl %ebx, %esi 199 200 addl %ecx, %ebp 201 orl $magic, %esi 202 203 addl $1, %esi 204 jne L(5) 205 206 xorl %ecx, %ebp 207 addl %ecx, %edi 208 209 leal 4(%eax), %eax 210 jnc L(4) 211 212 movl %ecx, %ebx 213 orl $magic, %ebp 214 215 addl $1, %ebp 216 jne L(4) 217 218 movl $magic, %esi 219 xorl %edx, %ebx 220 221 movl (%eax), %ecx 222 addl %ebx, %esi 223 224 movl $magic, %edi 225 jnc L(5) 226 227 movl %edi, %ebp 228 xorl %ebx, %esi 229 230 addl %ecx, %ebp 231 orl $magic, %esi 232 233 addl $1, %esi 234 jne L(5) 235 236 xorl %ecx, %ebp 237 addl %ecx, %edi 238 239 leal 4(%eax), %eax 240 jnc L(4) 241 242 movl %ecx, %ebx 243 orl $magic, %ebp 244 245 addl $1, %ebp 246 jne L(4) 247 248 movl $magic, %esi 249 xorl %edx, %ebx 250 251 movl (%eax), %ecx 252 addl %ebx, %esi 253 254 movl $magic, %edi 255 jnc L(5) 256 257 movl %edi, %ebp 258 xorl %ebx, %esi 259 260 addl %ecx, %ebp 261 orl $magic, %esi 262 263 addl $1, %esi 264 265 je L(1) 266 267 /* We know there is no NUL byte but a C byte in the word. 268 %ebx contains NUL in this particular byte. */ 269L(5): subl $4, %eax /* adjust pointer */ 270 testb %bl, %bl /* first byte == C? */ 271 272 jz L(out) /* yes => return pointer */ 273 274 incl %eax /* increment pointer */ 275 testb %bh, %bh /* second byte == C? */ 276 277 jz L(out) /* yes => return pointer */ 278 279 shrl $16, %ebx /* make upper bytes accessible */ 280 incl %eax /* increment pointer */ 281 282 cmp $0, %bl /* third byte == C */ 283 je L(out) /* yes => return pointer */ 284 285 incl %eax /* increment pointer */ 286 287L(out): popl %ebp /* restore saved registers */ 288 cfi_adjust_cfa_offset (-4) 289 cfi_restore (ebp) 290 popl %ebx 291 cfi_adjust_cfa_offset (-4) 292 cfi_restore (ebx) 293 294 popl %esi 295 cfi_adjust_cfa_offset (-4) 296 cfi_restore (esi) 297 popl %edi 298 cfi_adjust_cfa_offset (-4) 299 cfi_restore (edi) 300 301 ret 302 303 cfi_adjust_cfa_offset (16) 304 cfi_rel_offset (edi, 12) 305 cfi_rel_offset (esi, 8) 306 cfi_rel_offset (ebx, 4) 307 cfi_rel_offset (ebp, 0) 308 /* We know there is a NUL byte in the word. But we have to test 309 whether there is an C byte before it in the word. */ 310L(4): subl $4, %eax /* adjust pointer */ 311 cmpb %dl, %cl /* first byte == C? */ 312 313 je L(out) /* yes => return pointer */ 314 315 cmpb $0, %cl /* first byte == NUL? */ 316 je L(3) /* yes => return NULL */ 317 318 incl %eax /* increment pointer */ 319 320 cmpb %dl, %ch /* second byte == C? */ 321 je L(out) /* yes => return pointer */ 322 323 cmpb $0, %ch /* second byte == NUL? */ 324 je L(3) /* yes => return NULL */ 325 326 shrl $16, %ecx /* make upper bytes accessible */ 327 incl %eax /* increment pointer */ 328 329 cmpb %dl, %cl /* third byte == C? */ 330 je L(out) /* yes => return pointer */ 331 332 cmpb $0, %cl /* third byte == NUL? */ 333 je L(3) /* yes => return NULL */ 334 335 incl %eax /* increment pointer */ 336 337 /* The test four the fourth byte is necessary! */ 338 cmpb %dl, %ch /* fourth byte == C? */ 339 je L(out) /* yes => return pointer */ 340 341L(3): xorl %eax, %eax 342 jmp L(out) 343END (strchr) 344 345#undef index 346weak_alias (strchr, index) 347libc_hidden_builtin_def (strchr) 348