/* strcmp implementation for ARMv7-A, optimized for Cortex-A15.
   Copyright (C) 2012-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <https://www.gnu.org/licenses/>.  */

#include <arm-features.h>
#include <sysdep.h>

/* Implementation of strcmp for ARMv7 when DSP instructions are
   available.  Use ldrd to support wider loads, provided the data
   is sufficiently aligned.  Use saturating arithmetic to optimize
   the compares.  */

/* Build Options:
   STRCMP_PRECHECK: Run a quick pre-check of the first byte in the
   string.  If comparing completely random strings the pre-check will
   save time, since there is a very high probability of a mismatch in
   the first character: we save significant overhead if this is the
   common case.  However, if strings are likely to be identical (e.g.
   because we're verifying a hit in a hash table), then this check
   is largely redundant.  */

#define STRCMP_PRECHECK 1

        .syntax unified

#ifdef __ARM_BIG_ENDIAN
# define S2LO lsl
# define S2LOEQ lsleq
# define S2HI lsr
# define MSB 0x000000ff
# define LSB 0xff000000
# define BYTE0_OFFSET 24
# define BYTE1_OFFSET 16
# define BYTE2_OFFSET 8
# define BYTE3_OFFSET 0
#else /* not __ARM_BIG_ENDIAN */
# define S2LO lsr
# define S2LOEQ lsreq
# define S2HI lsl
# define BYTE0_OFFSET 0
# define BYTE1_OFFSET 8
# define BYTE2_OFFSET 16
# define BYTE3_OFFSET 24
# define MSB 0xff000000
# define LSB 0x000000ff
#endif /* not __ARM_BIG_ENDIAN */

/* Parameters and result.  */
#define src1            r0
#define src2            r1
#define result          r0      /* Overlaps src1.  */

/* Internal variables.  */
#define tmp1            r4
#define tmp2            r5
#define const_m1        r12

/* Additional internal variables for 64-bit aligned data.  */
#define data1a          r2
#define data1b          r3
#define data2a          r6
#define data2b          r7
#define syndrome_a      tmp1
#define syndrome_b      tmp2

/* Additional internal variables for 32-bit aligned data.  */
#define data1           r2
#define data2           r3
#define syndrome        tmp2


        .thumb

/* In Thumb code we can't use MVN with a register shift, but we do have ORN.  */
.macro prepare_mask mask_reg, nbits_reg
        S2HI \mask_reg, const_m1, \nbits_reg
.endm
.macro apply_mask data_reg, mask_reg
        orn \data_reg, \data_reg, \mask_reg
.endm

        /* Macro to compute and return the result value for word-aligned
           cases.  */
        .macro strcmp_epilogue_aligned synd d1 d2 restore_r6
#ifdef __ARM_BIG_ENDIAN
        /* If data1 contains a zero byte, then syndrome will contain a 1 in
           bit 7 of that byte.  Otherwise, the highest set bit in the
           syndrome will highlight the first different bit.  It is therefore
           sufficient to extract the eight bits starting with the syndrome
           bit.  */
        clz     tmp1, \synd
        lsl     r1, \d2, tmp1
        .if \restore_r6
        ldrd    r6, r7, [sp, #8]
        .endif
        lsl     \d1, \d1, tmp1
        lsr     result, \d1, #24
        ldrd    r4, r5, [sp], #16
        cfi_remember_state
        cfi_def_cfa_offset (0)
        cfi_restore (r4)
        cfi_restore (r5)
        cfi_restore (r6)
        cfi_restore (r7)
        sub     result, result, r1, lsr #24
        bx      lr
#else
        /* To use the big-endian trick we'd have to reverse all three words.
           That's slower than this approach.  */
        rev     \synd, \synd
        clz     tmp1, \synd
        bic     tmp1, tmp1, #7
        lsr     r1, \d2, tmp1
        .if \restore_r6
        ldrd    r6, r7, [sp, #8]
        .endif
        lsr     \d1, \d1, tmp1
        and     result, \d1, #255
        and     r1, r1, #255
        ldrd    r4, r5, [sp], #16
        cfi_remember_state
        cfi_def_cfa_offset (0)
        cfi_restore (r4)
        cfi_restore (r5)
        cfi_restore (r6)
        cfi_restore (r7)
        sub     result, result, r1

        bx      lr
#endif
        .endm

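/* Note on the syndrome trick used throughout: UADD8 of a data word with
   0xffffffff (const_m1) sets the APSR.GE bit for every byte lane that is
   non-zero, and SEL of the EOR difference against const_m1 then yields,
   per byte, the difference where src1's byte is non-zero and 0xff where
   src1 has a NUL.  The resulting "syndrome" word is therefore non-zero
   exactly when the two words differ or src1 ends.  A rough C model of
   one 32-bit step follows; it is an illustration only (word_syndrome
   exists only in this comment and is never assembled or compiled):

     static inline unsigned int
     word_syndrome (unsigned int data1, unsigned int data2)
     {
       unsigned int diff = data1 ^ data2;
       unsigned int syndrome = 0;
       for (int i = 0; i < 4; i++)
         {
           unsigned int byte1 = (data1 >> (8 * i)) & 0xff;
           unsigned int byte_diff = (diff >> (8 * i)) & 0xff;
           syndrome |= (byte1 != 0 ? byte_diff : 0xff) << (8 * i);
         }
       return syndrome;
     }
*/
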
        .text
        .p2align 5
.Lstrcmp_start_addr:
#if STRCMP_PRECHECK == 1
.Lfastpath_exit:
        sub     r0, r2, r3
        bx      lr
        nop
#endif
ENTRY (strcmp)
#if STRCMP_PRECHECK == 1
        ldrb    r2, [src1]
        ldrb    r3, [src2]
        cmp     r2, #1
        it      cs
        cmpcs   r2, r3
        bne     .Lfastpath_exit
#endif
        strd    r4, r5, [sp, #-16]!
        cfi_def_cfa_offset (16)
        cfi_offset (r4, -16)
        cfi_offset (r5, -12)
        orr     tmp1, src1, src2
        strd    r6, r7, [sp, #8]
        cfi_offset (r6, -8)
        cfi_offset (r7, -4)
        mvn     const_m1, #0
        lsl     r2, tmp1, #29
        cbz     r2, .Lloop_aligned8

.Lnot_aligned:
        eor     tmp1, src1, src2
        tst     tmp1, #7
        bne     .Lmisaligned8

        /* Deal with mutual misalignment by aligning downwards and then
           masking off the unwanted loaded data to prevent a difference.  */
        and     tmp1, src1, #7
        bic     src1, src1, #7
        and     tmp2, tmp1, #3
        bic     src2, src2, #7
        lsl     tmp2, tmp2, #3  /* Bytes -> bits.  */
        ldrd    data1a, data1b, [src1], #16
        tst     tmp1, #4
        ldrd    data2a, data2b, [src2], #16
        prepare_mask tmp1, tmp2
        apply_mask data1a, tmp1
        apply_mask data2a, tmp1
        beq     .Lstart_realigned8
        apply_mask data1b, tmp1
        mov     data1a, const_m1
        apply_mask data2b, tmp1
        mov     data2a, const_m1
        b       .Lstart_realigned8

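        /* The prepare_mask/apply_mask pair above ORs 0xff into every byte
           that was loaded from before the start of the strings, so those
           bytes compare equal in both words and can never look like a
           terminating NUL.  Roughly, in C (little-endian shown; this is an
           illustration only, with hypothetical variable names, and is not
           part of the assembled code):

             unsigned int mask = 0xffffffffu << unwanted_bits;
             data1 |= ~mask;
             data2 |= ~mask;

           where unwanted_bits is eight times the number of bytes to skip.
           On big-endian the shift direction is reversed (S2HI is LSR
           there), so it is the high-order bytes, which come first in
           memory, that get forced to 0xff.  */
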
        /* Unroll the inner loop by a factor of 2, giving 16 bytes per
           pass.  */
        .p2align 5,,12  /* Don't start in the tail bytes of a cache line.  */
        .p2align 2      /* Always word aligned.  */
.Lloop_aligned8:
        ldrd    data1a, data1b, [src1], #16
        ldrd    data2a, data2b, [src2], #16
.Lstart_realigned8:
        uadd8   syndrome_b, data1a, const_m1    /* Only want GE bits.  */
        eor     syndrome_a, data1a, data2a
        sel     syndrome_a, syndrome_a, const_m1
        cbnz    syndrome_a, .Ldiff_in_a
        uadd8   syndrome_b, data1b, const_m1    /* Only want GE bits.  */
        eor     syndrome_b, data1b, data2b
        sel     syndrome_b, syndrome_b, const_m1
        cbnz    syndrome_b, .Ldiff_in_b

        ldrd    data1a, data1b, [src1, #-8]
        ldrd    data2a, data2b, [src2, #-8]
        uadd8   syndrome_b, data1a, const_m1    /* Only want GE bits.  */
        eor     syndrome_a, data1a, data2a
        sel     syndrome_a, syndrome_a, const_m1
        uadd8   syndrome_b, data1b, const_m1    /* Only want GE bits.  */
        eor     syndrome_b, data1b, data2b
        sel     syndrome_b, syndrome_b, const_m1
        /* Can't use CBZ for backwards branch.  */
        orrs    syndrome_b, syndrome_b, syndrome_a /* Only need if s_a == 0 */
        beq     .Lloop_aligned8

.Ldiff_found:
        cbnz    syndrome_a, .Ldiff_in_a

.Ldiff_in_b:
        strcmp_epilogue_aligned syndrome_b, data1b, data2b 1

.Ldiff_in_a:
        cfi_restore_state
        strcmp_epilogue_aligned syndrome_a, data1a, data2a 1

        cfi_restore_state
.Lmisaligned8:
        tst     tmp1, #3
        bne     .Lmisaligned4
        ands    tmp1, src1, #3
        bne     .Lmutual_align4

        /* Unrolled by a factor of 2, to reduce the number of post-increment
           operations.  */
.Lloop_aligned4:
        ldr     data1, [src1], #8
        ldr     data2, [src2], #8
.Lstart_realigned4:
        uadd8   syndrome, data1, const_m1       /* Only need GE bits.  */
        eor     syndrome, data1, data2
        sel     syndrome, syndrome, const_m1
        cbnz    syndrome, .Laligned4_done
        ldr     data1, [src1, #-4]
        ldr     data2, [src2, #-4]
        uadd8   syndrome, data1, const_m1
        eor     syndrome, data1, data2
        sel     syndrome, syndrome, const_m1
        cmp     syndrome, #0
        beq     .Lloop_aligned4

.Laligned4_done:
        strcmp_epilogue_aligned syndrome, data1, data2, 0

.Lmutual_align4:
        cfi_restore_state
        /* Deal with mutual misalignment by aligning downwards and then
           masking off the unwanted loaded data to prevent a difference.  */
        lsl     tmp1, tmp1, #3  /* Bytes -> bits.  */
        bic     src1, src1, #3
        ldr     data1, [src1], #8
        bic     src2, src2, #3
        ldr     data2, [src2], #8

        prepare_mask tmp1, tmp1
        apply_mask data1, tmp1
        apply_mask data2, tmp1
        b       .Lstart_realigned4

.Lmisaligned4:
        ands    tmp1, src1, #3
        beq     .Lsrc1_aligned
        sub     src2, src2, tmp1
        bic     src1, src1, #3
        lsls    tmp1, tmp1, #31
        ldr     data1, [src1], #4
        beq     .Laligned_m2
        bcs     .Laligned_m1

#if STRCMP_PRECHECK == 0
        ldrb    data2, [src2, #1]
        uxtb    tmp1, data1, ror #BYTE1_OFFSET
        subs    tmp1, tmp1, data2
        bne     .Lmisaligned_exit
        cbz     data2, .Lmisaligned_exit

.Laligned_m2:
        ldrb    data2, [src2, #2]
        uxtb    tmp1, data1, ror #BYTE2_OFFSET
        subs    tmp1, tmp1, data2
        bne     .Lmisaligned_exit
        cbz     data2, .Lmisaligned_exit

.Laligned_m1:
        ldrb    data2, [src2, #3]
        uxtb    tmp1, data1, ror #BYTE3_OFFSET
        subs    tmp1, tmp1, data2
        bne     .Lmisaligned_exit
        add     src2, src2, #4
        cbnz    data2, .Lsrc1_aligned
#else /* STRCMP_PRECHECK */
        /* If we've done the pre-check, then we don't need to check the
           first byte again here.  */
        ldrb    data2, [src2, #2]
        uxtb    tmp1, data1, ror #BYTE2_OFFSET
        subs    tmp1, tmp1, data2
        bne     .Lmisaligned_exit
        cbz     data2, .Lmisaligned_exit

.Laligned_m2:
        ldrb    data2, [src2, #3]
        uxtb    tmp1, data1, ror #BYTE3_OFFSET
        subs    tmp1, tmp1, data2
        bne     .Lmisaligned_exit
        cbnz    data2, .Laligned_m1
#endif

.Lmisaligned_exit:
        mov     result, tmp1
        ldr     r4, [sp], #16
        cfi_remember_state
        cfi_def_cfa_offset (0)
        cfi_restore (r4)
        cfi_restore (r5)
        cfi_restore (r6)
        cfi_restore (r7)
        bx      lr

#if STRCMP_PRECHECK == 1
.Laligned_m1:
        add     src2, src2, #4
#endif
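        /* In the code below src1 is word aligned while src2 sits 1, 2 or 3
           bytes past a word boundary (the .Loverlap3/2/1 cases
           respectively).  Each block keeps whole-word loads on both
           strings by checking src1's word against the relevant pieces of
           two consecutive src2 words.  Conceptually, on little-endian,
           with src2 offset by k bytes (this C fragment is only an
           illustration and uses hypothetical names):

             unsigned int w2 = (cur2 >> (8 * k)) | (next2 << (32 - 8 * k));

           The code does the equivalent check in two halves, so only one
           new word per string is loaded each iteration, and reuses the
           UADD8/SEL syndrome to catch a NUL in src1 at the same time.  */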
.Lsrc1_aligned:
        cfi_restore_state
        /* src1 is word aligned, but src2 has no common alignment
           with it.  */
        ldr     data1, [src1], #4
        lsls    tmp1, src2, #31         /* C=src2[1], Z=src2[0].  */

        bic     src2, src2, #3
        ldr     data2, [src2], #4
        bhi     .Loverlap1              /* C=1, Z=0 => src2[1:0] = 0b11.  */
        bcs     .Loverlap2              /* C=1, Z=1 => src2[1:0] = 0b10.  */

        /* (overlap3) C=0, Z=0 => src2[1:0] = 0b01.  */
.Loverlap3:
        bic     tmp1, data1, #MSB
        uadd8   syndrome, data1, const_m1
        eors    syndrome, tmp1, data2, S2LO #8
        sel     syndrome, syndrome, const_m1
        bne     4f
        cbnz    syndrome, 5f
        ldr     data2, [src2], #4
        eor     tmp1, tmp1, data1
        cmp     tmp1, data2, S2HI #24
        bne     6f
        ldr     data1, [src1], #4
        b       .Loverlap3
4:
        S2LO    data2, data2, #8
        b       .Lstrcmp_tail

5:
        bics    syndrome, syndrome, #MSB
        bne     .Lstrcmp_done_equal

        /* We can only get here if the MSB of data1 contains 0, so
           fast-path the exit.  */
        ldrb    result, [src2]
        ldrd    r4, r5, [sp], #16
        cfi_remember_state
        cfi_def_cfa_offset (0)
        cfi_restore (r4)
        cfi_restore (r5)
        /* R6/7 not used in this sequence.  */
        cfi_restore (r6)
        cfi_restore (r7)
        neg     result, result
        bx      lr

6:
        cfi_restore_state
        S2LO    data1, data1, #24
        and     data2, data2, #LSB
        b       .Lstrcmp_tail

        .p2align 5,,12  /* Ensure at least 3 instructions in cache line.  */
.Loverlap2:
        and     tmp1, data1, const_m1, S2LO #16
        uadd8   syndrome, data1, const_m1
        eors    syndrome, tmp1, data2, S2LO #16
        sel     syndrome, syndrome, const_m1
        bne     4f
        cbnz    syndrome, 5f
        ldr     data2, [src2], #4
        eor     tmp1, tmp1, data1
        cmp     tmp1, data2, S2HI #16
        bne     6f
        ldr     data1, [src1], #4
        b       .Loverlap2
4:
        S2LO    data2, data2, #16
        b       .Lstrcmp_tail
5:
        ands    syndrome, syndrome, const_m1, S2LO #16
        bne     .Lstrcmp_done_equal

        ldrh    data2, [src2]
        S2LO    data1, data1, #16
#ifdef __ARM_BIG_ENDIAN
        lsl     data2, data2, #16
#endif
        b       .Lstrcmp_tail

6:
        S2LO    data1, data1, #16
        and     data2, data2, const_m1, S2LO #16
        b       .Lstrcmp_tail

        .p2align 5,,12  /* Ensure at least 3 instructions in cache line.  */
.Loverlap1:
        and     tmp1, data1, #LSB
        uadd8   syndrome, data1, const_m1
        eors    syndrome, tmp1, data2, S2LO #24
        sel     syndrome, syndrome, const_m1
        bne     4f
        cbnz    syndrome, 5f
        ldr     data2, [src2], #4
        eor     tmp1, tmp1, data1
        cmp     tmp1, data2, S2HI #8
        bne     6f
        ldr     data1, [src1], #4
        b       .Loverlap1
4:
        S2LO    data2, data2, #24
        b       .Lstrcmp_tail
5:
        tst     syndrome, #LSB
        bne     .Lstrcmp_done_equal
        ldr     data2, [src2]
6:
        S2LO    data1, data1, #8
        bic     data2, data2, #MSB
        b       .Lstrcmp_tail

.Lstrcmp_done_equal:
        mov     result, #0
        ldrd    r4, r5, [sp], #16
        cfi_remember_state
        cfi_def_cfa_offset (0)
        cfi_restore (r4)
        cfi_restore (r5)
        /* R6/7 not used in this sequence.  */
        cfi_restore (r6)
        cfi_restore (r7)
        bx      lr

.Lstrcmp_tail:
        cfi_restore_state
#ifndef __ARM_BIG_ENDIAN
        rev     data1, data1
        rev     data2, data2
        /* Now everything looks big-endian...  */
#endif
        uadd8   tmp1, data1, const_m1
        eor     tmp1, data1, data2
        sel     syndrome, tmp1, const_m1
        clz     tmp1, syndrome
        lsl     data1, data1, tmp1
        lsl     data2, data2, tmp1
        lsr     result, data1, #24
        ldrd    r4, r5, [sp], #16
        cfi_def_cfa_offset (0)
        cfi_restore (r4)
        cfi_restore (r5)
        /* R6/7 not used in this sequence.  */
        cfi_restore (r6)
        cfi_restore (r7)
        sub     result, result, data2, lsr #24
        bx      lr
END (strcmp)
libc_hidden_builtin_def (strcmp)
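
/* Note on the result computation in .Lstrcmp_tail and in the
   strcmp_epilogue_aligned macro above: once a non-zero syndrome has been
   found, CLZ on the (possibly byte-reversed) syndrome locates the first
   mismatching or NUL byte in memory order, both data words are shifted
   so that byte can be extracted, and the return value is the difference
   of the two bytes treated as unsigned characters.  A rough C model of
   the variant where the data words are in big-endian byte order (as in
   .Lstrcmp_tail after the REVs, or on big-endian throughout); it is an
   illustration only, result_from_words exists only in this comment, and
   the explicit rounding to a byte boundary is skipped in some of the
   assembly paths, as the comments there explain:

     static int
     result_from_words (unsigned int data1, unsigned int data2,
                        unsigned int syndrome)
     {
       int shift = __builtin_clz (syndrome) & ~7;
       unsigned int b1 = (data1 << shift) >> 24;
       unsigned int b2 = (data2 << shift) >> 24;
       return (int) b1 - (int) b2;
     }
*/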