/* memrchr - find the last occurrence of a byte in a memory block

   Copyright (C) 2015-2021 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD.
 * MTE compatible.
 */

/* void *__memrchr (const void *srcin, int chrin, size_t cntin)

   AAPCS64: x0 = base pointer, w1 = character, x2 = length.
   Returns in x0 a pointer to the last occurrence of (unsigned char) chrin
   in the cntin bytes starting at srcin, or NULL if not found.

   Arguments and results.  */
#define srcin		x0	/* Incoming base pointer.  */
#define chrin		w1	/* Byte to search for (low 8 bits used).  */
#define cntin		x2	/* Number of bytes to search.  */
#define result		x0	/* Return value: match pointer or 0.  */

#define src		x3	/* 16-byte-aligned cursor, walks downward.  */
#define cntrem		x4	/* Bytes remaining below the current chunk.  */
#define synd		x5	/* 64-bit syndrome (4 bits per input byte).  */
#define shift		x6	/* Bit count used to discard past-the-end bytes.  */
#define tmp		x7
#define wtmp		w7
#define end		x8	/* One past the last byte: srcin + cntin.  */
#define endm1		x9	/* Address of the last byte: end - 1.  */

#define vrepchr		v0	/* chrin replicated into all 16 lanes.  */
#define qdata		q1	/* Current 16-byte chunk (q view for ldr).  */
#define vdata		v1	/* Same chunk, vector view.  */
#define vhas_chr	v2	/* Per-byte 0xff/0x00 match mask.  */
#define vrepmask	v3	/* 0xf00f per halfword: keeps 4 bits per byte.  */
#define vend		v4	/* Narrowed 128->64 bit syndrome.  */
#define dend		d4	/* Low 64 bits of vend, for fmov to synd.  */

/*
   Core algorithm:
   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
   per byte.  For even bytes, bits 0-3 are set if the relevant byte matched
   the requested character.  Bits 4-7 must be zero.  Bits 4-7 are set likewise
   for odd bytes so that adjacent bytes can be merged.  Since the bits in the
   syndrome reflect the order in which things occur in the original string,
   counting leading/trailing zeros identifies exactly which byte matched.
   (There is no NUL-byte test here: unlike strrchr, memrchr matches only the
   requested character, and the length bounds the search instead.)  */

ENTRY (__memrchr)
	PTR_ARG (0)
	SIZE_ARG (2)
	add	end, srcin, cntin	/* end = one past the last byte.  */
	sub	endm1, end, 1		/* endm1 = address of the last byte.  */
	bic	src, endm1, 15		/* Align down to the 16-byte chunk
					   containing the last byte.  */
	cbz	cntin, L(nomatch)	/* Zero length: nothing to find.  */
	ld1	{vdata.16b}, [src]	/* Load the (aligned) final chunk.  */
	dup	vrepchr.16b, chrin
	mov	wtmp, 0xf00f
	dup	vrepmask.8h, wtmp	/* Byte pattern 0x0f,0xf0 repeating, so
					   and+addp below yields 4 syndrome
					   bits per input byte.  */
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	neg	shift, end, lsl 2	/* shift = (16 - (end & 15)) * 4 mod 64:
					   4 syndrome bits for each chunk byte
					   at or beyond end.  */
	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
	fmov	synd, dend
	lsl	synd, synd, shift	/* Discard bits for bytes past end; the
					   top nibble now maps to endm1.  */
	cbz	synd, L(start_loop)	/* No match in the final chunk.  */

	clz	synd, synd		/* clz/4 = distance back from endm1.  */
	sub	result, endm1, synd, lsr 2
	cmp	cntin, synd, lsr 2	/* Match must lie within the buffer:
					   distance < cntin.  */
	csel	result, result, xzr, hi	/* Otherwise return NULL.  */
	ret

L(start_loop):
	sub	tmp, end, src		/* Bytes already covered by the first
					   (partial) chunk.  */
	subs	cntrem, cntin, tmp
	b.ls	L(nomatch)		/* Entire buffer was in that chunk.  */

	/* Make sure that it won't overread by a 16-byte chunk: pick the loop
	   entry point so the 32-byte-per-iteration count comes out even.  */
	add	tmp, cntrem, 15
	tbnz	tmp, 4, L(loop32_2)

	.p2align 4
L(loop32):
	ldr	qdata, [src, -16]!	/* Pre-decrement: walk backwards.  */
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
	fmov	synd, dend
	cbnz	synd, L(end)		/* Any nonzero lane => match here.  */

L(loop32_2):
	ldr	qdata, [src, -16]!
	subs	cntrem, cntrem, 32	/* Two chunks consumed per iteration.  */
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	b.ls	L(end)			/* Count exhausted: fall through and let
					   the start-of-buffer check below
					   reject any stale syndrome bits.  */
	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
	fmov	synd, dend
	cbz	synd, L(loop32)
L(end):
	/* Rebuild the precise 4-bits-per-byte syndrome for the chunk that
	   stopped the loop (the loop itself only tested match/no-match).  */
	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
	fmov	synd, dend

	add	tmp, src, 15		/* Address of the chunk's last byte.  */
#ifdef __AARCH64EB__
	rbit	synd, synd		/* Big-endian: reverse so clz still
					   finds the highest-addressed match.  */
#endif
	clz	synd, synd
	sub	tmp, tmp, synd, lsr 2	/* Step back clz/4 bytes to the match.  */
	cmp	tmp, srcin		/* Reject matches before the buffer
					   start (possible on the last, partial
					   chunk).  */
	csel	result, tmp, xzr, hs
	ret

L(nomatch):
	mov	result, 0
	ret

END (__memrchr)
weak_alias (__memrchr, memrchr)
libc_hidden_builtin_def (memrchr)