/* Optimized strlen implementation for PowerPC64/POWER8 using a vectorized
   loop.
   Copyright (C) 2016-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* int [r3] strlen (char *s [r3])

   Strategy: short strings (< 64 bytes) are scanned with GPRs using the
   cmpb instruction (one 0xff byte per matching null byte), exactly like
   the POWER7 implementation.  Once the scan reaches a 64-byte-aligned
   address with no null found, it switches to a VMX loop that checks
   64 bytes per iteration.  */

#ifndef STRLEN
# define STRLEN strlen
#endif
	.machine  power8
ENTRY_TOCLESS (STRLEN, 4)
	CALL_MCOUNT 1
	dcbt	0,r3		/* Prefetch the start of the string.  */
	clrrdi	r4,r3,3		/* Align the address to doubleword boundary.  */
	rlwinm	r6,r3,3,26,28	/* Calculate padding: (s & 7) * 8 bits.  */
	li	r0,0		/* Doubleword with null chars to use
				   with cmpb.  NOTE: r0 stays 0 for the whole
				   function; the VMX loop below relies on it
				   as a zero index for lvx/lvsl.  */
	li	r5,-1		/* MASK = 0xffffffffffffffff.  */
	ld	r12,0(r4)	/* Load doubleword from memory.  */
#ifdef __LITTLE_ENDIAN__
	sld	r5,r5,r6	/* MASK = MASK << padding.  */
#else
	srd	r5,r5,r6	/* MASK = MASK >> padding.  */
#endif
	orc	r9,r12,r5	/* Mask bits that are not part of the string:
				   force the bytes before the string start to
				   non-zero so cmpb cannot match them.  */
	cmpb	r10,r9,r0	/* Check for null bytes in DWORD1.  */
	cmpdi	cr7,r10,0	/* If r10 == 0, no nulls have been found.  */
	bne	cr7,L(done)

	/* For shorter strings (< 64 bytes), we will not use vector registers,
	   as the overhead isn't worth it.  So, let's use GPRs instead.  This
	   will be done the same way as we do in the POWER7 implementation.
	   Let's see if we are aligned to a quadword boundary.  If so, we can
	   jump to the first (non-vectorized) loop.  Otherwise, we have to
	   handle the next DWORD first.  */
	mtcrf	0x01,r4		/* Copy the low nibble of r4 into CR7.  */
	mr	r9,r4		/* r9 tracks the address of the doubleword
				   just past the last one checked.  */
	addi	r9,r9,8
	bt	28,L(align64)	/* CR bit 28 == bit 0x8 of r4.  */

	/* Handle the next 8 bytes so we are aligned to a quadword
	   boundary.  */
	ldu	r5,8(r4)
	cmpb	r10,r5,r0
	cmpdi	cr7,r10,0
	addi	r9,r9,8
	bne	cr7,L(done)

L(align64):
	/* Proceed to the old (POWER7) implementation, checking two doublewords
	   per iteration.  For the first 56 bytes, we will just check for null
	   characters.  After that, we will also check if we are 64-byte aligned
	   so we can jump to the vectorized implementation.  We will unroll
	   these loops to avoid excessive branching.  */
	ld	r6,8(r4)
	ldu	r5,16(r4)
	cmpb	r10,r6,r0
	cmpb	r11,r5,r0
	or	r5,r10,r11	/* Nonzero iff either doubleword had a null.  */
	cmpdi	cr7,r5,0
	addi	r9,r9,16
	bne	cr7,L(dword_zero)

	ld	r6,8(r4)
	ldu	r5,16(r4)
	cmpb	r10,r6,r0
	cmpb	r11,r5,r0
	or	r5,r10,r11
	cmpdi	cr7,r5,0
	addi	r9,r9,16
	bne	cr7,L(dword_zero)

	ld	r6,8(r4)
	ldu	r5,16(r4)
	cmpb	r10,r6,r0
	cmpb	r11,r5,r0
	or	r5,r10,r11
	cmpdi	cr7,r5,0
	addi	r9,r9,16
	bne	cr7,L(dword_zero)

	/* Are we 64-byte aligned? If so, jump to the vectorized loop.
	   Note: aligning to 64-byte will necessarily slow down performance for
	   strings around 64 bytes in length due to the extra comparisons
	   required to check alignment for the vectorized loop.  This is a
	   necessary tradeoff we are willing to take in order to speed up the
	   calculation for larger strings.  */
	andi.	r10,r9,63	/* r9 % 64 == 0 ?  */
	beq	cr0,L(preloop)
	ld	r6,8(r4)
	ldu	r5,16(r4)
	cmpb	r10,r6,r0
	cmpb	r11,r5,r0
	or	r5,r10,r11
	cmpdi	cr7,r5,0
	addi	r9,r9,16
	bne	cr7,L(dword_zero)

	andi.	r10,r9,63
	beq	cr0,L(preloop)
	ld	r6,8(r4)
	ldu	r5,16(r4)
	cmpb	r10,r6,r0
	cmpb	r11,r5,r0
	or	r5,r10,r11
	cmpdi	cr7,r5,0
	addi	r9,r9,16
	bne	cr7,L(dword_zero)

	andi.	r10,r9,63
	beq	cr0,L(preloop)
	ld	r6,8(r4)
	ldu	r5,16(r4)
	cmpb	r10,r6,r0
	cmpb	r11,r5,r0
	or	r5,r10,r11
	cmpdi	cr7,r5,0
	addi	r9,r9,16

	/* At this point, we are necessarily 64-byte aligned.  If no zeroes were
	   found, jump to the vectorized loop.  */
	beq	cr7,L(preloop)

L(dword_zero):
	/* OK, one (or both) of the doublewords contains a null byte.  Check
	   the first doubleword and decrement the address in case the first
	   doubleword really contains a null byte.  */

	cmpdi	cr6,r10,0	/* r10 = cmpb result for the first DWORD.  */
	addi	r4,r4,-8	/* Point r4 back at the first DWORD.  */
	bne	cr6,L(done)

	/* The null byte must be in the second doubleword.  Adjust the address
	   again and move the result of cmpb to r10 so we can calculate the
	   length.  */

	mr	r10,r11
	addi	r4,r4,8

	/* If the null byte was found in the non-vectorized code, compute the
	   final length.  r10 has the output of the cmpb instruction, that is,
	   it contains 0xff in the same position as the null byte in the
	   original doubleword from the string.  Use that to calculate the
	   length.  */
L(done):
#ifdef __LITTLE_ENDIAN__
	addi	r9,r10,-1	/* Form a mask from trailing zeros.  */
	andc	r9,r9,r10
	popcntd	r0,r9		/* Count the bits in the mask.  */
#else
	cntlzd	r0,r10		/* Count leading zeros before the match.  */
#endif
	subf	r5,r3,r4	/* Bytes between string start and current
				   doubleword.  */
	srdi	r0,r0,3		/* Convert leading/trailing zeros to bytes.  */
	add	r3,r5,r0	/* Compute final length.  */
	blr

	/* Vectorized implementation starts here.  */
	.p2align  4
L(preloop):
	/* Set up for the loop.  r9 (from the scalar code above) is the
	   64-byte-aligned address where scanning resumes.  */
	mr	r4,r9
	li	r7,16		/* Load required offsets.  */
	li	r8,32
	li	r9,48
	li	r12,8
	vxor	v0,v0,v0	/* VR with null chars to use with
				   vcmpequb.  */

	/* Main loop to look for the end of the string.  We will read in
	   64-byte chunks.  Align it to 32 bytes and unroll it 3 times to
	   leverage the icache performance.  */
	.p2align  5
L(loop):
	lvx	v1,r4,r0	/* Load 4 quadwords (r0 is still 0).  */
	lvx	v2,r4,r7
	lvx	v3,r4,r8
	lvx	v4,r4,r9
	vminub	v5,v1,v2	/* Compare and merge into one VR for speed:
				   a null byte anywhere survives the min.  */
	vminub	v6,v3,v4
	vminub	v7,v5,v6
	vcmpequb. v7,v7,v0	/* Check for NULLs.  */
	addi	r4,r4,64	/* Adjust address for the next iteration.  */
	bne	cr6,L(vmx_zero)

	lvx	v1,r4,r0	/* Load 4 quadwords.  */
	lvx	v2,r4,r7
	lvx	v3,r4,r8
	lvx	v4,r4,r9
	vminub	v5,v1,v2	/* Compare and merge into one VR for speed.  */
	vminub	v6,v3,v4
	vminub	v7,v5,v6
	vcmpequb. v7,v7,v0	/* Check for NULLs.  */
	addi	r4,r4,64	/* Adjust address for the next iteration.  */
	bne	cr6,L(vmx_zero)

	lvx	v1,r4,r0	/* Load 4 quadwords.  */
	lvx	v2,r4,r7
	lvx	v3,r4,r8
	lvx	v4,r4,r9
	vminub	v5,v1,v2	/* Compare and merge into one VR for speed.  */
	vminub	v6,v3,v4
	vminub	v7,v5,v6
	vcmpequb. v7,v7,v0	/* Check for NULLs.  */
	addi	r4,r4,64	/* Adjust address for the next iteration.  */
	beq	cr6,L(loop)

L(vmx_zero):
	/* OK, we found a null byte.  Let's look for it in the current 64-byte
	   block and mark it in its corresponding VR.  */
	vcmpequb v1,v1,v0
	vcmpequb v2,v2,v0
	vcmpequb v3,v3,v0
	vcmpequb v4,v4,v0

	/* We will now 'compress' the result into a single doubleword, so it
	   can be moved to a GPR for the final calculation.  First, we
	   generate an appropriate mask for vbpermq, so we can permute bits into
	   the first halfword.  */
	vspltisb v10,3
	lvsl	v11,r0,r0	/* r0 == 0, so this loads the identity
				   permute vector {0,1,...,15}.  */
	vslb	v10,v11,v10	/* Byte i becomes i*8: the bit index of the
				   top bit of byte i, as vbpermq expects.  */

	/* Permute the first bit of each byte into bits 48-63.  */
	vbpermq	v1,v1,v10
	vbpermq	v2,v2,v10
	vbpermq	v3,v3,v10
	vbpermq	v4,v4,v10

	/* Shift each component into its correct position for merging.  */
#ifdef __LITTLE_ENDIAN__
	vsldoi	v2,v2,v2,2
	vsldoi	v3,v3,v3,4
	vsldoi	v4,v4,v4,6
#else
	vsldoi	v1,v1,v1,6
	vsldoi	v2,v2,v2,4
	vsldoi	v3,v3,v3,2
#endif

	/* Merge the results and move to a GPR.  */
	vor	v1,v2,v1
	vor	v2,v3,v4
	vor	v4,v1,v2
	mfvrd	r10,v4		/* r10 now has one bit per byte of the
				   64-byte block: 1 iff that byte was null.  */

	/* Adjust address to the beginning of the current 64-byte block.  */
	addi	r4,r4,-64

#ifdef __LITTLE_ENDIAN__
	addi	r9,r10,-1	/* Form a mask from trailing zeros.  */
	andc	r9,r9,r10
	popcntd	r0,r9		/* Count the bits in the mask.  */
#else
	cntlzd	r0,r10		/* Count leading zeros before the match.  */
#endif
	subf	r5,r3,r4	/* Bytes before the current 64-byte block.  */
	add	r3,r5,r0	/* Compute final length.  */
	blr

END (STRLEN)
libc_hidden_builtin_def (strlen)