/* strcmp with unaligned loads
   Copyright (C) 2013-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

#include "sysdep.h"

/* int __strcmp_sse2_unaligned (const char *s1, const char *s2)

   ABI:   SysV AMD64.
   In:    rdi = s1, rsi = s2.
   Out:   eax = (unsigned char) difference at the first differing byte
	  (zero if the strings are equal up to and including NUL).

   Core idiom used throughout: for 16-byte chunks A (from s1) and
   B (from s2),
	X = pcmpeqb (A, B)	; 0xff where bytes equal, 0x00 where not
	X = pminub (A, X)	; forces 0x00 where A has a NUL byte
	X = pcmpeqb (X, 0)	; 0xff exactly where NUL-or-mismatch
	mask = pmovmskb (X)
   so a nonzero mask marks the first position that ends the compare,
   and bsf on the mask finds its byte index.

   xmm7 is kept as an all-zero constant for the main loop; xmm1 is
   zeroed separately for the head-of-string checks.  */

ENTRY ( __strcmp_sse2_unaligned)
	movl	%edi, %eax
	xorl	%edx, %edx
	pxor	%xmm7, %xmm7		/* xmm7 = 0, reused as zero constant.  */
	orl	%esi, %eax
	andl	$4095, %eax		/* max page offset of s1|s2.  */
	/* If either pointer is within 64 bytes of a page end
	   (offset > 4096 - 64), a 64-byte unaligned read could fault;
	   fall back to the byte-at-a-time cross-page path.  */
	cmpl	$4032, %eax
	jg	L(cross_page)
	/* Check the first 16 bytes with unaligned loads.  */
	movdqu	(%rdi), %xmm1
	movdqu	(%rsi), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pminub	%xmm1, %xmm0
	pxor	%xmm1, %xmm1		/* xmm1 = 0 (data no longer needed).  */
	pcmpeqb	%xmm1, %xmm0
	pmovmskb	%xmm0, %eax
	testq	%rax, %rax
	je	L(next_48_bytes)
L(return):
	/* rax = NUL/mismatch mask over bytes 0..63; rdx = byte index.  */
	bsfq	%rax, %rdx
	movzbl	(%rdi, %rdx), %eax
	movzbl	(%rsi, %rdx), %edx
	subl	%edx, %eax		/* return s1[i] - s2[i].  */
	ret

	.p2align 4
L(next_48_bytes):
	/* Check bytes 16..63.  The three 16-bit pmovmskb results are
	   shifted into place (<<16, <<32, <<48) and OR-ed into one
	   64-bit mask in rax; bit i set => position i ends the compare.  */
	movdqu	16(%rdi), %xmm6
	movdqu	16(%rsi), %xmm3
	movdqu	32(%rdi), %xmm5
	pcmpeqb	%xmm6, %xmm3
	movdqu	32(%rsi), %xmm2
	pminub	%xmm6, %xmm3
	pcmpeqb	%xmm1, %xmm3
	movdqu	48(%rdi), %xmm4
	pcmpeqb	%xmm5, %xmm2
	pmovmskb	%xmm3, %edx
	movdqu	48(%rsi), %xmm0
	pminub	%xmm5, %xmm2
	pcmpeqb	%xmm1, %xmm2
	pcmpeqb	%xmm4, %xmm0
	pmovmskb	%xmm2, %eax
	salq	$16, %rdx
	pminub	%xmm4, %xmm0
	pcmpeqb	%xmm1, %xmm0
	salq	$32, %rax
	orq	%rdx, %rax
	pmovmskb	%xmm0, %ecx
	movq	%rcx, %rdx
	salq	$48, %rdx
	orq	%rdx, %rax		/* sets ZF; jne tests this OR.  */
	jne	L(return)
L(main_loop_header):
	/* First 64 bytes matched and are NUL-free.  Set up the aligned
	   main loop: rax = s1 rounded up to the next 64-byte boundary,
	   rdx = s2 advanced by the same delta (so rax loads may use
	   movdqa, rdx loads must stay movdqu).
	   rsi = number of 64-byte iterations until rdx reaches a page
	   boundary ((4096 - page_offset(rdx)) >> 6).  */
	leaq	64(%rdi), %rdx
	movl	$4096, %ecx
	pxor	%xmm9, %xmm9
	andq	$-64, %rdx
	subq	%rdi, %rdx		/* rdx = alignment delta.  */
	leaq	(%rdi, %rdx), %rax	/* rax = 64-aligned s1 cursor.  */
	addq	%rsi, %rdx		/* rdx = matching s2 cursor.  */
	movq	%rdx, %rsi
	andl	$4095, %esi
	subq	%rsi, %rcx
	shrq	$6, %rcx
	movq	%rcx, %rsi		/* rsi = iterations to page end.  */
	jmp	L(loop_start)

	.p2align 4
L(loop):
	addq	$64, %rax
	addq	$64, %rdx
L(loop_start):
	/* Decrement the page countdown; when it hits zero, take the
	   careful page-crossing path before resuming.  */
	testq	%rsi, %rsi
	leaq	-1(%rsi), %rsi		/* lea preserves the flags of test.  */
	je	L(loop_cross_page)
L(back_to_loop):
	/* Compare 64 bytes per iteration.  The four per-chunk
	   NUL-or-mismatch results are folded with pminub into xmm0,
	   so one pcmpeqb/pmovmskb detects an event anywhere in the
	   64 bytes.  */
	movdqu	(%rdx), %xmm0
	movdqu	16(%rdx), %xmm1
	movdqa	(%rax), %xmm2
	movdqa	16(%rax), %xmm3
	pcmpeqb	%xmm2, %xmm0
	movdqu	32(%rdx), %xmm5
	pcmpeqb	%xmm3, %xmm1
	pminub	%xmm2, %xmm0
	movdqu	48(%rdx), %xmm6
	pminub	%xmm3, %xmm1
	movdqa	32(%rax), %xmm2
	pminub	%xmm1, %xmm0
	movdqa	48(%rax), %xmm3
	pcmpeqb	%xmm2, %xmm5
	pcmpeqb	%xmm3, %xmm6
	pminub	%xmm2, %xmm5
	pminub	%xmm3, %xmm6
	pminub	%xmm5, %xmm0		/* fold chunk 2 into combined min.  */
	pminub	%xmm6, %xmm0		/* fold chunk 3 into combined min.  */
	pcmpeqb	%xmm7, %xmm0
	pmovmskb	%xmm0, %ecx
	testl	%ecx, %ecx
	je	L(loop)
	/* Event somewhere in these 64 bytes.  xmm1/xmm5/xmm6 still hold
	   the per-chunk min results for chunks 1..3; redo chunk 0
	   (its min was overwritten by the fold above), then build the
	   full 64-bit mask and locate the first event with bsf.  */
	pcmpeqb	%xmm7, %xmm5
	movdqu	(%rdx), %xmm0
	pcmpeqb	%xmm7, %xmm1
	movdqa	(%rax), %xmm2
	pcmpeqb	%xmm2, %xmm0
	pminub	%xmm2, %xmm0
	pcmpeqb	%xmm7, %xmm6
	pcmpeqb	%xmm7, %xmm0
	pmovmskb	%xmm1, %ecx
	pmovmskb	%xmm5, %r8d
	pmovmskb	%xmm0, %edi
	salq	$16, %rcx
	salq	$32, %r8
	pmovmskb	%xmm6, %esi
	orq	%r8, %rcx
	orq	%rdi, %rcx
	salq	$48, %rsi
	orq	%rsi, %rcx
	bsfq	%rcx, %rcx		/* index of first NUL/mismatch.  */
	movzbl	(%rax, %rcx), %eax
	movzbl	(%rdx, %rcx), %edx
	subl	%edx, %eax
	ret

	.p2align 4
L(loop_cross_page):
	/* rdx is about to cross a page.  Re-check the 64 bytes ending
	   at the 64-byte boundary of rdx: r10 = -(rdx & 63), so
	   (rdx + r10) is 64-aligned and safe for movdqa; the s1 side
	   (rax + r10) uses movdqu.  Bits for bytes already compared by
	   previous iterations are discarded by the shrq below.  */
	xor	%r10, %r10
	movq	%rdx, %r9
	and	$63, %r9		/* r9 = rdx & 63 (misalignment).  */
	subq	%r9, %r10		/* r10 = -(rdx & 63).  */

	movdqa	(%rdx, %r10), %xmm0
	movdqa	16(%rdx, %r10), %xmm1
	movdqu	(%rax, %r10), %xmm2
	movdqu	16(%rax, %r10), %xmm3
	pcmpeqb	%xmm2, %xmm0
	movdqa	32(%rdx, %r10), %xmm5
	pcmpeqb	%xmm3, %xmm1
	pminub	%xmm2, %xmm0
	movdqa	48(%rdx, %r10), %xmm6
	pminub	%xmm3, %xmm1
	movdqu	32(%rax, %r10), %xmm2
	movdqu	48(%rax, %r10), %xmm3
	pcmpeqb	%xmm2, %xmm5
	pcmpeqb	%xmm3, %xmm6
	pminub	%xmm2, %xmm5
	pminub	%xmm3, %xmm6

	pcmpeqb	%xmm7, %xmm0
	pcmpeqb	%xmm7, %xmm1
	pcmpeqb	%xmm7, %xmm5
	pcmpeqb	%xmm7, %xmm6

	/* Assemble the 64-bit mask in rdi, then shift out the r9 low
	   bits that precede the current rdx position.  */
	pmovmskb	%xmm1, %ecx
	pmovmskb	%xmm5, %r8d
	pmovmskb	%xmm0, %edi
	salq	$16, %rcx
	salq	$32, %r8
	pmovmskb	%xmm6, %esi
	orq	%r8, %rdi
	orq	%rcx, %rdi
	salq	$48, %rsi
	orq	%rsi, %rdi
	movq	%r9, %rcx
	movq	$63, %rsi
	shrq	%cl, %rdi		/* drop already-compared bytes.  */
	test	%rdi, %rdi
	je	L(back_to_loop)
	/* Mask bit i now corresponds to byte (rax+i)/(rdx+i).  */
	bsfq	%rdi, %rcx
	movzbl	(%rax, %rcx), %eax
	movzbl	(%rdx, %rcx), %edx
	subl	%edx, %eax
	ret

	.p2align 4
	/* Byte-at-a-time path used when the first 64 bytes may cross a
	   page.  rdx = byte index; al/cl = current bytes of s1/s2.
	   Once 64 bytes compare equal, switch to the main loop.  */
L(cross_page_loop):
	cmpb	%cl, %al
	jne	L(different)
	addq	$1, %rdx
	cmpq	$64, %rdx
	je	L(main_loop_header)
L(cross_page):
	movzbl	(%rdi, %rdx), %eax
	movzbl	(%rsi, %rdx), %ecx
	testb	%al, %al
	jne	L(cross_page_loop)
	/* s1 ended: result is 0 - s2[i] (zero when s2 also ends here).  */
	xorl	%eax, %eax
L(different):
	subl	%ecx, %eax
	ret
END (__strcmp_sse2_unaligned)

#endif