| 1 | /* memcmp - compare memory |
| 2 | |
| 3 | Copyright (C) 2013-2024 Free Software Foundation, Inc. |
| 4 | |
| 5 | This file is part of the GNU C Library. |
| 6 | |
| 7 | The GNU C Library is free software; you can redistribute it and/or |
| 8 | modify it under the terms of the GNU Lesser General Public |
| 9 | License as published by the Free Software Foundation; either |
| 10 | version 2.1 of the License, or (at your option) any later version. |
| 11 | |
| 12 | The GNU C Library is distributed in the hope that it will be useful, |
| 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 15 | Lesser General Public License for more details. |
| 16 | |
| 17 | You should have received a copy of the GNU Lesser General Public |
| 18 | License along with the GNU C Library. If not, see |
| 19 | <https://www.gnu.org/licenses/>. */ |
| 20 | |
| 21 | #include <sysdep.h> |
| 22 | |
| 23 | /* Assumptions: |
| 24 | * |
| 25 | * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses. |
| 26 | */ |
| 27 | |
| 28 | #define src1 x0 /* pointer into the first buffer */ |
| 29 | #define src2 x1 /* pointer into the second buffer */ |
| 30 | #define limit x2 /* byte count; reused as a biased remaining-bytes counter in the loops */ |
| 31 | #define result w0 /* return value; same register number as src1 */ |
| 32 | |
| 33 | #define data1 x3 /* data loaded from the first buffer */ |
| 34 | #define data1w w3 /* 32-bit view of data1 */ |
| 35 | #define data2 x4 /* data loaded from the second buffer */ |
| 36 | #define data2w w4 /* 32-bit view of data2 */ |
| 37 | #define data3 x5 /* second data register for the first buffer */ |
| 38 | #define data3w w5 /* 32-bit view of data3 */ |
| 39 | #define data4 x6 /* second data register for the second buffer */ |
| 40 | #define data4w w6 /* 32-bit view of data4 */ |
| 41 | #define tmp x6 /* scratch; same register as data4, so writing tmp destroys data4 */ |
| 42 | #define src1end x7 /* src1 + limit (one past the end of the first buffer) */ |
| 43 | #define src2end x8 /* src2 + limit (one past the end of the second buffer) */ |
| 44 | |
| 45 | |
| 46 | ENTRY (memcmp) /* Compare limit (x2) bytes at src1 (x0) and src2 (x1). Returns in w0: 0 if equal, negative if the first differing byte of src1 is lower (unsigned compare), positive otherwise. */ |
| 47 | PTR_ARG (0) /* PTR_ARG and SIZE_ARG are not defined in this file (see sysdep.h); presumably they normalise the argument registers, e.g. for ILP32 - confirm there. */ |
| 48 | PTR_ARG (1) |
| 49 | SIZE_ARG (2) |
| 50 | /* Fewer than 16 bytes go to L(less16). Otherwise compare the first 16 bytes of each buffer. */ |
| 51 | cmp limit, 16 |
| 52 | b.lo L(less16) |
| 53 | ldp data1, data3, [src1] |
| 54 | ldp data2, data4, [src2] |
| 55 | ccmp data1, data2, 0, ne /* limit != 16: flags = data1 - data2; limit == 16: NZCV = 0, which reads as "ne" */ |
| 56 | ccmp data3, data4, 0, eq /* only if the first pair matched: flags = data3 - data4; else NZCV = 0 ("ne") */ |
| 57 | b.ne L(return2) /* taken on a mismatch, and always when limit == 16 (L(return2) then yields 0 if all 16 bytes match) */ |
| 58 | /* More than 16 bytes and the first 16 are equal: set up the end pointers and pick a strategy by size. */ |
| 59 | add src1end, src1, limit |
| 60 | add src2end, src2, limit |
| 61 | cmp limit, 32 |
| 62 | b.ls L(last_bytes) /* 17-32 bytes: the final 16 bytes cover everything not yet compared */ |
| 63 | cmp limit, 160 |
| 64 | b.hs L(loop_align) /* 160 bytes or more: use the SIMD loop */ |
| 65 | sub limit, limit, 32 /* bias: limit = (bytes from src1 to the end) - 32 */ |
| 66 | /* Scalar loop, 32 bytes per full iteration. On entry the 16 bytes at src1/src2 have already been compared. */ |
| 67 | .p2align 4 |
| 68 | L(loop32): |
| 69 | ldp data1, data3, [src1, 16] |
| 70 | ldp data2, data4, [src2, 16] |
| 71 | cmp data1, data2 |
| 72 | ccmp data3, data4, 0, eq |
| 73 | b.ne L(return2) |
| 74 | cmp limit, 16 |
| 75 | b.ls L(last_bytes) /* at most 48 bytes from src1 to the end: the final 16 bytes cover the rest */ |
| 76 | /* Second half of the iteration: bytes at offsets 32-47. */ |
| 77 | ldp data1, data3, [src1, 32] |
| 78 | ldp data2, data4, [src2, 32] |
| 79 | cmp data1, data2 |
| 80 | ccmp data3, data4, 0, eq |
| 81 | b.ne L(return2) |
| 82 | add src1, src1, 32 |
| 83 | add src2, src2, 32 |
| 84 | L(last64): /* also entered from L(loop64), with limit = bytes from src1 to the end */ |
| 85 | subs limit, limit, 32 /* restore the -32 bias for the next iteration */ |
| 86 | b.hi L(loop32) /* continue while more than 32 bytes lie between src1 and the end */ |
| 87 | |
| 88 | /* Compare last 1-16 bytes using unaligned access: load the final 16 bytes of each buffer, which may overlap bytes already compared. */ |
| 89 | L(last_bytes): |
| 90 | ldp data1, data3, [src1end, -16] |
| 91 | ldp data2, data4, [src2end, -16] |
| 92 | L(return2): /* in: two 8-byte pairs (data1/data2, then data3/data4); select the first pair that differs */ |
| 93 | cmp data1, data2 |
| 94 | csel data1, data1, data3, ne /* keep data1 if the first pair differs, else take data3 */ |
| 95 | csel data2, data2, data4, ne /* likewise for the second buffer */ |
| 96 | |
| 97 | /* Compare data bytes and set return value to 0, -1 or 1. */ |
| 98 | L(return): |
| 99 | #ifndef __AARCH64EB__ |
| 100 | rev data1, data1 /* little-endian: byte-reverse so the lowest-addressed byte is most significant */ |
| 101 | rev data2, data2 |
| 102 | #endif |
| 103 | cmp data1, data2 |
| 104 | cset result, ne /* 1 if the values differ, 0 if equal */ |
| 105 | cneg result, result, lo /* negate to -1 when data1 is lower (unsigned) */ |
| 106 | ret |
| 107 | /* limit < 16: dispatch on the bits of limit. */ |
| 108 | .p2align 4 |
| 109 | L(less16): |
| 110 | add src1end, src1, limit |
| 111 | add src2end, src2, limit |
| 112 | tbz limit, 3, L(less8) /* bit 3 clear: limit < 8 */ |
| 113 | ldr data1, [src1] /* 8-15 bytes: first 8 and last 8 bytes, possibly overlapping */ |
| 114 | ldr data2, [src2] |
| 115 | ldr data3, [src1end, -8] |
| 116 | ldr data4, [src2end, -8] |
| 117 | b L(return2) |
| 118 | |
| 119 | .p2align 4 |
| 120 | L(less8): |
| 121 | tbz limit, 2, L(less4) /* bit 2 clear: limit < 4 */ |
| 122 | ldr data1w, [src1] /* 4-7 bytes: first 4 and last 4 bytes; 32-bit loads leave the upper halves zero */ |
| 123 | ldr data2w, [src2] |
| 124 | ldr data3w, [src1end, -4] |
| 125 | ldr data4w, [src2end, -4] |
| 126 | b L(return2) |
| 127 | |
| 128 | L(less4): |
| 129 | tbz limit, 1, L(less2) /* bit 1 clear: limit is 0 or 1 */ |
| 130 | ldrh data1w, [src1] /* 2-3 bytes: compare the first halfword */ |
| 131 | ldrh data2w, [src2] |
| 132 | cmp data1w, data2w |
| 133 | b.ne L(return) /* halfwords differ: L(return) orders them */ |
| 134 | L(less2): |
| 135 | mov result, 0 /* overwrites w0, i.e. src1; the load below therefore uses src1end */ |
| 136 | tbz limit, 0, L(return_zero) /* even limit (0 or 2): nothing left to compare, return 0 */ |
| 137 | ldrb data1w, [src1end, -1] /* odd limit (1 or 3): compare the last byte */ |
| 138 | ldrb data2w, [src2end, -1] |
| 139 | sub result, data1w, data2w /* difference of the zero-extended bytes; may lie outside -1..1 */ |
| 140 | L(return_zero): |
| 141 | ret |
| 142 | /* limit >= 160 and the first 16 bytes are equal: compare bytes 16-31, then align src2 for the SIMD loop. */ |
| 143 | L(loop_align): |
| 144 | ldp data1, data3, [src1, 16] |
| 145 | ldp data2, data4, [src2, 16] |
| 146 | cmp data1, data2 |
| 147 | ccmp data3, data4, 0, eq |
| 148 | b.ne L(return2) |
| 149 | |
| 150 | /* Align src2 and adjust src1, src2 and limit. src2 moves up to the next 16-byte boundary (1-16 bytes) and src1 moves by the same amount; the skipped bytes are within the 32 already compared. */ |
| 151 | and tmp, src2, 15 /* tmp = src2 modulo 16 */ |
| 152 | sub tmp, tmp, 16 /* tmp = -(distance to the next 16-byte boundary), in -16..-1 */ |
| 153 | sub src2, src2, tmp |
| 154 | add limit, limit, tmp |
| 155 | sub src1, src1, tmp |
| 156 | sub limit, limit, 64 + 16 /* limit stays biased by -(64 + 16) inside L(loop64); undone after the loop */ |
| 157 | /* SIMD loop: compare the 64 bytes at offsets 16-79 per iteration. XOR the inputs, then reduce with pairwise maxima so each byte of the final 64-bit value summarises one 8-byte chunk. */ |
| 158 | .p2align 4 |
| 159 | L(loop64): |
| 160 | ldr q0, [src1, 16] |
| 161 | ldr q1, [src2, 16] |
| 162 | subs limit, limit, 64 /* flags are consumed by the ccmp at the bottom of the loop */ |
| 163 | ldr q2, [src1, 32] |
| 164 | ldr q3, [src2, 32] |
| 165 | eor v0.16b, v0.16b, v1.16b /* non-zero bytes mark differences */ |
| 166 | eor v1.16b, v2.16b, v3.16b |
| 167 | ldr q2, [src1, 48] |
| 168 | ldr q3, [src2, 48] |
| 169 | umaxp v0.16b, v0.16b, v1.16b |
| 170 | ldr q4, [src1, 64]! /* pre-indexed: src1 += 64, then load */ |
| 171 | ldr q5, [src2, 64]! /* pre-indexed: src2 += 64, then load */ |
| 172 | eor v1.16b, v2.16b, v3.16b |
| 173 | eor v2.16b, v4.16b, v5.16b |
| 174 | umaxp v1.16b, v1.16b, v2.16b |
| 175 | umaxp v0.16b, v0.16b, v1.16b |
| 176 | umaxp v0.16b, v0.16b, v0.16b |
| 177 | fmov tmp, d0 /* low 64 bits: one summary byte per 8-byte chunk, non-zero if that chunk differs */ |
| 178 | ccmp tmp, 0, 0, hi /* if the subs result was > 0 test tmp against 0, else force "ne" to leave the loop */ |
| 179 | b.eq L(loop64) /* loop only while bytes remain and no difference was seen */ |
| 180 | |
| 181 | /* If equal, process last 1-64 bytes using scalar loop. */ |
| 182 | add limit, limit, 64 + 16 /* remove the loop bias: limit = bytes from src1 to the end */ |
| 183 | cbz tmp, L(last64) |
| 184 | |
| 185 | /* Determine the 8-byte aligned offset of the first difference. */ |
| 186 | #ifdef __AARCH64EB__ |
| 187 | rev16 tmp, tmp |
| 188 | #endif |
| 189 | rev tmp, tmp |
| 190 | clz tmp, tmp /* number of zero bits ahead of the first non-zero summary byte */ |
| 191 | bic tmp, tmp, 7 /* round down to a multiple of 8 = byte offset of the first differing chunk */ |
| 192 | sub tmp, tmp, 48 /* chunks were loaded at offsets 16-79 before the pointers advanced by 64, hence -48 */ |
| 193 | ldr data1, [src1, tmp] |
| 194 | ldr data2, [src2, tmp] |
| 195 | #ifndef __AARCH64EB__ |
| 196 | rev data1, data1 /* little-endian: make the lowest-addressed byte most significant */ |
| 197 | rev data2, data2 |
| 198 | #endif |
| 199 | mov result, 1 /* the reloaded chunk is known to differ, so the result is never 0 here */ |
| 200 | cmp data1, data2 |
| 201 | cneg result, result, lo /* -1 when data1 is lower (unsigned) */ |
| 202 | ret |
| 203 | |
| 204 | END (memcmp) |
| 205 | #undef bcmp /* drop any macro definition of bcmp before the name is used below */ |
| 206 | weak_alias (memcmp, bcmp) /* macro defined outside this file; presumably exports bcmp as a weak alias of memcmp - confirm */ |
| 207 | #undef __memcmpeq /* likewise for __memcmpeq */ |
| 208 | strong_alias (memcmp, __memcmpeq) /* macro defined outside this file; presumably exports __memcmpeq as a strong alias of memcmp - confirm */ |
| 209 | libc_hidden_builtin_def (memcmp) /* NOTE(review): presumably provides the library-internal hidden definition of memcmp - confirm against the macro */ |
| 210 | libc_hidden_def (__memcmpeq) /* NOTE(review): presumably the same for __memcmpeq - confirm against the macro */ |
| 211 | |