/* memchr implemented using NEON.
   Copyright (C) 2011-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* For __ARM_NEON__ this file defines memchr.  */
#ifndef __ARM_NEON__
# define memchr __memchr_neon
# undef libc_hidden_builtin_def
# define libc_hidden_builtin_def(a)
#endif
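
/* When NEON is not part of the compile-time baseline, the routine is built
   under the name __memchr_neon and the hidden-builtin alias is suppressed,
   presumably so that a runtime dispatcher can choose between the NEON and
   non-NEON variants.  */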

        .arch   armv7-a
        .fpu    neon


/* Arguments */
#define srcin           r0
#define chrin           r1
#define cntin           r2

/* Retval */
#define result          r0      /* Live range does not overlap with srcin */

/* Working registers */
#define src             r1      /* Live range does not overlap with chrin */
#define tmp             r3
#define synd            r0      /* No overlap with srcin or result */
#define soff            r12

/* Working NEON registers */
#define vrepchr         q0
#define vdata0          q1
#define vdata0_0        d2      /* Lower half of vdata0 */
#define vdata0_1        d3      /* Upper half of vdata0 */
#define vdata1          q2
#define vdata1_0        d4      /* Lower half of vdata1 */
#define vdata1_1        d5      /* Upper half of vdata1 */
#define vrepmask        q3
#define vrepmask0       d6
#define vrepmask1       d7
#define vend            q4
#define vend0           d8
#define vend1           d9

/*
 * Core algorithm:
 *
 * For each 32-byte chunk we calculate a 32-bit syndrome value, with one bit per
 * byte.  Each bit is set if the relevant byte matched the requested character
 * and cleared otherwise.  Since the bits in the syndrome reflect exactly the
 * order in which things occur in the original string, counting the trailing
 * zeros identifies exactly which byte has matched.
 */
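
/* As an illustration only (this is not assembled, and the helper name below
   is hypothetical), the syndrome idea corresponds to this scalar C sketch:

     #include <stdint.h>

     // Bit i of the result is set iff chunk[i] == c, for a 32-byte chunk.
     static uint32_t
     syndrome32 (const unsigned char *chunk, unsigned char c)
     {
       uint32_t synd = 0;
       for (int i = 0; i < 32; i++)
         if (chunk[i] == c)
           synd |= (uint32_t) 1 << i;
       return synd;
     }

   If synd is non-zero, the first match in the chunk sits at
   chunk + __builtin_ctz (synd).  The NEON code below computes the same
   32-bit value with vector compares and pairwise adds, and uses rbit + clz
   in place of a trailing-zero count.  */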

        .thumb_func
        .p2align 4,,15

ENTRY(memchr)
        /* Use a simple loop if there are fewer than 8 bytes to search.  */
        cmp     cntin, #7
        bhi     .Llargestr
        and     chrin, chrin, #0xff

.Lsmallstr:
        subs    cntin, cntin, #1
        blo     .Lnotfound      /* Return not found if reached end.  */
        ldrb    tmp, [srcin], #1
        cmp     tmp, chrin
        bne     .Lsmallstr      /* Loop again if not found.  */
        /* Otherwise fixup address and return.  */
        sub     result, srcin, #1
        bx      lr


.Llargestr:
        vdup.8  vrepchr, chrin  /* Duplicate char across all lanes.  */
        /*
         * Magic constant 0x8040201008040201 allows us to identify which lane
         * matches the requested byte.
         */
        movw    tmp, #0x0201
        movt    tmp, #0x0804
        lsl     soff, tmp, #4
        vmov    vrepmask0, tmp, soff
        vmov    vrepmask1, tmp, soff
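        /* tmp is now 0x08040201 and soff (used here only as a scratch
           register) is tmp << 4 == 0x80402010, so each half of vrepmask
           holds the 64-bit pattern 0x8040201008040201, i.e. the byte
           sequence 0x01, 0x02, 0x04, ..., 0x80 repeated across the vector.
           soff is recomputed as the alignment offset below.  */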
        /* Work with aligned 32-byte chunks */
        bic     src, srcin, #31
        ands    soff, srcin, #31
        beq     .Lloopintro     /* Go straight to main loop if it's aligned.  */

        /*
         * Input string is not 32-byte aligned.  We calculate the syndrome
         * value for the aligned 32-byte block containing the first bytes
         * and mask the irrelevant part.
         */
        vld1.8  {vdata0, vdata1}, [src:256]!
        sub     tmp, soff, #32
        adds    cntin, cntin, tmp
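        /* cntin is rebased by soff - 32, i.e. it now counts the bytes
           remaining past the end of this first aligned block.  The flags
           set by the adds are not clobbered by the NEON instructions or
           by the non-flag-setting shifts below, so the bls further down
           can still tell whether the whole search area ends inside this
           first block.  */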
        vceq.i8 vdata0, vdata0, vrepchr
        vceq.i8 vdata1, vdata1, vrepchr
        vand    vdata0, vdata0, vrepmask
        vand    vdata1, vdata1, vrepmask
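        /* After the compares and masks, each byte of vdata0/vdata1 is either
           zero or a single power-of-two bit identifying its position within
           its 8-byte group.  Because the set bits are all distinct, the
           pairwise adds below act like ORs and fold the 32 bytes into 32
           bits: after the last vpadd, bit i of the low word of vdata0_0 is
           set iff byte i of the chunk matched.  */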
        vpadd.i8 vdata0_0, vdata0_0, vdata0_1
        vpadd.i8 vdata1_0, vdata1_0, vdata1_1
        vpadd.i8 vdata0_0, vdata0_0, vdata1_0
        vpadd.i8 vdata0_0, vdata0_0, vdata0_0
        vmov    synd, vdata0_0[0]

        /* Clear the soff lowest bits (they correspond to bytes before srcin) */
        lsr     synd, synd, soff
        lsl     synd, synd, soff
        /* The first block can also be the last */
        bls     .Lmasklast
        /* Have we found something already? */
        cbnz    synd, .Ltail


.Lloopintro:
        vpush   {vend}
        /* 264/265 correspond to d8/d9 for q4 */
        cfi_adjust_cfa_offset (16)
        cfi_rel_offset (264, 0)
        cfi_rel_offset (265, 8)
        .p2align 3,,7
.Lloop:
        vld1.8  {vdata0, vdata1}, [src:256]!
        subs    cntin, cntin, #32
        vceq.i8 vdata0, vdata0, vrepchr
        vceq.i8 vdata1, vdata1, vrepchr
        /* If we're out of data we finish regardless of the result.  */
        bls     .Lend
        /* Use a fast check for the termination condition.  */
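        /* The two vorrs fold all 32 comparison bytes into vend0, which is
           therefore non-zero iff some byte in this chunk matched.  Moving
           its two words into synd and tmp and ORing them with flags turns
           that into a condition the beq below can test.  */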
        vorr    vend, vdata0, vdata1
        vorr    vend0, vend0, vend1
        vmov    synd, tmp, vend0
        orrs    synd, synd, tmp
        /* We're not out of data, loop if we haven't found the character.  */
        beq     .Lloop

.Lend:
        vpop    {vend}
        cfi_adjust_cfa_offset (-16)
        cfi_restore (264)
        cfi_restore (265)

        /* Termination condition found, let's calculate the syndrome value.  */
        vand    vdata0, vdata0, vrepmask
        vand    vdata1, vdata1, vrepmask
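        /* Same reduction as in the unaligned prologue: fold the two masked
           compare results into a 32-bit syndrome in synd.  */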
        vpadd.i8 vdata0_0, vdata0_0, vdata0_1
        vpadd.i8 vdata1_0, vdata1_0, vdata1_1
        vpadd.i8 vdata0_0, vdata0_0, vdata1_0
        vpadd.i8 vdata0_0, vdata0_0, vdata0_0
        vmov    synd, vdata0_0[0]
        cbz     synd, .Lnotfound
        bhi     .Ltail          /* Uses the condition code from
                                   subs cntin, cntin, #32 above.  */


.Lmasklast:
        /* Clear the (-cntin) upper bits to avoid out-of-bounds matches.  */
        neg     cntin, cntin
        lsl     synd, synd, cntin
        lsrs    synd, synd, cntin
        it      eq
        moveq   src, #0         /* If no match, set src to 0 so the retval is 0.  */
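        /* Fall through to the tail computation.  If the syndrome is zero,
           src and synd are both 0, so below rbit/clz yields 32 and
           result = (0 - 32) + 32 = 0, i.e. NULL.  */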


.Ltail:
        /* Count the trailing zeros using bit reversing */
        rbit    synd, synd
        /* Compensate the last post-increment */
        sub     src, src, #32
        /* Count the leading zeros */
        clz     synd, synd
        /* Compute the potential result and return */
        add     result, src, synd
        bx      lr


.Lnotfound:
        /* Set result to NULL if not found and return */
        mov     result, #0
        bx      lr

END(memchr)
libc_hidden_builtin_def (memchr)