1 | // Copyright (C) 2016 The Qt Company Ltd. |
2 | // SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only |
3 | |
4 | #include "qbytearraymatcher.h" |
5 | |
6 | #include <qtconfiginclude.h> |
7 | #ifndef QT_BOOTSTRAPPED |
8 | # include <private/qtcore-config_p.h> |
9 | #endif |
10 | |
11 | #include <limits.h> |
12 | |
13 | QT_BEGIN_NAMESPACE |
14 | |
15 | static inline void bm_init_skiptable(const uchar *cc, qsizetype len, uchar *skiptable) |
16 | { |
17 | int l = int(qMin(a: len, b: qsizetype(255))); |
18 | memset(s: skiptable, c: l, n: 256*sizeof(uchar)); |
19 | cc += len - l; |
20 | while (l--) |
21 | skiptable[*cc++] = l; |
22 | } |
23 | |
24 | static inline qsizetype bm_find(const uchar *cc, qsizetype l, qsizetype index, const uchar *puc, |
25 | qsizetype pl, const uchar *skiptable) |
26 | { |
27 | if (pl == 0) |
28 | return index > l ? -1 : index; |
29 | const qsizetype pl_minus_one = pl - 1; |
30 | |
31 | const uchar *current = cc + index + pl_minus_one; |
32 | const uchar *end = cc + l; |
33 | while (current < end) { |
34 | qsizetype skip = skiptable[*current]; |
35 | if (!skip) { |
36 | // possible match |
37 | while (skip < pl) { |
38 | if (*(current - skip) != puc[pl_minus_one - skip]) |
39 | break; |
40 | skip++; |
41 | } |
42 | if (skip > pl_minus_one) // we have a match |
43 | return (current - cc) - skip + 1; |
44 | |
45 | // in case we don't have a match we are a bit inefficient as we only skip by one |
46 | // when we have the non matching char in the string. |
47 | if (skiptable[*(current - skip)] == pl) |
48 | skip = pl - skip; |
49 | else |
50 | skip = 1; |
51 | } |
52 | if (current > end - skip) |
53 | break; |
54 | current += skip; |
55 | } |
56 | return -1; // not found |
57 | } |
58 | |
59 | /*! \class QByteArrayMatcher |
60 | \inmodule QtCore |
61 | \brief The QByteArrayMatcher class holds a sequence of bytes that |
62 | can be quickly matched in a byte array. |
63 | |
64 | \ingroup tools |
65 | \ingroup string-processing |
66 | |
67 | This class is useful when you have a sequence of bytes that you |
68 | want to repeatedly match against some byte arrays (perhaps in a |
69 | loop), or when you want to search for the same sequence of bytes |
70 | multiple times in the same byte array. Using a matcher object and |
71 | indexIn() is faster than matching a plain QByteArray with |
72 | QByteArray::indexOf() if repeated matching takes place. This |
73 | class offers no benefit if you are doing one-off byte array |
74 | matches. |
75 | |
76 | Create the QByteArrayMatcher with the QByteArray you want to |
77 | search for. Then call indexIn() on the QByteArray that you want to |
78 | search. |
79 | |
80 | \sa QByteArray, QStringMatcher |
81 | */ |
82 | |
83 | /*! |
84 | Constructs an empty byte array matcher that won't match anything. |
85 | Call setPattern() to give it a pattern to match. |
86 | */ |
87 | QByteArrayMatcher::QByteArrayMatcher() |
88 | : d(nullptr) |
89 | { |
90 | p.p = nullptr; |
91 | p.l = 0; |
92 | memset(s: p.q_skiptable, c: 0, n: sizeof(p.q_skiptable)); |
93 | } |
94 | |
95 | /*! |
96 | Constructs a byte array matcher from \a pattern. \a pattern |
97 | has the given \a length. Call indexIn() to perform a search. |
98 | |
99 | \note the data that \a pattern is referencing must remain valid while this |
100 | object is used. |
101 | */ |
102 | QByteArrayMatcher::QByteArrayMatcher(const char *pattern, qsizetype length) : d(nullptr) |
103 | { |
104 | p.p = reinterpret_cast<const uchar *>(pattern); |
105 | if (length < 0) |
106 | p.l = qstrlen(str: pattern); |
107 | else |
108 | p.l = length; |
109 | bm_init_skiptable(cc: p.p, len: p.l, skiptable: p.q_skiptable); |
110 | } |
111 | |
112 | /*! |
113 | Constructs a byte array matcher that will search for \a pattern. |
114 | Call indexIn() to perform a search. |
115 | */ |
116 | QByteArrayMatcher::QByteArrayMatcher(const QByteArray &pattern) |
117 | : d(nullptr), q_pattern(pattern) |
118 | { |
119 | p.p = reinterpret_cast<const uchar *>(pattern.constData()); |
120 | p.l = pattern.size(); |
121 | bm_init_skiptable(cc: p.p, len: p.l, skiptable: p.q_skiptable); |
122 | } |
123 | |
124 | /*! |
125 | \fn QByteArrayMatcher::QByteArrayMatcher(QByteArrayView pattern) |
126 | \since 6.3 |
127 | \overload |
128 | |
129 | Constructs a byte array matcher that will search for \a pattern. |
130 | Call indexIn() to perform a search. |
131 | |
132 | \note the data that \a pattern is referencing must remain valid while this |
133 | object is used. |
134 | */ |
135 | |
136 | /*! |
137 | Copies the \a other byte array matcher to this byte array matcher. |
138 | */ |
139 | QByteArrayMatcher::QByteArrayMatcher(const QByteArrayMatcher &other) |
140 | : d(nullptr) |
141 | { |
142 | operator=(other); |
143 | } |
144 | |
145 | /*! |
146 | Destroys the byte array matcher. |
147 | */ |
148 | QByteArrayMatcher::~QByteArrayMatcher() |
149 | { |
150 | Q_UNUSED(d); |
151 | } |
152 | |
153 | /*! |
154 | Assigns the \a other byte array matcher to this byte array matcher. |
155 | */ |
156 | QByteArrayMatcher &QByteArrayMatcher::operator=(const QByteArrayMatcher &other) |
157 | { |
158 | q_pattern = other.q_pattern; |
159 | memcpy(dest: &p, src: &other.p, n: sizeof(p)); |
160 | return *this; |
161 | } |
162 | |
163 | /*! |
164 | Sets the byte array that this byte array matcher will search for |
165 | to \a pattern. |
166 | |
167 | \sa pattern(), indexIn() |
168 | */ |
169 | void QByteArrayMatcher::setPattern(const QByteArray &pattern) |
170 | { |
171 | q_pattern = pattern; |
172 | p.p = reinterpret_cast<const uchar *>(pattern.constData()); |
173 | p.l = pattern.size(); |
174 | bm_init_skiptable(cc: p.p, len: p.l, skiptable: p.q_skiptable); |
175 | } |
176 | |
177 | /*! |
178 | Searches the char string \a str, which has length \a len, from |
179 | byte position \a from (default 0, i.e. from the first byte), for |
180 | the byte array pattern() that was set in the constructor or in the |
181 | most recent call to setPattern(). Returns the position where the |
182 | pattern() matched in \a str, or -1 if no match was found. |
183 | */ |
184 | qsizetype QByteArrayMatcher::indexIn(const char *str, qsizetype len, qsizetype from) const |
185 | { |
186 | if (from < 0) |
187 | from = 0; |
188 | return bm_find(cc: reinterpret_cast<const uchar *>(str), l: len, index: from, |
189 | puc: p.p, pl: p.l, skiptable: p.q_skiptable); |
190 | } |
191 | |
192 | /*! |
193 | \fn qsizetype QByteArrayMatcher::indexIn(QByteArrayView data, qsizetype from) const |
194 | \since 6.3 |
195 | \overload |
196 | |
197 | Searches the byte array \a data, from byte position \a from (default |
198 | 0, i.e. from the first byte), for the byte array pattern() that |
199 | was set in the constructor or in the most recent call to |
200 | setPattern(). Returns the position where the pattern() matched in |
201 | \a data, or -1 if no match was found. |
202 | */ |
203 | qsizetype QByteArrayMatcher::indexIn(QByteArrayView data, qsizetype from) const |
204 | { |
205 | if (from < 0) |
206 | from = 0; |
207 | return bm_find(cc: reinterpret_cast<const uchar *>(data.data()), l: data.size(), index: from, |
208 | puc: p.p, pl: p.l, skiptable: p.q_skiptable); |
209 | } |
210 | |
211 | /*! |
212 | \fn QByteArray QByteArrayMatcher::pattern() const |
213 | |
214 | Returns the byte array pattern that this byte array matcher will |
215 | search for. |
216 | |
217 | \sa setPattern() |
218 | */ |
219 | |
220 | /*! |
221 | \internal |
222 | */ |
223 | Q_NEVER_INLINE |
224 | static qsizetype qFindByteArrayBoyerMoore( |
225 | const char *haystack, qsizetype haystackLen, qsizetype haystackOffset, |
226 | const char *needle, qsizetype needleLen) |
227 | { |
228 | uchar skiptable[256]; |
229 | bm_init_skiptable(cc: (const uchar *)needle, len: needleLen, skiptable); |
230 | if (haystackOffset < 0) |
231 | haystackOffset = 0; |
232 | return bm_find(cc: (const uchar *)haystack, l: haystackLen, index: haystackOffset, |
233 | puc: (const uchar *)needle, pl: needleLen, skiptable); |
234 | } |
235 | |
236 | /*! |
237 | \internal |
238 | */ |
239 | static qsizetype qFindByteArray(const char *haystack0, qsizetype l, qsizetype from, |
240 | const char *needle, qsizetype sl); |
241 | qsizetype QtPrivate::findByteArray(QByteArrayView haystack, qsizetype from, QByteArrayView needle) noexcept |
242 | { |
243 | const auto haystack0 = haystack.data(); |
244 | const auto l = haystack.size(); |
245 | const auto sl = needle.size(); |
246 | #if !QT_CONFIG(memmem) |
247 | if (sl == 1) |
248 | return findByteArray(haystack, from, needle.front()); |
249 | #endif |
250 | |
251 | if (from < 0) |
252 | from += l; |
253 | if (std::size_t(sl + from) > std::size_t(l)) |
254 | return -1; |
255 | if (!sl) |
256 | return from; |
257 | if (!l) |
258 | return -1; |
259 | |
260 | #if QT_CONFIG(memmem) |
261 | auto where = memmem(haystack: haystack0 + from, haystacklen: l - from, needle: needle.data(), needlelen: sl); |
262 | return where ? static_cast<const char *>(where) - haystack0 : -1; |
263 | #endif |
264 | |
265 | /* |
266 | We use the Boyer-Moore algorithm in cases where the overhead |
267 | for the skip table should pay off, otherwise we use a simple |
268 | hash function. |
269 | */ |
270 | if (l > 500 && sl > 5) |
271 | return qFindByteArrayBoyerMoore(haystack: haystack0, haystackLen: l, haystackOffset: from, needle: needle.data(), needleLen: sl); |
272 | return qFindByteArray(haystack0, l, from, needle: needle.data(), sl); |
273 | } |
274 | |
275 | qsizetype qFindByteArray(const char *haystack0, qsizetype l, qsizetype from, |
276 | const char *needle, qsizetype sl) |
277 | { |
278 | /* |
279 | We use some hashing for efficiency's sake. Instead of |
280 | comparing strings, we compare the hash value of str with that |
281 | of a part of this QByteArray. Only if that matches, we call memcmp(). |
282 | */ |
283 | const char *haystack = haystack0 + from; |
284 | const char *end = haystack0 + (l - sl); |
285 | const qregisteruint sl_minus_1 = sl - 1; |
286 | qregisteruint hashNeedle = 0, hashHaystack = 0; |
287 | qsizetype idx; |
288 | for (idx = 0; idx < sl; ++idx) { |
289 | hashNeedle = ((hashNeedle<<1) + needle[idx]); |
290 | hashHaystack = ((hashHaystack<<1) + haystack[idx]); |
291 | } |
292 | hashHaystack -= *(haystack + sl_minus_1); |
293 | |
294 | while (haystack <= end) { |
295 | hashHaystack += *(haystack + sl_minus_1); |
296 | if (hashHaystack == hashNeedle && *needle == *haystack |
297 | && memcmp(s1: needle, s2: haystack, n: sl) == 0) |
298 | return haystack - haystack0; |
299 | |
300 | if (sl_minus_1 < sizeof(sl_minus_1) * CHAR_BIT) |
301 | hashHaystack -= qregisteruint(*haystack) << sl_minus_1; |
302 | hashHaystack <<= 1; |
303 | ++haystack; |
304 | } |
305 | return -1; |
306 | } |
307 | |
308 | /*! |
309 | \class QStaticByteArrayMatcherBase |
310 | \since 5.9 |
311 | \internal |
312 | \brief Non-template base class of QStaticByteArrayMatcher. |
313 | */ |
314 | |
315 | /*! |
316 | \class QStaticByteArrayMatcher |
317 | \since 5.9 |
318 | \inmodule QtCore |
319 | \brief The QStaticByteArrayMatcher class is a compile-time version of QByteArrayMatcher. |
320 | |
321 | \ingroup tools |
322 | \ingroup string-processing |
323 | |
324 | This class is useful when you have a sequence of bytes that you |
325 | want to repeatedly match against some byte arrays (perhaps in a |
326 | loop), or when you want to search for the same sequence of bytes |
327 | multiple times in the same byte array. Using a matcher object and |
328 | indexIn() is faster than matching a plain QByteArray with |
329 | QByteArray::indexOf(), in particular if repeated matching takes place. |
330 | |
331 | Unlike QByteArrayMatcher, this class calculates the internal |
332 | representation at \e{compile-time}, so it can |
333 | even benefit if you are doing one-off byte array matches. |
334 | |
335 | Create the QStaticByteArrayMatcher by calling qMakeStaticByteArrayMatcher(), |
336 | passing it the C string literal you want to search for. Store the return |
337 | value of that function in a \c{static const auto} variable, so you don't need |
338 | to pass the \c{N} template parameter explicitly: |
339 | |
340 | \snippet code/src_corelib_text_qbytearraymatcher.cpp 0 |
341 | |
342 | Then call indexIn() on the QByteArray in which you want to search, just like |
343 | with QByteArrayMatcher. |
344 | |
345 | Since this class is designed to do all the up-front calculations at compile-time, |
346 | it does not offer a setPattern() method. |
347 | |
348 | \sa QByteArrayMatcher, QStringMatcher |
349 | */ |
350 | |
351 | /*! |
352 | \fn template <size_t N> qsizetype QStaticByteArrayMatcher<N>::indexIn(const char *haystack, qsizetype hlen, qsizetype from = 0) const |
353 | |
354 | Searches the char string \a haystack, which has length \a hlen, from |
355 | byte position \a from (default 0, i.e. from the first byte), for |
356 | the byte array pattern() that was set in the constructor. |
357 | |
358 | Returns the position where the pattern() matched in \a haystack, or -1 if no match was found. |
359 | */ |
360 | |
361 | /*! |
362 | \fn template <size_t N> qsizetype QStaticByteArrayMatcher<N>::indexIn(const QByteArray &haystack, qsizetype from = 0) const |
363 | |
364 | Searches the char string \a haystack, from byte position \a from |
365 | (default 0, i.e. from the first byte), for the byte array pattern() |
366 | that was set in the constructor. |
367 | |
368 | Returns the position where the pattern() matched in \a haystack, or -1 if no match was found. |
369 | */ |
370 | |
371 | /*! |
372 | \fn template <size_t N> QByteArray QStaticByteArrayMatcher<N>::pattern() const |
373 | |
374 | Returns the byte array pattern that this byte array matcher will |
375 | search for. |
376 | |
377 | \sa QByteArrayMatcher::setPattern() |
378 | */ |
379 | |
380 | /*! |
381 | \internal |
382 | */ |
383 | qsizetype QStaticByteArrayMatcherBase::indexOfIn(const char *needle, size_t nlen, const char *haystack, qsizetype hlen, qsizetype from) const noexcept |
384 | { |
385 | if (from < 0) |
386 | from = 0; |
387 | return bm_find(cc: reinterpret_cast<const uchar *>(haystack), l: hlen, index: from, |
388 | puc: reinterpret_cast<const uchar *>(needle), pl: nlen, skiptable: m_skiptable.data); |
389 | } |
390 | |
391 | /*! |
392 | \fn template <size_t N> QStaticByteArrayMatcher<N>::QStaticByteArrayMatcher(const char (&pattern)[N]) |
393 | \internal |
394 | */ |
395 | |
396 | /*! |
397 | \fn template <size_t N> QStaticByteArrayMatcher qMakeStaticByteArrayMatcher(const char (&pattern)[N]) |
398 | \since 5.9 |
399 | \relates QStaticByteArrayMatcher |
400 | |
401 | Return a QStaticByteArrayMatcher with the correct \c{N} determined |
402 | automatically from the \a pattern passed. |
403 | |
404 | To take full advantage of this function, assign the result to an |
405 | \c{auto} variable: |
406 | |
407 | \snippet code/src_corelib_text_qbytearraymatcher.cpp 1 |
408 | */ |
409 | |
410 | QT_END_NAMESPACE |
411 | |