1 | // Copyright (C) 2016 The Qt Company Ltd. |
2 | // SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only |
3 | |
4 | #include "qbytearraymatcher.h" |
5 | |
6 | #include <limits.h> |
7 | |
8 | QT_BEGIN_NAMESPACE |
9 | |
10 | static inline void bm_init_skiptable(const uchar *cc, qsizetype len, uchar *skiptable) |
11 | { |
12 | int l = int(qMin(a: len, b: qsizetype(255))); |
13 | memset(s: skiptable, c: l, n: 256*sizeof(uchar)); |
14 | cc += len - l; |
15 | while (l--) |
16 | skiptable[*cc++] = l; |
17 | } |
18 | |
19 | static inline qsizetype bm_find(const uchar *cc, qsizetype l, qsizetype index, const uchar *puc, |
20 | qsizetype pl, const uchar *skiptable) |
21 | { |
22 | if (pl == 0) |
23 | return index > l ? -1 : index; |
24 | const qsizetype pl_minus_one = pl - 1; |
25 | |
26 | const uchar *current = cc + index + pl_minus_one; |
27 | const uchar *end = cc + l; |
28 | while (current < end) { |
29 | qsizetype skip = skiptable[*current]; |
30 | if (!skip) { |
31 | // possible match |
32 | while (skip < pl) { |
33 | if (*(current - skip) != puc[pl_minus_one - skip]) |
34 | break; |
35 | skip++; |
36 | } |
37 | if (skip > pl_minus_one) // we have a match |
38 | return (current - cc) - skip + 1; |
39 | |
40 | // in case we don't have a match we are a bit inefficient as we only skip by one |
41 | // when we have the non matching char in the string. |
42 | if (skiptable[*(current - skip)] == pl) |
43 | skip = pl - skip; |
44 | else |
45 | skip = 1; |
46 | } |
47 | if (current > end - skip) |
48 | break; |
49 | current += skip; |
50 | } |
51 | return -1; // not found |
52 | } |
53 | |
54 | /*! \class QByteArrayMatcher |
55 | \inmodule QtCore |
56 | \brief The QByteArrayMatcher class holds a sequence of bytes that |
57 | can be quickly matched in a byte array. |
58 | |
59 | \ingroup tools |
60 | \ingroup string-processing |
61 | |
62 | This class is useful when you have a sequence of bytes that you |
63 | want to repeatedly match against some byte arrays (perhaps in a |
64 | loop), or when you want to search for the same sequence of bytes |
65 | multiple times in the same byte array. Using a matcher object and |
66 | indexIn() is faster than matching a plain QByteArray with |
67 | QByteArray::indexOf() if repeated matching takes place. This |
68 | class offers no benefit if you are doing one-off byte array |
69 | matches. |
70 | |
71 | Create the QByteArrayMatcher with the QByteArray you want to |
72 | search for. Then call indexIn() on the QByteArray that you want to |
73 | search. |
74 | |
75 | \sa QByteArray, QStringMatcher |
76 | */ |
77 | |
78 | /*! |
79 | Constructs an empty byte array matcher that won't match anything. |
80 | Call setPattern() to give it a pattern to match. |
81 | */ |
82 | QByteArrayMatcher::QByteArrayMatcher() |
83 | : d(nullptr) |
84 | { |
85 | p.p = nullptr; |
86 | p.l = 0; |
87 | memset(s: p.q_skiptable, c: 0, n: sizeof(p.q_skiptable)); |
88 | } |
89 | |
90 | /*! |
91 | Constructs a byte array matcher from \a pattern. \a pattern |
92 | has the given \a length. Call indexIn() to perform a search. |
93 | |
94 | \note the data that \a pattern is referencing must remain valid while this |
95 | object is used. |
96 | */ |
97 | QByteArrayMatcher::QByteArrayMatcher(const char *pattern, qsizetype length) : d(nullptr) |
98 | { |
99 | p.p = reinterpret_cast<const uchar *>(pattern); |
100 | if (length < 0) |
101 | p.l = qstrlen(str: pattern); |
102 | else |
103 | p.l = length; |
104 | bm_init_skiptable(cc: p.p, len: p.l, skiptable: p.q_skiptable); |
105 | } |
106 | |
107 | /*! |
108 | Constructs a byte array matcher that will search for \a pattern. |
109 | Call indexIn() to perform a search. |
110 | */ |
111 | QByteArrayMatcher::QByteArrayMatcher(const QByteArray &pattern) |
112 | : d(nullptr), q_pattern(pattern) |
113 | { |
114 | p.p = reinterpret_cast<const uchar *>(pattern.constData()); |
115 | p.l = pattern.size(); |
116 | bm_init_skiptable(cc: p.p, len: p.l, skiptable: p.q_skiptable); |
117 | } |
118 | |
119 | /*! |
120 | \fn QByteArrayMatcher::QByteArrayMatcher(QByteArrayView pattern) |
121 | \since 6.3 |
122 | \overload |
123 | |
124 | Constructs a byte array matcher that will search for \a pattern. |
125 | Call indexIn() to perform a search. |
126 | |
127 | \note the data that \a pattern is referencing must remain valid while this |
128 | object is used. |
129 | */ |
130 | |
131 | /*! |
132 | Copies the \a other byte array matcher to this byte array matcher. |
133 | */ |
134 | QByteArrayMatcher::QByteArrayMatcher(const QByteArrayMatcher &other) |
135 | : d(nullptr) |
136 | { |
137 | operator=(other); |
138 | } |
139 | |
140 | /*! |
141 | Destroys the byte array matcher. |
142 | */ |
143 | QByteArrayMatcher::~QByteArrayMatcher() |
144 | { |
145 | Q_UNUSED(d); |
146 | } |
147 | |
148 | /*! |
149 | Assigns the \a other byte array matcher to this byte array matcher. |
150 | */ |
151 | QByteArrayMatcher &QByteArrayMatcher::operator=(const QByteArrayMatcher &other) |
152 | { |
153 | q_pattern = other.q_pattern; |
154 | memcpy(dest: &p, src: &other.p, n: sizeof(p)); |
155 | return *this; |
156 | } |
157 | |
158 | /*! |
159 | Sets the byte array that this byte array matcher will search for |
160 | to \a pattern. |
161 | |
162 | \sa pattern(), indexIn() |
163 | */ |
164 | void QByteArrayMatcher::setPattern(const QByteArray &pattern) |
165 | { |
166 | q_pattern = pattern; |
167 | p.p = reinterpret_cast<const uchar *>(pattern.constData()); |
168 | p.l = pattern.size(); |
169 | bm_init_skiptable(cc: p.p, len: p.l, skiptable: p.q_skiptable); |
170 | } |
171 | |
172 | /*! |
173 | Searches the char string \a str, which has length \a len, from |
174 | byte position \a from (default 0, i.e. from the first byte), for |
175 | the byte array pattern() that was set in the constructor or in the |
176 | most recent call to setPattern(). Returns the position where the |
177 | pattern() matched in \a str, or -1 if no match was found. |
178 | */ |
179 | qsizetype QByteArrayMatcher::indexIn(const char *str, qsizetype len, qsizetype from) const |
180 | { |
181 | if (from < 0) |
182 | from = 0; |
183 | return bm_find(cc: reinterpret_cast<const uchar *>(str), l: len, index: from, |
184 | puc: p.p, pl: p.l, skiptable: p.q_skiptable); |
185 | } |
186 | |
187 | /*! |
188 | \fn qsizetype QByteArrayMatcher::indexIn(QByteArrayView data, qsizetype from) const |
189 | \since 6.3 |
190 | \overload |
191 | |
192 | Searches the byte array \a data, from byte position \a from (default |
193 | 0, i.e. from the first byte), for the byte array pattern() that |
194 | was set in the constructor or in the most recent call to |
195 | setPattern(). Returns the position where the pattern() matched in |
196 | \a data, or -1 if no match was found. |
197 | */ |
198 | qsizetype QByteArrayMatcher::indexIn(QByteArrayView data, qsizetype from) const |
199 | { |
200 | if (from < 0) |
201 | from = 0; |
202 | return bm_find(cc: reinterpret_cast<const uchar *>(data.data()), l: data.size(), index: from, |
203 | puc: p.p, pl: p.l, skiptable: p.q_skiptable); |
204 | } |
205 | |
206 | /*! |
207 | \fn QByteArray QByteArrayMatcher::pattern() const |
208 | |
209 | Returns the byte array pattern that this byte array matcher will |
210 | search for. |
211 | |
212 | \sa setPattern() |
213 | */ |
214 | |
215 | |
216 | static qsizetype findChar(const char *str, qsizetype len, char ch, qsizetype from) |
217 | { |
218 | const uchar *s = (const uchar *)str; |
219 | uchar c = (uchar)ch; |
220 | if (from < 0) |
221 | from = qMax(a: from + len, b: qsizetype(0)); |
222 | if (from < len) { |
223 | const uchar *n = s + from - 1; |
224 | const uchar *e = s + len; |
225 | while (++n != e) |
226 | if (*n == c) |
227 | return n - s; |
228 | } |
229 | return -1; |
230 | } |
231 | |
232 | /*! |
233 | \internal |
234 | */ |
235 | static qsizetype qFindByteArrayBoyerMoore( |
236 | const char *haystack, qsizetype haystackLen, qsizetype haystackOffset, |
237 | const char *needle, qsizetype needleLen) |
238 | { |
239 | uchar skiptable[256]; |
240 | bm_init_skiptable(cc: (const uchar *)needle, len: needleLen, skiptable); |
241 | if (haystackOffset < 0) |
242 | haystackOffset = 0; |
243 | return bm_find(cc: (const uchar *)haystack, l: haystackLen, index: haystackOffset, |
244 | puc: (const uchar *)needle, pl: needleLen, skiptable); |
245 | } |
246 | |
247 | #define REHASH(a) \ |
248 | if (sl_minus_1 < sizeof(std::size_t) * CHAR_BIT) \ |
249 | hashHaystack -= std::size_t(a) << sl_minus_1; \ |
250 | hashHaystack <<= 1 |
251 | |
252 | /*! |
253 | \internal |
254 | */ |
255 | qsizetype qFindByteArray( |
256 | const char *haystack0, qsizetype haystackLen, qsizetype from, |
257 | const char *needle, qsizetype needleLen) |
258 | { |
259 | const auto l = haystackLen; |
260 | const auto sl = needleLen; |
261 | if (from < 0) |
262 | from += l; |
263 | if (std::size_t(sl + from) > std::size_t(l)) |
264 | return -1; |
265 | if (!sl) |
266 | return from; |
267 | if (!l) |
268 | return -1; |
269 | |
270 | if (sl == 1) |
271 | return findChar(str: haystack0, len: haystackLen, ch: needle[0], from); |
272 | |
273 | /* |
274 | We use the Boyer-Moore algorithm in cases where the overhead |
275 | for the skip table should pay off, otherwise we use a simple |
276 | hash function. |
277 | */ |
278 | if (l > 500 && sl > 5) |
279 | return qFindByteArrayBoyerMoore(haystack: haystack0, haystackLen, haystackOffset: from, |
280 | needle, needleLen); |
281 | |
282 | /* |
283 | We use some hashing for efficiency's sake. Instead of |
284 | comparing strings, we compare the hash value of str with that |
285 | of a part of this QString. Only if that matches, we call memcmp(). |
286 | */ |
287 | const char *haystack = haystack0 + from; |
288 | const char *end = haystack0 + (l - sl); |
289 | const auto sl_minus_1 = std::size_t(sl - 1); |
290 | std::size_t hashNeedle = 0, hashHaystack = 0; |
291 | qsizetype idx; |
292 | for (idx = 0; idx < sl; ++idx) { |
293 | hashNeedle = ((hashNeedle<<1) + needle[idx]); |
294 | hashHaystack = ((hashHaystack<<1) + haystack[idx]); |
295 | } |
296 | hashHaystack -= *(haystack + sl_minus_1); |
297 | |
298 | while (haystack <= end) { |
299 | hashHaystack += *(haystack + sl_minus_1); |
300 | if (hashHaystack == hashNeedle && *needle == *haystack |
301 | && memcmp(s1: needle, s2: haystack, n: sl) == 0) |
302 | return haystack - haystack0; |
303 | |
304 | REHASH(*haystack); |
305 | ++haystack; |
306 | } |
307 | return -1; |
308 | } |
309 | |
310 | /*! |
311 | \class QStaticByteArrayMatcherBase |
312 | \since 5.9 |
313 | \internal |
314 | \brief Non-template base class of QStaticByteArrayMatcher. |
315 | */ |
316 | |
317 | /*! |
318 | \class QStaticByteArrayMatcher |
319 | \since 5.9 |
320 | \inmodule QtCore |
321 | \brief The QStaticByteArrayMatcher class is a compile-time version of QByteArrayMatcher. |
322 | |
323 | \ingroup tools |
324 | \ingroup string-processing |
325 | |
326 | This class is useful when you have a sequence of bytes that you |
327 | want to repeatedly match against some byte arrays (perhaps in a |
328 | loop), or when you want to search for the same sequence of bytes |
329 | multiple times in the same byte array. Using a matcher object and |
330 | indexIn() is faster than matching a plain QByteArray with |
331 | QByteArray::indexOf(), in particular if repeated matching takes place. |
332 | |
333 | Unlike QByteArrayMatcher, this class calculates the internal |
334 | representation at \e{compile-time}, so it can |
335 | even benefit if you are doing one-off byte array matches. |
336 | |
337 | Create the QStaticByteArrayMatcher by calling qMakeStaticByteArrayMatcher(), |
338 | passing it the C string literal you want to search for. Store the return |
339 | value of that function in a \c{static const auto} variable, so you don't need |
340 | to pass the \c{N} template parameter explicitly: |
341 | |
342 | \snippet code/src_corelib_text_qbytearraymatcher.cpp 0 |
343 | |
344 | Then call indexIn() on the QByteArray in which you want to search, just like |
345 | with QByteArrayMatcher. |
346 | |
347 | Since this class is designed to do all the up-front calculations at compile-time, |
348 | it does not offer a setPattern() method. |
349 | |
350 | \sa QByteArrayMatcher, QStringMatcher |
351 | */ |
352 | |
353 | /*! |
354 | \fn template <size_t N> qsizetype QStaticByteArrayMatcher<N>::indexIn(const char *haystack, qsizetype hlen, qsizetype from = 0) const |
355 | |
356 | Searches the char string \a haystack, which has length \a hlen, from |
357 | byte position \a from (default 0, i.e. from the first byte), for |
358 | the byte array pattern() that was set in the constructor. |
359 | |
360 | Returns the position where the pattern() matched in \a haystack, or -1 if no match was found. |
361 | */ |
362 | |
363 | /*! |
364 | \fn template <size_t N> qsizetype QStaticByteArrayMatcher<N>::indexIn(const QByteArray &haystack, qsizetype from = 0) const |
365 | |
366 | Searches the char string \a haystack, from byte position \a from |
367 | (default 0, i.e. from the first byte), for the byte array pattern() |
368 | that was set in the constructor. |
369 | |
370 | Returns the position where the pattern() matched in \a haystack, or -1 if no match was found. |
371 | */ |
372 | |
373 | /*! |
374 | \fn template <size_t N> QByteArray QStaticByteArrayMatcher<N>::pattern() const |
375 | |
376 | Returns the byte array pattern that this byte array matcher will |
377 | search for. |
378 | |
379 | \sa QByteArrayMatcher::setPattern() |
380 | */ |
381 | |
382 | /*! |
383 | \internal |
384 | */ |
385 | qsizetype QStaticByteArrayMatcherBase::indexOfIn(const char *needle, size_t nlen, const char *haystack, qsizetype hlen, qsizetype from) const noexcept |
386 | { |
387 | if (from < 0) |
388 | from = 0; |
389 | return bm_find(cc: reinterpret_cast<const uchar *>(haystack), l: hlen, index: from, |
390 | puc: reinterpret_cast<const uchar *>(needle), pl: nlen, skiptable: m_skiptable.data); |
391 | } |
392 | |
393 | /*! |
394 | \fn template <size_t N> QStaticByteArrayMatcher<N>::QStaticByteArrayMatcher(const char (&pattern)[N]) |
395 | \internal |
396 | */ |
397 | |
398 | /*! |
399 | \fn template <size_t N> QStaticByteArrayMatcher qMakeStaticByteArrayMatcher(const char (&pattern)[N]) |
400 | \since 5.9 |
401 | \relates QStaticByteArrayMatcher |
402 | |
403 | Return a QStaticByteArrayMatcher with the correct \c{N} determined |
404 | automatically from the \a pattern passed. |
405 | |
406 | To take full advantage of this function, assign the result to an |
407 | \c{auto} variable: |
408 | |
409 | \snippet code/src_corelib_text_qbytearraymatcher.cpp 1 |
410 | */ |
411 | |
412 | |
413 | QT_END_NAMESPACE |
414 | |
415 | #undef REHASH |
416 | |