1// Copyright (C) 2020 Klarälvdalens Datakonsult AB, a KDAB Group company, info@kdab.com, author Marc Mutz <marc.mutz@kdab.com>
2// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
3
4#include "qstringtokenizer.h"
5#include "qstringalgorithms.h"
6
7QT_BEGIN_NAMESPACE
8
9/*!
10 \class QStringTokenizer
11 \inmodule QtCore
12 \since 6.0
13 \brief The QStringTokenizer class splits strings into tokens along given separators.
14 \reentrant
15 \ingroup tools
16 \ingroup string-processing
17
18 Splits a string into substrings wherever a given separator occurs,
19 returning a (lazily constructed) list of those strings. If the separator does
20 not match anywhere in the string, produces a single-element list
21 containing this string. If the separator is empty,
22 QStringTokenizer produces an empty string, followed by each of the
23 string's characters, followed by another empty string. The two
24 enumerations Qt::SplitBehavior and Qt::CaseSensitivity further
25 control the output.
26
27 QStringTokenizer drives QStringView::tokenize(), but you can use it
28 directly, too:
29
30 \code
31 for (auto token : QStringTokenizer{string, separator})
32 use(token);
33 \endcode
34
35 \note You should never name the template arguments of a
36 QStringTokenizer explicitly. You may write
37 \c{QStringTokenizer{string, separator}} (without template arguments),
38 or use either QStringView::tokenize() or QLatin1StringView::tokenize(),
39 then store the return value only in an \c{auto} variable:
40
41 \code
42 auto result = strview.tokenize(sep);
43 \endcode
44
45 This is because the template arguments of QStringTokenizer have a
46 very subtle dependency on the specific string and separator types
47 from which they are constructed, and they don't usually
48 correspond to the actual types passed.
49
50 \section1 Lazy Sequences
51
52 QStringTokenizer acts as a so-called lazy sequence, that is, each
53 next element is only computed once you ask for it. Lazy sequences
54 have the advantage that they only require O(1) memory. They have
55 the disadvantage that, at least for QStringTokenizer, they only
56 allow forward, not random-access, iteration.
57
58 The intended use-case is that you just plug it into a ranged for loop:
59
60 \code
61 for (auto token : QStringTokenizer{string, separator})
62 use(token);
63 \endcode
64
65 or a C++20 ranged algorithm:
66
67 \code
68 std::ranges::for_each(QStringTokenizer{string, separator},
69 [] (auto token) { use(token); });
70 \endcode
71
72 \section1 End Sentinel
73
74 The QStringTokenizer iterators cannot be used with classical STL
75 algorithms, because those require iterator/iterator pairs, while
76 QStringTokenizer uses sentinels. That is, it uses a different
77 type, QStringTokenizer::sentinel, to mark the end of the
78 range. This improves performance, because the sentinel is an empty
79 type. Sentinels are supported from C++17 (for ranged for)
80 and C++20 (for algorithms using the new ranges library).
81
82 \section1 Temporaries
83
84 QStringTokenizer is very carefully designed to avoid dangling
85 references. If you construct a tokenizer from a temporary string
86 (an rvalue), that argument is stored internally, so the referenced
87 data isn't deleted before it is tokenized:
88
89 \code
90 auto tok = QStringTokenizer{widget.text(), u','};
91 // return value of `widget.text()` is destroyed, but content was moved into `tok`
92 for (auto e : tok)
93 use(e);
94 \endcode
95
96 If you pass named objects (lvalues), then QStringTokenizer does
97 not store a copy. You are responsible for keeping the named object's
98 data around for longer than the tokenizer operates on it:
99
100 \code
101 auto text = widget.text();
102 auto tok = QStringTokenizer{text, u','};
103 text.clear(); // destroy content of `text`
104 for (auto e : tok) // ERROR: `tok` references deleted data!
105 use(e);
106 \endcode
107
108 \sa QStringView::split(), QString::split(), QRegularExpression
109*/
110
111/*!
112 \typealias QStringTokenizer::value_type
113
114 Alias for \c{const QStringView} or \c{const QLatin1StringView},
115 depending on the tokenizer's \c Haystack template argument.
116*/
117
118/*!
119 \typealias QStringTokenizer::difference_type
120
121 Alias for qsizetype.
122*/
123
124/*!
125 \typealias QStringTokenizer::size_type
126
127 Alias for qsizetype.
128*/
129
130/*!
131 \typealias QStringTokenizer::reference
132
133 Alias for \c{value_type &}.
134
135 QStringTokenizer does not support mutable references, so this is
136 the same as const_reference.
137*/
138
139/*!
140 \typealias QStringTokenizer::const_reference
141
142 Alias for \c{value_type &}.
143*/
144
145/*!
146 \typealias QStringTokenizer::pointer
147
148 Alias for \c{value_type *}.
149
150 QStringTokenizer does not support mutable iterators, so this is
151 the same as const_pointer.
152*/
153
154/*!
155 \typealias QStringTokenizer::const_pointer
156
157 Alias for \c{value_type *}.
158*/
159
160/*!
161 \typealias QStringTokenizer::iterator
162
163 This typedef provides an STL-style const iterator for
164 QStringTokenizer.
165
166 QStringTokenizer does not support mutable iterators, so this is
167 the same as const_iterator.
168
169 \sa const_iterator
170*/
171
172/*!
173 \typedef QStringTokenizer::const_iterator
174
175 This typedef provides an STL-style const iterator for
176 QStringTokenizer.
177
178 \sa iterator
179*/
180
181/*!
182 \typealias QStringTokenizer::sentinel
183
184 This typedef provides an STL-style sentinel for
185 QStringTokenizer::iterator and QStringTokenizer::const_iterator.
186
187 \sa const_iterator
188*/
189
190/*!
191 \fn template <typename Haystack, typename Needle> QStringTokenizer<Haystack, Needle>::QStringTokenizer(Haystack haystack, Needle needle, Qt::CaseSensitivity cs, Qt::SplitBehavior sb)
192 \fn template <typename Haystack, typename Needle> QStringTokenizer<Haystack, Needle>::QStringTokenizer(Haystack haystack, Needle needle, Qt::SplitBehavior sb, Qt::CaseSensitivity cs)
193
194 Constructs a string tokenizer that splits the string \a haystack
195 into substrings wherever \a needle occurs, and allows iteration
196 over those strings as they are found. If \a needle does not match
197 anywhere in \a haystack, a single element containing \a haystack
198 is produced.
199
200 \a cs specifies whether \a needle should be matched case
201 sensitively or case insensitively.
202
203 If \a sb is Qt::SkipEmptyParts, empty entries don't
204 appear in the result. By default, empty entries are included.
205
206 \sa QStringView::split(), QString::split(), Qt::CaseSensitivity, Qt::SplitBehavior
207*/
208
209/*!
210 \fn template <typename Haystack, typename Needle> QStringTokenizer<Haystack, Needle>::iterator QStringTokenizer<Haystack, Needle>::begin() const
211 \fn template <typename Haystack, typename Needle> QStringTokenizer<Haystack, Needle>::iterator QStringTokenizer<Haystack, Needle>::cbegin() const
212
213 Returns a const \l{STL-style iterators}{STL-style iterator}
214 pointing to the first token in the list.
215
216 \sa end(), cend()
217*/
218
219/*!
220 \fn template <typename Haystack, typename Needle> QStringTokenizer<Haystack, Needle>::sentinel QStringTokenizer<Haystack, Needle>::end() const
221
222 Returns a const \l{STL-style iterators}{STL-style sentinel}
223 pointing to the imaginary token after the last token in the list.
224
225 \sa begin(), cend()
226*/
227
228/*!
229 \fn template <typename Haystack, typename Needle> QStringTokenizer<Haystack, Needle>::sentinel QStringTokenizer<Haystack, Needle>::cend() const
230
231 Same as end().
232
233 \sa cbegin(), end()
234*/
235
236/*!
237 \fn template <typename Haystack, typename Needle> template<typename LContainer> LContainer QStringTokenizer<Haystack, Needle>::toContainer(LContainer &&c) const &
238
239 Converts the lazy sequence into a (typically) random-access container of
240 type \c LContainer.
241
242 This function is only available if \c LContainer has a \c value_type
243 matching this tokenizer's value_type.
244
245 If you pass in a named container (an lvalue) for \a c, then that container
246 is filled, and a reference to it is returned. If you pass in a temporary
247 container (an rvalue, incl. the default argument), then that container is
248 filled, and returned by value.
249
250 \code
251 // assuming tok's value_type is QStringView, then...
252 auto tok = QStringTokenizer{~~~};
253 // ... rac1 is a QList:
254 auto rac1 = tok.toContainer();
255 // ... rac2 is std::pmr::vector<QStringView>:
256 auto rac2 = tok.toContainer<std::pmr::vector<QStringView>>();
257 auto rac3 = QVarLengthArray<QStringView, 12>{};
258 // appends the token sequence produced by tok to rac3
259 // and returns a reference to rac3 (which we ignore here):
260 tok.toContainer(rac3);
261 \endcode
262
263 This gives you maximum flexibility in how you want the sequence to
264 be stored.
265*/
266
267/*!
268 \fn template <typename Haystack, typename Needle> template<typename RContainer> RContainer QStringTokenizer<Haystack, Needle>::toContainer(RContainer &&c) const &&
269 \overload
270
271 Converts the lazy sequence into a (typically) random-access container of
272 type \c RContainer.
273
274 In addition to the constraints on the lvalue-this overload, this
275 rvalue-this overload is only available when this QStringTokenizer
276 does not store the haystack internally, as the result would otherwise
277 be a container full of dangling references:
278
279 \code
280 auto tokens = QStringTokenizer{widget.text(), u','}.toContainer();
281 // ERROR: cannot call toContainer() on rvalue
282 // 'tokens' references the data of the copy of widget.text()
283 // stored inside the QStringTokenizer, which has since been deleted
284 \endcode
285
286 To fix, store the QStringTokenizer in a named variable (an lvalue):
287
288 \code
289 auto tokenizer = QStringTokenizer{widget.text(), u','};
290 auto tokens = tokenizer.toContainer();
291 // OK: the copy of widget.text() stored in 'tokenizer' keeps the data
292 // referenced by 'tokens' alive.
293 \endcode
294
295 You can force this function into existence by passing a view instead:
296
297 \code
298 func(QStringTokenizer{QStringView{widget.text()}, u','}.toContainer());
299 // OK: compiler keeps widget.text() around until after func() has executed
300 \endcode
301
302 If you pass in a named container (an lvalue) for \a c, then that container
303 is filled, and a reference to it is returned. If you pass in a temporary
304 container (an rvalue, incl. the default argument), then that container is
305 filled, and returned by value.
306*/
307
308/*!
309 \fn template <typename Haystack, typename Needle, typename...Flags> auto qTokenize(Haystack &&haystack, Needle &&needle, Flags...flags)
310 \relates QStringTokenizer
311 \since 6.0
312
313 Factory function for a QStringTokenizer that splits the string \a haystack
314 into substrings wherever \a needle occurs, and allows iteration
315 over those strings as they are found. If \a needle does not match
316 anywhere in \a haystack, a single element containing \a haystack
317 is produced.
318
319 Pass values from Qt::CaseSensitivity and Qt::SplitBehavior enumerators
320 as \a flags to modify the behavior of the tokenizer.
321*/
322
323QT_END_NAMESPACE
324

Provided by KDAB

Privacy Policy
Learn Advanced QML with KDAB
Find out more

source code of qtbase/src/corelib/text/qstringtokenizer.cpp