1// Copyright (C) 2020 Klarälvdalens Datakonsult AB, a KDAB Group company, info@kdab.com, author Marc Mutz <marc.mutz@kdab.com>
2// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
3
4#include "qstringtokenizer.h"
5#include "qstringalgorithms.h"
6
7QT_BEGIN_NAMESPACE
8
9/*!
10 \class QStringTokenizer
11 \inmodule QtCore
12 \since 6.0
13 \brief The QStringTokenizer class splits strings into tokens along given separators.
14 \reentrant
15 \ingroup tools
16 \ingroup string-processing
17
18 Splits a string into substrings wherever a given separator occurs,
19 returning a (lazily constructed) list of those strings. If the separator does
20 not match anywhere in the string, produces a single-element list
21 containing this string. If the separator is empty,
22 QStringTokenizer produces an empty string, followed by each of the
23 string's characters, followed by another empty string. The two
24 enumerations Qt::SplitBehavior and Qt::CaseSensitivity further
25 control the output.
26
27 QStringTokenizer drives QStringView::tokenize(), but, at least with a
28 recent compiler, you can use it directly, too:
29
30 \code
31 for (auto token : QStringTokenizer{string, separator})
32 use(token);
33 \endcode
34
35 \note You should never, ever, name the template arguments of a
36 QStringTokenizer explicitly. If you can use C++17 Class Template
37 Argument Deduction (CTAD), you may write
38 \c{QStringTokenizer{string, separator}} (without template
39 arguments). If you can't use C++17 CTAD, you must use the
40 QStringView::split() or QLatin1StringView::split() member functions
41 and store the return value only in \c{auto} variables:
42
43 \code
44 auto result = string.split(sep);
45 \endcode
46
47 This is because the template arguments of QStringTokenizer have a
48 very subtle dependency on the specific string and separator types
49 from which they are constructed, and they don't usually
50 correspond to the actual types passed.
51
52 \section1 Lazy Sequences
53
54 QStringTokenizer acts as a so-called lazy sequence, that is, each
55 next element is only computed once you ask for it. Lazy sequences
56 have the advantage that they only require O(1) memory. They have
57 the disadvantage that, at least for QStringTokenizer, they only
58 allow forward, not random-access, iteration.
59
60 The intended use-case is that you just plug it into a ranged for loop:
61
62 \code
63 for (auto token : QStringTokenizer{string, separator})
64 use(token);
65 \endcode
66
67 or a C++20 ranged algorithm:
68
69 \code
70 std::ranges::for_each(QStringTokenizer{string, separator},
71 [] (auto token) { use(token); });
72 \endcode
73
74 \section1 End Sentinel
75
76 The QStringTokenizer iterators cannot be used with classical STL
77 algorithms, because those require iterator/iterator pairs, while
78 QStringTokenizer uses sentinels. That is, it uses a different
79 type, QStringTokenizer::sentinel, to mark the end of the
80 range. This improves performance, because the sentinel is an empty
81 type. Sentinels are supported from C++17 (for ranged for)
82 and C++20 (for algorithms using the new ranges library).
83
84 \section1 Temporaries
85
86 QStringTokenizer is very carefully designed to avoid dangling
87 references. If you construct a tokenizer from a temporary string
88 (an rvalue), that argument is stored internally, so the referenced
89 data isn't deleted before it is tokenized:
90
91 \code
92 auto tok = QStringTokenizer{widget.text(), u','};
93 // return value of `widget.text()` is destroyed, but content was moved into `tok`
94 for (auto e : tok)
95 use(e);
96 \endcode
97
98 If you pass named objects (lvalues), then QStringTokenizer does
99 not store a copy. You are responsible for keeping the named object's
100 data around for longer than the tokenizer operates on it:
101
102 \code
103 auto text = widget.text();
104 auto tok = QStringTokenizer{text, u','};
105 text.clear(); // destroy content of `text`
106 for (auto e : tok) // ERROR: `tok` references deleted data!
107 use(e);
108 \endcode
109
110 \sa QStringView::split(), QString::split(), QRegularExpression
111*/
112
113/*!
114 \typealias QStringTokenizer::value_type
115
116 Alias for \c{const QStringView} or \c{const QLatin1StringView},
117 depending on the tokenizer's \c Haystack template argument.
118*/
119
120/*!
121 \typealias QStringTokenizer::difference_type
122
123 Alias for qsizetype.
124*/
125
126/*!
127 \typealias QStringTokenizer::size_type
128
129 Alias for qsizetype.
130*/
131
132/*!
133 \typealias QStringTokenizer::reference
134
135 Alias for \c{value_type &}.
136
137 QStringTokenizer does not support mutable references, so this is
138 the same as const_reference.
139*/
140
141/*!
142 \typealias QStringTokenizer::const_reference
143
144 Alias for \c{value_type &}.
145*/
146
147/*!
148 \typealias QStringTokenizer::pointer
149
150 Alias for \c{value_type *}.
151
152 QStringTokenizer does not support mutable iterators, so this is
153 the same as const_pointer.
154*/
155
156/*!
157 \typealias QStringTokenizer::const_pointer
158
159 Alias for \c{value_type *}.
160*/
161
162/*!
163 \typealias QStringTokenizer::iterator
164
165 This typedef provides an STL-style const iterator for
166 QStringTokenizer.
167
168 QStringTokenizer does not support mutable iterators, so this is
169 the same as const_iterator.
170
171 \sa const_iterator
172*/
173
174/*!
175 \typedef QStringTokenizer::const_iterator
176
177 This typedef provides an STL-style const iterator for
178 QStringTokenizer.
179
180 \sa iterator
181*/
182
183/*!
184 \typealias QStringTokenizer::sentinel
185
186 This typedef provides an STL-style sentinel for
187 QStringTokenizer::iterator and QStringTokenizer::const_iterator.
188
189 \sa const_iterator
190*/
191
192/*!
193 \fn template <typename Haystack, typename Needle> QStringTokenizer<Haystack, Needle>::QStringTokenizer(Haystack haystack, Needle needle, Qt::CaseSensitivity cs, Qt::SplitBehavior sb)
194 \fn template <typename Haystack, typename Needle> QStringTokenizer<Haystack, Needle>::QStringTokenizer(Haystack haystack, Needle needle, Qt::SplitBehavior sb, Qt::CaseSensitivity cs)
195
196 Constructs a string tokenizer that splits the string \a haystack
197 into substrings wherever \a needle occurs, and allows iteration
198 over those strings as they are found. If \a needle does not match
199 anywhere in \a haystack, a single element containing \a haystack
200 is produced.
201
202 \a cs specifies whether \a needle should be matched case
203 sensitively or case insensitively.
204
205 If \a sb is Qt::SkipEmptyParts, empty entries don't
206 appear in the result. By default, empty entries are included.
207
208 \sa QStringView::split(), QString::split(), Qt::CaseSensitivity, Qt::SplitBehavior
209*/
210
211/*!
212 \fn template <typename Haystack, typename Needle> QStringTokenizer<Haystack, Needle>::iterator QStringTokenizer<Haystack, Needle>::begin() const
213 \fn template <typename Haystack, typename Needle> QStringTokenizer<Haystack, Needle>::iterator QStringTokenizer<Haystack, Needle>::cbegin() const
214
215 Returns a const \l{STL-style iterators}{STL-style iterator}
216 pointing to the first token in the list.
217
218 \sa end(), cend()
219*/
220
221/*!
222 \fn template <typename Haystack, typename Needle> QStringTokenizer<Haystack, Needle>::sentinel QStringTokenizer<Haystack, Needle>::end() const
223
224 Returns a const \l{STL-style iterators}{STL-style sentinel}
225 pointing to the imaginary token after the last token in the list.
226
227 \sa begin(), cend()
228*/
229
230/*!
231 \fn template <typename Haystack, typename Needle> QStringTokenizer<Haystack, Needle>::sentinel QStringTokenizer<Haystack, Needle>::cend() const
232
233 Same as end().
234
235 \sa cbegin(), end()
236*/
237
238/*!
239 \fn template <typename Haystack, typename Needle> template<typename LContainer> LContainer QStringTokenizer<Haystack, Needle>::toContainer(LContainer &&c) const &
240
241 Converts the lazy sequence into a (typically) random-access container of
242 type \c LContainer.
243
244 This function is only available if \c LContainer has a \c value_type
245 matching this tokenizer's value_type.
246
247 If you pass in a named container (an lvalue) for \a c, then that container
248 is filled, and a reference to it is returned. If you pass in a temporary
249 container (an rvalue, incl. the default argument), then that container is
250 filled, and returned by value.
251
252 \code
253 // assuming tok's value_type is QStringView, then...
254 auto tok = QStringTokenizer{~~~};
255 // ... rac1 is a QList:
256 auto rac1 = tok.toContainer();
257 // ... rac2 is std::pmr::vector<QStringView>:
258 auto rac2 = tok.toContainer<std::pmr::vector<QStringView>>();
259 auto rac3 = QVarLengthArray<QStringView, 12>{};
260 // appends the token sequence produced by tok to rac3
261 // and returns a reference to rac3 (which we ignore here):
262 tok.toContainer(rac3);
263 \endcode
264
265 This gives you maximum flexibility in how you want the sequence to
266 be stored.
267*/
268
269/*!
270 \fn template <typename Haystack, typename Needle> template<typename RContainer> RContainer QStringTokenizer<Haystack, Needle>::toContainer(RContainer &&c) const &&
271 \overload
272
273 Converts the lazy sequence into a (typically) random-access container of
274 type \c RContainer.
275
276 In addition to the constraints on the lvalue-this overload, this
277 rvalue-this overload is only available when this QStringTokenizer
278 does not store the haystack internally, as this could create a
279 container full of dangling references:
280
281 \code
282 auto tokens = QStringTokenizer{widget.text(), u','}.toContainer();
283 // ERROR: cannot call toContainer() on rvalue
284 // 'tokens' references the data of the copy of widget.text()
285 // stored inside the QStringTokenizer, which has since been deleted
286 \endcode
287
288 To fix, store the QStringTokenizer in a named variable:
289
290 \code
291 auto tokenizer = QStringTokenizer{widget.text(), u','};
292 auto tokens = tokenizer.toContainer();
293 // OK: the copy of widget.text() stored in 'tokenizer' keeps the data
294 // referenced by 'tokens' alive.
295 \endcode
296
297 You can force this function into existence by passing a view instead:
298
299 \code
300 func(QStringTokenizer{QStringView{widget.text()}, u','}.toContainer());
301 // OK: compiler keeps widget.text() around until after func() has executed
302 \endcode
303
304 If you pass in a named container (an lvalue) for \a c, then that container
305 is filled, and a reference to it is returned. If you pass in a temporary
306 container (an rvalue, incl. the default argument), then that container is
307 filled, and returned by value.
308*/
309
310/*!
311 \fn template <typename Haystack, typename Needle, typename...Flags> auto qTokenize(Haystack &&haystack, Needle &&needle, Flags...flags)
312 \relates QStringTokenizer
313 \since 6.0
314
315 Factory function for a QStringTokenizer that splits the string \a haystack
316 into substrings wherever \a needle occurs, and allows iteration
317 over those strings as they are found. If \a needle does not match
318 anywhere in \a haystack, a single element containing \a haystack
319 is produced.
320
321 Pass values from Qt::CaseSensitivity and Qt::SplitBehavior enumerators
322 as \a flags to modify the behavior of the tokenizer.
323
324 You can use this function if your compiler doesn't, yet, support C++17 Class
325 Template Argument Deduction (CTAD). We recommend direct use of QStringTokenizer
326 with CTAD instead.
327*/
328
329QT_END_NAMESPACE
330

source code of qtbase/src/corelib/text/qstringtokenizer.cpp