1 | // Copyright (C) 2020 Klarälvdalens Datakonsult AB, a KDAB Group company, info@kdab.com, author Marc Mutz <marc.mutz@kdab.com> |
2 | // SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only |
3 | |
4 | #include "qstringtokenizer.h" |
5 | #include "qstringalgorithms.h" |
6 | |
7 | QT_BEGIN_NAMESPACE |
8 | |
9 | /*! |
10 | \class QStringTokenizer |
11 | \inmodule QtCore |
12 | \since 6.0 |
13 | \brief The QStringTokenizer class splits strings into tokens along given separators. |
14 | \reentrant |
15 | \ingroup tools |
16 | \ingroup string-processing |
17 | |
18 | Splits a string into substrings wherever a given separator occurs, |
19 | returning a (lazily constructed) list of those strings. If the separator does |
20 | not match anywhere in the string, produces a single-element list |
21 | containing this string. If the separator is empty, |
22 | QStringTokenizer produces an empty string, followed by each of the |
23 | string's characters, followed by another empty string. The two |
24 | enumerations Qt::SplitBehavior and Qt::CaseSensitivity further |
25 | control the output. |
26 | |
27 | QStringTokenizer drives QStringView::tokenize(), but you can use it |
28 | directly, too: |
29 | |
30 | \code |
31 | for (auto token : QStringTokenizer{string, separator}) |
32 | use(token); |
33 | \endcode |
34 | |
35 | \note You should never name the template arguments of a |
36 | QStringTokenizer explicitly. You may write |
37 | \c{QStringTokenizer{string, separator}} (without template arguments), |
38 | or use either QStringView::tokenize() or QLatin1StringView::tokenize(), |
39 | then store the return value only in an \c{auto} variable: |
40 | |
41 | \code |
42 | auto result = strview.tokenize(sep); |
43 | \endcode |
44 | |
45 | This is because the template arguments of QStringTokenizer have a |
46 | very subtle dependency on the specific string and separator types |
47 | from which they are constructed, and they don't usually |
48 | correspond to the actual types passed. |
49 | |
50 | \section1 Lazy Sequences |
51 | |
52 | QStringTokenizer acts as a so-called lazy sequence, that is, each |
53 | next element is only computed once you ask for it. Lazy sequences |
54 | have the advantage that they only require O(1) memory. They have |
55 | the disadvantage that, at least for QStringTokenizer, they only |
56 | allow forward, not random-access, iteration. |
57 | |
58 | The intended use-case is that you just plug it into a ranged for loop: |
59 | |
60 | \code |
61 | for (auto token : QStringTokenizer{string, separator}) |
62 | use(token); |
63 | \endcode |
64 | |
65 | or a C++20 ranged algorithm: |
66 | |
67 | \code |
68 | std::ranges::for_each(QStringTokenizer{string, separator}, |
69 | [] (auto token) { use(token); }); |
70 | \endcode |
71 | |
72 | \section1 End Sentinel |
73 | |
74 | The QStringTokenizer iterators cannot be used with classical STL |
75 | algorithms, because those require iterator/iterator pairs, while |
76 | QStringTokenizer uses sentinels. That is, it uses a different |
77 | type, QStringTokenizer::sentinel, to mark the end of the |
78 | range. This improves performance, because the sentinel is an empty |
79 | type. Sentinels are supported from C++17 (for ranged for) |
80 | and C++20 (for algorithms using the new ranges library). |
81 | |
82 | \section1 Temporaries |
83 | |
84 | QStringTokenizer is very carefully designed to avoid dangling |
85 | references. If you construct a tokenizer from a temporary string |
86 | (an rvalue), that argument is stored internally, so the referenced |
87 | data isn't deleted before it is tokenized: |
88 | |
89 | \code |
90 | auto tok = QStringTokenizer{widget.text(), u','}; |
91 | // return value of `widget.text()` is destroyed, but content was moved into `tok` |
92 | for (auto e : tok) |
93 | use(e); |
94 | \endcode |
95 | |
96 | If you pass named objects (lvalues), then QStringTokenizer does |
97 | not store a copy. You are responsible for keeping the named object's |
98 | data around for longer than the tokenizer operates on it: |
99 | |
100 | \code |
101 | auto text = widget.text(); |
102 | auto tok = QStringTokenizer{text, u','}; |
103 | text.clear(); // destroy content of `text` |
104 | for (auto e : tok) // ERROR: `tok` references deleted data! |
105 | use(e); |
106 | \endcode |
107 | |
108 | \sa QStringView::split(), QString::split(), QRegularExpression |
109 | */ |
110 | |
111 | /*! |
112 | \typealias QStringTokenizer::value_type |
113 | |
114 | Alias for \c{const QStringView} or \c{const QLatin1StringView}, |
115 | depending on the tokenizer's \c Haystack template argument. |
116 | */ |
117 | |
118 | /*! |
119 | \typealias QStringTokenizer::difference_type |
120 | |
121 | Alias for qsizetype. |
122 | */ |
123 | |
124 | /*! |
125 | \typealias QStringTokenizer::size_type |
126 | |
127 | Alias for qsizetype. |
128 | */ |
129 | |
130 | /*! |
131 | \typealias QStringTokenizer::reference |
132 | |
133 | Alias for \c{value_type &}. |
134 | |
135 | QStringTokenizer does not support mutable references, so this is |
136 | the same as const_reference. |
137 | */ |
138 | |
139 | /*! |
140 | \typealias QStringTokenizer::const_reference |
141 | |
142 | Alias for \c{value_type &}. |
143 | */ |
144 | |
145 | /*! |
146 | \typealias QStringTokenizer::pointer |
147 | |
148 | Alias for \c{value_type *}. |
149 | |
150 | QStringTokenizer does not support mutable iterators, so this is |
151 | the same as const_pointer. |
152 | */ |
153 | |
154 | /*! |
155 | \typealias QStringTokenizer::const_pointer |
156 | |
157 | Alias for \c{value_type *}. |
158 | */ |
159 | |
160 | /*! |
161 | \typealias QStringTokenizer::iterator |
162 | |
163 | This typedef provides an STL-style const iterator for |
164 | QStringTokenizer. |
165 | |
166 | QStringTokenizer does not support mutable iterators, so this is |
167 | the same as const_iterator. |
168 | |
169 | \sa const_iterator |
170 | */ |
171 | |
172 | /*! |
173 | \typealias QStringTokenizer::const_iterator |
174 | |
175 | This typedef provides an STL-style const iterator for |
176 | QStringTokenizer. |
177 | |
178 | \sa iterator |
179 | */ |
180 | |
181 | /*! |
182 | \typealias QStringTokenizer::sentinel |
183 | |
184 | This typedef provides an STL-style sentinel for |
185 | QStringTokenizer::iterator and QStringTokenizer::const_iterator. |
186 | |
187 | \sa const_iterator |
188 | */ |
189 | |
190 | /*! |
191 | \fn template <typename Haystack, typename Needle> QStringTokenizer<Haystack, Needle>::QStringTokenizer(Haystack haystack, Needle needle, Qt::CaseSensitivity cs, Qt::SplitBehavior sb) |
192 | \fn template <typename Haystack, typename Needle> QStringTokenizer<Haystack, Needle>::QStringTokenizer(Haystack haystack, Needle needle, Qt::SplitBehavior sb, Qt::CaseSensitivity cs) |
193 | |
194 | Constructs a string tokenizer that splits the string \a haystack |
195 | into substrings wherever \a needle occurs, and allows iteration |
196 | over those strings as they are found. If \a needle does not match |
197 | anywhere in \a haystack, a single element containing \a haystack |
198 | is produced. |
199 | |
200 | \a cs specifies whether \a needle should be matched case |
201 | sensitively or case insensitively. |
202 | |
203 | If \a sb is Qt::SkipEmptyParts, empty entries don't |
204 | appear in the result. By default, empty entries are included. |
205 | |
206 | \sa QStringView::split(), QString::split(), Qt::CaseSensitivity, Qt::SplitBehavior |
207 | */ |
208 | |
209 | /*! |
210 | \fn template <typename Haystack, typename Needle> QStringTokenizer<Haystack, Needle>::iterator QStringTokenizer<Haystack, Needle>::begin() const |
211 | \fn template <typename Haystack, typename Needle> QStringTokenizer<Haystack, Needle>::iterator QStringTokenizer<Haystack, Needle>::cbegin() const |
212 | |
213 | Returns a const \l{STL-style iterators}{STL-style iterator} |
214 | pointing to the first token in the list. |
215 | |
216 | \sa end(), cend() |
217 | */ |
218 | |
219 | /*! |
220 | \fn template <typename Haystack, typename Needle> QStringTokenizer<Haystack, Needle>::sentinel QStringTokenizer<Haystack, Needle>::end() const |
221 | |
222 | Returns a const \l{STL-style iterators}{STL-style sentinel} |
223 | pointing to the imaginary token after the last token in the list. |
224 | |
225 | \sa begin(), cend() |
226 | */ |
227 | |
228 | /*! |
229 | \fn template <typename Haystack, typename Needle> QStringTokenizer<Haystack, Needle>::sentinel QStringTokenizer<Haystack, Needle>::cend() const |
230 | |
231 | Same as end(). |
232 | |
233 | \sa cbegin(), end() |
234 | */ |
235 | |
236 | /*! |
237 | \fn template <typename Haystack, typename Needle> template<typename LContainer> LContainer QStringTokenizer<Haystack, Needle>::toContainer(LContainer &&c) const & |
238 | |
239 | Converts the lazy sequence into a (typically) random-access container of |
240 | type \c LContainer. |
241 | |
242 | This function is only available if \c LContainer has a \c value_type |
243 | matching this tokenizer's value_type. |
244 | |
245 | If you pass in a named container (an lvalue) for \a c, then that container |
246 | is filled, and a reference to it is returned. If you pass in a temporary |
247 | container (an rvalue, incl. the default argument), then that container is |
248 | filled, and returned by value. |
249 | |
250 | \code |
251 | // assuming tok's value_type is QStringView, then... |
252 | auto tok = QStringTokenizer{~~~}; |
253 | // ... rac1 is a QList: |
254 | auto rac1 = tok.toContainer(); |
255 | // ... rac2 is std::pmr::vector<QStringView>: |
256 | auto rac2 = tok.toContainer<std::pmr::vector<QStringView>>(); |
257 | auto rac3 = QVarLengthArray<QStringView, 12>{}; |
258 | // appends the token sequence produced by tok to rac3 |
259 | // and returns a reference to rac3 (which we ignore here): |
260 | tok.toContainer(rac3); |
261 | \endcode |
262 | |
263 | This gives you maximum flexibility in how you want the sequence to |
264 | be stored. |
265 | */ |
266 | |
267 | /*! |
268 | \fn template <typename Haystack, typename Needle> template<typename RContainer> RContainer QStringTokenizer<Haystack, Needle>::toContainer(RContainer &&c) const && |
269 | \overload |
270 | |
271 | Converts the lazy sequence into a (typically) random-access container of |
272 | type \c RContainer. |
273 | |
274 | In addition to the constraints on the lvalue-this overload, this |
275 | rvalue-this overload is only available when this QStringTokenizer |
276 | does not store the haystack internally, as the result would otherwise |
277 | be a container full of dangling references: |
278 | |
279 | \code |
280 | auto tokens = QStringTokenizer{widget.text(), u','}.toContainer(); |
281 | // ERROR: cannot call toContainer() on rvalue |
282 | // 'tokens' would reference the data of the copy of widget.text() |
283 | // stored inside the QStringTokenizer, which has already been destroyed |
284 | \endcode |
285 | |
286 | To fix, store the QStringTokenizer in a named variable: |
287 | |
288 | \code |
289 | auto tokenizer = QStringTokenizer{widget.text(), u','}; |
290 | auto tokens = tokenizer.toContainer(); |
291 | // OK: the copy of widget.text() stored in 'tokenizer' keeps the data |
292 | // referenced by 'tokens' alive. |
293 | \endcode |
294 | |
295 | You can force this function into existence by passing a view instead: |
296 | |
297 | \code |
298 | func(QStringTokenizer{QStringView{widget.text()}, u','}.toContainer()); |
299 | // OK: compiler keeps widget.text() around until after func() has executed |
300 | \endcode |
301 | |
302 | If you pass in a named container (an lvalue) for \a c, then that container |
303 | is filled, and a reference to it is returned. If you pass in a temporary |
304 | container (an rvalue, incl. the default argument), then that container is |
305 | filled, and returned by value. |
306 | */ |
307 | |
308 | /*! |
309 | \fn template <typename Haystack, typename Needle, typename...Flags> auto qTokenize(Haystack &&haystack, Needle &&needle, Flags...flags) |
310 | \relates QStringTokenizer |
311 | \since 6.0 |
312 | |
313 | Factory function for a QStringTokenizer that splits the string \a haystack |
314 | into substrings wherever \a needle occurs, and allows iteration |
315 | over those strings as they are found. If \a needle does not match |
316 | anywhere in \a haystack, a single element containing \a haystack |
317 | is produced. |
318 | |
319 | Pass values from Qt::CaseSensitivity and Qt::SplitBehavior enumerators |
320 | as \a flags to modify the behavior of the tokenizer. |
321 | */ |
322 | |
323 | QT_END_NAMESPACE |
324 | |