1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4*******************************************************************************
5*
6* Copyright (C) 2002-2011 International Business Machines
7* Corporation and others. All Rights Reserved.
8*
9*******************************************************************************
10* file name: uiter.h
11* encoding: UTF-8
12* tab size: 8 (not used)
13* indentation:4
14*
15* created on: 2002jan18
16* created by: Markus W. Scherer
17*/
18
19#ifndef __UITER_H__
20#define __UITER_H__
21
22/**
23 * \file
24 * \brief C API: Unicode Character Iteration
25 *
26 * @see UCharIterator
27 */
28
29#include "unicode/utypes.h"
30
#if U_SHOW_CPLUSPLUS_API
/*
 * Forward declarations of the C++ classes that are used only as pointer
 * parameters by uiter_setCharacterIterator() and uiter_setReplaceable()
 * further down in this header.
 */
    U_NAMESPACE_BEGIN

    class CharacterIterator;
    class Replaceable;

    U_NAMESPACE_END
#endif
39
40U_CDECL_BEGIN
41
/*
 * Declare the structure tag early: the function pointer types below take a
 * UCharIterator pointer before the structure itself is defined.
 */
struct UCharIterator;
typedef struct UCharIterator UCharIterator; /**< C typedef for struct UCharIterator. @stable ICU 2.1 */
44
/**
 * Origin constants for UCharIterator.getIndex() and UCharIterator.move().
 *
 * Each constant names the reference index that getIndex() reports and
 * that move() applies its delta to.
 *
 * @see UCharIteratorGetIndex
 * @see UCharIteratorMove
 * @see UCharIterator
 * @stable ICU 2.1
 */
typedef enum UCharIteratorOrigin {
    UITER_START   = 0, /* the start of the iteration range */
    UITER_CURRENT = 1, /* the current position */
    UITER_LIMIT   = 2, /* the limit of the iteration range */
    UITER_ZERO    = 3, /* index 0; move(iter, index, UITER_ZERO) sets an absolute index */
    UITER_LENGTH  = 4  /* the length; may require counting UChars if the storage is not UTF-16 */
} UCharIteratorOrigin;
54
/** Constants for UCharIterator. @stable ICU 2.6 */
enum {
    /**
     * Constant value that may be returned by UCharIteratorMove
     * indicating that the final UTF-16 index is not known, but that the move succeeded.
     * This is different from U_SENTINEL, which UCharIteratorMove returns
     * on an error condition.
     *
     * This can occur when moving relative to limit or length, or
     * when moving relative to the current index after a setState()
     * when the current UTF-16 index is not known.
     *
     * It would be very inefficient to have to count from the beginning of the text
     * just to get the current/limit/length index after moving relative to it.
     * The actual index can be determined with getIndex(UITER_CURRENT)
     * which will count the UChars if necessary.
     *
     * @see UCharIteratorMove
     * @stable ICU 2.6
     */
    UITER_UNKNOWN_INDEX=-2
};
73
74
/**
 * Constant for UCharIterator getState() indicating an error or
 * an unknown state.
 * Returned by uiter_getState()/UCharIteratorGetState
 * when an error occurs; uiter_getState() also returns it when
 * the iterator's getState function pointer is NULL.
 * Also, some UCharIterator implementations may not be able to return
 * a valid state for each position. This will be clearly documented
 * for each such iterator (none of the public ones here).
 *
 * The value has all 32 bits set and is cast to uint32_t,
 * the type of the state word.
 *
 * @see uiter_getState
 * @see UCharIteratorGetState
 * @stable ICU 2.6
 */
#define UITER_NO_STATE ((uint32_t)0xffffffff)
87
/**
 * Function type declaration for UCharIterator.getIndex().
 *
 * Gets the index selected by origin: 0, the start or limit of the
 * iteration range, the length, or the current position.
 *
 * This function may perform slowly for UITER_CURRENT after setState() was called,
 * or for UITER_LENGTH, because an iterator implementation may have to count
 * UChars if the underlying storage is not UTF-16.
 *
 * @param iter the UCharIterator structure ("this pointer")
 * @param origin get the 0, start, limit, length, or current index
 * @return the requested index, or U_SENTINEL in an error condition
 *
 * @see UCharIteratorOrigin
 * @see UCharIterator
 * @stable ICU 2.1
 */
typedef int32_t U_CALLCONV
UCharIteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin);
108
/**
 * Function type declaration for UCharIterator.move().
 *
 * Use iter->move(iter, index, UITER_ZERO) like CharacterIterator::setIndex(index).
 *
 * Moves the current position relative to the index selected by origin:
 * 0, the start or limit of the iteration range, the length,
 * or the current position itself.
 * The movement is expressed in numbers of code units forward
 * or backward by specifying a positive or negative delta.
 * Out of bounds movement will be pinned to the start or limit.
 *
 * This function may perform slowly for moving relative to UITER_LENGTH
 * because an iterator implementation may have to count the rest of the
 * UChars if the native storage is not UTF-16.
 *
 * When moving relative to the limit or length, or
 * relative to the current position after setState() was called,
 * move() may return UITER_UNKNOWN_INDEX (-2) to avoid an inefficient
 * determination of the actual UTF-16 index.
 * In that case the move itself succeeded.
 * The actual index can be determined with getIndex(UITER_CURRENT)
 * which will count the UChars if necessary.
 * See UITER_UNKNOWN_INDEX for details.
 *
 * @param iter the UCharIterator structure ("this pointer")
 * @param delta can be positive, zero, or negative
 * @param origin move relative to the 0, start, limit, length, or current index
 * @return the new index, or U_SENTINEL on an error condition,
 *         or UITER_UNKNOWN_INDEX when the index is not known.
 *
 * @see UCharIteratorOrigin
 * @see UCharIterator
 * @see UITER_UNKNOWN_INDEX
 * @stable ICU 2.1
 */
typedef int32_t U_CALLCONV
UCharIteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin);
145
/**
 * Function type declaration for UCharIterator.hasNext().
 *
 * Check if current() and next() can still
 * return another code unit.
 * When they cannot, current() and next() return U_SENTINEL.
 *
 * @param iter the UCharIterator structure ("this pointer")
 * @return boolean value for whether current() and next() can still return another code unit
 *
 * @see UCharIteratorCurrent
 * @see UCharIteratorNext
 * @see UCharIterator
 * @stable ICU 2.1
 */
typedef UBool U_CALLCONV
UCharIteratorHasNext(UCharIterator *iter);
160
/**
 * Function type declaration for UCharIterator.hasPrevious().
 *
 * Check if previous() can still return another code unit.
 * When it cannot, previous() returns U_SENTINEL.
 *
 * @param iter the UCharIterator structure ("this pointer")
 * @return boolean value for whether previous() can still return another code unit
 *
 * @see UCharIteratorPrevious
 * @see UCharIterator
 * @stable ICU 2.1
 */
typedef UBool U_CALLCONV
UCharIteratorHasPrevious(UCharIterator *iter);
174
/**
 * Function type declaration for UCharIterator.current().
 *
 * Return the code unit at the current position,
 * or U_SENTINEL if there is none (index is at the limit).
 * The current index is not changed.
 *
 * @param iter the UCharIterator structure ("this pointer")
 * @return the current code unit, or U_SENTINEL if the index is at the limit
 *
 * @see UCharIterator
 * @stable ICU 2.1
 */
typedef UChar32 U_CALLCONV
UCharIteratorCurrent(UCharIterator *iter);
189
/**
 * Function type declaration for UCharIterator.next().
 *
 * Return the code unit at the current index and increment
 * the index (post-increment, like s[i++]),
 * or return U_SENTINEL if there is none (index is at the limit).
 *
 * @param iter the UCharIterator structure ("this pointer")
 * @return the current code unit (and post-increment the current index),
 *         or U_SENTINEL if the index is at the limit
 *
 * @see UCharIterator
 * @stable ICU 2.1
 */
typedef UChar32 U_CALLCONV
UCharIteratorNext(UCharIterator *iter);
205
/**
 * Function type declaration for UCharIterator.previous().
 *
 * Decrement the index and return the code unit from there
 * (pre-decrement, like s[--i]),
 * or return U_SENTINEL if there is none (index is at the start).
 *
 * @param iter the UCharIterator structure ("this pointer")
 * @return the previous code unit (after pre-decrementing the current index),
 *         or U_SENTINEL if the index is at the start
 *
 * @see UCharIterator
 * @stable ICU 2.1
 */
typedef UChar32 U_CALLCONV
UCharIteratorPrevious(UCharIterator *iter);
221
/**
 * Function type declaration for UCharIterator.reservedFn().
 * Reserved for future use; no meaning is defined yet for the
 * argument or the return value.
 *
 * @param iter the UCharIterator structure ("this pointer")
 * @param something some integer argument
 * @return some integer
 *
 * @see UCharIterator
 * @stable ICU 2.1
 */
typedef int32_t U_CALLCONV
UCharIteratorReserved(UCharIterator *iter, int32_t something);
235
/**
 * Function type declaration for UCharIterator.getState().
 *
 * Get the "state" of the iterator in the form of a single 32-bit word.
 * It is recommended that the state value be calculated to be as small as
 * is feasible. For strings with limited lengths, fewer than 32 bits may
 * be sufficient.
 *
 * This is used together with setState()/UCharIteratorSetState
 * to save and restore the iterator position more efficiently than with
 * getIndex()/move().
 *
 * The iterator state is defined as a uint32_t value because it is designed
 * for use in ucol_nextSortKeyPart() which provides 32 bits to store the state
 * of the character iterator.
 *
 * With some UCharIterator implementations (e.g., UTF-8),
 * getting and setting the UTF-16 index with existing functions
 * (getIndex(UITER_CURRENT) followed by move(pos, UITER_ZERO)) is possible but
 * relatively slow because the iterator has to "walk" from a known index
 * to the requested one.
 * This takes more time the farther it needs to go.
 *
 * An opaque state value allows an iterator implementation to provide
 * an internal index (UTF-8: the source byte array index) for
 * fast, constant-time restoration.
 *
 * After calling setState(), a getIndex(UITER_CURRENT) may be slow because
 * the UTF-16 index may not be restored as well, but the iterator can deliver
 * the correct text contents and move relative to the current position
 * without performance degradation.
 *
 * Some UCharIterator implementations may not be able to return
 * a valid state for each position, in which case they return UITER_NO_STATE instead.
 * This will be clearly documented for each such iterator (none of the public ones here).
 *
 * @param iter the UCharIterator structure ("this pointer")
 * @return the state word, or UITER_NO_STATE if an error occurs
 *         or no valid state is available for the current position
 *
 * @see UCharIterator
 * @see UCharIteratorSetState
 * @see UITER_NO_STATE
 * @stable ICU 2.6
 */
typedef uint32_t U_CALLCONV
UCharIteratorGetState(const UCharIterator *iter);
282
/**
 * Function type declaration for UCharIterator.setState().
 *
 * Restore the "state" of the iterator using a state word from a getState() call.
 * The iterator object need not be the same one as for which getState() was called,
 * but it must be of the same type (set up using the same uiter_setXYZ function)
 * and it must iterate over the same string
 * (binary identical regardless of memory address).
 * For more about the state word see UCharIteratorGetState.
 *
 * After calling setState(), a getIndex(UITER_CURRENT) may be slow because
 * the UTF-16 index may not be restored as well, but the iterator can deliver
 * the correct text contents and move relative to the current position
 * without performance degradation.
 * For the same reason, a subsequent move() relative to the current position
 * may return UITER_UNKNOWN_INDEX.
 *
 * @param iter the UCharIterator structure ("this pointer")
 * @param state the state word from a getState() call
 *              on a same-type, same-string iterator
 * @param pErrorCode Must be a valid pointer to an error code value,
 *                   which must not indicate a failure before the function call.
 *
 * @see UCharIterator
 * @see UCharIteratorGetState
 * @see UITER_UNKNOWN_INDEX
 * @stable ICU 2.6
 */
typedef void U_CALLCONV
UCharIteratorSetState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode);
310
311
312/**
313 * C API for code unit iteration.
314 * This can be used as a C wrapper around
315 * CharacterIterator, Replaceable, or implemented using simple strings, etc.
316 *
317 * There are two roles for using UCharIterator:
318 *
319 * A "provider" sets the necessary function pointers and controls the "protected"
320 * fields of the UCharIterator structure. A "provider" passes a UCharIterator
321 * into C APIs that need a UCharIterator as an abstract, flexible string interface.
322 *
323 * Implementations of such C APIs are "callers" of UCharIterator functions;
324 * they only use the "public" function pointers and never access the "protected"
325 * fields directly.
326 *
327 * The current() and next() functions only check the current index against the
328 * limit, and previous() only checks the current index against the start,
329 * to see if the iterator already reached the end of the iteration range.
330 *
331 * The assumption - in all iterators - is that the index is moved via the API,
332 * which means it won't go out of bounds, or the index is modified by
333 * user code that knows enough about the iterator implementation to set valid
334 * index values.
335 *
336 * UCharIterator functions return code unit values 0..0xffff,
337 * or U_SENTINEL if the iteration bounds are reached.
338 *
339 * @stable ICU 2.1
340 */
struct UCharIterator {
    /**
     * (protected) Pointer to string or wrapped object or similar.
     * Not used by caller.
     * @stable ICU 2.1
     */
    const void *context;

    /**
     * (protected) Length of string or similar.
     * Not used by caller.
     * @stable ICU 2.1
     */
    int32_t length;

    /**
     * (protected) Start index or similar.
     * The UTF-8 iterator (uiter_setUTF8) uses this field as the UTF-8 offset.
     * Not used by caller.
     * @stable ICU 2.1
     */
    int32_t start;

    /**
     * (protected) Current index or similar.
     * Not used by caller.
     * @stable ICU 2.1
     */
    int32_t index;

    /**
     * (protected) Limit index or similar.
     * The UTF-8 iterator (uiter_setUTF8) uses this field as the UTF-8 length.
     * Not used by caller.
     * @stable ICU 2.1
     */
    int32_t limit;

    /**
     * (protected) Used by UTF-8 iterators and possibly others.
     * In the UTF-8 iterator (uiter_setUTF8), a non-zero value is a
     * supplementary code point whose two surrogates the UTF-16 index
     * currently lies between.
     * @stable ICU 2.1
     */
    int32_t reservedField;

    /**
     * (public) Returns the current position or the
     * start or limit index of the iteration range.
     *
     * @see UCharIteratorGetIndex
     * @stable ICU 2.1
     */
    UCharIteratorGetIndex *getIndex;

    /**
     * (public) Moves the current position relative to the start or limit of the
     * iteration range, or relative to the current position itself.
     * The movement is expressed in numbers of code units forward
     * or backward by specifying a positive or negative delta.
     *
     * @see UCharIteratorMove
     * @stable ICU 2.1
     */
    UCharIteratorMove *move;

    /**
     * (public) Check if current() and next() can still
     * return another code unit.
     *
     * @see UCharIteratorHasNext
     * @stable ICU 2.1
     */
    UCharIteratorHasNext *hasNext;

    /**
     * (public) Check if previous() can still return another code unit.
     *
     * @see UCharIteratorHasPrevious
     * @stable ICU 2.1
     */
    UCharIteratorHasPrevious *hasPrevious;

    /**
     * (public) Return the code unit at the current position,
     * or U_SENTINEL if there is none (index is at the limit).
     *
     * @see UCharIteratorCurrent
     * @stable ICU 2.1
     */
    UCharIteratorCurrent *current;

    /**
     * (public) Return the code unit at the current index and increment
     * the index (post-increment, like s[i++]),
     * or return U_SENTINEL if there is none (index is at the limit).
     *
     * @see UCharIteratorNext
     * @stable ICU 2.1
     */
    UCharIteratorNext *next;

    /**
     * (public) Decrement the index and return the code unit from there
     * (pre-decrement, like s[--i]),
     * or return U_SENTINEL if there is none (index is at the start).
     *
     * @see UCharIteratorPrevious
     * @stable ICU 2.1
     */
    UCharIteratorPrevious *previous;

    /**
     * (public) Reserved for future use. Currently NULL.
     *
     * @see UCharIteratorReserved
     * @stable ICU 2.1
     */
    UCharIteratorReserved *reservedFn;

    /**
     * (public) Return the state of the iterator, to be restored later with setState().
     * This function pointer is NULL if the iterator does not implement it.
     *
     * @see UCharIteratorGetState
     * @stable ICU 2.6
     */
    UCharIteratorGetState *getState;

    /**
     * (public) Restore the iterator state from the state word from a call
     * to getState().
     * This function pointer is NULL if the iterator does not implement it.
     *
     * @see UCharIteratorSetState
     * @stable ICU 2.6
     */
    UCharIteratorSetState *setState;
};
476
/**
 * Helper function for UCharIterator to get the code point
 * at the current index.
 *
 * Return the code point that includes the code unit at the current position,
 * or U_SENTINEL if there is none (index is at the limit).
 * If the current code unit is a lead or trail surrogate,
 * then the following or preceding surrogate is used to form
 * the code point value.
 *
 * @param iter the UCharIterator structure ("this pointer")
 * @return the current code point, or U_SENTINEL if the index is at the limit
 *
 * @see UCharIterator
 * @see U16_GET
 * @see UnicodeString::char32At()
 * @stable ICU 2.1
 */
U_CAPI UChar32 U_EXPORT2
uiter_current32(UCharIterator *iter);
497
/**
 * Helper function for UCharIterator to get the next code point.
 *
 * Return the code point at the current index and increment
 * the index (post-increment, like s[i++]),
 * or return U_SENTINEL if there is none (index is at the limit).
 *
 * @param iter the UCharIterator structure ("this pointer")
 * @return the current code point (and post-increment the current index),
 *         or U_SENTINEL if the index is at the limit
 *
 * @see UCharIterator
 * @see U16_NEXT
 * @stable ICU 2.1
 */
U_CAPI UChar32 U_EXPORT2
uiter_next32(UCharIterator *iter);
514
/**
 * Helper function for UCharIterator to get the previous code point.
 *
 * Decrement the index and return the code point from there
 * (pre-decrement, like s[--i]),
 * or return U_SENTINEL if there is none (index is at the start).
 *
 * @param iter the UCharIterator structure ("this pointer")
 * @return the previous code point (after pre-decrementing the current index),
 *         or U_SENTINEL if the index is at the start
 *
 * @see UCharIterator
 * @see U16_PREV
 * @stable ICU 2.1
 */
U_CAPI UChar32 U_EXPORT2
uiter_previous32(UCharIterator *iter);
531
/**
 * Get the "state" of the iterator in the form of a single 32-bit word.
 * This is a convenience function that calls iter->getState(iter)
 * if iter->getState is not NULL;
 * if it is NULL or any other error occurs, then UITER_NO_STATE is returned.
 *
 * Some UCharIterator implementations may not be able to return
 * a valid state for each position, in which case they return UITER_NO_STATE instead.
 * This will be clearly documented for each such iterator (none of the public ones here).
 *
 * @param iter the UCharIterator structure ("this pointer")
 * @return the state word, or UITER_NO_STATE
 *
 * @see UCharIterator
 * @see UCharIteratorGetState
 * @see UITER_NO_STATE
 * @see uiter_setState
 * @stable ICU 2.6
 */
U_CAPI uint32_t U_EXPORT2
uiter_getState(const UCharIterator *iter);
552
/**
 * Restore the "state" of the iterator using a state word from a getState() call.
 * This is a convenience function that calls iter->setState(iter, state, pErrorCode)
 * if iter->setState is not NULL; if it is NULL, then U_UNSUPPORTED_ERROR is set.
 *
 * @param iter the UCharIterator structure ("this pointer")
 * @param state the state word from a getState() call
 *              on a same-type, same-string iterator
 * @param pErrorCode Must be a valid pointer to an error code value,
 *                   which must not indicate a failure before the function call.
 *                   Set to U_UNSUPPORTED_ERROR if iter->setState is NULL.
 *
 * @see UCharIterator
 * @see UCharIteratorSetState
 * @see uiter_getState
 * @stable ICU 2.6
 */
U_CAPI void U_EXPORT2
uiter_setState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode);
570
/**
 * Set up a UCharIterator to iterate over a string.
 *
 * Sets the UCharIterator function pointers for iteration over the string s
 * with iteration boundaries start=index=0 and length=limit=string length.
 * The "provider" may set the start, index, and limit values at any time
 * within the range 0..length.
 * The length field will be ignored.
 *
 * The string pointer s is set into UCharIterator.context without copying
 * or reallocating the string contents.
 *
 * getState() simply returns the current index.
 * move() will always return the final index.
 *
 * @param iter UCharIterator structure to be set for iteration
 * @param s String to iterate over
 * @param length Length of s (number of UChars), or -1 if NUL-terminated
 *
 * @see UCharIterator
 * @stable ICU 2.1
 */
U_CAPI void U_EXPORT2
uiter_setString(UCharIterator *iter, const UChar *s, int32_t length);
595
/**
 * Set up a UCharIterator to iterate over a UTF-16BE string
 * (byte vector with a big-endian pair of bytes per UChar).
 *
 * Everything works just like with a normal UChar iterator (uiter_setString),
 * except that UChars are assembled from byte pairs,
 * and that the length argument here indicates an even number of bytes.
 *
 * getState() simply returns the current index.
 * move() will always return the final index.
 *
 * @param iter UCharIterator structure to be set for iteration
 * @param s UTF-16BE string to iterate over
 * @param length Length of s as an even number of bytes, or -1 if NUL-terminated
 *               (NUL means a pair of 0 bytes at an even index from s)
 *
 * @see UCharIterator
 * @see uiter_setString
 * @stable ICU 2.6
 */
U_CAPI void U_EXPORT2
uiter_setUTF16BE(UCharIterator *iter, const char *s, int32_t length);
618
/**
 * Set up a UCharIterator to iterate over a UTF-8 string.
 *
 * Sets the UCharIterator function pointers for iteration over the UTF-8 string s
 * with UTF-8 iteration boundaries 0 and length.
 * The implementation counts the UTF-16 index on the fly and
 * lazily evaluates the UTF-16 length of the text.
 *
 * The start field is used as the UTF-8 offset, the limit field as the UTF-8 length.
 * When the reservedField is not 0, then it contains a supplementary code point
 * and the UTF-16 index is between the two corresponding surrogates.
 * At that point, the UTF-8 index is behind that code point.
 *
 * The UTF-8 string pointer s is set into UCharIterator.context without copying
 * or reallocating the string contents.
 *
 * getState() returns a state value consisting of
 * - the current UTF-8 source byte index (bits 31..1)
 * - a flag (bit 0) that indicates whether the UChar position is in the middle
 *   of a surrogate pair
 *   (from a 4-byte UTF-8 sequence for the corresponding supplementary code point)
 *
 * getState() cannot also encode the UTF-16 index in the state value.
 * move(relative to limit or length), or
 * move(relative to current) after setState(), may return UITER_UNKNOWN_INDEX.
 *
 * @param iter UCharIterator structure to be set for iteration
 * @param s UTF-8 string to iterate over
 * @param length Length of s in bytes, or -1 if NUL-terminated
 *
 * @see UCharIterator
 * @see UITER_UNKNOWN_INDEX
 * @stable ICU 2.6
 */
U_CAPI void U_EXPORT2
uiter_setUTF8(UCharIterator *iter, const char *s, int32_t length);
654
655#if U_SHOW_CPLUSPLUS_API
656
/**
 * Set up a UCharIterator to wrap around a C++ CharacterIterator.
 * This function is declared only when U_SHOW_CPLUSPLUS_API is enabled.
 *
 * Sets the UCharIterator function pointers for iteration using the
 * CharacterIterator charIter.
 *
 * The CharacterIterator pointer charIter is set into UCharIterator.context
 * without copying or cloning the CharacterIterator object.
 * The other "protected" UCharIterator fields are set to 0 and will be ignored.
 * The iteration index and boundaries are controlled by the CharacterIterator.
 *
 * getState() simply returns the current index.
 * move() will always return the final index.
 *
 * @param iter UCharIterator structure to be set for iteration
 * @param charIter CharacterIterator to wrap
 *
 * @see UCharIterator
 * @stable ICU 2.1
 */
U_CAPI void U_EXPORT2
uiter_setCharacterIterator(UCharIterator *iter, icu::CharacterIterator *charIter);
679
/**
 * Set up a UCharIterator to iterate over a C++ Replaceable.
 * This function is declared only when U_SHOW_CPLUSPLUS_API is enabled.
 *
 * Sets the UCharIterator function pointers for iteration over the
 * Replaceable rep with iteration boundaries start=index=0 and
 * length=limit=rep->length().
 * The "provider" may set the start, index, and limit values at any time
 * within the range 0..length=rep->length().
 * The length field will be ignored.
 *
 * The Replaceable pointer rep is set into UCharIterator.context without copying
 * or cloning/reallocating the Replaceable object.
 *
 * getState() simply returns the current index.
 * move() will always return the final index.
 *
 * @param iter UCharIterator structure to be set for iteration
 * @param rep Replaceable to iterate over
 *
 * @see UCharIterator
 * @stable ICU 2.1
 */
U_CAPI void U_EXPORT2
uiter_setReplaceable(UCharIterator *iter, const icu::Replaceable *rep);
704
705#endif
706
707U_CDECL_END
708
709#endif
710

source code of include/unicode/uiter.h