| 1 | // © 2016 and later: Unicode, Inc. and others. | 
| 2 | // License & terms of use: http://www.unicode.org/copyright.html | 
| 3 | /* | 
| 4 | ******************************************************************************* | 
| 5 | * | 
| 6 | *   Copyright (C) 1999-2015, International Business Machines | 
| 7 | *   Corporation and others.  All Rights Reserved. | 
| 8 | * | 
| 9 | ******************************************************************************* | 
| 10 | *   file name:  utf8.h | 
| 11 | *   encoding:   UTF-8 | 
| 12 | *   tab size:   8 (not used) | 
| 13 | *   indentation:4 | 
| 14 | * | 
| 15 | *   created on: 1999sep13 | 
| 16 | *   created by: Markus W. Scherer | 
| 17 | */ | 
| 18 |  | 
| 19 | /** | 
| 20 |  * \file | 
| 21 |  * \brief C API: 8-bit Unicode handling macros | 
| 22 |  *  | 
| 23 |  * This file defines macros to deal with 8-bit Unicode (UTF-8) code units (bytes) and strings. | 
| 24 |  * | 
| 25 |  * For more information see utf.h and the ICU User Guide Strings chapter | 
| 26 |  * (https://unicode-org.github.io/icu/userguide/strings). | 
| 27 |  * | 
| 28 |  * <em>Usage:</em> | 
 * ICU coding guidelines for if() statements should be followed when using these macros.
 * Compound statements (curly braces {}) must be used for if-else-while...
 * bodies, and all macro statements should be terminated with a semicolon.
| 32 |  */ | 
| 33 |  | 
| 34 | #ifndef __UTF8_H__ | 
| 35 | #define __UTF8_H__ | 
| 36 |  | 
| 37 | #include <stdbool.h> | 
| 38 | #include "unicode/umachine.h" | 
| 39 | #ifndef __UTF_H__ | 
| 40 | #   include "unicode/utf.h" | 
| 41 | #endif | 
| 42 |  | 
| 43 | /* internal definitions ----------------------------------------------------- */ | 
| 44 |  | 
/**
 * Number of trail bytes that follow a given UTF-8 lead byte.
 * Evaluates to 1..3 for a byte accepted by U8_IS_LEAD(), and to 0 for every
 * other byte value, i.e. for 0..0xc1 as well as for 0xf5..0xff.
 * leadByte might be evaluated multiple times.
 *
 * This is internal since it is not meant to be called directly by external clients;
 * however it is called by public macros in this file and thus must remain stable.
 *
 * @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff.
 * @internal
 */
#define U8_COUNT_TRAIL_BYTES(leadByte) \
    (!U8_IS_LEAD(leadByte) ? 0 : \
        1+(0xe0<=(uint8_t)(leadByte))+(0xf0<=(uint8_t)(leadByte)))
| 59 |  | 
/**
 * Number of trail bytes for the lead byte of a valid UTF-8 sequence.
 * The result is the sum of three threshold tests (>=0xc2, >=0xe0, >=0xf0),
 * so it is 0 for 0..0xc1 and 1..3 for valid lead bytes.
 * Undefined for 0xf5..0xff.
 * leadByte might be evaluated multiple times.
 *
 * This is internal since it is not meant to be called directly by external clients;
 * however it is called by public macros in this file and thus must remain stable.
 *
 * @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff.
 * @internal
 */
#define U8_COUNT_TRAIL_BYTES_UNSAFE(leadByte) \
    ((0xc2<=(uint8_t)(leadByte))+ \
     (0xe0<=(uint8_t)(leadByte))+ \
     (0xf0<=(uint8_t)(leadByte)))
| 73 |  | 
/**
 * Strip the length-marker bits from a UTF-8 lead byte, in place.
 * Only the lowest (6-countTrailBytes) bits, which carry part of the code
 * point value, are kept. leadByte must be a modifiable lvalue; the value of
 * the expression is the masked byte.
 *
 * This is internal since it is not meant to be called directly by external clients;
 * however it is called by public macros in this file and thus must remain stable.
 * @internal
 */
#define U8_MASK_LEAD_BYTE(leadByte, countTrailBytes) \
    ((leadByte)&=((1<<(6-(countTrailBytes)))-1))
| 82 |  | 
/**
 * Internal bit vector for 3-byte UTF-8 validity check, for use in U8_IS_VALID_LEAD3_AND_T1.
 * Each bit indicates whether one lead byte + first trail byte pair starts a valid sequence.
 * Lead byte E0..EF bits 3..0 are used as byte index,
 * first trail byte bits 7..5 are used as bit index into that byte.
 *
 * Reading the 16 table bytes below:
 * - E0:      0x20 (bit 5)     -> first trail byte must be A0..BF
 *            (E0 80..9F would assemble to a value below U+0800)
 * - E1..EC:  0x30 (bits 4, 5) -> first trail byte 80..BF
 * - ED:      0x10 (bit 4)     -> first trail byte must be 80..9F
 *            (ED A0..BF would assemble to U+D800..U+DFFF)
 * - EE..EF:  0x30 (bits 4, 5) -> first trail byte 80..BF
 *
 * @see U8_IS_VALID_LEAD3_AND_T1
 * @internal
 */
#define U8_LEAD3_T1_BITS "\x20\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x10\x30\x30"
| 92 |  | 
/**
 * Internal 3-byte UTF-8 validity check.
 * Looks up the U8_LEAD3_T1_BITS byte selected by the low 4 bits of the lead
 * byte and tests the bit selected by the top 3 bits of the first trail byte.
 * Non-zero if lead byte E0..EF and first trail byte 00..FF start a valid sequence.
 * @internal
 */
#define U8_IS_VALID_LEAD3_AND_T1(lead, t1) \
    ((1<<((uint8_t)(t1)>>5))&U8_LEAD3_T1_BITS[(lead)&0xf])
| 99 |  | 
/**
 * Internal bit vector for 4-byte UTF-8 validity check, for use in U8_IS_VALID_LEAD4_AND_T1.
 * Each bit indicates whether one lead byte + first trail byte pair starts a valid sequence.
 * First trail byte bits 7..4 are used as byte index,
 * lead byte F0..F4 bits 2..0 are used as bit index into that byte.
 *
 * Reading the 16 table bytes below:
 * - first trail byte 00..7F: 0x00              -> never valid
 * - first trail byte 80..8F: 0x1E (bits 1..4)  -> valid after lead bytes F1..F4
 * - first trail byte 90..BF: 0x0F (bits 0..3)  -> valid after lead bytes F0..F3
 * - first trail byte C0..FF: 0x00              -> never valid
 *
 * Seen per lead byte: F0 accepts 90..BF, F1..F3 accept 80..BF, F4 accepts 80..8F.
 *
 * @see U8_IS_VALID_LEAD4_AND_T1
 * @internal
 */
#define U8_LEAD4_T1_BITS "\x00\x00\x00\x00\x00\x00\x00\x00\x1E\x0F\x0F\x0F\x00\x00\x00\x00"
| 109 |  | 
/**
 * Internal 4-byte UTF-8 validity check.
 * Looks up the U8_LEAD4_T1_BITS byte selected by the top 4 bits of the first
 * trail byte and tests the bit selected by the low 3 bits of the lead byte.
 * Non-zero if lead byte F0..F4 and first trail byte 00..FF start a valid sequence.
 * @internal
 */
#define U8_IS_VALID_LEAD4_AND_T1(lead, t1) \
    ((1<<((lead)&7))&U8_LEAD4_T1_BITS[(uint8_t)(t1)>>4])
| 116 |  | 
/**
 * Function for handling "next code point" with error-checking.
 *
 * This is internal since it is not meant to be called directly by external clients;
 * however it is called by public macros in this
 * file and thus must remain stable, and should not be hidden when other internal
 * functions are hidden (otherwise public macros would fail to compile).
 *
 * NOTE(review): only the prototype is visible in this header. The parameter
 * notes below are inferred from names and types; confirm against the
 * implementation before relying on them.
 *
 * @param s      presumably the UTF-8 string being read
 * @param pi     presumably an in/out pointer to the current string offset
 * @param length presumably the string length
 * @param c      presumably the lead byte that was already read
 * @param strict presumably selects the strictness of the validation
 * @return a UChar32; how errors are signalled is not visible from this header
 * @internal
 */
U_CAPI UChar32 U_EXPORT2
utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, UBool strict);
| 128 |  | 
/**
 * Function for handling "append code point" with error-checking.
 *
 * This is internal since it is not meant to be called directly by external clients;
 * however it is called by public macros in this
 * file and thus must remain stable, and should not be hidden when other internal
 * functions are hidden (otherwise public macros would fail to compile).
 *
 * NOTE(review): only the prototype is visible in this header. The parameter
 * notes below are inferred from names and types; confirm against the
 * implementation before relying on them.
 *
 * @param s        presumably the output buffer that is written to
 * @param i        presumably the offset at which to write
 * @param length   presumably the capacity of the buffer
 * @param c        presumably the code point to append
 * @param pIsError presumably an optional output flag set on failure
 * @return an int32_t, presumably the offset after the appended bytes
 * @internal
 */
U_CAPI int32_t U_EXPORT2
utf8_appendCharSafeBody(uint8_t *s, int32_t i, int32_t length, UChar32 c, UBool *pIsError);
| 140 |  | 
/**
 * Function for handling "previous code point" with error-checking.
 *
 * This is internal since it is not meant to be called directly by external clients;
 * however it is called by public macros in this
 * file and thus must remain stable, and should not be hidden when other internal
 * functions are hidden (otherwise public macros would fail to compile).
 *
 * NOTE(review): only the prototype is visible in this header. The parameter
 * notes below are inferred from names and types; confirm against the
 * implementation before relying on them.
 *
 * @param s      presumably the UTF-8 string being read
 * @param start  presumably the lowest offset that may be read
 * @param pi     presumably an in/out pointer to the current string offset
 * @param c      presumably the byte that was already read
 * @param strict presumably selects the strictness of the validation
 * @return a UChar32; how errors are signalled is not visible from this header
 * @internal
 */
U_CAPI UChar32 U_EXPORT2
utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, UBool strict);
| 152 |  | 
/**
 * Function for handling "skip backward one code point" with error-checking.
 *
 * This is internal since it is not meant to be called directly by external clients;
 * however it is called by public macros in this
 * file and thus must remain stable, and should not be hidden when other internal
 * functions are hidden (otherwise public macros would fail to compile).
 *
 * U8_SET_CP_START() calls this as utf8_back1SafeBody(s, start, i) when s[i]
 * is a trail byte, and stores the return value back into its offset i.
 *
 * @param s     the string, as passed to U8_SET_CP_START()
 * @param start the starting string offset, as passed to U8_SET_CP_START()
 * @param i     the current string offset, as passed to U8_SET_CP_START()
 * @return the offset that U8_SET_CP_START() adopts as the new value of i
 * @internal
 */
U_CAPI int32_t U_EXPORT2
utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
| 164 |  | 
| 165 | /* single-code point definitions -------------------------------------------- */ | 
| 166 |  | 
/**
 * Is this code unit (byte) a complete code point on its own?
 * True exactly when bit 7 is clear, i.e. for US-ASCII 0..0x7f.
 * @param c 8-bit code unit (byte)
 * @return true or false
 * @stable ICU 2.4
 */
#define U8_IS_SINGLE(c) (!((c)&0x80))
| 174 |  | 
/**
 * Is this code unit (byte) a UTF-8 lead byte? (0xC2..0xF4)
 * Implemented as a single unsigned range check: after subtracting 0xc2 and
 * truncating to 8 bits, every byte below 0xc2 wraps around to a large value.
 * @param c 8-bit code unit (byte)
 * @return true or false
 * @stable ICU 2.4
 */
#define U8_IS_LEAD(c) ((uint8_t)((c)-0xc2)<=(0xf4-0xc2))
| 183 |  | 
/**
 * Is this code unit (byte) a UTF-8 trail byte? (0x80..0xBF)
 * Reinterpreted as a signed 8-bit value, these bytes are exactly -0x80..-0x41.
 * @param c 8-bit code unit (byte)
 * @return true or false
 * @stable ICU 2.4
 */
#define U8_IS_TRAIL(c) ((int8_t)(c)<=-0x41)
| 191 |  | 
/**
 * How many code units (bytes) are used for the UTF-8 encoding
 * of this Unicode code point?
 * The argument is compared as an unsigned 32-bit value, so negative inputs
 * fall into the "not a code point" case. c might be evaluated multiple times.
 * @param c 32-bit code point
 * @return 1..4, or 0 if c is a surrogate or not a Unicode code point
 * @stable ICU 2.4
 */
#define U8_LENGTH(c) \
    ((uint32_t)(c)<=0x7f ? 1 : \
     (uint32_t)(c)<=0x7ff ? 2 : \
     (uint32_t)(c)<=0xd7ff ? 3 : \
     ((uint32_t)(c)<=0xdfff || (uint32_t)(c)>0x10ffff) ? 0 : \
     (uint32_t)(c)<=0xffff ? 3 : 4)
| 209 |  | 
/**
 * The maximum number of UTF-8 code units (bytes) per Unicode code point (U+0000..U+10ffff).
 * This matches the largest value that U8_LENGTH() can return.
 * @return 4
 * @stable ICU 2.4
 */
#define U8_MAX_LENGTH 4
| 216 |  | 
/**
 * Random-access read of one code point, leaving the caller's offset unchanged.
 * The offset may point to either the lead byte or one of the trail bytes
 * for a code point, in which case the macro will read all of the bytes
 * for the code point.
 * The result is undefined if the offset points to an illegal UTF-8
 * byte sequence.
 * Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @param c output UChar32 variable
 * @see U8_GET
 * @stable ICU 2.4
 */
#define U8_GET_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
    /* work on a private copy so that i itself is never modified */ \
    int32_t _u8_get_unsafe_pos=(int32_t)(i); \
    U8_SET_CP_START_UNSAFE(s, _u8_get_unsafe_pos); \
    U8_NEXT_UNSAFE(s, _u8_get_unsafe_pos, c); \
} UPRV_BLOCK_MACRO_END
| 238 |  | 
/**
 * Random-access read of one code point, leaving the caller's offset unchanged.
 * The offset may point to either the lead byte or one of the trail bytes
 * for a code point, in which case the macro will read all of the bytes
 * for the code point.
 *
 * The length can be negative for a NUL-terminated string.
 *
 * If the offset points to an illegal UTF-8 byte sequence, then
 * c is set to a negative value.
 * Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT.
 *
 * @param s const uint8_t * string
 * @param start int32_t starting string offset
 * @param i int32_t string offset, must be start<=i<length
 * @param length int32_t string length
 * @param c output UChar32 variable, set to <0 in case of an error
 * @see U8_GET_UNSAFE
 * @stable ICU 2.4
 */
#define U8_GET(s, start, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \
    /* work on a private copy so that i itself is never modified */ \
    int32_t _u8_get_pos=(i); \
    U8_SET_CP_START(s, start, _u8_get_pos); \
    U8_NEXT(s, _u8_get_pos, length, c); \
} UPRV_BLOCK_MACRO_END
| 265 |  | 
/**
 * Random-access read of one code point, leaving the caller's offset unchanged.
 * The offset may point to either the lead byte or one of the trail bytes
 * for a code point, in which case the macro will read all of the bytes
 * for the code point.
 *
 * The length can be negative for a NUL-terminated string.
 *
 * If the offset points to an illegal UTF-8 byte sequence, then
 * c is set to U+FFFD.
 * Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT_OR_FFFD.
 *
 * This macro does not distinguish between a real U+FFFD in the text
 * and U+FFFD returned for an ill-formed sequence.
 * Use U8_GET() if that distinction is important.
 *
 * @param s const uint8_t * string
 * @param start int32_t starting string offset
 * @param i int32_t string offset, must be start<=i<length
 * @param length int32_t string length
 * @param c output UChar32 variable, set to U+FFFD in case of an error
 * @see U8_GET
 * @stable ICU 51
 */
#define U8_GET_OR_FFFD(s, start, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \
    /* work on a private copy so that i itself is never modified */ \
    int32_t _u8_get_pos=(i); \
    U8_SET_CP_START(s, start, _u8_get_pos); \
    U8_NEXT_OR_FFFD(s, _u8_get_pos, length, c); \
} UPRV_BLOCK_MACRO_END
| 296 |  | 
| 297 | /* definitions with forward iteration --------------------------------------- */ | 
| 298 |  | 
/**
 * Read the code point that starts at offset i and move i to the next
 * code point boundary.
 * (Post-incrementing forward iteration.)
 * "Unsafe" macro, assumes well-formed UTF-8.
 *
 * The offset may point to the lead byte of a multi-byte sequence,
 * in which case the macro will read the whole sequence.
 * The result is undefined if the offset points to a trail byte
 * or an illegal UTF-8 sequence.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @param c output UChar32 variable
 * @see U8_NEXT
 * @stable ICU 2.4
 */
#define U8_NEXT_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
    (c)=(uint8_t)(s)[(i)++]; \
    if(U8_IS_SINGLE(c)) { \
        /* ASCII: the byte already is the code point */ \
    } else if((c)<0xe0) { \
        /* 2 bytes: 5 value bits from the lead byte, 6 from the trail byte */ \
        (c)=(((c)&0x1f)<<6)|((s)[i]&0x3f); \
        ++(i); \
    } else if((c)<0xf0) { \
        /* 3 bytes: lead byte is E0..EF here, so its low 4 bits are the top of a 16-bit value */ \
        (c)=(((c)&0xf)<<12)|(((s)[i]&0x3f)<<6)|((s)[(i)+1]&0x3f); \
        (i)+=2; \
    } else { \
        /* 4 bytes: 3 value bits from the lead byte, 6 from each trail byte */ \
        (c)=(((c)&7)<<18)|(((s)[i]&0x3f)<<12)|(((s)[(i)+1]&0x3f)<<6)|((s)[(i)+2]&0x3f); \
        (i)+=3; \
    } \
} UPRV_BLOCK_MACRO_END
| 331 |  | 
/**
 * Get a code point from a string at a code point boundary offset,
 * and advance the offset to the next code point boundary.
 * (Post-incrementing forward iteration.)
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 *
 * The length can be negative for a NUL-terminated string.
 *
 * The offset may point to the lead byte of a multi-byte sequence,
 * in which case the macro will read the whole sequence.
 * If the offset points to a trail byte or an illegal UTF-8 sequence, then
 * c is set to a negative value.
 *
 * Implemented by U8_INTERNAL_NEXT_OR_SUB with U_SENTINEL as the value
 * substituted for an ill-formed sequence.
 *
 * @param s const uint8_t * string
 * @param i int32_t string offset, must be i<length
 * @param length int32_t string length
 * @param c output UChar32 variable, set to <0 in case of an error
 * @see U8_NEXT_UNSAFE
 * @stable ICU 2.4
 */
#define U8_NEXT(s, i, length, c) U8_INTERNAL_NEXT_OR_SUB(s, i, length, c, U_SENTINEL)
| 353 |  | 
/**
 * Get a code point from a string at a code point boundary offset,
 * and advance the offset to the next code point boundary.
 * (Post-incrementing forward iteration.)
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 *
 * The length can be negative for a NUL-terminated string.
 *
 * The offset may point to the lead byte of a multi-byte sequence,
 * in which case the macro will read the whole sequence.
 * If the offset points to a trail byte or an illegal UTF-8 sequence, then
 * c is set to U+FFFD.
 *
 * This macro does not distinguish between a real U+FFFD in the text
 * and U+FFFD returned for an ill-formed sequence.
 * Use U8_NEXT() if that distinction is important.
 *
 * Implemented by U8_INTERNAL_NEXT_OR_SUB with 0xfffd as the value
 * substituted for an ill-formed sequence.
 *
 * @param s const uint8_t * string
 * @param i int32_t string offset, must be i<length
 * @param length int32_t string length
 * @param c output UChar32 variable, set to U+FFFD in case of an error
 * @see U8_NEXT
 * @stable ICU 51
 */
#define U8_NEXT_OR_FFFD(s, i, length, c) U8_INTERNAL_NEXT_OR_SUB(s, i, length, c, 0xfffd)
| 379 |  | 
/**
 * Shared implementation of U8_NEXT() and U8_NEXT_OR_FFFD().
 *
 * Reads s[i] into c and post-increments i. If that byte is not a single-byte
 * (ASCII) code point, the remaining bytes are fetched, validated and folded
 * into c by one chained condition. Each accepted trail byte advances i, and
 * (i)!=(length) is tested before every further read, so a negative length
 * never stops the scan. Lead bytes are accepted only as C2..DF (2 bytes),
 * E0..EF (3 bytes, first trail byte checked via U8_LEAD3_T1_BITS) and
 * F0..F4 (4 bytes, first trail byte checked via U8_LEAD4_T1_BITS).
 *
 * If any check fails, c is set to sub and i is left behind the lead byte and
 * whatever trail bytes had already been accepted.
 *
 * s, i, length and c are evaluated multiple times.
 *
 * @param s const uint8_t * string
 * @param i int32_t string offset, advanced by the macro
 * @param length int32_t string length
 * @param c output UChar32 variable
 * @param sub value assigned to c for an ill-formed sequence
 * @internal
 */
#define U8_INTERNAL_NEXT_OR_SUB(s, i, length, c, sub) UPRV_BLOCK_MACRO_BEGIN { \
    (c)=(uint8_t)(s)[(i)++]; \
    if(!U8_IS_SINGLE(c)) { \
        uint8_t __t = 0; \
        if((i)!=(length) && \
            /* fetch/validate/assemble all but last trail byte */ \
            ((c)>=0xe0 ? \
                ((c)<0xf0 ?  /* U+0800..U+FFFF except surrogates */ \
                    U8_LEAD3_T1_BITS[(c)&=0xf]&(1<<((__t=(s)[i])>>5)) && \
                    (__t&=0x3f, 1) \
                :  /* U+10000..U+10FFFF */ \
                    ((c)-=0xf0)<=4 && \
                    U8_LEAD4_T1_BITS[(__t=(s)[i])>>4]&(1<<(c)) && \
                    ((c)=((c)<<6)|(__t&0x3f), ++(i)!=(length)) && \
                    (__t=(s)[i]-0x80)<=0x3f) && \
                /* valid second-to-last trail byte */ \
                ((c)=((c)<<6)|__t, ++(i)!=(length)) \
            :  /* U+0080..U+07FF */ \
                (c)>=0xc2 && ((c)&=0x1f, 1)) && \
            /* last trail byte */ \
            (__t=(s)[i]-0x80)<=0x3f && \
            ((c)=((c)<<6)|__t, ++(i), 1)) { \
        } else { \
            (c)=(sub);  /* ill-formed */ \
        } \
    } \
} UPRV_BLOCK_MACRO_END
| 408 |  | 
/**
 * Write the UTF-8 encoding of a code point at offset i, overwriting 1 to 4
 * bytes, and advance i behind the bytes written (post-increment).
 * The offset points to the current end of the string contents.
 * "Unsafe" macro, assumes a valid code point and sufficient space in the string.
 * Otherwise, the result is undefined.
 *
 * @param s uint8_t * string buffer (written to)
 * @param i string offset
 * @param c code point to append
 * @see U8_APPEND
 * @stable ICU 2.4
 */
#define U8_APPEND_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
    uint32_t __cp=(c); \
    if(__cp<=0x7f) { \
        (s)[(i)++]=(uint8_t)__cp; \
    } else if(__cp<=0x7ff) { \
        (s)[(i)++]=(uint8_t)(0xc0|(__cp>>6)); \
        (s)[(i)++]=(uint8_t)(0x80|(__cp&0x3f)); \
    } else if(__cp<=0xffff) { \
        (s)[(i)++]=(uint8_t)(0xe0|(__cp>>12)); \
        (s)[(i)++]=(uint8_t)(0x80|((__cp>>6)&0x3f)); \
        (s)[(i)++]=(uint8_t)(0x80|(__cp&0x3f)); \
    } else { \
        (s)[(i)++]=(uint8_t)(0xf0|(__cp>>18)); \
        (s)[(i)++]=(uint8_t)(0x80|((__cp>>12)&0x3f)); \
        (s)[(i)++]=(uint8_t)(0x80|((__cp>>6)&0x3f)); \
        (s)[(i)++]=(uint8_t)(0x80|(__cp&0x3f)); \
    } \
} UPRV_BLOCK_MACRO_END
| 441 |  | 
/**
 * Write the UTF-8 encoding of a code point at offset i, overwriting 1 to 4
 * bytes, and advance i behind the bytes written (post-increment).
 * The offset points to the current end of the string contents.
 * "Safe" macro, checks for a valid code point.
 * If a non-ASCII code point is written, checks for sufficient space in the string.
 * If the code point is not valid or trail bytes do not fit,
 * then isError is set to true; nothing is written and i is not changed in that case.
 *
 * @param s uint8_t * string buffer (written to)
 * @param i int32_t string offset, must be i<capacity
 * @param capacity int32_t size of the string buffer
 * @param c UChar32 code point to append
 * @param isError output UBool set to true if an error occurs, otherwise not modified
 * @see U8_APPEND_UNSAFE
 * @stable ICU 2.4
 */
#define U8_APPEND(s, i, capacity, c, isError) UPRV_BLOCK_MACRO_BEGIN { \
    uint32_t __cp=(c); \
    if(__cp<=0x7f) { \
        /* ASCII: relies on the documented precondition i<capacity */ \
        (s)[(i)++]=(uint8_t)__cp; \
    } else if(__cp<=0x7ff && (i)+1<(capacity)) { \
        (s)[(i)++]=(uint8_t)(0xc0|(__cp>>6)); \
        (s)[(i)++]=(uint8_t)(0x80|(__cp&0x3f)); \
    } else if(__cp<=0xffff && (__cp<0xd800 || 0xdfff<__cp) && (i)+2<(capacity)) { \
        /* BMP code point that is not a surrogate */ \
        (s)[(i)++]=(uint8_t)(0xe0|(__cp>>12)); \
        (s)[(i)++]=(uint8_t)(0x80|((__cp>>6)&0x3f)); \
        (s)[(i)++]=(uint8_t)(0x80|(__cp&0x3f)); \
    } else if(__cp>=0x10000 && __cp<=0x10ffff && (i)+3<(capacity)) { \
        (s)[(i)++]=(uint8_t)(0xf0|(__cp>>18)); \
        (s)[(i)++]=(uint8_t)(0x80|((__cp>>12)&0x3f)); \
        (s)[(i)++]=(uint8_t)(0x80|((__cp>>6)&0x3f)); \
        (s)[(i)++]=(uint8_t)(0x80|(__cp&0x3f)); \
    } else { \
        /* surrogate, out of range, or not enough room for the trail bytes */ \
        (isError)=true; \
    } \
} UPRV_BLOCK_MACRO_END
| 479 |  | 
/**
 * Skip over one code point: move the string offset from one code point
 * boundary to the next.
 * (Post-incrementing iteration.)
 * "Unsafe" macro, assumes well-formed UTF-8.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @see U8_FWD_1
 * @stable ICU 2.4
 */
#define U8_FWD_1_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
    /* the byte at the boundary alone determines the sequence length */ \
    uint8_t __lead=(s)[i]; \
    (i)+=1+U8_COUNT_TRAIL_BYTES_UNSAFE(__lead); \
} UPRV_BLOCK_MACRO_END
| 493 |  | 
/**
 * Skip over one code point: move the string offset from one code point
 * boundary to the next.
 * (Post-incrementing iteration.)
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 * The offset always advances past the first byte; trail bytes are skipped
 * only as long as they continue a valid sequence and lie before length.
 *
 * The length can be negative for a NUL-terminated string.
 *
 * @param s const uint8_t * string
 * @param i int32_t string offset, must be i<length
 * @param length int32_t string length
 * @see U8_FWD_1_UNSAFE
 * @stable ICU 2.4
 */
#define U8_FWD_1(s, i, length) UPRV_BLOCK_MACRO_BEGIN { \
    uint8_t __lead=(s)[(i)++]; \
    if(U8_IS_LEAD(__lead) && (i)!=(length)) { \
        uint8_t __next=(s)[i]; \
        if(__lead<0xe0) { \
            /* 2-byte sequence: lead byte C2..DF plus any trail byte */ \
            if(U8_IS_TRAIL(__next)) { \
                ++(i); \
            } \
        } else if(__lead<0xf0) { \
            /* 3-byte sequence: lead byte E0..EF */ \
            if(U8_IS_VALID_LEAD3_AND_T1(__lead, __next) && \
                    ++(i)!=(length) && U8_IS_TRAIL((s)[i])) { \
                ++(i); \
            } \
        } else { \
            /* 4-byte sequence: lead byte F0..F4 */ \
            if(U8_IS_VALID_LEAD4_AND_T1(__lead, __next) && \
                    ++(i)!=(length) && U8_IS_TRAIL((s)[i]) && \
                    ++(i)!=(length) && U8_IS_TRAIL((s)[i])) { \
                ++(i); \
            } \
        } \
    } \
} UPRV_BLOCK_MACRO_END
| 529 |  | 
/**
 * Skip over n code points: move the string offset from one code point
 * boundary to the n-th next one. Nothing happens if n is zero or negative.
 * (Post-incrementing iteration.)
 * "Unsafe" macro, assumes well-formed UTF-8.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @param n number of code points to skip
 * @see U8_FWD_N
 * @stable ICU 2.4
 */
#define U8_FWD_N_UNSAFE(s, i, n) UPRV_BLOCK_MACRO_BEGIN { \
    int32_t __left; \
    for(__left=(n); __left>0; --__left) { \
        U8_FWD_1_UNSAFE(s, i); \
    } \
} UPRV_BLOCK_MACRO_END
| 549 |  | 
/**
 * Skip over n code points: move the string offset from one code point
 * boundary to the n-th next one. Stops early at the end of the string:
 * at i>=length, or at a NUL byte when length is negative.
 * (Post-incrementing iteration.)
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 *
 * The length can be negative for a NUL-terminated string.
 *
 * @param s const uint8_t * string
 * @param i int32_t string offset, must be i<length
 * @param length int32_t string length
 * @param n number of code points to skip
 * @see U8_FWD_N_UNSAFE
 * @stable ICU 2.4
 */
#define U8_FWD_N(s, i, length, n) UPRV_BLOCK_MACRO_BEGIN { \
    int32_t __left; \
    for(__left=(n); \
            __left>0 && ((i)<(length) || ((length)<0 && (s)[i]!=0)); \
            --__left) { \
        U8_FWD_1(s, i, length); \
    } \
} UPRV_BLOCK_MACRO_END
| 572 |  | 
/**
 * Move a random-access offset back to the start of the code point
 * that it points into.
 * If the offset points to a UTF-8 trail byte,
 * then the offset is moved backward to the corresponding lead byte.
 * Otherwise, it is not modified.
 * "Unsafe" macro, assumes well-formed UTF-8.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @see U8_SET_CP_START
 * @stable ICU 2.4
 */
#define U8_SET_CP_START_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
    /* step backward for as long as the current byte is a trail byte */ \
    for(; U8_IS_TRAIL((s)[i]); --(i)) {} \
} UPRV_BLOCK_MACRO_END
| 589 |  | 
/**
 * Move a random-access offset back to the start of the code point
 * that it points into.
 * If the offset points to a UTF-8 trail byte,
 * then the offset is moved backward to the corresponding lead byte.
 * Otherwise, it is not modified.
 *
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 * Unlike U8_TRUNCATE_IF_INCOMPLETE(), this macro always reads s[i].
 *
 * @param s const uint8_t * string
 * @param start int32_t starting string offset (usually 0)
 * @param i int32_t string offset, must be start<=i
 * @see U8_SET_CP_START_UNSAFE
 * @see U8_TRUNCATE_IF_INCOMPLETE
 * @stable ICU 2.4
 */
#define U8_SET_CP_START(s, start, i) UPRV_BLOCK_MACRO_BEGIN { \
    if(!U8_IS_TRAIL((s)[(i)])) { \
        /* not a trail byte: the offset stays where it is */ \
    } else { \
        (i)=utf8_back1SafeBody(s, start, (i)); \
    } \
} UPRV_BLOCK_MACRO_END
| 612 |  | 
/**
 * If the string ends with a UTF-8 byte sequence that is valid so far
 * but incomplete, then reduce the length of the string to end before
 * the lead byte of that incomplete sequence.
 * For example, if the string ends with E1 80, the length is reduced by 2.
 *
 * In all other cases (the string ends with a complete sequence, or it is not
 * possible for any further trail byte to extend the trailing sequence)
 * the length remains unchanged.
 *
 * Useful for processing text split across multiple buffers
 * (save the incomplete sequence for later)
 * and for optimizing iteration
 * (check for string length only once per character).
 *
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 * Unlike U8_SET_CP_START(), this macro never reads s[length].
 * At most the last three bytes before length are examined.
 *
 * The s argument is parenthesized at every use, so it may be an arbitrary
 * pointer expression such as buffer+offset.
 *
 * (In UTF-16, simply check for U16_IS_LEAD(last code unit).)
 *
 * @param s const uint8_t * string
 * @param start int32_t starting string offset (usually 0)
 * @param length int32_t string length (usually start<=length);
 *               must be a modifiable lvalue, reduced by 1..3 when truncating
 * @see U8_SET_CP_START
 * @stable ICU 61
 */
#define U8_TRUNCATE_IF_INCOMPLETE(s, start, length) UPRV_BLOCK_MACRO_BEGIN { \
    if((length)>(start)) { \
        uint8_t __b1=(s)[(length)-1]; \
        if(U8_IS_SINGLE(__b1)) { \
            /* common ASCII character */ \
        } else if(U8_IS_LEAD(__b1)) { \
            /* a lone lead byte at the end always lacks its trail bytes */ \
            --(length); \
        } else if(U8_IS_TRAIL(__b1) && ((length)-2)>=(start)) { \
            uint8_t __b2=(s)[(length)-2]; \
            if(0xe0<=__b2 && __b2<=0xf4) { \
                /* 3- or 4-byte lead followed by only one trail byte so far */ \
                if(__b2<0xf0 ? U8_IS_VALID_LEAD3_AND_T1(__b2, __b1) : \
                        U8_IS_VALID_LEAD4_AND_T1(__b2, __b1)) { \
                    (length)-=2; \
                } \
            } else if(U8_IS_TRAIL(__b2) && ((length)-3)>=(start)) { \
                uint8_t __b3=(s)[(length)-3]; \
                /* 4-byte lead followed by only two trail bytes so far */ \
                if(0xf0<=__b3 && __b3<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(__b3, __b2)) { \
                    (length)-=3; \
                } \
            } \
        } \
    } \
} UPRV_BLOCK_MACRO_END
| 662 |  | 
| 663 | /* definitions with backward iteration -------------------------------------- */ | 
| 664 |  | 
/**
 * Step back over one code point: move the string offset from one code point
 * boundary to the previous one and get the code point between them.
 * (Pre-decrementing backward iteration.)
 * "Unsafe" macro, assumes well-formed UTF-8.
 *
 * The input offset may be the same as the string length.
 * If the offset is behind a multi-byte sequence, then the macro will read
 * the whole sequence.
 * If the offset is behind a lead byte, then that itself
 * will be returned as the code point.
 * The result is undefined if the offset is behind an illegal UTF-8 sequence.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @param c output UChar32 variable
 * @see U8_PREV
 * @stable ICU 2.4
 */
#define U8_PREV_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
    (c)=(uint8_t)(s)[--(i)]; \
    if(U8_IS_TRAIL(c)) { \
        uint8_t __unit, __trails=1, __bits=6; \
\
        /* c is the last trail byte: it supplies the lowest 6 bits */ \
        (c)&=0x3f; \
        /* collect further trail bytes, 6 bits each, walking backward */ \
        while((__unit=(s)[--(i)])<0xc0) { \
            (c)|=(UChar32)(__unit&0x3f)<<__bits; \
            ++__trails; \
            __bits+=6; \
        } \
        /* __unit is the lead byte: strip its length marker, add the top bits */ \
        U8_MASK_LEAD_BYTE(__unit, __trails); \
        (c)|=(UChar32)__unit<<__bits; \
    } \
} UPRV_BLOCK_MACRO_END
| 705 |  | 
/**
 * Move the string offset from one code point boundary to the previous one
 * and get the code point between them.
 * (Pre-decrementing backward iteration.)
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 *
 * The input offset may be the same as the string length.
 * If the byte before the offset is a single-byte code point (U8_IS_SINGLE),
 * then it is returned directly.
 * Any other byte, including a lead byte directly before the offset,
 * is handed to utf8_prevCharSafeBody() together with start, which reads
 * further back as needed and produces the result.
 * If the offset is behind an illegal UTF-8 sequence, then c is set to a negative value.
 *
 * @param s const uint8_t * string
 * @param start int32_t starting string offset (usually 0)
 * @param i int32_t string offset, must be start<i
 * @param c output UChar32 variable, set to <0 in case of an error
 * @see U8_PREV_UNSAFE
 * @stable ICU 2.4
 */
#define U8_PREV(s, start, i, c) UPRV_BLOCK_MACRO_BEGIN { \
    (c)=(uint8_t)(s)[--(i)]; \
    if(!U8_IS_SINGLE(c)) { \
        /* (s) is parenthesized so that the cast covers the whole argument expression */ \
        (c)=utf8_prevCharSafeBody((const uint8_t *)(s), start, &(i), c, -1); \
    } \
} UPRV_BLOCK_MACRO_END
| 732 |  | 
/**
 * Move the string offset from one code point boundary to the previous one
 * and get the code point between them.
 * (Pre-decrementing backward iteration.)
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 *
 * The input offset may be the same as the string length.
 * If the byte before the offset is a single-byte code point (U8_IS_SINGLE),
 * then it is returned directly.
 * Any other byte, including a lead byte directly before the offset,
 * is handed to utf8_prevCharSafeBody() together with start, which reads
 * further back as needed and produces the result.
 * If the offset is behind an illegal UTF-8 sequence, then c is set to U+FFFD.
 *
 * This macro does not distinguish between a real U+FFFD in the text
 * and U+FFFD returned for an ill-formed sequence.
 * Use U8_PREV() if that distinction is important.
 *
 * @param s const uint8_t * string
 * @param start int32_t starting string offset (usually 0)
 * @param i int32_t string offset, must be start<i
 * @param c output UChar32 variable, set to U+FFFD in case of an error
 * @see U8_PREV
 * @stable ICU 51
 */
#define U8_PREV_OR_FFFD(s, start, i, c) UPRV_BLOCK_MACRO_BEGIN { \
    (c)=(uint8_t)(s)[--(i)]; \
    if(!U8_IS_SINGLE(c)) { \
        /* (s) is parenthesized so that the cast covers the whole argument expression */ \
        (c)=utf8_prevCharSafeBody((const uint8_t *)(s), start, &(i), c, -3); \
    } \
} UPRV_BLOCK_MACRO_END
| 763 |  | 
/**
 * Step the string offset backward over one code point, from one
 * code point boundary to the previous one.
 * (Pre-decrementing backward iteration.)
 * The input offset may be the same as the string length.
 * "Unsafe" macro, assumes well-formed UTF-8.
 *
 * The offset is always decremented at least once and then keeps moving
 * back for as long as the byte at the new offset is a trail byte.
 * There is no lower bound check.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @see U8_BACK_1
 * @stable ICU 2.4
 */
#define U8_BACK_1_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
    do { \
        --(i); \
    } while(U8_IS_TRAIL((s)[i])); \
} UPRV_BLOCK_MACRO_END
| 778 |  | 
/**
 * Move the string offset from one code point boundary to the previous one.
 * (Pre-decrementing backward iteration.)
 * The input offset may be the same as the string length.
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 *
 * The offset is first decremented by one byte. If the byte found there is
 * a trail byte, then utf8_back1SafeBody() is called with start and the
 * decremented offset, and its return value becomes the new offset.
 *
 * @param s const uint8_t * string
 * @param start int32_t starting string offset (usually 0)
 * @param i int32_t string offset, must be start<i
 * @see U8_BACK_1_UNSAFE
 * @stable ICU 2.4
 */
#define U8_BACK_1(s, start, i) UPRV_BLOCK_MACRO_BEGIN { \
    if(U8_IS_TRAIL((s)[--(i)])) { \
        /* same parenthesized cast as in U8_PREV, so both macros accept the same string arguments */ \
        (i)=utf8_back1SafeBody((const uint8_t *)(s), start, (i)); \
    } \
} UPRV_BLOCK_MACRO_END
| 796 |  | 
/**
 * Move the string offset backward by n code points, i.e., from one
 * code point boundary to the n-th one before it.
 * (Pre-decrementing backward iteration.)
 * The input offset may be the same as the string length.
 * "Unsafe" macro, assumes well-formed UTF-8.
 *
 * n is evaluated once and copied into an int32_t counter;
 * U8_BACK_1_UNSAFE() is then applied once per counted code point.
 * Nothing happens if n is zero or negative.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @param n number of code points to skip
 * @see U8_BACK_N
 * @stable ICU 2.4
 */
#define U8_BACK_N_UNSAFE(s, i, n) UPRV_BLOCK_MACRO_BEGIN { \
    int32_t __todo; \
    for(__todo=(n); __todo>0; --__todo) { \
        U8_BACK_1_UNSAFE(s, i); \
    } \
} UPRV_BLOCK_MACRO_END
| 817 |  | 
/**
 * Move the string offset backward by n code points, i.e., from one
 * code point boundary to the n-th one before it.
 * (Pre-decrementing backward iteration.)
 * The input offset may be the same as the string length.
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 *
 * n is evaluated once and copied into an int32_t counter.
 * U8_BACK_1() is applied until either n code points have been skipped
 * or the offset is no longer greater than start, whichever comes first.
 * Nothing happens if n is zero or negative, or if i is not greater than start.
 *
 * @param s const uint8_t * string
 * @param start int32_t index of the start of the string
 * @param i int32_t string offset, must be start<i
 * @param n number of code points to skip
 * @see U8_BACK_N_UNSAFE
 * @stable ICU 2.4
 */
#define U8_BACK_N(s, start, i, n) UPRV_BLOCK_MACRO_BEGIN { \
    int32_t __todo; \
    for(__todo=(n); __todo>0 && (i)>(start); --__todo) { \
        U8_BACK_1(s, start, i); \
    } \
} UPRV_BLOCK_MACRO_END
| 839 |  | 
/**
 * Adjust a random-access offset so that it is a code point boundary
 * after a code point (a "limit").
 * If the offset is behind a partial multi-byte sequence,
 * then the offset is incremented to behind the whole sequence.
 * Otherwise, it is not modified.
 * The input offset may be the same as the string length.
 * "Unsafe" macro, assumes well-formed UTF-8.
 *
 * The macro first steps back with U8_BACK_1_UNSAFE() and then forward
 * again with U8_FWD_1_UNSAFE(). Because the backward step always reads
 * the byte before the offset, there must be at least one byte before i.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @see U8_SET_CP_LIMIT
 * @stable ICU 2.4
 */
#define U8_SET_CP_LIMIT_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
    /* back up to the start of the code point that contains the byte before i */ \
    U8_BACK_1_UNSAFE(s, i); \
    /* then move forward past that code point */ \
    U8_FWD_1_UNSAFE(s, i); \
} UPRV_BLOCK_MACRO_END
| 857 |  | 
/**
 * Adjust a random-access offset so that it is a code point boundary
 * after a code point (a "limit").
 * If the offset is behind a partial multi-byte sequence,
 * then the offset is incremented to behind the whole sequence.
 * Otherwise, it is not modified.
 * The input offset may be the same as the string length.
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 *
 * The length can be negative for a NUL-terminated string.
 *
 * The offset is only touched when it lies strictly after start and,
 * for a non-negative length, strictly before length; in that case the macro
 * steps back with U8_BACK_1() and forward again with U8_FWD_1().
 * An offset equal to start, or equal to a non-negative length, is left as is.
 *
 * @param s const uint8_t * string
 * @param start int32_t starting string offset (usually 0)
 * @param i int32_t string offset, must be start<=i<=length
 * @param length int32_t string length
 * @see U8_SET_CP_LIMIT_UNSAFE
 * @stable ICU 2.4
 */
#define U8_SET_CP_LIMIT(s, start, i, length) UPRV_BLOCK_MACRO_BEGIN { \
    if((i)>(start) && ((length)<0 || (i)<(length))) { \
        /* back up to the start of the code point before i, then skip forward over it */ \
        U8_BACK_1(s, start, i); \
        U8_FWD_1(s, i, length); \
    } \
} UPRV_BLOCK_MACRO_END
| 881 |  | 
| 882 | #endif | 
| 883 |  |