ustring.h source code [include/unicode/ustring.h]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	/*
4	**********************************************************************
5	* Copyright (C) 1998-2014, International Business Machines
6	* Corporation and others. All Rights Reserved.
7	**********************************************************************
8	*
9	* File ustring.h
10	*
11	* Modification History:
12	*
13	* Date Name Description
14	* 12/07/98 bertrand Creation.
15	******************************************************************************
16	*/
17
18	#ifndef USTRING_H
19	#define USTRING_H
20
21	#include "unicode/utypes.h"
22	#include "unicode/putil.h"
23	#include "unicode/uiter.h"
24
25	/**
26	* \def UBRK_TYPEDEF_UBREAK_ITERATOR
27	* @internal
28	*/
29
30	#ifndef UBRK_TYPEDEF_UBREAK_ITERATOR
31	# define UBRK_TYPEDEF_UBREAK_ITERATOR
32	/** Simple declaration for u_strToTitle() to avoid including unicode/ubrk.h. @stable ICU 2.1 */
33	typedef struct UBreakIterator UBreakIterator;
34	#endif
35
36	/**
37	* \file
38	* \brief C API: Unicode string handling functions
39	*
40	* These C API functions provide general Unicode string handling.
41	*
42	* Some functions are equivalent in name, signature, and behavior to the ANSI C <string.h>
43	* functions. (For example, they do not check for bad arguments like NULL string pointers.)
44	* In some cases, only the thread-safe variant of such a function is implemented here
45	* (see u_strtok_r()).
46	*
47	* Other functions provide more Unicode-specific functionality like locale-specific
48	* upper/lower-casing and string comparison in code point order.
49	*
50	* ICU uses 16-bit Unicode (UTF-16) in the form of arrays of UChar code units.
51	* UTF-16 encodes each Unicode code point with either one or two UChar code units.
52	* (This is the default form of Unicode, and a forward-compatible extension of the original,
53	* fixed-width form that was known as UCS-2. UTF-16 superseded UCS-2 with Unicode 2.0
54	* in 1996.)
55	*
56	* Some APIs accept a 32-bit UChar32 value for a single code point.
57	*
58	* ICU also handles 16-bit Unicode text with unpaired surrogates.
59	* Such text is not well-formed UTF-16.
60	* Code-point-related functions treat unpaired surrogates as surrogate code points,
61	* i.e., as separate units.
62	*
63	* Although UTF-16 is a variable-width encoding form (like some legacy multi-byte encodings),
64	* it is much more efficient even for random access because the code unit values
65	* for single-unit characters vs. lead units vs. trail units are completely disjoint.
66	* This means that it is easy to determine character (code point) boundaries from
67	* random offsets in the string.
68	*
69	* Unicode (UTF-16) string processing is optimized for the single-unit case.
70	* Although it is important to support supplementary characters
71	* (which use pairs of lead/trail code units called "surrogates"),
72	* their occurrence is rare. Almost all characters in modern use require only
73	* a single UChar code unit (i.e., their code point values are <=0xffff).
74	*
75	* For more details see the User Guide Strings chapter (https://unicode-org.github.io/icu/userguide/strings/).
76	* For a discussion of the handling of unpaired surrogates see also
77	* Jitterbug 2145 and its icu mailing list proposal on 2002-sep-18.
78	*/
79
80	/**
81	* \defgroup ustring_ustrlen String Length
82	* \ingroup ustring_strlen
83	*/
84	/*@{*/
85	/**
86	* Determine the length of an array of UChar.
87	*
88	* @param s The array of UChars, NUL (U+0000) terminated.
89	* @return The number of UChars in <code>s</code>, minus the terminator.
90	* @stable ICU 2.0
91	*/
92	U_CAPI int32_t U_EXPORT2
93	u_strlen(const UChar *s);
94	/*@}*/
95
96	/**
97	* Count Unicode code points in the length UChar code units of the string.
98	* A code point may occupy either one or two UChar code units.
99	* Counting code points involves reading all code units.
100	*
101	* This function is basically the inverse of the U16_FWD_N() macro (see utf.h).
102	*
103	* @param s The input string.
104	* @param length The number of UChar code units to be checked, or -1 to count all
105	* code points before the first NUL (U+0000).
106	* @return The number of code points in the specified code units.
107	* @stable ICU 2.0
108	*/
109	U_CAPI int32_t U_EXPORT2
110	u_countChar32(const UChar *s, int32_t length);
111
112	/**
113	* Check if the string contains more Unicode code points than a certain number.
114	* This is more efficient than counting all code points in the entire string
115	* and comparing that number with a threshold.
116	* This function may not need to scan the string at all if the length is known
117	* (not -1 for NUL-termination) and falls within a certain range, and
118	* never needs to count more than 'number+1' code points.
119	* Logically equivalent to (u_countChar32(s, length)>number).
120	* A Unicode code point may occupy either one or two UChar code units.
121	*
122	* @param s The input string.
123	* @param length The length of the string, or -1 if it is NUL-terminated.
124	* @param number The threshold against which the number of code points
125	* in the string is compared.
126	* @return Boolean value for whether the string contains more Unicode code points
127	* than 'number'. Same as (u_countChar32(s, length)>number).
128	* @stable ICU 2.4
129	*/
130	U_CAPI UBool U_EXPORT2
131	u_strHasMoreChar32Than(const UChar *s, int32_t length, int32_t number);
132
133	/**
134	* Concatenate two ustrings. Appends a copy of <code>src</code>,
135	* including the NUL terminator, to <code>dst</code>. The initial copied
136	* character from <code>src</code> overwrites the NUL terminator in <code>dst</code>.
137	*
138	* @param dst The destination string (NUL-terminated; receives the appended copy).
139	* @param src The source string (NUL-terminated).
140	* @return A pointer to <code>dst</code>.
141	* @stable ICU 2.0
142	*/
143	U_CAPI UChar* U_EXPORT2
144	u_strcat(UChar *dst,
145	const UChar *src);
146
147	/**
148	* Concatenate two ustrings.
149	* Appends at most <code>n</code> characters from <code>src</code> to <code>dst</code>.
150	* Adds a terminating NUL.
151	* If src is too long, then only <code>n-1</code> characters will be copied
152	* before the terminating NUL. (NOTE(review): this conflicts with "at most n" above; confirm against the implementation.)
153	* If <code>n<=0</code> then dst is not modified.
154	*
155	* @param dst The destination string.
156	* @param src The source string (can be NULL/invalid if n<=0).
157	* @param n The maximum number of characters to append; no-op if <=0.
158	* @return A pointer to <code>dst</code>.
159	* @stable ICU 2.0
160	*/
161	U_CAPI UChar* U_EXPORT2
162	u_strncat(UChar *dst,
163	const UChar *src,
164	int32_t n);
165
166	/**
167	* Find the first occurrence of a substring in a string.
168	* The substring is found at code point boundaries.
169	* That means that if the substring begins with
170	* a trail surrogate or ends with a lead surrogate,
171	* then it is found only if these surrogates stand alone in the text.
172	* Otherwise, the substring edge units would be matched against
173	* halves of surrogate pairs.
174	*
175	* @param s The string to search (NUL-terminated).
176	* @param substring The substring to find (NUL-terminated).
177	* @return A pointer to the first occurrence of <code>substring</code> in <code>s</code>,
178	* or <code>s</code> itself if the <code>substring</code> is empty,
179	* or <code>NULL</code> if <code>substring</code> is not in <code>s</code>.
180	* @stable ICU 2.0
181	*
182	* @see u_strrstr
183	* @see u_strFindFirst
184	* @see u_strFindLast
185	*/
186	U_CAPI UChar * U_EXPORT2
187	u_strstr(const UChar *s, const UChar *substring);
188
189	/**
190	* Find the first occurrence of a substring in a string.
191	* The substring is found at code point boundaries.
192	* That means that if the substring begins with
193	* a trail surrogate or ends with a lead surrogate,
194	* then it is found only if these surrogates stand alone in the text.
195	* Otherwise, the substring edge units would be matched against
196	* halves of surrogate pairs.
197	*
198	* @param s The string to search.
199	* @param length The length of s (number of UChars), or -1 if it is NUL-terminated.
200	* @param substring The substring to find (NUL-terminated if subLength is -1).
201	* @param subLength The length of substring (number of UChars), or -1 if it is NUL-terminated.
202	* @return A pointer to the first occurrence of <code>substring</code> in <code>s</code>,
203	* or <code>s</code> itself if the <code>substring</code> is empty,
204	* or <code>NULL</code> if <code>substring</code> is not in <code>s</code>.
205	* @stable ICU 2.4
206	*
207	* @see u_strstr
208	* @see u_strFindLast
209	*/
210	U_CAPI UChar * U_EXPORT2
211	u_strFindFirst(const UChar *s, int32_t length, const UChar *substring, int32_t subLength);
212
213	/**
214	* Find the first occurrence of a BMP code point in a string.
215	* A surrogate code point is found only if its match in the text is not
216	* part of a surrogate pair.
217	* Searching for NUL (U+0000) finds the string terminator.
218	*
219	* @param s The string to search (NUL-terminated).
220	* @param c The BMP code point to find.
221	* @return A pointer to the first occurrence of <code>c</code> in <code>s</code>,
222	* or <code>NULL</code> if <code>c</code> is not in <code>s</code>.
223	* @stable ICU 2.0
224	*
225	* @see u_strchr32
226	* @see u_memchr
227	* @see u_strstr
228	* @see u_strFindFirst
229	*/
230	U_CAPI UChar * U_EXPORT2
231	u_strchr(const UChar *s, UChar c);
232
233	/**
234	* Find the first occurrence of a code point in a string.
235	* A surrogate code point is found only if its match in the text is not
236	* part of a surrogate pair.
237	* Searching for NUL (U+0000) finds the string terminator.
238	*
239	* @param s The string to search (NUL-terminated).
240	* @param c The code point to find (passed as a 32-bit UChar32 value).
241	* @return A pointer to the first occurrence of <code>c</code> in <code>s</code>,
242	* or <code>NULL</code> if <code>c</code> is not in <code>s</code>.
243	* @stable ICU 2.0
244	*
245	* @see u_strchr
246	* @see u_memchr32
247	* @see u_strstr
248	* @see u_strFindFirst
249	*/
250	U_CAPI UChar * U_EXPORT2
251	u_strchr32(const UChar *s, UChar32 c);
252
253	/**
254	* Find the last occurrence of a substring in a string.
255	* The substring is found at code point boundaries.
256	* That means that if the substring begins with
257	* a trail surrogate or ends with a lead surrogate,
258	* then it is found only if these surrogates stand alone in the text.
259	* Otherwise, the substring edge units would be matched against
260	* halves of surrogate pairs.
261	*
262	* @param s The string to search (NUL-terminated).
263	* @param substring The substring to find (NUL-terminated).
264	* @return A pointer to the last occurrence of <code>substring</code> in <code>s</code>,
265	* or <code>s</code> itself if the <code>substring</code> is empty,
266	* or <code>NULL</code> if <code>substring</code> is not in <code>s</code>.
267	* @stable ICU 2.4
268	*
269	* @see u_strstr
270	* @see u_strFindFirst
271	* @see u_strFindLast
272	*/
273	U_CAPI UChar * U_EXPORT2
274	u_strrstr(const UChar *s, const UChar *substring);
275
276	/**
277	* Find the last occurrence of a substring in a string.
278	* The substring is found at code point boundaries.
279	* That means that if the substring begins with
280	* a trail surrogate or ends with a lead surrogate,
281	* then it is found only if these surrogates stand alone in the text.
282	* Otherwise, the substring edge units would be matched against
283	* halves of surrogate pairs.
284	*
285	* @param s The string to search.
286	* @param length The length of s (number of UChars), or -1 if it is NUL-terminated.
287	* @param substring The substring to find (NUL-terminated if subLength is -1).
288	* @param subLength The length of substring (number of UChars), or -1 if it is NUL-terminated.
289	* @return A pointer to the last occurrence of <code>substring</code> in <code>s</code>,
290	* or <code>s</code> itself if the <code>substring</code> is empty,
291	* or <code>NULL</code> if <code>substring</code> is not in <code>s</code>.
292	* @stable ICU 2.4
293	*
294	* @see u_strrstr
295	* @see u_strFindFirst
296	*/
297	U_CAPI UChar * U_EXPORT2
298	u_strFindLast(const UChar *s, int32_t length, const UChar *substring, int32_t subLength);
299
300	/**
301	* Find the last occurrence of a BMP code point in a string.
302	* A surrogate code point is found only if its match in the text is not
303	* part of a surrogate pair.
304	* Searching for NUL (U+0000) finds the string terminator.
305	*
306	* @param s The string to search (NUL-terminated).
307	* @param c The BMP code point to find.
308	* @return A pointer to the last occurrence of <code>c</code> in <code>s</code>,
309	* or <code>NULL</code> if <code>c</code> is not in <code>s</code>.
310	* @stable ICU 2.4
311	*
312	* @see u_strrchr32
313	* @see u_memrchr
314	* @see u_strrstr
315	* @see u_strFindLast
316	*/
317	U_CAPI UChar * U_EXPORT2
318	u_strrchr(const UChar *s, UChar c);
319
320	/**
321	* Find the last occurrence of a code point in a string.
322	* A surrogate code point is found only if its match in the text is not
323	* part of a surrogate pair.
324	* Searching for NUL (U+0000) finds the string terminator.
325	*
326	* @param s The string to search (NUL-terminated).
327	* @param c The code point to find (passed as a 32-bit UChar32 value).
328	* @return A pointer to the last occurrence of <code>c</code> in <code>s</code>,
329	* or <code>NULL</code> if <code>c</code> is not in <code>s</code>.
330	* @stable ICU 2.4
331	*
332	* @see u_strrchr
333	* @see u_memrchr32
334	* @see u_strrstr
335	* @see u_strFindLast
336	*/
337	U_CAPI UChar * U_EXPORT2
338	u_strrchr32(const UChar *s, UChar32 c);
339
340	/**
341	* Locates the first occurrence in the string <code>string</code> of any of the characters
342	* in the string <code>matchSet</code>.
343	* Works just like C's strpbrk but with Unicode.
344	*
345	* @param string The string in which to search, NUL-terminated.
346	* @param matchSet A NUL-terminated string defining a set of code points
347	* for which to search in the text string.
348	* @return A pointer to the character in <code>string</code> that matches one of the
349	* characters in <code>matchSet</code>, or NULL if no such character is found.
350	* @stable ICU 2.0
351	*/
352	U_CAPI UChar * U_EXPORT2
353	u_strpbrk(const UChar *string, const UChar *matchSet);
354
355	/**
356	* Returns the number of consecutive characters in <code>string</code>,
357	* beginning with the first, that do not occur somewhere in <code>matchSet</code>.
358	* Works just like C's strcspn but with Unicode.
359	*
360	* @param string The string in which to search, NUL-terminated.
361	* @param matchSet A NUL-terminated string defining a set of code points
362	* for which to search in the text string.
363	* @return The number of initial characters in <code>string</code> that do not
364	* occur in <code>matchSet</code>.
365	* @see u_strspn
366	* @stable ICU 2.0
367	*/
368	U_CAPI int32_t U_EXPORT2
369	u_strcspn(const UChar *string, const UChar *matchSet);
370
371	/**
372	* Returns the number of consecutive characters in <code>string</code>,
373	* beginning with the first, that occur somewhere in <code>matchSet</code>.
374	* Works just like C's strspn but with Unicode.
375	*
376	* @param string The string in which to search, NUL-terminated.
377	* @param matchSet A NUL-terminated string defining a set of code points
378	* for which to search in the text string.
379	* @return The number of initial characters in <code>string</code> that do
380	* occur in <code>matchSet</code>.
381	* @see u_strcspn
382	* @stable ICU 2.0
383	*/
384	U_CAPI int32_t U_EXPORT2
385	u_strspn(const UChar *string, const UChar *matchSet);
386
387	/**
388	* The string tokenizer API allows an application to break a string into
389	* tokens. Unlike strtok(), the state (the current pointer within the
390	* original string) is maintained in saveState. In the first call, the
391	* argument src is a pointer to the string. In subsequent calls to
392	* return successive tokens of that string, src must be specified as
393	* NULL. The value saveState is set by this function to maintain the
394	* function's position within the string, and on each subsequent call
395	* you must give this argument the same variable. This function does
396	* handle surrogate pairs. This function is similar to strtok_r(),
397	* the POSIX Threads Extension (1003.1c-1995) version.
398	*
399	* @param src String containing token(s). This string will be modified.
400	* After the first call to u_strtok_r(), this argument must
401	* be NULL to get to the next token.
402	* @param delim Set of delimiter characters (Unicode code points).
403	* @param saveState The current pointer within the original string,
404	* which is set by this function. The saveState
405	* parameter should be the address of a local variable of type
406	* UChar *. (i.e. defined "UChar *myLocalSaveState" and use
407	* &myLocalSaveState for this parameter).
408	* @return A pointer to the next token found in src, or NULL
409	* when there are no more tokens.
410	* @stable ICU 2.0
411	*/
412	U_CAPI UChar * U_EXPORT2
413	u_strtok_r(UChar *src,
414	const UChar *delim,
415	UChar **saveState);
416
417	/**
418	* Compare two Unicode strings for bitwise equality (code unit order).
419	*
420	* @param s1 A string to compare.
421	* @param s2 A string to compare.
422	* @return 0 if <code>s1</code> and <code>s2</code> are bitwise equal; a negative
423	* value if <code>s1</code> is bitwise less than <code>s2</code>; a positive
424	* value if <code>s1</code> is bitwise greater than <code>s2</code>.
425	* @stable ICU 2.0
426	*/
427	U_CAPI int32_t U_EXPORT2
428	u_strcmp(const UChar *s1,
429	const UChar *s2);
430
431	/**
432	* Compare two Unicode strings in code point order.
433	* See u_strCompare for details.
434	*
435	* @param s1 A string to compare.
436	* @param s2 A string to compare.
437	* @return a negative/zero/positive integer corresponding to whether
438	* the first string is less than/equal to/greater than the second one
439	* in code point order
440	* @stable ICU 2.0
441	*/
442	U_CAPI int32_t U_EXPORT2
443	u_strcmpCodePointOrder(const UChar *s1, const UChar *s2);
444
445	/**
446	* Compare two Unicode strings (binary order).
447	*
448	* The comparison can be done in code unit order or in code point order.
449	* They differ only in UTF-16 when
450	* comparing supplementary code points (U+10000..U+10ffff)
451	* to BMP code points near the end of the BMP (i.e., U+e000..U+ffff).
452	* In code unit order, high BMP code points sort after supplementary code points
453	* because they are stored as pairs of surrogates which are at U+d800..U+dfff.
454	*
455	* This function works with strings of different explicitly specified lengths
456	* unlike the ANSI C-like u_strcmp() and u_memcmp() etc.
457	* NUL-terminated strings are possible with length arguments of -1.
458	*
459	* @param s1 First source string.
460	* @param length1 Length of first source string, or -1 if NUL-terminated.
461	*
462	* @param s2 Second source string.
463	* @param length2 Length of second source string, or -1 if NUL-terminated.
464	*
465	* @param codePointOrder Choose between code unit order (false)
466	* and code point order (true).
467	*
468	* @return <0 or 0 or >0 as usual for string comparisons
469	*
470	* @stable ICU 2.2
471	*/
472	U_CAPI int32_t U_EXPORT2
473	u_strCompare(const UChar *s1, int32_t length1,
474	const UChar *s2, int32_t length2,
475	UBool codePointOrder);
476
477	/**
478	* Compare two Unicode strings (binary order)
479	* as presented by UCharIterator objects.
480	* Works otherwise just like u_strCompare().
481	*
482	* Both iterators are reset to their start positions.
483	* When the function returns, it is undefined where the iterators
484	* have stopped.
485	*
486	* @param iter1 First source string iterator.
487	* @param iter2 Second source string iterator.
488	* @param codePointOrder Choose between code unit order (false)
489	* and code point order (true).
490	*
491	* @return <0 or 0 or >0 as usual for string comparisons
492	*
493	* @see u_strCompare
494	*
495	* @stable ICU 2.6
496	*/
497	U_CAPI int32_t U_EXPORT2
498	u_strCompareIter(UCharIterator *iter1, UCharIterator *iter2, UBool codePointOrder);
499
500	/**
501	* Compare two strings case-insensitively using full case folding.
502	* This is equivalent to
503	* u_strCompare(u_strFoldCase(s1, options),
504	* u_strFoldCase(s2, options),
505	* (options&U_COMPARE_CODE_POINT_ORDER)!=0).
506	*
507	* The comparison can be done in UTF-16 code unit order or in code point order.
508	* They differ only when comparing supplementary code points (U+10000..U+10ffff)
509	* to BMP code points near the end of the BMP (i.e., U+e000..U+ffff).
510	* In code unit order, high BMP code points sort after supplementary code points
511	* because they are stored as pairs of surrogates which are at U+d800..U+dfff.
512	*
513	* This function works with strings of different explicitly specified lengths
514	* unlike the ANSI C-like u_strcmp() and u_memcmp() etc.
515	* NUL-terminated strings are possible with length arguments of -1.
516	*
517	* @param s1 First source string.
518	* @param length1 Length of first source string, or -1 if NUL-terminated.
519	*
520	* @param s2 Second source string.
521	* @param length2 Length of second source string, or -1 if NUL-terminated.
522	*
523	* @param options A bit set of options:
524	* - U_FOLD_CASE_DEFAULT or 0 is used for default options:
525	* Comparison in code unit order with default case folding.
526	*
527	* - U_COMPARE_CODE_POINT_ORDER
528	* Set to choose code point order instead of code unit order
529	* (see u_strCompare for details).
530	*
531	* - U_FOLD_CASE_EXCLUDE_SPECIAL_I
532	*
533	* @param pErrorCode Must be a valid pointer to an error code value,
534	* which must not indicate a failure before the function call.
535	*
536	* @return <0 or 0 or >0 as usual for string comparisons
537	*
538	* @stable ICU 2.2
539	*/
540	U_CAPI int32_t U_EXPORT2
541	u_strCaseCompare(const UChar *s1, int32_t length1,
542	const UChar *s2, int32_t length2,
543	uint32_t options,
544	UErrorCode *pErrorCode);
545
546	/**
547	* Compare two ustrings for bitwise equality.
548	* Compares at most <code>n</code> characters.
549	*
550	* @param ucs1 A string to compare (can be NULL/invalid if n<=0).
551	* @param ucs2 A string to compare (can be NULL/invalid if n<=0).
552	* @param n The maximum number of characters to compare; always returns 0 if n<=0.
553	* @return 0 if <code>ucs1</code> and <code>ucs2</code> are bitwise equal; a negative
554	* value if <code>ucs1</code> is bitwise less than <code>ucs2</code>; a positive
555	* value if <code>ucs1</code> is bitwise greater than <code>ucs2</code>.
556	* @stable ICU 2.0
557	*/
558	U_CAPI int32_t U_EXPORT2
559	u_strncmp(const UChar *ucs1,
560	const UChar *ucs2,
561	int32_t n);
562
563	/**
564	* Compare two Unicode strings in code point order.
565	* This is different in UTF-16 from u_strncmp() if supplementary characters are present.
566	* For details, see u_strCompare().
567	*
568	* @param s1 A string to compare.
569	* @param s2 A string to compare.
570	* @param n The maximum number of characters to compare.
571	* @return a negative/zero/positive integer corresponding to whether
572	* the first string is less than/equal to/greater than the second one
573	* in code point order
574	* @stable ICU 2.0
575	*/
576	U_CAPI int32_t U_EXPORT2
577	u_strncmpCodePointOrder(const UChar *s1, const UChar *s2, int32_t n);
578
579	/**
580	* Compare two strings case-insensitively using full case folding.
581	* This is equivalent to u_strcmp(u_strFoldCase(s1, options), u_strFoldCase(s2, options)).
582	*
583	* @param s1 A string to compare.
584	* @param s2 A string to compare.
585	* @param options A bit set of options:
586	* - U_FOLD_CASE_DEFAULT or 0 is used for default options:
587	* Comparison in code unit order with default case folding.
588	*
589	* - U_COMPARE_CODE_POINT_ORDER
590	* Set to choose code point order instead of code unit order
591	* (see u_strCompare for details).
592	*
593	* - U_FOLD_CASE_EXCLUDE_SPECIAL_I
594	*
595	* @return A negative, zero, or positive integer indicating the comparison result.
596	* @stable ICU 2.0
597	*/
598	U_CAPI int32_t U_EXPORT2
599	u_strcasecmp(const UChar *s1, const UChar *s2, uint32_t options);
600
601	/**
602	* Compare two strings case-insensitively using full case folding.
603	* This is equivalent to u_strcmp(u_strFoldCase(s1, at most n, options),
604	* u_strFoldCase(s2, at most n, options)).
605	*
606	* @param s1 A string to compare.
607	* @param s2 A string to compare.
608	* @param n The maximum number of characters in each string to case-fold and then compare.
609	* @param options A bit set of options:
610	* - U_FOLD_CASE_DEFAULT or 0 is used for default options:
611	* Comparison in code unit order with default case folding.
612	*
613	* - U_COMPARE_CODE_POINT_ORDER
614	* Set to choose code point order instead of code unit order
615	* (see u_strCompare for details).
616	*
617	* - U_FOLD_CASE_EXCLUDE_SPECIAL_I
618	*
619	* @return A negative, zero, or positive integer indicating the comparison result.
620	* @stable ICU 2.0
621	*/
622	U_CAPI int32_t U_EXPORT2
623	u_strncasecmp(const UChar *s1, const UChar *s2, int32_t n, uint32_t options);
624
625	/**
626	* Compare two strings case-insensitively using full case folding.
627	* This is equivalent to u_strcmp(u_strFoldCase(s1, n, options),
628	* u_strFoldCase(s2, n, options)).
629	*
630	* @param s1 A string to compare.
631	* @param s2 A string to compare.
632	* @param length The number of characters in each string to case-fold and then compare.
633	* @param options A bit set of options:
634	* - U_FOLD_CASE_DEFAULT or 0 is used for default options:
635	* Comparison in code unit order with default case folding.
636	*
637	* - U_COMPARE_CODE_POINT_ORDER
638	* Set to choose code point order instead of code unit order
639	* (see u_strCompare for details).
640	*
641	* - U_FOLD_CASE_EXCLUDE_SPECIAL_I
642	*
643	* @return A negative, zero, or positive integer indicating the comparison result.
644	* @stable ICU 2.0
645	*/
646	U_CAPI int32_t U_EXPORT2
647	u_memcasecmp(const UChar *s1, const UChar *s2, int32_t length, uint32_t options);
648
649	/**
650	* Copy a ustring. Adds a NUL terminator.
651	*
652	* @param dst The destination string (receives the copy).
653	* @param src The source string.
654	* @return A pointer to <code>dst</code>.
655	* @stable ICU 2.0
656	*/
657	U_CAPI UChar* U_EXPORT2
658	u_strcpy(UChar *dst,
659	const UChar *src);
660
661	/**
662	* Copy a ustring.
663	* Copies at most <code>n</code> characters. The result will be NUL-terminated
664	* only if the length of <code>src</code> is less than <code>n</code>.
665	*
666	* @param dst The destination string (receives the copy).
667	* @param src The source string (can be NULL/invalid if n<=0).
668	* @param n The maximum number of characters to copy; no-op if <=0.
669	* @return A pointer to <code>dst</code>.
670	* @stable ICU 2.0
671	*/
672	U_CAPI UChar* U_EXPORT2
673	u_strncpy(UChar *dst,
674	const UChar *src,
675	int32_t n);
676
677	#if !UCONFIG_NO_CONVERSION
678
679	/**
680	* Copy a byte string encoded in the default codepage to a ustring.
681	* Adds a NUL terminator.
682	* Performs a host byte to UChar conversion.
683	*
684	* @param dst The destination string (UChars).
685	* @param src The source string (bytes in the default codepage).
686	* @return A pointer to <code>dst</code>.
687	* @stable ICU 2.0
688	*/
689	U_CAPI UChar* U_EXPORT2 u_uastrcpy(UChar *dst,
690	const char *src );
691
692	/**
693	* Copy a byte string encoded in the default codepage to a ustring.
694	* Copies at most <code>n</code> characters. The result will be NUL-terminated
695	* only if the length of <code>src</code> is less than <code>n</code>.
696	* Performs a host byte to UChar conversion.
697	*
698	* @param dst The destination string (UChars).
699	* @param src The source string (bytes in the default codepage).
700	* @param n The maximum number of characters to copy.
701	* @return A pointer to <code>dst</code>.
702	* @stable ICU 2.0
703	*/
704	U_CAPI UChar* U_EXPORT2 u_uastrncpy(UChar *dst,
705	const char *src,
706	int32_t n);
707
708	/**
709	* Copy a ustring to a byte string encoded in the default codepage.
710	* Adds a NUL terminator.
711	* Performs a UChar to host byte conversion.
712	*
713	* @param dst The destination string (bytes in the default codepage).
714	* @param src The source string (UChars).
715	* @return A pointer to <code>dst</code>.
716	* @stable ICU 2.0
717	*/
718	U_CAPI char* U_EXPORT2 u_austrcpy(char *dst,
719	const UChar *src );
720
721	/**
722	* Copy a ustring to a byte string encoded in the default codepage.
723	* Copies at most <code>n</code> characters. The result will be NUL-terminated
724	* only if the length of <code>src</code> is less than <code>n</code>.
725	* Performs a UChar to host byte conversion.
726	*
727	* @param dst The destination string (bytes in the default codepage).
728	* @param src The source string (UChars).
729	* @param n The maximum number of characters to copy.
730	* @return A pointer to <code>dst</code>.
731	* @stable ICU 2.0
732	*/
733	U_CAPI char* U_EXPORT2 u_austrncpy(char *dst,
734	const UChar *src,
735	int32_t n );
736
737	#endif
738
739	/**
740	* Synonym for memcpy(), but with UChars only.
741	* @param dest The destination string
742	* @param src The source string (can be NULL/invalid if count<=0)
743	* @param count The number of characters to copy; no-op if <=0
744	* @return A pointer to <code>dest</code>
745	* @stable ICU 2.0
746	*/
747	U_CAPI UChar* U_EXPORT2
748	u_memcpy(UChar *dest, const UChar *src, int32_t count);
749
750	/**
751	* Synonym for memmove(), but with UChars only.
752	* @param dest The destination string
753	* @param src The source string (can be NULL/invalid if count<=0)
754	* @param count The number of characters to move; no-op if <=0
755	* @return A pointer to <code>dest</code>
756	* @stable ICU 2.0
757	*/
758	U_CAPI UChar* U_EXPORT2
759	u_memmove(UChar *dest, const UChar *src, int32_t count);
760
761	/**
762	* Initialize <code>count</code> characters of <code>dest</code> to <code>c</code>.
763	*
764	* @param dest The destination string.
765	* @param c The character to initialize the string with.
766	* @param count The number of characters to set.
767	* @return A pointer to <code>dest</code>.
768	* @stable ICU 2.0
769	*/
770	U_CAPI UChar* U_EXPORT2
771	u_memset(UChar *dest, UChar c, int32_t count);
772
773	/**
774	* Compare the first <code>count</code> UChars of each buffer.
775	*
776	* @param buf1 The first string to compare.
777	* @param buf2 The second string to compare.
778	* @param count The maximum number of UChars to compare.
779	* @return When buf1 < buf2, a negative number is returned.
780	* When buf1 == buf2, 0 is returned.
781	* When buf1 > buf2, a positive number is returned.
782	* @stable ICU 2.0
783	*/
784	U_CAPI int32_t U_EXPORT2
785	u_memcmp(const UChar *buf1, const UChar *buf2, int32_t count);
786
787	/**
788	* Compare two Unicode strings in code point order.
789	* This is different in UTF-16 from u_memcmp() if supplementary characters are present.
790	* For details, see u_strCompare().
791	*
792	* @param s1 A string to compare.
793	* @param s2 A string to compare.
794	* @param count The maximum number of characters to compare.
795	* @return a negative/zero/positive integer corresponding to whether
796	* the first string is less than/equal to/greater than the second one
797	* in code point order
798	* @stable ICU 2.0
799	*/
800	U_CAPI int32_t U_EXPORT2
801	u_memcmpCodePointOrder(const UChar *s1, const UChar *s2, int32_t count);
802
803	/**
804	* Find the first occurrence of a BMP code point in a string.
805	* A surrogate code point is found only if its match in the text is not
806	* part of a surrogate pair.
807	* A NUL character is found at the string terminator.
808	*
809	* @param s The string to search (contains <code>count</code> UChars).
810	* @param c The BMP code point to find.
811	* @param count The length of the string (number of UChars).
812	* @return A pointer to the first occurrence of <code>c</code> in <code>s</code>,
813	* or <code>NULL</code> if <code>c</code> is not in <code>s</code>.
814	* @stable ICU 2.0
815	*
816	* @see u_strchr
817	* @see u_memchr32
818	* @see u_strFindFirst
819	*/
820	U_CAPI UChar* U_EXPORT2
821	u_memchr(const UChar *s, UChar c, int32_t count);
822
823	/**
824	* Find the first occurrence of a code point in a string of <code>count</code> UChars.
825	* A surrogate code point is found only if its match in the text is not
826	* part of a surrogate pair.
827	* A NUL character is found at the string terminator. (NOTE(review): the string here is counted, not terminated - confirm.)
828	*
829	* @param s The string to search (contains <code>count</code> UChars).
830	* @param c The code point to find (a UChar32, so not limited to the BMP).
831	* @param count The length of the string, in UChars.
832	* @return A pointer to the first occurrence of <code>c</code> in <code>s</code>,
833	* or <code>NULL</code> if <code>c</code> is not in <code>s</code>.
834	* @stable ICU 2.0
835	*
836	* @see u_strchr32
837	* @see u_memchr
838	* @see u_strFindFirst
839	*/
840	U_CAPI UChar* U_EXPORT2
841	u_memchr32(const UChar *s, UChar32 c, int32_t count);
842
843	/**
844	* Find the last occurrence of a BMP code point in a string of <code>count</code> UChars.
845	* A surrogate code point is found only if its match in the text is not
846	* part of a surrogate pair.
847	* A NUL character is found at the string terminator. (NOTE(review): the string here is counted, not terminated - confirm.)
848	*
849	* @param s The string to search (contains <code>count</code> UChars).
850	* @param c The BMP code point to find.
851	* @param count The length of the string, in UChars.
852	* @return A pointer to the last occurrence of <code>c</code> in <code>s</code>,
853	* or <code>NULL</code> if <code>c</code> is not in <code>s</code>.
854	* @stable ICU 2.4
855	*
856	* @see u_strrchr
857	* @see u_memrchr32
858	* @see u_strFindLast
859	*/
860	U_CAPI UChar* U_EXPORT2
861	u_memrchr(const UChar *s, UChar c, int32_t count);
862
863	/**
864	* Find the last occurrence of a code point in a string of <code>count</code> UChars.
865	* A surrogate code point is found only if its match in the text is not
866	* part of a surrogate pair.
867	* A NUL character is found at the string terminator. (NOTE(review): the string here is counted, not terminated - confirm.)
868	*
869	* @param s The string to search (contains <code>count</code> UChars).
870	* @param c The code point to find (a UChar32, so not limited to the BMP).
871	* @param count The length of the string, in UChars.
872	* @return A pointer to the last occurrence of <code>c</code> in <code>s</code>,
873	* or <code>NULL</code> if <code>c</code> is not in <code>s</code>.
874	* @stable ICU 2.4
875	*
876	* @see u_strrchr32
877	* @see u_memrchr
878	* @see u_strFindLast
879	*/
880	U_CAPI UChar* U_EXPORT2
881	u_memrchr32(const UChar *s, UChar32 c, int32_t count);
882
883	/**
884	* Unicode String literals in C.
885	* We need one macro to declare a variable for the string
886	* and to statically preinitialize it if possible,
887	* and a second macro to dynamically initialize such a string variable if necessary.
888	*
889	* The macros are defined for maximum performance.
890	* They work only for strings that contain "invariant characters", i.e.,
891	* only latin letters, digits, and some punctuation.
892	* See utypes.h for details.
893	*
894	* A pair of macros for a single string must be used with the same
895	* parameters.
896	* The string parameter must be a C string literal.
897	* The length of the string, not including the terminating
898	* `NUL`, must be specified as a constant.
899	* The U_STRING_DECL macro should be invoked exactly once for one
900	* such string variable before it is used.
901	*
902	* Usage:
903	*
904	* U_STRING_DECL(ustringVar1, "Quick-Fox 2", 11);
905	* U_STRING_DECL(ustringVar2, "jumps 5%", 8);
906	* static UBool didInit=false;
907	*
908	* int32_t function() {
909	* if(!didInit) {
910	* U_STRING_INIT(ustringVar1, "Quick-Fox 2", 11);
911	* U_STRING_INIT(ustringVar2, "jumps 5%", 8);
912	* didInit=true;
913	* }
914	* return u_strcmp(ustringVar1, ustringVar2);
915	* }
916	*
917	* Note that the macros will NOT consistently work if their argument is another #`define`.
918	* The following will not work on all platforms, don't use it.
919	*
920	* #define GLUCK "Mr. Gluck"
921	* U_STRING_DECL(var, GLUCK, 9)
922	* U_STRING_INIT(var, GLUCK, 9)
923	*
924	* Instead, use the string literal "Mr. Gluck" as the argument to both macro
925	* calls.
926	*
927	*
928	* @stable ICU 2.0
929	*/
930	#if defined(U_DECLARE_UTF16)
931	# define U_STRING_DECL(var, cs, length) static const UChar var=(const UChar )U_DECLARE_UTF16(cs)
932	/@stable ICU 2.0 /*
933	# define U_STRING_INIT(var, cs, length)
934	#elif U_SIZEOF_WCHAR_T==U_SIZEOF_UCHAR && (U_CHARSET_FAMILY==U_ASCII_FAMILY \|\| defined(U_WCHAR_IS_UTF16))
935	# define U_STRING_DECL(var, cs, length) static const UChar var[(length)+1]=L ## cs
936	/@stable ICU 2.0 /*
937	# define U_STRING_INIT(var, cs, length)
938	#else
939	# define U_STRING_DECL(var, cs, length) static UChar var[(length)+1]
940	/@stable ICU 2.0 /*
941	# define U_STRING_INIT(var, cs, length) u_charsToUChars(cs, var, length+1)
942	#endif
943
944	/**
945	* Unescape a string of characters and write the resulting
946	* Unicode characters to the destination buffer. The following escape
947	* sequences are recognized:
948	*
949	* \\uhhhh 4 hex digits; h in [0-9A-Fa-f]
950	* \\Uhhhhhhhh 8 hex digits
951	* \\xhh 1-2 hex digits
952	* \\x{h...} 1-8 hex digits
953	* \\ooo 1-3 octal digits; o in [0-7]
954	* \\cX control-X; X is masked with 0x1F
955	*
956	* as well as the standard ANSI C escapes (plus \\e, which is a common extension rather than ANSI C):
957	*
958	* \\a => U+0007, \\b => U+0008, \\t => U+0009, \\n => U+000A,
959	* \\v => U+000B, \\f => U+000C, \\r => U+000D, \\e => U+001B,
960	* \\" => U+0022, \\' => U+0027, \\? => U+003F, \\\\ => U+005C
961	*
962	* Anything else following a backslash is generically escaped. For
963	* example, "[a\\-z]" returns "[a-z]".
964	*
965	* If an escape sequence is ill-formed, this function produces an empty
966	* string. An example of an ill-formed sequence is "\\u" followed by
967	* fewer than 4 hex digits.
968	*
969	* The above characters are recognized in the compiler's codepage,
970	* that is, they are coded as 'u', '\\', etc. Characters that are
971	* not parts of escape sequences are converted using u_charsToUChars().
972	*
973	* This function is similar to UnicodeString::unescape() but not
974	* identical to it. The latter takes a source UnicodeString, so it
975	* does escape recognition but no conversion.
976	*
977	* @param src a zero-terminated string of invariant characters
978	* @param dest pointer to the buffer to receive the converted and unescaped
979	* text and, if there is room, a zero terminator. May be NULL for
980	* preflighting, in which case no UChars will be written, but the
981	* return value will still be valid. On error, an empty string is
982	* stored here (if possible).
983	* @param destCapacity the number of UChars that may be written at
984	* dest. Ignored if dest == NULL.
985	* @return the length of the unescaped string.
986	* @see u_unescapeAt
987	* @see UnicodeString#unescape()
988	* @see UnicodeString#unescapeAt()
989	* @stable ICU 2.0
990	*/
991	U_CAPI int32_t U_EXPORT2
992	u_unescape(const char *src,
993	UChar *dest, int32_t destCapacity);
994
995	U_CDECL_BEGIN
996	/**
997	* Callback function for u_unescapeAt() that returns a character of
998	* the source text given an offset and a context pointer. The context
999	* pointer will be whatever is passed into u_unescapeAt().
1000	*
1001	* @param offset the offset into the source text of the character to return
1002	* @param context an opaque pointer passed directly into u_unescapeAt()
1003	* @return the character (UChar) of the source text at
1004	* offset
1005	* @see u_unescapeAt
1006	* @stable ICU 2.0
1007	*/
1008	typedef UChar (U_CALLCONV *UNESCAPE_CHAR_AT)(int32_t offset, void *context);
1009	U_CDECL_END
1010
1011	/**
1012	* Unescape a single sequence. The character at offset-1 is assumed
1013	* (without checking) to be a backslash. This function takes a callback
1014	* pointer to a function that returns the UChar at a given offset. By
1015	* varying this callback, ICU functions are able to unescape char*
1016	* strings, UnicodeString objects, and UFILE pointers.
1017	*
1018	* If offset is out of range, or if the escape sequence is ill-formed,
1019	* (UChar32)0xFFFFFFFF is returned. See the documentation of u_unescape()
1020	* for a list of recognized sequences.
1021	*
1022	* @param charAt callback function that returns a UChar of the source
1023	* text given an offset and a context pointer.
1024	* @param offset pointer to the offset that will be passed to charAt.
1025	* The offset value will be updated upon return to point after the
1026	* last parsed character of the escape sequence. On error the offset
1027	* is unchanged.
1028	* @param length the number of characters in the source text. The
1029	* last character of the source text is considered to be at offset
1030	* length-1.
1031	* @param context an opaque pointer passed directly into charAt.
1032	* @return the character represented by the escape sequence at
1033	* offset, or (UChar32)0xFFFFFFFF on error.
1034	* @see u_unescape()
1035	* @see UnicodeString#unescape()
1036	* @see UnicodeString#unescapeAt()
1037	* @stable ICU 2.0
1038	*/
1039	U_CAPI UChar32 U_EXPORT2
1040	u_unescapeAt(UNESCAPE_CHAR_AT charAt,
1041	int32_t *offset,
1042	int32_t length,
1043	void *context);
1044
1045	/**
1046	* Uppercase the characters in a string.
1047	* Casing is locale-dependent and context-sensitive.
1048	* The result may be longer or shorter than the original.
1049	* The source string and the destination buffer are allowed to overlap.
1050	*
1051	* @param dest A buffer for the result string. The result will be zero-terminated if
1052	* the buffer is large enough.
1053	* @param destCapacity The size of the buffer (number of UChars). If it is 0, then
1054	* dest may be NULL and the function will only return the length of the result
1055	* without writing any of the result string (pre-flighting).
1056	* @param src The original string.
1057	* @param srcLength The length of the original string. If -1, then src must be zero-terminated.
1058	* @param locale The locale to consider, or "" for the root locale or NULL for the default locale.
1059	* @param pErrorCode Must be a valid pointer to an error code value,
1060	* which must not indicate a failure before the function call.
1061	* @return The length of the result string. It may be greater than destCapacity. In that case,
1062	* only some of the result was written to the destination buffer.
1063	* @stable ICU 2.0
1064	*/
1065	U_CAPI int32_t U_EXPORT2
1066	u_strToUpper(UChar *dest, int32_t destCapacity,
1067	const UChar *src, int32_t srcLength,
1068	const char *locale,
1069	UErrorCode *pErrorCode);
1070
1071	/**
1072	* Lowercase the characters in a string.
1073	* Casing is locale-dependent and context-sensitive.
1074	* The result may be longer or shorter than the original.
1075	* The source string and the destination buffer are allowed to overlap.
1076	*
1077	* @param dest A buffer for the result string. The result will be zero-terminated if
1078	* the buffer is large enough.
1079	* @param destCapacity The size of the buffer (number of UChars). If it is 0, then
1080	* dest may be NULL and the function will only return the length of the result
1081	* without writing any of the result string (pre-flighting).
1082	* @param src The original string.
1083	* @param srcLength The length of the original string. If -1, then src must be zero-terminated.
1084	* @param locale The locale to consider, or "" for the root locale or NULL for the default locale.
1085	* @param pErrorCode Must be a valid pointer to an error code value,
1086	* which must not indicate a failure before the function call.
1087	* @return The length of the result string. It may be greater than destCapacity. In that case,
1088	* only some of the result was written to the destination buffer.
1089	* @stable ICU 2.0
1090	*/
1091	U_CAPI int32_t U_EXPORT2
1092	u_strToLower(UChar *dest, int32_t destCapacity,
1093	const UChar *src, int32_t srcLength,
1094	const char *locale,
1095	UErrorCode *pErrorCode);
1096
1097	#if !UCONFIG_NO_BREAK_ITERATION
1098
1099	/**
1100	* Titlecase a string.
1101	* Casing is locale-dependent and context-sensitive.
1102	* Titlecasing uses a break iterator to find the first characters of words
1103	* that are to be titlecased. It titlecases those characters and lowercases
1104	* all others.
1105	*
1106	* The titlecase break iterator can be provided to customize for arbitrary
1107	* styles, using rules and dictionaries beyond the standard iterators.
1108	* It may be more efficient to always provide an iterator to avoid
1109	* opening and closing one for each string.
1110	* The standard titlecase iterator for the root locale implements the
1111	* algorithm of Unicode TR 21.
1112	*
1113	* This function uses only the setText(), first() and next() methods of the
1114	* provided break iterator.
1115	*
1116	* The result may be longer or shorter than the original.
1117	* The source string and the destination buffer are allowed to overlap.
1118	*
1119	* @param dest A buffer for the result string. The result will be zero-terminated if
1120	* the buffer is large enough.
1121	* @param destCapacity The size of the buffer (number of UChars). If it is 0, then
1122	* dest may be NULL and the function will only return the length of the result
1123	* without writing any of the result string (pre-flighting).
1124	* @param src The original string.
1125	* @param srcLength The length of the original string. If -1, then src must be zero-terminated.
1126	* @param titleIter A break iterator to find the first characters of words
1127	* that are to be titlecased.
1128	* If none is provided (NULL), then a standard titlecase
1129	* break iterator is opened.
1130	* @param locale The locale to consider, or "" for the root locale or NULL for the default locale.
1131	* @param pErrorCode Must be a valid pointer to an error code value,
1132	* which must not indicate a failure before the function call.
1133	* @return The length of the result string. It may be greater than destCapacity. In that case,
1134	* only some of the result was written to the destination buffer.
1135	* @stable ICU 2.1
1136	*/
1137	U_CAPI int32_t U_EXPORT2
1138	u_strToTitle(UChar *dest, int32_t destCapacity,
1139	const UChar *src, int32_t srcLength,
1140	UBreakIterator *titleIter,
1141	const char *locale,
1142	UErrorCode *pErrorCode);
1143
1144	#endif
1145
1146	/**
1147	* Case-fold the characters in a string.
1148	*
1149	* Case-folding is locale-independent and not context-sensitive,
1150	* but there is an option for whether to include or exclude mappings for dotted I
1151	* and dotless i that are marked with 'T' in CaseFolding.txt.
1152	*
1153	* The result may be longer or shorter than the original.
1154	* The source string and the destination buffer are allowed to overlap.
1155	*
1156	* @param dest A buffer for the result string. The result will be zero-terminated if
1157	* the buffer is large enough.
1158	* @param destCapacity The size of the buffer (number of UChars). If it is 0, then
1159	* dest may be NULL and the function will only return the length of the result
1160	* without writing any of the result string (pre-flighting).
1161	* @param src The original string.
1162	* @param srcLength The length of the original string. If -1, then src must be zero-terminated.
1163	* @param options Either U_FOLD_CASE_DEFAULT or U_FOLD_CASE_EXCLUDE_SPECIAL_I.
1164	* @param pErrorCode Must be a valid pointer to an error code value,
1165	* which must not indicate a failure before the function call.
1166	* @return The length of the result string. It may be greater than destCapacity. In that case,
1167	* only some of the result was written to the destination buffer.
1168	* @stable ICU 2.0
1169	*/
1170	U_CAPI int32_t U_EXPORT2
1171	u_strFoldCase(UChar *dest, int32_t destCapacity,
1172	const UChar *src, int32_t srcLength,
1173	uint32_t options,
1174	UErrorCode *pErrorCode);
1175
1176	#if defined(U_WCHAR_IS_UTF16) || defined(U_WCHAR_IS_UTF32) || !UCONFIG_NO_CONVERSION
1177	/**
1178	* Convert a UTF-16 string to a wchar_t string.
1179	* If it is known at compile time that wchar_t strings are in UTF-16 or UTF-32, then
1180	* this function simply calls the fast, dedicated function for that.
1181	* Otherwise, two conversions UTF-16 -> default charset -> wchar_t* are performed.
1182	*
1183	* @param dest A buffer for the result string. The result will be zero-terminated if
1184	* the buffer is large enough.
1185	* @param destCapacity The size of the buffer (number of wchar_t's). If it is 0, then
1186	* dest may be NULL and the function will only return the length of the
1187	* result without writing any of the result string (pre-flighting).
1188	* @param pDestLength A pointer to receive the number of units written to the destination. If
1189	* pDestLength!=NULL then *pDestLength is always set to the
1190	* number of output units corresponding to the transformation of
1191	* all the input units, even in case of a buffer overflow.
1192	* @param src The original source string.
1193	* @param srcLength The length of the original string. If -1, then src must be zero-terminated.
1194	* @param pErrorCode Must be a valid pointer to an error code value,
1195	* which must not indicate a failure before the function call.
1196	* @return The pointer to the destination buffer.
1197	* @stable ICU 2.0
1198	*/
1199	U_CAPI wchar_t* U_EXPORT2
1200	u_strToWCS(wchar_t *dest,
1201	int32_t destCapacity,
1202	int32_t *pDestLength,
1203	const UChar *src,
1204	int32_t srcLength,
1205	UErrorCode *pErrorCode);
1206	/**
1207	* Convert a wchar_t string to UTF-16.
1208	* If it is known at compile time that wchar_t strings are in UTF-16 or UTF-32, then
1209	* this function simply calls the fast, dedicated function for that.
1210	* Otherwise, two conversions wchar_t* -> default charset -> UTF-16 are performed.
1211	*
1212	* @param dest A buffer for the result string. The result will be zero-terminated if
1213	* the buffer is large enough.
1214	* @param destCapacity The size of the buffer (number of UChars). If it is 0, then
1215	* dest may be NULL and the function will only return the length of the
1216	* result without writing any of the result string (pre-flighting).
1217	* @param pDestLength A pointer to receive the number of units written to the destination. If
1218	* pDestLength!=NULL then *pDestLength is always set to the
1219	* number of output units corresponding to the transformation of
1220	* all the input units, even in case of a buffer overflow.
1221	* @param src The original source string.
1222	* @param srcLength The length of the original string. If -1, then src must be zero-terminated.
1223	* @param pErrorCode Must be a valid pointer to an error code value,
1224	* which must not indicate a failure before the function call.
1225	* @return The pointer to the destination buffer.
1226	* @stable ICU 2.0
1227	*/
1228	U_CAPI UChar* U_EXPORT2
1229	u_strFromWCS(UChar *dest,
1230	int32_t destCapacity,
1231	int32_t *pDestLength,
1232	const wchar_t *src,
1233	int32_t srcLength,
1234	UErrorCode *pErrorCode);
1235	#endif /* defined(U_WCHAR_IS_UTF16) || defined(U_WCHAR_IS_UTF32) || !UCONFIG_NO_CONVERSION */
1236
1237	/**
1238	* Convert a UTF-16 string to UTF-8.
1239	* If the input string is not well-formed, then the U_INVALID_CHAR_FOUND error code is set.
1240	*
1241	* @param dest A buffer for the result string. The result will be zero-terminated if
1242	* the buffer is large enough.
1243	* @param destCapacity The size of the buffer (number of chars). If it is 0, then
1244	* dest may be NULL and the function will only return the length of the
1245	* result without writing any of the result string (pre-flighting).
1246	* @param pDestLength A pointer to receive the number of units written to the destination. If
1247	* pDestLength!=NULL then *pDestLength is always set to the
1248	* number of output units corresponding to the transformation of
1249	* all the input units, even in case of a buffer overflow.
1250	* @param src The original source string.
1251	* @param srcLength The length of the original string. If -1, then src must be zero-terminated.
1252	* @param pErrorCode Must be a valid pointer to an error code value,
1253	* which must not indicate a failure before the function call.
1254	* @return The pointer to the destination buffer.
1255	* @stable ICU 2.0
1256	* @see u_strToUTF8WithSub
1257	* @see u_strFromUTF8
1258	*/
1259	U_CAPI char* U_EXPORT2
1260	u_strToUTF8(char *dest,
1261	int32_t destCapacity,
1262	int32_t *pDestLength,
1263	const UChar *src,
1264	int32_t srcLength,
1265	UErrorCode *pErrorCode);
1266
1267	/**
1268	* Convert a UTF-8 string to UTF-16.
1269	* If the input string is not well-formed, then the U_INVALID_CHAR_FOUND error code is set.
1270	*
1271	* @param dest A buffer for the result string. The result will be zero-terminated if
1272	* the buffer is large enough.
1273	* @param destCapacity The size of the buffer (number of UChars). If it is 0, then
1274	* dest may be NULL and the function will only return the length of the
1275	* result without writing any of the result string (pre-flighting).
1276	* @param pDestLength A pointer to receive the number of units written to the destination. If
1277	* pDestLength!=NULL then *pDestLength is always set to the
1278	* number of output units corresponding to the transformation of
1279	* all the input units, even in case of a buffer overflow.
1280	* @param src The original source string.
1281	* @param srcLength The length of the original string. If -1, then src must be zero-terminated.
1282	* @param pErrorCode Must be a valid pointer to an error code value,
1283	* which must not indicate a failure before the function call.
1284	* @return The pointer to the destination buffer.
1285	* @stable ICU 2.0
1286	* @see u_strFromUTF8WithSub
1287	* @see u_strFromUTF8Lenient
1288	*/
1289	U_CAPI UChar* U_EXPORT2
1290	u_strFromUTF8(UChar *dest,
1291	int32_t destCapacity,
1292	int32_t *pDestLength,
1293	const char *src,
1294	int32_t srcLength,
1295	UErrorCode *pErrorCode);
1296
1297	/**
1298	* Convert a UTF-16 string to UTF-8, optionally substituting for illegal input.
1299	*
1300	* Same as u_strToUTF8() except for the additional subchar which is output for
1301	* illegal input sequences, instead of stopping with the U_INVALID_CHAR_FOUND error code.
1302	* With subchar==U_SENTINEL, this function behaves exactly like u_strToUTF8().
1303	*
1304	* @param dest A buffer for the result string. The result will be zero-terminated if
1305	* the buffer is large enough.
1306	* @param destCapacity The size of the buffer (number of chars). If it is 0, then
1307	* dest may be NULL and the function will only return the length of the
1308	* result without writing any of the result string (pre-flighting).
1309	* @param pDestLength A pointer to receive the number of units written to the destination. If
1310	* pDestLength!=NULL then *pDestLength is always set to the
1311	* number of output units corresponding to the transformation of
1312	* all the input units, even in case of a buffer overflow.
1313	* @param src The original source string.
1314	* @param srcLength The length of the original string. If -1, then src must be zero-terminated.
1315	* @param subchar The substitution character to use in place of an illegal input sequence,
1316	* or U_SENTINEL if the function is to return with U_INVALID_CHAR_FOUND instead.
1317	* A substitution character can be any valid Unicode code point (up to U+10FFFF)
1318	* except for surrogate code points (U+D800..U+DFFF).
1319	* The recommended value is U+FFFD "REPLACEMENT CHARACTER".
1320	* @param pNumSubstitutions Output parameter receiving the number of substitutions if subchar>=0.
1321	* Set to 0 if no substitutions occur or subchar<0.
1322	* pNumSubstitutions can be NULL.
1323	* @param pErrorCode Pointer to a standard ICU error code. Its input value must
1324	* pass the U_SUCCESS() test, or else the function returns
1325	* immediately. Check for U_FAILURE() on output or use with
1326	* function chaining. (See User Guide for details.)
1327	* @return The pointer to the destination buffer.
1328	* @see u_strToUTF8
1329	* @see u_strFromUTF8WithSub
1330	* @stable ICU 3.6
1331	*/
1332	U_CAPI char* U_EXPORT2
1333	u_strToUTF8WithSub(char *dest,
1334	int32_t destCapacity,
1335	int32_t *pDestLength,
1336	const UChar *src,
1337	int32_t srcLength,
1338	UChar32 subchar, int32_t *pNumSubstitutions,
1339	UErrorCode *pErrorCode);
1340
1341	/**
1342	* Convert a UTF-8 string to UTF-16, optionally substituting for illegal input.
1343	*
1344	* Same as u_strFromUTF8() except for the additional subchar which is output for
1345	* illegal input sequences, instead of stopping with the U_INVALID_CHAR_FOUND error code.
1346	* With subchar==U_SENTINEL, this function behaves exactly like u_strFromUTF8().
1347	*
1348	* @param dest A buffer for the result string. The result will be zero-terminated if
1349	* the buffer is large enough.
1350	* @param destCapacity The size of the buffer (number of UChars). If it is 0, then
1351	* dest may be NULL and the function will only return the length of the
1352	* result without writing any of the result string (pre-flighting).
1353	* @param pDestLength A pointer to receive the number of units written to the destination. If
1354	* pDestLength!=NULL then *pDestLength is always set to the
1355	* number of output units corresponding to the transformation of
1356	* all the input units, even in case of a buffer overflow.
1357	* @param src The original source string.
1358	* @param srcLength The length of the original string. If -1, then src must be zero-terminated.
1359	* @param subchar The substitution character to use in place of an illegal input sequence,
1360	* or U_SENTINEL if the function is to return with U_INVALID_CHAR_FOUND instead.
1361	* A substitution character can be any valid Unicode code point (up to U+10FFFF)
1362	* except for surrogate code points (U+D800..U+DFFF).
1363	* The recommended value is U+FFFD "REPLACEMENT CHARACTER".
1364	* @param pNumSubstitutions Output parameter receiving the number of substitutions if subchar>=0.
1365	* Set to 0 if no substitutions occur or subchar<0.
1366	* pNumSubstitutions can be NULL.
1367	* @param pErrorCode Pointer to a standard ICU error code. Its input value must
1368	* pass the U_SUCCESS() test, or else the function returns
1369	* immediately. Check for U_FAILURE() on output or use with
1370	* function chaining. (See User Guide for details.)
1371	* @return The pointer to the destination buffer.
1372	* @see u_strFromUTF8
1373	* @see u_strFromUTF8Lenient
1374	* @see u_strToUTF8WithSub
1375	* @stable ICU 3.6
1376	*/
1377	U_CAPI UChar* U_EXPORT2
1378	u_strFromUTF8WithSub(UChar *dest,
1379	int32_t destCapacity,
1380	int32_t *pDestLength,
1381	const char *src,
1382	int32_t srcLength,
1383	UChar32 subchar, int32_t *pNumSubstitutions,
1384	UErrorCode *pErrorCode);
1385
1386	/**
1387	* Convert a UTF-8 string to UTF-16 quickly, without validating the input.
1388	*
1389	* Same as u_strFromUTF8() except that this function is designed to be very fast,
1390	* which it achieves by being lenient about malformed UTF-8 sequences.
1391	* This function is intended for use in environments where UTF-8 text is
1392	* expected to be well-formed.
1393	*
1394	* Its semantics are:
1395	* - Well-formed UTF-8 text is correctly converted to well-formed UTF-16 text.
1396	* - The function will not read beyond the input string, nor write beyond
1397	* the destCapacity.
1398	* - Malformed UTF-8 results in "garbage" 16-bit Unicode strings which may not
1399	* be well-formed UTF-16.
1400	* The function will resynchronize to valid code point boundaries
1401	* within a small number of code points after an illegal sequence.
1402	* - Non-shortest forms are not detected and will result in "spoofing" output.
1403	*
1404	* For further performance improvement, if srcLength is given (>=0),
1405	* then destCapacity must be >=srcLength.
1406	*
1407	* There is no inverse u_strToUTF8Lenient() function because there is practically
1408	* no performance gain from not checking that a UTF-16 string is well-formed.
1409	*
1410	* @param dest A buffer for the result string. The result will be zero-terminated if
1411	* the buffer is large enough.
1412	* @param destCapacity The size of the buffer (number of UChars). If it is 0, then
1413	* dest may be NULL and the function will only return the length of the
1414	* result without writing any of the result string (pre-flighting).
1415	* Unlike for other ICU functions, if srcLength>=0 then
1416	* destCapacity must be >=srcLength.
1417	* @param pDestLength A pointer to receive the number of units written to the destination. If
1418	* pDestLength!=NULL then *pDestLength is always set to the
1419	* number of output units corresponding to the transformation of
1420	* all the input units, even in case of a buffer overflow.
1421	* Unlike for other ICU functions, if srcLength>=0 but
1422	* destCapacity<srcLength, then *pDestLength will be set to srcLength
1423	* (and U_BUFFER_OVERFLOW_ERROR will be set)
1424	* regardless of the actual result length.
1425	* @param src The original source string.
1426	* @param srcLength The length of the original string. If -1, then src must be zero-terminated.
1427	* @param pErrorCode Pointer to a standard ICU error code. Its input value must
1428	* pass the U_SUCCESS() test, or else the function returns
1429	* immediately. Check for U_FAILURE() on output or use with
1430	* function chaining. (See User Guide for details.)
1431	* @return The pointer to the destination buffer.
1432	* @see u_strFromUTF8
1433	* @see u_strFromUTF8WithSub
1434	* @see u_strToUTF8WithSub
1435	* @stable ICU 3.6
1436	*/
1437	U_CAPI UChar * U_EXPORT2
1438	u_strFromUTF8Lenient(UChar *dest,
1439	int32_t destCapacity,
1440	int32_t *pDestLength,
1441	const char *src,
1442	int32_t srcLength,
1443	UErrorCode *pErrorCode);
1444
1445	/**
1446	* Convert a UTF-16 string to UTF-32.
1447	* If the input string is not well-formed, then the U_INVALID_CHAR_FOUND error code is set.
1448	*
1449	* @param dest A buffer for the result string. The result will be zero-terminated if
1450	* the buffer is large enough.
1451	* @param destCapacity The size of the buffer (number of UChar32s). If it is 0, then
1452	* dest may be NULL and the function will only return the length of the
1453	* result without writing any of the result string (pre-flighting).
1454	* @param pDestLength A pointer to receive the number of units written to the destination. If
1455	* pDestLength!=NULL then *pDestLength is always set to the
1456	* number of output units corresponding to the transformation of
1457	* all the input units, even in case of a buffer overflow.
1458	* @param src The original source string.
1459	* @param srcLength The length of the original string. If -1, then src must be zero-terminated.
1460	* @param pErrorCode Must be a valid pointer to an error code value,
1461	* which must not indicate a failure before the function call.
1462	* @return The pointer to the destination buffer.
1463	* @see u_strToUTF32WithSub
1464	* @see u_strFromUTF32
1465	* @stable ICU 2.0
1466	*/
1467	U_CAPI UChar32* U_EXPORT2
1468	u_strToUTF32(UChar32 *dest,
1469	int32_t destCapacity,
1470	int32_t *pDestLength,
1471	const UChar *src,
1472	int32_t srcLength,
1473	UErrorCode *pErrorCode);
1474
1475	/**
1476	* Convert a UTF-32 string to UTF-16.
1477	* If the input string is not well-formed, then the U_INVALID_CHAR_FOUND error code is set.
1478	*
1479	* @param dest A buffer for the result string. The result will be zero-terminated if
1480	* the buffer is large enough.
1481	* @param destCapacity The size of the buffer (number of UChars). If it is 0, then
1482	* dest may be NULL and the function will only return the length of the
1483	* result without writing any of the result string (pre-flighting).
1484	* @param pDestLength A pointer to receive the number of units written to the destination. If
1485	* pDestLength!=NULL then *pDestLength is always set to the
1486	* number of output units corresponding to the transformation of
1487	* all the input units, even in case of a buffer overflow.
1488	* @param src The original source string.
1489	* @param srcLength The length of the original string. If -1, then src must be zero-terminated.
1490	* @param pErrorCode Must be a valid pointer to an error code value,
1491	* which must not indicate a failure before the function call.
1492	* @return The pointer to the destination buffer.
1493	* @see u_strFromUTF32WithSub
1494	* @see u_strToUTF32
1495	* @stable ICU 2.0
1496	*/
1497	U_CAPI UChar* U_EXPORT2
1498	u_strFromUTF32(UChar *dest,
1499	int32_t destCapacity,
1500	int32_t *pDestLength,
1501	const UChar32 *src,
1502	int32_t srcLength,
1503	UErrorCode *pErrorCode);
1504
1505	/**
1506	* Convert a UTF-16 string (UChar units) to a UTF-32 string (UChar32 units).
1507	*
1508	* Same as u_strToUTF32() except for the additional subchar which is output for
1509	* illegal input sequences, instead of stopping with the U_INVALID_CHAR_FOUND error code.
1510	* With subchar==U_SENTINEL, this function behaves exactly like u_strToUTF32().
1511	*
1512	* @param dest A buffer for the UTF-32 result string. The result will be zero-terminated if
1513	* the buffer is large enough.
1514	* @param destCapacity The size of the dest buffer (number of UChar32s). If it is 0, then
1515	* dest may be NULL and the function will only report the length of the
1516	* result (via pDestLength) without writing any of the result string (pre-flighting).
1517	* @param pDestLength A pointer to receive the number of units written to the destination. If
1518	* pDestLength!=NULL then *pDestLength is always set to the
1519	* number of output units corresponding to the transformation of
1520	* all the input units, even in case of a buffer overflow.
1521	* @param src The original UTF-16 source string.
1522	* @param srcLength The length of the source string. If -1, then src must be zero-terminated.
1523	* @param subchar The substitution character to use in place of an illegal input sequence,
1524	* or U_SENTINEL if the function is to return with U_INVALID_CHAR_FOUND instead.
1525	* A substitution character can be any valid Unicode code point (up to U+10FFFF)
1526	* except for surrogate code points (U+D800..U+DFFF).
1527	* The recommended value is U+FFFD "REPLACEMENT CHARACTER".
1528	* @param pNumSubstitutions Output parameter receiving the number of substitutions if subchar>=0.
1529	* Set to 0 if no substitutions occur or subchar<0.
1530	* pNumSubstitutions can be NULL.
1531	* @param pErrorCode Pointer to a standard ICU error code. Its input value must
1532	* pass the U_SUCCESS() test, or else the function returns
1533	* immediately. Check for U_FAILURE() on output or use with
1534	* function chaining. (See User Guide for details.)
1535	* @return The pointer to the destination buffer.
1536	* @see u_strToUTF32
1537	* @see u_strFromUTF32WithSub
1538	* @stable ICU 4.2
1539	*/
1540	U_CAPI UChar32* U_EXPORT2
1541	u_strToUTF32WithSub(UChar32 *dest,
1542	int32_t destCapacity,
1543	int32_t *pDestLength,
1544	const UChar *src,
1545	int32_t srcLength,
1546	UChar32 subchar, int32_t *pNumSubstitutions,
1547	UErrorCode *pErrorCode);
1548
1549	/**
1550	* Convert a UTF-32 string (UChar32 units) to a UTF-16 string (UChar units).
1551	*
1552	* Same as u_strFromUTF32() except for the additional subchar which is output for
1553	* illegal input sequences, instead of stopping with the U_INVALID_CHAR_FOUND error code.
1554	* With subchar==U_SENTINEL, this function behaves exactly like u_strFromUTF32().
1555	*
1556	* @param dest A buffer for the UTF-16 result string. The result will be zero-terminated if
1557	* the buffer is large enough.
1558	* @param destCapacity The size of the dest buffer (number of UChars). If it is 0, then
1559	* dest may be NULL and the function will only report the length of the
1560	* result (via pDestLength) without writing any of the result string (pre-flighting).
1561	* @param pDestLength A pointer to receive the number of units written to the destination. If
1562	* pDestLength!=NULL then *pDestLength is always set to the
1563	* number of output units corresponding to the transformation of
1564	* all the input units, even in case of a buffer overflow.
1565	* @param src The original UTF-32 source string.
1566	* @param srcLength The length of the source string. If -1, then src must be zero-terminated.
1567	* @param subchar The substitution character to use in place of an illegal input sequence,
1568	* or U_SENTINEL if the function is to return with U_INVALID_CHAR_FOUND instead.
1569	* A substitution character can be any valid Unicode code point (up to U+10FFFF)
1570	* except for surrogate code points (U+D800..U+DFFF).
1571	* The recommended value is U+FFFD "REPLACEMENT CHARACTER".
1572	* @param pNumSubstitutions Output parameter receiving the number of substitutions if subchar>=0.
1573	* Set to 0 if no substitutions occur or subchar<0.
1574	* pNumSubstitutions can be NULL.
1575	* @param pErrorCode Pointer to a standard ICU error code. Its input value must
1576	* pass the U_SUCCESS() test, or else the function returns
1577	* immediately. Check for U_FAILURE() on output or use with
1578	* function chaining. (See User Guide for details.)
1579	* @return The pointer to the destination buffer.
1580	* @see u_strFromUTF32
1581	* @see u_strToUTF32WithSub
1582	* @stable ICU 4.2
1583	*/
1584	U_CAPI UChar* U_EXPORT2
1585	u_strFromUTF32WithSub(UChar *dest,
1586	int32_t destCapacity,
1587	int32_t *pDestLength,
1588	const UChar32 *src,
1589	int32_t srcLength,
1590	UChar32 subchar, int32_t *pNumSubstitutions,
1591	UErrorCode *pErrorCode);
1592
1593	/**
1594	* Convert a 16-bit Unicode string to Java Modified UTF-8.
1595	* See http://java.sun.com/javase/6/docs/api/java/io/DataInput.html#modified-utf-8
1596	*
1597	* This function behaves according to the documentation for Java DataOutput.writeUTF()
1598	* except that it does not encode the output length in the destination buffer
1599	* and does not have an output length restriction.
1600	* See http://java.sun.com/javase/6/docs/api/java/io/DataOutput.html#writeUTF(java.lang.String)
1601	*
1602	* The input string need not be well-formed UTF-16.
1603	* (Therefore there is no subchar parameter.)
1604	*
1605	* @param dest A buffer for the Java Modified UTF-8 result string. The result will be
1606	* zero-terminated if the buffer is large enough.
1607	* @param destCapacity The size of the dest buffer (number of chars). If it is 0, then
1608	* dest may be NULL and the function will only report the length of the
1609	* result (via pDestLength) without writing any of the result string (pre-flighting).
1610	* @param pDestLength A pointer to receive the number of units written to the destination. If
1611	* pDestLength!=NULL then *pDestLength is always set to the
1612	* number of output units corresponding to the transformation of
1613	* all the input units, even in case of a buffer overflow.
1614	* @param src The original 16-bit Unicode source string.
1615	* @param srcLength The length of the source string. If -1, then src must be zero-terminated.
1616	* @param pErrorCode Pointer to a standard ICU error code. Its input value must
1617	* pass the U_SUCCESS() test, or else the function returns
1618	* immediately. Check for U_FAILURE() on output or use with
1619	* function chaining. (See User Guide for details.)
1620	* @return The pointer to the destination buffer.
1621	* @see u_strToUTF8WithSub
1622	* @see u_strFromJavaModifiedUTF8WithSub
1623	* @stable ICU 4.4
1624	*/
1625	U_CAPI char* U_EXPORT2
1626	u_strToJavaModifiedUTF8(
1627	char *dest,
1628	int32_t destCapacity,
1629	int32_t *pDestLength,
1630	const UChar *src,
1631	int32_t srcLength,
1632	UErrorCode *pErrorCode);
1633
1634	/**
1635	* Convert a Java Modified UTF-8 string to a 16-bit Unicode string.
1636	* If the input string is not well-formed and no substitution char is specified
1637	* (subchar==U_SENTINEL), then the U_INVALID_CHAR_FOUND error code is set.
1638	*
1639	* This function behaves according to the documentation for Java DataInput.readUTF()
1640	* except that it takes a length parameter rather than
1641	* interpreting the first two input bytes as the length.
1642	* See http://java.sun.com/javase/6/docs/api/java/io/DataInput.html#readUTF()
1643	*
1644	* The output string is not necessarily well-formed UTF-16.
1645	*
1646	* @param dest A buffer for the 16-bit Unicode result string. The result will be
1647	* zero-terminated if the buffer is large enough.
1648	* @param destCapacity The size of the dest buffer (number of UChars). If it is 0, then
1649	* dest may be NULL and the function will only report the length of the
1650	* result (via pDestLength) without writing any of the result string (pre-flighting).
1651	* @param pDestLength A pointer to receive the number of units written to the destination. If
1652	* pDestLength!=NULL then *pDestLength is always set to the
1653	* number of output units corresponding to the transformation of
1654	* all the input units, even in case of a buffer overflow.
1655	* @param src The original Java Modified UTF-8 source string.
1656	* @param srcLength The length of the source string. If -1, then src must be zero-terminated.
1657	* @param subchar The substitution character to use in place of an illegal input sequence,
1658	* or U_SENTINEL if the function is to return with U_INVALID_CHAR_FOUND instead.
1659	* A substitution character can be any valid Unicode code point (up to U+10FFFF)
1660	* except for surrogate code points (U+D800..U+DFFF).
1661	* The recommended value is U+FFFD "REPLACEMENT CHARACTER".
1662	* @param pNumSubstitutions Output parameter receiving the number of substitutions if subchar>=0.
1663	* Set to 0 if no substitutions occur or subchar<0.
1664	* pNumSubstitutions can be NULL.
1665	* @param pErrorCode Pointer to a standard ICU error code. Its input value must
1666	* pass the U_SUCCESS() test, or else the function returns
1667	* immediately. Check for U_FAILURE() on output or use with
1668	* function chaining. (See User Guide for details.)
1669	* @return The pointer to the destination buffer.
1670	* @see u_strFromUTF8WithSub
1671	* @see u_strFromUTF8Lenient
1672	* @see u_strToJavaModifiedUTF8
1673	* @stable ICU 4.4
1674	*/
1675	U_CAPI UChar* U_EXPORT2
1676	u_strFromJavaModifiedUTF8WithSub(
1677	UChar *dest,
1678	int32_t destCapacity,
1679	int32_t *pDestLength,
1680	const char *src,
1681	int32_t srcLength,
1682	UChar32 subchar, int32_t *pNumSubstitutions,
1683	UErrorCode *pErrorCode);
1684
1685	#endif
1686

source code of include/unicode/ustring.h