normalizer2.h source code [include/unicode/normalizer2.h]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	/*
4	*******************************************************************************
5	*
6	* Copyright (C) 2009-2013, International Business Machines
7	* Corporation and others. All Rights Reserved.
8	*
9	*******************************************************************************
10	* file name: normalizer2.h
11	* encoding: UTF-8
12	* tab size: 8 (not used)
13	* indentation:4
14	*
15	* created on: 2009nov22
16	* created by: Markus W. Scherer
17	*/
18
19	#ifndef __NORMALIZER2_H__
20	#define __NORMALIZER2_H__
21
22	/**
23	* \file
24	* \brief C++ API: New API for Unicode Normalization.
25	*/
26
27	#include "unicode/utypes.h"
28
29	#if U_SHOW_CPLUSPLUS_API
30
31	#if !UCONFIG_NO_NORMALIZATION
32
33	#include "unicode/stringpiece.h"
34	#include "unicode/uniset.h"
35	#include "unicode/unistr.h"
36	#include "unicode/unorm2.h"
37
38	U_NAMESPACE_BEGIN
39
40	class ByteSink;
41
42	/**
43	* Unicode normalization functionality for standard Unicode normalization or
44	* for using custom mapping tables.
45	* All instances of this class are unmodifiable/immutable.
46	* Instances returned by getInstance() are singletons that must not be deleted by the caller.
47	* The Normalizer2 class is not intended for public subclassing.
48	*
49	* The primary functions are to produce a normalized string and to detect whether
50	* a string is already normalized.
51	* The most commonly used normalization forms are those defined in
52	* http://www.unicode.org/unicode/reports/tr15/
53	* However, this API supports additional normalization forms for specialized purposes.
54	* For example, NFKC_Casefold is provided via getInstance("nfkc_cf", COMPOSE)
55	* and can be used in implementations of UTS #46.
56	*
57	* Not only are the standard compose and decompose modes supplied,
58	* but additional modes are provided as documented in the Mode enum.
59	*
60	* Some of the functions in this class identify normalization boundaries.
61	* At a normalization boundary, the portions of the string
62	* before it and starting from it do not interact and can be handled independently.
63	*
64	* The spanQuickCheckYes() stops at a normalization boundary.
65	* When the goal is a normalized string, then the text before the boundary
66	* can be copied, and the remainder can be processed with normalizeSecondAndAppend().
67	*
68	* The hasBoundaryBefore(), hasBoundaryAfter() and isInert() functions test whether
69	* a character is guaranteed to be at a normalization boundary,
70	* regardless of context.
71	* This is used for moving from one normalization boundary to the next
72	* or preceding boundary, and for performing iterative normalization.
73	*
74	* Iterative normalization is useful when only a small portion of a
75	* longer string needs to be processed.
76	* For example, in ICU, iterative normalization is used by the NormalizationTransliterator
77	* (to avoid replacing already-normalized text) and ucol_nextSortKeyPart()
78	* (to process only the substring for which sort key bytes are computed).
79	*
80	* The set of normalization boundaries returned by these functions may not be
81	* complete: There may be more boundaries that could be returned.
82	* Different functions may return different boundaries.
83	* @stable ICU 4.4
84	*/
85	class U_COMMON_API Normalizer2 : public UObject {
86	public:
87	/**
88	* Destructor.
89	* @stable ICU 4.4
90	*/
91	~Normalizer2();
92
93	/**
94	* Returns a Normalizer2 instance for Unicode NFC normalization.
95	* Same as getInstance(NULL, "nfc", UNORM2_COMPOSE, errorCode).
96	* Returns an unmodifiable singleton instance. Do not delete it.
97	* @param errorCode Standard ICU error code. Its input value must
98	* pass the U_SUCCESS() test, or else the function returns
99	* immediately. Check for U_FAILURE() on output or use with
100	* function chaining. (See User Guide for details.)
101	* @return the requested Normalizer2, if successful
102	* @stable ICU 49
103	*/
104	static const Normalizer2 *
105	getNFCInstance(UErrorCode &errorCode);
106
107	/**
108	* Returns a Normalizer2 instance for Unicode NFD normalization.
109	* Same as getInstance(NULL, "nfc", UNORM2_DECOMPOSE, errorCode).
110	* Returns an unmodifiable singleton instance. Do not delete it.
111	* @param errorCode Standard ICU error code. Its input value must
112	* pass the U_SUCCESS() test, or else the function returns
113	* immediately. Check for U_FAILURE() on output or use with
114	* function chaining. (See User Guide for details.)
115	* @return the requested Normalizer2, if successful
116	* @stable ICU 49
117	*/
118	static const Normalizer2 *
119	getNFDInstance(UErrorCode &errorCode);
120
121	/**
122	* Returns a Normalizer2 instance for Unicode NFKC normalization.
123	* Same as getInstance(NULL, "nfkc", UNORM2_COMPOSE, errorCode).
124	* Returns an unmodifiable singleton instance. Do not delete it.
125	* @param errorCode Standard ICU error code. Its input value must
126	* pass the U_SUCCESS() test, or else the function returns
127	* immediately. Check for U_FAILURE() on output or use with
128	* function chaining. (See User Guide for details.)
129	* @return the requested Normalizer2, if successful
130	* @stable ICU 49
131	*/
132	static const Normalizer2 *
133	getNFKCInstance(UErrorCode &errorCode);
134
135	/**
136	* Returns a Normalizer2 instance for Unicode NFKD normalization.
137	* Same as getInstance(NULL, "nfkc", UNORM2_DECOMPOSE, errorCode).
138	* Returns an unmodifiable singleton instance. Do not delete it.
139	* @param errorCode Standard ICU error code. Its input value must
140	* pass the U_SUCCESS() test, or else the function returns
141	* immediately. Check for U_FAILURE() on output or use with
142	* function chaining. (See User Guide for details.)
143	* @return the requested Normalizer2, if successful
144	* @stable ICU 49
145	*/
146	static const Normalizer2 *
147	getNFKDInstance(UErrorCode &errorCode);
148
149	/**
150	* Returns a Normalizer2 instance for Unicode NFKC_Casefold normalization.
151	* Same as getInstance(NULL, "nfkc_cf", UNORM2_COMPOSE, errorCode).
152	* Returns an unmodifiable singleton instance. Do not delete it.
153	* @param errorCode Standard ICU error code. Its input value must
154	* pass the U_SUCCESS() test, or else the function returns
155	* immediately. Check for U_FAILURE() on output or use with
156	* function chaining. (See User Guide for details.)
157	* @return the requested Normalizer2, if successful
158	* @stable ICU 49
159	*/
160	static const Normalizer2 *
161	getNFKCCasefoldInstance(UErrorCode &errorCode);
162
163	/**
164	* Returns a Normalizer2 instance which uses the specified data file
165	* (packageName/name similar to ucnv_openPackage() and ures_open()/ResourceBundle)
166	* and which composes or decomposes text according to the specified mode.
167	* Returns an unmodifiable singleton instance. Do not delete it.
168	*
169	* Use packageName=NULL for data files that are part of ICU's own data.
170	* Use name="nfc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFC/NFD.
171	* Use name="nfkc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFKC/NFKD.
172	* Use name="nfkc_cf" and UNORM2_COMPOSE for Unicode standard NFKC_CF=NFKC_Casefold.
173	*
174	* @param packageName NULL for ICU built-in data, otherwise application data package name
175	* @param name "nfc" or "nfkc" or "nfkc_cf" or name of custom data file
176	* @param mode normalization mode (compose or decompose etc.)
177	* @param errorCode Standard ICU error code. Its input value must
178	* pass the U_SUCCESS() test, or else the function returns
179	* immediately. Check for U_FAILURE() on output or use with
180	* function chaining. (See User Guide for details.)
181	* @return the requested Normalizer2, if successful
182	* @stable ICU 4.4
183	*/
184	static const Normalizer2 *
185	getInstance(const char *packageName,
186	const char *name,
187	UNormalization2Mode mode,
188	UErrorCode &errorCode);
189
190	/**
191	* Returns the normalized form of the source string.
192	* @param src source string
193	* @param errorCode Standard ICU error code. Its input value must
194	* pass the U_SUCCESS() test, or else the function returns
195	* immediately. Check for U_FAILURE() on output or use with
196	* function chaining. (See User Guide for details.)
197	* @return normalized src
198	* @stable ICU 4.4
199	*/
200	UnicodeString
201	normalize(const UnicodeString &src, UErrorCode &errorCode) const {
202	UnicodeString result;
203	normalize(src, dest&: result, errorCode);
204	return result;
205	}
206	/**
207	* Writes the normalized form of the source string to the destination string
208	* (replacing its contents) and returns the destination string.
209	* The source and destination strings must be different objects.
210	* @param src source string
211	* @param dest destination string; its contents is replaced with normalized src
212	* @param errorCode Standard ICU error code. Its input value must
213	* pass the U_SUCCESS() test, or else the function returns
214	* immediately. Check for U_FAILURE() on output or use with
215	* function chaining. (See User Guide for details.)
216	* @return dest
217	* @stable ICU 4.4
218	*/
219	virtual UnicodeString &
220	normalize(const UnicodeString &src,
221	UnicodeString &dest,
222	UErrorCode &errorCode) const = `0`;
223
224	/**
225	* Normalizes a UTF-8 string and optionally records how source substrings
226	* relate to changed and unchanged result substrings.
227	*
228	* Implemented completely for all built-in modes except for FCD.
229	* The base class implementation converts to & from UTF-16 and does not support edits.
230	*
231	* @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
232	* @param src Source UTF-8 string.
233	* @param sink A ByteSink to which the normalized UTF-8 result string is written.
234	* sink.Flush() is called at the end.
235	* @param edits Records edits for index mapping, working with styled text,
236	* and getting only changes (if any).
237	* The Edits contents is undefined if any error occurs.
238	* This function calls edits->reset() first unless
239	* options includes U_EDITS_NO_RESET. edits can be nullptr.
240	* @param errorCode Standard ICU error code. Its input value must
241	* pass the U_SUCCESS() test, or else the function returns
242	* immediately. Check for U_FAILURE() on output or use with
243	* function chaining. (See User Guide for details.)
244	* @stable ICU 60
245	*/
246	virtual void
247	normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
248	Edits edits, UErrorCode &errorCode) const*;
249
250	/**
251	* Appends the normalized form of the second string to the first string
252	* (merging them at the boundary) and returns the first string.
253	* The result is normalized if the first string was normalized.
254	* The first and second strings must be different objects.
255	* @param first string, should be normalized
256	* @param second string, will be normalized
257	* @param errorCode Standard ICU error code. Its input value must
258	* pass the U_SUCCESS() test, or else the function returns
259	* immediately. Check for U_FAILURE() on output or use with
260	* function chaining. (See User Guide for details.)
261	* @return first
262	* @stable ICU 4.4
263	*/
264	virtual UnicodeString &
265	normalizeSecondAndAppend(UnicodeString &first,
266	const UnicodeString &second,
267	UErrorCode &errorCode) const = `0`;
268	/**
269	* Appends the second string to the first string
270	* (merging them at the boundary) and returns the first string.
271	* The result is normalized if both the strings were normalized.
272	* The first and second strings must be different objects.
273	* @param first string, should be normalized
274	* @param second string, should be normalized
275	* @param errorCode Standard ICU error code. Its input value must
276	* pass the U_SUCCESS() test, or else the function returns
277	* immediately. Check for U_FAILURE() on output or use with
278	* function chaining. (See User Guide for details.)
279	* @return first
280	* @stable ICU 4.4
281	*/
282	virtual UnicodeString &
283	append(UnicodeString &first,
284	const UnicodeString &second,
285	UErrorCode &errorCode) const = `0`;
286
287	/**
288	* Gets the decomposition mapping of c.
289	* Roughly equivalent to normalizing the String form of c
290	* on a UNORM2_DECOMPOSE Normalizer2 instance, but much faster, and except that this function
291	* returns false and does not write a string
292	* if c does not have a decomposition mapping in this instance's data.
293	* This function is independent of the mode of the Normalizer2.
294	* @param c code point
295	* @param decomposition String object which will be set to c's
296	* decomposition mapping, if there is one.
297	* @return true if c has a decomposition, otherwise false
298	* @stable ICU 4.6
299	*/
300	virtual UBool
301	getDecomposition(UChar32 c, UnicodeString &decomposition) const = `0`;
302
303	/**
304	* Gets the raw decomposition mapping of c.
305	*
306	* This is similar to the getDecomposition() method but returns the
307	* raw decomposition mapping as specified in UnicodeData.txt or
308	* (for custom data) in the mapping files processed by the gennorm2 tool.
309	* By contrast, getDecomposition() returns the processed,
310	* recursively-decomposed version of this mapping.
311	*
312	* When used on a standard NFKC Normalizer2 instance,
313	* getRawDecomposition() returns the Unicode Decomposition_Mapping (dm) property.
314	*
315	* When used on a standard NFC Normalizer2 instance,
316	* it returns the Decomposition_Mapping only if the Decomposition_Type (dt) is Canonical (Can);
317	* in this case, the result contains either one or two code points (=1..4 char16_ts).
318	*
319	* This function is independent of the mode of the Normalizer2.
320	* The default implementation returns false.
321	* @param c code point
322	* @param decomposition String object which will be set to c's
323	* raw decomposition mapping, if there is one.
324	* @return true if c has a decomposition, otherwise false
325	* @stable ICU 49
326	*/
327	virtual UBool
328	getRawDecomposition(UChar32 c, UnicodeString &decomposition) const;
329
330	/**
331	* Performs pairwise composition of a & b and returns the composite if there is one.
332	*
333	* Returns a composite code point c only if c has a two-way mapping to a+b.
334	* In standard Unicode normalization, this means that
335	* c has a canonical decomposition to a+b
336	* and c does not have the Full_Composition_Exclusion property.
337	*
338	* This function is independent of the mode of the Normalizer2.
339	* The default implementation returns a negative value.
340	* @param a A (normalization starter) code point.
341	* @param b Another code point.
342	* @return The non-negative composite code point if there is one; otherwise a negative value.
343	* @stable ICU 49
344	*/
345	virtual UChar32
346	composePair(UChar32 a, UChar32 b) const;
347
348	/**
349	* Gets the combining class of c.
350	* The default implementation returns 0
351	* but all standard implementations return the Unicode Canonical_Combining_Class value.
352	* @param c code point
353	* @return c's combining class
354	* @stable ICU 49
355	*/
356	virtual uint8_t
357	getCombiningClass(UChar32 c) const;
358
359	/**
360	* Tests if the string is normalized.
361	* Internally, in cases where the quickCheck() method would return "maybe"
362	* (which is only possible for the two COMPOSE modes) this method
363	* resolves to "yes" or "no" to provide a definitive result,
364	* at the cost of doing more work in those cases.
365	* @param s input string
366	* @param errorCode Standard ICU error code. Its input value must
367	* pass the U_SUCCESS() test, or else the function returns
368	* immediately. Check for U_FAILURE() on output or use with
369	* function chaining. (See User Guide for details.)
370	* @return true if s is normalized
371	* @stable ICU 4.4
372	*/
373	virtual UBool
374	isNormalized(const UnicodeString &s, UErrorCode &errorCode) const = `0`;
375	/**
376	* Tests if the UTF-8 string is normalized.
377	* Internally, in cases where the quickCheck() method would return "maybe"
378	* (which is only possible for the two COMPOSE modes) this method
379	* resolves to "yes" or "no" to provide a definitive result,
380	* at the cost of doing more work in those cases.
381	*
382	* This works for all normalization modes.
383	* It is optimized for UTF-8 for all built-in modes except for FCD.
384	* The base class implementation converts to UTF-16 and calls isNormalized().
385	*
386	* @param s UTF-8 input string
387	* @param errorCode Standard ICU error code. Its input value must
388	* pass the U_SUCCESS() test, or else the function returns
389	* immediately. Check for U_FAILURE() on output or use with
390	* function chaining. (See User Guide for details.)
391	* @return true if s is normalized
392	* @stable ICU 60
393	*/
394	virtual UBool
395	isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const;
396
397
398	/**
399	* Tests if the string is normalized.
400	* For the two COMPOSE modes, the result could be "maybe" in cases that
401	* would take a little more work to resolve definitively.
402	* Use spanQuickCheckYes() and normalizeSecondAndAppend() for a faster
403	* combination of quick check + normalization, to avoid
404	* re-checking the "yes" prefix.
405	* @param s input string
406	* @param errorCode Standard ICU error code. Its input value must
407	* pass the U_SUCCESS() test, or else the function returns
408	* immediately. Check for U_FAILURE() on output or use with
409	* function chaining. (See User Guide for details.)
410	* @return UNormalizationCheckResult
411	* @stable ICU 4.4
412	*/
413	virtual UNormalizationCheckResult
414	quickCheck(const UnicodeString &s, UErrorCode &errorCode) const = `0`;
415
416	/**
417	* Returns the end of the normalized substring of the input string.
418	* In other words, with <code>end=spanQuickCheckYes(s, ec);</code>
419	* the substring <code>UnicodeString(s, 0, end)</code>
420	* will pass the quick check with a "yes" result.
421	*
422	* The returned end index is usually one or more characters before the
423	* "no" or "maybe" character: The end index is at a normalization boundary.
424	* (See the class documentation for more about normalization boundaries.)
425	*
426	* When the goal is a normalized string and most input strings are expected
427	* to be normalized already, then call this method,
428	* and if it returns a prefix shorter than the input string,
429	* copy that prefix and use normalizeSecondAndAppend() for the remainder.
430	* @param s input string
431	* @param errorCode Standard ICU error code. Its input value must
432	* pass the U_SUCCESS() test, or else the function returns
433	* immediately. Check for U_FAILURE() on output or use with
434	* function chaining. (See User Guide for details.)
435	* @return "yes" span end index
436	* @stable ICU 4.4
437	*/
438	virtual int32_t
439	spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const = `0`;
440
441	/**
442	* Tests if the character always has a normalization boundary before it,
443	* regardless of context.
444	* If true, then the character does not normalization-interact with
445	* preceding characters.
446	* In other words, a string containing this character can be normalized
447	* by processing portions before this character and starting from this
448	* character independently.
449	* This is used for iterative normalization. See the class documentation for details.
450	* @param c character to test
451	* @return true if c has a normalization boundary before it
452	* @stable ICU 4.4
453	*/
454	virtual UBool hasBoundaryBefore(UChar32 c) const = `0`;
455
456	/**
457	* Tests if the character always has a normalization boundary after it,
458	* regardless of context.
459	* If true, then the character does not normalization-interact with
460	* following characters.
461	* In other words, a string containing this character can be normalized
462	* by processing portions up to this character and after this
463	* character independently.
464	* This is used for iterative normalization. See the class documentation for details.
465	* Note that this operation may be significantly slower than hasBoundaryBefore().
466	* @param c character to test
467	* @return true if c has a normalization boundary after it
468	* @stable ICU 4.4
469	*/
470	virtual UBool hasBoundaryAfter(UChar32 c) const = `0`;
471
472	/**
473	* Tests if the character is normalization-inert.
474	* If true, then the character does not change, nor normalization-interact with
475	* preceding or following characters.
476	* In other words, a string containing this character can be normalized
477	* by processing portions before this character and after this
478	* character independently.
479	* This is used for iterative normalization. See the class documentation for details.
480	* Note that this operation may be significantly slower than hasBoundaryBefore().
481	* @param c character to test
482	* @return true if c is normalization-inert
483	* @stable ICU 4.4
484	*/
485	virtual UBool isInert(UChar32 c) const = `0`;
486	};
487
488	/**
489	* Normalization filtered by a UnicodeSet.
490	* Normalizes portions of the text contained in the filter set and leaves
491	* portions not contained in the filter set unchanged.
492	* Filtering is done via UnicodeSet::span(..., USET_SPAN_SIMPLE).
493	* Not-in-the-filter text is treated as "is normalized" and "quick check yes".
494	* This class implements all of (and only) the Normalizer2 API.
495	* An instance of this class is unmodifiable/immutable but is constructed and
496	* must be destructed by the owner.
497	* @stable ICU 4.4
498	*/
499	class U_COMMON_API FilteredNormalizer2 : public Normalizer2 {
500	public:
501	/**
502	* Constructs a filtered normalizer wrapping any Normalizer2 instance
503	* and a filter set.
504	* Both are aliased and must not be modified or deleted while this object
505	* is used.
506	* The filter set should be frozen; otherwise the performance will suffer greatly.
507	* @param n2 wrapped Normalizer2 instance
508	* @param filterSet UnicodeSet which determines the characters to be normalized
509	* @stable ICU 4.4
510	*/
511	FilteredNormalizer2(const Normalizer2 &n2, const UnicodeSet &filterSet) :
512	norm2(n2), set(filterSet) {}
513
514	/**
515	* Destructor.
516	* @stable ICU 4.4
517	*/
518	~FilteredNormalizer2();
519
520	/**
521	* Writes the normalized form of the source string to the destination string
522	* (replacing its contents) and returns the destination string.
523	* The source and destination strings must be different objects.
524	* @param src source string
525	* @param dest destination string; its contents is replaced with normalized src
526	* @param errorCode Standard ICU error code. Its input value must
527	* pass the U_SUCCESS() test, or else the function returns
528	* immediately. Check for U_FAILURE() on output or use with
529	* function chaining. (See User Guide for details.)
530	* @return dest
531	* @stable ICU 4.4
532	*/
533	virtual UnicodeString &
534	normalize(const UnicodeString &src,
535	UnicodeString &dest,
536	UErrorCode &errorCode) const U_OVERRIDE;
537
538	/**
539	* Normalizes a UTF-8 string and optionally records how source substrings
540	* relate to changed and unchanged result substrings.
541	*
542	* Implemented completely for most built-in modes except for FCD.
543	* The base class implementation converts to & from UTF-16 and does not support edits.
544	*
545	* @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
546	* @param src Source UTF-8 string.
547	* @param sink A ByteSink to which the normalized UTF-8 result string is written.
548	* sink.Flush() is called at the end.
549	* @param edits Records edits for index mapping, working with styled text,
550	* and getting only changes (if any).
551	* The Edits contents is undefined if any error occurs.
552	* This function calls edits->reset() first unless
553	* options includes U_EDITS_NO_RESET. edits can be nullptr.
554	* @param errorCode Standard ICU error code. Its input value must
555	* pass the U_SUCCESS() test, or else the function returns
556	* immediately. Check for U_FAILURE() on output or use with
557	* function chaining. (See User Guide for details.)
558	* @stable ICU 60
559	*/
560	virtual void
561	normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
562	Edits edits, UErrorCode &errorCode) const* U_OVERRIDE;
563
564	/**
565	* Appends the normalized form of the second string to the first string
566	* (merging them at the boundary) and returns the first string.
567	* The result is normalized if the first string was normalized.
568	* The first and second strings must be different objects.
569	* @param first string, should be normalized
570	* @param second string, will be normalized
571	* @param errorCode Standard ICU error code. Its input value must
572	* pass the U_SUCCESS() test, or else the function returns
573	* immediately. Check for U_FAILURE() on output or use with
574	* function chaining. (See User Guide for details.)
575	* @return first
576	* @stable ICU 4.4
577	*/
578	virtual UnicodeString &
579	normalizeSecondAndAppend(UnicodeString &first,
580	const UnicodeString &second,
581	UErrorCode &errorCode) const U_OVERRIDE;
582	/**
583	* Appends the second string to the first string
584	* (merging them at the boundary) and returns the first string.
585	* The result is normalized if both the strings were normalized.
586	* The first and second strings must be different objects.
587	* @param first string, should be normalized
588	* @param second string, should be normalized
589	* @param errorCode Standard ICU error code. Its input value must
590	* pass the U_SUCCESS() test, or else the function returns
591	* immediately. Check for U_FAILURE() on output or use with
592	* function chaining. (See User Guide for details.)
593	* @return first
594	* @stable ICU 4.4
595	*/
596	virtual UnicodeString &
597	append(UnicodeString &first,
598	const UnicodeString &second,
599	UErrorCode &errorCode) const U_OVERRIDE;
600
601	/**
602	* Gets the decomposition mapping of c.
603	* For details see the base class documentation.
604	*
605	* This function is independent of the mode of the Normalizer2.
606	* @param c code point
607	* @param decomposition String object which will be set to c's
608	* decomposition mapping, if there is one.
609	* @return true if c has a decomposition, otherwise false
610	* @stable ICU 4.6
611	*/
612	virtual UBool
613	getDecomposition(UChar32 c, UnicodeString &decomposition) const U_OVERRIDE;
614
615	/**
616	* Gets the raw decomposition mapping of c.
617	* For details see the base class documentation.
618	*
619	* This function is independent of the mode of the Normalizer2.
620	* @param c code point
621	* @param decomposition String object which will be set to c's
622	* raw decomposition mapping, if there is one.
623	* @return true if c has a decomposition, otherwise false
624	* @stable ICU 49
625	*/
626	virtual UBool
627	getRawDecomposition(UChar32 c, UnicodeString &decomposition) const U_OVERRIDE;
628
629	/**
630	* Performs pairwise composition of a & b and returns the composite if there is one.
631	* For details see the base class documentation.
632	*
633	* This function is independent of the mode of the Normalizer2.
634	* @param a A (normalization starter) code point.
635	* @param b Another code point.
636	* @return The non-negative composite code point if there is one; otherwise a negative value.
637	* @stable ICU 49
638	*/
639	virtual UChar32
640	composePair(UChar32 a, UChar32 b) const U_OVERRIDE;
641
642	/**
643	* Gets the combining class of c.
644	* The default implementation returns 0
645	* but all standard implementations return the Unicode Canonical_Combining_Class value.
646	* @param c code point
647	* @return c's combining class
648	* @stable ICU 49
649	*/
650	virtual uint8_t
651	getCombiningClass(UChar32 c) const U_OVERRIDE;
652
653	/**
654	* Tests if the string is normalized.
655	* For details see the Normalizer2 base class documentation.
656	* @param s input string
657	* @param errorCode Standard ICU error code. Its input value must
658	* pass the U_SUCCESS() test, or else the function returns
659	* immediately. Check for U_FAILURE() on output or use with
660	* function chaining. (See User Guide for details.)
661	* @return true if s is normalized
662	* @stable ICU 4.4
663	*/
664	virtual UBool
665	isNormalized(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE;
666	/**
667	* Tests if the UTF-8 string is normalized.
668	* Internally, in cases where the quickCheck() method would return "maybe"
669	* (which is only possible for the two COMPOSE modes) this method
670	* resolves to "yes" or "no" to provide a definitive result,
671	* at the cost of doing more work in those cases.
672	*
673	* This works for all normalization modes.
674	* It is optimized for UTF-8 for all built-in modes except for FCD.
675	* The base class implementation converts to UTF-16 and calls isNormalized().
676	*
677	* @param s UTF-8 input string
678	* @param errorCode Standard ICU error code. Its input value must
679	* pass the U_SUCCESS() test, or else the function returns
680	* immediately. Check for U_FAILURE() on output or use with
681	* function chaining. (See User Guide for details.)
682	* @return true if s is normalized
683	* @stable ICU 60
684	*/
685	virtual UBool
686	isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const U_OVERRIDE;
687	/**
688	* Tests if the string is normalized.
689	* For details see the Normalizer2 base class documentation.
690	* @param s input string
691	* @param errorCode Standard ICU error code. Its input value must
692	* pass the U_SUCCESS() test, or else the function returns
693	* immediately. Check for U_FAILURE() on output or use with
694	* function chaining. (See User Guide for details.)
695	* @return UNormalizationCheckResult
696	* @stable ICU 4.4
697	*/
698	virtual UNormalizationCheckResult
699	quickCheck(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE;
700	/**
701	* Returns the end of the normalized substring of the input string.
702	* For details see the Normalizer2 base class documentation.
703	* @param s input string
704	* @param errorCode Standard ICU error code. Its input value must
705	* pass the U_SUCCESS() test, or else the function returns
706	* immediately. Check for U_FAILURE() on output or use with
707	* function chaining. (See User Guide for details.)
708	* @return "yes" span end index
709	* @stable ICU 4.4
710	*/
711	virtual int32_t
712	spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE;
713
714	/**
715	* Tests if the character always has a normalization boundary before it,
716	* regardless of context.
717	* For details see the Normalizer2 base class documentation.
718	* @param c character to test
719	* @return true if c has a normalization boundary before it
720	* @stable ICU 4.4
721	*/
722	virtual UBool hasBoundaryBefore(UChar32 c) const U_OVERRIDE;
723
724	/**
725	* Tests if the character always has a normalization boundary after it,
726	* regardless of context.
727	* For details see the Normalizer2 base class documentation.
728	* @param c character to test
729	* @return true if c has a normalization boundary after it
730	* @stable ICU 4.4
731	*/
732	virtual UBool hasBoundaryAfter(UChar32 c) const U_OVERRIDE;
733
734	/**
735	* Tests if the character is normalization-inert.
736	* For details see the Normalizer2 base class documentation.
737	* @param c character to test
738	* @return true if c is normalization-inert
739	* @stable ICU 4.4
740	*/
741	virtual UBool isInert(UChar32 c) const U_OVERRIDE;
742	private:
743	UnicodeString &
744	normalize(const UnicodeString &src,
745	UnicodeString &dest,
746	USetSpanCondition spanCondition,
747	UErrorCode &errorCode) const;
748
749	void
750	normalizeUTF8(uint32_t options, const char *src, int32_t length,
751	ByteSink &sink, Edits *edits,
752	USetSpanCondition spanCondition,
753	UErrorCode &errorCode) const;
754
755	UnicodeString &
756	normalizeSecondAndAppend(UnicodeString &first,
757	const UnicodeString &second,
758	UBool doNormalize,
759	UErrorCode &errorCode) const;
760
761	const Normalizer2 &norm2;
762	const UnicodeSet &set;
763	};
764
765	U_NAMESPACE_END
766
767	#endif // !UCONFIG_NO_NORMALIZATION
768
769	#endif /* U_SHOW_CPLUSPLUS_API */
770
771	#endif // __NORMALIZER2_H__
772

source code of include/unicode/normalizer2.h