uniset.h source code [include/unicode/uniset.h]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	/*
4	***************************************************************************
5	* Copyright (C) 1999-2016, International Business Machines Corporation
6	* and others. All Rights Reserved.
7	***************************************************************************
8	* Date Name Description
9	* 10/20/99 alan Creation.
10	***************************************************************************
11	*/
12
13	#ifndef UNICODESET_H
14	#define UNICODESET_H
15
16	#include "unicode/utypes.h"
17
18	#if U_SHOW_CPLUSPLUS_API
19
20	#include "unicode/ucpmap.h"
21	#include "unicode/unifilt.h"
22	#include "unicode/unistr.h"
23	#include "unicode/uset.h"
24
25	/**
26	* \file
27	* \brief C++ API: Unicode Set
28	*/
29
30	U_NAMESPACE_BEGIN
31
32	// Forward Declarations.
33	class BMPSet;
34	class ParsePosition;
35	class RBBIRuleScanner;
36	class SymbolTable;
37	class UnicodeSetStringSpan;
38	class UVector;
39	class RuleCharacterIterator;
40
41	/**
42	* A mutable set of Unicode characters and multicharacter strings. Objects of this class
43	* represent <em>character classes</em> used in regular expressions.
44	* A character class specifies a subset of Unicode code points. Legal
45	* code points are U+0000 to U+10FFFF, inclusive.
46	*
47	* <p>The UnicodeSet class is not designed to be subclassed.
48	*
49	* <p><code>UnicodeSet</code> supports two APIs. The first is the
50	* <em>operand</em> API that allows the caller to modify the value of
51	* a <code>UnicodeSet</code> object. It conforms to Java 2's
52	* <code>java.util.Set</code> interface, although
53	* <code>UnicodeSet</code> does not actually implement that
54	* interface. All methods of <code>Set</code> are supported, with the
55	* modification that they take a character range or single character
56	* instead of an <code>Object</code>, and they take a
57	* <code>UnicodeSet</code> instead of a <code>Collection</code>. The
58	* operand API may be thought of in terms of boolean logic: a boolean
59	* OR is implemented by <code>add</code>, a boolean AND is implemented
60	* by <code>retain</code>, a boolean XOR is implemented by
61	* <code>complement</code> taking an argument, and a boolean NOT is
62	* implemented by <code>complement</code> with no argument. In terms
63	* of traditional set theory function names, <code>add</code> is a
64	* union, <code>retain</code> is an intersection, <code>remove</code>
65	* is an asymmetric difference, and <code>complement</code> with no
66	* argument is a set complement with respect to the superset range
67	* <code>MIN_VALUE-MAX_VALUE</code>
68	*
69	* <p>The second API is the
70	* <code>applyPattern()</code>/<code>toPattern()</code> API from the
71	* <code>java.text.Format</code>-derived classes. Unlike the
72	* methods that add characters, add categories, and control the logic
73	* of the set, the method <code>applyPattern()</code> sets all
74	* attributes of a <code>UnicodeSet</code> at once, based on a
75	* string pattern.
76	*
77	* <p><b>Pattern syntax</b></p>
78	*
79	* Patterns are accepted by the constructors and the
80	* <code>applyPattern()</code> methods and returned by the
81	* <code>toPattern()</code> method. These patterns follow a syntax
82	* similar to that employed by version 8 regular expression character
83	* classes. Here are some simple examples:
84	*
85	* \htmlonly<blockquote>\endhtmlonly
86	* <table>
87	* <tr align="top">
88	* <td nowrap valign="top" align="left"><code>[]</code></td>
89	* <td valign="top">No characters</td>
90	* </tr><tr align="top">
91	* <td nowrap valign="top" align="left"><code>[a]</code></td>
92	* <td valign="top">The character 'a'</td>
93	* </tr><tr align="top">
94	* <td nowrap valign="top" align="left"><code>[ae]</code></td>
95	* <td valign="top">The characters 'a' and 'e'</td>
96	* </tr>
97	* <tr>
98	* <td nowrap valign="top" align="left"><code>[a-e]</code></td>
99	* <td valign="top">The characters 'a' through 'e' inclusive, in Unicode code
100	* point order</td>
101	* </tr>
102	* <tr>
103	* <td nowrap valign="top" align="left"><code>[\\u4E01]</code></td>
104	* <td valign="top">The character U+4E01</td>
105	* </tr>
106	* <tr>
107	* <td nowrap valign="top" align="left"><code>[a{ab}{ac}]</code></td>
108	* <td valign="top">The character 'a' and the multicharacter strings "ab" and
109	* "ac"</td>
110	* </tr>
111	* <tr>
112	* <td nowrap valign="top" align="left"><code>[\\p{Lu}]</code></td>
113	* <td valign="top">All characters in the general category Uppercase Letter</td>
114	* </tr>
115	* </table>
116	* \htmlonly</blockquote>\endhtmlonly
117	*
118	* Any character may be preceded by a backslash in order to remove any special
119	* meaning. White space characters, as defined by UCharacter.isWhitespace(), are
120	* ignored, unless they are escaped.
121	*
122	* <p>Property patterns specify a set of characters having a certain
123	* property as defined by the Unicode standard. Both the POSIX-like
124	* "[:Lu:]" and the Perl-like syntax "\\p{Lu}" are recognized. For a
125	* complete list of supported property patterns, see the User's Guide
126	* for UnicodeSet at
127	* <a href="https://unicode-org.github.io/icu/userguide/strings/unicodeset">
128	* https://unicode-org.github.io/icu/userguide/strings/unicodeset</a>.
129	* Actual determination of property data is defined by the underlying
130	* Unicode database as implemented by UCharacter.
131	*
132	* <p>Patterns specify individual characters, ranges of characters, and
133	* Unicode property sets. When elements are concatenated, they
134	* specify their union. To complement a set, place a '^' immediately
135	* after the opening '['. Property patterns are inverted by modifying
136	* their delimiters: "[:^foo:]" and "\\P{foo}". In any other location,
137	* '^' has no special meaning.
138	*
139	* <p>Since ICU 70, "[^...]", "[:^foo:]", "\\P{foo}", and "[:binaryProperty=No:]"
140	* perform a “code point complement” (all code points minus the original set),
141	* removing all multicharacter strings,
142	* equivalent to <code>.complement().removeAllStrings()</code>.
143	* The complement() API function continues to perform a
144	* symmetric difference with all code points and thus retains all multicharacter strings.
145	*
146	* <p>Ranges are indicated by placing a '-' between two
147	* characters, as in "a-z". This specifies the range of all
148	* characters from the left to the right, in Unicode order. If the
149	* left character is greater than or equal to the
150	* right character it is a syntax error. If a '-' occurs as the first
151	* character after the opening '[' or '[^', or if it occurs as the
152	* last character before the closing ']', then it is taken as a
153	* literal. Thus "[a\-b]", "[-ab]", and "[ab-]" all indicate the same
154	* set of three characters, 'a', 'b', and '-'.
155	*
156	* <p>Sets may be intersected using the '&' operator or the asymmetric
157	* set difference may be taken using the '-' operator, for example,
158	* "[[:L:]&[\\u0000-\\u0FFF]]" indicates the set of all Unicode letters
159	* with values less than 4096. Operators ('&' and '-') have equal
160	* precedence and bind left-to-right. Thus
161	* "[[:L:]-[a-z]-[\\u0100-\\u01FF]]" is equivalent to
162	* "[[[:L:]-[a-z]]-[\\u0100-\\u01FF]]". This only really matters for
163	* difference; intersection is commutative.
164	*
165	* <table>
166	* <tr valign=top><td nowrap><code>[a]</code><td>The set containing 'a'
167	* <tr valign=top><td nowrap><code>[a-z]</code><td>The set containing 'a'
168	* through 'z' and all letters in between, in Unicode order
169	* <tr valign=top><td nowrap><code>[^a-z]</code><td>The set containing
170	* all characters but 'a' through 'z',
171	* that is, U+0000 through 'a'-1 and 'z'+1 through U+10FFFF
172	* <tr valign=top><td nowrap><code>[[<em>pat1</em>][<em>pat2</em>]]</code>
173	* <td>The union of sets specified by <em>pat1</em> and <em>pat2</em>
174	* <tr valign=top><td nowrap><code>[[<em>pat1</em>]&[<em>pat2</em>]]</code>
175	* <td>The intersection of sets specified by <em>pat1</em> and <em>pat2</em>
176	* <tr valign=top><td nowrap><code>[[<em>pat1</em>]-[<em>pat2</em>]]</code>
177	* <td>The asymmetric difference of sets specified by <em>pat1</em> and
178	* <em>pat2</em>
179	* <tr valign=top><td nowrap><code>[:Lu:] or \\p{Lu}</code>
180	* <td>The set of characters having the specified
181	* Unicode property; in
182	* this case, Unicode uppercase letters
183	* <tr valign=top><td nowrap><code>[:^Lu:] or \\P{Lu}</code>
184	* <td>The set of characters <em>not</em> having the given
185	* Unicode property
186	* </table>
187	*
188	* <p><b>Formal syntax</b></p>
189	*
190	* \htmlonly<blockquote>\endhtmlonly
191	* <table>
192	* <tr align="top">
193	* <td nowrap valign="top" align="right"><code>pattern :=  </code></td>
194	* <td valign="top"><code>('[' '^'? item* ']') \|
195	* property</code></td>
196	* </tr>
197	* <tr align="top">
198	* <td nowrap valign="top" align="right"><code>item :=  </code></td>
199	* <td valign="top"><code>char \| (char '-' char) \| pattern-expr<br>
200	* </code></td>
201	* </tr>
202	* <tr align="top">
203	* <td nowrap valign="top" align="right"><code>pattern-expr :=  </code></td>
204	* <td valign="top"><code>pattern \| pattern-expr pattern \|
205	* pattern-expr op pattern<br>
206	* </code></td>
207	* </tr>
208	* <tr align="top">
209	* <td nowrap valign="top" align="right"><code>op :=  </code></td>
210	* <td valign="top"><code>'&' \| '-'<br>
211	* </code></td>
212	* </tr>
213	* <tr align="top">
214	* <td nowrap valign="top" align="right"><code>special :=  </code></td>
215	* <td valign="top"><code>'[' \| ']' \| '-'<br>
216	* </code></td>
217	* </tr>
218	* <tr align="top">
219	* <td nowrap valign="top" align="right"><code>char :=  </code></td>
220	* <td valign="top"><em>any character that is not</em><code> special<br>
221	* \| ('\' </code><em>any character</em><code>)<br>
222	* \| ('\\u' hex hex hex hex)<br>
223	* </code></td>
224	* </tr>
225	* <tr align="top">
226	* <td nowrap valign="top" align="right"><code>hex :=  </code></td>
227	* <td valign="top"><code>'0' \| '1' \| '2' \| '3' \| '4' \| '5' \| '6' \| '7' \| '8' \| '9' \|<br>
228	*     'A' \| 'B' \| 'C' \| 'D' \| 'E' \| 'F' \| 'a' \| 'b' \| 'c' \| 'd' \| 'e' \| 'f'</code></td>
229	* </tr>
230	* <tr>
231	* <td nowrap valign="top" align="right"><code>property :=  </code></td>
232	* <td valign="top"><em>a Unicode property set pattern</em></td>
233	* </tr>
234	* </table>
235	* <br>
236	* <table border="1">
237	* <tr>
238	* <td>Legend: <table>
239	* <tr>
240	* <td nowrap valign="top"><code>a := b</code></td>
241	* <td width="20" valign="top">  </td>
242	* <td valign="top"><code>a</code> may be replaced by <code>b</code> </td>
243	* </tr>
244	* <tr>
245	* <td nowrap valign="top"><code>a?</code></td>
246	* <td valign="top"></td>
247	* <td valign="top">zero or one instance of <code>a</code><br>
248	* </td>
249	* </tr>
250	* <tr>
251	* <td nowrap valign="top"><code>a*</code></td>
252	* <td valign="top"></td>
253	* <td valign="top">zero or more instances of <code>a</code><br>
254	* </td>
255	* </tr>
256	* <tr>
257	* <td nowrap valign="top"><code>a \| b</code></td>
258	* <td valign="top"></td>
259	* <td valign="top">either <code>a</code> or <code>b</code><br>
260	* </td>
261	* </tr>
262	* <tr>
263	* <td nowrap valign="top"><code>'a'</code></td>
264	* <td valign="top"></td>
265	* <td valign="top">the literal string between the quotes </td>
266	* </tr>
267	* </table>
268	* </td>
269	* </tr>
270	* </table>
271	* \htmlonly</blockquote>\endhtmlonly
272	*
273	* <p>Note:
274	* - Most UnicodeSet methods do not take a UErrorCode parameter because
275	* there are usually very few opportunities for failure other than a shortage
276	* of memory, error codes in low-level C++ string methods would be inconvenient,
277	* and the error code as the last parameter (ICU convention) would prevent
278	* the use of default parameter values.
279	* Instead, such methods set the UnicodeSet into a "bogus" state
280	* (see isBogus()) if an error occurs.
281	*
282	* @author Alan Liu
283	* @stable ICU 2.0
284	*/
285	class U_COMMON_API UnicodeSet U_FINAL : public UnicodeFilter {
286	private:
287	/**
288	* Enough for sets with few ranges.
289	* For example, White_Space has 10 ranges, list length 21.
290	*/
291	static constexpr int32_t INITIAL_CAPACITY = `25`;
292	// fFlags constant
293	static constexpr uint8_t kIsBogus = `1`; // This set is bogus (i.e. not valid)
294
295	UChar32* list = stackList; // MUST be terminated with HIGH
296	int32_t capacity = INITIAL_CAPACITY; // capacity of list
297	int32_t len = `1`; // length of list used; 1 <= len <= capacity
298	uint8_t fFlags = `0`; // Bit flag (see constants above)
299
300	BMPSet bmpSet = nullptr; // The set is frozen iff either bmpSet or stringSpan is not NULL.*
301	UChar32* buffer = nullptr; // internal buffer, may be NULL
302	int32_t bufferCapacity = `0`; // capacity of buffer
303
304	/**
305	* The pattern representation of this set. This may not be the
306	* most economical pattern. It is the pattern supplied to
307	* applyPattern(), with variables substituted and whitespace
308	* removed. For sets constructed without applyPattern(), or
309	* modified using the non-pattern API, this string will be empty,
310	* indicating that toPattern() must generate a pattern
311	* representation from the inversion list.
312	*/
313	char16_t pat = nullptr*;
314	int32_t patLen = `0`;
315
316	UVector* strings = nullptr; // maintained in sorted order
317	UnicodeSetStringSpan stringSpan = nullptr*;
318
319	/**
320	* Initial list array.
321	* Avoids some heap allocations, and list is never nullptr.
322	* Increases the object size a bit.
323	*/
324	UChar32 stackList[INITIAL_CAPACITY];
325
326	public:
327	/**
328	* Determine if this object contains a valid set.
329	* A bogus set has no value. It is different from an empty set.
330	* It can be used to indicate that no set value is available.
331	*
332	* @return true if the set is bogus/invalid, false otherwise
333	* @see setToBogus()
334	* @stable ICU 4.0
335	*/
336	inline UBool isBogus(void) const;
337
338	/**
339	* Make this UnicodeSet object invalid.
340	* The set will test true with isBogus().
341	*
342	* A bogus set has no value. It is different from an empty set.
343	* It can be used to indicate that no set value is available.
344	*
345	* This utility function is used throughout the UnicodeSet
346	* implementation to indicate that a UnicodeSet operation failed,
347	* and may be used in other functions,
348	* especially but not exclusively when such functions do not
349	* take a UErrorCode for simplicity.
350	*
351	* @see isBogus()
352	* @stable ICU 4.0
353	*/
354	void setToBogus();
355
356	public:
357
enum {
    /**
     * Minimum value that can be stored in a UnicodeSet.
     * @stable ICU 2.4
     */
    MIN_VALUE = 0,

    /**
     * Maximum value that can be stored in a UnicodeSet.
     * @stable ICU 2.4
     */
    MAX_VALUE = 0x10ffff
};
371
372	//----------------------------------------------------------------
373	// Constructors &c
374	//----------------------------------------------------------------
375
376	public:
377
378	/**
379	* Constructs an empty set.
380	* @stable ICU 2.0
381	*/
382	UnicodeSet();
383
384	/**
385	* Constructs a set containing the given range. If <code>end <
386	* start</code> then an empty set is created.
387	*
388	* @param start first character, inclusive, of range
389	* @param end last character, inclusive, of range
390	* @stable ICU 2.4
391	*/
392	UnicodeSet(UChar32 start, UChar32 end);
393
394	#ifndef U_HIDE_INTERNAL_API
/**
 * Tag enum whose single value, kSerialized, is passed to the
 * constructor that rebuilds a set from the output of serialize().
 * @internal
 */
enum ESerialization {
    kSerialized /* result of serialize() */
};
401
402	/**
403	* Constructs a set from the output of serialize().
404	*
405	* @param buffer the 16 bit array
406	* @param bufferLen the original length returned from serialize()
407	* @param serialization the value 'kSerialized'
408	* @param status error code
409	*
410	* @internal
411	*/
412	UnicodeSet(const uint16_t buffer[], int32_t bufferLen,
413	ESerialization serialization, UErrorCode &status);
414	#endif /* U_HIDE_INTERNAL_API */
415
416	/**
417	* Constructs a set from the given pattern. See the class
418	* description for the syntax of the pattern language.
419	* @param pattern a string specifying what characters are in the set
420	* @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
421	* contains a syntax error.
422	* @stable ICU 2.0
423	*/
424	UnicodeSet(const UnicodeString& pattern,
425	UErrorCode& status);
426
427	#ifndef U_HIDE_INTERNAL_API
428	/**
429	* Constructs a set from the given pattern. See the class
430	* description for the syntax of the pattern language.
431	* @param pattern a string specifying what characters are in the set
432	* @param options bitmask for options to apply to the pattern.
433	* Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
434	* @param symbols a symbol table mapping variable names to values
435	* and stand-in characters to UnicodeSets; may be NULL
436	* @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
437	* contains a syntax error.
438	* @internal
439	*/
440	UnicodeSet(const UnicodeString& pattern,
441	uint32_t options,
442	const SymbolTable* symbols,
443	UErrorCode& status);
444	#endif /* U_HIDE_INTERNAL_API */
445
446	/**
447	* Constructs a set from the given pattern. See the class description
448	* for the syntax of the pattern language.
449	* @param pattern a string specifying what characters are in the set
450	* @param pos on input, the position in pattern at which to start parsing.
451	* On output, the position after the last character parsed.
452	* @param options bitmask for options to apply to the pattern.
453	* Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
454	* @param symbols a symbol table mapping variable names to values
455	* and stand-in characters to UnicodeSets; may be NULL
456	* @param status input-output error code
457	* @stable ICU 2.8
458	*/
459	UnicodeSet(const UnicodeString& pattern, ParsePosition& pos,
460	uint32_t options,
461	const SymbolTable* symbols,
462	UErrorCode& status);
463
464	/**
465	* Constructs a set that is identical to the given UnicodeSet.
466	* @stable ICU 2.0
467	*/
468	UnicodeSet(const UnicodeSet& o);
469
470	/**
471	* Destructs the set.
472	* @stable ICU 2.0
473	*/
474	virtual ~UnicodeSet();
475
476	/**
477	* Assigns this object to be a copy of another.
478	* A frozen set will not be modified.
479	* @stable ICU 2.0
480	*/
481	UnicodeSet& operator=(const UnicodeSet& o);
482
483	/**
484	* Compares the specified object with this set for equality. Returns
485	* <tt>true</tt> if the two sets
486	* have the same size, and every member of the specified set is
487	* contained in this set (or equivalently, every member of this set is
488	* contained in the specified set).
489	*
490	* @param o set to be compared for equality with this set.
491	* @return <tt>true</tt> if the specified set is equal to this set.
492	* @stable ICU 2.0
493	*/
494	virtual bool operator==(const UnicodeSet& o) const;
495
496	/**
497	* Compares the specified object with this set for equality. Returns
498	* <tt>true</tt> if the specified set is not equal to this set.
499	* @stable ICU 2.0
500	*/
501	inline bool operator!=(const UnicodeSet& o) const;
502
503	/**
504	* Returns a copy of this object. All UnicodeFunctor objects have
505	* to support cloning in order to allow classes using
506	* UnicodeFunctors, such as Transliterator, to implement cloning.
507	* If this set is frozen, then the clone will be frozen as well.
508	* Use cloneAsThawed() for a mutable clone of a frozen set.
509	* @see cloneAsThawed
510	* @stable ICU 2.0
511	*/
512	virtual UnicodeSet* clone() const override;
513
514	/**
515	* Returns the hash code value for this set.
516	*
517	* @return the hash code value for this set.
518	* @see Object#hashCode()
519	* @stable ICU 2.0
520	*/
521	virtual int32_t hashCode(void) const;
522
523	/**
524	* Get a UnicodeSet pointer from a USet
525	*
526	* @param uset a USet (the ICU plain C type for UnicodeSet)
527	* @return the corresponding UnicodeSet pointer.
528	*
529	* @stable ICU 4.2
530	*/
531	inline static UnicodeSet fromUSet(USet uset);
532
533	/**
534	* Get a UnicodeSet pointer from a const USet
535	*
536	* @param uset a const USet (the ICU plain C type for UnicodeSet)
537	* @return the corresponding UnicodeSet pointer.
538	*
539	* @stable ICU 4.2
540	*/
541	inline static const UnicodeSet fromUSet(const* USet *uset);
542
543	/**
544	* Produce a USet * pointer for this UnicodeSet.
545	* USet is the plain C type for UnicodeSet
546	*
547	* @return a USet pointer for this UnicodeSet
548	* @stable ICU 4.2
549	*/
550	inline USet *toUSet();
551
552
553	/**
554	* Produce a const USet * pointer for this UnicodeSet.
555	* USet is the plain C type for UnicodeSet
556	*
557	* @return a const USet pointer for this UnicodeSet
558	* @stable ICU 4.2
559	*/
560	inline const USet * toUSet() const;
561
562
563	//----------------------------------------------------------------
564	// Freezable API
565	//----------------------------------------------------------------
566
567	/**
568	* Determines whether the set has been frozen (made immutable) or not.
569	* See the ICU4J Freezable interface for details.
570	* @return true/false for whether the set has been frozen
571	* @see freeze
572	* @see cloneAsThawed
573	* @stable ICU 3.8
574	*/
575	inline UBool isFrozen() const;
576
577	/**
578	* Freeze the set (make it immutable).
579	* Once frozen, it cannot be unfrozen and is therefore thread-safe
580	* until it is deleted.
581	* See the ICU4J Freezable interface for details.
582	* Freezing the set may also make some operations faster, for example
583	* contains() and span().
584	* A frozen set will not be modified. (It remains frozen.)
585	* @return this set.
586	* @see isFrozen
587	* @see cloneAsThawed
588	* @stable ICU 3.8
589	*/
590	UnicodeSet *freeze();
591
592	/**
593	* Clone the set and make the clone mutable.
594	* See the ICU4J Freezable interface for details.
595	* @return the mutable clone
596	* @see freeze
597	* @see isFrozen
598	* @stable ICU 3.8
599	*/
600	UnicodeSet cloneAsThawed() const*;
601
602	//----------------------------------------------------------------
603	// Public API
604	//----------------------------------------------------------------
605
606	/**
607	* Make this object represent the range `start - end`.
608	* If `start > end` then this object is set to an empty range.
609	* A frozen set will not be modified.
610	*
611	* @param start first character in the set, inclusive
612	* @param end last character in the set, inclusive
613	* @stable ICU 2.4
614	*/
615	UnicodeSet& set(UChar32 start, UChar32 end);
616
617	/**
618	* Return true if the given position, in the given pattern, appears
619	* to be the start of a UnicodeSet pattern.
620	* @stable ICU 2.4
621	*/
622	static UBool resemblesPattern(const UnicodeString& pattern,
623	int32_t pos);
624
625	/**
626	* Modifies this set to represent the set specified by the given
627	* pattern, ignoring Unicode Pattern_White_Space characters.
628	* See the class description for the syntax of the pattern language.
629	* A frozen set will not be modified.
630	* @param pattern a string specifying what characters are in the set
631	* @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
632	* contains a syntax error.
633	* <em> Empties the set passed before applying the pattern.</em>
634	* @return a reference to this
635	* @stable ICU 2.0
636	*/
637	UnicodeSet& applyPattern(const UnicodeString& pattern,
638	UErrorCode& status);
639
640	#ifndef U_HIDE_INTERNAL_API
641	/**
642	* Modifies this set to represent the set specified by the given
643	* pattern, optionally ignoring Unicode Pattern_White_Space characters.
644	* See the class description for the syntax of the pattern language.
645	* A frozen set will not be modified.
646	* @param pattern a string specifying what characters are in the set
647	* @param options bitmask for options to apply to the pattern.
648	* Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
649	* @param symbols a symbol table mapping variable names to
650	* values and stand-ins to UnicodeSets; may be NULL
651	* @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
652	* contains a syntax error.
653	*<em> Empties the set passed before applying the pattern.</em>
654	* @return a reference to this
655	* @internal
656	*/
657	UnicodeSet& applyPattern(const UnicodeString& pattern,
658	uint32_t options,
659	const SymbolTable* symbols,
660	UErrorCode& status);
661	#endif /* U_HIDE_INTERNAL_API */
662
663	/**
664	* Parses the given pattern, starting at the given position. The
665	* character at pattern.charAt(pos.getIndex()) must be '[', or the
666	* parse fails. Parsing continues until the corresponding closing
667	* ']'. If a syntax error is encountered between the opening and
668	* closing brace, the parse fails. Upon return from a successful
669	* parse, the ParsePosition is updated to point to the character
670	* following the closing ']', and a reference to this set
671	* is returned. This method calls
672	* itself recursively to parse embedded subpatterns.
673	*<em> Empties the set passed before applying the pattern.</em>
674	* A frozen set will not be modified.
675	*
676	* @param pattern the string containing the pattern to be parsed.
677	* The portion of the string from pos.getIndex(), which must be a
678	* '[', to the corresponding closing ']', is parsed.
679	* @param pos upon entry, the position at which to begin parsing.
680	* The character at pattern.charAt(pos.getIndex()) must be a '['.
681	* Upon return from a successful parse, pos.getIndex() is either
682	* the character after the closing ']' of the parsed pattern, or
683	* pattern.length() if the closing ']' is the last character of
684	* the pattern string.
685	* @param options bitmask for options to apply to the pattern.
686	* Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
687	* @param symbols a symbol table mapping variable names to
688	* values and stand-ins to UnicodeSets; may be NULL
689	* @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
690	* contains a syntax error.
691	* @return a reference to this
692	* @stable ICU 2.8
693	*/
694	UnicodeSet& applyPattern(const UnicodeString& pattern,
695	ParsePosition& pos,
696	uint32_t options,
697	const SymbolTable* symbols,
698	UErrorCode& status);
699
700	/**
701	* Returns a string representation of this set. If the result of
702	* calling this function is passed to a UnicodeSet constructor, it
703	* will produce another set that is equal to this one.
704	* A frozen set will not be modified.
705	* @param result the string to receive the rules. Previous
706	* contents will be deleted.
707	* @param escapeUnprintable if true then convert unprintable
708	* characters to their hex escape representations, \\uxxxx or
709	* \\Uxxxxxxxx. Unprintable characters are those other than
710	* U+000A, U+0020..U+007E.
711	* @stable ICU 2.0
712	*/
713	virtual UnicodeString& toPattern(UnicodeString& result,
714	UBool escapeUnprintable = false) const override;
715
716	/**
717	* Modifies this set to contain those code points which have the given value
718	* for the given binary or enumerated property, as returned by
719	* u_getIntPropertyValue. Prior contents of this set are lost.
720	* A frozen set will not be modified.
721	*
722	* @param prop a property in the range UCHAR_BIN_START..UCHAR_BIN_LIMIT-1
723	* or UCHAR_INT_START..UCHAR_INT_LIMIT-1
724	* or UCHAR_MASK_START..UCHAR_MASK_LIMIT-1.
725	*
726	* @param value a value in the range u_getIntPropertyMinValue(prop)..
727	* u_getIntPropertyMaxValue(prop), with one exception. If prop is
728	* UCHAR_GENERAL_CATEGORY_MASK, then value should not be a UCharCategory, but
729	* rather a mask value produced by U_GET_GC_MASK(). This allows grouped
730	* categories such as [:L:] to be represented.
731	*
732	* @param ec error code input/output parameter
733	*
734	* @return a reference to this set
735	*
736	* @stable ICU 2.4
737	*/
738	UnicodeSet& applyIntPropertyValue(UProperty prop,
739	int32_t value,
740	UErrorCode& ec);
741
742	/**
743	* Modifies this set to contain those code points which have the
744	* given value for the given property. Prior contents of this
745	* set are lost.
746	* A frozen set will not be modified.
747	*
748	* @param prop a property alias, either short or long. The name is matched
749	* loosely. See PropertyAliases.txt for names and a description of loose
750	* matching. If the value string is empty, then this string is interpreted
751	* as either a General_Category value alias, a Script value alias, a binary
752	* property alias, or a special ID. Special IDs are matched loosely and
753	* correspond to the following sets:
754	*
755	* "ANY" = [\\u0000-\\U0010FFFF],
756	* "ASCII" = [\\u0000-\\u007F],
757	* "Assigned" = [:^Cn:].
758	*
759	* @param value a value alias, either short or long. The name is matched
760	* loosely. See PropertyValueAliases.txt for names and a description of
761	* loose matching. In addition to aliases listed, numeric values and
762	* canonical combining classes may be expressed numerically, e.g., ("nv",
763	* "0.5") or ("ccc", "220"). The value string may also be empty.
764	*
765	* @param ec error code input/output parameter
766	*
767	* @return a reference to this set
768	*
769	* @stable ICU 2.4
770	*/
771	UnicodeSet& applyPropertyAlias(const UnicodeString& prop,
772	const UnicodeString& value,
773	UErrorCode& ec);
774
775	/**
776	* Returns the number of elements in this set (its cardinality).
777	* Note that the elements of a set may include both individual
778	* codepoints and strings.
779	*
780	* This is slower than getRangeCount() because
781	* it counts the code points of all ranges.
782	*
783	* @return the number of elements in this set (its cardinality).
784	* @stable ICU 2.0
785	* @see getRangeCount
786	*/
787	virtual int32_t size(void) const;
788
789	/**
790	* Returns <tt>true</tt> if this set contains no elements.
791	*
792	* @return <tt>true</tt> if this set contains no elements.
793	* @stable ICU 2.0
794	*/
795	virtual UBool isEmpty(void) const;
796
797	#ifndef U_HIDE_DRAFT_API
798	/**
799	* @return true if this set contains multi-character strings or the empty string.
800	* @draft ICU 70
801	*/
802	UBool hasStrings() const;
803	#endif // U_HIDE_DRAFT_API
804
805	/**
806	* Returns true if this set contains the given character.
807	* This function works faster with a frozen set.
808	* @param c character to be checked for containment
809	* @return true if the test condition is met
810	* @stable ICU 2.0
811	*/
812	virtual UBool contains(UChar32 c) const override;
813
814	/**
815	* Returns true if this set contains every character
816	* of the given range.
817	* @param start first character, inclusive, of the range
818	* @param end last character, inclusive, of the range
819	* @return true if the test condition is met
820	* @stable ICU 2.0
821	*/
822	virtual UBool contains(UChar32 start, UChar32 end) const;
823
824	/**
825	* Returns <tt>true</tt> if this set contains the given
826	* multicharacter string.
827	* @param s string to be checked for containment
828	* @return <tt>true</tt> if this set contains the specified string
829	* @stable ICU 2.4
830	*/
831	UBool contains(const UnicodeString& s) const;
832
833	/**
834	* Returns true if this set contains all the characters and strings
835	* of the given set.
836	* @param c set to be checked for containment
837	* @return true if the test condition is met
838	* @stable ICU 2.4
839	*/
840	virtual UBool containsAll(const UnicodeSet& c) const;
841
842	/**
843	* Returns true if this set contains all the characters
844	* of the given string.
845	* @param s string containing characters to be checked for containment
846	* @return true if the test condition is met
847	* @stable ICU 2.4
848	*/
849	UBool containsAll(const UnicodeString& s) const;
850
851	/**
852	* Returns true if this set contains none of the characters
853	* of the given range.
854	* @param start first character, inclusive, of the range
855	* @param end last character, inclusive, of the range
856	* @return true if the test condition is met
857	* @stable ICU 2.4
858	*/
859	UBool containsNone(UChar32 start, UChar32 end) const;
860
861	/**
862	* Returns true if this set contains none of the characters and strings
863	* of the given set.
864	* @param c set to be checked for containment
865	* @return true if the test condition is met
866	* @stable ICU 2.4
867	*/
868	UBool containsNone(const UnicodeSet& c) const;
869
870	/**
871	* Returns true if this set contains none of the characters
872	* of the given string.
873	* @param s string containing characters to be checked for containment
874	* @return true if the test condition is met
875	* @stable ICU 2.4
876	*/
877	UBool containsNone(const UnicodeString& s) const;
878
879	/**
880	* Returns true if this set contains one or more of the characters
881	* in the given range.
882	* @param start first character, inclusive, of the range
883	* @param end last character, inclusive, of the range
884	* @return true if the condition is met
885	* @stable ICU 2.4
886	*/
887	inline UBool containsSome(UChar32 start, UChar32 end) const;
888
889	/**
890	* Returns true if this set contains one or more of the characters
891	* and strings of the given set.
892	* @param s The set to be checked for containment
893	* @return true if the condition is met
894	* @stable ICU 2.4
895	*/
896	inline UBool containsSome(const UnicodeSet& s) const;
897
898	/**
899	* Returns true if this set contains one or more of the characters
900	* of the given string.
901	* @param s string containing characters to be checked for containment
902	* @return true if the condition is met
903	* @stable ICU 2.4
904	*/
905	inline UBool containsSome(const UnicodeString& s) const;
906
907	/**
908	* Returns the length of the initial substring of the input string which
909	* consists only of characters and strings that are contained in this set
910	* (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
911	* or only of characters and strings that are not contained
912	* in this set (USET_SPAN_NOT_CONTAINED).
913	* See USetSpanCondition for details.
914	* Similar to the strspn() C library function.
915	* Unpaired surrogates are treated according to contains() of their surrogate code points.
916	* This function works faster with a frozen set and with a non-negative string length argument.
917	* @param s start of the string
918	* @param length of the string; can be -1 for NUL-terminated
919	* @param spanCondition specifies the containment condition
920	* @return the length of the initial substring according to the spanCondition;
921	* 0 if the start of the string does not fit the spanCondition
922	* @stable ICU 3.8
923	* @see USetSpanCondition
924	*/
925	int32_t span(const char16_t s, int32_t length, USetSpanCondition spanCondition) const*;
926
927	/**
928	* Returns the end of the substring of the input string according to the USetSpanCondition.
929	* Same as <code>start+span(s.getBuffer()+start, s.length()-start, spanCondition)</code>
930	* after pinning start to 0<=start<=s.length().
931	* @param s the string
932	* @param start the start index in the string for the span operation
933	* @param spanCondition specifies the containment condition
934	* @return the exclusive end of the substring according to the spanCondition;
935	* the substring s.tempSubStringBetween(start, end) fulfills the spanCondition
936	* @stable ICU 4.4
937	* @see USetSpanCondition
938	*/
939	inline int32_t span(const UnicodeString &s, int32_t start, USetSpanCondition spanCondition) const;
940
941	/**
942	* Returns the start of the trailing substring of the input string which
943	* consists only of characters and strings that are contained in this set
944	* (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
945	* or only of characters and strings that are not contained
946	* in this set (USET_SPAN_NOT_CONTAINED).
947	* See USetSpanCondition for details.
948	* Unpaired surrogates are treated according to contains() of their surrogate code points.
949	* This function works faster with a frozen set and with a non-negative string length argument.
950	* @param s start of the string
951	* @param length of the string; can be -1 for NUL-terminated
952	* @param spanCondition specifies the containment condition
953	* @return the start of the trailing substring according to the spanCondition;
954	* the string length if the end of the string does not fit the spanCondition
955	* @stable ICU 3.8
956	* @see USetSpanCondition
957	*/
958	int32_t spanBack(const char16_t s, int32_t length, USetSpanCondition spanCondition) const*;
959
960	/**
961	* Returns the start of the substring of the input string according to the USetSpanCondition.
962	* Same as <code>spanBack(s.getBuffer(), limit, spanCondition)</code>
963	* after pinning limit to 0<=limit<=s.length().
964	* @param s the string
965	* @param limit the exclusive-end index in the string for the span operation
966	* (use s.length() or INT32_MAX for spanning back from the end of the string)
967	* @param spanCondition specifies the containment condition
968	* @return the start of the substring according to the spanCondition;
969	* the substring s.tempSubStringBetween(start, limit) fulfills the spanCondition
970	* @stable ICU 4.4
971	* @see USetSpanCondition
972	*/
973	inline int32_t spanBack(const UnicodeString &s, int32_t limit, USetSpanCondition spanCondition) const;
974
975	/**
976	* Returns the length of the initial substring of the input string which
977	* consists only of characters and strings that are contained in this set
978	* (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
979	* or only of characters and strings that are not contained
980	* in this set (USET_SPAN_NOT_CONTAINED).
981	* See USetSpanCondition for details.
982	* Similar to the strspn() C library function.
983	* Malformed byte sequences are treated according to contains(0xfffd).
984	* This function works faster with a frozen set and with a non-negative string length argument.
985	* @param s start of the string (UTF-8)
986	* @param length of the string; can be -1 for NUL-terminated
987	* @param spanCondition specifies the containment condition
988	* @return the length of the initial substring according to the spanCondition;
989	* 0 if the start of the string does not fit the spanCondition
990	* @stable ICU 3.8
991	* @see USetSpanCondition
992	*/
993	int32_t spanUTF8(const char s, int32_t length, USetSpanCondition spanCondition) const*;
994
995	/**
996	* Returns the start of the trailing substring of the input string which
997	* consists only of characters and strings that are contained in this set
998	* (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
999	* or only of characters and strings that are not contained
1000	* in this set (USET_SPAN_NOT_CONTAINED).
1001	* See USetSpanCondition for details.
1002	* Malformed byte sequences are treated according to contains(0xfffd).
1003	* This function works faster with a frozen set and with a non-negative string length argument.
1004	* @param s start of the string (UTF-8)
1005	* @param length of the string; can be -1 for NUL-terminated
1006	* @param spanCondition specifies the containment condition
1007	* @return the start of the trailing substring according to the spanCondition;
1008	* the string length if the end of the string does not fit the spanCondition
1009	* @stable ICU 3.8
1010	* @see USetSpanCondition
1011	*/
1012	int32_t spanBackUTF8(const char s, int32_t length, USetSpanCondition spanCondition) const*;
1013
1014	/**
1015	* Implement UnicodeMatcher::matches()
1016	* @stable ICU 2.4
1017	*/
1018	virtual UMatchDegree matches(const Replaceable& text,
1019	int32_t& offset,
1020	int32_t limit,
1021	UBool incremental) override;
1022
1023	private:
1024	/**
1025	* Returns the longest match for s in text at the given position.
1026	* If limit > start then match forward from start+1 to limit
1027	* matching all characters except s.charAt(0). If limit < start,
1028	* go backward starting from start-1 matching all characters
1029	* except s.charAt(s.length()-1). This method assumes that the
1030	* first character, text.charAt(start), matches s, so it does not
1031	* check it.
1032	* @param text the text to match
1033	* @param start the first character to match. In the forward
1034	* direction, text.charAt(start) is matched against s.charAt(0).
1035	* In the reverse direction, it is matched against
1036	* s.charAt(s.length()-1).
1037	* @param limit the limit offset for matching, either last+1 in
1038	* the forward direction, or last-1 in the reverse direction,
1039	* where last is the index of the last character to match.
1040	* @param s
1041	* @return If part of s matches up to the limit, return |limit -
1042	* start|. If all of s matches before reaching the limit, return
1043	* s.length(). If there is a mismatch between s and text, return
1044	* 0
1045	*/
1046	static int32_t matchRest(const Replaceable& text,
1047	int32_t start, int32_t limit,
1048	const UnicodeString& s);
1049
1050	/**
1051	* Returns the smallest value i such that c < list[i]. Caller
1052	* must ensure that c is a legal value or this method will enter
1053	* an infinite loop. This method performs a binary search.
1054	* @param c a character in the range MIN_VALUE..MAX_VALUE
1055	* inclusive
1056	* @return the smallest integer i in the range 0..len-1,
1057	* inclusive, such that c < list[i]
1058	*/
1059	int32_t findCodePoint(UChar32 c) const;
1060
1061	public:
1062
1063	/**
1064	* Implementation of UnicodeMatcher API. Union the set of all
1065	* characters that may be matched by this object into the given
1066	* set.
1067	* @param toUnionTo the set into which to union the source characters
1068	* @stable ICU 2.4
1069	*/
1070	virtual void addMatchSetTo(UnicodeSet& toUnionTo) const override;
1071
1072	/**
1073	* Returns the index of the given character within this set, where
1074	* the set is ordered by ascending code point. If the character
1075	* is not in this set, return -1. The inverse of this method is
1076	* <code>charAt()</code>.
1077	* @return an index from 0..size()-1, or -1
1078	* @stable ICU 2.4
1079	*/
1080	int32_t indexOf(UChar32 c) const;
1081
1082	/**
1083	* Returns the character at the given index within this set, where
1084	* the set is ordered by ascending code point. If the index is
1085	* out of range for characters, returns (UChar32)-1.
1086	* The inverse of this method is <code>indexOf()</code>.
1087	*
1088	* For iteration, this is slower than UnicodeSetIterator or
1089	* getRangeCount()/getRangeStart()/getRangeEnd(),
1090	* because for each call it skips linearly over <code>index</code>
1091	* characters in the ranges.
1092	*
1093	* @param index an index from 0..size()-1
1094	* @return the character at the given index, or (UChar32)-1.
1095	* @stable ICU 2.4
1096	*/
1097	UChar32 charAt(int32_t index) const;
1098
1099	/**
1100	* Adds the specified range to this set if it is not already
1101	* present. If this set already contains the specified range,
1102	* the call leaves this set unchanged. If <code>start > end</code>
1103	* then an empty range is added, leaving the set unchanged.
1104	* This is equivalent to a boolean logic OR, or a set UNION.
1105	* A frozen set will not be modified.
1106	*
1107	* @param start first character, inclusive, of range to be added
1108	* to this set.
1109	* @param end last character, inclusive, of range to be added
1110	* to this set.
1111	* @stable ICU 2.0
1112	*/
1113	virtual UnicodeSet& add(UChar32 start, UChar32 end);
1114
1115	/**
1116	* Adds the specified character to this set if it is not already
1117	* present. If this set already contains the specified character,
1118	* the call leaves this set unchanged.
1119	* A frozen set will not be modified.
1120	*
1121	* @param c the character (code point)
1122	* @return this object, for chaining
1123	* @stable ICU 2.0
1124	*/
1125	UnicodeSet& add(UChar32 c);
1126
1127	/**
1128	* Adds the specified multicharacter string to this set if it is not already
1129	* present. If this set already contains the multicharacter string,
1130	* the call leaves this set unchanged.
1131	* Thus "ch" => {"ch"}
1132	* A frozen set will not be modified.
1133	*
1134	* @param s the source string
1135	* @return this object, for chaining
1136	* @stable ICU 2.4
1137	*/
1138	UnicodeSet& add(const UnicodeString& s);
1139
1140	private:
1141	/**
1142	* @return a code point IF the string consists of a single one.
1143	* otherwise returns -1.
1144	* @param s string to test
1145	*/
1146	static int32_t getSingleCP(const UnicodeString& s);
1147
1148	void _add(const UnicodeString& s);
1149
1150	public:
1151	/**
1152	* Adds each of the characters in this string to the set. Note: "ch" => {"c", "h"}
1153	* If this set already contains any particular character, it has no effect on that character.
1154	* A frozen set will not be modified.
1155	* @param s the source string
1156	* @return this object, for chaining
1157	* @stable ICU 2.4
1158	*/
1159	UnicodeSet& addAll(const UnicodeString& s);
1160
1161	/**
1162	* Retains EACH of the characters in this string. Note: "ch" == {"c", "h"}
1163	* A frozen set will not be modified.
1164	* @param s the source string
1165	* @return this object, for chaining
1166	* @stable ICU 2.4
1167	*/
1168	UnicodeSet& retainAll(const UnicodeString& s);
1169
1170	/**
1171	* Complement EACH of the characters in this string. Note: "ch" == {"c", "h"}
1172	* A frozen set will not be modified.
1173	* @param s the source string
1174	* @return this object, for chaining
1175	* @stable ICU 2.4
1176	*/
1177	UnicodeSet& complementAll(const UnicodeString& s);
1178
1179	/**
1180	* Remove EACH of the characters in this string. Note: "ch" == {"c", "h"}
1181	* A frozen set will not be modified.
1182	* @param s the source string
1183	* @return this object, for chaining
1184	* @stable ICU 2.4
1185	*/
1186	UnicodeSet& removeAll(const UnicodeString& s);
1187
1188	/**
1189	* Makes a set from a multicharacter string. Thus "ch" => {"ch"}
1190	*
1191	* @param s the source string
1192	* @return a newly created set containing the given string.
1193	* The caller owns the return object and is responsible for deleting it.
1194	* @stable ICU 2.4
1195	*/
1196	static UnicodeSet* U_EXPORT2 createFrom(const UnicodeString& s);
1197
1198
1199	/**
1200	* Makes a set from each of the characters in the string. Thus "ch" => {"c", "h"}
1201	* @param s the source string
1202	* @return a newly created set containing the given characters
1203	* The caller owns the return object and is responsible for deleting it.
1204	* @stable ICU 2.4
1205	*/
1206	static UnicodeSet* U_EXPORT2 createFromAll(const UnicodeString& s);
1207
1208	/**
1209	* Retain only the elements in this set that are contained in the
1210	* specified range. If <code>start > end</code> then an empty range is
1211	* retained, leaving the set empty. This is equivalent to
1212	* a boolean logic AND, or a set INTERSECTION.
1213	* A frozen set will not be modified.
1214	*
1215	* @param start first character, inclusive, of range
1216	* @param end last character, inclusive, of range
1217	* @stable ICU 2.0
1218	*/
1219	virtual UnicodeSet& retain(UChar32 start, UChar32 end);
1220
1221
1222	/**
1223	* Retain the specified character from this set if it is present.
1224	* A frozen set will not be modified.
1225	*
1226	* @param c the character (code point)
1227	* @return this object, for chaining
1228	* @stable ICU 2.0
1229	*/
1230	UnicodeSet& retain(UChar32 c);
1231
1232	#ifndef U_HIDE_DRAFT_API
1233	/**
1234	* Retains only the specified string from this set if it is present.
1235	* Upon return this set will be empty if it did not contain s, or
1236	* will only contain s if it did contain s.
1237	* A frozen set will not be modified.
1238	*
1239	* @param s the source string
1240	* @return this object, for chaining
1241	* @draft ICU 69
1242	*/
1243	UnicodeSet& retain(const UnicodeString &s);
1244	#endif // U_HIDE_DRAFT_API
1245
1246	/**
1247	* Removes the specified range from this set if it is present.
1248	* The set will not contain the specified range once the call
1249	* returns. If <code>start > end</code> then an empty range is
1250	* removed, leaving the set unchanged.
1251	* A frozen set will not be modified.
1252	*
1253	* @param start first character, inclusive, of range to be removed
1254	* from this set.
1255	* @param end last character, inclusive, of range to be removed
1256	* from this set.
1257	* @stable ICU 2.0
1258	*/
1259	virtual UnicodeSet& remove(UChar32 start, UChar32 end);
1260
1261	/**
1262	* Removes the specified character from this set if it is present.
1263	* The set will not contain the specified character once the call
1264	* returns.
1265	* A frozen set will not be modified.
1266	*
1267	* @param c the character (code point)
1268	* @return this object, for chaining
1269	* @stable ICU 2.0
1270	*/
1271	UnicodeSet& remove(UChar32 c);
1272
1273	/**
1274	* Removes the specified string from this set if it is present.
1275	* The set will not contain the specified string once the call
1276	* returns.
1277	* A frozen set will not be modified.
1278	* @param s the source string
1279	* @return this object, for chaining
1280	* @stable ICU 2.4
1281	*/
1282	UnicodeSet& remove(const UnicodeString& s);
1283
1284	/**
1285	* This is equivalent to
1286	* <code>complement(MIN_VALUE, MAX_VALUE)</code>.
1287	*
1288	* <strong>Note:</strong> This performs a symmetric difference with all code points
1289	* <em>and thus retains all multicharacter strings</em>.
1290	* In order to achieve a “code point complement” (all code points minus this set),
1291	* the easiest is to <code>.complement().removeAllStrings()</code>.
1292	*
1293	* A frozen set will not be modified.
1294	* @stable ICU 2.0
1295	*/
1296	virtual UnicodeSet& complement();
1297
1298	/**
1299	* Complements the specified range in this set. Any character in
1300	* the range will be removed if it is in this set, or will be
1301	* added if it is not in this set. If <code>start > end</code>
1302	* then an empty range is complemented, leaving the set unchanged.
1303	* This is equivalent to a boolean logic XOR.
1304	* A frozen set will not be modified.
1305	*
1306	* @param start first character, inclusive, of range
1307	* @param end last character, inclusive, of range
1308	* @stable ICU 2.0
1309	*/
1310	virtual UnicodeSet& complement(UChar32 start, UChar32 end);
1311
1312	/**
1313	* Complements the specified character in this set. The character
1314	* will be removed if it is in this set, or will be added if it is
1315	* not in this set.
1316	* A frozen set will not be modified.
1317	*
1318	* @param c the character (code point)
1319	* @return this object, for chaining
1320	* @stable ICU 2.0
1321	*/
1322	UnicodeSet& complement(UChar32 c);
1323
1324	/**
1325	* Complement the specified string in this set.
1326	* The string will be removed if it is in this set, or will be added if it is not in this set.
1327	* A frozen set will not be modified.
1328	*
1329	* @param s the string to complement
1330	* @return this object, for chaining
1331	* @stable ICU 2.4
1332	*/
1333	UnicodeSet& complement(const UnicodeString& s);
1334
1335	/**
1336	* Adds all of the elements in the specified set to this set if
1337	* they're not already present. This operation effectively
1338	* modifies this set so that its value is the <i>union</i> of the two
1339	* sets. The behavior of this operation is unspecified if the specified
1340	* collection is modified while the operation is in progress.
1341	* A frozen set will not be modified.
1342	*
1343	* @param c set whose elements are to be added to this set.
1344	* @see #add(UChar32, UChar32)
1345	* @stable ICU 2.0
1346	*/
1347	virtual UnicodeSet& addAll(const UnicodeSet& c);
1348
1349	/**
1350	* Retains only the elements in this set that are contained in the
1351	* specified set. In other words, removes from this set all of
1352	* its elements that are not contained in the specified set. This
1353	* operation effectively modifies this set so that its value is
1354	* the <i>intersection</i> of the two sets.
1355	* A frozen set will not be modified.
1356	*
1357	* @param c set that defines which elements this set will retain.
1358	* @stable ICU 2.0
1359	*/
1360	virtual UnicodeSet& retainAll(const UnicodeSet& c);
1361
1362	/**
1363	* Removes from this set all of its elements that are contained in the
1364	* specified set. This operation effectively modifies this
1365	* set so that its value is the <i>asymmetric set difference</i> of
1366	* the two sets.
1367	* A frozen set will not be modified.
1368	*
1369	* @param c set that defines which elements will be removed from
1370	* this set.
1371	* @stable ICU 2.0
1372	*/
1373	virtual UnicodeSet& removeAll(const UnicodeSet& c);
1374
1375	/**
1376	* Complements in this set all elements contained in the specified
1377	* set. Any character in the other set will be removed if it is
1378	* in this set, or will be added if it is not in this set.
1379	* A frozen set will not be modified.
1380	*
1381	* @param c set that defines which elements will be xor'ed from
1382	* this set.
1383	* @stable ICU 2.4
1384	*/
1385	virtual UnicodeSet& complementAll(const UnicodeSet& c);
1386
1387	/**
1388	* Removes all of the elements from this set. This set will be
1389	* empty after this call returns.
1390	* A frozen set will not be modified.
1391	* @stable ICU 2.0
1392	*/
1393	virtual UnicodeSet& clear(void);
1394
1395	/**
1396	* Close this set over the given attribute. For the attribute
1397	* USET_CASE, the result is to modify this set so that:
1398	*
1399	* 1. For each character or string 'a' in this set, all strings or
1400	* characters 'b' such that foldCase(a) == foldCase(b) are added
1401	* to this set.
1402	*
1403	* 2. For each string 'e' in the resulting set, if e !=
1404	* foldCase(e), 'e' will be removed.
1405	*
1406	* Example: [aq\\u00DF{Bc}{bC}{Fi}] => [aAqQ\\u00DF\\uFB01{ss}{bc}{fi}]
1407	*
1408	* (Here foldCase(x) refers to the operation u_strFoldCase, and a
1409	* == b denotes that the contents are the same, not pointer
1410	* comparison.)
1411	*
1412	* A frozen set will not be modified.
1413	*
1414	* @param attribute bitmask for attributes to close over.
1415	* Currently only the USET_CASE bit is supported. Any undefined bits
1416	* are ignored.
1417	* @return a reference to this set.
1418	* @stable ICU 4.2
1419	*/
1420	UnicodeSet& closeOver(int32_t attribute);
1421
1422	/**
1423	* Remove all strings from this set.
1424	*
1425	* @return a reference to this set.
1426	* @stable ICU 4.2
1427	*/
1428	virtual UnicodeSet &removeAllStrings();
1429
1430	/**
1431	* Iteration method that returns the number of ranges contained in
1432	* this set.
1433	* @see #getRangeStart
1434	* @see #getRangeEnd
1435	* @stable ICU 2.4
1436	*/
1437	virtual int32_t getRangeCount(void) const;
1438
1439	/**
1440	* Iteration method that returns the first character in the
1441	* specified range of this set.
1442	* @see #getRangeCount
1443	* @see #getRangeEnd
1444	* @stable ICU 2.4
1445	*/
1446	virtual UChar32 getRangeStart(int32_t index) const;
1447
1448	/**
1449	* Iteration method that returns the last character in the
1450	* specified range of this set.
1451	* @see #getRangeStart
1452	* @see #getRangeCount
1453	* @stable ICU 2.4
1454	*/
1455	virtual UChar32 getRangeEnd(int32_t index) const;
1456
1457	/**
1458	* Serializes this set into an array of 16-bit integers. Serialization
1459	* (currently) only records the characters in the set; multicharacter
1460	* strings are ignored.
1461	*
1462	* The array has following format (each line is one 16-bit
1463	* integer):
1464	*
1465	* length = (n+2*m) \| (m!=0?0x8000:0)
1466	* bmpLength = n; present if m!=0
1467	* bmp[0]
1468	* bmp[1]
1469	* ...
1470	* bmp[n-1]
1471	* supp-high[0]
1472	* supp-low[0]
1473	* supp-high[1]
1474	* supp-low[1]
1475	* ...
1476	* supp-high[m-1]
1477	* supp-low[m-1]
1478	*
1479	* The array starts with a header. After the header are n bmp
1480	* code points, then m supplementary code points. Either n or m
1481	* or both may be zero. n+2*m is always <= 0x7FFF.
1482	*
1483	* If there are no supplementary characters (if m==0) then the
1484	* header is one 16-bit integer, 'length', with value n.
1485	*
1486	* If there are supplementary characters (if m!=0) then the header
1487	* is two 16-bit integers. The first, 'length', has value
1488	* (n+2*m)\|0x8000. The second, 'bmpLength', has value n.
1489	*
1490	* After the header the code points are stored in ascending order.
1491	* Supplementary code points are stored as most significant 16
1492	* bits followed by least significant 16 bits.
1493	*
1494	* @param dest pointer to buffer of destCapacity 16-bit integers.
1495	* May be NULL only if destCapacity is zero.
1496	* @param destCapacity size of dest, or zero. Must not be negative.
1497	* @param ec error code. Will be set to U_INDEX_OUTOFBOUNDS_ERROR
1498	* if n+2*m > 0x7FFF. Will be set to U_BUFFER_OVERFLOW_ERROR if
1499	* n+2*m+(m!=0?2:1) > destCapacity.
1500	* @return the total length of the serialized format, including
1501	* the header, that is, n+2*m+(m!=0?2:1), or 0 on error other
1502	* than U_BUFFER_OVERFLOW_ERROR.
1503	* @stable ICU 2.4
1504	*/
1505	int32_t serialize(uint16_t dest, int32_t destCapacity, UErrorCode& ec) const*;
1506
1507	/**
1508	* Reallocate this object's internal structures to take up the least
1509	* possible space, without changing this object's value.
1510	* A frozen set will not be modified.
1511	* @stable ICU 2.4
1512	*/
1513	virtual UnicodeSet& compact();
1514
1515	/**
1516	* Return the class ID for this class. This is useful only for
1517	* comparing to a return value from getDynamicClassID(). For example:
1518	* <pre>
1519	* . Base* polymorphic_pointer = createPolymorphicObject();
1520	* . if (polymorphic_pointer->getDynamicClassID() ==
1521	* . Derived::getStaticClassID()) ...
1522	* </pre>
1523	* @return The class ID for all objects of this class.
1524	* @stable ICU 2.0
1525	*/
1526	static UClassID U_EXPORT2 getStaticClassID(void);
1527
1528	/**
1529	* Implement UnicodeFunctor API.
1530	*
1531	* @return The class ID for this object. All objects of a given
1532	* class have the same class ID. Objects of other classes have
1533	* different class IDs.
1534	* @stable ICU 2.4
1535	*/
1536	virtual UClassID getDynamicClassID(void) const override;
1537
1538	private:
1539
1540	// Private API for the USet API
1541
1542	friend class USetAccess;
1543
1544	const UnicodeString* getString(int32_t index) const;
1545
1546	//----------------------------------------------------------------
1547	// RuleBasedTransliterator support
1548	//----------------------------------------------------------------
1549
1550	private:
1551
1552	/**
1553	* Returns <tt>true</tt> if this set contains any character whose low byte
1554	* is the given value. This is used by <tt>RuleBasedTransliterator</tt> for
1555	* indexing.
1556	*/
1557	virtual UBool matchesIndexValue(uint8_t v) const override;
1558
1559	private:
1560	friend class RBBIRuleScanner;
1561
1562	//----------------------------------------------------------------
1563	// Implementation: Clone as thawed (see ICU4J Freezable)
1564	//----------------------------------------------------------------
1565
1566	UnicodeSet(const UnicodeSet& o, UBool / asThawed /);
1567	UnicodeSet& copyFrom(const UnicodeSet& o, UBool asThawed);
1568
1569	//----------------------------------------------------------------
1570	// Implementation: Pattern parsing
1571	//----------------------------------------------------------------
1572
1573	void applyPatternIgnoreSpace(const UnicodeString& pattern,
1574	ParsePosition& pos,
1575	const SymbolTable* symbols,
1576	UErrorCode& status);
1577
1578	void applyPattern(RuleCharacterIterator& chars,
1579	const SymbolTable* symbols,
1580	UnicodeString& rebuiltPat,
1581	uint32_t options,
1582	UnicodeSet& (UnicodeSet::*caseClosure)(int32_t attribute),
1583	int32_t depth,
1584	UErrorCode& ec);
1585
1586	//----------------------------------------------------------------
1587	// Implementation: Utility methods
1588	//----------------------------------------------------------------
1589
1590	static int32_t nextCapacity(int32_t minCapacity);
1591
1592	bool ensureCapacity(int32_t newLen);
1593
1594	bool ensureBufferCapacity(int32_t newLen);
1595
1596	void swapBuffers(void);
1597
1598	UBool allocateStrings(UErrorCode &status);
1599	int32_t stringsSize() const;
1600	UBool stringsContains(const UnicodeString &s) const;
1601
1602	UnicodeString& _toPattern(UnicodeString& result,
1603	UBool escapeUnprintable) const;
1604
1605	UnicodeString& _generatePattern(UnicodeString& result,
1606	UBool escapeUnprintable) const;
1607
1608	static void _appendToPat(UnicodeString& buf, const UnicodeString& s, UBool escapeUnprintable);
1609
1610	static void _appendToPat(UnicodeString& buf, UChar32 c, UBool escapeUnprintable);
1611
1612	static void _appendToPat(UnicodeString &result, UChar32 start, UChar32 end,
1613	UBool escapeUnprintable);
1614
1615	//----------------------------------------------------------------
1616	// Implementation: Fundamental operators
1617	//----------------------------------------------------------------
1618
1619	void exclusiveOr(const UChar32* other, int32_t otherLen, int8_t polarity);
1620
1621	void add(const UChar32* other, int32_t otherLen, int8_t polarity);
1622
1623	void retain(const UChar32* other, int32_t otherLen, int8_t polarity);
1624
1625	/**
1626	* Return true if the given position, in the given pattern, appears
1627	* to be the start of a property set pattern [:foo:], \\p{foo}, or
1628	* \\P{foo}, or \\N{name}.
1629	*/
1630	static UBool resemblesPropertyPattern(const UnicodeString& pattern,
1631	int32_t pos);
1632
1633	static UBool resemblesPropertyPattern(RuleCharacterIterator& chars,
1634	int32_t iterOpts);
1635
1636	/**
1637	* Parse the given property pattern at the given parse position
1638	* and set this UnicodeSet to the result.
1639	*
1640	* The original design document is out of date, but still useful.
1641	* Ignore the property and value names:
1642	* https://htmlpreview.github.io/?https://github.com/unicode-org/icu-docs/blob/main/design/unicodeset_properties.html
1643	*
1644	* Recognized syntax:
1645	*
1646	* [:foo:] [:^foo:] - white space not allowed within "[:" or ":]"
1647	* \\p{foo} \\P{foo} - white space not allowed within "\\p" or "\\P"
1648	* \\N{name} - white space not allowed within "\\N"
1649	*
1650	* Other than the above restrictions, Unicode Pattern_White_Space characters are ignored.
1651	* Case is ignored except in "\\p" and "\\P" and "\\N". In 'name' leading
1652	* and trailing space is deleted, and internal runs of whitespace
1653	* are collapsed to a single space.
1654	*
1655	* We support binary properties, enumerated properties, and the
1656	* following non-enumerated properties:
1657	*
1658	* Numeric_Value
1659	* Name
1660	* Unicode_1_Name
1661	*
1662	* @param pattern the pattern string
1663	* @param ppos on entry, the position at which to begin parsing.
1664	* This should be one of the locations marked '^':
1665	*
1666	* [:blah:] \\p{blah} \\P{blah} \\N{name}
1667	* ^ % ^ % ^ % ^ %
1668	*
1669	* On return, the position after the last character parsed, that is,
1670	* the locations marked '%'. If the parse fails, ppos is returned
1671	* unchanged.
1672	* @param ec status
1673	* @return a reference to this.
1674	*/
1675	UnicodeSet& applyPropertyPattern(const UnicodeString& pattern,
1676	ParsePosition& ppos,
1677	UErrorCode &ec);
1678
1679	void applyPropertyPattern(RuleCharacterIterator& chars,
1680	UnicodeString& rebuiltPat,
1681	UErrorCode& ec);
1682
1683	static const UnicodeSet* getInclusions(int32_t src, UErrorCode &status);
1684
1685	/**
1686	* A filter that returns true if the given code point should be
1687	* included in the UnicodeSet being constructed.
1688	*/
1689	typedef UBool (Filter)(UChar32 codePoint, void** context);
1690
1691	/**
1692	* Given a filter, set this UnicodeSet to the code points
1693	* contained by that filter. The filter MUST be
1694	* property-conformant. That is, if it returns value v for one
1695	* code point, then it must return v for all affiliated code
1696	* points, as defined by the inclusions list. See
1697	* getInclusions().
1698	* src is a UPropertySource value.
1699	*/
1700	void applyFilter(Filter filter,
1701	void* context,
1702	const UnicodeSet* inclusions,
1703	UErrorCode &status);
1704
1705	// UCPMap is now stable ICU 63
1706	void applyIntPropertyValue(const UCPMap *map,
1707	UCPMapValueFilter filter, const* void *context,
1708	UErrorCode &errorCode);
1709
1710	/**
1711	* Set the new pattern to cache.
1712	*/
1713	void setPattern(const UnicodeString& newPat) {
1714	setPattern(newPat: newPat.getBuffer(), newPatLen: newPat.length());
1715	}
/**
 * Set the new pattern to cache, given as a char16_t buffer plus length.
 *
 * @param newPat    pointer to the pattern text
 * @param newPatLen length of the pattern text
 *                  (presumably in char16_t code units - confirm)
 */
void setPattern(const char16_t *newPat, int32_t newPatLen);
/**
 * Release existing cached pattern.
 */
void releasePattern();
1721
1722	friend class UnicodeSetIterator;
1723	};
1724
1725
1726
1727	inline bool UnicodeSet::operator!=(const UnicodeSet& o) const {
1728	return !operator==(o);
1729	}
1730
1731	inline UBool UnicodeSet::isFrozen() const {
1732	return (UBool)(bmpSet!=NULL \|\| stringSpan!=NULL);
1733	}
1734
1735	inline UBool UnicodeSet::containsSome(UChar32 start, UChar32 end) const {
1736	return !containsNone(start, end);
1737	}
1738
1739	inline UBool UnicodeSet::containsSome(const UnicodeSet& s) const {
1740	return !containsNone(c: s);
1741	}
1742
1743	inline UBool UnicodeSet::containsSome(const UnicodeString& s) const {
1744	return !containsNone(s);
1745	}
1746
1747	inline UBool UnicodeSet::isBogus() const {
1748	return (UBool)(fFlags & kIsBogus);
1749	}
1750
1751	inline UnicodeSet UnicodeSet::fromUSet(USet uset) {
1752	return reinterpret_cast<UnicodeSet *>(uset);
1753	}
1754
1755	inline const UnicodeSet UnicodeSet::fromUSet(const* USet *uset) {
1756	return reinterpret_cast<const UnicodeSet *>(uset);
1757	}
1758
1759	inline USet *UnicodeSet::toUSet() {
1760	return reinterpret_cast<USet >(this*);
1761	}
1762
1763	inline const USet UnicodeSet::toUSet() const* {
1764	return reinterpret_cast<const USet >(this*);
1765	}
1766
1767	inline int32_t UnicodeSet::span(const UnicodeString &s, int32_t start, USetSpanCondition spanCondition) const {
1768	int32_t sLength=s.length();
1769	if(start<`0`) {
1770	start=`0`;
1771	} else if(start>sLength) {
1772	start=sLength;
1773	}
1774	return start+span(s: s.getBuffer()+start, length: sLength-start, spanCondition);
1775	}
1776
1777	inline int32_t UnicodeSet::spanBack(const UnicodeString &s, int32_t limit, USetSpanCondition spanCondition) const {
1778	int32_t sLength=s.length();
1779	if(limit<`0`) {
1780	limit=`0`;
1781	} else if(limit>sLength) {
1782	limit=sLength;
1783	}
1784	return spanBack(s: s.getBuffer(), length: limit, spanCondition);
1785	}
1786
1787	U_NAMESPACE_END
1788
1789	#endif /* U_SHOW_CPLUSPLUS_API */
1790
1791	#endif
1792

Provided by KDAB

Definitions

UnicodeSet
INITIAL_CAPACITY
kIsBogus
ESerialization
setPattern
operator!=
isFrozen
containsSome
containsSome
containsSome
isBogus
fromUSet
fromUSet
toUSet
toUSet
span

Improve your Profiling and Debugging skills

Find out more

Definitions

source code of include/unicode/uniset.h