unicode.rs - Codebrowser

1	use std::error;
2	use std::fmt;
3	use std::result;
4
5	use crate::hir;
6
7	/// A type alias for errors specific to Unicode handling of classes.
8	pub type Result<T> = result::Result<T, Error>;
9
10	/// An inclusive range of codepoints from a generated file (hence the static
11	/// lifetime).
12	type Range = &'static [(char, char)];
13
/// The ways in which a Unicode class lookup in this module can fail.
///
/// The `Error` trait is deliberately not implemented: this type isn't
/// exported, and every value is converted into some other public error.
#[derive(Debug)]
pub enum Error {
    /// The requested property could not be found.
    PropertyNotFound,
    /// The requested property value could not be found.
    PropertyValueNotFound,
    /// The data backing a Perl character class could not be found.
    // Not used when unicode-perl is enabled.
    #[allow(dead_code)]
    PerlClassNotFound,
}
26
/// Shorthand for the result of an operation that can fail with a
/// [`CaseFoldError`].
pub type FoldResult<T> = result::Result<T, CaseFoldError>;

/// The error reported when Unicode-aware simple case folding cannot be
/// performed.
///
/// Case folding needs the Unicode case mapping tables, and those are only
/// missing when the `unicode-case` feature is disabled. (The feature is
/// enabled by default.)
#[derive(Debug)]
pub struct CaseFoldError(());

impl fmt::Display for CaseFoldError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // The message takes no format arguments, so it is written verbatim.
        f.write_str(
            "Unicode-aware case folding is not available \
             (probably because the unicode-case feature is not enabled)",
        )
    }
}

impl error::Error for CaseFoldError {}
49
/// An error that occurs when the Unicode-aware `\w` class is unavailable.
///
/// This error can occur when the data tables necessary for the Unicode aware
/// Perl character class `\w` are unavailable. This only occurs when the
/// `unicode-perl` feature is disabled. (The feature is enabled by default.)
#[derive(Debug)]
pub struct UnicodeWordError(());

impl error::Error for UnicodeWordError {}

impl fmt::Display for UnicodeWordError {
    /// Writes a fixed, human readable explanation of why `\w` is unavailable.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // `\\w` renders as the two characters `\w`, i.e. the name of the
        // class. No other punctuation belongs around the escaped backslash.
        write!(
            f,
            "Unicode-aware \\w class is not available \
             (probably because the unicode-perl feature is not enabled)"
        )
    }
}
69
70	/// Return an iterator over the equivalence class of simple case mappings
71	/// for the given codepoint. The equivalence class does not include the
72	/// given codepoint.
73	///
74	/// If the equivalence class is empty, then this returns the next scalar
75	/// value that has a non-empty equivalence class, if it exists. If no such
76	/// scalar value exists, then `None` is returned. The point of this behavior
77	/// is to permit callers to avoid calling `simple_fold` more than they need
78	/// to, since there is some cost to fetching the equivalence class.
79	///
80	/// This returns an error if the Unicode case folding tables are not available.
81	pub fn simple_fold(
82	c: char,
83	) -> FoldResult<result::Result<impl Iterator<Item = char>, Option<char>>> {
84	#[cfg(not(feature = "unicode-case"))]
85	fn imp(
86	_: char,
87	) -> FoldResult<result::Result<impl Iterator<Item = char>, Option<char>>>
88	{
89	use std::option::IntoIter;
90	Err::<result::Result<IntoIter<char>, _>, _>(CaseFoldError(()))
91	}
92
93	#[cfg(feature = "unicode-case")]
94	fn imp(
95	c: char,
96	) -> FoldResult<result::Result<impl Iterator<Item = char>, Option<char>>>
97	{
98	use crate::unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE;
99
100	Ok(CASE_FOLDING_SIMPLE
101	.binary_search_by_key(&c, \|&(c1, _)\| c1)
102	.map(\|i\| CASE_FOLDING_SIMPLE[i].1.iter().copied())
103	.map_err(\|i\| {
104	if i >= CASE_FOLDING_SIMPLE.len() {
105	None
106	} else {
107	Some(CASE_FOLDING_SIMPLE[i].0)
108	}
109	}))
110	}
111
112	imp(c)
113	}
114
115	/// Returns true if and only if the given (inclusive) range contains at least
116	/// one Unicode scalar value that has a non-empty non-trivial simple case
117	/// mapping.
118	///
119	/// This function panics if `end < start`.
120	///
121	/// This returns an error if the Unicode case folding tables are not available.
122	pub fn contains_simple_case_mapping(
123	start: char,
124	end: char,
125	) -> FoldResult<bool> {
126	#[cfg(not(feature = "unicode-case"))]
127	fn imp(_: char, _: char) -> FoldResult<bool> {
128	Err(CaseFoldError(()))
129	}
130
131	#[cfg(feature = "unicode-case")]
132	fn imp(start: char, end: char) -> FoldResult<bool> {
133	use crate::unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE;
134	use std::cmp::Ordering;
135
136	assert!(start <= end);
137	Ok(CASE_FOLDING_SIMPLE
138	.binary_search_by(\|&(c, _)\| {
139	if start <= c && c <= end {
140	Ordering::Equal
141	} else if c > end {
142	Ordering::Greater
143	} else {
144	Ordering::Less
145	}
146	})
147	.is_ok())
148	}
149
150	imp(start, end)
151	}
152
/// Describes which Unicode-defined character class a caller wants.
///
/// A class can be requested either by a property name alone or by a property
/// name paired with a property value. The name-only form generally refers to
/// Binary properties (see UTS#44, Table 8), but as a special exception (see
/// UTS#18, Section 1.2) general categories (an enumeration) and scripts (a
/// catalog) are also accepted, as if each of their possible values were a
/// binary property.
///
/// Property names and values are always normalized and canonicalized, so
/// `GC == gc == GeneralCategory == general_category`.
///
/// The lifetime `'a` is the shorter of the lifetimes of the property name and
/// the property value.
#[derive(Debug)]
pub enum ClassQuery<'a> {
    /// Request the class of a Unicode binary property that is named by a
    /// single letter.
    OneLetter(char),
    /// Request the class of a Unicode binary property.
    ///
    /// By special exception (see UTS#18, Section 1.2), general category
    /// values and script values may also be given here, as if they were
    /// binary properties.
    Binary(&'a str),
    /// Request the class of all codepoints whose property (named by
    /// `property_name`) has the given value (named by `property_value`).
    ByValue {
        /// The name of the property.
        property_name: &'a str,
        /// The value of the property.
        property_value: &'a str,
    },
}
186
187	impl<'a> ClassQuery<'a> {
188	fn canonicalize(&self) -> Result<CanonicalClassQuery> {
189	match *self {
190	ClassQuery::OneLetter(c) => self.canonical_binary(&c.to_string()),
191	ClassQuery::Binary(name) => self.canonical_binary(name),
192	ClassQuery::ByValue { property_name, property_value } => {
193	let property_name = symbolic_name_normalize(property_name);
194	let property_value = symbolic_name_normalize(property_value);
195
196	let canon_name = match canonical_prop(&property_name)? {
197	None => return Err(Error::PropertyNotFound),
198	Some(canon_name) => canon_name,
199	};
200	Ok(match canon_name {
201	"General_Category" => {
202	let canon = match canonical_gencat(&property_value)? {
203	None => return Err(Error::PropertyValueNotFound),
204	Some(canon) => canon,
205	};
206	CanonicalClassQuery::GeneralCategory(canon)
207	}
208	"Script" => {
209	let canon = match canonical_script(&property_value)? {
210	None => return Err(Error::PropertyValueNotFound),
211	Some(canon) => canon,
212	};
213	CanonicalClassQuery::Script(canon)
214	}
215	_ => {
216	let vals = match property_values(canon_name)? {
217	None => return Err(Error::PropertyValueNotFound),
218	Some(vals) => vals,
219	};
220	let canon_val =
221	match canonical_value(vals, &property_value) {
222	None => {
223	return Err(Error::PropertyValueNotFound)
224	}
225	Some(canon_val) => canon_val,
226	};
227	CanonicalClassQuery::ByValue {
228	property_name: canon_name,
229	property_value: canon_val,
230	}
231	}
232	})
233	}
234	}
235	}
236
237	fn canonical_binary(&self, name: &str) -> Result<CanonicalClassQuery> {
238	let norm = symbolic_name_normalize(name);
239
240	// This is a special case where 'cf' refers to the 'Format' general
241	// category, but where the 'cf' abbreviation is also an abbreviation
242	// for the 'Case_Folding' property. But we want to treat it as
243	// a general category. (Currently, we don't even support the
244	// 'Case_Folding' property. But if we do in the future, users will be
245	// required to spell it out.)
246	if norm != "cf" {
247	if let Some(canon) = canonical_prop(&norm)? {
248	return Ok(CanonicalClassQuery::Binary(canon));
249	}
250	}
251	if let Some(canon) = canonical_gencat(&norm)? {
252	return Ok(CanonicalClassQuery::GeneralCategory(canon));
253	}
254	if let Some(canon) = canonical_script(&norm)? {
255	return Ok(CanonicalClassQuery::Script(canon));
256	}
257	Err(Error::PropertyNotFound)
258	}
259	}
260
/// The canonicalized counterpart of `ClassQuery`. Unlike `ClassQuery`, it
/// keeps binary properties apart from the flattened general categories and
/// scripts.
#[derive(Debug, Eq, PartialEq)]
enum CanonicalClassQuery {
    /// A binary property, given by its canonical name.
    Binary(&'static str),
    /// A general category, given by its canonical name.
    GeneralCategory(&'static str),
    /// A script, given by its canonical name.
    Script(&'static str),
    /// Any other pairing of a property with a value, both canonicalized.
    ///
    /// By construction, the property name here is never General_Category or
    /// Script, since those two are covered by the variants of the same name.
    ByValue {
        /// The canonical name of the property.
        property_name: &'static str,
        /// The canonical value of the property.
        property_value: &'static str,
    },
}
285
286	/// Looks up a Unicode class given a query. If one doesn't exist, then
287	/// `None` is returned.
288	pub fn class(query: ClassQuery<'_>) -> Result<hir::ClassUnicode> {
289	use self::CanonicalClassQuery::*;
290
291	match query.canonicalize()? {
292	Binary(name) => bool_property(name),
293	GeneralCategory(name) => gencat(name),
294	Script(name) => script(name),
295	ByValue { property_name: "Age", property_value } => {
296	let mut class = hir::ClassUnicode::empty();
297	for set in ages(property_value)? {
298	class.union(&hir_class(set));
299	}
300	Ok(class)
301	}
302	ByValue { property_name: "Script_Extensions", property_value } => {
303	script_extension(property_value)
304	}
305	ByValue {
306	property_name: "Grapheme_Cluster_Break",
307	property_value,
308	} => gcb(property_value),
309	ByValue { property_name: "Sentence_Break", property_value } => {
310	sb(property_value)
311	}
312	ByValue { property_name: "Word_Break", property_value } => {
313	wb(property_value)
314	}
315	_ => {
316	// What else should we support?
317	Err(Error::PropertyNotFound)
318	}
319	}
320	}
321
322	/// Returns a Unicode aware class for \w.
323	///
324	/// This returns an error if the data is not available for \w.
325	pub fn perl_word() -> Result<hir::ClassUnicode> {
326	#[cfg(not(feature = "unicode-perl"))]
327	fn imp() -> Result<hir::ClassUnicode> {
328	Err(Error::PerlClassNotFound)
329	}
330
331	#[cfg(feature = "unicode-perl")]
332	fn imp() -> Result<hir::ClassUnicode> {
333	use crate::unicode_tables::perl_word::PERL_WORD;
334	Ok(hir_class(PERL_WORD))
335	}
336
337	imp()
338	}
339
340	/// Returns a Unicode aware class for \s.
341	///
342	/// This returns an error if the data is not available for \s.
343	pub fn perl_space() -> Result<hir::ClassUnicode> {
344	#[cfg(not(any(feature = "unicode-perl", feature = "unicode-bool")))]
345	fn imp() -> Result<hir::ClassUnicode> {
346	Err(Error::PerlClassNotFound)
347	}
348
349	#[cfg(all(feature = "unicode-perl", not(feature = "unicode-bool")))]
350	fn imp() -> Result<hir::ClassUnicode> {
351	use crate::unicode_tables::perl_space::WHITE_SPACE;
352	Ok(hir_class(WHITE_SPACE))
353	}
354
355	#[cfg(feature = "unicode-bool")]
356	fn imp() -> Result<hir::ClassUnicode> {
357	use crate::unicode_tables::property_bool::WHITE_SPACE;
358	Ok(hir_class(WHITE_SPACE))
359	}
360
361	imp()
362	}
363
364	/// Returns a Unicode aware class for \d.
365	///
366	/// This returns an error if the data is not available for \d.
367	pub fn perl_digit() -> Result<hir::ClassUnicode> {
368	#[cfg(not(any(feature = "unicode-perl", feature = "unicode-gencat")))]
369	fn imp() -> Result<hir::ClassUnicode> {
370	Err(Error::PerlClassNotFound)
371	}
372
373	#[cfg(all(feature = "unicode-perl", not(feature = "unicode-gencat")))]
374	fn imp() -> Result<hir::ClassUnicode> {
375	use crate::unicode_tables::perl_decimal::DECIMAL_NUMBER;
376	Ok(hir_class(DECIMAL_NUMBER))
377	}
378
379	#[cfg(feature = "unicode-gencat")]
380	fn imp() -> Result<hir::ClassUnicode> {
381	use crate::unicode_tables::general_category::DECIMAL_NUMBER;
382	Ok(hir_class(DECIMAL_NUMBER))
383	}
384
385	imp()
386	}
387
388	/// Build a Unicode HIR class from a sequence of Unicode scalar value ranges.
389	pub fn hir_class(ranges: &[(char, char)]) -> hir::ClassUnicode {
390	let hir_ranges: Vec<hir::ClassUnicodeRange> = ranges
391	.iter()
392	.map(\|&(s, e)\| hir::ClassUnicodeRange::new(s, e))
393	.collect();
394	hir::ClassUnicode::new(hir_ranges)
395	}
396
397	/// Returns true only if the given codepoint is in the `\w` character class.
398	///
399	/// If the `unicode-perl` feature is not enabled, then this returns an error.
400	pub fn is_word_character(c: char) -> result::Result<bool, UnicodeWordError> {
401	#[cfg(not(feature = "unicode-perl"))]
402	fn imp(_: char) -> result::Result<bool, UnicodeWordError> {
403	Err(UnicodeWordError(()))
404	}
405
406	#[cfg(feature = "unicode-perl")]
407	fn imp(c: char) -> result::Result<bool, UnicodeWordError> {
408	use crate::is_word_byte;
409	use crate::unicode_tables::perl_word::PERL_WORD;
410	use std::cmp::Ordering;
411
412	if c <= `0x7F` as char && is_word_byte(c as u8) {
413	return Ok(`true`);
414	}
415	Ok(PERL_WORD
416	.binary_search_by(\|&(start, end)\| {
417	if start <= c && c <= end {
418	Ordering::Equal
419	} else if start > c {
420	Ordering::Greater
421	} else {
422	Ordering::Less
423	}
424	})
425	.is_ok())
426	}
427
428	imp(c)
429	}
430
/// The table of values belonging to one specific property.
///
/// Each entry is a pair: the normalized property value first, followed by
/// the canonical property value it stands for.
type PropertyValues = &'static [(&'static str, &'static str)];
437
438	fn canonical_gencat(normalized_value: &str) -> Result<Option<&'static str>> {
439	Ok(match normalized_value {
440	"any" => Some("Any"),
441	"assigned" => Some("Assigned"),
442	"ascii" => Some("ASCII"),
443	_ => {
444	let gencats = property_values("General_Category")?.unwrap();
445	canonical_value(gencats, normalized_value)
446	}
447	})
448	}
449
450	fn canonical_script(normalized_value: &str) -> Result<Option<&'static str>> {
451	let scripts = property_values("Script")?.unwrap();
452	Ok(canonical_value(scripts, normalized_value))
453	}
454
455	/// Find the canonical property name for the given normalized property name.
456	///
457	/// If no such property exists, then `None` is returned.
458	///
459	/// The normalized property name must have been normalized according to
460	/// UAX44 LM3, which can be done using `symbolic_name_normalize`.
461	///
462	/// If the property names data is not available, then an error is returned.
463	fn canonical_prop(normalized_name: &str) -> Result<Option<&'static str>> {
464	#[cfg(not(any(
465	feature = "unicode-age",
466	feature = "unicode-bool",
467	feature = "unicode-gencat",
468	feature = "unicode-perl",
469	feature = "unicode-script",
470	feature = "unicode-segment",
471	)))]
472	fn imp(_: &str) -> Result<Option<&'static str>> {
473	Err(Error::PropertyNotFound)
474	}
475
476	#[cfg(any(
477	feature = "unicode-age",
478	feature = "unicode-bool",
479	feature = "unicode-gencat",
480	feature = "unicode-perl",
481	feature = "unicode-script",
482	feature = "unicode-segment",
483	))]
484	fn imp(name: &str) -> Result<Option<&'static str>> {
485	use crate::unicode_tables::property_names::PROPERTY_NAMES;
486
487	Ok(PROPERTY_NAMES
488	.binary_search_by_key(&name, \|&(n, _)\| n)
489	.ok()
490	.map(\|i\| PROPERTY_NAMES[i].1))
491	}
492
493	imp(normalized_name)
494	}
495
496	/// Find the canonical property value for the given normalized property
497	/// value.
498	///
499	/// The given property values should correspond to the values for the property
500	/// under question, which can be found using `property_values`.
501	///
502	/// If no such property value exists, then `None` is returned.
503	///
504	/// The normalized property value must have been normalized according to
505	/// UAX44 LM3, which can be done using `symbolic_name_normalize`.
506	fn canonical_value(
507	vals: PropertyValues,
508	normalized_value: &str,
509	) -> Option<&'static str> {
510	vals.binary_search_by_key(&normalized_value, \|&(n, _)\| n)
511	.ok()
512	.map(\|i\| vals[i].1)
513	}
514
515	/// Return the table of property values for the given property name.
516	///
517	/// If the property values data is not available, then an error is returned.
518	fn property_values(
519	canonical_property_name: &'static str,
520	) -> Result<Option<PropertyValues>> {
521	#[cfg(not(any(
522	feature = "unicode-age",
523	feature = "unicode-bool",
524	feature = "unicode-gencat",
525	feature = "unicode-perl",
526	feature = "unicode-script",
527	feature = "unicode-segment",
528	)))]
529	fn imp(_: &'static str) -> Result<Option<PropertyValues>> {
530	Err(Error::PropertyValueNotFound)
531	}
532
533	#[cfg(any(
534	feature = "unicode-age",
535	feature = "unicode-bool",
536	feature = "unicode-gencat",
537	feature = "unicode-perl",
538	feature = "unicode-script",
539	feature = "unicode-segment",
540	))]
541	fn imp(name: &'static str) -> Result<Option<PropertyValues>> {
542	use crate::unicode_tables::property_values::PROPERTY_VALUES;
543
544	Ok(PROPERTY_VALUES
545	.binary_search_by_key(&name, \|&(n, _)\| n)
546	.ok()
547	.map(\|i\| PROPERTY_VALUES[i].1))
548	}
549
550	imp(canonical_property_name)
551	}
552
553	// This is only used in some cases, but small enough to just let it be dead
554	// instead of figuring out (and maintaining) the right set of features.
555	#[allow(dead_code)]
556	fn property_set(
557	name_map: &'static [(&'static str, Range)],
558	canonical: &'static str,
559	) -> Option<Range> {
560	name_map
561	.binary_search_by_key(&canonical, \|x\| x.0)
562	.ok()
563	.map(\|i\| name_map[i].1)
564	}
565
566	/// Returns an iterator over Unicode Age sets. Each item corresponds to a set
567	/// of codepoints that were added in a particular revision of Unicode. The
568	/// iterator yields items in chronological order.
569	///
570	/// If the given age value isn't valid or if the data isn't available, then an
571	/// error is returned instead.
572	fn ages(canonical_age: &str) -> Result<impl Iterator<Item = Range>> {
573	#[cfg(not(feature = "unicode-age"))]
574	fn imp(_: &str) -> Result<impl Iterator<Item = Range>> {
575	use std::option::IntoIter;
576	Err::<IntoIter<Range>, _>(Error::PropertyNotFound)
577	}
578
579	#[cfg(feature = "unicode-age")]
580	fn imp(canonical_age: &str) -> Result<impl Iterator<Item = Range>> {
581	use crate::unicode_tables::age;
582
583	const AGES: &[(&str, Range)] = &[
584	("V1_1", age::V1_1),
585	("V2_0", age::V2_0),
586	("V2_1", age::V2_1),
587	("V3_0", age::V3_0),
588	("V3_1", age::V3_1),
589	("V3_2", age::V3_2),
590	("V4_0", age::V4_0),
591	("V4_1", age::V4_1),
592	("V5_0", age::V5_0),
593	("V5_1", age::V5_1),
594	("V5_2", age::V5_2),
595	("V6_0", age::V6_0),
596	("V6_1", age::V6_1),
597	("V6_2", age::V6_2),
598	("V6_3", age::V6_3),
599	("V7_0", age::V7_0),
600	("V8_0", age::V8_0),
601	("V9_0", age::V9_0),
602	("V10_0", age::V10_0),
603	("V11_0", age::V11_0),
604	("V12_0", age::V12_0),
605	("V12_1", age::V12_1),
606	("V13_0", age::V13_0),
607	("V14_0", age::V14_0),
608	("V15_0", age::V15_0),
609	];
610	assert_eq!(AGES.len(), age::BY_NAME.len(), "ages are out of sync");
611
612	let pos = AGES.iter().position(\|&(age, _)\| canonical_age == age);
613	match pos {
614	None => Err(Error::PropertyValueNotFound),
615	Some(i) => Ok(AGES[..=i].iter().map(\|&(_, classes)\| classes)),
616	}
617	}
618
619	imp(canonical_age)
620	}
621
622	/// Returns the Unicode HIR class corresponding to the given general category.
623	///
624	/// Name canonicalization is assumed to be performed by the caller.
625	///
626	/// If the given general category could not be found, or if the general
627	/// category data is not available, then an error is returned.
628	fn gencat(canonical_name: &'static str) -> Result<hir::ClassUnicode> {
629	#[cfg(not(feature = "unicode-gencat"))]
630	fn imp(_: &'static str) -> Result<hir::ClassUnicode> {
631	Err(Error::PropertyNotFound)
632	}
633
634	#[cfg(feature = "unicode-gencat")]
635	fn imp(name: &'static str) -> Result<hir::ClassUnicode> {
636	use crate::unicode_tables::general_category::BY_NAME;
637	match name {
638	"ASCII" => Ok(hir_class(&[('`\0`', '`\x7F`')])),
639	"Any" => Ok(hir_class(&[('`\0`', '`\u{10FFFF}`')])),
640	"Assigned" => {
641	let mut cls = gencat("Unassigned")?;
642	cls.negate();
643	Ok(cls)
644	}
645	name => property_set(BY_NAME, name)
646	.map(hir_class)
647	.ok_or(Error::PropertyValueNotFound),
648	}
649	}
650
651	match canonical_name {
652	"Decimal_Number" => perl_digit(),
653	name => imp(name),
654	}
655	}
656
657	/// Returns the Unicode HIR class corresponding to the given script.
658	///
659	/// Name canonicalization is assumed to be performed by the caller.
660	///
661	/// If the given script could not be found, or if the script data is not
662	/// available, then an error is returned.
663	fn script(canonical_name: &'static str) -> Result<hir::ClassUnicode> {
664	#[cfg(not(feature = "unicode-script"))]
665	fn imp(_: &'static str) -> Result<hir::ClassUnicode> {
666	Err(Error::PropertyNotFound)
667	}
668
669	#[cfg(feature = "unicode-script")]
670	fn imp(name: &'static str) -> Result<hir::ClassUnicode> {
671	use crate::unicode_tables::script::BY_NAME;
672	property_set(BY_NAME, name)
673	.map(hir_class)
674	.ok_or(Error::PropertyValueNotFound)
675	}
676
677	imp(canonical_name)
678	}
679
680	/// Returns the Unicode HIR class corresponding to the given script extension.
681	///
682	/// Name canonicalization is assumed to be performed by the caller.
683	///
684	/// If the given script extension could not be found, or if the script data is
685	/// not available, then an error is returned.
686	fn script_extension(
687	canonical_name: &'static str,
688	) -> Result<hir::ClassUnicode> {
689	#[cfg(not(feature = "unicode-script"))]
690	fn imp(_: &'static str) -> Result<hir::ClassUnicode> {
691	Err(Error::PropertyNotFound)
692	}
693
694	#[cfg(feature = "unicode-script")]
695	fn imp(name: &'static str) -> Result<hir::ClassUnicode> {
696	use crate::unicode_tables::script_extension::BY_NAME;
697	property_set(BY_NAME, name)
698	.map(hir_class)
699	.ok_or(Error::PropertyValueNotFound)
700	}
701
702	imp(canonical_name)
703	}
704
705	/// Returns the Unicode HIR class corresponding to the given Unicode boolean
706	/// property.
707	///
708	/// Name canonicalization is assumed to be performed by the caller.
709	///
710	/// If the given boolean property could not be found, or if the boolean
711	/// property data is not available, then an error is returned.
712	fn bool_property(canonical_name: &'static str) -> Result<hir::ClassUnicode> {
713	#[cfg(not(feature = "unicode-bool"))]
714	fn imp(_: &'static str) -> Result<hir::ClassUnicode> {
715	Err(Error::PropertyNotFound)
716	}
717
718	#[cfg(feature = "unicode-bool")]
719	fn imp(name: &'static str) -> Result<hir::ClassUnicode> {
720	use crate::unicode_tables::property_bool::BY_NAME;
721	property_set(BY_NAME, name)
722	.map(hir_class)
723	.ok_or(Error::PropertyNotFound)
724	}
725
726	match canonical_name {
727	"Decimal_Number" => perl_digit(),
728	"White_Space" => perl_space(),
729	name => imp(name),
730	}
731	}
732
733	/// Returns the Unicode HIR class corresponding to the given grapheme cluster
734	/// break property.
735	///
736	/// Name canonicalization is assumed to be performed by the caller.
737	///
738	/// If the given property could not be found, or if the corresponding data is
739	/// not available, then an error is returned.
740	fn gcb(canonical_name: &'static str) -> Result<hir::ClassUnicode> {
741	#[cfg(not(feature = "unicode-segment"))]
742	fn imp(_: &'static str) -> Result<hir::ClassUnicode> {
743	Err(Error::PropertyNotFound)
744	}
745
746	#[cfg(feature = "unicode-segment")]
747	fn imp(name: &'static str) -> Result<hir::ClassUnicode> {
748	use crate::unicode_tables::grapheme_cluster_break::BY_NAME;
749	property_set(BY_NAME, name)
750	.map(hir_class)
751	.ok_or(Error::PropertyValueNotFound)
752	}
753
754	imp(canonical_name)
755	}
756
757	/// Returns the Unicode HIR class corresponding to the given word break
758	/// property.
759	///
760	/// Name canonicalization is assumed to be performed by the caller.
761	///
762	/// If the given property could not be found, or if the corresponding data is
763	/// not available, then an error is returned.
764	fn wb(canonical_name: &'static str) -> Result<hir::ClassUnicode> {
765	#[cfg(not(feature = "unicode-segment"))]
766	fn imp(_: &'static str) -> Result<hir::ClassUnicode> {
767	Err(Error::PropertyNotFound)
768	}
769
770	#[cfg(feature = "unicode-segment")]
771	fn imp(name: &'static str) -> Result<hir::ClassUnicode> {
772	use crate::unicode_tables::word_break::BY_NAME;
773	property_set(BY_NAME, name)
774	.map(hir_class)
775	.ok_or(Error::PropertyValueNotFound)
776	}
777
778	imp(canonical_name)
779	}
780
781	/// Returns the Unicode HIR class corresponding to the given sentence
782	/// break property.
783	///
784	/// Name canonicalization is assumed to be performed by the caller.
785	///
786	/// If the given property could not be found, or if the corresponding data is
787	/// not available, then an error is returned.
788	fn sb(canonical_name: &'static str) -> Result<hir::ClassUnicode> {
789	#[cfg(not(feature = "unicode-segment"))]
790	fn imp(_: &'static str) -> Result<hir::ClassUnicode> {
791	Err(Error::PropertyNotFound)
792	}
793
794	#[cfg(feature = "unicode-segment")]
795	fn imp(name: &'static str) -> Result<hir::ClassUnicode> {
796	use crate::unicode_tables::sentence_break::BY_NAME;
797	property_set(BY_NAME, name)
798	.map(hir_class)
799	.ok_or(Error::PropertyValueNotFound)
800	}
801
802	imp(canonical_name)
803	}
804
805	/// Like symbolic_name_normalize_bytes, but operates on a string.
806	fn symbolic_name_normalize(x: &str) -> String {
807	let mut tmp = x.as_bytes().to_vec();
808	let len = symbolic_name_normalize_bytes(&mut tmp).len();
809	tmp.truncate(len);
810	// This should always succeed because `symbolic_name_normalize_bytes`
811	// guarantees that `&tmp[..len]` is always valid UTF-8.
812	//
813	// N.B. We could avoid the additional UTF-8 check here, but it's unlikely
814	// to be worth skipping the additional safety check. A benchmark must
815	// justify it first.
816	String::from_utf8(tmp).unwrap()
817	}
818
/// Normalizes a symbolic name in place following UAX44-LM3 and returns the
/// normalized prefix of `slice`.
///
/// "Symbolic names" are typically property names and property value
/// aliases. This should not be applied to property string values.
///
/// Whatever bytes `slice` holds on entry, the returned slice is guaranteed
/// to be valid UTF-8.
///
/// See: https://unicode.org/reports/tr44/#UAX44-LM3
fn symbolic_name_normalize_bytes(slice: &mut [u8]) -> &mut [u8] {
    // No place in the standard seems to specify a particular structure for
    // property names/aliases (unlike character names), so the assumption
    // here is that they are ASCII only. Anything else is dropped.

    // A leading "is" in any letter case is ignored.
    let starts_with_is =
        slice.len() >= 2 && slice[..2].eq_ignore_ascii_case(b"is");
    let start = if starts_with_is { 2 } else { 0 };

    let mut next_write = 0;
    for i in start..slice.len() {
        // VALIDITY ARGUMENT: only ASCII bytes are ever written back, and an
        // all-ASCII slice is valid UTF-8. Every non-ASCII byte is dropped.
        let b = slice[i];
        match b {
            b' ' | b'_' | b'-' => {}
            _ if b.is_ascii() => {
                slice[next_write] = b.to_ascii_lowercase();
                next_write += 1;
            }
            _ => {}
        }
    }

    // Special case: ISO_Comment is abbreviated 'isc'. Because 'is' prefixes
    // are generally ignored, 'isc' would be reduced to 'c', which would make
    // 'c' an alias for 'ISO_Comment' when it is really an alias for the
    // 'Other' general category. Restore the full abbreviation instead.
    if starts_with_is && next_write == 1 && slice[0] == b'c' {
        slice[..3].copy_from_slice(b"isc");
        next_write = 3;
    }
    &mut slice[..next_write]
}
873
#[cfg(test)]
mod tests {
    use super::{
        contains_simple_case_mapping, simple_fold, symbolic_name_normalize,
        symbolic_name_normalize_bytes,
    };

    /// Returns the equivalence class of `c`, panicking if it is empty.
    #[cfg(feature = "unicode-case")]
    fn simple_fold_ok(c: char) -> impl Iterator<Item = char> {
        simple_fold(c).unwrap().unwrap()
    }

    /// Returns the "next codepoint with a mapping" hint for `c`, panicking
    /// if `c` itself has a non-empty equivalence class.
    #[cfg(feature = "unicode-case")]
    fn simple_fold_err(c: char) -> Option<char> {
        match simple_fold(c).unwrap() {
            Ok(_) => unreachable!("simple_fold returned Ok iterator"),
            Err(next) => next,
        }
    }

    #[cfg(feature = "unicode-case")]
    fn contains_case_map(start: char, end: char) -> bool {
        contains_simple_case_mapping(start, end).unwrap()
    }

    #[test]
    #[cfg(feature = "unicode-case")]
    fn simple_fold_k() {
        // U+212A is KELVIN SIGN. It is written as an escape because it is
        // visually indistinguishable from (and easily normalized to) the
        // ASCII letter 'K'.
        let xs: Vec<char> = simple_fold_ok('k').collect();
        assert_eq!(xs, vec!['K', '\u{212A}']);

        let xs: Vec<char> = simple_fold_ok('K').collect();
        assert_eq!(xs, vec!['k', '\u{212A}']);

        let xs: Vec<char> = simple_fold_ok('\u{212A}').collect();
        assert_eq!(xs, vec!['K', 'k']);
    }

    #[test]
    #[cfg(feature = "unicode-case")]
    fn simple_fold_a() {
        let xs: Vec<char> = simple_fold_ok('a').collect();
        assert_eq!(xs, vec!['A']);

        let xs: Vec<char> = simple_fold_ok('A').collect();
        assert_eq!(xs, vec!['a']);
    }

    #[test]
    #[cfg(feature = "unicode-case")]
    fn simple_fold_empty() {
        assert_eq!(Some('A'), simple_fold_err('?'));
        assert_eq!(Some('A'), simple_fold_err('@'));
        assert_eq!(Some('a'), simple_fold_err('['));
        assert_eq!(Some('Ⰰ'), simple_fold_err('☃'));
    }

    #[test]
    #[cfg(feature = "unicode-case")]
    fn simple_fold_max() {
        assert_eq!(None, simple_fold_err('\u{10FFFE}'));
        assert_eq!(None, simple_fold_err('\u{10FFFF}'));
    }

    #[test]
    #[cfg(not(feature = "unicode-case"))]
    fn simple_fold_disabled() {
        assert!(simple_fold('a').is_err());
    }

    #[test]
    #[cfg(feature = "unicode-case")]
    fn range_contains() {
        assert!(contains_case_map('A', 'A'));
        assert!(contains_case_map('Z', 'Z'));
        assert!(contains_case_map('A', 'Z'));
        assert!(contains_case_map('@', 'A'));
        assert!(contains_case_map('Z', '['));
        assert!(contains_case_map('☃', 'Ⰰ'));

        assert!(!contains_case_map('[', '['));
        assert!(!contains_case_map('[', '`'));

        assert!(!contains_case_map('☃', '☃'));
    }

    #[test]
    #[cfg(not(feature = "unicode-case"))]
    fn range_contains_disabled() {
        assert!(contains_simple_case_mapping('a', 'a').is_err());
    }

    #[test]
    #[cfg(feature = "unicode-gencat")]
    fn regression_466() {
        use super::{CanonicalClassQuery, ClassQuery};

        let q = ClassQuery::OneLetter('C');
        assert_eq!(
            q.canonicalize().unwrap(),
            CanonicalClassQuery::GeneralCategory("Other")
        );
    }

    #[test]
    fn sym_normalize() {
        let sym_norm = symbolic_name_normalize;

        assert_eq!(sym_norm("Line_Break"), "linebreak");
        assert_eq!(sym_norm("Line-break"), "linebreak");
        assert_eq!(sym_norm("linebreak"), "linebreak");
        assert_eq!(sym_norm("BA"), "ba");
        assert_eq!(sym_norm("ba"), "ba");
        assert_eq!(sym_norm("Greek"), "greek");
        assert_eq!(sym_norm("isGreek"), "greek");
        assert_eq!(sym_norm("IS_Greek"), "greek");
        assert_eq!(sym_norm("isc"), "isc");
        assert_eq!(sym_norm("is c"), "isc");
        assert_eq!(sym_norm("is_c"), "isc");
    }

    #[test]
    fn valid_utf8_symbolic() {
        // The non-ASCII byte must be dropped so that the output is UTF-8.
        let mut x = b"abc\xFFxyz".to_vec();
        let y = symbolic_name_normalize_bytes(&mut x);
        assert_eq!(y, b"abcxyz");
    }
}
1002