unicode.rs - Codebrowser

1	use alloc::{
2	string::{String, ToString},
3	vec::Vec,
4	};
5
6	use crate::hir;
7
/// A slice of inclusive codepoint ranges, each written as a `(start, end)`
/// pair. The data comes from a generated file (hence the static lifetime).
type Range = &'static [(char, char)];
11
/// An error that occurs when dealing with Unicode.
///
/// We don't impl the Error trait here because these always get converted
/// into other public errors. (This error type isn't exported.)
#[derive(Debug)]
pub enum Error {
    /// The requested property could not be resolved, is not supported, or
    /// the data needed to answer the query was not compiled in.
    PropertyNotFound,
    /// The property was resolved, but the requested value of that property
    /// could not be (or the property value data was not compiled in).
    PropertyValueNotFound,
    /// The data backing a Perl character class (`\w`, `\s` or `\d`) is not
    /// available.
    // Not used when unicode-perl is enabled.
    #[allow(dead_code)]
    PerlClassNotFound,
}
24
/// The error reported when Unicode-aware simple case folding cannot be
/// performed.
///
/// Case folding relies on generated case mapping tables. Those tables are
/// only compiled in when the `unicode-case` feature is enabled (which it is
/// by default), so this error only shows up when that feature is turned off.
#[derive(Debug)]
pub struct CaseFoldError(());

impl core::fmt::Display for CaseFoldError {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        // The message has no interpolated parts, so it is written directly.
        f.write_str(
            "Unicode-aware case folding is not available \
             (probably because the unicode-case feature is not enabled)",
        )
    }
}

#[cfg(feature = "std")]
impl std::error::Error for CaseFoldError {}
45
/// The error reported when the Unicode-aware `\w` class cannot be consulted.
///
/// Answering `\w` queries with Unicode semantics relies on generated data
/// tables. Those tables are only compiled in when the `unicode-perl` feature
/// is enabled (which it is by default), so this error only shows up when that
/// feature is turned off.
#[derive(Debug)]
pub struct UnicodeWordError(());

impl core::fmt::Display for UnicodeWordError {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        // The message has no interpolated parts, so it is written directly.
        f.write_str(
            "Unicode-aware \\w class is not available \
             (probably because the unicode-perl feature is not enabled)",
        )
    }
}

#[cfg(feature = "std")]
impl std::error::Error for UnicodeWordError {}
66
67	/// A state oriented traverser of the simple case folding table.
68	///
69	/// A case folder can be constructed via `SimpleCaseFolder::new()`, which will
70	/// return an error if the underlying case folding table is unavailable.
71	///
72	/// After construction, it is expected that callers will use
73	/// `SimpleCaseFolder::mapping` by calling it with codepoints in strictly
74	/// increasing order. For example, calling it on `b` and then on `a` is illegal
75	/// and will result in a panic.
76	///
77	/// The main idea of this type is that it tries hard to make mapping lookups
78	/// fast by exploiting the structure of the underlying table, and the ordering
79	/// assumption enables this.
80	#[derive(Debug)]
81	pub struct SimpleCaseFolder {
82	/// The simple case fold table. It's a sorted association list, where the
83	/// keys are Unicode scalar values and the values are the corresponding
84	/// equivalence class (not including the key) of the "simple" case folded
85	/// Unicode scalar values.
86	table: &'static [(char, &'static [char])],
87	/// The last codepoint that was used for a lookup.
88	last: Option<char>,
89	/// The index to the entry in `table` corresponding to the smallest key `k`
90	/// such that `k > k0`, where `k0` is the most recent key lookup. Note that
91	/// in particular, `k0` may not be in the table!
92	next: usize,
93	}
94
95	impl SimpleCaseFolder {
96	/// Create a new simple case folder, returning an error if the underlying
97	/// case folding table is unavailable.
98	pub fn new() -> Result<SimpleCaseFolder, CaseFoldError> {
99	#[cfg(not(feature = "unicode-case"))]
100	{
101	Err(CaseFoldError(()))
102	}
103	#[cfg(feature = "unicode-case")]
104	{
105	Ok(SimpleCaseFolder {
106	table: crate::unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE,
107	last: None,
108	next: `0`,
109	})
110	}
111	}
112
113	/// Return the equivalence class of case folded codepoints for the given
114	/// codepoint. The equivalence class returned never includes the codepoint
115	/// given. If the given codepoint has no case folded codepoints (i.e.,
116	/// no entry in the underlying case folding table), then this returns an
117	/// empty slice.
118	///
119	/// # Panics
120	///
121	/// This panics when called with a `c` that is less than or equal to the
122	/// previous call. In other words, callers need to use this method with
123	/// strictly increasing values of `c`.
124	pub fn mapping(&mut self, c: char) -> &'static [char] {
125	if let Some(last) = self.last {
126	assert!(
127	last < c,
128	"got codepoint U+{:X} which occurs before \
129	last codepoint U+{:X}",
130	u32::from(c),
131	u32::from(last),
132	);
133	}
134	self.last = Some(c);
135	if self.next >= self.table.len() {
136	return &[];
137	}
138	let (k, v) = self.table[self.next];
139	if k == c {
140	self.next += `1`;
141	return v;
142	}
143	match self.get(c) {
144	Err(i) => {
145	self.next = i;
146	&[]
147	}
148	Ok(i) => {
149	// Since we require lookups to proceed
150	// in order, anything we find should be
151	// after whatever we thought might be
152	// next. Otherwise, the caller is either
153	// going out of order or we would have
154	// found our next key at 'self.next'.
155	assert!(i > self.next);
156	self.next = i + `1`;
157	self.table[i].1
158	}
159	}
160	}
161
162	/// Returns true if and only if the given range overlaps with any region
163	/// of the underlying case folding table. That is, when true, there exists
164	/// at least one codepoint in the inclusive range `[start, end]` that has
165	/// a non-trivial equivalence class of case folded codepoints. Conversely,
166	/// when this returns false, all codepoints in the range `[start, end]`
167	/// correspond to the trivial equivalence class of case folded codepoints,
168	/// i.e., itself.
169	///
170	/// This is useful to call before iterating over the codepoints in the
171	/// range and looking up the mapping for each. If you know none of the
172	/// mappings will return anything, then you might be able to skip doing it
173	/// altogether.
174	///
175	/// # Panics
176	///
177	/// This panics when `end < start`.
178	pub fn overlaps(&self, start: char, end: char) -> bool {
179	use core::cmp::Ordering;
180
181	assert!(start <= end);
182	self.table
183	.binary_search_by(\|&(c, _)\| {
184	if start <= c && c <= end {
185	Ordering::Equal
186	} else if c > end {
187	Ordering::Greater
188	} else {
189	Ordering::Less
190	}
191	})
192	.is_ok()
193	}
194
195	/// Returns the index at which `c` occurs in the simple case fold table. If
196	/// `c` does not occur, then this returns an `i` such that `table[i-1].0 <
197	/// c` and `table[i].0 > c`.
198	fn get(&self, c: char) -> Result<usize, usize> {
199	self.table.binary_search_by_key(&c, \|&(c1, _)\| c1)
200	}
201	}
202
203	/// A query for finding a character class defined by Unicode. This supports
204	/// either use of a property name directly, or lookup by property value. The
205	/// former generally refers to Binary properties (see UTS#44, Table 8), but
206	/// as a special exception (see UTS#18, Section 1.2) both general categories
207	/// (an enumeration) and scripts (a catalog) are supported as if each of their
208	/// possible values were a binary property.
209	///
210	/// In all circumstances, property names and values are normalized and
211	/// canonicalized. That is, `GC == gc == GeneralCategory == general_category`.
212	///
213	/// The lifetime `'a` refers to the shorter of the lifetimes of property name
214	/// and property value.
215	#[derive(Debug)]
216	pub enum ClassQuery<'a> {
217	/// Return a class corresponding to a Unicode binary property, named by
218	/// a single letter.
219	OneLetter(char),
220	/// Return a class corresponding to a Unicode binary property.
221	///
222	/// Note that, by special exception (see UTS#18, Section 1.2), both
223	/// general category values and script values are permitted here as if
224	/// they were a binary property.
225	Binary(&'a str),
226	/// Return a class corresponding to all codepoints whose property
227	/// (identified by `property_name`) corresponds to the given value
228	/// (identified by `property_value`).
229	ByValue {
230	/// A property name.
231	property_name: &'a str,
232	/// A property value.
233	property_value: &'a str,
234	},
235	}
236
237	impl<'a> ClassQuery<'a> {
238	fn canonicalize(&self) -> Result<CanonicalClassQuery, Error> {
239	match *self {
240	ClassQuery::OneLetter(c) => self.canonical_binary(&c.to_string()),
241	ClassQuery::Binary(name) => self.canonical_binary(name),
242	ClassQuery::ByValue { property_name, property_value } => {
243	let property_name = symbolic_name_normalize(property_name);
244	let property_value = symbolic_name_normalize(property_value);
245
246	let canon_name = match canonical_prop(&property_name)? {
247	None => return Err(Error::PropertyNotFound),
248	Some(canon_name) => canon_name,
249	};
250	Ok(match canon_name {
251	"General_Category" => {
252	let canon = match canonical_gencat(&property_value)? {
253	None => return Err(Error::PropertyValueNotFound),
254	Some(canon) => canon,
255	};
256	CanonicalClassQuery::GeneralCategory(canon)
257	}
258	"Script" => {
259	let canon = match canonical_script(&property_value)? {
260	None => return Err(Error::PropertyValueNotFound),
261	Some(canon) => canon,
262	};
263	CanonicalClassQuery::Script(canon)
264	}
265	_ => {
266	let vals = match property_values(canon_name)? {
267	None => return Err(Error::PropertyValueNotFound),
268	Some(vals) => vals,
269	};
270	let canon_val =
271	match canonical_value(vals, &property_value) {
272	None => {
273	return Err(Error::PropertyValueNotFound)
274	}
275	Some(canon_val) => canon_val,
276	};
277	CanonicalClassQuery::ByValue {
278	property_name: canon_name,
279	property_value: canon_val,
280	}
281	}
282	})
283	}
284	}
285	}
286
287	fn canonical_binary(
288	&self,
289	name: &str,
290	) -> Result<CanonicalClassQuery, Error> {
291	let norm = symbolic_name_normalize(name);
292
293	// This is a special case where 'cf' refers to the 'Format' general
294	// category, but where the 'cf' abbreviation is also an abbreviation
295	// for the 'Case_Folding' property. But we want to treat it as
296	// a general category. (Currently, we don't even support the
297	// 'Case_Folding' property. But if we do in the future, users will be
298	// required to spell it out.)
299	//
300	// Also 'sc' refers to the 'Currency_Symbol' general category, but is
301	// also the abbreviation for the 'Script' property. So we avoid calling
302	// 'canonical_prop' for it too, which would erroneously normalize it
303	// to 'Script'.
304	//
305	// Another case: 'lc' is an abbreviation for the 'Cased_Letter'
306	// general category, but is also an abbreviation for the 'Lowercase_Mapping'
307	// property. We don't currently support the latter, so as with 'cf'
308	// above, we treat 'lc' as 'Cased_Letter'.
309	if norm != "cf" && norm != "sc" && norm != "lc" {
310	if let Some(canon) = canonical_prop(&norm)? {
311	return Ok(CanonicalClassQuery::Binary(canon));
312	}
313	}
314	if let Some(canon) = canonical_gencat(&norm)? {
315	return Ok(CanonicalClassQuery::GeneralCategory(canon));
316	}
317	if let Some(canon) = canonical_script(&norm)? {
318	return Ok(CanonicalClassQuery::Script(canon));
319	}
320	Err(Error::PropertyNotFound)
321	}
322	}
323
/// Like ClassQuery, but its parameters have been canonicalized. This also
/// differentiates binary properties from flattened general categories and
/// scripts.
///
/// Every string stored here is a canonical name taken from a static table
/// (or a static literal), which is why no lifetime parameter is needed.
#[derive(Debug, Eq, PartialEq)]
enum CanonicalClassQuery {
    /// The canonical binary property name.
    Binary(&'static str),
    /// The canonical general category name.
    GeneralCategory(&'static str),
    /// The canonical script name.
    Script(&'static str),
    /// An arbitrary association between property and value, both of which
    /// have been canonicalized.
    ///
    /// Note that by construction, the property name of ByValue will never
    /// be General_Category or Script. Those two cases are subsumed by the
    /// eponymous variants.
    ByValue {
        /// The canonical property name.
        property_name: &'static str,
        /// The canonical property value.
        property_value: &'static str,
    },
}
348
349	/// Looks up a Unicode class given a query. If one doesn't exist, then
350	/// `None` is returned.
351	pub fn class(query: ClassQuery<'_>) -> Result<hir::ClassUnicode, Error> {
352	use self::CanonicalClassQuery::*;
353
354	match query.canonicalize()? {
355	Binary(name) => bool_property(name),
356	GeneralCategory(name) => gencat(name),
357	Script(name) => script(name),
358	ByValue { property_name: "Age", property_value } => {
359	let mut class = hir::ClassUnicode::empty();
360	for set in ages(property_value)? {
361	class.union(&hir_class(set));
362	}
363	Ok(class)
364	}
365	ByValue { property_name: "Script_Extensions", property_value } => {
366	script_extension(property_value)
367	}
368	ByValue {
369	property_name: "Grapheme_Cluster_Break",
370	property_value,
371	} => gcb(property_value),
372	ByValue { property_name: "Sentence_Break", property_value } => {
373	sb(property_value)
374	}
375	ByValue { property_name: "Word_Break", property_value } => {
376	wb(property_value)
377	}
378	_ => {
379	// What else should we support?
380	Err(Error::PropertyNotFound)
381	}
382	}
383	}
384
385	/// Returns a Unicode aware class for \w.
386	///
387	/// This returns an error if the data is not available for \w.
388	pub fn perl_word() -> Result<hir::ClassUnicode, Error> {
389	#[cfg(not(feature = "unicode-perl"))]
390	fn imp() -> Result<hir::ClassUnicode, Error> {
391	Err(Error::PerlClassNotFound)
392	}
393
394	#[cfg(feature = "unicode-perl")]
395	fn imp() -> Result<hir::ClassUnicode, Error> {
396	use crate::unicode_tables::perl_word::PERL_WORD;
397	Ok(hir_class(PERL_WORD))
398	}
399
400	imp()
401	}
402
403	/// Returns a Unicode aware class for \s.
404	///
405	/// This returns an error if the data is not available for \s.
406	pub fn perl_space() -> Result<hir::ClassUnicode, Error> {
407	#[cfg(not(any(feature = "unicode-perl", feature = "unicode-bool")))]
408	fn imp() -> Result<hir::ClassUnicode, Error> {
409	Err(Error::PerlClassNotFound)
410	}
411
412	#[cfg(all(feature = "unicode-perl", not(feature = "unicode-bool")))]
413	fn imp() -> Result<hir::ClassUnicode, Error> {
414	use crate::unicode_tables::perl_space::WHITE_SPACE;
415	Ok(hir_class(WHITE_SPACE))
416	}
417
418	#[cfg(feature = "unicode-bool")]
419	fn imp() -> Result<hir::ClassUnicode, Error> {
420	use crate::unicode_tables::property_bool::WHITE_SPACE;
421	Ok(hir_class(WHITE_SPACE))
422	}
423
424	imp()
425	}
426
427	/// Returns a Unicode aware class for \d.
428	///
429	/// This returns an error if the data is not available for \d.
430	pub fn perl_digit() -> Result<hir::ClassUnicode, Error> {
431	#[cfg(not(any(feature = "unicode-perl", feature = "unicode-gencat")))]
432	fn imp() -> Result<hir::ClassUnicode, Error> {
433	Err(Error::PerlClassNotFound)
434	}
435
436	#[cfg(all(feature = "unicode-perl", not(feature = "unicode-gencat")))]
437	fn imp() -> Result<hir::ClassUnicode, Error> {
438	use crate::unicode_tables::perl_decimal::DECIMAL_NUMBER;
439	Ok(hir_class(DECIMAL_NUMBER))
440	}
441
442	#[cfg(feature = "unicode-gencat")]
443	fn imp() -> Result<hir::ClassUnicode, Error> {
444	use crate::unicode_tables::general_category::DECIMAL_NUMBER;
445	Ok(hir_class(DECIMAL_NUMBER))
446	}
447
448	imp()
449	}
450
451	/// Build a Unicode HIR class from a sequence of Unicode scalar value ranges.
452	pub fn hir_class(ranges: &[(char, char)]) -> hir::ClassUnicode {
453	let hir_ranges: Vec<hir::ClassUnicodeRange> = ranges
454	.iter()
455	.map(\|&(s, e)\| hir::ClassUnicodeRange::new(s, e))
456	.collect();
457	hir::ClassUnicode::new(hir_ranges)
458	}
459
460	/// Returns true only if the given codepoint is in the `\w` character class.
461	///
462	/// If the `unicode-perl` feature is not enabled, then this returns an error.
463	pub fn is_word_character(c: char) -> Result<bool, UnicodeWordError> {
464	#[cfg(not(feature = "unicode-perl"))]
465	fn imp(_: char) -> Result<bool, UnicodeWordError> {
466	Err(UnicodeWordError(()))
467	}
468
469	#[cfg(feature = "unicode-perl")]
470	fn imp(c: char) -> Result<bool, UnicodeWordError> {
471	use crate::{is_word_byte, unicode_tables::perl_word::PERL_WORD};
472
473	if u8::try_from(c).map_or(`false`, is_word_byte) {
474	return Ok(`true`);
475	}
476	Ok(PERL_WORD
477	.binary_search_by(\|&(start, end)\| {
478	use core::cmp::Ordering;
479
480	if start <= c && c <= end {
481	Ordering::Equal
482	} else if start > c {
483	Ordering::Greater
484	} else {
485	Ordering::Less
486	}
487	})
488	.is_ok())
489	}
490
491	imp(c)
492	}
493
/// A mapping of property values for a specific property.
///
/// The first element of each tuple is a normalized property value while the
/// second element of each tuple is the corresponding canonical property
/// value.
///
/// Lookups in this file (see `canonical_value`) binary search on the first
/// element, so a table of this type is expected to be sorted by it.
type PropertyValues = &'static [(&'static str, &'static str)];
500
501	fn canonical_gencat(
502	normalized_value: &str,
503	) -> Result<Option<&'static str>, Error> {
504	Ok(match normalized_value {
505	"any" => Some("Any"),
506	"assigned" => Some("Assigned"),
507	"ascii" => Some("ASCII"),
508	_ => {
509	let gencats = property_values("General_Category")?.unwrap();
510	canonical_value(gencats, normalized_value)
511	}
512	})
513	}
514
515	fn canonical_script(
516	normalized_value: &str,
517	) -> Result<Option<&'static str>, Error> {
518	let scripts = property_values("Script")?.unwrap();
519	Ok(canonical_value(scripts, normalized_value))
520	}
521
522	/// Find the canonical property name for the given normalized property name.
523	///
524	/// If no such property exists, then `None` is returned.
525	///
526	/// The normalized property name must have been normalized according to
527	/// UAX44 LM3, which can be done using `symbolic_name_normalize`.
528	///
529	/// If the property names data is not available, then an error is returned.
530	fn canonical_prop(
531	normalized_name: &str,
532	) -> Result<Option<&'static str>, Error> {
533	#[cfg(not(any(
534	feature = "unicode-age",
535	feature = "unicode-bool",
536	feature = "unicode-gencat",
537	feature = "unicode-perl",
538	feature = "unicode-script",
539	feature = "unicode-segment",
540	)))]
541	fn imp(_: &str) -> Result<Option<&'static str>, Error> {
542	Err(Error::PropertyNotFound)
543	}
544
545	#[cfg(any(
546	feature = "unicode-age",
547	feature = "unicode-bool",
548	feature = "unicode-gencat",
549	feature = "unicode-perl",
550	feature = "unicode-script",
551	feature = "unicode-segment",
552	))]
553	fn imp(name: &str) -> Result<Option<&'static str>, Error> {
554	use crate::unicode_tables::property_names::PROPERTY_NAMES;
555
556	Ok(PROPERTY_NAMES
557	.binary_search_by_key(&name, \|&(n, _)\| n)
558	.ok()
559	.map(\|i\| PROPERTY_NAMES[i].1))
560	}
561
562	imp(normalized_name)
563	}
564
565	/// Find the canonical property value for the given normalized property
566	/// value.
567	///
568	/// The given property values should correspond to the values for the property
569	/// under question, which can be found using `property_values`.
570	///
571	/// If no such property value exists, then `None` is returned.
572	///
573	/// The normalized property value must have been normalized according to
574	/// UAX44 LM3, which can be done using `symbolic_name_normalize`.
575	fn canonical_value(
576	vals: PropertyValues,
577	normalized_value: &str,
578	) -> Option<&'static str> {
579	vals.binary_search_by_key(&normalized_value, \|&(n, _)\| n)
580	.ok()
581	.map(\|i\| vals[i].1)
582	}
583
584	/// Return the table of property values for the given property name.
585	///
586	/// If the property values data is not available, then an error is returned.
587	fn property_values(
588	canonical_property_name: &'static str,
589	) -> Result<Option<PropertyValues>, Error> {
590	#[cfg(not(any(
591	feature = "unicode-age",
592	feature = "unicode-bool",
593	feature = "unicode-gencat",
594	feature = "unicode-perl",
595	feature = "unicode-script",
596	feature = "unicode-segment",
597	)))]
598	fn imp(_: &'static str) -> Result<Option<PropertyValues>, Error> {
599	Err(Error::PropertyValueNotFound)
600	}
601
602	#[cfg(any(
603	feature = "unicode-age",
604	feature = "unicode-bool",
605	feature = "unicode-gencat",
606	feature = "unicode-perl",
607	feature = "unicode-script",
608	feature = "unicode-segment",
609	))]
610	fn imp(name: &'static str) -> Result<Option<PropertyValues>, Error> {
611	use crate::unicode_tables::property_values::PROPERTY_VALUES;
612
613	Ok(PROPERTY_VALUES
614	.binary_search_by_key(&name, \|&(n, _)\| n)
615	.ok()
616	.map(\|i\| PROPERTY_VALUES[i].1))
617	}
618
619	imp(canonical_property_name)
620	}
621
622	// This is only used in some cases, but small enough to just let it be dead
623	// instead of figuring out (and maintaining) the right set of features.
624	#[allow(dead_code)]
625	fn property_set(
626	name_map: &'static [(&'static str, Range)],
627	canonical: &'static str,
628	) -> Option<Range> {
629	name_map
630	.binary_search_by_key(&canonical, \|x\| x.0)
631	.ok()
632	.map(\|i\| name_map[i].1)
633	}
634
635	/// Returns an iterator over Unicode Age sets. Each item corresponds to a set
636	/// of codepoints that were added in a particular revision of Unicode. The
637	/// iterator yields items in chronological order.
638	///
639	/// If the given age value isn't valid or if the data isn't available, then an
640	/// error is returned instead.
641	fn ages(canonical_age: &str) -> Result<impl Iterator<Item = Range>, Error> {
642	#[cfg(not(feature = "unicode-age"))]
643	fn imp(_: &str) -> Result<impl Iterator<Item = Range>, Error> {
644	use core::option::IntoIter;
645	Err::<IntoIter<Range>, _>(Error::PropertyNotFound)
646	}
647
648	#[cfg(feature = "unicode-age")]
649	fn imp(canonical_age: &str) -> Result<impl Iterator<Item = Range>, Error> {
650	use crate::unicode_tables::age;
651
652	const AGES: &[(&str, Range)] = &[
653	("V1_1", age::V1_1),
654	("V2_0", age::V2_0),
655	("V2_1", age::V2_1),
656	("V3_0", age::V3_0),
657	("V3_1", age::V3_1),
658	("V3_2", age::V3_2),
659	("V4_0", age::V4_0),
660	("V4_1", age::V4_1),
661	("V5_0", age::V5_0),
662	("V5_1", age::V5_1),
663	("V5_2", age::V5_2),
664	("V6_0", age::V6_0),
665	("V6_1", age::V6_1),
666	("V6_2", age::V6_2),
667	("V6_3", age::V6_3),
668	("V7_0", age::V7_0),
669	("V8_0", age::V8_0),
670	("V9_0", age::V9_0),
671	("V10_0", age::V10_0),
672	("V11_0", age::V11_0),
673	("V12_0", age::V12_0),
674	("V12_1", age::V12_1),
675	("V13_0", age::V13_0),
676	("V14_0", age::V14_0),
677	("V15_0", age::V15_0),
678	];
679	assert_eq!(AGES.len(), age::BY_NAME.len(), "ages are out of sync");
680
681	let pos = AGES.iter().position(\|&(age, _)\| canonical_age == age);
682	match pos {
683	None => Err(Error::PropertyValueNotFound),
684	Some(i) => Ok(AGES[..=i].iter().map(\|&(_, classes)\| classes)),
685	}
686	}
687
688	imp(canonical_age)
689	}
690
691	/// Returns the Unicode HIR class corresponding to the given general category.
692	///
693	/// Name canonicalization is assumed to be performed by the caller.
694	///
695	/// If the given general category could not be found, or if the general
696	/// category data is not available, then an error is returned.
697	fn gencat(canonical_name: &'static str) -> Result<hir::ClassUnicode, Error> {
698	#[cfg(not(feature = "unicode-gencat"))]
699	fn imp(_: &'static str) -> Result<hir::ClassUnicode, Error> {
700	Err(Error::PropertyNotFound)
701	}
702
703	#[cfg(feature = "unicode-gencat")]
704	fn imp(name: &'static str) -> Result<hir::ClassUnicode, Error> {
705	use crate::unicode_tables::general_category::BY_NAME;
706	match name {
707	"ASCII" => Ok(hir_class(&[('`\0`', '`\x7F`')])),
708	"Any" => Ok(hir_class(&[('`\0`', '`\u{10FFFF}`')])),
709	"Assigned" => {
710	let mut cls = gencat("Unassigned")?;
711	cls.negate();
712	Ok(cls)
713	}
714	name => property_set(BY_NAME, name)
715	.map(hir_class)
716	.ok_or(Error::PropertyValueNotFound),
717	}
718	}
719
720	match canonical_name {
721	"Decimal_Number" => perl_digit(),
722	name => imp(name),
723	}
724	}
725
726	/// Returns the Unicode HIR class corresponding to the given script.
727	///
728	/// Name canonicalization is assumed to be performed by the caller.
729	///
730	/// If the given script could not be found, or if the script data is not
731	/// available, then an error is returned.
732	fn script(canonical_name: &'static str) -> Result<hir::ClassUnicode, Error> {
733	#[cfg(not(feature = "unicode-script"))]
734	fn imp(_: &'static str) -> Result<hir::ClassUnicode, Error> {
735	Err(Error::PropertyNotFound)
736	}
737
738	#[cfg(feature = "unicode-script")]
739	fn imp(name: &'static str) -> Result<hir::ClassUnicode, Error> {
740	use crate::unicode_tables::script::BY_NAME;
741	property_set(BY_NAME, name)
742	.map(hir_class)
743	.ok_or(Error::PropertyValueNotFound)
744	}
745
746	imp(canonical_name)
747	}
748
749	/// Returns the Unicode HIR class corresponding to the given script extension.
750	///
751	/// Name canonicalization is assumed to be performed by the caller.
752	///
753	/// If the given script extension could not be found, or if the script data is
754	/// not available, then an error is returned.
755	fn script_extension(
756	canonical_name: &'static str,
757	) -> Result<hir::ClassUnicode, Error> {
758	#[cfg(not(feature = "unicode-script"))]
759	fn imp(_: &'static str) -> Result<hir::ClassUnicode, Error> {
760	Err(Error::PropertyNotFound)
761	}
762
763	#[cfg(feature = "unicode-script")]
764	fn imp(name: &'static str) -> Result<hir::ClassUnicode, Error> {
765	use crate::unicode_tables::script_extension::BY_NAME;
766	property_set(BY_NAME, name)
767	.map(hir_class)
768	.ok_or(Error::PropertyValueNotFound)
769	}
770
771	imp(canonical_name)
772	}
773
774	/// Returns the Unicode HIR class corresponding to the given Unicode boolean
775	/// property.
776	///
777	/// Name canonicalization is assumed to be performed by the caller.
778	///
779	/// If the given boolean property could not be found, or if the boolean
780	/// property data is not available, then an error is returned.
781	fn bool_property(
782	canonical_name: &'static str,
783	) -> Result<hir::ClassUnicode, Error> {
784	#[cfg(not(feature = "unicode-bool"))]
785	fn imp(_: &'static str) -> Result<hir::ClassUnicode, Error> {
786	Err(Error::PropertyNotFound)
787	}
788
789	#[cfg(feature = "unicode-bool")]
790	fn imp(name: &'static str) -> Result<hir::ClassUnicode, Error> {
791	use crate::unicode_tables::property_bool::BY_NAME;
792	property_set(BY_NAME, name)
793	.map(hir_class)
794	.ok_or(Error::PropertyNotFound)
795	}
796
797	match canonical_name {
798	"Decimal_Number" => perl_digit(),
799	"White_Space" => perl_space(),
800	name => imp(name),
801	}
802	}
803
804	/// Returns the Unicode HIR class corresponding to the given grapheme cluster
805	/// break property.
806	///
807	/// Name canonicalization is assumed to be performed by the caller.
808	///
809	/// If the given property could not be found, or if the corresponding data is
810	/// not available, then an error is returned.
811	fn gcb(canonical_name: &'static str) -> Result<hir::ClassUnicode, Error> {
812	#[cfg(not(feature = "unicode-segment"))]
813	fn imp(_: &'static str) -> Result<hir::ClassUnicode, Error> {
814	Err(Error::PropertyNotFound)
815	}
816
817	#[cfg(feature = "unicode-segment")]
818	fn imp(name: &'static str) -> Result<hir::ClassUnicode, Error> {
819	use crate::unicode_tables::grapheme_cluster_break::BY_NAME;
820	property_set(BY_NAME, name)
821	.map(hir_class)
822	.ok_or(Error::PropertyValueNotFound)
823	}
824
825	imp(canonical_name)
826	}
827
828	/// Returns the Unicode HIR class corresponding to the given word break
829	/// property.
830	///
831	/// Name canonicalization is assumed to be performed by the caller.
832	///
833	/// If the given property could not be found, or if the corresponding data is
834	/// not available, then an error is returned.
835	fn wb(canonical_name: &'static str) -> Result<hir::ClassUnicode, Error> {
836	#[cfg(not(feature = "unicode-segment"))]
837	fn imp(_: &'static str) -> Result<hir::ClassUnicode, Error> {
838	Err(Error::PropertyNotFound)
839	}
840
841	#[cfg(feature = "unicode-segment")]
842	fn imp(name: &'static str) -> Result<hir::ClassUnicode, Error> {
843	use crate::unicode_tables::word_break::BY_NAME;
844	property_set(BY_NAME, name)
845	.map(hir_class)
846	.ok_or(Error::PropertyValueNotFound)
847	}
848
849	imp(canonical_name)
850	}
851
852	/// Returns the Unicode HIR class corresponding to the given sentence
853	/// break property.
854	///
855	/// Name canonicalization is assumed to be performed by the caller.
856	///
857	/// If the given property could not be found, or if the corresponding data is
858	/// not available, then an error is returned.
859	fn sb(canonical_name: &'static str) -> Result<hir::ClassUnicode, Error> {
860	#[cfg(not(feature = "unicode-segment"))]
861	fn imp(_: &'static str) -> Result<hir::ClassUnicode, Error> {
862	Err(Error::PropertyNotFound)
863	}
864
865	#[cfg(feature = "unicode-segment")]
866	fn imp(name: &'static str) -> Result<hir::ClassUnicode, Error> {
867	use crate::unicode_tables::sentence_break::BY_NAME;
868	property_set(BY_NAME, name)
869	.map(hir_class)
870	.ok_or(Error::PropertyValueNotFound)
871	}
872
873	imp(canonical_name)
874	}
875
876	/// Like symbolic_name_normalize_bytes, but operates on a string.
877	fn symbolic_name_normalize(x: &str) -> String {
878	let mut tmp = x.as_bytes().to_vec();
879	let len = symbolic_name_normalize_bytes(&mut tmp).len();
880	tmp.truncate(len);
881	// This should always succeed because `symbolic_name_normalize_bytes`
882	// guarantees that `&tmp[..len]` is always valid UTF-8.
883	//
884	// N.B. We could avoid the additional UTF-8 check here, but it's unlikely
885	// to be worth skipping the additional safety check. A benchmark must
886	// justify it first.
887	String::from_utf8(tmp).unwrap()
888	}
889
/// Normalizes a symbolic name in place, following UAX44-LM3, and returns the
/// normalized prefix of `slice`.
///
/// "Symbolic names" are typically property names and property value aliases.
/// This should not be applied to property string values.
///
/// Concretely: a leading "is" (in any case) is dropped, spaces, underscores
/// and hyphens are dropped, ASCII uppercase letters are lowercased and
/// non-ASCII bytes are dropped.
///
/// The returned slice is valid UTF-8 no matter what `slice` contained.
///
/// See: https://unicode.org/reports/tr44/#UAX44-LM3
fn symbolic_name_normalize_bytes(slice: &mut [u8]) -> &mut [u8] {
    // I couldn't find a place in the standard that specified that property
    // names/aliases had a particular structure (unlike character names), but
    // we assume that it's ASCII only and drop anything that isn't ASCII.

    // Ignore any "is" prefix, whatever its case.
    let starts_with_is =
        slice.len() >= 2 && slice[..2].eq_ignore_ascii_case(b"is");
    let start = if starts_with_is { 2 } else { 0 };

    let mut next_write = 0;
    for i in start..slice.len() {
        // VALIDITY ARGUMENT: To guarantee that the resulting slice is valid
        // UTF-8, we ensure that the slice contains only ASCII bytes. In
        // particular, we drop every non-ASCII byte from the normalized string.
        let kept = match slice[i] {
            b' ' | b'_' | b'-' => continue,
            // Lowercasing leaves every non-uppercase ASCII byte untouched.
            b if b.is_ascii() => b.to_ascii_lowercase(),
            _ => continue,
        };
        slice[next_write] = kept;
        next_write += 1;
    }

    // Special case: ISO_Comment has a 'isc' abbreviation. Since we generally
    // ignore 'is' prefixes, the 'isc' abbreviation gets caught in the cross
    // fire and ends up creating an alias for 'c' to 'ISO_Comment', but it
    // is actually an alias for the 'Other' general category.
    //
    // A stripped prefix plus one written byte means the input was at least
    // three bytes long, so the three byte write below is in bounds.
    if starts_with_is && next_write == 1 && slice[0] == b'c' {
        slice[..3].copy_from_slice(b"isc");
        next_write = 3;
    }
    &mut slice[..next_write]
}
944
#[cfg(test)]
mod tests {
    use super::*;

    /// Looks up the simple case folding class of a single codepoint using a
    /// fresh folder.
    #[cfg(feature = "unicode-case")]
    fn simple_fold_ok(c: char) -> impl Iterator<Item = char> {
        SimpleCaseFolder::new().unwrap().mapping(c).iter().copied()
    }

    /// Reports whether any codepoint in `[start, end]` has a case mapping.
    #[cfg(feature = "unicode-case")]
    fn contains_case_map(start: char, end: char) -> bool {
        SimpleCaseFolder::new().unwrap().overlaps(start, end)
    }

    #[test]
    #[cfg(feature = "unicode-case")]
    fn simple_fold_k() {
        // 'k', 'K' and the Kelvin sign (U+212A) form one equivalence class.
        // The Kelvin sign is written as an escape on purpose: it looks the
        // same as an ASCII 'K' and is easily replaced by one when the file
        // passes through tools that normalize text.
        let xs: Vec<char> = simple_fold_ok('k').collect();
        assert_eq!(xs, alloc::vec!['K', '\u{212A}']);

        let xs: Vec<char> = simple_fold_ok('K').collect();
        assert_eq!(xs, alloc::vec!['k', '\u{212A}']);

        let xs: Vec<char> = simple_fold_ok('\u{212A}').collect();
        assert_eq!(xs, alloc::vec!['K', 'k']);
    }

    #[test]
    #[cfg(feature = "unicode-case")]
    fn simple_fold_a() {
        let xs: Vec<char> = simple_fold_ok('a').collect();
        assert_eq!(xs, alloc::vec!['A']);

        let xs: Vec<char> = simple_fold_ok('A').collect();
        assert_eq!(xs, alloc::vec!['a']);
    }

    #[test]
    #[cfg(not(feature = "unicode-case"))]
    fn simple_fold_disabled() {
        assert!(SimpleCaseFolder::new().is_err());
    }

    #[test]
    #[cfg(feature = "unicode-case")]
    fn range_contains() {
        assert!(contains_case_map('A', 'A'));
        assert!(contains_case_map('Z', 'Z'));
        assert!(contains_case_map('A', 'Z'));
        assert!(contains_case_map('@', 'A'));
        assert!(contains_case_map('Z', '['));
        assert!(contains_case_map('☃', 'Ⰰ'));

        assert!(!contains_case_map('[', '['));
        assert!(!contains_case_map('[', '`'));

        assert!(!contains_case_map('☃', '☃'));
    }

    #[test]
    #[cfg(feature = "unicode-gencat")]
    fn regression_466() {
        use super::{CanonicalClassQuery, ClassQuery};

        let q = ClassQuery::OneLetter('C');
        assert_eq!(
            q.canonicalize().unwrap(),
            CanonicalClassQuery::GeneralCategory("Other")
        );
    }

    #[test]
    fn sym_normalize() {
        let sym_norm = symbolic_name_normalize;

        assert_eq!(sym_norm("Line_Break"), "linebreak");
        assert_eq!(sym_norm("Line-break"), "linebreak");
        assert_eq!(sym_norm("linebreak"), "linebreak");
        assert_eq!(sym_norm("BA"), "ba");
        assert_eq!(sym_norm("ba"), "ba");
        assert_eq!(sym_norm("Greek"), "greek");
        assert_eq!(sym_norm("isGreek"), "greek");
        assert_eq!(sym_norm("IS_Greek"), "greek");
        assert_eq!(sym_norm("isc"), "isc");
        assert_eq!(sym_norm("is c"), "isc");
        assert_eq!(sym_norm("is_c"), "isc");
    }

    #[test]
    fn valid_utf8_symbolic() {
        // The lone 0xFF byte is not valid UTF-8 and must be dropped.
        let mut x = b"abc\xFFxyz".to_vec();
        let y = symbolic_name_normalize_bytes(&mut x);
        assert_eq!(y, b"abcxyz");
    }
}
1040