unicode_data.rs source code [crates/ucd_parse/src/unicode_data.rs]

1	use std::path::Path;
2
3	use crate::{
4	common::{Codepoint, CodepointIter, UcdFile, UcdFileByCodepoint},
5	error::Error,
6	};
7
/// Represents a single row in the `UnicodeData.txt` file.
///
/// These fields were taken from UAX44, Table 9, as part of the documentation
/// for the
/// [`UnicodeData.txt` file](https://www.unicode.org/reports/tr44/#UnicodeData.txt).
///
/// A value can be parsed from a single line of the file via `FromStr` and
/// written back out in the same semicolon-delimited form via `Display`.
#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct UnicodeData {
    /// The codepoint corresponding to this row.
    pub codepoint: Codepoint,
    /// The name of this codepoint.
    pub name: String,
    /// The "general category" of this codepoint.
    pub general_category: String,
    /// The class of this codepoint used in the Canonical Ordering Algorithm.
    ///
    /// Note that some classes map to a particular symbol. See
    /// [UAX44, Table 15](https://www.unicode.org/reports/tr44/#Canonical_Combining_Class_Values).
    pub canonical_combining_class: u8,
    /// The bidirectional class of this codepoint.
    ///
    /// Possible values are listed in
    /// [UAX44, Table 13](https://www.unicode.org/reports/tr44/#Bidi_Class_Values).
    pub bidi_class: String,
    /// The decomposition mapping for this codepoint. This includes its
    /// formatting tag (if present).
    pub decomposition: UnicodeDataDecomposition,
    /// A decimal numeric representation of this codepoint, if it has the
    /// property `Numeric_Type=Decimal`.
    pub numeric_type_decimal: Option<u8>,
    /// A decimal numeric representation of this codepoint, if it has the
    /// property `Numeric_Type=Digit`. Note that while this field is still
    /// populated for existing codepoints, no new codepoints will have this
    /// field populated.
    pub numeric_type_digit: Option<u8>,
    /// A decimal or rational numeric representation of this codepoint, if it
    /// has the property `Numeric_Type=Numeric`.
    pub numeric_type_numeric: Option<UnicodeDataNumeric>,
    /// A boolean indicating whether this codepoint is "mirrored" in
    /// bidirectional text.
    pub bidi_mirrored: bool,
    /// The "old" Unicode 1.0 or ISO 6429 name of this codepoint. Note that
    /// this field is empty unless it is significantly different from
    /// the `name` field.
    pub unicode1_name: String,
    /// The ISO 10646 comment field. This no longer contains any non-NULL
    /// values.
    pub iso_comment: String,
    /// This codepoint's simple uppercase mapping, if it exists.
    pub simple_uppercase_mapping: Option<Codepoint>,
    /// This codepoint's simple lowercase mapping, if it exists.
    pub simple_lowercase_mapping: Option<Codepoint>,
    /// This codepoint's simple titlecase mapping, if it exists.
    pub simple_titlecase_mapping: Option<Codepoint>,
}
62
63	impl UcdFile for UnicodeData {
64	fn relative_file_path() -> &'static Path {
65	Path::new("UnicodeData.txt")
66	}
67	}
68
69	impl UcdFileByCodepoint for UnicodeData {
70	fn codepoints(&self) -> CodepointIter {
71	self.codepoint.into_iter()
72	}
73	}
74
75	impl UnicodeData {
76	/// Returns true if and only if this record corresponds to the start of a
77	/// range.
78	pub fn is_range_start(&self) -> bool {
79	self.name.starts_with('<')
80	&& self.name.ends_with('>')
81	&& self.name.contains("First")
82	}
83
84	/// Returns true if and only if this record corresponds to the end of a
85	/// range.
86	pub fn is_range_end(&self) -> bool {
87	self.name.starts_with('<')
88	&& self.name.ends_with('>')
89	&& self.name.contains("Last")
90	}
91	}
92
93	impl std::str::FromStr for UnicodeData {
94	type Err = Error;
95
96	fn from_str(line: &str) -> Result<UnicodeData, Error> {
97	let re_parts = regex!(
98	r"(?x)
99	^
100	([A-Z0-9]+); # 1; codepoint
101	([^;]+); # 2; name
102	([^;]+); # 3; general category
103	([0-9]+); # 4; canonical combining class
104	([^;]+); # 5; bidi class
105	([^;]*); # 6; decomposition
106	([0-9]*); # 7; numeric type decimal
107	([0-9]*); # 8; numeric type digit
108	([-0-9/]*); # 9; numeric type numeric
109	([YN]); # 10; bidi mirrored
110	([^;]*); # 11; unicode1 name
111	([^;]*); # 12; ISO comment
112	([^;]*); # 13; simple uppercase mapping
113	([^;]*); # 14; simple lowercase mapping
114	([^;]*) # 15; simple titlecase mapping
115	$
116	",
117	);
118
119	let caps = match re_parts.captures(line.trim()) {
120	Some(caps) => caps,
121	None => return err!("invalid UnicodeData line"),
122	};
123	let capget = \|n\| caps.get(n).unwrap().as_str();
124	let mut data = UnicodeData::default();
125
126	data.codepoint = capget(`1`).parse()?;
127	data.name = capget(`2`).to_string();
128	data.general_category = capget(`3`).to_string();
129	data.canonical_combining_class = match capget(`4`).parse() {
130	Ok(n) => n,
131	Err(err) => {
132	return err!(
133	"failed to parse canonical combining class '{}': {}",
134	capget(`4`),
135	err
136	)
137	}
138	};
139	data.bidi_class = capget(`5`).to_string();
140	if !caps[`6`].is_empty() {
141	data.decomposition = caps[`6`].parse()?;
142	} else {
143	data.decomposition.push(data.codepoint)?;
144	}
145	if !capget(`7`).is_empty() {
146	data.numeric_type_decimal = Some(match capget(`7`).parse() {
147	Ok(n) => n,
148	Err(err) => {
149	return err!(
150	"failed to parse numeric type decimal '{}': {}",
151	capget(`7`),
152	err
153	)
154	}
155	});
156	}
157	if !capget(`8`).is_empty() {
158	data.numeric_type_digit = Some(match capget(`8`).parse() {
159	Ok(n) => n,
160	Err(err) => {
161	return err!(
162	"failed to parse numeric type digit '{}': {}",
163	capget(`8`),
164	err
165	)
166	}
167	});
168	}
169	if !capget(`9`).is_empty() {
170	data.numeric_type_numeric = Some(capget(`9`).parse()?);
171	}
172	data.bidi_mirrored = capget(`10`) == "Y";
173	data.unicode1_name = capget(`11`).to_string();
174	data.iso_comment = capget(`12`).to_string();
175	if !capget(`13`).is_empty() {
176	data.simple_uppercase_mapping = Some(capget(`13`).parse()?);
177	}
178	if !capget(`14`).is_empty() {
179	data.simple_lowercase_mapping = Some(capget(`14`).parse()?);
180	}
181	if !capget(`15`).is_empty() {
182	data.simple_titlecase_mapping = Some(capget(`15`).parse()?);
183	}
184	Ok(data)
185	}
186	}
187
188	impl std::fmt::Display for UnicodeData {
189	fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
190	write!(f, "{};", self.codepoint)?;
191	write!(f, "{};", self.name)?;
192	write!(f, "{};", self.general_category)?;
193	write!(f, "{};", self.canonical_combining_class)?;
194	write!(f, "{};", self.bidi_class)?;
195	if self.decomposition.is_canonical()
196	&& self.decomposition.mapping() == &[self.codepoint]
197	{
198	write!(f, ";")?;
199	} else {
200	write!(f, "{};", self.decomposition)?;
201	}
202	if let Some(n) = self.numeric_type_decimal {
203	write!(f, "{};", n)?;
204	} else {
205	write!(f, ";")?;
206	}
207	if let Some(n) = self.numeric_type_digit {
208	write!(f, "{};", n)?;
209	} else {
210	write!(f, ";")?;
211	}
212	if let Some(n) = self.numeric_type_numeric {
213	write!(f, "{};", n)?;
214	} else {
215	write!(f, ";")?;
216	}
217	write!(f, "{};", if self.bidi_mirrored { "Y" } else { "N" })?;
218	write!(f, "{};", self.unicode1_name)?;
219	write!(f, "{};", self.iso_comment)?;
220	if let Some(cp) = self.simple_uppercase_mapping {
221	write!(f, "{};", cp)?;
222	} else {
223	write!(f, ";")?;
224	}
225	if let Some(cp) = self.simple_lowercase_mapping {
226	write!(f, "{};", cp)?;
227	} else {
228	write!(f, ";")?;
229	}
230	if let Some(cp) = self.simple_titlecase_mapping {
231	write!(f, "{}", cp)?;
232	}
233	Ok(())
234	}
235	}
236
/// Represents a decomposition mapping of a single row in the
/// `UnicodeData.txt` file.
///
/// The codepoints are stored inline in a fixed-size array; use
/// `mapping()` to view only the populated prefix.
#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct UnicodeDataDecomposition {
    /// The formatting tag associated with this mapping, if present.
    pub tag: Option<UnicodeDataDecompositionTag>,
    /// The number of codepoints in this mapping.
    pub len: usize,
    /// The codepoints in the mapping. Entries beyond `len` in the mapping
    /// are always U+0000. If no mapping was present, then this always contains
    /// a single codepoint corresponding to this row's character.
    pub mapping: [Codepoint; 18],
}
250
251	impl UnicodeDataDecomposition {
252	/// Create a new decomposition mapping with the given tag and codepoints.
253	///
254	/// If there are too many codepoints, then an error is returned.
255	pub fn new(
256	tag: Option<UnicodeDataDecompositionTag>,
257	mapping: &[Codepoint],
258	) -> Result<UnicodeDataDecomposition, Error> {
259	let mut x = UnicodeDataDecomposition::default();
260	x.tag = tag;
261	for &cp in mapping {
262	x.push(cp)?;
263	}
264	Ok(x)
265	}
266
267	/// Add a new codepoint to this decomposition's mapping.
268	///
269	/// If the mapping is already full, then this returns an error.
270	pub fn push(&mut self, cp: Codepoint) -> Result<(), Error> {
271	if self.len >= self.mapping.len() {
272	return err!(
273	"invalid decomposition mapping (too many codepoints)"
274	);
275	}
276	self.mapping[self.len] = cp;
277	self.len += `1`;
278	Ok(())
279	}
280
281	/// Return the mapping as a slice of codepoints. The slice returned
282	/// has length equivalent to the number of codepoints in this mapping.
283	pub fn mapping(&self) -> &[Codepoint] {
284	&self.mapping[..self.len]
285	}
286
287	/// Returns true if and only if this decomposition mapping is canonical.
288	pub fn is_canonical(&self) -> bool {
289	self.tag.is_none()
290	}
291	}
292
293	impl std::str::FromStr for UnicodeDataDecomposition {
294	type Err = Error;
295
296	fn from_str(s: &str) -> Result<UnicodeDataDecomposition, Error> {
297	let re_with_tag =
298	regex!(r"^(?:<(?P<tag>[^>]+)>)?\s*(?P<chars>[\s0-9A-F]+)$");
299	let re_chars = regex!(r"[0-9A-F]+");
300	if s.is_empty() {
301	return err!(
302	"expected non-empty string for \
303	UnicodeDataDecomposition value"
304	);
305	}
306	let caps = match re_with_tag.captures(s) {
307	Some(caps) => caps,
308	None => return err!("invalid decomposition value"),
309	};
310	let mut decomp = UnicodeDataDecomposition::default();
311	let mut codepoints = s;
312	if let Some(m) = caps.name("tag") {
313	decomp.tag = Some(m.as_str().parse()?);
314	codepoints = &caps["chars"];
315	}
316	for m in re_chars.find_iter(codepoints) {
317	let cp = m.as_str().parse()?;
318	decomp.push(cp)?;
319	}
320	Ok(decomp)
321	}
322	}
323
324	impl std::fmt::Display for UnicodeDataDecomposition {
325	fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
326	if let Some(ref tag: &UnicodeDataDecompositionTag) = self.tag {
327	write!(f, "<{}> ", tag)?;
328	}
329	let mut first: bool = `true`;
330	for cp: &Codepoint in self.mapping() {
331	if !first {
332	write!(f, " ")?;
333	}
334	first = `false`;
335	write!(f, "{}", cp)?;
336	}
337	Ok(())
338	}
339	}
340
/// The formatting tag on a decomposition mapping.
///
/// This is taken from
/// [UAX44, Table 14](https://www.unicode.org/reports/tr44/#Character_Decomposition_Mappings).
///
/// Each variant corresponds to the tag spelled as shown in its
/// documentation, which is also the text accepted by `FromStr` (without the
/// angle brackets) and produced by `Display`.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum UnicodeDataDecompositionTag {
    /// `<font>`
    Font,
    /// `<noBreak>`
    NoBreak,
    /// `<initial>`
    Initial,
    /// `<medial>`
    Medial,
    /// `<final>`
    Final,
    /// `<isolated>`
    Isolated,
    /// `<circle>`
    Circle,
    /// `<super>`
    Super,
    /// `<sub>`
    Sub,
    /// `<vertical>`
    Vertical,
    /// `<wide>`
    Wide,
    /// `<narrow>`
    Narrow,
    /// `<small>`
    Small,
    /// `<square>`
    Square,
    /// `<fraction>`
    Fraction,
    /// `<compat>`
    Compat,
}
380
381	impl std::str::FromStr for UnicodeDataDecompositionTag {
382	type Err = Error;
383
384	fn from_str(s: &str) -> Result<UnicodeDataDecompositionTag, Error> {
385	use self::UnicodeDataDecompositionTag::*;
386	Ok(match s {
387	"font" => Font,
388	"noBreak" => NoBreak,
389	"initial" => Initial,
390	"medial" => Medial,
391	"final" => Final,
392	"isolated" => Isolated,
393	"circle" => Circle,
394	"super" => Super,
395	"sub" => Sub,
396	"vertical" => Vertical,
397	"wide" => Wide,
398	"narrow" => Narrow,
399	"small" => Small,
400	"square" => Square,
401	"fraction" => Fraction,
402	"compat" => Compat,
403	_ => return err!("invalid decomposition formatting tag: {}", s),
404	})
405	}
406	}
407
408	impl std::fmt::Display for UnicodeDataDecompositionTag {
409	fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
410	use self::UnicodeDataDecompositionTag::*;
411	let s: &'static str = match *self {
412	Font => "font",
413	NoBreak => "noBreak",
414	Initial => "initial",
415	Medial => "medial",
416	Final => "final",
417	Isolated => "isolated",
418	Circle => "circle",
419	Super => "super",
420	Sub => "sub",
421	Vertical => "vertical",
422	Wide => "wide",
423	Narrow => "narrow",
424	Small => "small",
425	Square => "square",
426	Fraction => "fraction",
427	Compat => "compat",
428	};
429	write!(f, "{}", s)
430	}
431	}
432
/// A numeric value corresponding to characters with `Numeric_Type=Numeric`.
///
/// A numeric value can either be a signed integer or a rational number.
/// In text form an integer is written as `N` and a rational as `N/D`.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum UnicodeDataNumeric {
    /// An integer.
    Integer(i64),
    /// A rational number. The first is the numerator and the latter is the
    /// denominator.
    Rational(i64, i64),
}
444
445	impl std::str::FromStr for UnicodeDataNumeric {
446	type Err = Error;
447
448	fn from_str(s: &str) -> Result<UnicodeDataNumeric, Error> {
449	if s.is_empty() {
450	return err!(
451	"expected non-empty string for UnicodeDataNumeric value"
452	);
453	}
454	if let Some(pos) = s.find('/') {
455	let (snum, sden) = (&s[..pos], &s[pos + `1`..]);
456	let num = match snum.parse() {
457	Ok(num) => num,
458	Err(err) => {
459	return err!(
460	"invalid integer numerator '{}': {}",
461	snum,
462	err
463	);
464	}
465	};
466	let den = match sden.parse() {
467	Ok(den) => den,
468	Err(err) => {
469	return err!(
470	"invalid integer denominator '{}': {}",
471	sden,
472	err
473	);
474	}
475	};
476	Ok(UnicodeDataNumeric::Rational(num, den))
477	} else {
478	match s.parse() {
479	Ok(den) => Ok(UnicodeDataNumeric::Integer(den)),
480	Err(err) => {
481	return err!(
482	"invalid integer denominator '{}': {}",
483	s,
484	err
485	);
486	}
487	}
488	}
489	}
490	}
491
492	impl std::fmt::Display for UnicodeDataNumeric {
493	fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
494	match *self {
495	UnicodeDataNumeric::Integer(n: i64) => write!(f, "{}", n),
496	UnicodeDataNumeric::Rational(n: i64, d: i64) => write!(f, "{}/{}", n, d),
497	}
498	}
499	}
500
/// An iterator adapter that expands rows in `UnicodeData.txt`.
///
/// Throughout `UnicodeData.txt`, some assigned codepoints are not explicitly
/// represented. Instead, they are represented by a pair of rows, indicating
/// a range of codepoints with the same properties. For example, the Hangul
/// syllable codepoints are represented by these two rows:
///
/// ```ignore
/// AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
/// D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
/// ```
///
/// This iterator will wrap any iterator of `UnicodeData` and, when a range of
/// Unicode codepoints is found, it will be expanded to the appropriate
/// sequence of `UnicodeData` values. Note that all such expanded records will
/// have an empty name.
pub struct UnicodeDataExpander<I: Iterator> {
    /// The underlying iterator.
    it: std::iter::Peekable<I>,
    /// The range of codepoints currently being emitted after a first/last
    /// pair has been found. When no range is in progress, this holds an
    /// empty range and yields nothing.
    range: CodepointRange,
}
524
/// A range of codepoints being expanded, together with the record that the
/// generated rows are derived from.
struct CodepointRange {
    /// The codepoint range.
    range: std::ops::Range<u32>,
    /// The start record. All subsequent records in this range are generated
    /// by cloning this and updating the codepoint/name.
    start_record: UnicodeData,
}
532
533	impl<I: Iterator<Item = UnicodeData>> UnicodeDataExpander<I> {
534	/// Create a new iterator that expands pairs of `UnicodeData` range
535	/// records. All other records are passed through as-is.
536	pub fn new<T>(it: T) -> UnicodeDataExpander<I>
537	where
538	T: IntoIterator<IntoIter = I, Item = I::Item>,
539	{
540	UnicodeDataExpander {
541	it: it.into_iter().peekable(),
542	range: CodepointRange {
543	range: `0`..`0`,
544	start_record: UnicodeData::default(),
545	},
546	}
547	}
548	}
549
550	impl<I: Iterator<Item = UnicodeData>> Iterator for UnicodeDataExpander<I> {
551	type Item = UnicodeData;
552
553	fn next(&mut self) -> Option<UnicodeData> {
554	if let Some(udata: UnicodeData) = self.range.next() {
555	return Some(udata);
556	}
557	let row1: UnicodeData = match self.it.next() {
558	None => return None,
559	Some(row1: UnicodeData) => row1,
560	};
561	if !row1.is_range_start()
562	\|\| !self.it.peek().map_or(default:`false`, \|row2: &UnicodeData\| row2.is_range_end())
563	{
564	return Some(row1);
565	}
566	let row2: UnicodeData = self.it.next().unwrap();
567	self.range = CodepointRange {
568	range: row1.codepoint.value()..(row2.codepoint.value() + `1`),
569	start_record: row1,
570	};
571	self.next()
572	}
573	}
574
575	impl Iterator for CodepointRange {
576	type Item = UnicodeData;
577
578	fn next(&mut self) -> Option<UnicodeData> {
579	let cp: u32 = match self.range.next() {
580	None => return None,
581	Some(cp: u32) => cp,
582	};
583	Some(UnicodeData {
584	codepoint: Codepoint::from_u32(cp).unwrap(),
585	name: "".to_string(),
586	..self.start_record.clone()
587	})
588	}
589	}
590
#[cfg(test)]
mod tests {
    use crate::common::Codepoint;

    use super::{
        UnicodeData, UnicodeDataDecomposition, UnicodeDataDecompositionTag,
        UnicodeDataNumeric,
    };

    /// Builds a `Codepoint` from a scalar known to be valid.
    fn codepoint(n: u32) -> Codepoint {
        Codepoint::from_u32(n).unwrap()
    }

    /// Shorthand for turning a string literal into an owned `String`.
    fn s(string: &str) -> String {
        string.to_string()
    }

    // A row with a tagged (<compat>) multi-codepoint decomposition.
    #[test]
    fn parse1() {
        let line = "249D;PARENTHESIZED LATIN SMALL LETTER B;So;0;L;<compat> 0028 0062 0029;;;;N;;;;;\n";
        let data: UnicodeData = line.parse().unwrap();
        assert_eq!(
            data,
            UnicodeData {
                codepoint: codepoint(0x249d),
                name: s("PARENTHESIZED LATIN SMALL LETTER B"),
                general_category: s("So"),
                canonical_combining_class: 0,
                bidi_class: s("L"),
                decomposition: UnicodeDataDecomposition::new(
                    Some(UnicodeDataDecompositionTag::Compat),
                    &[codepoint(0x28), codepoint(0x62), codepoint(0x29)],
                )
                .unwrap(),
                numeric_type_decimal: None,
                numeric_type_digit: None,
                numeric_type_numeric: None,
                bidi_mirrored: false,
                unicode1_name: s(""),
                iso_comment: s(""),
                simple_uppercase_mapping: None,
                simple_lowercase_mapping: None,
                simple_titlecase_mapping: None,
            }
        );
    }

    // A row with a bracketed name, a Unicode 1.0 name and no decomposition;
    // the decomposition defaults to the codepoint itself.
    #[test]
    fn parse2() {
        let line = "000D;<control>;Cc;0;B;;;;;N;CARRIAGE RETURN (CR);;;;\n";
        let data: UnicodeData = line.parse().unwrap();
        assert_eq!(
            data,
            UnicodeData {
                codepoint: codepoint(0x000D),
                name: s("<control>"),
                general_category: s("Cc"),
                canonical_combining_class: 0,
                bidi_class: s("B"),
                decomposition: UnicodeDataDecomposition::new(
                    None,
                    &[codepoint(0x000D)]
                )
                .unwrap(),
                numeric_type_decimal: None,
                numeric_type_digit: None,
                numeric_type_numeric: None,
                bidi_mirrored: false,
                unicode1_name: s("CARRIAGE RETURN (CR)"),
                iso_comment: s(""),
                simple_uppercase_mapping: None,
                simple_lowercase_mapping: None,
                simple_titlecase_mapping: None,
            }
        );
    }

    // A row with a <fraction> decomposition and a rational numeric value.
    #[test]
    fn parse3() {
        let line = "00BC;VULGAR FRACTION ONE QUARTER;No;0;ON;<fraction> 0031 2044 0034;;;1/4;N;FRACTION ONE QUARTER;;;;\n";
        let data: UnicodeData = line.parse().unwrap();
        assert_eq!(
            data,
            UnicodeData {
                codepoint: codepoint(0x00BC),
                name: s("VULGAR FRACTION ONE QUARTER"),
                general_category: s("No"),
                canonical_combining_class: 0,
                bidi_class: s("ON"),
                decomposition: UnicodeDataDecomposition::new(
                    Some(UnicodeDataDecompositionTag::Fraction),
                    &[codepoint(0x31), codepoint(0x2044), codepoint(0x34)],
                )
                .unwrap(),
                numeric_type_decimal: None,
                numeric_type_digit: None,
                numeric_type_numeric: Some(UnicodeDataNumeric::Rational(1, 4)),
                bidi_mirrored: false,
                unicode1_name: s("FRACTION ONE QUARTER"),
                iso_comment: s(""),
                simple_uppercase_mapping: None,
                simple_lowercase_mapping: None,
                simple_titlecase_mapping: None,
            }
        );
    }

    // A row whose only populated case mapping is the lowercase one.
    #[test]
    fn parse4() {
        let line = "0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;\n";
        let data: UnicodeData = line.parse().unwrap();
        assert_eq!(
            data,
            UnicodeData {
                codepoint: codepoint(0x0041),
                name: s("LATIN CAPITAL LETTER A"),
                general_category: s("Lu"),
                canonical_combining_class: 0,
                bidi_class: s("L"),
                decomposition: UnicodeDataDecomposition::new(
                    None,
                    &[codepoint(0x0041)]
                )
                .unwrap(),
                numeric_type_decimal: None,
                numeric_type_digit: None,
                numeric_type_numeric: None,
                bidi_mirrored: false,
                unicode1_name: s(""),
                iso_comment: s(""),
                simple_uppercase_mapping: None,
                simple_lowercase_mapping: Some(codepoint(0x0061)),
                simple_titlecase_mapping: None,
            }
        );
    }

    // A row with a negative rational numeric value.
    #[test]
    fn parse5() {
        let line = "0F33;TIBETAN DIGIT HALF ZERO;No;0;L;;;;-1/2;N;;;;;\n";
        let data: UnicodeData = line.parse().unwrap();
        assert_eq!(
            data,
            UnicodeData {
                codepoint: codepoint(0x0F33),
                name: s("TIBETAN DIGIT HALF ZERO"),
                general_category: s("No"),
                canonical_combining_class: 0,
                bidi_class: s("L"),
                decomposition: UnicodeDataDecomposition::new(
                    None,
                    &[codepoint(0x0F33)]
                )
                .unwrap(),
                numeric_type_decimal: None,
                numeric_type_digit: None,
                numeric_type_numeric: Some(UnicodeDataNumeric::Rational(
                    -1, 2
                )),
                bidi_mirrored: false,
                unicode1_name: s(""),
                iso_comment: s(""),
                simple_uppercase_mapping: None,
                simple_lowercase_mapping: None,
                simple_titlecase_mapping: None,
            }
        );
    }

    // Two ordinary rows surrounding one first/last pair. The pair covers
    // AC00..=D7A3 (11172 codepoints), so 11174 records are expected in total.
    #[test]
    fn expander() {
        use super::UnicodeDataExpander;
        use crate::common::UcdLineParser;

        let data = "\
ABF9;MEETEI MAYEK DIGIT NINE;Nd;0;L;;9;9;9;N;;;;;
AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
D7B0;HANGUL JUNGSEONG O-YEO;Lo;0;L;;;;;N;;;;;
";
        let records = UcdLineParser::new(None, data.as_bytes())
            .collect::<Result<Vec<_>, _>>()
            .unwrap();
        assert_eq!(UnicodeDataExpander::new(records).count(), 11174);
    }
}
777