unicode_data.rs source code [crates/ucd-parse/src/unicode_data.rs]

1	use std::fmt;
2	use std::iter;
3	use std::ops::Range;
4	use std::path::Path;
5	use std::str::FromStr;
6
7	use once_cell::sync::Lazy;
8	use regex::Regex;
9
10	use crate::common::{Codepoint, CodepointIter, UcdFile, UcdFileByCodepoint};
11	use crate::error::Error;
12
13	/// Represents a single row in the `UnicodeData.txt` file.
14	///
15	/// These fields were taken from UAX44, Table 9, as part of the documentation
16	/// for the
17	/// [`UnicodeData.txt` file](https://www.unicode.org/reports/tr44/#UnicodeData.txt).
18	#[derive(Clone, Debug, Default, Eq, PartialEq)]
19	pub struct UnicodeData {
20	/// The codepoint corresponding to this row.
21	pub codepoint: Codepoint,
22	/// The name of this codepoint.
23	pub name: String,
24	/// The "general category" of this codepoint.
25	pub general_category: String,
26	/// The class of this codepoint used in the Canonical Ordering Algorithm.
27	///
28	/// Note that some classes map to a particular symbol. See
29	/// [UAX44, Table 15](https://www.unicode.org/reports/tr44/#Canonical_Combining_Class_Values).
30	pub canonical_combining_class: u8,
31	/// The bidirectional class of this codepoint.
32	///
33	/// Possible values are listed in
34	/// [UAX44, Table 13](https://www.unicode.org/reports/tr44/#Bidi_Class_Values).
35	pub bidi_class: String,
36	/// The decomposition mapping for this codepoint. This includes its
37	/// formatting tag (if present).
38	pub decomposition: UnicodeDataDecomposition,
39	/// A decimal numeric representation of this codepoint, if it has the
40	/// property `Numeric_Type=Decimal`.
41	pub numeric_type_decimal: Option<u8>,
42	/// A decimal numeric representation of this codepoint, if it has the
43	/// property `Numeric_Type=Digit`. Note that while this field is still
44	/// populated for existing codepoints, no new codepoints will have this
45	/// field populated.
46	pub numeric_type_digit: Option<u8>,
47	/// A decimal or rational numeric representation of this codepoint, if it
48	/// has the property `Numeric_Type=Numeric`.
49	pub numeric_type_numeric: Option<UnicodeDataNumeric>,
50	/// A boolean indicating whether this codepoint is "mirrored" in
51	/// bidirectional text.
52	pub bidi_mirrored: bool,
53	/// The "old" Unicode 1.0 or ISO 6429 name of this codepoint. Note that
54	/// this field is empty unless it is significantly different from
55	/// the `name` field.
56	pub unicode1_name: String,
57	/// The ISO 10464 comment field. This no longer contains any non-NULL
58	/// values.
59	pub iso_comment: String,
60	/// This codepoint's simple uppercase mapping, if it exists.
61	pub simple_uppercase_mapping: Option<Codepoint>,
62	/// This codepoint's simple lowercase mapping, if it exists.
63	pub simple_lowercase_mapping: Option<Codepoint>,
64	/// This codepoint's simple titlecase mapping, if it exists.
65	pub simple_titlecase_mapping: Option<Codepoint>,
66	}
67
68	impl UcdFile for UnicodeData {
69	fn relative_file_path() -> &'static Path {
70	Path::new("UnicodeData.txt")
71	}
72	}
73
74	impl UcdFileByCodepoint for UnicodeData {
75	fn codepoints(&self) -> CodepointIter {
76	self.codepoint.into_iter()
77	}
78	}
79
80	impl UnicodeData {
81	/// Returns true if and only if this record corresponds to the start of a
82	/// range.
83	pub fn is_range_start(&self) -> bool {
84	self.name.starts_with('<')
85	&& self.name.ends_with('>')
86	&& self.name.contains("First")
87	}
88
89	/// Returns true if and only if this record corresponds to the end of a
90	/// range.
91	pub fn is_range_end(&self) -> bool {
92	self.name.starts_with('<')
93	&& self.name.ends_with('>')
94	&& self.name.contains("Last")
95	}
96	}
97
98	impl FromStr for UnicodeData {
99	type Err = Error;
100
101	fn from_str(line: &str) -> Result<UnicodeData, Error> {
102	static PARTS: Lazy<Regex> = Lazy::new(\|\| {
103	Regex::new(
104	r"(?x)
105	^
106	([A-Z0-9]+); # 1; codepoint
107	([^;]+); # 2; name
108	([^;]+); # 3; general category
109	([0-9]+); # 4; canonical combining class
110	([^;]+); # 5; bidi class
111	([^;]*); # 6; decomposition
112	([0-9]*); # 7; numeric type decimal
113	([0-9]*); # 8; numeric type digit
114	([-0-9/]*); # 9; numeric type numeric
115	([YN]); # 10; bidi mirrored
116	([^;]*); # 11; unicode1 name
117	([^;]*); # 12; ISO comment
118	([^;]*); # 13; simple uppercase mapping
119	([^;]*); # 14; simple lowercase mapping
120	([^;]*) # 15; simple titlecase mapping
121	$
122	",
123	)
124	.unwrap()
125	});
126	let caps = match PARTS.captures(line.trim()) {
127	Some(caps) => caps,
128	None => return err!("invalid UnicodeData line"),
129	};
130	let capget = \|n\| caps.get(n).unwrap().as_str();
131	let mut data = UnicodeData::default();
132
133	data.codepoint = capget(`1`).parse()?;
134	data.name = capget(`2`).to_string();
135	data.general_category = capget(`3`).to_string();
136	data.canonical_combining_class = match capget(`4`).parse() {
137	Ok(n) => n,
138	Err(err) => {
139	return err!(
140	"failed to parse canonical combining class '{}': {}",
141	capget(`4`),
142	err
143	)
144	}
145	};
146	data.bidi_class = capget(`5`).to_string();
147	if !caps[`6`].is_empty() {
148	data.decomposition = caps[`6`].parse()?;
149	} else {
150	data.decomposition.push(data.codepoint)?;
151	}
152	if !capget(`7`).is_empty() {
153	data.numeric_type_decimal = Some(match capget(`7`).parse() {
154	Ok(n) => n,
155	Err(err) => {
156	return err!(
157	"failed to parse numeric type decimal '{}': {}",
158	capget(`7`),
159	err
160	)
161	}
162	});
163	}
164	if !capget(`8`).is_empty() {
165	data.numeric_type_digit = Some(match capget(`8`).parse() {
166	Ok(n) => n,
167	Err(err) => {
168	return err!(
169	"failed to parse numeric type digit '{}': {}",
170	capget(`8`),
171	err
172	)
173	}
174	});
175	}
176	if !capget(`9`).is_empty() {
177	data.numeric_type_numeric = Some(capget(`9`).parse()?);
178	}
179	data.bidi_mirrored = capget(`10`) == "Y";
180	data.unicode1_name = capget(`11`).to_string();
181	data.iso_comment = capget(`12`).to_string();
182	if !capget(`13`).is_empty() {
183	data.simple_uppercase_mapping = Some(capget(`13`).parse()?);
184	}
185	if !capget(`14`).is_empty() {
186	data.simple_lowercase_mapping = Some(capget(`14`).parse()?);
187	}
188	if !capget(`15`).is_empty() {
189	data.simple_titlecase_mapping = Some(capget(`15`).parse()?);
190	}
191	Ok(data)
192	}
193	}
194
195	impl fmt::Display for UnicodeData {
196	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
197	write!(f, "{};", self.codepoint)?;
198	write!(f, "{};", self.name)?;
199	write!(f, "{};", self.general_category)?;
200	write!(f, "{};", self.canonical_combining_class)?;
201	write!(f, "{};", self.bidi_class)?;
202	if self.decomposition.is_canonical()
203	&& self.decomposition.mapping() == &[self.codepoint]
204	{
205	write!(f, ";")?;
206	} else {
207	write!(f, "{};", self.decomposition)?;
208	}
209	if let Some(n) = self.numeric_type_decimal {
210	write!(f, "{};", n)?;
211	} else {
212	write!(f, ";")?;
213	}
214	if let Some(n) = self.numeric_type_digit {
215	write!(f, "{};", n)?;
216	} else {
217	write!(f, ";")?;
218	}
219	if let Some(n) = self.numeric_type_numeric {
220	write!(f, "{};", n)?;
221	} else {
222	write!(f, ";")?;
223	}
224	write!(f, "{};", if self.bidi_mirrored { "Y" } else { "N" })?;
225	write!(f, "{};", self.unicode1_name)?;
226	write!(f, "{};", self.iso_comment)?;
227	if let Some(cp) = self.simple_uppercase_mapping {
228	write!(f, "{};", cp)?;
229	} else {
230	write!(f, ";")?;
231	}
232	if let Some(cp) = self.simple_lowercase_mapping {
233	write!(f, "{};", cp)?;
234	} else {
235	write!(f, ";")?;
236	}
237	if let Some(cp) = self.simple_titlecase_mapping {
238	write!(f, "{}", cp)?;
239	}
240	Ok(())
241	}
242	}
243
244	/// Represents a decomposition mapping of a single row in the
245	/// `UnicodeData.txt` file.
246	#[derive(Clone, Debug, Default, Eq, PartialEq)]
247	pub struct UnicodeDataDecomposition {
248	/// The formatting tag associated with this mapping, if present.
249	pub tag: Option<UnicodeDataDecompositionTag>,
250	/// The number of codepoints in this mapping.
251	pub len: usize,
252	/// The codepoints in the mapping. Entries beyond `len` in the mapping
253	/// are always U+0000. If no mapping was present, then this always contains
254	/// a single codepoint corresponding to this row's character.
255	pub mapping: [Codepoint; `18`],
256	}
257
258	impl UnicodeDataDecomposition {
259	/// Create a new decomposition mapping with the given tag and codepoints.
260	///
261	/// If there are too many codepoints, then an error is returned.
262	pub fn new(
263	tag: Option<UnicodeDataDecompositionTag>,
264	mapping: &[Codepoint],
265	) -> Result<UnicodeDataDecomposition, Error> {
266	let mut x = UnicodeDataDecomposition::default();
267	x.tag = tag;
268	for &cp in mapping {
269	x.push(cp)?;
270	}
271	Ok(x)
272	}
273
274	/// Add a new codepoint to this decomposition's mapping.
275	///
276	/// If the mapping is already full, then this returns an error.
277	pub fn push(&mut self, cp: Codepoint) -> Result<(), Error> {
278	if self.len >= self.mapping.len() {
279	return err!(
280	"invalid decomposition mapping (too many codepoints)"
281	);
282	}
283	self.mapping[self.len] = cp;
284	self.len += `1`;
285	Ok(())
286	}
287
288	/// Return the mapping as a slice of codepoints. The slice returned
289	/// has length equivalent to the number of codepoints in this mapping.
290	pub fn mapping(&self) -> &[Codepoint] {
291	&self.mapping[..self.len]
292	}
293
294	/// Returns true if and only if this decomposition mapping is canonical.
295	pub fn is_canonical(&self) -> bool {
296	self.tag.is_none()
297	}
298	}
299
300	impl FromStr for UnicodeDataDecomposition {
301	type Err = Error;
302
303	fn from_str(s: &str) -> Result<UnicodeDataDecomposition, Error> {
304	static WITH_TAG: Lazy<Regex> = Lazy::new(\|\| {
305	Regex::new(r"^(?:<(?P<tag>[^>]+)>)?\s*(?P<chars>[\s0-9A-F]+)$")
306	.unwrap()
307	});
308	static CHARS: Lazy<Regex> =
309	Lazy::new(\|\| Regex::new(r"[0-9A-F]+").unwrap());
310	if s.is_empty() {
311	return err!(
312	"expected non-empty string for \
313	UnicodeDataDecomposition value"
314	);
315	}
316	let caps = match WITH_TAG.captures(s) {
317	Some(caps) => caps,
318	None => return err!("invalid decomposition value"),
319	};
320	let mut decomp = UnicodeDataDecomposition::default();
321	let mut codepoints = s;
322	if let Some(m) = caps.name("tag") {
323	decomp.tag = Some(m.as_str().parse()?);
324	codepoints = &caps["chars"];
325	}
326	for m in CHARS.find_iter(codepoints) {
327	let cp = m.as_str().parse()?;
328	decomp.push(cp)?;
329	}
330	Ok(decomp)
331	}
332	}
333
334	impl fmt::Display for UnicodeDataDecomposition {
335	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
336	if let Some(ref tag: &UnicodeDataDecompositionTag) = self.tag {
337	write!(f, "<{}> ", tag)?;
338	}
339	let mut first: bool = `true`;
340	for cp: &Codepoint in self.mapping() {
341	if !first {
342	write!(f, " ")?;
343	}
344	first = `false`;
345	write!(f, "{}", cp)?;
346	}
347	Ok(())
348	}
349	}
350
/// The formatting tag on a decomposition mapping.
///
/// This is taken from
/// [UAX44, Table 14](https://www.unicode.org/reports/tr44/#Character_Decomposition_Mappings).
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum UnicodeDataDecompositionTag {
    /// The `<font>` tag.
    Font,
    /// The `<noBreak>` tag.
    NoBreak,
    /// The `<initial>` tag.
    Initial,
    /// The `<medial>` tag.
    Medial,
    /// The `<final>` tag.
    Final,
    /// The `<isolated>` tag.
    Isolated,
    /// The `<circle>` tag.
    Circle,
    /// The `<super>` tag.
    Super,
    /// The `<sub>` tag.
    Sub,
    /// The `<vertical>` tag.
    Vertical,
    /// The `<wide>` tag.
    Wide,
    /// The `<narrow>` tag.
    Narrow,
    /// The `<small>` tag.
    Small,
    /// The `<square>` tag.
    Square,
    /// The `<fraction>` tag.
    Fraction,
    /// The `<compat>` tag.
    Compat,
}
390
391	impl FromStr for UnicodeDataDecompositionTag {
392	type Err = Error;
393
394	fn from_str(s: &str) -> Result<UnicodeDataDecompositionTag, Error> {
395	use self::UnicodeDataDecompositionTag::*;
396	Ok(match s {
397	"font" => Font,
398	"noBreak" => NoBreak,
399	"initial" => Initial,
400	"medial" => Medial,
401	"final" => Final,
402	"isolated" => Isolated,
403	"circle" => Circle,
404	"super" => Super,
405	"sub" => Sub,
406	"vertical" => Vertical,
407	"wide" => Wide,
408	"narrow" => Narrow,
409	"small" => Small,
410	"square" => Square,
411	"fraction" => Fraction,
412	"compat" => Compat,
413	_ => return err!("invalid decomposition formatting tag: {}", s),
414	})
415	}
416	}
417
418	impl fmt::Display for UnicodeDataDecompositionTag {
419	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
420	use self::UnicodeDataDecompositionTag::*;
421	let s: &str = match *self {
422	Font => "font",
423	NoBreak => "noBreak",
424	Initial => "initial",
425	Medial => "medial",
426	Final => "final",
427	Isolated => "isolated",
428	Circle => "circle",
429	Super => "super",
430	Sub => "sub",
431	Vertical => "vertical",
432	Wide => "wide",
433	Narrow => "narrow",
434	Small => "small",
435	Square => "square",
436	Fraction => "fraction",
437	Compat => "compat",
438	};
439	write!(f, "{}", s)
440	}
441	}
442
/// The numeric value of a character with `Numeric_Type=Numeric`.
///
/// The value is either a signed integer or a rational number.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum UnicodeDataNumeric {
    /// An integer.
    Integer(i64),
    /// A rational number. The first component is the numerator and the
    /// second is the denominator.
    Rational(i64, i64),
}
454
455	impl FromStr for UnicodeDataNumeric {
456	type Err = Error;
457
458	fn from_str(s: &str) -> Result<UnicodeDataNumeric, Error> {
459	if s.is_empty() {
460	return err!(
461	"expected non-empty string for UnicodeDataNumeric value"
462	);
463	}
464	if let Some(pos) = s.find('/') {
465	let (snum, sden) = (&s[..pos], &s[pos + `1`..]);
466	let num = match snum.parse() {
467	Ok(num) => num,
468	Err(err) => {
469	return err!(
470	"invalid integer numerator '{}': {}",
471	snum,
472	err
473	);
474	}
475	};
476	let den = match sden.parse() {
477	Ok(den) => den,
478	Err(err) => {
479	return err!(
480	"invalid integer denominator '{}': {}",
481	sden,
482	err
483	);
484	}
485	};
486	Ok(UnicodeDataNumeric::Rational(num, den))
487	} else {
488	match s.parse() {
489	Ok(den) => Ok(UnicodeDataNumeric::Integer(den)),
490	Err(err) => {
491	return err!(
492	"invalid integer denominator '{}': {}",
493	s,
494	err
495	);
496	}
497	}
498	}
499	}
500	}
501
502	impl fmt::Display for UnicodeDataNumeric {
503	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
504	match *self {
505	UnicodeDataNumeric::Integer(n: i64) => write!(f, "{}", n),
506	UnicodeDataNumeric::Rational(n: i64, d: i64) => write!(f, "{}/{}", n, d),
507	}
508	}
509	}
510
511	/// An iterator adapter that expands rows in `UnicodeData.txt`.
512	///
513	/// Throughout `UnicodeData.txt`, some assigned codepoints are not explicitly
514	/// represented. Instead, they are represented by a pair of rows, indicating
515	/// a range of codepoints with the same properties. For example, the Hangul
516	/// syllable codepoints are represented by these two rows:
517	///
518	/// ```ignore
519	/// AC00;<Hangul Syllable, First>;Lo;`0`;L;;;;;N;;;;;
520	/// D7A3;<Hangul Syllable, Last>;Lo;`0`;L;;;;;N;;;;;
521	/// ```
522	///
523	/// This iterator will wrap any iterator of `UnicodeData` and, when a range of
524	/// Unicode codepoints is found, it will be expanded to the appropriate
525	/// sequence of `UnicodeData` values. Note that all such expanded records will
526	/// have an empty name.
527	pub struct UnicodeDataExpander<I: Iterator> {
528	/// The underlying iterator.
529	it: iter::Peekable<I>,
530	/// A range of codepoints to emit when we've found a pair. Otherwise,
531	/// `None`.
532	range: CodepointRange,
533	}
534
535	struct CodepointRange {
536	/// The codepoint range.
537	range: Range<u32>,
538	/// The start record. All subsequent records in this range are generated
539	/// by cloning this and updating the codepoint/name.
540	start_record: UnicodeData,
541	}
542
543	impl<I: Iterator<Item = UnicodeData>> UnicodeDataExpander<I> {
544	/// Create a new iterator that expands pairs of `UnicodeData` range
545	/// records. All other records are passed through as-is.
546	pub fn new<T>(it: T) -> UnicodeDataExpander<I>
547	where
548	T: IntoIterator<IntoIter = I, Item = I::Item>,
549	{
550	UnicodeDataExpander {
551	it: it.into_iter().peekable(),
552	range: CodepointRange {
553	range: `0`..`0`,
554	start_record: UnicodeData::default(),
555	},
556	}
557	}
558	}
559
560	impl<I: Iterator<Item = UnicodeData>> Iterator for UnicodeDataExpander<I> {
561	type Item = UnicodeData;
562
563	fn next(&mut self) -> Option<UnicodeData> {
564	if let Some(udata: UnicodeData) = self.range.next() {
565	return Some(udata);
566	}
567	let row1: UnicodeData = match self.it.next() {
568	None => return None,
569	Some(row1: UnicodeData) => row1,
570	};
571	if !row1.is_range_start()
572	\|\| !self.it.peek().map_or(default:`false`, \|row2: &UnicodeData\| row2.is_range_end())
573	{
574	return Some(row1);
575	}
576	let row2: UnicodeData = self.it.next().unwrap();
577	self.range = CodepointRange {
578	range: row1.codepoint.value()..(row2.codepoint.value() + `1`),
579	start_record: row1,
580	};
581	self.next()
582	}
583	}
584
585	impl Iterator for CodepointRange {
586	type Item = UnicodeData;
587
588	fn next(&mut self) -> Option<UnicodeData> {
589	let cp: u32 = match self.range.next() {
590	None => return None,
591	Some(cp: u32) => cp,
592	};
593	Some(UnicodeData {
594	codepoint: Codepoint::from_u32(cp).unwrap(),
595	name: "".to_string(),
596	..self.start_record.clone()
597	})
598	}
599	}
600
#[cfg(test)]
mod tests {
    use crate::common::Codepoint;

    use super::{
        UnicodeData, UnicodeDataDecomposition, UnicodeDataDecompositionTag,
        UnicodeDataNumeric,
    };

    /// Builds a `Codepoint` from a value known to be valid.
    fn codepoint(n: u32) -> Codepoint {
        Codepoint::from_u32(n).unwrap()
    }

    /// Shorthand for building an owned `String`.
    fn s(string: &str) -> String {
        string.to_string()
    }

    // A row with a tagged (compatibility) decomposition.
    #[test]
    fn parse1() {
        let line = "249D;PARENTHESIZED LATIN SMALL LETTER B;So;0;L;<compat> 0028 0062 0029;;;;N;;;;;\n";
        let data: UnicodeData = line.parse().unwrap();
        assert_eq!(
            data,
            UnicodeData {
                codepoint: codepoint(0x249d),
                name: s("PARENTHESIZED LATIN SMALL LETTER B"),
                general_category: s("So"),
                canonical_combining_class: 0,
                bidi_class: s("L"),
                decomposition: UnicodeDataDecomposition::new(
                    Some(UnicodeDataDecompositionTag::Compat),
                    &[codepoint(0x28), codepoint(0x62), codepoint(0x29)],
                )
                .unwrap(),
                numeric_type_decimal: None,
                numeric_type_digit: None,
                numeric_type_numeric: None,
                bidi_mirrored: false,
                unicode1_name: s(""),
                iso_comment: s(""),
                simple_uppercase_mapping: None,
                simple_lowercase_mapping: None,
                simple_titlecase_mapping: None,
            }
        );
    }

    // A row with no decomposition and a Unicode 1.0 name.
    #[test]
    fn parse2() {
        let line = "000D;<control>;Cc;0;B;;;;;N;CARRIAGE RETURN (CR);;;;\n";
        let data: UnicodeData = line.parse().unwrap();
        assert_eq!(
            data,
            UnicodeData {
                codepoint: codepoint(0x000D),
                name: s("<control>"),
                general_category: s("Cc"),
                canonical_combining_class: 0,
                bidi_class: s("B"),
                decomposition: UnicodeDataDecomposition::new(
                    None,
                    &[codepoint(0x000D)]
                )
                .unwrap(),
                numeric_type_decimal: None,
                numeric_type_digit: None,
                numeric_type_numeric: None,
                bidi_mirrored: false,
                unicode1_name: s("CARRIAGE RETURN (CR)"),
                iso_comment: s(""),
                simple_uppercase_mapping: None,
                simple_lowercase_mapping: None,
                simple_titlecase_mapping: None,
            }
        );
    }

    // A row with a rational numeric value.
    #[test]
    fn parse3() {
        let line = "00BC;VULGAR FRACTION ONE QUARTER;No;0;ON;<fraction> 0031 2044 0034;;;1/4;N;FRACTION ONE QUARTER;;;;\n";
        let data: UnicodeData = line.parse().unwrap();
        assert_eq!(
            data,
            UnicodeData {
                codepoint: codepoint(0x00BC),
                name: s("VULGAR FRACTION ONE QUARTER"),
                general_category: s("No"),
                canonical_combining_class: 0,
                bidi_class: s("ON"),
                decomposition: UnicodeDataDecomposition::new(
                    Some(UnicodeDataDecompositionTag::Fraction),
                    &[codepoint(0x31), codepoint(0x2044), codepoint(0x34)],
                )
                .unwrap(),
                numeric_type_decimal: None,
                numeric_type_digit: None,
                numeric_type_numeric: Some(UnicodeDataNumeric::Rational(1, 4)),
                bidi_mirrored: false,
                unicode1_name: s("FRACTION ONE QUARTER"),
                iso_comment: s(""),
                simple_uppercase_mapping: None,
                simple_lowercase_mapping: None,
                simple_titlecase_mapping: None,
            }
        );
    }

    // A row with a simple lowercase mapping.
    #[test]
    fn parse4() {
        let line = "0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;\n";
        let data: UnicodeData = line.parse().unwrap();
        assert_eq!(
            data,
            UnicodeData {
                codepoint: codepoint(0x0041),
                name: s("LATIN CAPITAL LETTER A"),
                general_category: s("Lu"),
                canonical_combining_class: 0,
                bidi_class: s("L"),
                decomposition: UnicodeDataDecomposition::new(
                    None,
                    &[codepoint(0x0041)]
                )
                .unwrap(),
                numeric_type_decimal: None,
                numeric_type_digit: None,
                numeric_type_numeric: None,
                bidi_mirrored: false,
                unicode1_name: s(""),
                iso_comment: s(""),
                simple_uppercase_mapping: None,
                simple_lowercase_mapping: Some(codepoint(0x0061)),
                simple_titlecase_mapping: None,
            }
        );
    }

    // A row with a negative rational numeric value.
    #[test]
    fn parse5() {
        let line = "0F33;TIBETAN DIGIT HALF ZERO;No;0;L;;;;-1/2;N;;;;;\n";
        let data: UnicodeData = line.parse().unwrap();
        assert_eq!(
            data,
            UnicodeData {
                codepoint: codepoint(0x0F33),
                name: s("TIBETAN DIGIT HALF ZERO"),
                general_category: s("No"),
                canonical_combining_class: 0,
                bidi_class: s("L"),
                decomposition: UnicodeDataDecomposition::new(
                    None,
                    &[codepoint(0x0F33)]
                )
                .unwrap(),
                numeric_type_decimal: None,
                numeric_type_digit: None,
                numeric_type_numeric: Some(UnicodeDataNumeric::Rational(
                    -1, 2
                )),
                bidi_mirrored: false,
                unicode1_name: s(""),
                iso_comment: s(""),
                simple_uppercase_mapping: None,
                simple_lowercase_mapping: None,
                simple_titlecase_mapping: None,
            }
        );
    }

    // Two ordinary rows plus the Hangul syllable range U+AC00..=U+D7A3
    // (11172 codepoints) must expand to 11174 records.
    #[test]
    fn expander() {
        use super::UnicodeDataExpander;
        use crate::common::UcdLineParser;

        let data = "\
ABF9;MEETEI MAYEK DIGIT NINE;Nd;0;L;;9;9;9;N;;;;;
AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
D7B0;HANGUL JUNGSEONG O-YEO;Lo;0;L;;;;;N;;;;;
";
        let records = UcdLineParser::new(None, data.as_bytes())
            .collect::<Result<Vec<_>, _>>()
            .unwrap();
        assert_eq!(UnicodeDataExpander::new(records).count(), 11174);
    }
}
787