1use std::path::Path;
2
3use crate::{
4 common::{Codepoint, CodepointIter, UcdFile, UcdFileByCodepoint},
5 error::Error,
6};
7
/// Represents a single row in the `UnicodeData.txt` file.
///
/// These fields were taken from UAX44, Table 9, as part of the documentation
/// for the
/// [`UnicodeData.txt` file](https://www.unicode.org/reports/tr44/#UnicodeData.txt).
///
/// A row is parsed from its semicolon-delimited textual form via `FromStr`
/// and written back in the same form via `Display`.
#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct UnicodeData {
    /// The codepoint corresponding to this row.
    pub codepoint: Codepoint,
    /// The name of this codepoint.
    ///
    /// For the two rows that delimit a range this is a bracketed label such
    /// as `<Hangul Syllable, First>` rather than a character name.
    pub name: String,
    /// The "general category" of this codepoint.
    pub general_category: String,
    /// The class of this codepoint used in the Canonical Ordering Algorithm.
    ///
    /// Note that some classes map to a particular symbol. See
    /// [UAX44, Table 15](https://www.unicode.org/reports/tr44/#Canonical_Combining_Class_Values).
    pub canonical_combining_class: u8,
    /// The bidirectional class of this codepoint.
    ///
    /// Possible values are listed in
    /// [UAX44, Table 13](https://www.unicode.org/reports/tr44/#Bidi_Class_Values).
    pub bidi_class: String,
    /// The decomposition mapping for this codepoint. This includes its
    /// formatting tag (if present).
    ///
    /// When the row's decomposition field is empty, the parser stores an
    /// untagged mapping consisting of just this row's own codepoint.
    pub decomposition: UnicodeDataDecomposition,
    /// A decimal numeric representation of this codepoint, if it has the
    /// property `Numeric_Type=Decimal`.
    pub numeric_type_decimal: Option<u8>,
    /// A decimal numeric representation of this codepoint, if it has the
    /// property `Numeric_Type=Digit`. Note that while this field is still
    /// populated for existing codepoints, no new codepoints will have this
    /// field populated.
    pub numeric_type_digit: Option<u8>,
    /// A decimal or rational numeric representation of this codepoint, if it
    /// has the property `Numeric_Type=Numeric`.
    pub numeric_type_numeric: Option<UnicodeDataNumeric>,
    /// A boolean indicating whether this codepoint is "mirrored" in
    /// bidirectional text. This is `true` exactly when the field is `Y`.
    pub bidi_mirrored: bool,
    /// The "old" Unicode 1.0 or ISO 6429 name of this codepoint. Note that
    /// this field is empty unless it is significantly different from
    /// the `name` field.
    pub unicode1_name: String,
    /// The ISO 10646 comment field. This no longer contains any non-NULL
    /// values.
    pub iso_comment: String,
    /// This codepoint's simple uppercase mapping, if it exists.
    pub simple_uppercase_mapping: Option<Codepoint>,
    /// This codepoint's simple lowercase mapping, if it exists.
    pub simple_lowercase_mapping: Option<Codepoint>,
    /// This codepoint's simple titlecase mapping, if it exists.
    pub simple_titlecase_mapping: Option<Codepoint>,
}
62
63impl UcdFile for UnicodeData {
64 fn relative_file_path() -> &'static Path {
65 Path::new("UnicodeData.txt")
66 }
67}
68
69impl UcdFileByCodepoint for UnicodeData {
70 fn codepoints(&self) -> CodepointIter {
71 self.codepoint.into_iter()
72 }
73}
74
75impl UnicodeData {
76 /// Returns true if and only if this record corresponds to the start of a
77 /// range.
78 pub fn is_range_start(&self) -> bool {
79 self.name.starts_with('<')
80 && self.name.ends_with('>')
81 && self.name.contains("First")
82 }
83
84 /// Returns true if and only if this record corresponds to the end of a
85 /// range.
86 pub fn is_range_end(&self) -> bool {
87 self.name.starts_with('<')
88 && self.name.ends_with('>')
89 && self.name.contains("Last")
90 }
91}
92
93impl std::str::FromStr for UnicodeData {
94 type Err = Error;
95
96 fn from_str(line: &str) -> Result<UnicodeData, Error> {
97 let re_parts = regex!(
98 r"(?x)
99 ^
100 ([A-Z0-9]+); # 1; codepoint
101 ([^;]+); # 2; name
102 ([^;]+); # 3; general category
103 ([0-9]+); # 4; canonical combining class
104 ([^;]+); # 5; bidi class
105 ([^;]*); # 6; decomposition
106 ([0-9]*); # 7; numeric type decimal
107 ([0-9]*); # 8; numeric type digit
108 ([-0-9/]*); # 9; numeric type numeric
109 ([YN]); # 10; bidi mirrored
110 ([^;]*); # 11; unicode1 name
111 ([^;]*); # 12; ISO comment
112 ([^;]*); # 13; simple uppercase mapping
113 ([^;]*); # 14; simple lowercase mapping
114 ([^;]*) # 15; simple titlecase mapping
115 $
116 ",
117 );
118
119 let caps = match re_parts.captures(line.trim()) {
120 Some(caps) => caps,
121 None => return err!("invalid UnicodeData line"),
122 };
123 let capget = |n| caps.get(n).unwrap().as_str();
124 let mut data = UnicodeData::default();
125
126 data.codepoint = capget(1).parse()?;
127 data.name = capget(2).to_string();
128 data.general_category = capget(3).to_string();
129 data.canonical_combining_class = match capget(4).parse() {
130 Ok(n) => n,
131 Err(err) => {
132 return err!(
133 "failed to parse canonical combining class '{}': {}",
134 capget(4),
135 err
136 )
137 }
138 };
139 data.bidi_class = capget(5).to_string();
140 if !caps[6].is_empty() {
141 data.decomposition = caps[6].parse()?;
142 } else {
143 data.decomposition.push(data.codepoint)?;
144 }
145 if !capget(7).is_empty() {
146 data.numeric_type_decimal = Some(match capget(7).parse() {
147 Ok(n) => n,
148 Err(err) => {
149 return err!(
150 "failed to parse numeric type decimal '{}': {}",
151 capget(7),
152 err
153 )
154 }
155 });
156 }
157 if !capget(8).is_empty() {
158 data.numeric_type_digit = Some(match capget(8).parse() {
159 Ok(n) => n,
160 Err(err) => {
161 return err!(
162 "failed to parse numeric type digit '{}': {}",
163 capget(8),
164 err
165 )
166 }
167 });
168 }
169 if !capget(9).is_empty() {
170 data.numeric_type_numeric = Some(capget(9).parse()?);
171 }
172 data.bidi_mirrored = capget(10) == "Y";
173 data.unicode1_name = capget(11).to_string();
174 data.iso_comment = capget(12).to_string();
175 if !capget(13).is_empty() {
176 data.simple_uppercase_mapping = Some(capget(13).parse()?);
177 }
178 if !capget(14).is_empty() {
179 data.simple_lowercase_mapping = Some(capget(14).parse()?);
180 }
181 if !capget(15).is_empty() {
182 data.simple_titlecase_mapping = Some(capget(15).parse()?);
183 }
184 Ok(data)
185 }
186}
187
188impl std::fmt::Display for UnicodeData {
189 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
190 write!(f, "{};", self.codepoint)?;
191 write!(f, "{};", self.name)?;
192 write!(f, "{};", self.general_category)?;
193 write!(f, "{};", self.canonical_combining_class)?;
194 write!(f, "{};", self.bidi_class)?;
195 if self.decomposition.is_canonical()
196 && self.decomposition.mapping() == &[self.codepoint]
197 {
198 write!(f, ";")?;
199 } else {
200 write!(f, "{};", self.decomposition)?;
201 }
202 if let Some(n) = self.numeric_type_decimal {
203 write!(f, "{};", n)?;
204 } else {
205 write!(f, ";")?;
206 }
207 if let Some(n) = self.numeric_type_digit {
208 write!(f, "{};", n)?;
209 } else {
210 write!(f, ";")?;
211 }
212 if let Some(n) = self.numeric_type_numeric {
213 write!(f, "{};", n)?;
214 } else {
215 write!(f, ";")?;
216 }
217 write!(f, "{};", if self.bidi_mirrored { "Y" } else { "N" })?;
218 write!(f, "{};", self.unicode1_name)?;
219 write!(f, "{};", self.iso_comment)?;
220 if let Some(cp) = self.simple_uppercase_mapping {
221 write!(f, "{};", cp)?;
222 } else {
223 write!(f, ";")?;
224 }
225 if let Some(cp) = self.simple_lowercase_mapping {
226 write!(f, "{};", cp)?;
227 } else {
228 write!(f, ";")?;
229 }
230 if let Some(cp) = self.simple_titlecase_mapping {
231 write!(f, "{}", cp)?;
232 }
233 Ok(())
234 }
235}
236
/// Represents a decomposition mapping of a single row in the
/// `UnicodeData.txt` file.
///
/// The mapping is stored inline in a fixed-size array, so it can hold at most
/// 18 codepoints; `len` records how many of those slots are in use.
#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct UnicodeDataDecomposition {
    /// The formatting tag associated with this mapping, if present.
    ///
    /// A mapping without a tag is considered canonical.
    pub tag: Option<UnicodeDataDecompositionTag>,
    /// The number of codepoints in this mapping.
    pub len: usize,
    /// The codepoints in the mapping. Entries beyond `len` in the mapping
    /// are always U+0000. If no mapping was present, then this always contains
    /// a single codepoint corresponding to this row's character.
    pub mapping: [Codepoint; 18],
}
250
251impl UnicodeDataDecomposition {
252 /// Create a new decomposition mapping with the given tag and codepoints.
253 ///
254 /// If there are too many codepoints, then an error is returned.
255 pub fn new(
256 tag: Option<UnicodeDataDecompositionTag>,
257 mapping: &[Codepoint],
258 ) -> Result<UnicodeDataDecomposition, Error> {
259 let mut x = UnicodeDataDecomposition::default();
260 x.tag = tag;
261 for &cp in mapping {
262 x.push(cp)?;
263 }
264 Ok(x)
265 }
266
267 /// Add a new codepoint to this decomposition's mapping.
268 ///
269 /// If the mapping is already full, then this returns an error.
270 pub fn push(&mut self, cp: Codepoint) -> Result<(), Error> {
271 if self.len >= self.mapping.len() {
272 return err!(
273 "invalid decomposition mapping (too many codepoints)"
274 );
275 }
276 self.mapping[self.len] = cp;
277 self.len += 1;
278 Ok(())
279 }
280
281 /// Return the mapping as a slice of codepoints. The slice returned
282 /// has length equivalent to the number of codepoints in this mapping.
283 pub fn mapping(&self) -> &[Codepoint] {
284 &self.mapping[..self.len]
285 }
286
287 /// Returns true if and only if this decomposition mapping is canonical.
288 pub fn is_canonical(&self) -> bool {
289 self.tag.is_none()
290 }
291}
292
293impl std::str::FromStr for UnicodeDataDecomposition {
294 type Err = Error;
295
296 fn from_str(s: &str) -> Result<UnicodeDataDecomposition, Error> {
297 let re_with_tag =
298 regex!(r"^(?:<(?P<tag>[^>]+)>)?\s*(?P<chars>[\s0-9A-F]+)$");
299 let re_chars = regex!(r"[0-9A-F]+");
300 if s.is_empty() {
301 return err!(
302 "expected non-empty string for \
303 UnicodeDataDecomposition value"
304 );
305 }
306 let caps = match re_with_tag.captures(s) {
307 Some(caps) => caps,
308 None => return err!("invalid decomposition value"),
309 };
310 let mut decomp = UnicodeDataDecomposition::default();
311 let mut codepoints = s;
312 if let Some(m) = caps.name("tag") {
313 decomp.tag = Some(m.as_str().parse()?);
314 codepoints = &caps["chars"];
315 }
316 for m in re_chars.find_iter(codepoints) {
317 let cp = m.as_str().parse()?;
318 decomp.push(cp)?;
319 }
320 Ok(decomp)
321 }
322}
323
324impl std::fmt::Display for UnicodeDataDecomposition {
325 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
326 if let Some(ref tag: &UnicodeDataDecompositionTag) = self.tag {
327 write!(f, "<{}> ", tag)?;
328 }
329 let mut first: bool = true;
330 for cp: &Codepoint in self.mapping() {
331 if !first {
332 write!(f, " ")?;
333 }
334 first = false;
335 write!(f, "{}", cp)?;
336 }
337 Ok(())
338 }
339}
340
/// The formatting tag on a decomposition mapping.
///
/// This is taken from
/// [UAX44, Table 14](https://www.unicode.org/reports/tr44/#Character_Decomposition_Mappings).
///
/// Each variant corresponds to the tag of the same name as it is written,
/// inside angle brackets, in `UnicodeData.txt`.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum UnicodeDataDecompositionTag {
    /// The `<font>` tag.
    Font,
    /// The `<noBreak>` tag.
    NoBreak,
    /// The `<initial>` tag.
    Initial,
    /// The `<medial>` tag.
    Medial,
    /// The `<final>` tag.
    Final,
    /// The `<isolated>` tag.
    Isolated,
    /// The `<circle>` tag.
    Circle,
    /// The `<super>` tag.
    Super,
    /// The `<sub>` tag.
    Sub,
    /// The `<vertical>` tag.
    Vertical,
    /// The `<wide>` tag.
    Wide,
    /// The `<narrow>` tag.
    Narrow,
    /// The `<small>` tag.
    Small,
    /// The `<square>` tag.
    Square,
    /// The `<fraction>` tag.
    Fraction,
    /// The `<compat>` tag.
    Compat,
}
380
381impl std::str::FromStr for UnicodeDataDecompositionTag {
382 type Err = Error;
383
384 fn from_str(s: &str) -> Result<UnicodeDataDecompositionTag, Error> {
385 use self::UnicodeDataDecompositionTag::*;
386 Ok(match s {
387 "font" => Font,
388 "noBreak" => NoBreak,
389 "initial" => Initial,
390 "medial" => Medial,
391 "final" => Final,
392 "isolated" => Isolated,
393 "circle" => Circle,
394 "super" => Super,
395 "sub" => Sub,
396 "vertical" => Vertical,
397 "wide" => Wide,
398 "narrow" => Narrow,
399 "small" => Small,
400 "square" => Square,
401 "fraction" => Fraction,
402 "compat" => Compat,
403 _ => return err!("invalid decomposition formatting tag: {}", s),
404 })
405 }
406}
407
408impl std::fmt::Display for UnicodeDataDecompositionTag {
409 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
410 use self::UnicodeDataDecompositionTag::*;
411 let s: &'static str = match *self {
412 Font => "font",
413 NoBreak => "noBreak",
414 Initial => "initial",
415 Medial => "medial",
416 Final => "final",
417 Isolated => "isolated",
418 Circle => "circle",
419 Super => "super",
420 Sub => "sub",
421 Vertical => "vertical",
422 Wide => "wide",
423 Narrow => "narrow",
424 Small => "small",
425 Square => "square",
426 Fraction => "fraction",
427 Compat => "compat",
428 };
429 write!(f, "{}", s)
430 }
431}
432
/// A numeric value corresponding to characters with `Numeric_Type=Numeric`.
///
/// A numeric value can either be a signed integer or a rational number.
/// In textual form an integer is written as is (e.g. `9`) and a rational is
/// written as `numerator/denominator` (e.g. `-1/2`).
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum UnicodeDataNumeric {
    /// An integer.
    Integer(i64),
    /// A rational number. The first is the numerator and the latter is the
    /// denominator. The pair is stored exactly as given; parsing does not
    /// reduce the fraction or check the denominator.
    Rational(i64, i64),
}
444
445impl std::str::FromStr for UnicodeDataNumeric {
446 type Err = Error;
447
448 fn from_str(s: &str) -> Result<UnicodeDataNumeric, Error> {
449 if s.is_empty() {
450 return err!(
451 "expected non-empty string for UnicodeDataNumeric value"
452 );
453 }
454 if let Some(pos) = s.find('/') {
455 let (snum, sden) = (&s[..pos], &s[pos + 1..]);
456 let num = match snum.parse() {
457 Ok(num) => num,
458 Err(err) => {
459 return err!(
460 "invalid integer numerator '{}': {}",
461 snum,
462 err
463 );
464 }
465 };
466 let den = match sden.parse() {
467 Ok(den) => den,
468 Err(err) => {
469 return err!(
470 "invalid integer denominator '{}': {}",
471 sden,
472 err
473 );
474 }
475 };
476 Ok(UnicodeDataNumeric::Rational(num, den))
477 } else {
478 match s.parse() {
479 Ok(den) => Ok(UnicodeDataNumeric::Integer(den)),
480 Err(err) => {
481 return err!(
482 "invalid integer denominator '{}': {}",
483 s,
484 err
485 );
486 }
487 }
488 }
489 }
490}
491
492impl std::fmt::Display for UnicodeDataNumeric {
493 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
494 match *self {
495 UnicodeDataNumeric::Integer(n: i64) => write!(f, "{}", n),
496 UnicodeDataNumeric::Rational(n: i64, d: i64) => write!(f, "{}/{}", n, d),
497 }
498 }
499}
500
/// An iterator adapter that expands rows in `UnicodeData.txt`.
///
/// Throughout `UnicodeData.txt`, some assigned codepoints are not explicitly
/// represented. Instead, they are represented by a pair of rows, indicating
/// a range of codepoints with the same properties. For example, the Hangul
/// syllable codepoints are represented by these two rows:
///
/// ```ignore
/// AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
/// D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
/// ```
///
/// This iterator will wrap any iterator of `UnicodeData` and, when a range of
/// Unicode codepoints is found, it will be expanded to the appropriate
/// sequence of `UnicodeData` values. Note that all such expanded records will
/// have an empty name.
pub struct UnicodeDataExpander<I: Iterator> {
    /// The underlying iterator. It is peekable so that the row following a
    /// range start can be inspected before deciding whether to consume it.
    it: std::iter::Peekable<I>,
    /// A range of codepoints to emit when we've found a pair. Otherwise,
    /// an empty range that yields nothing.
    range: CodepointRange,
}
524
/// The state of a range expansion in progress: the codepoint values that
/// still have to be emitted, plus the record they are all derived from.
struct CodepointRange {
    /// The codepoint range. The end bound is exclusive, and values are
    /// removed from the front as records are emitted.
    range: std::ops::Range<u32>,
    /// The start record. All subsequent records in this range are generated
    /// by cloning this and updating the codepoint/name.
    start_record: UnicodeData,
}
532
533impl<I: Iterator<Item = UnicodeData>> UnicodeDataExpander<I> {
534 /// Create a new iterator that expands pairs of `UnicodeData` range
535 /// records. All other records are passed through as-is.
536 pub fn new<T>(it: T) -> UnicodeDataExpander<I>
537 where
538 T: IntoIterator<IntoIter = I, Item = I::Item>,
539 {
540 UnicodeDataExpander {
541 it: it.into_iter().peekable(),
542 range: CodepointRange {
543 range: 0..0,
544 start_record: UnicodeData::default(),
545 },
546 }
547 }
548}
549
550impl<I: Iterator<Item = UnicodeData>> Iterator for UnicodeDataExpander<I> {
551 type Item = UnicodeData;
552
553 fn next(&mut self) -> Option<UnicodeData> {
554 if let Some(udata: UnicodeData) = self.range.next() {
555 return Some(udata);
556 }
557 let row1: UnicodeData = match self.it.next() {
558 None => return None,
559 Some(row1: UnicodeData) => row1,
560 };
561 if !row1.is_range_start()
562 || !self.it.peek().map_or(default:false, |row2: &UnicodeData| row2.is_range_end())
563 {
564 return Some(row1);
565 }
566 let row2: UnicodeData = self.it.next().unwrap();
567 self.range = CodepointRange {
568 range: row1.codepoint.value()..(row2.codepoint.value() + 1),
569 start_record: row1,
570 };
571 self.next()
572 }
573}
574
575impl Iterator for CodepointRange {
576 type Item = UnicodeData;
577
578 fn next(&mut self) -> Option<UnicodeData> {
579 let cp: u32 = match self.range.next() {
580 None => return None,
581 Some(cp: u32) => cp,
582 };
583 Some(UnicodeData {
584 codepoint: Codepoint::from_u32(cp).unwrap(),
585 name: "".to_string(),
586 ..self.start_record.clone()
587 })
588 }
589}
590
#[cfg(test)]
mod tests {
    use crate::common::Codepoint;

    use super::{
        UnicodeData, UnicodeDataDecomposition, UnicodeDataDecompositionTag,
        UnicodeDataNumeric,
    };

    /// Builds a codepoint from a scalar value known to be valid.
    fn codepoint(n: u32) -> Codepoint {
        Codepoint::from_u32(n).unwrap()
    }

    /// Builds a decomposition from an optional tag and raw scalar values.
    fn decomposition(
        tag: Option<UnicodeDataDecompositionTag>,
        scalars: &[u32],
    ) -> UnicodeDataDecomposition {
        let cps: Vec<Codepoint> =
            scalars.iter().map(|&n| codepoint(n)).collect();
        UnicodeDataDecomposition::new(tag, &cps).unwrap()
    }

    /// Parses a line that is expected to be a valid row.
    fn parse(line: &str) -> UnicodeData {
        line.parse().unwrap()
    }

    // A row with a tagged (compatibility) decomposition.
    #[test]
    fn parse1() {
        let expected = UnicodeData {
            codepoint: codepoint(0x249d),
            name: "PARENTHESIZED LATIN SMALL LETTER B".to_string(),
            general_category: "So".to_string(),
            canonical_combining_class: 0,
            bidi_class: "L".to_string(),
            decomposition: decomposition(
                Some(UnicodeDataDecompositionTag::Compat),
                &[0x28, 0x62, 0x29],
            ),
            bidi_mirrored: false,
            ..UnicodeData::default()
        };
        let line = "249D;PARENTHESIZED LATIN SMALL LETTER B;So;0;L;<compat> 0028 0062 0029;;;;N;;;;;\n";
        assert_eq!(parse(line), expected);
    }

    // A row with a bracketed name and a Unicode 1.0 name; no decomposition,
    // so the mapping is the codepoint itself.
    #[test]
    fn parse2() {
        let expected = UnicodeData {
            codepoint: codepoint(0x000D),
            name: "<control>".to_string(),
            general_category: "Cc".to_string(),
            canonical_combining_class: 0,
            bidi_class: "B".to_string(),
            decomposition: decomposition(None, &[0x000D]),
            bidi_mirrored: false,
            unicode1_name: "CARRIAGE RETURN (CR)".to_string(),
            ..UnicodeData::default()
        };
        let line = "000D;<control>;Cc;0;B;;;;;N;CARRIAGE RETURN (CR);;;;\n";
        assert_eq!(parse(line), expected);
    }

    // A row with a rational numeric value.
    #[test]
    fn parse3() {
        let expected = UnicodeData {
            codepoint: codepoint(0x00BC),
            name: "VULGAR FRACTION ONE QUARTER".to_string(),
            general_category: "No".to_string(),
            canonical_combining_class: 0,
            bidi_class: "ON".to_string(),
            decomposition: decomposition(
                Some(UnicodeDataDecompositionTag::Fraction),
                &[0x31, 0x2044, 0x34],
            ),
            numeric_type_numeric: Some(UnicodeDataNumeric::Rational(1, 4)),
            bidi_mirrored: false,
            unicode1_name: "FRACTION ONE QUARTER".to_string(),
            ..UnicodeData::default()
        };
        let line = "00BC;VULGAR FRACTION ONE QUARTER;No;0;ON;<fraction> 0031 2044 0034;;;1/4;N;FRACTION ONE QUARTER;;;;\n";
        assert_eq!(parse(line), expected);
    }

    // A row with a simple lowercase mapping.
    #[test]
    fn parse4() {
        let expected = UnicodeData {
            codepoint: codepoint(0x0041),
            name: "LATIN CAPITAL LETTER A".to_string(),
            general_category: "Lu".to_string(),
            canonical_combining_class: 0,
            bidi_class: "L".to_string(),
            decomposition: decomposition(None, &[0x0041]),
            bidi_mirrored: false,
            simple_lowercase_mapping: Some(codepoint(0x0061)),
            ..UnicodeData::default()
        };
        let line = "0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;\n";
        assert_eq!(parse(line), expected);
    }

    // A row with a negative rational numeric value.
    #[test]
    fn parse5() {
        let expected = UnicodeData {
            codepoint: codepoint(0x0F33),
            name: "TIBETAN DIGIT HALF ZERO".to_string(),
            general_category: "No".to_string(),
            canonical_combining_class: 0,
            bidi_class: "L".to_string(),
            decomposition: decomposition(None, &[0x0F33]),
            numeric_type_numeric: Some(UnicodeDataNumeric::Rational(-1, 2)),
            bidi_mirrored: false,
            ..UnicodeData::default()
        };
        let line = "0F33;TIBETAN DIGIT HALF ZERO;No;0;L;;;;-1/2;N;;;;;\n";
        assert_eq!(parse(line), expected);
    }

    // Two ordinary rows around one First/Last pair: the pair expands to
    // every codepoint from AC00 through D7A3.
    #[test]
    fn expander() {
        use super::UnicodeDataExpander;
        use crate::common::UcdLineParser;

        let data = "\
ABF9;MEETEI MAYEK DIGIT NINE;Nd;0;L;;9;9;9;N;;;;;
AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
D7B0;HANGUL JUNGSEONG O-YEO;Lo;0;L;;;;;N;;;;;
";
        let records: Vec<UnicodeData> =
            UcdLineParser::new(None, data.as_bytes())
                .collect::<Result<_, _>>()
                .unwrap();
        let expanded = UnicodeDataExpander::new(records);
        assert_eq!(expanded.count(), 11174);
    }
}
777