1use std::fmt;
2use std::iter;
3use std::ops::Range;
4use std::path::Path;
5use std::str::FromStr;
6
7use once_cell::sync::Lazy;
8use regex::Regex;
9
10use crate::common::{Codepoint, CodepointIter, UcdFile, UcdFileByCodepoint};
11use crate::error::Error;
12
13/// Represents a single row in the `UnicodeData.txt` file.
14///
15/// These fields were taken from UAX44, Table 9, as part of the documentation
16/// for the
17/// [`UnicodeData.txt` file](https://www.unicode.org/reports/tr44/#UnicodeData.txt).
18#[derive(Clone, Debug, Default, Eq, PartialEq)]
19pub struct UnicodeData {
20 /// The codepoint corresponding to this row.
21 pub codepoint: Codepoint,
22 /// The name of this codepoint.
23 pub name: String,
24 /// The "general category" of this codepoint.
25 pub general_category: String,
26 /// The class of this codepoint used in the Canonical Ordering Algorithm.
27 ///
28 /// Note that some classes map to a particular symbol. See
29 /// [UAX44, Table 15](https://www.unicode.org/reports/tr44/#Canonical_Combining_Class_Values).
30 pub canonical_combining_class: u8,
31 /// The bidirectional class of this codepoint.
32 ///
33 /// Possible values are listed in
34 /// [UAX44, Table 13](https://www.unicode.org/reports/tr44/#Bidi_Class_Values).
35 pub bidi_class: String,
36 /// The decomposition mapping for this codepoint. This includes its
37 /// formatting tag (if present).
38 pub decomposition: UnicodeDataDecomposition,
39 /// A decimal numeric representation of this codepoint, if it has the
40 /// property `Numeric_Type=Decimal`.
41 pub numeric_type_decimal: Option<u8>,
42 /// A decimal numeric representation of this codepoint, if it has the
43 /// property `Numeric_Type=Digit`. Note that while this field is still
44 /// populated for existing codepoints, no new codepoints will have this
45 /// field populated.
46 pub numeric_type_digit: Option<u8>,
47 /// A decimal or rational numeric representation of this codepoint, if it
48 /// has the property `Numeric_Type=Numeric`.
49 pub numeric_type_numeric: Option<UnicodeDataNumeric>,
50 /// A boolean indicating whether this codepoint is "mirrored" in
51 /// bidirectional text.
52 pub bidi_mirrored: bool,
53 /// The "old" Unicode 1.0 or ISO 6429 name of this codepoint. Note that
54 /// this field is empty unless it is significantly different from
55 /// the `name` field.
56 pub unicode1_name: String,
57 /// The ISO 10464 comment field. This no longer contains any non-NULL
58 /// values.
59 pub iso_comment: String,
60 /// This codepoint's simple uppercase mapping, if it exists.
61 pub simple_uppercase_mapping: Option<Codepoint>,
62 /// This codepoint's simple lowercase mapping, if it exists.
63 pub simple_lowercase_mapping: Option<Codepoint>,
64 /// This codepoint's simple titlecase mapping, if it exists.
65 pub simple_titlecase_mapping: Option<Codepoint>,
66}
67
68impl UcdFile for UnicodeData {
69 fn relative_file_path() -> &'static Path {
70 Path::new("UnicodeData.txt")
71 }
72}
73
74impl UcdFileByCodepoint for UnicodeData {
75 fn codepoints(&self) -> CodepointIter {
76 self.codepoint.into_iter()
77 }
78}
79
80impl UnicodeData {
81 /// Returns true if and only if this record corresponds to the start of a
82 /// range.
83 pub fn is_range_start(&self) -> bool {
84 self.name.starts_with('<')
85 && self.name.ends_with('>')
86 && self.name.contains("First")
87 }
88
89 /// Returns true if and only if this record corresponds to the end of a
90 /// range.
91 pub fn is_range_end(&self) -> bool {
92 self.name.starts_with('<')
93 && self.name.ends_with('>')
94 && self.name.contains("Last")
95 }
96}
97
98impl FromStr for UnicodeData {
99 type Err = Error;
100
101 fn from_str(line: &str) -> Result<UnicodeData, Error> {
102 static PARTS: Lazy<Regex> = Lazy::new(|| {
103 Regex::new(
104 r"(?x)
105 ^
106 ([A-Z0-9]+); # 1; codepoint
107 ([^;]+); # 2; name
108 ([^;]+); # 3; general category
109 ([0-9]+); # 4; canonical combining class
110 ([^;]+); # 5; bidi class
111 ([^;]*); # 6; decomposition
112 ([0-9]*); # 7; numeric type decimal
113 ([0-9]*); # 8; numeric type digit
114 ([-0-9/]*); # 9; numeric type numeric
115 ([YN]); # 10; bidi mirrored
116 ([^;]*); # 11; unicode1 name
117 ([^;]*); # 12; ISO comment
118 ([^;]*); # 13; simple uppercase mapping
119 ([^;]*); # 14; simple lowercase mapping
120 ([^;]*) # 15; simple titlecase mapping
121 $
122 ",
123 )
124 .unwrap()
125 });
126 let caps = match PARTS.captures(line.trim()) {
127 Some(caps) => caps,
128 None => return err!("invalid UnicodeData line"),
129 };
130 let capget = |n| caps.get(n).unwrap().as_str();
131 let mut data = UnicodeData::default();
132
133 data.codepoint = capget(1).parse()?;
134 data.name = capget(2).to_string();
135 data.general_category = capget(3).to_string();
136 data.canonical_combining_class = match capget(4).parse() {
137 Ok(n) => n,
138 Err(err) => {
139 return err!(
140 "failed to parse canonical combining class '{}': {}",
141 capget(4),
142 err
143 )
144 }
145 };
146 data.bidi_class = capget(5).to_string();
147 if !caps[6].is_empty() {
148 data.decomposition = caps[6].parse()?;
149 } else {
150 data.decomposition.push(data.codepoint)?;
151 }
152 if !capget(7).is_empty() {
153 data.numeric_type_decimal = Some(match capget(7).parse() {
154 Ok(n) => n,
155 Err(err) => {
156 return err!(
157 "failed to parse numeric type decimal '{}': {}",
158 capget(7),
159 err
160 )
161 }
162 });
163 }
164 if !capget(8).is_empty() {
165 data.numeric_type_digit = Some(match capget(8).parse() {
166 Ok(n) => n,
167 Err(err) => {
168 return err!(
169 "failed to parse numeric type digit '{}': {}",
170 capget(8),
171 err
172 )
173 }
174 });
175 }
176 if !capget(9).is_empty() {
177 data.numeric_type_numeric = Some(capget(9).parse()?);
178 }
179 data.bidi_mirrored = capget(10) == "Y";
180 data.unicode1_name = capget(11).to_string();
181 data.iso_comment = capget(12).to_string();
182 if !capget(13).is_empty() {
183 data.simple_uppercase_mapping = Some(capget(13).parse()?);
184 }
185 if !capget(14).is_empty() {
186 data.simple_lowercase_mapping = Some(capget(14).parse()?);
187 }
188 if !capget(15).is_empty() {
189 data.simple_titlecase_mapping = Some(capget(15).parse()?);
190 }
191 Ok(data)
192 }
193}
194
195impl fmt::Display for UnicodeData {
196 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
197 write!(f, "{};", self.codepoint)?;
198 write!(f, "{};", self.name)?;
199 write!(f, "{};", self.general_category)?;
200 write!(f, "{};", self.canonical_combining_class)?;
201 write!(f, "{};", self.bidi_class)?;
202 if self.decomposition.is_canonical()
203 && self.decomposition.mapping() == &[self.codepoint]
204 {
205 write!(f, ";")?;
206 } else {
207 write!(f, "{};", self.decomposition)?;
208 }
209 if let Some(n) = self.numeric_type_decimal {
210 write!(f, "{};", n)?;
211 } else {
212 write!(f, ";")?;
213 }
214 if let Some(n) = self.numeric_type_digit {
215 write!(f, "{};", n)?;
216 } else {
217 write!(f, ";")?;
218 }
219 if let Some(n) = self.numeric_type_numeric {
220 write!(f, "{};", n)?;
221 } else {
222 write!(f, ";")?;
223 }
224 write!(f, "{};", if self.bidi_mirrored { "Y" } else { "N" })?;
225 write!(f, "{};", self.unicode1_name)?;
226 write!(f, "{};", self.iso_comment)?;
227 if let Some(cp) = self.simple_uppercase_mapping {
228 write!(f, "{};", cp)?;
229 } else {
230 write!(f, ";")?;
231 }
232 if let Some(cp) = self.simple_lowercase_mapping {
233 write!(f, "{};", cp)?;
234 } else {
235 write!(f, ";")?;
236 }
237 if let Some(cp) = self.simple_titlecase_mapping {
238 write!(f, "{}", cp)?;
239 }
240 Ok(())
241 }
242}
243
244/// Represents a decomposition mapping of a single row in the
245/// `UnicodeData.txt` file.
246#[derive(Clone, Debug, Default, Eq, PartialEq)]
247pub struct UnicodeDataDecomposition {
248 /// The formatting tag associated with this mapping, if present.
249 pub tag: Option<UnicodeDataDecompositionTag>,
250 /// The number of codepoints in this mapping.
251 pub len: usize,
252 /// The codepoints in the mapping. Entries beyond `len` in the mapping
253 /// are always U+0000. If no mapping was present, then this always contains
254 /// a single codepoint corresponding to this row's character.
255 pub mapping: [Codepoint; 18],
256}
257
258impl UnicodeDataDecomposition {
259 /// Create a new decomposition mapping with the given tag and codepoints.
260 ///
261 /// If there are too many codepoints, then an error is returned.
262 pub fn new(
263 tag: Option<UnicodeDataDecompositionTag>,
264 mapping: &[Codepoint],
265 ) -> Result<UnicodeDataDecomposition, Error> {
266 let mut x = UnicodeDataDecomposition::default();
267 x.tag = tag;
268 for &cp in mapping {
269 x.push(cp)?;
270 }
271 Ok(x)
272 }
273
274 /// Add a new codepoint to this decomposition's mapping.
275 ///
276 /// If the mapping is already full, then this returns an error.
277 pub fn push(&mut self, cp: Codepoint) -> Result<(), Error> {
278 if self.len >= self.mapping.len() {
279 return err!(
280 "invalid decomposition mapping (too many codepoints)"
281 );
282 }
283 self.mapping[self.len] = cp;
284 self.len += 1;
285 Ok(())
286 }
287
288 /// Return the mapping as a slice of codepoints. The slice returned
289 /// has length equivalent to the number of codepoints in this mapping.
290 pub fn mapping(&self) -> &[Codepoint] {
291 &self.mapping[..self.len]
292 }
293
294 /// Returns true if and only if this decomposition mapping is canonical.
295 pub fn is_canonical(&self) -> bool {
296 self.tag.is_none()
297 }
298}
299
300impl FromStr for UnicodeDataDecomposition {
301 type Err = Error;
302
303 fn from_str(s: &str) -> Result<UnicodeDataDecomposition, Error> {
304 static WITH_TAG: Lazy<Regex> = Lazy::new(|| {
305 Regex::new(r"^(?:<(?P<tag>[^>]+)>)?\s*(?P<chars>[\s0-9A-F]+)$")
306 .unwrap()
307 });
308 static CHARS: Lazy<Regex> =
309 Lazy::new(|| Regex::new(r"[0-9A-F]+").unwrap());
310 if s.is_empty() {
311 return err!(
312 "expected non-empty string for \
313 UnicodeDataDecomposition value"
314 );
315 }
316 let caps = match WITH_TAG.captures(s) {
317 Some(caps) => caps,
318 None => return err!("invalid decomposition value"),
319 };
320 let mut decomp = UnicodeDataDecomposition::default();
321 let mut codepoints = s;
322 if let Some(m) = caps.name("tag") {
323 decomp.tag = Some(m.as_str().parse()?);
324 codepoints = &caps["chars"];
325 }
326 for m in CHARS.find_iter(codepoints) {
327 let cp = m.as_str().parse()?;
328 decomp.push(cp)?;
329 }
330 Ok(decomp)
331 }
332}
333
334impl fmt::Display for UnicodeDataDecomposition {
335 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
336 if let Some(ref tag: &UnicodeDataDecompositionTag) = self.tag {
337 write!(f, "<{}> ", tag)?;
338 }
339 let mut first: bool = true;
340 for cp: &Codepoint in self.mapping() {
341 if !first {
342 write!(f, " ")?;
343 }
344 first = false;
345 write!(f, "{}", cp)?;
346 }
347 Ok(())
348 }
349}
350
/// The formatting tag that can prefix a decomposition mapping.
///
/// The variants are taken from
/// [UAX44, Table 14](https://www.unicode.org/reports/tr44/#Character_Decomposition_Mappings).
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum UnicodeDataDecompositionTag {
    /// The `<font>` tag.
    Font,
    /// The `<noBreak>` tag.
    NoBreak,
    /// The `<initial>` tag.
    Initial,
    /// The `<medial>` tag.
    Medial,
    /// The `<final>` tag.
    Final,
    /// The `<isolated>` tag.
    Isolated,
    /// The `<circle>` tag.
    Circle,
    /// The `<super>` tag.
    Super,
    /// The `<sub>` tag.
    Sub,
    /// The `<vertical>` tag.
    Vertical,
    /// The `<wide>` tag.
    Wide,
    /// The `<narrow>` tag.
    Narrow,
    /// The `<small>` tag.
    Small,
    /// The `<square>` tag.
    Square,
    /// The `<fraction>` tag.
    Fraction,
    /// The `<compat>` tag.
    Compat,
}
390
391impl FromStr for UnicodeDataDecompositionTag {
392 type Err = Error;
393
394 fn from_str(s: &str) -> Result<UnicodeDataDecompositionTag, Error> {
395 use self::UnicodeDataDecompositionTag::*;
396 Ok(match s {
397 "font" => Font,
398 "noBreak" => NoBreak,
399 "initial" => Initial,
400 "medial" => Medial,
401 "final" => Final,
402 "isolated" => Isolated,
403 "circle" => Circle,
404 "super" => Super,
405 "sub" => Sub,
406 "vertical" => Vertical,
407 "wide" => Wide,
408 "narrow" => Narrow,
409 "small" => Small,
410 "square" => Square,
411 "fraction" => Fraction,
412 "compat" => Compat,
413 _ => return err!("invalid decomposition formatting tag: {}", s),
414 })
415 }
416}
417
418impl fmt::Display for UnicodeDataDecompositionTag {
419 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
420 use self::UnicodeDataDecompositionTag::*;
421 let s: &str = match *self {
422 Font => "font",
423 NoBreak => "noBreak",
424 Initial => "initial",
425 Medial => "medial",
426 Final => "final",
427 Isolated => "isolated",
428 Circle => "circle",
429 Super => "super",
430 Sub => "sub",
431 Vertical => "vertical",
432 Wide => "wide",
433 Narrow => "narrow",
434 Small => "small",
435 Square => "square",
436 Fraction => "fraction",
437 Compat => "compat",
438 };
439 write!(f, "{}", s)
440 }
441}
442
/// The numeric value of a character with `Numeric_Type=Numeric`.
///
/// The value is either a signed integer or a rational number.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum UnicodeDataNumeric {
    /// An integer value.
    Integer(i64),
    /// A rational value, stored as numerator followed by denominator.
    Rational(i64, i64),
}
454
455impl FromStr for UnicodeDataNumeric {
456 type Err = Error;
457
458 fn from_str(s: &str) -> Result<UnicodeDataNumeric, Error> {
459 if s.is_empty() {
460 return err!(
461 "expected non-empty string for UnicodeDataNumeric value"
462 );
463 }
464 if let Some(pos) = s.find('/') {
465 let (snum, sden) = (&s[..pos], &s[pos + 1..]);
466 let num = match snum.parse() {
467 Ok(num) => num,
468 Err(err) => {
469 return err!(
470 "invalid integer numerator '{}': {}",
471 snum,
472 err
473 );
474 }
475 };
476 let den = match sden.parse() {
477 Ok(den) => den,
478 Err(err) => {
479 return err!(
480 "invalid integer denominator '{}': {}",
481 sden,
482 err
483 );
484 }
485 };
486 Ok(UnicodeDataNumeric::Rational(num, den))
487 } else {
488 match s.parse() {
489 Ok(den) => Ok(UnicodeDataNumeric::Integer(den)),
490 Err(err) => {
491 return err!(
492 "invalid integer denominator '{}': {}",
493 s,
494 err
495 );
496 }
497 }
498 }
499 }
500}
501
502impl fmt::Display for UnicodeDataNumeric {
503 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
504 match *self {
505 UnicodeDataNumeric::Integer(n: i64) => write!(f, "{}", n),
506 UnicodeDataNumeric::Rational(n: i64, d: i64) => write!(f, "{}/{}", n, d),
507 }
508 }
509}
510
511/// An iterator adapter that expands rows in `UnicodeData.txt`.
512///
513/// Throughout `UnicodeData.txt`, some assigned codepoints are not explicitly
514/// represented. Instead, they are represented by a pair of rows, indicating
515/// a range of codepoints with the same properties. For example, the Hangul
516/// syllable codepoints are represented by these two rows:
517///
518/// ```ignore
519/// AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
520/// D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
521/// ```
522///
523/// This iterator will wrap any iterator of `UnicodeData` and, when a range of
524/// Unicode codepoints is found, it will be expanded to the appropriate
525/// sequence of `UnicodeData` values. Note that all such expanded records will
526/// have an empty name.
527pub struct UnicodeDataExpander<I: Iterator> {
528 /// The underlying iterator.
529 it: iter::Peekable<I>,
530 /// A range of codepoints to emit when we've found a pair. Otherwise,
531 /// `None`.
532 range: CodepointRange,
533}
534
535struct CodepointRange {
536 /// The codepoint range.
537 range: Range<u32>,
538 /// The start record. All subsequent records in this range are generated
539 /// by cloning this and updating the codepoint/name.
540 start_record: UnicodeData,
541}
542
543impl<I: Iterator<Item = UnicodeData>> UnicodeDataExpander<I> {
544 /// Create a new iterator that expands pairs of `UnicodeData` range
545 /// records. All other records are passed through as-is.
546 pub fn new<T>(it: T) -> UnicodeDataExpander<I>
547 where
548 T: IntoIterator<IntoIter = I, Item = I::Item>,
549 {
550 UnicodeDataExpander {
551 it: it.into_iter().peekable(),
552 range: CodepointRange {
553 range: 0..0,
554 start_record: UnicodeData::default(),
555 },
556 }
557 }
558}
559
560impl<I: Iterator<Item = UnicodeData>> Iterator for UnicodeDataExpander<I> {
561 type Item = UnicodeData;
562
563 fn next(&mut self) -> Option<UnicodeData> {
564 if let Some(udata: UnicodeData) = self.range.next() {
565 return Some(udata);
566 }
567 let row1: UnicodeData = match self.it.next() {
568 None => return None,
569 Some(row1: UnicodeData) => row1,
570 };
571 if !row1.is_range_start()
572 || !self.it.peek().map_or(default:false, |row2: &UnicodeData| row2.is_range_end())
573 {
574 return Some(row1);
575 }
576 let row2: UnicodeData = self.it.next().unwrap();
577 self.range = CodepointRange {
578 range: row1.codepoint.value()..(row2.codepoint.value() + 1),
579 start_record: row1,
580 };
581 self.next()
582 }
583}
584
585impl Iterator for CodepointRange {
586 type Item = UnicodeData;
587
588 fn next(&mut self) -> Option<UnicodeData> {
589 let cp: u32 = match self.range.next() {
590 None => return None,
591 Some(cp: u32) => cp,
592 };
593 Some(UnicodeData {
594 codepoint: Codepoint::from_u32(cp).unwrap(),
595 name: "".to_string(),
596 ..self.start_record.clone()
597 })
598 }
599}
600
#[cfg(test)]
mod tests {
    use crate::common::Codepoint;

    use super::{
        UnicodeData, UnicodeDataDecomposition, UnicodeDataDecompositionTag,
        UnicodeDataNumeric,
    };

    /// Builds a codepoint from a value the test knows to be valid.
    fn cp(value: u32) -> Codepoint {
        Codepoint::from_u32(value).unwrap()
    }

    /// Builds the decomposition a parsed row is expected to carry.
    fn decomposition(
        tag: Option<UnicodeDataDecompositionTag>,
        values: &[u32],
    ) -> UnicodeDataDecomposition {
        let codepoints: Vec<Codepoint> =
            values.iter().map(|&value| cp(value)).collect();
        UnicodeDataDecomposition::new(tag, &codepoints).unwrap()
    }

    /// Parses one line and panics if it is rejected.
    fn parse(line: &str) -> UnicodeData {
        line.parse().unwrap()
    }

    // Fields left out of the expected values below are covered by
    // `UnicodeData::default()`: zero, `false`, `None` or the empty string.

    #[test]
    fn parse1() {
        let expected = UnicodeData {
            codepoint: cp(0x249d),
            name: String::from("PARENTHESIZED LATIN SMALL LETTER B"),
            general_category: String::from("So"),
            bidi_class: String::from("L"),
            decomposition: decomposition(
                Some(UnicodeDataDecompositionTag::Compat),
                &[0x28, 0x62, 0x29],
            ),
            ..UnicodeData::default()
        };
        let data = parse("249D;PARENTHESIZED LATIN SMALL LETTER B;So;0;L;<compat> 0028 0062 0029;;;;N;;;;;\n");
        assert_eq!(data, expected);
    }

    #[test]
    fn parse2() {
        let expected = UnicodeData {
            codepoint: cp(0x000D),
            name: String::from("<control>"),
            general_category: String::from("Cc"),
            bidi_class: String::from("B"),
            decomposition: decomposition(None, &[0x000D]),
            unicode1_name: String::from("CARRIAGE RETURN (CR)"),
            ..UnicodeData::default()
        };
        let data =
            parse("000D;<control>;Cc;0;B;;;;;N;CARRIAGE RETURN (CR);;;;\n");
        assert_eq!(data, expected);
    }

    #[test]
    fn parse3() {
        let expected = UnicodeData {
            codepoint: cp(0x00BC),
            name: String::from("VULGAR FRACTION ONE QUARTER"),
            general_category: String::from("No"),
            bidi_class: String::from("ON"),
            decomposition: decomposition(
                Some(UnicodeDataDecompositionTag::Fraction),
                &[0x31, 0x2044, 0x34],
            ),
            numeric_type_numeric: Some(UnicodeDataNumeric::Rational(1, 4)),
            unicode1_name: String::from("FRACTION ONE QUARTER"),
            ..UnicodeData::default()
        };
        let data = parse("00BC;VULGAR FRACTION ONE QUARTER;No;0;ON;<fraction> 0031 2044 0034;;;1/4;N;FRACTION ONE QUARTER;;;;\n");
        assert_eq!(data, expected);
    }

    #[test]
    fn parse4() {
        let expected = UnicodeData {
            codepoint: cp(0x0041),
            name: String::from("LATIN CAPITAL LETTER A"),
            general_category: String::from("Lu"),
            bidi_class: String::from("L"),
            decomposition: decomposition(None, &[0x0041]),
            simple_lowercase_mapping: Some(cp(0x0061)),
            ..UnicodeData::default()
        };
        let data = parse("0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;\n");
        assert_eq!(data, expected);
    }

    #[test]
    fn parse5() {
        let expected = UnicodeData {
            codepoint: cp(0x0F33),
            name: String::from("TIBETAN DIGIT HALF ZERO"),
            general_category: String::from("No"),
            bidi_class: String::from("L"),
            decomposition: decomposition(None, &[0x0F33]),
            numeric_type_numeric: Some(UnicodeDataNumeric::Rational(-1, 2)),
            ..UnicodeData::default()
        };
        let data = parse("0F33;TIBETAN DIGIT HALF ZERO;No;0;L;;;;-1/2;N;;;;;\n");
        assert_eq!(data, expected);
    }

    #[test]
    fn expander() {
        use super::UnicodeDataExpander;
        use crate::common::UcdLineParser;

        // Two ordinary rows surrounding the Hangul syllable range pair.
        let data = "\
            ABF9;MEETEI MAYEK DIGIT NINE;Nd;0;L;;9;9;9;N;;;;;\n\
            AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;\n\
            D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;\n\
            D7B0;HANGUL JUNGSEONG O-YEO;Lo;0;L;;;;;N;;;;;\n";
        let records = UcdLineParser::new(None, data.as_bytes())
            .collect::<Result<Vec<_>, _>>()
            .unwrap();
        let expanded = UnicodeDataExpander::new(records).count();
        assert_eq!(expanded, 11174);
    }
}
787