1 | use std::fmt; |
2 | use std::iter; |
3 | use std::ops::Range; |
4 | use std::path::Path; |
5 | use std::str::FromStr; |
6 | |
7 | use once_cell::sync::Lazy; |
8 | use regex::Regex; |
9 | |
10 | use crate::common::{Codepoint, CodepointIter, UcdFile, UcdFileByCodepoint}; |
11 | use crate::error::Error; |
12 | |
/// Represents a single row in the `UnicodeData.txt` file.
///
/// A row consists of 15 `;`-separated fields, which map one-to-one onto the
/// fields of this struct.
///
/// These fields were taken from UAX44, Table 9, as part of the documentation
/// for the
/// [`UnicodeData.txt` file](https://www.unicode.org/reports/tr44/#UnicodeData.txt).
#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct UnicodeData {
    /// The codepoint corresponding to this row.
    pub codepoint: Codepoint,
    /// The name of this codepoint.
    ///
    /// Rows that delimit a range of codepoints use a bracketed name such as
    /// `<Hangul Syllable, First>` or `<Hangul Syllable, Last>`.
    pub name: String,
    /// The "general category" of this codepoint.
    pub general_category: String,
    /// The class of this codepoint used in the Canonical Ordering Algorithm.
    ///
    /// Note that some classes map to a particular symbol. See
    /// [UAX44, Table 15](https://www.unicode.org/reports/tr44/#Canonical_Combining_Class_Values).
    pub canonical_combining_class: u8,
    /// The bidirectional class of this codepoint.
    ///
    /// Possible values are listed in
    /// [UAX44, Table 13](https://www.unicode.org/reports/tr44/#Bidi_Class_Values).
    pub bidi_class: String,
    /// The decomposition mapping for this codepoint. This includes its
    /// formatting tag (if present).
    ///
    /// When the row's decomposition field is empty, parsing stores a mapping
    /// that consists of this row's own codepoint.
    pub decomposition: UnicodeDataDecomposition,
    /// A decimal numeric representation of this codepoint, if it has the
    /// property `Numeric_Type=Decimal`.
    pub numeric_type_decimal: Option<u8>,
    /// A decimal numeric representation of this codepoint, if it has the
    /// property `Numeric_Type=Digit`. Note that while this field is still
    /// populated for existing codepoints, no new codepoints will have this
    /// field populated.
    pub numeric_type_digit: Option<u8>,
    /// A decimal or rational numeric representation of this codepoint, if it
    /// has the property `Numeric_Type=Numeric`.
    pub numeric_type_numeric: Option<UnicodeDataNumeric>,
    /// A boolean indicating whether this codepoint is "mirrored" in
    /// bidirectional text. This is `true` when the field is `Y`.
    pub bidi_mirrored: bool,
    /// The "old" Unicode 1.0 or ISO 6429 name of this codepoint. Note that
    /// this field is empty unless it is significantly different from
    /// the `name` field.
    pub unicode1_name: String,
    /// The ISO 10646 comment field. This no longer contains any non-NULL
    /// values.
    pub iso_comment: String,
    /// This codepoint's simple uppercase mapping, if it exists.
    pub simple_uppercase_mapping: Option<Codepoint>,
    /// This codepoint's simple lowercase mapping, if it exists.
    pub simple_lowercase_mapping: Option<Codepoint>,
    /// This codepoint's simple titlecase mapping, if it exists.
    pub simple_titlecase_mapping: Option<Codepoint>,
}
67 | |
68 | impl UcdFile for UnicodeData { |
69 | fn relative_file_path() -> &'static Path { |
70 | Path::new("UnicodeData.txt" ) |
71 | } |
72 | } |
73 | |
74 | impl UcdFileByCodepoint for UnicodeData { |
75 | fn codepoints(&self) -> CodepointIter { |
76 | self.codepoint.into_iter() |
77 | } |
78 | } |
79 | |
80 | impl UnicodeData { |
81 | /// Returns true if and only if this record corresponds to the start of a |
82 | /// range. |
83 | pub fn is_range_start(&self) -> bool { |
84 | self.name.starts_with('<' ) |
85 | && self.name.ends_with('>' ) |
86 | && self.name.contains("First" ) |
87 | } |
88 | |
89 | /// Returns true if and only if this record corresponds to the end of a |
90 | /// range. |
91 | pub fn is_range_end(&self) -> bool { |
92 | self.name.starts_with('<' ) |
93 | && self.name.ends_with('>' ) |
94 | && self.name.contains("Last" ) |
95 | } |
96 | } |
97 | |
98 | impl FromStr for UnicodeData { |
99 | type Err = Error; |
100 | |
101 | fn from_str(line: &str) -> Result<UnicodeData, Error> { |
102 | static PARTS: Lazy<Regex> = Lazy::new(|| { |
103 | Regex::new( |
104 | r"(?x) |
105 | ^ |
106 | ([A-Z0-9]+); # 1; codepoint |
107 | ([^;]+); # 2; name |
108 | ([^;]+); # 3; general category |
109 | ([0-9]+); # 4; canonical combining class |
110 | ([^;]+); # 5; bidi class |
111 | ([^;]*); # 6; decomposition |
112 | ([0-9]*); # 7; numeric type decimal |
113 | ([0-9]*); # 8; numeric type digit |
114 | ([-0-9/]*); # 9; numeric type numeric |
115 | ([YN]); # 10; bidi mirrored |
116 | ([^;]*); # 11; unicode1 name |
117 | ([^;]*); # 12; ISO comment |
118 | ([^;]*); # 13; simple uppercase mapping |
119 | ([^;]*); # 14; simple lowercase mapping |
120 | ([^;]*) # 15; simple titlecase mapping |
121 | $ |
122 | " , |
123 | ) |
124 | .unwrap() |
125 | }); |
126 | let caps = match PARTS.captures(line.trim()) { |
127 | Some(caps) => caps, |
128 | None => return err!("invalid UnicodeData line" ), |
129 | }; |
130 | let capget = |n| caps.get(n).unwrap().as_str(); |
131 | let mut data = UnicodeData::default(); |
132 | |
133 | data.codepoint = capget(1).parse()?; |
134 | data.name = capget(2).to_string(); |
135 | data.general_category = capget(3).to_string(); |
136 | data.canonical_combining_class = match capget(4).parse() { |
137 | Ok(n) => n, |
138 | Err(err) => { |
139 | return err!( |
140 | "failed to parse canonical combining class ' {}': {}" , |
141 | capget(4), |
142 | err |
143 | ) |
144 | } |
145 | }; |
146 | data.bidi_class = capget(5).to_string(); |
147 | if !caps[6].is_empty() { |
148 | data.decomposition = caps[6].parse()?; |
149 | } else { |
150 | data.decomposition.push(data.codepoint)?; |
151 | } |
152 | if !capget(7).is_empty() { |
153 | data.numeric_type_decimal = Some(match capget(7).parse() { |
154 | Ok(n) => n, |
155 | Err(err) => { |
156 | return err!( |
157 | "failed to parse numeric type decimal ' {}': {}" , |
158 | capget(7), |
159 | err |
160 | ) |
161 | } |
162 | }); |
163 | } |
164 | if !capget(8).is_empty() { |
165 | data.numeric_type_digit = Some(match capget(8).parse() { |
166 | Ok(n) => n, |
167 | Err(err) => { |
168 | return err!( |
169 | "failed to parse numeric type digit ' {}': {}" , |
170 | capget(8), |
171 | err |
172 | ) |
173 | } |
174 | }); |
175 | } |
176 | if !capget(9).is_empty() { |
177 | data.numeric_type_numeric = Some(capget(9).parse()?); |
178 | } |
179 | data.bidi_mirrored = capget(10) == "Y" ; |
180 | data.unicode1_name = capget(11).to_string(); |
181 | data.iso_comment = capget(12).to_string(); |
182 | if !capget(13).is_empty() { |
183 | data.simple_uppercase_mapping = Some(capget(13).parse()?); |
184 | } |
185 | if !capget(14).is_empty() { |
186 | data.simple_lowercase_mapping = Some(capget(14).parse()?); |
187 | } |
188 | if !capget(15).is_empty() { |
189 | data.simple_titlecase_mapping = Some(capget(15).parse()?); |
190 | } |
191 | Ok(data) |
192 | } |
193 | } |
194 | |
195 | impl fmt::Display for UnicodeData { |
196 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
197 | write!(f, " {};" , self.codepoint)?; |
198 | write!(f, " {};" , self.name)?; |
199 | write!(f, " {};" , self.general_category)?; |
200 | write!(f, " {};" , self.canonical_combining_class)?; |
201 | write!(f, " {};" , self.bidi_class)?; |
202 | if self.decomposition.is_canonical() |
203 | && self.decomposition.mapping() == &[self.codepoint] |
204 | { |
205 | write!(f, ";" )?; |
206 | } else { |
207 | write!(f, " {};" , self.decomposition)?; |
208 | } |
209 | if let Some(n) = self.numeric_type_decimal { |
210 | write!(f, " {};" , n)?; |
211 | } else { |
212 | write!(f, ";" )?; |
213 | } |
214 | if let Some(n) = self.numeric_type_digit { |
215 | write!(f, " {};" , n)?; |
216 | } else { |
217 | write!(f, ";" )?; |
218 | } |
219 | if let Some(n) = self.numeric_type_numeric { |
220 | write!(f, " {};" , n)?; |
221 | } else { |
222 | write!(f, ";" )?; |
223 | } |
224 | write!(f, " {};" , if self.bidi_mirrored { "Y" } else { "N" })?; |
225 | write!(f, " {};" , self.unicode1_name)?; |
226 | write!(f, " {};" , self.iso_comment)?; |
227 | if let Some(cp) = self.simple_uppercase_mapping { |
228 | write!(f, " {};" , cp)?; |
229 | } else { |
230 | write!(f, ";" )?; |
231 | } |
232 | if let Some(cp) = self.simple_lowercase_mapping { |
233 | write!(f, " {};" , cp)?; |
234 | } else { |
235 | write!(f, ";" )?; |
236 | } |
237 | if let Some(cp) = self.simple_titlecase_mapping { |
238 | write!(f, " {}" , cp)?; |
239 | } |
240 | Ok(()) |
241 | } |
242 | } |
243 | |
/// Represents a decomposition mapping of a single row in the
/// `UnicodeData.txt` file.
///
/// The codepoints are stored inline in a fixed-size array, so a mapping can
/// hold at most 18 codepoints. Use `mapping()` to get only the populated
/// prefix of that array.
#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct UnicodeDataDecomposition {
    /// The formatting tag associated with this mapping, if present. A
    /// mapping without a tag is a canonical mapping.
    pub tag: Option<UnicodeDataDecompositionTag>,
    /// The number of codepoints in this mapping.
    pub len: usize,
    /// The codepoints in the mapping. Entries beyond `len` in the mapping
    /// are always U+0000. If no mapping was present, then this always contains
    /// a single codepoint corresponding to this row's character.
    pub mapping: [Codepoint; 18],
}
257 | |
258 | impl UnicodeDataDecomposition { |
259 | /// Create a new decomposition mapping with the given tag and codepoints. |
260 | /// |
261 | /// If there are too many codepoints, then an error is returned. |
262 | pub fn new( |
263 | tag: Option<UnicodeDataDecompositionTag>, |
264 | mapping: &[Codepoint], |
265 | ) -> Result<UnicodeDataDecomposition, Error> { |
266 | let mut x = UnicodeDataDecomposition::default(); |
267 | x.tag = tag; |
268 | for &cp in mapping { |
269 | x.push(cp)?; |
270 | } |
271 | Ok(x) |
272 | } |
273 | |
274 | /// Add a new codepoint to this decomposition's mapping. |
275 | /// |
276 | /// If the mapping is already full, then this returns an error. |
277 | pub fn push(&mut self, cp: Codepoint) -> Result<(), Error> { |
278 | if self.len >= self.mapping.len() { |
279 | return err!( |
280 | "invalid decomposition mapping (too many codepoints)" |
281 | ); |
282 | } |
283 | self.mapping[self.len] = cp; |
284 | self.len += 1; |
285 | Ok(()) |
286 | } |
287 | |
288 | /// Return the mapping as a slice of codepoints. The slice returned |
289 | /// has length equivalent to the number of codepoints in this mapping. |
290 | pub fn mapping(&self) -> &[Codepoint] { |
291 | &self.mapping[..self.len] |
292 | } |
293 | |
294 | /// Returns true if and only if this decomposition mapping is canonical. |
295 | pub fn is_canonical(&self) -> bool { |
296 | self.tag.is_none() |
297 | } |
298 | } |
299 | |
300 | impl FromStr for UnicodeDataDecomposition { |
301 | type Err = Error; |
302 | |
303 | fn from_str(s: &str) -> Result<UnicodeDataDecomposition, Error> { |
304 | static WITH_TAG: Lazy<Regex> = Lazy::new(|| { |
305 | Regex::new(r"^(?:<(?P<tag>[^>]+)>)?\s*(?P<chars>[\s0-9A-F]+)$" ) |
306 | .unwrap() |
307 | }); |
308 | static CHARS: Lazy<Regex> = |
309 | Lazy::new(|| Regex::new(r"[0-9A-F]+" ).unwrap()); |
310 | if s.is_empty() { |
311 | return err!( |
312 | "expected non-empty string for \ |
313 | UnicodeDataDecomposition value" |
314 | ); |
315 | } |
316 | let caps = match WITH_TAG.captures(s) { |
317 | Some(caps) => caps, |
318 | None => return err!("invalid decomposition value" ), |
319 | }; |
320 | let mut decomp = UnicodeDataDecomposition::default(); |
321 | let mut codepoints = s; |
322 | if let Some(m) = caps.name("tag" ) { |
323 | decomp.tag = Some(m.as_str().parse()?); |
324 | codepoints = &caps["chars" ]; |
325 | } |
326 | for m in CHARS.find_iter(codepoints) { |
327 | let cp = m.as_str().parse()?; |
328 | decomp.push(cp)?; |
329 | } |
330 | Ok(decomp) |
331 | } |
332 | } |
333 | |
334 | impl fmt::Display for UnicodeDataDecomposition { |
335 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
336 | if let Some(ref tag: &UnicodeDataDecompositionTag) = self.tag { |
337 | write!(f, "< {}> " , tag)?; |
338 | } |
339 | let mut first: bool = true; |
340 | for cp: &Codepoint in self.mapping() { |
341 | if !first { |
342 | write!(f, " " )?; |
343 | } |
344 | first = false; |
345 | write!(f, " {}" , cp)?; |
346 | } |
347 | Ok(()) |
348 | } |
349 | } |
350 | |
/// The formatting tag on a decomposition mapping.
///
/// Each variant corresponds to the tag name shown in its documentation, which
/// is the text that appears between `<` and `>` in a decomposition field.
///
/// This is taken from
/// [UAX44, Table 14](https://www.unicode.org/reports/tr44/#Character_Decomposition_Mappings).
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum UnicodeDataDecompositionTag {
    /// `<font>`
    Font,
    /// `<noBreak>`
    NoBreak,
    /// `<initial>`
    Initial,
    /// `<medial>`
    Medial,
    /// `<final>`
    Final,
    /// `<isolated>`
    Isolated,
    /// `<circle>`
    Circle,
    /// `<super>`
    Super,
    /// `<sub>`
    Sub,
    /// `<vertical>`
    Vertical,
    /// `<wide>`
    Wide,
    /// `<narrow>`
    Narrow,
    /// `<small>`
    Small,
    /// `<square>`
    Square,
    /// `<fraction>`
    Fraction,
    /// `<compat>`
    Compat,
}
390 | |
391 | impl FromStr for UnicodeDataDecompositionTag { |
392 | type Err = Error; |
393 | |
394 | fn from_str(s: &str) -> Result<UnicodeDataDecompositionTag, Error> { |
395 | use self::UnicodeDataDecompositionTag::*; |
396 | Ok(match s { |
397 | "font" => Font, |
398 | "noBreak" => NoBreak, |
399 | "initial" => Initial, |
400 | "medial" => Medial, |
401 | "final" => Final, |
402 | "isolated" => Isolated, |
403 | "circle" => Circle, |
404 | "super" => Super, |
405 | "sub" => Sub, |
406 | "vertical" => Vertical, |
407 | "wide" => Wide, |
408 | "narrow" => Narrow, |
409 | "small" => Small, |
410 | "square" => Square, |
411 | "fraction" => Fraction, |
412 | "compat" => Compat, |
413 | _ => return err!("invalid decomposition formatting tag: {}" , s), |
414 | }) |
415 | } |
416 | } |
417 | |
418 | impl fmt::Display for UnicodeDataDecompositionTag { |
419 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
420 | use self::UnicodeDataDecompositionTag::*; |
421 | let s: &str = match *self { |
422 | Font => "font" , |
423 | NoBreak => "noBreak" , |
424 | Initial => "initial" , |
425 | Medial => "medial" , |
426 | Final => "final" , |
427 | Isolated => "isolated" , |
428 | Circle => "circle" , |
429 | Super => "super" , |
430 | Sub => "sub" , |
431 | Vertical => "vertical" , |
432 | Wide => "wide" , |
433 | Narrow => "narrow" , |
434 | Small => "small" , |
435 | Square => "square" , |
436 | Fraction => "fraction" , |
437 | Compat => "compat" , |
438 | }; |
439 | write!(f, " {}" , s) |
440 | } |
441 | } |
442 | |
/// A numeric value corresponding to characters with `Numeric_Type=Numeric`.
///
/// A numeric value can either be a signed integer or a rational number. In
/// `UnicodeData.txt`, a rational is written as `numerator/denominator`, for
/// example `1/4` or `-1/2`.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum UnicodeDataNumeric {
    /// An integer.
    Integer(i64),
    /// A rational number. The first value is the numerator and the second is
    /// the denominator.
    Rational(i64, i64),
}
454 | |
455 | impl FromStr for UnicodeDataNumeric { |
456 | type Err = Error; |
457 | |
458 | fn from_str(s: &str) -> Result<UnicodeDataNumeric, Error> { |
459 | if s.is_empty() { |
460 | return err!( |
461 | "expected non-empty string for UnicodeDataNumeric value" |
462 | ); |
463 | } |
464 | if let Some(pos) = s.find('/' ) { |
465 | let (snum, sden) = (&s[..pos], &s[pos + 1..]); |
466 | let num = match snum.parse() { |
467 | Ok(num) => num, |
468 | Err(err) => { |
469 | return err!( |
470 | "invalid integer numerator ' {}': {}" , |
471 | snum, |
472 | err |
473 | ); |
474 | } |
475 | }; |
476 | let den = match sden.parse() { |
477 | Ok(den) => den, |
478 | Err(err) => { |
479 | return err!( |
480 | "invalid integer denominator ' {}': {}" , |
481 | sden, |
482 | err |
483 | ); |
484 | } |
485 | }; |
486 | Ok(UnicodeDataNumeric::Rational(num, den)) |
487 | } else { |
488 | match s.parse() { |
489 | Ok(den) => Ok(UnicodeDataNumeric::Integer(den)), |
490 | Err(err) => { |
491 | return err!( |
492 | "invalid integer denominator ' {}': {}" , |
493 | s, |
494 | err |
495 | ); |
496 | } |
497 | } |
498 | } |
499 | } |
500 | } |
501 | |
502 | impl fmt::Display for UnicodeDataNumeric { |
503 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
504 | match *self { |
505 | UnicodeDataNumeric::Integer(n: i64) => write!(f, " {}" , n), |
506 | UnicodeDataNumeric::Rational(n: i64, d: i64) => write!(f, " {}/ {}" , n, d), |
507 | } |
508 | } |
509 | } |
510 | |
/// An iterator adapter that expands rows in `UnicodeData.txt`.
///
/// Throughout `UnicodeData.txt`, some assigned codepoints are not explicitly
/// represented. Instead, they are represented by a pair of rows, indicating
/// a range of codepoints with the same properties. For example, the Hangul
/// syllable codepoints are represented by these two rows:
///
/// ```ignore
/// AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
/// D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
/// ```
///
/// This iterator will wrap any iterator of `UnicodeData` and, when a range of
/// Unicode codepoints is found, it will be expanded to the appropriate
/// sequence of `UnicodeData` values. Note that all such expanded records will
/// have an empty name.
pub struct UnicodeDataExpander<I: Iterator> {
    /// The underlying iterator. It is peekable so that the row following a
    /// range start can be inspected before it is consumed.
    it: iter::Peekable<I>,
    /// The range of codepoints that is currently being emitted after a pair
    /// of range rows was found. When no expansion is in progress, this range
    /// is empty and yields nothing.
    range: CodepointRange,
}
534 | |
/// The state of a range expansion: the codepoints that are still to be
/// emitted, together with the record that serves as a template for them.
struct CodepointRange {
    /// The codepoint range. The end of the range is exclusive.
    range: Range<u32>,
    /// The start record. All subsequent records in this range are generated
    /// by cloning this and updating the codepoint/name.
    start_record: UnicodeData,
}
542 | |
543 | impl<I: Iterator<Item = UnicodeData>> UnicodeDataExpander<I> { |
544 | /// Create a new iterator that expands pairs of `UnicodeData` range |
545 | /// records. All other records are passed through as-is. |
546 | pub fn new<T>(it: T) -> UnicodeDataExpander<I> |
547 | where |
548 | T: IntoIterator<IntoIter = I, Item = I::Item>, |
549 | { |
550 | UnicodeDataExpander { |
551 | it: it.into_iter().peekable(), |
552 | range: CodepointRange { |
553 | range: 0..0, |
554 | start_record: UnicodeData::default(), |
555 | }, |
556 | } |
557 | } |
558 | } |
559 | |
560 | impl<I: Iterator<Item = UnicodeData>> Iterator for UnicodeDataExpander<I> { |
561 | type Item = UnicodeData; |
562 | |
563 | fn next(&mut self) -> Option<UnicodeData> { |
564 | if let Some(udata: UnicodeData) = self.range.next() { |
565 | return Some(udata); |
566 | } |
567 | let row1: UnicodeData = match self.it.next() { |
568 | None => return None, |
569 | Some(row1: UnicodeData) => row1, |
570 | }; |
571 | if !row1.is_range_start() |
572 | || !self.it.peek().map_or(default:false, |row2: &UnicodeData| row2.is_range_end()) |
573 | { |
574 | return Some(row1); |
575 | } |
576 | let row2: UnicodeData = self.it.next().unwrap(); |
577 | self.range = CodepointRange { |
578 | range: row1.codepoint.value()..(row2.codepoint.value() + 1), |
579 | start_record: row1, |
580 | }; |
581 | self.next() |
582 | } |
583 | } |
584 | |
585 | impl Iterator for CodepointRange { |
586 | type Item = UnicodeData; |
587 | |
588 | fn next(&mut self) -> Option<UnicodeData> { |
589 | let cp: u32 = match self.range.next() { |
590 | None => return None, |
591 | Some(cp: u32) => cp, |
592 | }; |
593 | Some(UnicodeData { |
594 | codepoint: Codepoint::from_u32(cp).unwrap(), |
595 | name: "" .to_string(), |
596 | ..self.start_record.clone() |
597 | }) |
598 | } |
599 | } |
600 | |
// Unit tests for parsing `UnicodeData.txt` rows and for expanding ranges.
#[cfg(test)]
mod tests {
    use crate::common::Codepoint;

    use super::{
        UnicodeData, UnicodeDataDecomposition, UnicodeDataDecompositionTag,
        UnicodeDataNumeric,
    };

    /// Builds a `Codepoint` from a raw value, panicking if it is rejected.
    fn codepoint(n: u32) -> Codepoint {
        Codepoint::from_u32(n).unwrap()
    }

    /// Shorthand for creating an owned `String` from a literal.
    fn s(string: &str) -> String {
        string.to_string()
    }

    /// A row with a tagged (`<compat>`) decomposition of three codepoints.
    #[test]
    fn parse1() {
        let line = "249D;PARENTHESIZED LATIN SMALL LETTER B;So;0;L;<compat> 0028 0062 0029;;;;N;;;;; \n";
        let data: UnicodeData = line.parse().unwrap();
        assert_eq!(
            data,
            UnicodeData {
                codepoint: codepoint(0x249d),
                name: s("PARENTHESIZED LATIN SMALL LETTER B"),
                general_category: s("So"),
                canonical_combining_class: 0,
                bidi_class: s("L"),
                decomposition: UnicodeDataDecomposition::new(
                    Some(UnicodeDataDecompositionTag::Compat),
                    &[codepoint(0x28), codepoint(0x62), codepoint(0x29)],
                )
                .unwrap(),
                numeric_type_decimal: None,
                numeric_type_digit: None,
                numeric_type_numeric: None,
                bidi_mirrored: false,
                unicode1_name: s(""),
                iso_comment: s(""),
                simple_uppercase_mapping: None,
                simple_lowercase_mapping: None,
                simple_titlecase_mapping: None,
            }
        );
    }

    /// A row with a bracketed name, a Unicode 1.0 name, and an empty
    /// decomposition field, which parses to a mapping onto the row's own
    /// codepoint.
    #[test]
    fn parse2() {
        let line = "000D;<control>;Cc;0;B;;;;;N;CARRIAGE RETURN (CR);;;; \n";
        let data: UnicodeData = line.parse().unwrap();
        assert_eq!(
            data,
            UnicodeData {
                codepoint: codepoint(0x000D),
                name: s("<control>"),
                general_category: s("Cc"),
                canonical_combining_class: 0,
                bidi_class: s("B"),
                decomposition: UnicodeDataDecomposition::new(
                    None,
                    &[codepoint(0x000D)]
                )
                .unwrap(),
                numeric_type_decimal: None,
                numeric_type_digit: None,
                numeric_type_numeric: None,
                bidi_mirrored: false,
                unicode1_name: s("CARRIAGE RETURN (CR)"),
                iso_comment: s(""),
                simple_uppercase_mapping: None,
                simple_lowercase_mapping: None,
                simple_titlecase_mapping: None,
            }
        );
    }

    /// A row with a `<fraction>` decomposition and a rational numeric value.
    #[test]
    fn parse3() {
        let line = "00BC;VULGAR FRACTION ONE QUARTER;No;0;ON;<fraction> 0031 2044 0034;;;1/4;N;FRACTION ONE QUARTER;;;; \n";
        let data: UnicodeData = line.parse().unwrap();
        assert_eq!(
            data,
            UnicodeData {
                codepoint: codepoint(0x00BC),
                name: s("VULGAR FRACTION ONE QUARTER"),
                general_category: s("No"),
                canonical_combining_class: 0,
                bidi_class: s("ON"),
                decomposition: UnicodeDataDecomposition::new(
                    Some(UnicodeDataDecompositionTag::Fraction),
                    &[codepoint(0x31), codepoint(0x2044), codepoint(0x34)],
                )
                .unwrap(),
                numeric_type_decimal: None,
                numeric_type_digit: None,
                numeric_type_numeric: Some(UnicodeDataNumeric::Rational(1, 4)),
                bidi_mirrored: false,
                unicode1_name: s("FRACTION ONE QUARTER"),
                iso_comment: s(""),
                simple_uppercase_mapping: None,
                simple_lowercase_mapping: None,
                simple_titlecase_mapping: None,
            }
        );
    }

    /// A row with a simple lowercase mapping and no other case mappings.
    #[test]
    fn parse4() {
        let line = "0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061; \n";
        let data: UnicodeData = line.parse().unwrap();
        assert_eq!(
            data,
            UnicodeData {
                codepoint: codepoint(0x0041),
                name: s("LATIN CAPITAL LETTER A"),
                general_category: s("Lu"),
                canonical_combining_class: 0,
                bidi_class: s("L"),
                decomposition: UnicodeDataDecomposition::new(
                    None,
                    &[codepoint(0x0041)]
                )
                .unwrap(),
                numeric_type_decimal: None,
                numeric_type_digit: None,
                numeric_type_numeric: None,
                bidi_mirrored: false,
                unicode1_name: s(""),
                iso_comment: s(""),
                simple_uppercase_mapping: None,
                simple_lowercase_mapping: Some(codepoint(0x0061)),
                simple_titlecase_mapping: None,
            }
        );
    }

    /// A row whose numeric value is a negative rational.
    #[test]
    fn parse5() {
        let line = "0F33;TIBETAN DIGIT HALF ZERO;No;0;L;;;;-1/2;N;;;;; \n";
        let data: UnicodeData = line.parse().unwrap();
        assert_eq!(
            data,
            UnicodeData {
                codepoint: codepoint(0x0F33),
                name: s("TIBETAN DIGIT HALF ZERO"),
                general_category: s("No"),
                canonical_combining_class: 0,
                bidi_class: s("L"),
                decomposition: UnicodeDataDecomposition::new(
                    None,
                    &[codepoint(0x0F33)]
                )
                .unwrap(),
                numeric_type_decimal: None,
                numeric_type_digit: None,
                numeric_type_numeric: Some(UnicodeDataNumeric::Rational(
                    -1, 2
                )),
                bidi_mirrored: false,
                unicode1_name: s(""),
                iso_comment: s(""),
                simple_uppercase_mapping: None,
                simple_lowercase_mapping: None,
                simple_titlecase_mapping: None,
            }
        );
    }

    /// The Hangul first/last pair expands to one record per codepoint in
    /// AC00..=D7A3 (11172 records); together with the two ordinary rows
    /// around it, that makes 11174 records.
    #[test]
    fn expander() {
        use super::UnicodeDataExpander;
        use crate::common::UcdLineParser;

        let data = "\
ABF9;MEETEI MAYEK DIGIT NINE;Nd;0;L;;9;9;9;N;;;;;
AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
D7B0;HANGUL JUNGSEONG O-YEO;Lo;0;L;;;;;N;;;;;
";
        let records = UcdLineParser::new(None, data.as_bytes())
            .collect::<Result<Vec<_>, _>>()
            .unwrap();
        assert_eq!(UnicodeDataExpander::new(records).count(), 11174);
    }
}
787 | |