1 | use std::path::Path; |
2 | |
3 | use crate::{ |
4 | common::{Codepoint, CodepointIter, UcdFile, UcdFileByCodepoint}, |
5 | error::Error, |
6 | }; |
7 | |
/// Represents a single row in the `UnicodeData.txt` file.
///
/// These fields were taken from UAX44, Table 9, as part of the documentation
/// for the
/// [`UnicodeData.txt` file](https://www.unicode.org/reports/tr44/#UnicodeData.txt).
#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct UnicodeData {
    /// The codepoint corresponding to this row.
    pub codepoint: Codepoint,
    /// The name of this codepoint.
    pub name: String,
    /// The "general category" of this codepoint.
    pub general_category: String,
    /// The class of this codepoint used in the Canonical Ordering Algorithm.
    ///
    /// Note that some classes map to a particular symbol. See
    /// [UAX44, Table 15](https://www.unicode.org/reports/tr44/#Canonical_Combining_Class_Values).
    pub canonical_combining_class: u8,
    /// The bidirectional class of this codepoint.
    ///
    /// Possible values are listed in
    /// [UAX44, Table 13](https://www.unicode.org/reports/tr44/#Bidi_Class_Values).
    pub bidi_class: String,
    /// The decomposition mapping for this codepoint. This includes its
    /// formatting tag (if present).
    pub decomposition: UnicodeDataDecomposition,
    /// A decimal numeric representation of this codepoint, if it has the
    /// property `Numeric_Type=Decimal`.
    pub numeric_type_decimal: Option<u8>,
    /// A decimal numeric representation of this codepoint, if it has the
    /// property `Numeric_Type=Digit`. Note that while this field is still
    /// populated for existing codepoints, no new codepoints will have this
    /// field populated.
    pub numeric_type_digit: Option<u8>,
    /// A decimal or rational numeric representation of this codepoint, if it
    /// has the property `Numeric_Type=Numeric`.
    pub numeric_type_numeric: Option<UnicodeDataNumeric>,
    /// A boolean indicating whether this codepoint is "mirrored" in
    /// bidirectional text.
    pub bidi_mirrored: bool,
    /// The "old" Unicode 1.0 or ISO 6429 name of this codepoint. Note that
    /// this field is empty unless it is significantly different from
    /// the `name` field.
    pub unicode1_name: String,
    /// The ISO 10646 comment field. This no longer contains any non-NULL
    /// values.
    pub iso_comment: String,
    /// This codepoint's simple uppercase mapping, if it exists.
    pub simple_uppercase_mapping: Option<Codepoint>,
    /// This codepoint's simple lowercase mapping, if it exists.
    pub simple_lowercase_mapping: Option<Codepoint>,
    /// This codepoint's simple titlecase mapping, if it exists.
    pub simple_titlecase_mapping: Option<Codepoint>,
}
62 | |
63 | impl UcdFile for UnicodeData { |
64 | fn relative_file_path() -> &'static Path { |
65 | Path::new("UnicodeData.txt" ) |
66 | } |
67 | } |
68 | |
69 | impl UcdFileByCodepoint for UnicodeData { |
70 | fn codepoints(&self) -> CodepointIter { |
71 | self.codepoint.into_iter() |
72 | } |
73 | } |
74 | |
75 | impl UnicodeData { |
76 | /// Returns true if and only if this record corresponds to the start of a |
77 | /// range. |
78 | pub fn is_range_start(&self) -> bool { |
79 | self.name.starts_with('<' ) |
80 | && self.name.ends_with('>' ) |
81 | && self.name.contains("First" ) |
82 | } |
83 | |
84 | /// Returns true if and only if this record corresponds to the end of a |
85 | /// range. |
86 | pub fn is_range_end(&self) -> bool { |
87 | self.name.starts_with('<' ) |
88 | && self.name.ends_with('>' ) |
89 | && self.name.contains("Last" ) |
90 | } |
91 | } |
92 | |
93 | impl std::str::FromStr for UnicodeData { |
94 | type Err = Error; |
95 | |
96 | fn from_str(line: &str) -> Result<UnicodeData, Error> { |
97 | let re_parts = regex!( |
98 | r"(?x) |
99 | ^ |
100 | ([A-Z0-9]+); # 1; codepoint |
101 | ([^;]+); # 2; name |
102 | ([^;]+); # 3; general category |
103 | ([0-9]+); # 4; canonical combining class |
104 | ([^;]+); # 5; bidi class |
105 | ([^;]*); # 6; decomposition |
106 | ([0-9]*); # 7; numeric type decimal |
107 | ([0-9]*); # 8; numeric type digit |
108 | ([-0-9/]*); # 9; numeric type numeric |
109 | ([YN]); # 10; bidi mirrored |
110 | ([^;]*); # 11; unicode1 name |
111 | ([^;]*); # 12; ISO comment |
112 | ([^;]*); # 13; simple uppercase mapping |
113 | ([^;]*); # 14; simple lowercase mapping |
114 | ([^;]*) # 15; simple titlecase mapping |
115 | $ |
116 | " , |
117 | ); |
118 | |
119 | let caps = match re_parts.captures(line.trim()) { |
120 | Some(caps) => caps, |
121 | None => return err!("invalid UnicodeData line" ), |
122 | }; |
123 | let capget = |n| caps.get(n).unwrap().as_str(); |
124 | let mut data = UnicodeData::default(); |
125 | |
126 | data.codepoint = capget(1).parse()?; |
127 | data.name = capget(2).to_string(); |
128 | data.general_category = capget(3).to_string(); |
129 | data.canonical_combining_class = match capget(4).parse() { |
130 | Ok(n) => n, |
131 | Err(err) => { |
132 | return err!( |
133 | "failed to parse canonical combining class ' {}': {}" , |
134 | capget(4), |
135 | err |
136 | ) |
137 | } |
138 | }; |
139 | data.bidi_class = capget(5).to_string(); |
140 | if !caps[6].is_empty() { |
141 | data.decomposition = caps[6].parse()?; |
142 | } else { |
143 | data.decomposition.push(data.codepoint)?; |
144 | } |
145 | if !capget(7).is_empty() { |
146 | data.numeric_type_decimal = Some(match capget(7).parse() { |
147 | Ok(n) => n, |
148 | Err(err) => { |
149 | return err!( |
150 | "failed to parse numeric type decimal ' {}': {}" , |
151 | capget(7), |
152 | err |
153 | ) |
154 | } |
155 | }); |
156 | } |
157 | if !capget(8).is_empty() { |
158 | data.numeric_type_digit = Some(match capget(8).parse() { |
159 | Ok(n) => n, |
160 | Err(err) => { |
161 | return err!( |
162 | "failed to parse numeric type digit ' {}': {}" , |
163 | capget(8), |
164 | err |
165 | ) |
166 | } |
167 | }); |
168 | } |
169 | if !capget(9).is_empty() { |
170 | data.numeric_type_numeric = Some(capget(9).parse()?); |
171 | } |
172 | data.bidi_mirrored = capget(10) == "Y" ; |
173 | data.unicode1_name = capget(11).to_string(); |
174 | data.iso_comment = capget(12).to_string(); |
175 | if !capget(13).is_empty() { |
176 | data.simple_uppercase_mapping = Some(capget(13).parse()?); |
177 | } |
178 | if !capget(14).is_empty() { |
179 | data.simple_lowercase_mapping = Some(capget(14).parse()?); |
180 | } |
181 | if !capget(15).is_empty() { |
182 | data.simple_titlecase_mapping = Some(capget(15).parse()?); |
183 | } |
184 | Ok(data) |
185 | } |
186 | } |
187 | |
188 | impl std::fmt::Display for UnicodeData { |
189 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { |
190 | write!(f, " {};" , self.codepoint)?; |
191 | write!(f, " {};" , self.name)?; |
192 | write!(f, " {};" , self.general_category)?; |
193 | write!(f, " {};" , self.canonical_combining_class)?; |
194 | write!(f, " {};" , self.bidi_class)?; |
195 | if self.decomposition.is_canonical() |
196 | && self.decomposition.mapping() == &[self.codepoint] |
197 | { |
198 | write!(f, ";" )?; |
199 | } else { |
200 | write!(f, " {};" , self.decomposition)?; |
201 | } |
202 | if let Some(n) = self.numeric_type_decimal { |
203 | write!(f, " {};" , n)?; |
204 | } else { |
205 | write!(f, ";" )?; |
206 | } |
207 | if let Some(n) = self.numeric_type_digit { |
208 | write!(f, " {};" , n)?; |
209 | } else { |
210 | write!(f, ";" )?; |
211 | } |
212 | if let Some(n) = self.numeric_type_numeric { |
213 | write!(f, " {};" , n)?; |
214 | } else { |
215 | write!(f, ";" )?; |
216 | } |
217 | write!(f, " {};" , if self.bidi_mirrored { "Y" } else { "N" })?; |
218 | write!(f, " {};" , self.unicode1_name)?; |
219 | write!(f, " {};" , self.iso_comment)?; |
220 | if let Some(cp) = self.simple_uppercase_mapping { |
221 | write!(f, " {};" , cp)?; |
222 | } else { |
223 | write!(f, ";" )?; |
224 | } |
225 | if let Some(cp) = self.simple_lowercase_mapping { |
226 | write!(f, " {};" , cp)?; |
227 | } else { |
228 | write!(f, ";" )?; |
229 | } |
230 | if let Some(cp) = self.simple_titlecase_mapping { |
231 | write!(f, " {}" , cp)?; |
232 | } |
233 | Ok(()) |
234 | } |
235 | } |
236 | |
/// Represents a decomposition mapping of a single row in the
/// `UnicodeData.txt` file.
#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct UnicodeDataDecomposition {
    /// The formatting tag associated with this mapping, if present.
    pub tag: Option<UnicodeDataDecompositionTag>,
    /// The number of codepoints in this mapping.
    pub len: usize,
    /// The codepoints in the mapping. Entries beyond `len` in the mapping
    /// are always U+0000. If no mapping was present, then this always contains
    /// a single codepoint corresponding to this row's character.
    pub mapping: [Codepoint; 18],
}
250 | |
251 | impl UnicodeDataDecomposition { |
252 | /// Create a new decomposition mapping with the given tag and codepoints. |
253 | /// |
254 | /// If there are too many codepoints, then an error is returned. |
255 | pub fn new( |
256 | tag: Option<UnicodeDataDecompositionTag>, |
257 | mapping: &[Codepoint], |
258 | ) -> Result<UnicodeDataDecomposition, Error> { |
259 | let mut x = UnicodeDataDecomposition::default(); |
260 | x.tag = tag; |
261 | for &cp in mapping { |
262 | x.push(cp)?; |
263 | } |
264 | Ok(x) |
265 | } |
266 | |
267 | /// Add a new codepoint to this decomposition's mapping. |
268 | /// |
269 | /// If the mapping is already full, then this returns an error. |
270 | pub fn push(&mut self, cp: Codepoint) -> Result<(), Error> { |
271 | if self.len >= self.mapping.len() { |
272 | return err!( |
273 | "invalid decomposition mapping (too many codepoints)" |
274 | ); |
275 | } |
276 | self.mapping[self.len] = cp; |
277 | self.len += 1; |
278 | Ok(()) |
279 | } |
280 | |
281 | /// Return the mapping as a slice of codepoints. The slice returned |
282 | /// has length equivalent to the number of codepoints in this mapping. |
283 | pub fn mapping(&self) -> &[Codepoint] { |
284 | &self.mapping[..self.len] |
285 | } |
286 | |
287 | /// Returns true if and only if this decomposition mapping is canonical. |
288 | pub fn is_canonical(&self) -> bool { |
289 | self.tag.is_none() |
290 | } |
291 | } |
292 | |
293 | impl std::str::FromStr for UnicodeDataDecomposition { |
294 | type Err = Error; |
295 | |
296 | fn from_str(s: &str) -> Result<UnicodeDataDecomposition, Error> { |
297 | let re_with_tag = |
298 | regex!(r"^(?:<(?P<tag>[^>]+)>)?\s*(?P<chars>[\s0-9A-F]+)$" ); |
299 | let re_chars = regex!(r"[0-9A-F]+" ); |
300 | if s.is_empty() { |
301 | return err!( |
302 | "expected non-empty string for \ |
303 | UnicodeDataDecomposition value" |
304 | ); |
305 | } |
306 | let caps = match re_with_tag.captures(s) { |
307 | Some(caps) => caps, |
308 | None => return err!("invalid decomposition value" ), |
309 | }; |
310 | let mut decomp = UnicodeDataDecomposition::default(); |
311 | let mut codepoints = s; |
312 | if let Some(m) = caps.name("tag" ) { |
313 | decomp.tag = Some(m.as_str().parse()?); |
314 | codepoints = &caps["chars" ]; |
315 | } |
316 | for m in re_chars.find_iter(codepoints) { |
317 | let cp = m.as_str().parse()?; |
318 | decomp.push(cp)?; |
319 | } |
320 | Ok(decomp) |
321 | } |
322 | } |
323 | |
324 | impl std::fmt::Display for UnicodeDataDecomposition { |
325 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { |
326 | if let Some(ref tag: &UnicodeDataDecompositionTag) = self.tag { |
327 | write!(f, "< {}> " , tag)?; |
328 | } |
329 | let mut first: bool = true; |
330 | for cp: &Codepoint in self.mapping() { |
331 | if !first { |
332 | write!(f, " " )?; |
333 | } |
334 | first = false; |
335 | write!(f, " {}" , cp)?; |
336 | } |
337 | Ok(()) |
338 | } |
339 | } |
340 | |
/// The formatting tag on a decomposition mapping.
///
/// This is taken from
/// [UAX44, Table 14](https://www.unicode.org/reports/tr44/#Character_Decomposition_Mappings).
///
/// Each variant is documented with the textual tag it is parsed from and
/// displayed as.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum UnicodeDataDecompositionTag {
    /// `<font>`
    Font,
    /// `<noBreak>`
    NoBreak,
    /// `<initial>`
    Initial,
    /// `<medial>`
    Medial,
    /// `<final>`
    Final,
    /// `<isolated>`
    Isolated,
    /// `<circle>`
    Circle,
    /// `<super>`
    Super,
    /// `<sub>`
    Sub,
    /// `<vertical>`
    Vertical,
    /// `<wide>`
    Wide,
    /// `<narrow>`
    Narrow,
    /// `<small>`
    Small,
    /// `<square>`
    Square,
    /// `<fraction>`
    Fraction,
    /// `<compat>`
    Compat,
}
380 | |
381 | impl std::str::FromStr for UnicodeDataDecompositionTag { |
382 | type Err = Error; |
383 | |
384 | fn from_str(s: &str) -> Result<UnicodeDataDecompositionTag, Error> { |
385 | use self::UnicodeDataDecompositionTag::*; |
386 | Ok(match s { |
387 | "font" => Font, |
388 | "noBreak" => NoBreak, |
389 | "initial" => Initial, |
390 | "medial" => Medial, |
391 | "final" => Final, |
392 | "isolated" => Isolated, |
393 | "circle" => Circle, |
394 | "super" => Super, |
395 | "sub" => Sub, |
396 | "vertical" => Vertical, |
397 | "wide" => Wide, |
398 | "narrow" => Narrow, |
399 | "small" => Small, |
400 | "square" => Square, |
401 | "fraction" => Fraction, |
402 | "compat" => Compat, |
403 | _ => return err!("invalid decomposition formatting tag: {}" , s), |
404 | }) |
405 | } |
406 | } |
407 | |
408 | impl std::fmt::Display for UnicodeDataDecompositionTag { |
409 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { |
410 | use self::UnicodeDataDecompositionTag::*; |
411 | let s: &'static str = match *self { |
412 | Font => "font" , |
413 | NoBreak => "noBreak" , |
414 | Initial => "initial" , |
415 | Medial => "medial" , |
416 | Final => "final" , |
417 | Isolated => "isolated" , |
418 | Circle => "circle" , |
419 | Super => "super" , |
420 | Sub => "sub" , |
421 | Vertical => "vertical" , |
422 | Wide => "wide" , |
423 | Narrow => "narrow" , |
424 | Small => "small" , |
425 | Square => "square" , |
426 | Fraction => "fraction" , |
427 | Compat => "compat" , |
428 | }; |
429 | write!(f, " {}" , s) |
430 | } |
431 | } |
432 | |
/// A numeric value corresponding to characters with `Numeric_Type=Numeric`.
///
/// A numeric value can either be a signed integer or a rational number.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum UnicodeDataNumeric {
    /// An integer.
    Integer(i64),
    /// A rational number. The first is the numerator and the latter is the
    /// denominator.
    Rational(i64, i64),
}
444 | |
445 | impl std::str::FromStr for UnicodeDataNumeric { |
446 | type Err = Error; |
447 | |
448 | fn from_str(s: &str) -> Result<UnicodeDataNumeric, Error> { |
449 | if s.is_empty() { |
450 | return err!( |
451 | "expected non-empty string for UnicodeDataNumeric value" |
452 | ); |
453 | } |
454 | if let Some(pos) = s.find('/' ) { |
455 | let (snum, sden) = (&s[..pos], &s[pos + 1..]); |
456 | let num = match snum.parse() { |
457 | Ok(num) => num, |
458 | Err(err) => { |
459 | return err!( |
460 | "invalid integer numerator ' {}': {}" , |
461 | snum, |
462 | err |
463 | ); |
464 | } |
465 | }; |
466 | let den = match sden.parse() { |
467 | Ok(den) => den, |
468 | Err(err) => { |
469 | return err!( |
470 | "invalid integer denominator ' {}': {}" , |
471 | sden, |
472 | err |
473 | ); |
474 | } |
475 | }; |
476 | Ok(UnicodeDataNumeric::Rational(num, den)) |
477 | } else { |
478 | match s.parse() { |
479 | Ok(den) => Ok(UnicodeDataNumeric::Integer(den)), |
480 | Err(err) => { |
481 | return err!( |
482 | "invalid integer denominator ' {}': {}" , |
483 | s, |
484 | err |
485 | ); |
486 | } |
487 | } |
488 | } |
489 | } |
490 | } |
491 | |
492 | impl std::fmt::Display for UnicodeDataNumeric { |
493 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { |
494 | match *self { |
495 | UnicodeDataNumeric::Integer(n: i64) => write!(f, " {}" , n), |
496 | UnicodeDataNumeric::Rational(n: i64, d: i64) => write!(f, " {}/ {}" , n, d), |
497 | } |
498 | } |
499 | } |
500 | |
/// An iterator adapter that expands rows in `UnicodeData.txt`.
///
/// Throughout `UnicodeData.txt`, some assigned codepoints are not explicitly
/// represented. Instead, they are represented by a pair of rows, indicating
/// a range of codepoints with the same properties. For example, the Hangul
/// syllable codepoints are represented by these two rows:
///
/// ```ignore
/// AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
/// D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
/// ```
///
/// This iterator will wrap any iterator of `UnicodeData` and, when a range of
/// Unicode codepoints is found, it will be expanded to the appropriate
/// sequence of `UnicodeData` values. Note that all such expanded records will
/// have an empty name.
pub struct UnicodeDataExpander<I: Iterator> {
    /// The underlying iterator. It is peekable so that the row following a
    /// range start can be inspected before it is consumed.
    it: std::iter::Peekable<I>,
    /// The range of codepoints still to be emitted after a first/last pair
    /// has been found. When no expansion is pending this range is empty (it
    /// is initialised to `0..0`).
    range: CodepointRange,
}
524 | |
/// The state of an in-progress range expansion: the codepoint values left to
/// emit plus the record they are all derived from.
struct CodepointRange {
    /// The codepoint range.
    range: std::ops::Range<u32>,
    /// The start record. All subsequent records in this range are generated
    /// by cloning this and updating the codepoint/name.
    start_record: UnicodeData,
}
532 | |
533 | impl<I: Iterator<Item = UnicodeData>> UnicodeDataExpander<I> { |
534 | /// Create a new iterator that expands pairs of `UnicodeData` range |
535 | /// records. All other records are passed through as-is. |
536 | pub fn new<T>(it: T) -> UnicodeDataExpander<I> |
537 | where |
538 | T: IntoIterator<IntoIter = I, Item = I::Item>, |
539 | { |
540 | UnicodeDataExpander { |
541 | it: it.into_iter().peekable(), |
542 | range: CodepointRange { |
543 | range: 0..0, |
544 | start_record: UnicodeData::default(), |
545 | }, |
546 | } |
547 | } |
548 | } |
549 | |
550 | impl<I: Iterator<Item = UnicodeData>> Iterator for UnicodeDataExpander<I> { |
551 | type Item = UnicodeData; |
552 | |
553 | fn next(&mut self) -> Option<UnicodeData> { |
554 | if let Some(udata: UnicodeData) = self.range.next() { |
555 | return Some(udata); |
556 | } |
557 | let row1: UnicodeData = match self.it.next() { |
558 | None => return None, |
559 | Some(row1: UnicodeData) => row1, |
560 | }; |
561 | if !row1.is_range_start() |
562 | || !self.it.peek().map_or(default:false, |row2: &UnicodeData| row2.is_range_end()) |
563 | { |
564 | return Some(row1); |
565 | } |
566 | let row2: UnicodeData = self.it.next().unwrap(); |
567 | self.range = CodepointRange { |
568 | range: row1.codepoint.value()..(row2.codepoint.value() + 1), |
569 | start_record: row1, |
570 | }; |
571 | self.next() |
572 | } |
573 | } |
574 | |
575 | impl Iterator for CodepointRange { |
576 | type Item = UnicodeData; |
577 | |
578 | fn next(&mut self) -> Option<UnicodeData> { |
579 | let cp: u32 = match self.range.next() { |
580 | None => return None, |
581 | Some(cp: u32) => cp, |
582 | }; |
583 | Some(UnicodeData { |
584 | codepoint: Codepoint::from_u32(cp).unwrap(), |
585 | name: "" .to_string(), |
586 | ..self.start_record.clone() |
587 | }) |
588 | } |
589 | } |
590 | |
// Unit tests for parsing individual `UnicodeData.txt` lines and for expanding
// first/last range pairs.
#[cfg(test)]
mod tests {
    use crate::common::Codepoint;

    use super::{
        UnicodeData, UnicodeDataDecomposition, UnicodeDataDecompositionTag,
        UnicodeDataNumeric,
    };

    // Builds a `Codepoint` from a raw value, panicking if it is rejected.
    fn codepoint(n: u32) -> Codepoint {
        Codepoint::from_u32(n).unwrap()
    }

    // Shorthand for turning a string literal into an owned `String`.
    fn s(string: &str) -> String {
        string.to_string()
    }

    // A row with a tagged (`<compat>`) multi-codepoint decomposition.
    #[test]
    fn parse1() {
        let line = "249D;PARENTHESIZED LATIN SMALL LETTER B;So;0;L;<compat> 0028 0062 0029;;;;N;;;;; \n";
        let data: UnicodeData = line.parse().unwrap();
        assert_eq!(
            data,
            UnicodeData {
                codepoint: codepoint(0x249d),
                name: s("PARENTHESIZED LATIN SMALL LETTER B"),
                general_category: s("So"),
                canonical_combining_class: 0,
                bidi_class: s("L"),
                decomposition: UnicodeDataDecomposition::new(
                    Some(UnicodeDataDecompositionTag::Compat),
                    &[codepoint(0x28), codepoint(0x62), codepoint(0x29)],
                )
                .unwrap(),
                numeric_type_decimal: None,
                numeric_type_digit: None,
                numeric_type_numeric: None,
                bidi_mirrored: false,
                unicode1_name: s(""),
                iso_comment: s(""),
                simple_uppercase_mapping: None,
                simple_lowercase_mapping: None,
                simple_titlecase_mapping: None,
            }
        );
    }

    // A row with an empty decomposition field (expected to map to itself)
    // and a populated Unicode 1.0 name.
    #[test]
    fn parse2() {
        let line = "000D;<control>;Cc;0;B;;;;;N;CARRIAGE RETURN (CR);;;; \n";
        let data: UnicodeData = line.parse().unwrap();
        assert_eq!(
            data,
            UnicodeData {
                codepoint: codepoint(0x000D),
                name: s("<control>"),
                general_category: s("Cc"),
                canonical_combining_class: 0,
                bidi_class: s("B"),
                decomposition: UnicodeDataDecomposition::new(
                    None,
                    &[codepoint(0x000D)]
                )
                .unwrap(),
                numeric_type_decimal: None,
                numeric_type_digit: None,
                numeric_type_numeric: None,
                bidi_mirrored: false,
                unicode1_name: s("CARRIAGE RETURN (CR)"),
                iso_comment: s(""),
                simple_uppercase_mapping: None,
                simple_lowercase_mapping: None,
                simple_titlecase_mapping: None,
            }
        );
    }

    // A row with a `<fraction>` decomposition and a rational numeric value.
    #[test]
    fn parse3() {
        let line = "00BC;VULGAR FRACTION ONE QUARTER;No;0;ON;<fraction> 0031 2044 0034;;;1/4;N;FRACTION ONE QUARTER;;;; \n";
        let data: UnicodeData = line.parse().unwrap();
        assert_eq!(
            data,
            UnicodeData {
                codepoint: codepoint(0x00BC),
                name: s("VULGAR FRACTION ONE QUARTER"),
                general_category: s("No"),
                canonical_combining_class: 0,
                bidi_class: s("ON"),
                decomposition: UnicodeDataDecomposition::new(
                    Some(UnicodeDataDecompositionTag::Fraction),
                    &[codepoint(0x31), codepoint(0x2044), codepoint(0x34)],
                )
                .unwrap(),
                numeric_type_decimal: None,
                numeric_type_digit: None,
                numeric_type_numeric: Some(UnicodeDataNumeric::Rational(1, 4)),
                bidi_mirrored: false,
                unicode1_name: s("FRACTION ONE QUARTER"),
                iso_comment: s(""),
                simple_uppercase_mapping: None,
                simple_lowercase_mapping: None,
                simple_titlecase_mapping: None,
            }
        );
    }

    // A row with only a simple lowercase mapping populated.
    #[test]
    fn parse4() {
        let line = "0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061; \n";
        let data: UnicodeData = line.parse().unwrap();
        assert_eq!(
            data,
            UnicodeData {
                codepoint: codepoint(0x0041),
                name: s("LATIN CAPITAL LETTER A"),
                general_category: s("Lu"),
                canonical_combining_class: 0,
                bidi_class: s("L"),
                decomposition: UnicodeDataDecomposition::new(
                    None,
                    &[codepoint(0x0041)]
                )
                .unwrap(),
                numeric_type_decimal: None,
                numeric_type_digit: None,
                numeric_type_numeric: None,
                bidi_mirrored: false,
                unicode1_name: s(""),
                iso_comment: s(""),
                simple_uppercase_mapping: None,
                simple_lowercase_mapping: Some(codepoint(0x0061)),
                simple_titlecase_mapping: None,
            }
        );
    }

    // A row whose numeric value is a negative rational.
    #[test]
    fn parse5() {
        let line = "0F33;TIBETAN DIGIT HALF ZERO;No;0;L;;;;-1/2;N;;;;; \n";
        let data: UnicodeData = line.parse().unwrap();
        assert_eq!(
            data,
            UnicodeData {
                codepoint: codepoint(0x0F33),
                name: s("TIBETAN DIGIT HALF ZERO"),
                general_category: s("No"),
                canonical_combining_class: 0,
                bidi_class: s("L"),
                decomposition: UnicodeDataDecomposition::new(
                    None,
                    &[codepoint(0x0F33)]
                )
                .unwrap(),
                numeric_type_decimal: None,
                numeric_type_digit: None,
                numeric_type_numeric: Some(UnicodeDataNumeric::Rational(
                    -1, 2
                )),
                bidi_mirrored: false,
                unicode1_name: s(""),
                iso_comment: s(""),
                simple_uppercase_mapping: None,
                simple_lowercase_mapping: None,
                simple_titlecase_mapping: None,
            }
        );
    }

    // Two ordinary rows surrounding one first/last pair. The pair spans
    // U+AC00..=U+D7A3, i.e. 11172 codepoints, so 11174 records are expected
    // in total.
    #[test]
    fn expander() {
        use super::UnicodeDataExpander;
        use crate::common::UcdLineParser;

        let data = "\
ABF9;MEETEI MAYEK DIGIT NINE;Nd;0;L;;9;9;9;N;;;;;
AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
D7B0;HANGUL JUNGSEONG O-YEO;Lo;0;L;;;;;N;;;;;
";
        let records = UcdLineParser::new(None, data.as_bytes())
            .collect::<Result<Vec<_>, _>>()
            .unwrap();
        assert_eq!(UnicodeDataExpander::new(records).count(), 11174);
    }
}
777 | |