1 | use std::path::Path; |
2 | |
3 | use crate::{ |
4 | common::{ |
5 | parse_codepoint_sequence, Codepoint, CodepointIter, UcdFile, |
6 | UcdFileByCodepoint, |
7 | }, |
8 | error::Error, |
9 | }; |
10 | |
/// A single row in the `SpecialCasing.txt` file.
///
/// Each row pairs one codepoint with its lowercase, titlecase and uppercase
/// mappings, plus an optional list of conditions under which the mapping
/// applies.
///
/// Note that a single codepoint may be mapped multiple times. In particular,
/// a single codepoint might have mappings based on distinct language sensitive
/// conditions (e.g., `U+0307`).
///
/// A row can be built from one line of the file through this type's
/// `FromStr` implementation (i.e. `line.parse::<SpecialCaseMapping>()`).
#[derive (Clone, Debug, Default, Eq, PartialEq)]
pub struct SpecialCaseMapping {
    /// The codepoint that is being mapped.
    pub codepoint: Codepoint,
    /// The lowercase mapping, which may be empty.
    pub lowercase: Vec<Codepoint>,
    /// The titlecase mapping, which may be empty.
    pub titlecase: Vec<Codepoint>,
    /// The uppercase mapping, which may be empty.
    pub uppercase: Vec<Codepoint>,
    /// A list of language specific conditions, see `SpecialCasing.txt` for
    /// more details.
    ///
    /// When parsed from a line, each whitespace-separated token of the
    /// condition field becomes one entry, in order (e.g. `tr After_I` yields
    /// `["tr", "After_I"]`). The list is empty when the row carries no
    /// conditions.
    pub conditions: Vec<String>,
}
30 | |
impl UcdFile for SpecialCaseMapping {
    /// Returns the path of the data file that rows of this type are read
    /// from: `SpecialCasing.txt`.
    fn relative_file_path() -> &'static Path {
        // Name of the file these rows come from.
        const FILE_NAME: &str = "SpecialCasing.txt";
        Path::new(FILE_NAME)
    }
}
36 | |
impl UcdFileByCodepoint for SpecialCaseMapping {
    /// Returns an iterator over the codepoints this row applies to.
    ///
    /// The iterator is built solely from the row's single `codepoint` field
    /// (presumably yielding just that one codepoint; confirm against
    /// `Codepoint::into_iter`).
    fn codepoints(&self) -> CodepointIter {
        self.codepoint.into_iter()
    }
}
42 | |
43 | impl std::str::FromStr for SpecialCaseMapping { |
44 | type Err = Error; |
45 | |
46 | fn from_str(line: &str) -> Result<SpecialCaseMapping, Error> { |
47 | let re_parts = regex!( |
48 | r"(?x) |
49 | ^ |
50 | \s*(?P<codepoint>[^\s;]+)\s*; |
51 | \s*(?P<lower>[^;]+)\s*; |
52 | \s*(?P<title>[^;]+)\s*; |
53 | \s*(?P<upper>[^;]+)\s*; |
54 | \s*(?P<conditions>[^;\x23]+)? |
55 | " , |
56 | ); |
57 | |
58 | let caps = match re_parts.captures(line.trim()) { |
59 | Some(caps) => caps, |
60 | None => return err!("invalid SpecialCasing line: ' {}'" , line), |
61 | }; |
62 | let conditions = caps |
63 | .name("conditions" ) |
64 | .map(|x| { |
65 | x.as_str() |
66 | .trim() |
67 | .split_whitespace() |
68 | .map(|c| c.to_string()) |
69 | .collect() |
70 | }) |
71 | .unwrap_or(vec![]); |
72 | Ok(SpecialCaseMapping { |
73 | codepoint: caps["codepoint" ].parse()?, |
74 | lowercase: parse_codepoint_sequence(&caps["lower" ])?, |
75 | titlecase: parse_codepoint_sequence(&caps["title" ])?, |
76 | uppercase: parse_codepoint_sequence(&caps["upper" ])?, |
77 | conditions, |
78 | }) |
79 | } |
80 | } |
81 | |
#[cfg(test)]
mod tests {
    use super::SpecialCaseMapping;

    /// A row whose condition field is absent parses with an empty
    /// `conditions` list and multi-codepoint title/upper mappings.
    #[test]
    fn parse_no_conds() {
        let input = "1F52; 1F52; 03A5 0313 0300; 03A5 0313 0300; # GREEK SMALL LETTER UPSILON WITH PSILI AND VARIA \n";
        let SpecialCaseMapping {
            codepoint,
            lowercase,
            titlecase,
            uppercase,
            conditions,
        } = input.parse::<SpecialCaseMapping>().unwrap();

        assert_eq!(codepoint, 0x1F52);
        assert_eq!(lowercase, vec![0x1F52]);
        assert_eq!(titlecase, vec![0x03A5, 0x0313, 0x0300]);
        assert_eq!(uppercase, vec![0x03A5, 0x0313, 0x0300]);
        assert!(conditions.is_empty());
    }

    /// A row with a blank lowercase field and a condition field parses with
    /// an empty lowercase mapping and one entry per condition token.
    #[test]
    fn parse_conds() {
        let input = "0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE \n";
        let SpecialCaseMapping {
            codepoint,
            lowercase,
            titlecase,
            uppercase,
            conditions,
        } = input.parse::<SpecialCaseMapping>().unwrap();

        assert_eq!(codepoint, 0x0307);
        assert!(lowercase.is_empty());
        assert_eq!(titlecase, vec![0x0307]);
        assert_eq!(uppercase, vec![0x0307]);
        assert_eq!(conditions, vec!["tr", "After_I"]);
    }
}
108 | |