1 | use std::path::Path; |
2 | use std::str::FromStr; |
3 | |
4 | use once_cell::sync::Lazy; |
5 | use regex::Regex; |
6 | |
7 | use crate::common::UcdFile; |
8 | use crate::error::Error; |
9 | |
/// One parsed data line of `PropertyValueAliases.txt`: the names by which a
/// single value of a single property may be written.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct PropertyValueAlias {
    /// Name of the property this value belongs to (the first field of the
    /// line).
    pub property: String,
    /// Numeric form of the value. The parser only fills this in for `ccc`
    /// lines; it is `None` for every other property.
    pub numeric: Option<u8>,
    /// Abbreviated name of the value.
    pub abbreviation: String,
    /// Long name of the value.
    pub long: String,
    /// Any further names listed after the long name. Empty when the line
    /// carries none.
    pub aliases: Vec<String>,
}
26 | |
27 | impl UcdFile for PropertyValueAlias { |
28 | fn relative_file_path() -> &'static Path { |
29 | Path::new("PropertyValueAliases.txt" ) |
30 | } |
31 | } |
32 | |
33 | impl FromStr for PropertyValueAlias { |
34 | type Err = Error; |
35 | |
36 | fn from_str(line: &str) -> Result<PropertyValueAlias, Error> { |
37 | static PARTS: Lazy<Regex> = Lazy::new(|| { |
38 | Regex::new( |
39 | r"(?x) |
40 | ^ |
41 | \s*(?P<prop>[^\s;]+)\s*; |
42 | \s*(?P<abbrev>[^\s;]+)\s*; |
43 | \s*(?P<long>[^\s;]+)\s* |
44 | (?:;(?P<aliases>.*))? |
45 | " , |
46 | ) |
47 | .unwrap() |
48 | }); |
49 | static PARTS_CCC: Lazy<Regex> = Lazy::new(|| { |
50 | Regex::new( |
51 | r"(?x) |
52 | ^ |
53 | ccc; |
54 | \s*(?P<num_class>[0-9]+)\s*; |
55 | \s*(?P<abbrev>[^\s;]+)\s*; |
56 | \s*(?P<long>[^\s;]+) |
57 | " , |
58 | ) |
59 | .unwrap() |
60 | }); |
61 | static ALIASES: Lazy<Regex> = Lazy::new(|| { |
62 | Regex::new(r"\s*(?P<alias>[^\s;]+)\s*;?\s*" ).unwrap() |
63 | }); |
64 | |
65 | if line.starts_with("ccc;" ) { |
66 | let caps = match PARTS_CCC.captures(line.trim()) { |
67 | Some(caps) => caps, |
68 | None => { |
69 | return err!("invalid PropertyValueAliases (ccc) line" ) |
70 | } |
71 | }; |
72 | let n = match caps["num_class" ].parse() { |
73 | Ok(n) => n, |
74 | Err(err) => { |
75 | return err!( |
76 | "failed to parse ccc number ' {}': {}" , |
77 | &caps["num_class" ], |
78 | err |
79 | ) |
80 | } |
81 | }; |
82 | let abbrev = caps.name("abbrev" ).unwrap().as_str(); |
83 | let long = caps.name("long" ).unwrap().as_str(); |
84 | return Ok(PropertyValueAlias { |
85 | property: line[0..3].to_string(), |
86 | numeric: Some(n), |
87 | abbreviation: abbrev.to_string(), |
88 | long: long.to_string(), |
89 | aliases: vec![], |
90 | }); |
91 | } |
92 | |
93 | let caps = match PARTS.captures(line.trim()) { |
94 | Some(caps) => caps, |
95 | None => return err!("invalid PropertyValueAliases line" ), |
96 | }; |
97 | let mut aliases = vec![]; |
98 | if let Some(m) = caps.name("aliases" ) { |
99 | for acaps in ALIASES.captures_iter(m.as_str()) { |
100 | let alias = acaps.name("alias" ).unwrap().as_str(); |
101 | if alias == "#" { |
102 | // This starts a comment, so stop reading. |
103 | break; |
104 | } |
105 | aliases.push(alias.to_string()); |
106 | } |
107 | } |
108 | Ok(PropertyValueAlias { |
109 | property: caps.name("prop" ).unwrap().as_str().to_string(), |
110 | numeric: None, |
111 | abbreviation: caps.name("abbrev" ).unwrap().as_str().to_string(), |
112 | long: caps.name("long" ).unwrap().as_str().to_string(), |
113 | aliases, |
114 | }) |
115 | } |
116 | } |
117 | |
#[cfg(test)]
mod tests {
    use super::PropertyValueAlias;

    /// Parses `line` into a row, panicking if the parser rejects it.
    fn parsed(line: &str) -> PropertyValueAlias {
        line.parse().unwrap()
    }

    /// Builds the row a test expects from borrowed pieces.
    fn expected(
        property: &str,
        numeric: Option<u8>,
        abbreviation: &str,
        long: &str,
        aliases: &[&str],
    ) -> PropertyValueAlias {
        PropertyValueAlias {
            property: property.to_string(),
            numeric,
            abbreviation: abbreviation.to_string(),
            long: long.to_string(),
            aliases: aliases.iter().map(|a| a.to_string()).collect(),
        }
    }

    /// A generic line carrying one extra alias.
    #[test]
    fn parse1() {
        let got = parsed("blk; Arabic_PF_A ; Arabic_Presentation_Forms_A ; Arabic_Presentation_Forms-A \n");
        let want = expected(
            "blk",
            None,
            "Arabic_PF_A",
            "Arabic_Presentation_Forms_A",
            &["Arabic_Presentation_Forms-A"],
        );
        assert_eq!(got, want);
    }

    /// A generic line carrying two extra aliases.
    #[test]
    fn parse2() {
        let got = parsed("AHex; N ; No ; F ; False \n");
        let want = expected("AHex", None, "N", "No", &["F", "False"]);
        assert_eq!(got, want);
    }

    /// A generic line with no extra aliases.
    #[test]
    fn parse3() {
        let got = parsed("age; 1.1 ; V1_1 \n");
        let want = expected("age", None, "1.1", "V1_1", &[]);
        assert_eq!(got, want);
    }

    /// A `ccc` line, which carries a numeric value.
    #[test]
    fn parse4() {
        let got = parsed("ccc; 0; NR ; Not_Reordered \n");
        let want = expected("ccc", Some(0), "NR", "Not_Reordered", &[]);
        assert_eq!(got, want);
    }

    /// A `ccc` line followed by a trailing comment.
    #[test]
    fn parse5() {
        let got = parsed("ccc; 133; CCC133 ; CCC133 # RESERVED \n");
        let want = expected("ccc", Some(133), "CCC133", "CCC133", &[]);
        assert_eq!(got, want);
    }

    /// A generic line whose alias list is followed by a trailing comment.
    #[test]
    fn parse6() {
        let got = parsed("gc ; P ; Punctuation ; punct # Pc | Pd | Pe | Pf | Pi | Po | Ps \n");
        let want = expected("gc", None, "P", "Punctuation", &["punct"]);
        assert_eq!(got, want);
    }
}
189 | |