1 | use std::path::Path; |
2 | use std::str::FromStr; |
3 | |
4 | use once_cell::sync::Lazy; |
5 | use regex::Regex; |
6 | |
7 | use crate::common::{Codepoint, CodepointIter, UcdFile, UcdFileByCodepoint}; |
8 | use crate::error::Error; |
9 | |
10 | /// A single row in the `NameAliases.txt` file. |
11 | /// |
12 | /// Note that there are multiple rows for some codepoint. Each row provides a |
13 | /// new alias. |
14 | #[derive (Clone, Debug, Default, Eq, PartialEq)] |
15 | pub struct NameAlias { |
16 | /// The codepoint corresponding to this row. |
17 | pub codepoint: Codepoint, |
18 | /// The alias. |
19 | pub alias: String, |
20 | /// The label of this alias. |
21 | pub label: NameAliasLabel, |
22 | } |
23 | |
24 | impl UcdFile for NameAlias { |
25 | fn relative_file_path() -> &'static Path { |
26 | Path::new("NameAliases.txt" ) |
27 | } |
28 | } |
29 | |
30 | impl UcdFileByCodepoint for NameAlias { |
31 | fn codepoints(&self) -> CodepointIter { |
32 | self.codepoint.into_iter() |
33 | } |
34 | } |
35 | |
36 | impl FromStr for NameAlias { |
37 | type Err = Error; |
38 | |
39 | fn from_str(line: &str) -> Result<NameAlias, Error> { |
40 | static PARTS: Lazy<Regex> = Lazy::new(|| { |
41 | Regex::new( |
42 | r"(?x) |
43 | ^ |
44 | (?P<codepoint>[A-Z0-9]+); |
45 | \s* |
46 | (?P<alias>[^;]+); |
47 | \s* |
48 | (?P<label>\S+) |
49 | " , |
50 | ) |
51 | .unwrap() |
52 | }); |
53 | |
54 | let caps = match PARTS.captures(line.trim()) { |
55 | Some(caps) => caps, |
56 | None => return err!("invalid NameAliases line" ), |
57 | }; |
58 | Ok(NameAlias { |
59 | codepoint: caps["codepoint" ].parse()?, |
60 | alias: caps.name("alias" ).unwrap().as_str().to_string(), |
61 | label: caps["label" ].parse()?, |
62 | }) |
63 | } |
64 | } |
65 | |
/// The kind of alias a `NameAliases.txt` row provides.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum NameAliasLabel {
    /// A correction for a serious problem in a character name.
    Correction,
    /// An ISO 6429 name for a C0 or C1 control function, or another commonly
    /// occurring name for a control code.
    Control,
    /// One of a few widely used alternate names for format characters.
    Alternate,
    /// A documented label for a C1 control code point that was never
    /// actually approved in any standard.
    Figment,
    /// A commonly occurring abbreviation (or acronym) for a control code,
    /// format character, space or variation selector.
    Abbreviation,
}
83 | |
84 | impl Default for NameAliasLabel { |
85 | fn default() -> NameAliasLabel { |
86 | // This is arbitrary, but the Default impl is convenient. |
87 | NameAliasLabel::Correction |
88 | } |
89 | } |
90 | |
91 | impl FromStr for NameAliasLabel { |
92 | type Err = Error; |
93 | |
94 | fn from_str(s: &str) -> Result<NameAliasLabel, Error> { |
95 | match s { |
96 | "correction" => Ok(NameAliasLabel::Correction), |
97 | "control" => Ok(NameAliasLabel::Control), |
98 | "alternate" => Ok(NameAliasLabel::Alternate), |
99 | "figment" => Ok(NameAliasLabel::Figment), |
100 | "abbreviation" => Ok(NameAliasLabel::Abbreviation), |
101 | unknown: &str => err!("unknown name alias label: ' {}'" , unknown), |
102 | } |
103 | } |
104 | } |
105 | |
#[cfg(test)]
mod tests {
    use super::{NameAlias, NameAliasLabel};

    // A control alias on U+0000.
    #[test]
    fn parse1() {
        let NameAlias { codepoint, alias, label } =
            "0000;NULL;control \n".parse::<NameAlias>().unwrap();
        assert_eq!(codepoint, 0x0);
        assert_eq!(alias, "NULL");
        assert_eq!(label, NameAliasLabel::Control);
    }

    // The alias itself may contain spaces.
    #[test]
    fn parse2() {
        let NameAlias { codepoint, alias, label } =
            "000B;VERTICAL TABULATION;control \n".parse::<NameAlias>().unwrap();
        assert_eq!(codepoint, 0xB);
        assert_eq!(alias, "VERTICAL TABULATION");
        assert_eq!(label, NameAliasLabel::Control);
    }

    // A figment alias.
    #[test]
    fn parse3() {
        let NameAlias { codepoint, alias, label } =
            "0081;HIGH OCTET PRESET;figment \n".parse::<NameAlias>().unwrap();
        assert_eq!(codepoint, 0x81);
        assert_eq!(alias, "HIGH OCTET PRESET");
        assert_eq!(label, NameAliasLabel::Figment);
    }

    // A five-digit codepoint with an abbreviation alias.
    #[test]
    fn parse4() {
        let NameAlias { codepoint, alias, label } =
            "E01EF;VS256;abbreviation \n".parse::<NameAlias>().unwrap();
        assert_eq!(codepoint, 0xE01EF);
        assert_eq!(alias, "VS256");
        assert_eq!(label, NameAliasLabel::Abbreviation);
    }
}
146 | |