1 | use std::path::Path; |
2 | use std::str::FromStr; |
3 | |
4 | use once_cell::sync::Lazy; |
5 | use regex::Regex; |
6 | |
7 | use crate::common::{Codepoint, CodepointIter, UcdFile, UcdFileByCodepoint}; |
8 | use crate::error::Error; |
9 | |
10 | /// A single row in the `CaseFolding.txt` file. |
11 | /// |
12 | /// The contents of `CaseFolding.txt` are a convenience derived from both |
13 | /// `UnicodeData.txt` and `SpecialCasing.txt`. |
14 | /// |
15 | /// Note that a single codepoint may be mapped multiple times. In particular, |
16 | /// a single codepoint might have distinct `CaseStatus::Simple` and |
17 | /// `CaseStatus::Full` mappings. |
18 | #[derive (Clone, Debug, Default, Eq, PartialEq)] |
19 | pub struct CaseFold { |
20 | /// The codepoint that is being mapped. |
21 | pub codepoint: Codepoint, |
22 | /// The case status of this mapping. |
23 | pub status: CaseStatus, |
24 | /// The actual case mapping, which is more than one codepoint if this is |
25 | /// a "full" mapping. |
26 | pub mapping: Vec<Codepoint>, |
27 | } |
28 | |
29 | impl UcdFile for CaseFold { |
30 | fn relative_file_path() -> &'static Path { |
31 | Path::new("CaseFolding.txt" ) |
32 | } |
33 | } |
34 | |
35 | impl UcdFileByCodepoint for CaseFold { |
36 | fn codepoints(&self) -> CodepointIter { |
37 | self.codepoint.into_iter() |
38 | } |
39 | } |
40 | |
41 | impl FromStr for CaseFold { |
42 | type Err = Error; |
43 | |
44 | fn from_str(line: &str) -> Result<CaseFold, Error> { |
45 | static PARTS: Lazy<Regex> = Lazy::new(|| { |
46 | Regex::new( |
47 | r"(?x) |
48 | ^ |
49 | \s*(?P<codepoint>[^\s;]+)\s*; |
50 | \s*(?P<status>[^\s;]+)\s*; |
51 | \s*(?P<mapping>[^;]+)\s*; |
52 | " , |
53 | ) |
54 | .unwrap() |
55 | }); |
56 | |
57 | let caps = match PARTS.captures(line.trim()) { |
58 | Some(caps) => caps, |
59 | None => return err!("invalid CaseFolding line: ' {}'" , line), |
60 | }; |
61 | let mut mapping = vec![]; |
62 | for cp in caps["mapping" ].split_whitespace() { |
63 | mapping.push(cp.parse()?); |
64 | } |
65 | Ok(CaseFold { |
66 | codepoint: caps["codepoint" ].parse()?, |
67 | status: caps["status" ].parse()?, |
68 | mapping, |
69 | }) |
70 | } |
71 | } |
72 | |
/// The status attached to a single case mapping.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CaseStatus {
    /// A mapping that is shared by both the "simple" and the "full" mappings.
    Common,
    /// A mapping that changes the number of codepoints.
    Full,
    /// A mapping that leaves the number of codepoints unchanged, used when
    /// it differs from the `Full` mapping.
    Simple,
    /// Special cases (currently only for Turkic mappings) that are typically
    /// excluded by default. These do not change the number of codepoints,
    /// but they may change the length in bytes of an encoding such as UTF-8.
    Special,
}
88 | |
89 | impl Default for CaseStatus { |
90 | fn default() -> CaseStatus { |
91 | CaseStatus::Common |
92 | } |
93 | } |
94 | |
95 | impl CaseStatus { |
96 | /// Returns true if and only if this status indicates a case mapping that |
97 | /// won't change the number of codepoints. |
98 | pub fn is_fixed(&self) -> bool { |
99 | *self != CaseStatus::Full |
100 | } |
101 | } |
102 | |
103 | impl FromStr for CaseStatus { |
104 | type Err = Error; |
105 | |
106 | fn from_str(s: &str) -> Result<CaseStatus, Error> { |
107 | match s { |
108 | "C" => Ok(CaseStatus::Common), |
109 | "F" => Ok(CaseStatus::Full), |
110 | "S" => Ok(CaseStatus::Simple), |
111 | "T" => Ok(CaseStatus::Special), |
112 | _ => err!( |
113 | "unrecognized case status: ' {}' \ |
114 | (must be one of C, F, S or T)" , |
115 | s |
116 | ), |
117 | } |
118 | } |
119 | } |
120 | |
#[cfg(test)]
mod tests {
    use super::{CaseFold, CaseStatus};

    /// Parses `line` as a `CaseFold` row, panicking if parsing fails.
    fn row(line: &str) -> CaseFold {
        line.parse().unwrap()
    }

    // A row with the "common" status and a single-codepoint mapping.
    #[test]
    fn parse_common() {
        let parsed = row(
            "0150; C; 0151; # LATIN CAPITAL LETTER O WITH DOUBLE ACUTE \n",
        );
        assert_eq!(parsed.codepoint, 0x0150);
        assert_eq!(parsed.status, CaseStatus::Common);
        assert_eq!(parsed.mapping, vec![0x0151]);
    }

    // A row with the "full" status, whose mapping has several codepoints.
    #[test]
    fn parse_full() {
        let parsed = row("03B0; F; 03C5 0308 0301; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS \n");
        assert_eq!(parsed.codepoint, 0x03B0);
        assert_eq!(parsed.status, CaseStatus::Full);
        assert_eq!(parsed.mapping, vec![0x03C5, 0x0308, 0x0301]);
    }

    // A row with the "simple" status and a single-codepoint mapping.
    #[test]
    fn parse_simple() {
        let parsed = row("1F8F; S; 1F87; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI \n");
        assert_eq!(parsed.codepoint, 0x1F8F);
        assert_eq!(parsed.status, CaseStatus::Simple);
        assert_eq!(parsed.mapping, vec![0x1F87]);
    }

    // A row with the "special" (`T`) status.
    #[test]
    fn parse_special() {
        let parsed = row("0049; T; 0131; # LATIN CAPITAL LETTER I \n");
        assert_eq!(parsed.codepoint, 0x0049);
        assert_eq!(parsed.status, CaseStatus::Special);
        assert_eq!(parsed.mapping, vec![0x0131]);
    }
}
162 | |