1use std::path::Path;
2use std::str::FromStr;
3
4use once_cell::sync::Lazy;
5use regex::Regex;
6
7use crate::common::{Codepoint, CodepointIter, UcdFile, UcdFileByCodepoint};
8use crate::error::Error;
9
10/// A single row in the `CaseFolding.txt` file.
11///
12/// The contents of `CaseFolding.txt` are a convenience derived from both
13/// `UnicodeData.txt` and `SpecialCasing.txt`.
14///
15/// Note that a single codepoint may be mapped multiple times. In particular,
16/// a single codepoint might have distinct `CaseStatus::Simple` and
17/// `CaseStatus::Full` mappings.
18#[derive(Clone, Debug, Default, Eq, PartialEq)]
19pub struct CaseFold {
20 /// The codepoint that is being mapped.
21 pub codepoint: Codepoint,
22 /// The case status of this mapping.
23 pub status: CaseStatus,
24 /// The actual case mapping, which is more than one codepoint if this is
25 /// a "full" mapping.
26 pub mapping: Vec<Codepoint>,
27}
28
29impl UcdFile for CaseFold {
30 fn relative_file_path() -> &'static Path {
31 Path::new("CaseFolding.txt")
32 }
33}
34
35impl UcdFileByCodepoint for CaseFold {
36 fn codepoints(&self) -> CodepointIter {
37 self.codepoint.into_iter()
38 }
39}
40
41impl FromStr for CaseFold {
42 type Err = Error;
43
44 fn from_str(line: &str) -> Result<CaseFold, Error> {
45 static PARTS: Lazy<Regex> = Lazy::new(|| {
46 Regex::new(
47 r"(?x)
48 ^
49 \s*(?P<codepoint>[^\s;]+)\s*;
50 \s*(?P<status>[^\s;]+)\s*;
51 \s*(?P<mapping>[^;]+)\s*;
52 ",
53 )
54 .unwrap()
55 });
56
57 let caps = match PARTS.captures(line.trim()) {
58 Some(caps) => caps,
59 None => return err!("invalid CaseFolding line: '{}'", line),
60 };
61 let mut mapping = vec![];
62 for cp in caps["mapping"].split_whitespace() {
63 mapping.push(cp.parse()?);
64 }
65 Ok(CaseFold {
66 codepoint: caps["codepoint"].parse()?,
67 status: caps["status"].parse()?,
68 mapping,
69 })
70 }
71}
72
/// The kind of case mapping that a `CaseFold` row describes.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CaseStatus {
    /// A mapping that is shared by both "simple" and "full" mappings.
    Common,
    /// A mapping that changes the number of codepoints.
    Full,
    /// A mapping that keeps the number of codepoints unchanged, used when it
    /// differs from the `Full` mapping.
    Simple,
    /// A special-case mapping (currently only used for Turkic mappings) that
    /// is typically excluded by default. Special mappings don't change the
    /// number of codepoints, but may change the encoded (e.g., UTF-8) length
    /// in bytes.
    Special,
}
88
89impl Default for CaseStatus {
90 fn default() -> CaseStatus {
91 CaseStatus::Common
92 }
93}
94
95impl CaseStatus {
96 /// Returns true if and only if this status indicates a case mapping that
97 /// won't change the number of codepoints.
98 pub fn is_fixed(&self) -> bool {
99 *self != CaseStatus::Full
100 }
101}
102
103impl FromStr for CaseStatus {
104 type Err = Error;
105
106 fn from_str(s: &str) -> Result<CaseStatus, Error> {
107 match s {
108 "C" => Ok(CaseStatus::Common),
109 "F" => Ok(CaseStatus::Full),
110 "S" => Ok(CaseStatus::Simple),
111 "T" => Ok(CaseStatus::Special),
112 _ => err!(
113 "unrecognized case status: '{}' \
114 (must be one of C, F, S or T)",
115 s
116 ),
117 }
118 }
119}
120
#[cfg(test)]
mod tests {
    use super::{CaseFold, CaseStatus};

    /// Parses `line` as a `CaseFold` row, panicking if parsing fails.
    fn parse_row(line: &str) -> CaseFold {
        line.parse().unwrap()
    }

    /// A `C` row yields a common mapping to a single codepoint.
    #[test]
    fn parse_common() {
        let CaseFold { codepoint, status, mapping } = parse_row(
            "0150; C; 0151; # LATIN CAPITAL LETTER O WITH DOUBLE ACUTE\n",
        );
        assert_eq!(codepoint, 0x0150);
        assert_eq!(status, CaseStatus::Common);
        assert_eq!(mapping, vec![0x0151]);
    }

    /// An `F` row yields a full mapping to several codepoints, in order.
    #[test]
    fn parse_full() {
        let CaseFold { codepoint, status, mapping } = parse_row(
            "03B0; F; 03C5 0308 0301; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS\n",
        );
        assert_eq!(codepoint, 0x03B0);
        assert_eq!(status, CaseStatus::Full);
        assert_eq!(mapping, vec![0x03C5, 0x0308, 0x0301]);
    }

    /// An `S` row yields a simple mapping to a single codepoint.
    #[test]
    fn parse_simple() {
        let CaseFold { codepoint, status, mapping } = parse_row(
            "1F8F; S; 1F87; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI\n",
        );
        assert_eq!(codepoint, 0x1F8F);
        assert_eq!(status, CaseStatus::Simple);
        assert_eq!(mapping, vec![0x1F87]);
    }

    /// A `T` row yields a special mapping to a single codepoint.
    #[test]
    fn parse_special() {
        let CaseFold { codepoint, status, mapping } =
            parse_row("0049; T; 0131; # LATIN CAPITAL LETTER I\n");
        assert_eq!(codepoint, 0x0049);
        assert_eq!(status, CaseStatus::Special);
        assert_eq!(mapping, vec![0x0131]);
    }
}
162