1use std::path::Path;
2
3use crate::{
4 common::{Codepoint, CodepointIter, UcdFile, UcdFileByCodepoint},
5 error::Error,
6};
7
/// A single row in the `CaseFolding.txt` file.
///
/// The contents of `CaseFolding.txt` are a convenience derived from both
/// `UnicodeData.txt` and `SpecialCasing.txt`.
///
/// Each row records one folding: the source `codepoint`, the `status` that
/// classifies the folding, and the `mapping` it folds to. For example, the
/// line `03B0; F; 03C5 0308 0301; # ...` parses to codepoint `03B0`, status
/// `CaseStatus::Full` and a mapping of three codepoints.
///
/// Note that a single codepoint may be mapped multiple times. In particular,
/// a single codepoint might have distinct `CaseStatus::Simple` and
/// `CaseStatus::Full` mappings.
#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct CaseFold {
    /// The codepoint that is being mapped (the first field of the row).
    pub codepoint: Codepoint,
    /// The case status of this mapping (the second field of the row).
    pub status: CaseStatus,
    /// The actual case mapping (the third field of the row), kept in the
    /// order it is listed in the row. This is more than one codepoint if
    /// this is a "full" mapping.
    pub mapping: Vec<Codepoint>,
}
26
27impl UcdFile for CaseFold {
28 fn relative_file_path() -> &'static Path {
29 Path::new("CaseFolding.txt")
30 }
31}
32
impl UcdFileByCodepoint for CaseFold {
    /// Returns an iterator built from this row's `codepoint` field, i.e. the
    /// codepoint being mapped. The codepoints in `mapping` are not consulted.
    fn codepoints(&self) -> CodepointIter {
        self.codepoint.into_iter()
    }
}
38
39impl std::str::FromStr for CaseFold {
40 type Err = Error;
41
42 fn from_str(line: &str) -> Result<CaseFold, Error> {
43 let re_parts = regex!(
44 r"(?x)
45 ^
46 \s*(?P<codepoint>[^\s;]+)\s*;
47 \s*(?P<status>[^\s;]+)\s*;
48 \s*(?P<mapping>[^;]+)\s*;
49 ",
50 );
51
52 let caps = match re_parts.captures(line.trim()) {
53 Some(caps) => caps,
54 None => return err!("invalid CaseFolding line: '{}'", line),
55 };
56 let mut mapping = vec![];
57 for cp in caps["mapping"].split_whitespace() {
58 mapping.push(cp.parse()?);
59 }
60 Ok(CaseFold {
61 codepoint: caps["codepoint"].parse()?,
62 status: caps["status"].parse()?,
63 mapping,
64 })
65 }
66}
67
/// The status of a particular case mapping.
///
/// When parsed from text, each status is written as a single letter, which
/// is noted on the corresponding variant.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum CaseStatus {
    /// Case mappings shared by both "simple" and "full" mappings. Written as
    /// `C`.
    Common,
    /// A case mapping that changes the number of codepoints. Written as `F`.
    Full,
    /// A case mapping that doesn't change the number of codepoints, when it
    /// differs from `Full`. Written as `S`.
    Simple,
    /// Special cases (currently only for Turkic mappings) that are typically
    /// excluded by default. Special cases don't change the number of
    /// codepoints, but may change the encoding (e.g., UTF-8) length in bytes.
    /// Written as `T`.
    Special,
}
83
84impl Default for CaseStatus {
85 fn default() -> CaseStatus {
86 CaseStatus::Common
87 }
88}
89
90impl CaseStatus {
91 /// Returns true if and only if this status indicates a case mapping that
92 /// won't change the number of codepoints.
93 pub fn is_fixed(&self) -> bool {
94 *self != CaseStatus::Full
95 }
96}
97
98impl std::str::FromStr for CaseStatus {
99 type Err = Error;
100
101 fn from_str(s: &str) -> Result<CaseStatus, Error> {
102 match s {
103 "C" => Ok(CaseStatus::Common),
104 "F" => Ok(CaseStatus::Full),
105 "S" => Ok(CaseStatus::Simple),
106 "T" => Ok(CaseStatus::Special),
107 _ => err!(
108 "unrecognized case status: '{}' \
109 (must be one of C, F, S or T)",
110 s
111 ),
112 }
113 }
114}
115
#[cfg(test)]
mod tests {
    use super::{CaseFold, CaseStatus};

    /// A `C` row yields a `Common` status and a one codepoint mapping.
    #[test]
    fn parse_common() {
        let input =
            "0150; C; 0151; # LATIN CAPITAL LETTER O WITH DOUBLE ACUTE\n";
        let parsed = input.parse::<CaseFold>().unwrap();
        assert_eq!(parsed.codepoint, 0x0150);
        assert_eq!(parsed.status, CaseStatus::Common);
        assert_eq!(parsed.mapping, vec![0x0151]);
    }

    /// An `F` row yields a `Full` status and keeps every mapped codepoint,
    /// in order.
    #[test]
    fn parse_full() {
        let input = "03B0; F; 03C5 0308 0301; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS\n";
        let parsed = input.parse::<CaseFold>().unwrap();
        assert_eq!(parsed.codepoint, 0x03B0);
        assert_eq!(parsed.status, CaseStatus::Full);
        assert_eq!(parsed.mapping, vec![0x03C5, 0x0308, 0x0301]);
    }

    /// An `S` row yields a `Simple` status.
    #[test]
    fn parse_simple() {
        let input = "1F8F; S; 1F87; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI\n";
        let parsed = input.parse::<CaseFold>().unwrap();
        assert_eq!(parsed.codepoint, 0x1F8F);
        assert_eq!(parsed.status, CaseStatus::Simple);
        assert_eq!(parsed.mapping, vec![0x1F87]);
    }

    /// A `T` row yields a `Special` status.
    #[test]
    fn parse_special() {
        let input = "0049; T; 0131; # LATIN CAPITAL LETTER I\n";
        let parsed = input.parse::<CaseFold>().unwrap();
        assert_eq!(parsed.codepoint, 0x0049);
        assert_eq!(parsed.status, CaseStatus::Special);
        assert_eq!(parsed.mapping, vec![0x0131]);
    }
}
157