case_folding.rs source code [crates/ucd-parse/src/case_folding.rs]

1	use std::path::Path;
2	use std::str::FromStr;
3
4	use once_cell::sync::Lazy;
5	use regex::Regex;
6
7	use crate::common::{Codepoint, CodepointIter, UcdFile, UcdFileByCodepoint};
8	use crate::error::Error;
9
/// A single row in the `CaseFolding.txt` file.
///
/// The contents of `CaseFolding.txt` are a convenience derived from both
/// `UnicodeData.txt` and `SpecialCasing.txt`.
///
/// Note that a single codepoint may be mapped multiple times. In particular,
/// a single codepoint might have distinct `CaseStatus::Simple` and
/// `CaseStatus::Full` mappings.
///
/// A row is normally obtained by parsing one line of the file through this
/// type's `FromStr` implementation (`line.parse::<CaseFold>()`).
#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct CaseFold {
    /// The codepoint that is being mapped.
    pub codepoint: Codepoint,
    /// The case status of this mapping.
    pub status: CaseStatus,
    /// The actual case mapping, which is more than one codepoint if this is
    /// a "full" mapping.
    pub mapping: Vec<Codepoint>,
}
28
29	impl UcdFile for CaseFold {
30	fn relative_file_path() -> &'static Path {
31	Path::new("CaseFolding.txt")
32	}
33	}
34
impl UcdFileByCodepoint for CaseFold {
    /// Returns an iterator over the codepoints this row applies to.
    ///
    /// The iterator is built from this row's `codepoint` field, i.e. the
    /// codepoint being mapped (not the codepoints in `mapping`).
    fn codepoints(&self) -> CodepointIter {
        self.codepoint.into_iter()
    }
}
40
41	impl FromStr for CaseFold {
42	type Err = Error;
43
44	fn from_str(line: &str) -> Result<CaseFold, Error> {
45	static PARTS: Lazy<Regex> = Lazy::new(\|\| {
46	Regex::new(
47	r"(?x)
48	^
49	\s(?P<codepoint>[^\s;]+)\s;
50	\s(?P<status>[^\s;]+)\s;
51	\s(?P<mapping>[^;]+)\s;
52	",
53	)
54	.unwrap()
55	});
56
57	let caps = match PARTS.captures(line.trim()) {
58	Some(caps) => caps,
59	None => return err!("invalid CaseFolding line: '{}'", line),
60	};
61	let mut mapping = vec![];
62	for cp in caps["mapping"].split_whitespace() {
63	mapping.push(cp.parse()?);
64	}
65	Ok(CaseFold {
66	codepoint: caps["codepoint"].parse()?,
67	status: caps["status"].parse()?,
68	mapping,
69	})
70	}
71	}
72
/// The status of a particular case mapping.
///
/// In `CaseFolding.txt` each status is written as a single letter: `C`
/// (common), `F` (full), `S` (simple) or `T` (special).
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum CaseStatus {
    /// Case mappings shared by both "simple" and "full" mappings.
    Common,
    /// A case mapping that changes the number of codepoints.
    Full,
    /// A case mapping that doesn't change the number of codepoints, when it
    /// differs from `Full`.
    Simple,
    /// Special cases (currently only for Turkic mappings) that are typically
    /// excluded by default. Special cases don't change the number of
    /// codepoints, but may change the encoding (e.g., UTF-8) length in bytes.
    Special,
}
88
89	impl Default for CaseStatus {
90	fn default() -> CaseStatus {
91	CaseStatus::Common
92	}
93	}
94
95	impl CaseStatus {
96	/// Returns true if and only if this status indicates a case mapping that
97	/// won't change the number of codepoints.
98	pub fn is_fixed(&self) -> bool {
99	*self != CaseStatus::Full
100	}
101	}
102
103	impl FromStr for CaseStatus {
104	type Err = Error;
105
106	fn from_str(s: &str) -> Result<CaseStatus, Error> {
107	match s {
108	"C" => Ok(CaseStatus::Common),
109	"F" => Ok(CaseStatus::Full),
110	"S" => Ok(CaseStatus::Simple),
111	"T" => Ok(CaseStatus::Special),
112	_ => err!(
113	"unrecognized case status: '{}' \
114	(must be one of C, F, S or T)",
115	s
116	),
117	}
118	}
119	}
120
#[cfg(test)]
mod tests {
    use super::{CaseFold, CaseStatus};

    /// A `C` (common) row maps one codepoint to one codepoint.
    #[test]
    fn parse_common() {
        let line =
            "0150; C; 0151; # LATIN CAPITAL LETTER O WITH DOUBLE ACUTE\n";
        let row: CaseFold = line.parse().unwrap();
        assert_eq!(row.codepoint, 0x0150);
        assert_eq!(row.status, CaseStatus::Common);
        assert_eq!(row.mapping, vec![0x0151]);
    }

    /// An `F` (full) row may map one codepoint to several.
    #[test]
    fn parse_full() {
        let line = "03B0; F; 03C5 0308 0301; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS\n";
        let row: CaseFold = line.parse().unwrap();
        assert_eq!(row.codepoint, 0x03B0);
        assert_eq!(row.status, CaseStatus::Full);
        assert_eq!(row.mapping, vec![0x03C5, 0x0308, 0x0301]);
    }

    /// An `S` (simple) row maps one codepoint to one codepoint.
    #[test]
    fn parse_simple() {
        let line = "1F8F; S; 1F87; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI\n";
        let row: CaseFold = line.parse().unwrap();
        assert_eq!(row.codepoint, 0x1F8F);
        assert_eq!(row.status, CaseStatus::Simple);
        assert_eq!(row.mapping, vec![0x1F87]);
    }

    /// A `T` (special, Turkic) row maps one codepoint to one codepoint.
    #[test]
    fn parse_special() {
        let line = "0049; T; 0131; # LATIN CAPITAL LETTER I\n";
        let row: CaseFold = line.parse().unwrap();
        assert_eq!(row.codepoint, 0x0049);
        assert_eq!(row.status, CaseStatus::Special);
        assert_eq!(row.mapping, vec![0x0131]);
    }
}
162