case_folding.rs source code [crates/ucd_parse/src/case_folding.rs]

1	use std::path::Path;
2
3	use crate::{
4	common::{Codepoint, CodepointIter, UcdFile, UcdFileByCodepoint},
5	error::Error,
6	};
7
/// A single row in the `CaseFolding.txt` file.
///
/// The contents of `CaseFolding.txt` are a convenience derived from both
/// `UnicodeData.txt` and `SpecialCasing.txt`.
///
/// Note that a single codepoint may be mapped multiple times. In particular,
/// a single codepoint might have distinct `CaseStatus::Simple` and
/// `CaseStatus::Full` mappings.
///
/// A row is obtained by parsing one line of the file through this type's
/// `FromStr` implementation (e.g. `line.parse::<CaseFold>()`).
#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct CaseFold {
    /// The codepoint that is being mapped.
    pub codepoint: Codepoint,
    /// The case status of this mapping.
    pub status: CaseStatus,
    /// The actual case mapping, which is more than one codepoint if this is
    /// a "full" mapping.
    ///
    /// Codepoints are stored in the order in which they appear in the
    /// mapping field of the parsed line.
    pub mapping: Vec<Codepoint>,
}
26
27	impl UcdFile for CaseFold {
28	fn relative_file_path() -> &'static Path {
29	Path::new("CaseFolding.txt")
30	}
31	}
32
33	impl UcdFileByCodepoint for CaseFold {
34	fn codepoints(&self) -> CodepointIter {
35	self.codepoint.into_iter()
36	}
37	}
38
39	impl std::str::FromStr for CaseFold {
40	type Err = Error;
41
42	fn from_str(line: &str) -> Result<CaseFold, Error> {
43	let re_parts = regex!(
44	r"(?x)
45	^
46	\s(?P<codepoint>[^\s;]+)\s;
47	\s(?P<status>[^\s;]+)\s;
48	\s(?P<mapping>[^;]+)\s;
49	",
50	);
51
52	let caps = match re_parts.captures(line.trim()) {
53	Some(caps) => caps,
54	None => return err!("invalid CaseFolding line: '{}'", line),
55	};
56	let mut mapping = vec![];
57	for cp in caps["mapping"].split_whitespace() {
58	mapping.push(cp.parse()?);
59	}
60	Ok(CaseFold {
61	codepoint: caps["codepoint"].parse()?,
62	status: caps["status"].parse()?,
63	mapping,
64	})
65	}
66	}
67
/// The status of a particular case mapping.
///
/// Each variant corresponds to one of the single-letter status codes
/// accepted by this type's `FromStr` implementation: `C`, `F`, `S` or `T`.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum CaseStatus {
    /// Case mappings shared by both "simple" and "full" mappings.
    ///
    /// Parsed from the status code `C`. This is also the `Default` value.
    Common,
    /// A case mapping that changes the number of codepoints.
    ///
    /// Parsed from the status code `F`.
    Full,
    /// A case mapping that doesn't change the number of codepoints, when it
    /// differs from `Full`.
    ///
    /// Parsed from the status code `S`.
    Simple,
    /// Special cases (currently only for Turkic mappings) that are typically
    /// excluded by default. Special cases don't change the number of
    /// codepoints, but may change the encoding (e.g., UTF-8) length in bytes.
    ///
    /// Parsed from the status code `T`.
    Special,
}
83
84	impl Default for CaseStatus {
85	fn default() -> CaseStatus {
86	CaseStatus::Common
87	}
88	}
89
90	impl CaseStatus {
91	/// Returns true if and only if this status indicates a case mapping that
92	/// won't change the number of codepoints.
93	pub fn is_fixed(&self) -> bool {
94	*self != CaseStatus::Full
95	}
96	}
97
98	impl std::str::FromStr for CaseStatus {
99	type Err = Error;
100
101	fn from_str(s: &str) -> Result<CaseStatus, Error> {
102	match s {
103	"C" => Ok(CaseStatus::Common),
104	"F" => Ok(CaseStatus::Full),
105	"S" => Ok(CaseStatus::Simple),
106	"T" => Ok(CaseStatus::Special),
107	_ => err!(
108	"unrecognized case status: '{}' \
109	(must be one of C, F, S or T)",
110	s
111	),
112	}
113	}
114	}
115
#[cfg(test)]
mod tests {
    use super::{CaseFold, CaseStatus};

    /// A `C` (common) row with a single-codepoint mapping.
    #[test]
    fn parse_common() {
        let line =
            "0150; C; 0151; # LATIN CAPITAL LETTER O WITH DOUBLE ACUTE\n";
        let row: CaseFold = line.parse().unwrap();
        assert_eq!(row.codepoint, 0x0150);
        assert_eq!(row.status, CaseStatus::Common);
        assert_eq!(row.mapping, vec![0x0151]);
    }

    /// An `F` (full) row whose mapping expands to several codepoints.
    #[test]
    fn parse_full() {
        let line = "03B0; F; 03C5 0308 0301; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS\n";
        let row: CaseFold = line.parse().unwrap();
        assert_eq!(row.codepoint, 0x03B0);
        assert_eq!(row.status, CaseStatus::Full);
        assert_eq!(row.mapping, vec![0x03C5, 0x0308, 0x0301]);
    }

    /// An `S` (simple) row with a single-codepoint mapping.
    #[test]
    fn parse_simple() {
        let line = "1F8F; S; 1F87; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI\n";
        let row: CaseFold = line.parse().unwrap();
        assert_eq!(row.codepoint, 0x1F8F);
        assert_eq!(row.status, CaseStatus::Simple);
        assert_eq!(row.mapping, vec![0x1F87]);
    }

    /// A `T` (special) row with a single-codepoint mapping.
    #[test]
    fn parse_special() {
        let line = "0049; T; 0131; # LATIN CAPITAL LETTER I\n";
        let row: CaseFold = line.parse().unwrap();
        assert_eq!(row.codepoint, 0x0049);
        assert_eq!(row.status, CaseStatus::Special);
        assert_eq!(row.mapping, vec![0x0131]);
    }
}
157