1use std::path::Path;
2use std::str::FromStr;
3
4use once_cell::sync::Lazy;
5use regex::Regex;
6
7use crate::common::{
8 parse_codepoint_sequence, Codepoint, CodepointIter, UcdFile,
9 UcdFileByCodepoint,
10};
11use crate::error::Error;
12
13/// A single row in the `SpecialCasing.txt` file.
14///
15/// Note that a single codepoint may be mapped multiple times. In particular,
16/// a single codepoint might have mappings based on distinct language sensitive
17/// conditions (e.g., `U+0307`).
18#[derive(Clone, Debug, Default, Eq, PartialEq)]
19pub struct SpecialCaseMapping {
20 /// The codepoint that is being mapped.
21 pub codepoint: Codepoint,
22 /// The lowercase mapping, which may be empty.
23 pub lowercase: Vec<Codepoint>,
24 /// The titlecase mapping, which may be empty.
25 pub titlecase: Vec<Codepoint>,
26 /// The uppercase mapping, which may be empty.
27 pub uppercase: Vec<Codepoint>,
28 /// A list of language specific conditions, see `SpecialCasing.txt` for
29 /// more details.
30 pub conditions: Vec<String>,
31}
32
33impl UcdFile for SpecialCaseMapping {
34 fn relative_file_path() -> &'static Path {
35 Path::new("SpecialCasing.txt")
36 }
37}
38
39impl UcdFileByCodepoint for SpecialCaseMapping {
40 fn codepoints(&self) -> CodepointIter {
41 self.codepoint.into_iter()
42 }
43}
44
45impl FromStr for SpecialCaseMapping {
46 type Err = Error;
47
48 fn from_str(line: &str) -> Result<SpecialCaseMapping, Error> {
49 static PARTS: Lazy<Regex> = Lazy::new(|| {
50 Regex::new(
51 r"(?x)
52 ^
53 \s*(?P<codepoint>[^\s;]+)\s*;
54 \s*(?P<lower>[^;]+)\s*;
55 \s*(?P<title>[^;]+)\s*;
56 \s*(?P<upper>[^;]+)\s*;
57 \s*(?P<conditions>[^;\x23]+)?
58 ",
59 )
60 .unwrap()
61 });
62
63 let caps = match PARTS.captures(line.trim()) {
64 Some(caps) => caps,
65 None => return err!("invalid SpecialCasing line: '{}'", line),
66 };
67 let conditions = caps
68 .name("conditions")
69 .map(|x| {
70 x.as_str()
71 .trim()
72 .split_whitespace()
73 .map(|c| c.to_string())
74 .collect()
75 })
76 .unwrap_or(vec![]);
77 Ok(SpecialCaseMapping {
78 codepoint: caps["codepoint"].parse()?,
79 lowercase: parse_codepoint_sequence(&caps["lower"])?,
80 titlecase: parse_codepoint_sequence(&caps["title"])?,
81 uppercase: parse_codepoint_sequence(&caps["upper"])?,
82 conditions,
83 })
84 }
85}
86
#[cfg(test)]
mod tests {
    use super::SpecialCaseMapping;

    /// Parses `line` as a `SpecialCasing.txt` row, panicking if the parser
    /// rejects it.
    fn parse(line: &str) -> SpecialCaseMapping {
        line.parse().unwrap()
    }

    /// A row without a condition list yields an empty `conditions` vector.
    #[test]
    fn parse_no_conds() {
        let mapping = parse(
            "1F52; 1F52; 03A5 0313 0300; 03A5 0313 0300; # GREEK SMALL LETTER UPSILON WITH PSILI AND VARIA\n",
        );
        assert_eq!(mapping.codepoint, 0x1F52);
        assert_eq!(mapping.lowercase, vec![0x1F52]);
        assert_eq!(mapping.titlecase, vec![0x03A5, 0x0313, 0x0300]);
        assert_eq!(mapping.uppercase, vec![0x03A5, 0x0313, 0x0300]);
        assert!(mapping.conditions.is_empty());
    }

    /// A row with an empty lowercase field and a two-token condition list.
    #[test]
    fn parse_conds() {
        let mapping =
            parse("0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE\n");
        assert_eq!(mapping.codepoint, 0x0307);
        assert!(mapping.lowercase.is_empty());
        assert_eq!(mapping.titlecase, vec![0x0307]);
        assert_eq!(mapping.uppercase, vec![0x0307]);
        assert_eq!(mapping.conditions, vec!["tr", "After_I"]);
    }
}
113