1 | use std::path::Path; |
2 | use std::str::FromStr; |
3 | |
4 | use once_cell::sync::Lazy; |
5 | use regex::Regex; |
6 | |
7 | use crate::common::{ |
8 | parse_codepoint_sequence, Codepoint, CodepointIter, UcdFile, |
9 | UcdFileByCodepoint, |
10 | }; |
11 | use crate::error::Error; |
12 | |
13 | /// A single row in the `SpecialCasing.txt` file. |
14 | /// |
15 | /// Note that a single codepoint may be mapped multiple times. In particular, |
16 | /// a single codepoint might have mappings based on distinct language sensitive |
17 | /// conditions (e.g., `U+0307`). |
18 | #[derive (Clone, Debug, Default, Eq, PartialEq)] |
19 | pub struct SpecialCaseMapping { |
20 | /// The codepoint that is being mapped. |
21 | pub codepoint: Codepoint, |
22 | /// The lowercase mapping, which may be empty. |
23 | pub lowercase: Vec<Codepoint>, |
24 | /// The titlecase mapping, which may be empty. |
25 | pub titlecase: Vec<Codepoint>, |
26 | /// The uppercase mapping, which may be empty. |
27 | pub uppercase: Vec<Codepoint>, |
28 | /// A list of language specific conditions, see `SpecialCasing.txt` for |
29 | /// more details. |
30 | pub conditions: Vec<String>, |
31 | } |
32 | |
33 | impl UcdFile for SpecialCaseMapping { |
34 | fn relative_file_path() -> &'static Path { |
35 | Path::new("SpecialCasing.txt" ) |
36 | } |
37 | } |
38 | |
39 | impl UcdFileByCodepoint for SpecialCaseMapping { |
40 | fn codepoints(&self) -> CodepointIter { |
41 | self.codepoint.into_iter() |
42 | } |
43 | } |
44 | |
45 | impl FromStr for SpecialCaseMapping { |
46 | type Err = Error; |
47 | |
48 | fn from_str(line: &str) -> Result<SpecialCaseMapping, Error> { |
49 | static PARTS: Lazy<Regex> = Lazy::new(|| { |
50 | Regex::new( |
51 | r"(?x) |
52 | ^ |
53 | \s*(?P<codepoint>[^\s;]+)\s*; |
54 | \s*(?P<lower>[^;]+)\s*; |
55 | \s*(?P<title>[^;]+)\s*; |
56 | \s*(?P<upper>[^;]+)\s*; |
57 | \s*(?P<conditions>[^;\x23]+)? |
58 | " , |
59 | ) |
60 | .unwrap() |
61 | }); |
62 | |
63 | let caps = match PARTS.captures(line.trim()) { |
64 | Some(caps) => caps, |
65 | None => return err!("invalid SpecialCasing line: ' {}'" , line), |
66 | }; |
67 | let conditions = caps |
68 | .name("conditions" ) |
69 | .map(|x| { |
70 | x.as_str() |
71 | .trim() |
72 | .split_whitespace() |
73 | .map(|c| c.to_string()) |
74 | .collect() |
75 | }) |
76 | .unwrap_or(vec![]); |
77 | Ok(SpecialCaseMapping { |
78 | codepoint: caps["codepoint" ].parse()?, |
79 | lowercase: parse_codepoint_sequence(&caps["lower" ])?, |
80 | titlecase: parse_codepoint_sequence(&caps["title" ])?, |
81 | uppercase: parse_codepoint_sequence(&caps["upper" ])?, |
82 | conditions, |
83 | }) |
84 | } |
85 | } |
86 | |
#[cfg(test)]
mod tests {
    use super::SpecialCaseMapping;

    /// A row without a condition list parses all three mappings and leaves
    /// `conditions` empty.
    #[test]
    fn parse_no_conds() {
        let input = "1F52; 1F52; 03A5 0313 0300; 03A5 0313 0300; # GREEK SMALL LETTER UPSILON WITH PSILI AND VARIA \n";
        let mapping = input.parse::<SpecialCaseMapping>().unwrap();

        assert_eq!(mapping.codepoint, 0x1F52);
        assert_eq!(mapping.lowercase, vec![0x1F52]);
        assert_eq!(mapping.titlecase, vec![0x03A5, 0x0313, 0x0300]);
        assert_eq!(mapping.uppercase, vec![0x03A5, 0x0313, 0x0300]);
        assert!(mapping.conditions.is_empty());
    }

    /// A row with a condition list splits it into individual conditions, and
    /// a blank lowercase field parses as an empty mapping.
    #[test]
    fn parse_conds() {
        let input = "0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE \n";
        let mapping = input.parse::<SpecialCaseMapping>().unwrap();

        assert_eq!(mapping.codepoint, 0x0307);
        assert!(mapping.lowercase.is_empty());
        assert_eq!(mapping.titlecase, vec![0x0307]);
        assert_eq!(mapping.uppercase, vec![0x0307]);
        assert_eq!(mapping.conditions, vec!["tr", "After_I"]);
    }
}
113 | |