| 1 | use std::path::Path; |
| 2 | |
| 3 | use crate::{ |
| 4 | common::{Codepoint, CodepointIter, UcdFile, UcdFileByCodepoint}, |
| 5 | error::Error, |
| 6 | }; |
| 7 | |
/// A single row in the `CaseFolding.txt` file.
///
/// The contents of `CaseFolding.txt` are a convenience derived from both
/// `UnicodeData.txt` and `SpecialCasing.txt`.
///
/// Note that a single codepoint may be mapped multiple times. In particular,
/// a single codepoint might have distinct `CaseStatus::Simple` and
/// `CaseStatus::Full` mappings.
///
/// A row is built from one line of text through this type's `FromStr`
/// implementation, which expects three `;`-terminated fields: the codepoint,
/// the status and the mapping.
#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct CaseFold {
    /// The codepoint that is being mapped.
    pub codepoint: Codepoint,
    /// The case status of this mapping.
    pub status: CaseStatus,
    /// The actual case mapping, which is more than one codepoint if this is
    /// a "full" mapping.
    ///
    /// When parsed from text, the codepoints are stored in the order in which
    /// they appear in the row's whitespace-separated mapping field.
    pub mapping: Vec<Codepoint>,
}
| 26 | |
| 27 | impl UcdFile for CaseFold { |
| 28 | fn relative_file_path() -> &'static Path { |
| 29 | Path::new("CaseFolding.txt" ) |
| 30 | } |
| 31 | } |
| 32 | |
impl UcdFileByCodepoint for CaseFold {
    /// Returns the iterator obtained by calling `into_iter()` on this row's
    /// `codepoint` field, i.e. the codepoint being mapped rather than the
    /// codepoints stored in `mapping`.
    fn codepoints(&self) -> CodepointIter {
        self.codepoint.into_iter()
    }
}
| 38 | |
| 39 | impl std::str::FromStr for CaseFold { |
| 40 | type Err = Error; |
| 41 | |
| 42 | fn from_str(line: &str) -> Result<CaseFold, Error> { |
| 43 | let re_parts = regex!( |
| 44 | r"(?x) |
| 45 | ^ |
| 46 | \s*(?P<codepoint>[^\s;]+)\s*; |
| 47 | \s*(?P<status>[^\s;]+)\s*; |
| 48 | \s*(?P<mapping>[^;]+)\s*; |
| 49 | " , |
| 50 | ); |
| 51 | |
| 52 | let caps = match re_parts.captures(line.trim()) { |
| 53 | Some(caps) => caps, |
| 54 | None => return err!("invalid CaseFolding line: ' {}'" , line), |
| 55 | }; |
| 56 | let mut mapping = vec![]; |
| 57 | for cp in caps["mapping" ].split_whitespace() { |
| 58 | mapping.push(cp.parse()?); |
| 59 | } |
| 60 | Ok(CaseFold { |
| 61 | codepoint: caps["codepoint" ].parse()?, |
| 62 | status: caps["status" ].parse()?, |
| 63 | mapping, |
| 64 | }) |
| 65 | } |
| 66 | } |
| 67 | |
/// The status of a particular case mapping.
///
/// In textual form each status is a single letter; the `FromStr`
/// implementation for this type accepts `C`, `F`, `S` and `T`.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum CaseStatus {
    /// Case mappings shared by both "simple" and "full" mappings. Written as
    /// `C`.
    Common,
    /// A case mapping that changes the number of codepoints. Written as `F`.
    Full,
    /// A case mapping that doesn't change the number of codepoints, when it
    /// differs from `Full`. Written as `S`.
    Simple,
    /// Special cases (currently only for Turkic mappings) that are typically
    /// excluded by default. Special cases don't change the number of
    /// codepoints, but may change the encoding (e.g., UTF-8) length in bytes.
    /// Written as `T`.
    Special,
}
| 83 | |
| 84 | impl Default for CaseStatus { |
| 85 | fn default() -> CaseStatus { |
| 86 | CaseStatus::Common |
| 87 | } |
| 88 | } |
| 89 | |
| 90 | impl CaseStatus { |
| 91 | /// Returns true if and only if this status indicates a case mapping that |
| 92 | /// won't change the number of codepoints. |
| 93 | pub fn is_fixed(&self) -> bool { |
| 94 | *self != CaseStatus::Full |
| 95 | } |
| 96 | } |
| 97 | |
| 98 | impl std::str::FromStr for CaseStatus { |
| 99 | type Err = Error; |
| 100 | |
| 101 | fn from_str(s: &str) -> Result<CaseStatus, Error> { |
| 102 | match s { |
| 103 | "C" => Ok(CaseStatus::Common), |
| 104 | "F" => Ok(CaseStatus::Full), |
| 105 | "S" => Ok(CaseStatus::Simple), |
| 106 | "T" => Ok(CaseStatus::Special), |
| 107 | _ => err!( |
| 108 | "unrecognized case status: ' {}' \ |
| 109 | (must be one of C, F, S or T)" , |
| 110 | s |
| 111 | ), |
| 112 | } |
| 113 | } |
| 114 | } |
| 115 | |
#[cfg(test)]
mod tests {
    use super::{CaseFold, CaseStatus};

    /// Parses `line` as a case folding row, panicking if it is rejected.
    fn parse_row(line: &str) -> CaseFold {
        line.parse().unwrap()
    }

    /// A `C` row yields a `Common` status and a one-codepoint mapping.
    #[test]
    fn parse_common() {
        let parsed = parse_row(
            "0150; C; 0151; # LATIN CAPITAL LETTER O WITH DOUBLE ACUTE \n",
        );
        assert_eq!(parsed.codepoint, 0x0150);
        assert_eq!(parsed.status, CaseStatus::Common);
        assert_eq!(parsed.mapping, vec![0x0151]);
    }

    /// An `F` row yields a `Full` status and keeps every mapped codepoint in
    /// order.
    #[test]
    fn parse_full() {
        let parsed = parse_row("03B0; F; 03C5 0308 0301; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS \n");
        assert_eq!(parsed.codepoint, 0x03B0);
        assert_eq!(parsed.status, CaseStatus::Full);
        assert_eq!(parsed.mapping, vec![0x03C5, 0x0308, 0x0301]);
    }

    /// An `S` row yields a `Simple` status and a one-codepoint mapping.
    #[test]
    fn parse_simple() {
        let parsed = parse_row("1F8F; S; 1F87; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI \n");
        assert_eq!(parsed.codepoint, 0x1F8F);
        assert_eq!(parsed.status, CaseStatus::Simple);
        assert_eq!(parsed.mapping, vec![0x1F87]);
    }

    /// A `T` row yields a `Special` status and a one-codepoint mapping.
    #[test]
    fn parse_special() {
        let parsed = parse_row("0049; T; 0131; # LATIN CAPITAL LETTER I \n");
        assert_eq!(parsed.codepoint, 0x0049);
        assert_eq!(parsed.status, CaseStatus::Special);
        assert_eq!(parsed.mapping, vec![0x0131]);
    }
}
| 157 | |