// Copyright 2015 Google Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

//! CommonMark punctuation set based on spec and Unicode properties.

// Autogenerated by mk_puncttable.py

/// Punctuation bitmasks for the ASCII range, one `u16` per row of 16 code points.
///
/// Row `r` covers the code points `16 * r ..= 16 * r + 15`; bit `i` of a row
/// is set when code point `16 * r + i` counts as punctuation.
const PUNCT_MASKS_ASCII: [u16; 8] = [
    0x0000, // U+0000...U+000F
    0x0000, // U+0010...U+001F
    0xfffe, // U+0020...U+002F
    0xfc00, // U+0030...U+003F
    0x0001, // U+0040...U+004F
    0xf800, // U+0050...U+005F
    0x0001, // U+0060...U+006F
    0x7800, // U+0070...U+007F
];

/// Row keys for every non-ASCII row of 16 code points that contains punctuation.
///
/// Each entry is `code_point / 16` for the row named in its trailing comment.
/// Entry `i` pairs with `PUNCT_MASKS[i]`, which says which code points of that
/// row are punctuation. The entries must stay in strictly ascending order,
/// because `is_punctuation` looks rows up with a binary search.
const PUNCT_TAB: [u16; 132] = [
    10,   // U+00A0...U+00AF
    11,   // U+00B0...U+00BF
    55,   // U+0370...U+037F
    56,   // U+0380...U+038F
    85,   // U+0550...U+055F
    88,   // U+0580...U+058F
    91,   // U+05B0...U+05BF
    92,   // U+05C0...U+05CF
    95,   // U+05F0...U+05FF
    96,   // U+0600...U+060F
    97,   // U+0610...U+061F
    102,  // U+0660...U+066F
    109,  // U+06D0...U+06DF
    112,  // U+0700...U+070F
    127,  // U+07F0...U+07FF
    131,  // U+0830...U+083F
    133,  // U+0850...U+085F
    150,  // U+0960...U+096F
    151,  // U+0970...U+097F
    175,  // U+0AF0...U+0AFF
    223,  // U+0DF0...U+0DFF
    228,  // U+0E40...U+0E4F
    229,  // U+0E50...U+0E5F
    240,  // U+0F00...U+0F0F
    241,  // U+0F10...U+0F1F
    243,  // U+0F30...U+0F3F
    248,  // U+0F80...U+0F8F
    253,  // U+0FD0...U+0FDF
    260,  // U+1040...U+104F
    271,  // U+10F0...U+10FF
    310,  // U+1360...U+136F
    320,  // U+1400...U+140F
    358,  // U+1660...U+166F
    361,  // U+1690...U+169F
    366,  // U+16E0...U+16EF
    371,  // U+1730...U+173F
    381,  // U+17D0...U+17DF
    384,  // U+1800...U+180F
    404,  // U+1940...U+194F
    417,  // U+1A10...U+1A1F
    426,  // U+1AA0...U+1AAF
    437,  // U+1B50...U+1B5F
    438,  // U+1B60...U+1B6F
    447,  // U+1BF0...U+1BFF
    451,  // U+1C30...U+1C3F
    455,  // U+1C70...U+1C7F
    460,  // U+1CC0...U+1CCF
    461,  // U+1CD0...U+1CDF
    513,  // U+2010...U+201F
    514,  // U+2020...U+202F
    515,  // U+2030...U+203F
    516,  // U+2040...U+204F
    517,  // U+2050...U+205F
    519,  // U+2070...U+207F
    520,  // U+2080...U+208F
    560,  // U+2300...U+230F
    562,  // U+2320...U+232F
    630,  // U+2760...U+276F
    631,  // U+2770...U+277F
    636,  // U+27C0...U+27CF
    638,  // U+27E0...U+27EF
    664,  // U+2980...U+298F
    665,  // U+2990...U+299F
    669,  // U+29D0...U+29DF
    671,  // U+29F0...U+29FF
    719,  // U+2CF0...U+2CFF
    727,  // U+2D70...U+2D7F
    736,  // U+2E00...U+2E0F
    737,  // U+2E10...U+2E1F
    738,  // U+2E20...U+2E2F
    739,  // U+2E30...U+2E3F
    740,  // U+2E40...U+2E4F
    768,  // U+3000...U+300F
    769,  // U+3010...U+301F
    771,  // U+3030...U+303F
    778,  // U+30A0...U+30AF
    783,  // U+30F0...U+30FF
    2639, // U+A4F0...U+A4FF
    2656, // U+A600...U+A60F
    2663, // U+A670...U+A67F
    2671, // U+A6F0...U+A6FF
    2695, // U+A870...U+A87F
    2700, // U+A8C0...U+A8CF
    2703, // U+A8F0...U+A8FF
    2706, // U+A920...U+A92F
    2709, // U+A950...U+A95F
    2716, // U+A9C0...U+A9CF
    2717, // U+A9D0...U+A9DF
    2725, // U+AA50...U+AA5F
    2733, // U+AAD0...U+AADF
    2735, // U+AAF0...U+AAFF
    2750, // U+ABE0...U+ABEF
    4051, // U+FD30...U+FD3F
    4065, // U+FE10...U+FE1F
    4067, // U+FE30...U+FE3F
    4068, // U+FE40...U+FE4F
    4069, // U+FE50...U+FE5F
    4070, // U+FE60...U+FE6F
    4080, // U+FF00...U+FF0F
    4081, // U+FF10...U+FF1F
    4082, // U+FF20...U+FF2F
    4083, // U+FF30...U+FF3F
    4085, // U+FF50...U+FF5F
    4086, // U+FF60...U+FF6F
    4112, // U+10100...U+1010F
    4153, // U+10390...U+1039F
    4157, // U+103D0...U+103DF
    4182, // U+10560...U+1056F
    4229, // U+10850...U+1085F
    4241, // U+10910...U+1091F
    4243, // U+10930...U+1093F
    4261, // U+10A50...U+10A5F
    4263, // U+10A70...U+10A7F
    4271, // U+10AF0...U+10AFF
    4275, // U+10B30...U+10B3F
    4281, // U+10B90...U+10B9F
    4356, // U+11040...U+1104F
    4363, // U+110B0...U+110BF
    4364, // U+110C0...U+110CF
    4372, // U+11140...U+1114F
    4375, // U+11170...U+1117F
    4380, // U+111C0...U+111CF
    4387, // U+11230...U+1123F
    4428, // U+114C0...U+114CF
    4444, // U+115C0...U+115CF
    4452, // U+11640...U+1164F
    4679, // U+12470...U+1247F
    5798, // U+16A60...U+16A6F
    5807, // U+16AF0...U+16AFF
    5811, // U+16B30...U+16B3F
    5812, // U+16B40...U+16B4F
    7113, // U+1BC90...U+1BC9F
];

/// Punctuation bitmasks for the non-ASCII rows listed in `PUNCT_TAB`.
///
/// Entry `i` belongs to the row whose key is `PUNCT_TAB[i]` (the same row is
/// named in the trailing comment). Bit `n` of a mask is set when the code
/// point with low nibble `n` in that row is punctuation.
const PUNCT_MASKS: [u16; 132] = [
    0x0882, // U+00A0...U+00AF
    0x88c0, // U+00B0...U+00BF
    0x4000, // U+0370...U+037F
    0x0080, // U+0380...U+038F
    0xfc00, // U+0550...U+055F
    0x0600, // U+0580...U+058F
    0x4000, // U+05B0...U+05BF
    0x0049, // U+05C0...U+05CF
    0x0018, // U+05F0...U+05FF
    0x3600, // U+0600...U+060F
    0xc800, // U+0610...U+061F
    0x3c00, // U+0660...U+066F
    0x0010, // U+06D0...U+06DF
    0x3fff, // U+0700...U+070F
    0x0380, // U+07F0...U+07FF
    0x7fff, // U+0830...U+083F
    0x4000, // U+0850...U+085F
    0x0030, // U+0960...U+096F
    0x0001, // U+0970...U+097F
    0x0001, // U+0AF0...U+0AFF
    0x0010, // U+0DF0...U+0DFF
    0x8000, // U+0E40...U+0E4F
    0x0c00, // U+0E50...U+0E5F
    0xfff0, // U+0F00...U+0F0F
    0x0017, // U+0F10...U+0F1F
    0x3c00, // U+0F30...U+0F3F
    0x0020, // U+0F80...U+0F8F
    0x061f, // U+0FD0...U+0FDF
    0xfc00, // U+1040...U+104F
    0x0800, // U+10F0...U+10FF
    0x01ff, // U+1360...U+136F
    0x0001, // U+1400...U+140F
    0x6000, // U+1660...U+166F
    0x1800, // U+1690...U+169F
    0x3800, // U+16E0...U+16EF
    0x0060, // U+1730...U+173F
    0x0770, // U+17D0...U+17DF
    0x07ff, // U+1800...U+180F
    0x0030, // U+1940...U+194F
    0xc000, // U+1A10...U+1A1F
    0x3f7f, // U+1AA0...U+1AAF
    0xfc00, // U+1B50...U+1B5F
    0x0001, // U+1B60...U+1B6F
    0xf000, // U+1BF0...U+1BFF
    0xf800, // U+1C30...U+1C3F
    0xc000, // U+1C70...U+1C7F
    0x00ff, // U+1CC0...U+1CCF
    0x0008, // U+1CD0...U+1CDF
    0xffff, // U+2010...U+201F
    0x00ff, // U+2020...U+202F
    0xffff, // U+2030...U+203F
    0xffef, // U+2040...U+204F
    0x7ffb, // U+2050...U+205F
    0x6000, // U+2070...U+207F
    0x6000, // U+2080...U+208F
    0x0f00, // U+2300...U+230F
    0x0600, // U+2320...U+232F
    0xff00, // U+2760...U+276F
    0x003f, // U+2770...U+277F
    0x0060, // U+27C0...U+27CF
    0xffc0, // U+27E0...U+27EF
    0xfff8, // U+2980...U+298F
    0x01ff, // U+2990...U+299F
    0x0f00, // U+29D0...U+29DF
    0x3000, // U+29F0...U+29FF
    0xde00, // U+2CF0...U+2CFF
    0x0001, // U+2D70...U+2D7F
    0xffff, // U+2E00...U+2E0F
    0xffff, // U+2E10...U+2E1F
    0x7fff, // U+2E20...U+2E2F
    0xffff, // U+2E30...U+2E3F
    0x0007, // U+2E40...U+2E4F
    0xff0e, // U+3000...U+300F
    0xfff3, // U+3010...U+301F
    0x2001, // U+3030...U+303F
    0x0001, // U+30A0...U+30AF
    0x0800, // U+30F0...U+30FF
    0xc000, // U+A4F0...U+A4FF
    0xe000, // U+A600...U+A60F
    0x4008, // U+A670...U+A67F
    0x00fc, // U+A6F0...U+A6FF
    0x00f0, // U+A870...U+A87F
    0xc000, // U+A8C0...U+A8CF
    0x0700, // U+A8F0...U+A8FF
    0xc000, // U+A920...U+A92F
    0x8000, // U+A950...U+A95F
    0x3ffe, // U+A9C0...U+A9CF
    0xc000, // U+A9D0...U+A9DF
    0xf000, // U+AA50...U+AA5F
    0xc000, // U+AAD0...U+AADF
    0x0003, // U+AAF0...U+AAFF
    0x0800, // U+ABE0...U+ABEF
    0xc000, // U+FD30...U+FD3F
    0x03ff, // U+FE10...U+FE1F
    0xffff, // U+FE30...U+FE3F
    0xffff, // U+FE40...U+FE4F
    0xfff7, // U+FE50...U+FE5F
    0x0d0b, // U+FE60...U+FE6F
    0xf7ee, // U+FF00...U+FF0F
    0x8c00, // U+FF10...U+FF1F
    0x0001, // U+FF20...U+FF2F
    0xb800, // U+FF30...U+FF3F
    0xa800, // U+FF50...U+FF5F
    0x003f, // U+FF60...U+FF6F
    0x0007, // U+10100...U+1010F
    0x8000, // U+10390...U+1039F
    0x0001, // U+103D0...U+103DF
    0x8000, // U+10560...U+1056F
    0x0080, // U+10850...U+1085F
    0x8000, // U+10910...U+1091F
    0x8000, // U+10930...U+1093F
    0x01ff, // U+10A50...U+10A5F
    0x8000, // U+10A70...U+10A7F
    0x007f, // U+10AF0...U+10AFF
    0xfe00, // U+10B30...U+10B3F
    0x1e00, // U+10B90...U+10B9F
    0x3f80, // U+11040...U+1104F
    0xd800, // U+110B0...U+110BF
    0x0003, // U+110C0...U+110CF
    0x000f, // U+11140...U+1114F
    0x0030, // U+11170...U+1117F
    0x21e0, // U+111C0...U+111CF
    0x3f00, // U+11230...U+1123F
    0x0040, // U+114C0...U+114CF
    0x03fe, // U+115C0...U+115CF
    0x000e, // U+11640...U+1164F
    0x001f, // U+12470...U+1247F
    0xc000, // U+16A60...U+16A6F
    0x0020, // U+16AF0...U+16AFF
    0x0f80, // U+16B30...U+16B3F
    0x0010, // U+16B40...U+16B4F
    0x8000, // U+1BC90...U+1BC9F
];

| 306 | pub(crate) fn is_ascii_punctuation(c: u8) -> bool { |
| 307 | c < 128 && (PUNCT_MASKS_ASCII[(c / 16) as usize] & (1 << (c & 15))) != 0 |
| 308 | } |
| 309 | |
| 310 | pub(crate) fn is_punctuation(c: char) -> bool { |
| 311 | let cp: u32 = c as u32; |
| 312 | if cp < 128 { |
| 313 | return is_ascii_punctuation(cp as u8); |
| 314 | } |
| 315 | if cp > 0x1BC9F { |
| 316 | return false; |
| 317 | } |
| 318 | let high: u16 = (cp / 16) as u16; |
| 319 | match PUNCT_TAB.binary_search(&high) { |
| 320 | Ok(index: usize) => (PUNCT_MASKS[index] & (1 << (cp & 15))) != 0, |
| 321 | _ => false, |
| 322 | } |
| 323 | } |
| 324 | |
#[cfg(test)]
mod tests {
    use super::{is_ascii_punctuation, is_punctuation};

    /// Spot-checks the ASCII table, including a byte outside the ASCII range.
    #[test]
    fn test_ascii() {
        assert!(is_ascii_punctuation(b'!'));
        assert!(is_ascii_punctuation(b'@'));
        assert!(is_ascii_punctuation(b'~'));
        assert!(!is_ascii_punctuation(b' '));
        assert!(!is_ascii_punctuation(b'0'));
        assert!(!is_ascii_punctuation(b'A'));
        assert!(!is_ascii_punctuation(0xA1));
    }

    /// Spot-checks the Unicode lookup, including the last covered code point
    /// (U+1BC9F) and the first one past it.
    #[test]
    fn test_unicode() {
        assert!(is_punctuation('~'));
        assert!(!is_punctuation(' '));

        assert!(is_punctuation('\u{00A1}'));
        assert!(is_punctuation('\u{060C}'));
        assert!(is_punctuation('\u{FF65}'));
        assert!(is_punctuation('\u{1BC9F}'));
        assert!(!is_punctuation('\u{1BCA0}'));
    }
}
