1 | use std::char; |
2 | use std::collections::BTreeMap; |
3 | use std::fmt; |
4 | use std::fs::File; |
5 | use std::io::{self, BufRead}; |
6 | use std::marker::PhantomData; |
7 | use std::path::{Path, PathBuf}; |
8 | use std::str::FromStr; |
9 | |
10 | use once_cell::sync::Lazy; |
11 | use regex::Regex; |
12 | |
13 | use crate::error::{Error, ErrorKind}; |
14 | |
15 | /// Parse a particular file in the UCD into a sequence of rows. |
16 | /// |
17 | /// The given directory should be the directory to the UCD. |
18 | pub fn parse<P, D>(ucd_dir: P) -> Result<Vec<D>, Error> |
19 | where |
20 | P: AsRef<Path>, |
21 | D: UcdFile, |
22 | { |
23 | let mut xs: Vec = vec![]; |
24 | for result: Result in D::from_dir(ucd_dir)? { |
25 | let x: D = result?; |
26 | xs.push(x); |
27 | } |
28 | Ok(xs) |
29 | } |
30 | |
31 | /// Parse a particular file in the UCD into a map from codepoint to the record. |
32 | /// |
33 | /// The given directory should be the directory to the UCD. |
34 | pub fn parse_by_codepoint<P, D>( |
35 | ucd_dir: P, |
36 | ) -> Result<BTreeMap<Codepoint, D>, Error> |
37 | where |
38 | P: AsRef<Path>, |
39 | D: UcdFileByCodepoint, |
40 | { |
41 | let mut map: BTreeMap = BTreeMap::new(); |
42 | for result: Result in D::from_dir(ucd_dir)? { |
43 | let x: D = result?; |
44 | for cp: Codepoint in x.codepoints() { |
45 | map.insert(key:cp, value:x.clone()); |
46 | } |
47 | } |
48 | Ok(map) |
49 | } |
50 | |
51 | /// Parse a particular file in the UCD into a map from codepoint to all |
52 | /// records associated with that codepoint. |
53 | /// |
54 | /// This is useful for files that have multiple records for each codepoint. |
55 | /// For example, the `NameAliases.txt` file lists multiple aliases for some |
56 | /// codepoints. |
57 | /// |
58 | /// The given directory should be the directory to the UCD. |
59 | pub fn parse_many_by_codepoint<P, D>( |
60 | ucd_dir: P, |
61 | ) -> Result<BTreeMap<Codepoint, Vec<D>>, Error> |
62 | where |
63 | P: AsRef<Path>, |
64 | D: UcdFileByCodepoint, |
65 | { |
66 | let mut map: BTreeMap> = BTreeMap::new(); |
67 | for result: Result in D::from_dir(ucd_dir)? { |
68 | let x: D = result?; |
69 | for cp: Codepoint in x.codepoints() { |
70 | map.entry(cp).or_insert(default:vec![]).push(x.clone()); |
71 | } |
72 | } |
73 | Ok(map) |
74 | } |
75 | |
76 | /// Given a path pointing at the root of the `ucd_dir`, attempts to determine |
77 | /// it's unicode version. |
78 | /// |
79 | /// This just checks the readme and the very first line of PropList.txt -- in |
80 | /// practice this works for all versions of UCD since 4.1.0. |
81 | pub fn ucd_directory_version<D: ?Sized + AsRef<Path>>( |
82 | ucd_dir: &D, |
83 | ) -> Result<(u64, u64, u64), Error> { |
84 | // Avoid duplication from generic path parameter. |
85 | fn ucd_directory_version_inner( |
86 | ucd_dir: &Path, |
87 | ) -> Result<(u64, u64, u64), Error> { |
88 | static VERSION_RX: Lazy<Regex> = Lazy::new(|| { |
89 | Regex::new(r"-([0-9]+).([0-9]+).([0-9]+).txt" ).unwrap() |
90 | }); |
91 | |
92 | let proplist = ucd_dir.join("PropList.txt" ); |
93 | let contents = first_line(&proplist)?; |
94 | let caps = match VERSION_RX.captures(&contents) { |
95 | Some(c) => c, |
96 | None => { |
97 | return err!("Failed to find version in line {:?}" , contents) |
98 | } |
99 | }; |
100 | |
101 | let capture_to_num = |n| { |
102 | caps.get(n).unwrap().as_str().parse::<u64>().map_err(|e| Error { |
103 | kind: ErrorKind::Parse(format!( |
104 | "Failed to parse version from {:?} in PropList.txt: {}" , |
105 | contents, e |
106 | )), |
107 | line: Some(0), |
108 | path: Some(proplist.clone()), |
109 | }) |
110 | }; |
111 | let major = capture_to_num(1)?; |
112 | let minor = capture_to_num(2)?; |
113 | let patch = capture_to_num(3)?; |
114 | |
115 | Ok((major, minor, patch)) |
116 | } |
117 | ucd_directory_version_inner(ucd_dir.as_ref()) |
118 | } |
119 | |
120 | fn first_line(path: &Path) -> Result<String, Error> { |
121 | let file: File = std::fs::File::open(path).map_err(|e: Error| Error { |
122 | kind: ErrorKind::Io(e), |
123 | line: None, |
124 | path: Some(path.into()), |
125 | })?; |
126 | |
127 | let mut reader: BufReader = std::io::BufReader::new(inner:file); |
128 | let mut line_contents: String = String::new(); |
129 | reader.read_line(&mut line_contents).map_err(|e: Error| Error { |
130 | kind: ErrorKind::Io(e), |
131 | line: None, |
132 | path: Some(path.into()), |
133 | })?; |
134 | Ok(line_contents) |
135 | } |
136 | |
137 | /// A helper function for parsing a common record format that associates one |
138 | /// or more codepoints with a string value. |
139 | pub fn parse_codepoint_association<'a>( |
140 | line: &'a str, |
141 | ) -> Result<(Codepoints, &'a str), Error> { |
142 | static PARTS: Lazy<Regex> = Lazy::new(|| { |
143 | Regex::new( |
144 | r"(?x) |
145 | ^ |
146 | \s*(?P<codepoints>[^\s;]+)\s*; |
147 | \s*(?P<property>[^;\x23]+)\s* |
148 | " , |
149 | ) |
150 | .unwrap() |
151 | }); |
152 | |
153 | let caps = match PARTS.captures(line.trim()) { |
154 | Some(caps) => caps, |
155 | None => return err!("invalid PropList line: ' {}'" , line), |
156 | }; |
157 | let property = match caps.name("property" ) { |
158 | Some(property) => property.as_str().trim(), |
159 | None => { |
160 | return err!( |
161 | "could not find property name in PropList line: ' {}'" , |
162 | line |
163 | ) |
164 | } |
165 | }; |
166 | Ok((caps["codepoints" ].parse()?, property)) |
167 | } |
168 | |
169 | /// A helper function for parsing a sequence of space separated codepoints. |
170 | /// The sequence is permitted to be empty. |
171 | pub fn parse_codepoint_sequence(s: &str) -> Result<Vec<Codepoint>, Error> { |
172 | let mut cps: Vec = vec![]; |
173 | for cp: &str in s.trim().split_whitespace() { |
174 | cps.push(cp.parse()?); |
175 | } |
176 | Ok(cps) |
177 | } |
178 | |
179 | /// A helper function for parsing a single test for the various break |
180 | /// algorithms. |
181 | /// |
182 | /// Upon success, this returns the UTF-8 encoded groups of codepoints along |
183 | /// with the comment associated with the test. The comment is a human readable |
184 | /// description of the test that may prove useful for debugging. |
185 | pub fn parse_break_test(line: &str) -> Result<(Vec<String>, String), Error> { |
186 | static PARTS: Lazy<Regex> = Lazy::new(|| { |
187 | Regex::new( |
188 | r"(?x) |
189 | ^ |
190 | (?:÷|×) |
191 | (?P<groups>(?:\s[0-9A-Fa-f]{4,5}\s(?:÷|×))+) |
192 | \s+ |
193 | \#(?P<comment>.+) |
194 | $ |
195 | " , |
196 | ) |
197 | .unwrap() |
198 | }); |
199 | static GROUP: Lazy<Regex> = Lazy::new(|| { |
200 | Regex::new( |
201 | r"(?x) |
202 | (?P<codepoint>[0-9A-Fa-f]{4,5})\s(?P<kind>÷|×) |
203 | " , |
204 | ) |
205 | .unwrap() |
206 | }); |
207 | |
208 | let caps = match PARTS.captures(line.trim()) { |
209 | Some(caps) => caps, |
210 | None => return err!("invalid break test line: ' {}'" , line), |
211 | }; |
212 | let comment = caps["comment" ].trim().to_string(); |
213 | |
214 | let mut groups = vec![]; |
215 | let mut cur = String::new(); |
216 | for cap in GROUP.captures_iter(&caps["groups" ]) { |
217 | let cp: Codepoint = cap["codepoint" ].parse()?; |
218 | let ch = match cp.scalar() { |
219 | Some(ch) => ch, |
220 | None => { |
221 | return err!( |
222 | "invalid codepoint ' {:X}' in line: ' {}'" , |
223 | cp.value(), |
224 | line |
225 | ) |
226 | } |
227 | }; |
228 | cur.push(ch); |
229 | if &cap["kind" ] == "÷" { |
230 | groups.push(cur); |
231 | cur = String::new(); |
232 | } |
233 | } |
234 | Ok((groups, comment)) |
235 | } |
236 | |
237 | /// Describes a single UCD file. |
238 | pub trait UcdFile: |
239 | Clone + fmt::Debug + Default + Eq + FromStr<Err = Error> + PartialEq |
240 | { |
241 | /// The file path corresponding to this file, relative to the UCD |
242 | /// directory. |
243 | fn relative_file_path() -> &'static Path; |
244 | |
245 | /// The full file path corresponding to this file given the UCD directory |
246 | /// path. |
247 | fn file_path<P: AsRef<Path>>(ucd_dir: P) -> PathBuf { |
248 | ucd_dir.as_ref().join(Self::relative_file_path()) |
249 | } |
250 | |
251 | /// Create an iterator over each record in this UCD file. |
252 | /// |
253 | /// The parameter should correspond to the directory containing the UCD. |
254 | fn from_dir<P: AsRef<Path>>( |
255 | ucd_dir: P, |
256 | ) -> Result<UcdLineParser<File, Self>, Error> { |
257 | UcdLineParser::from_path(Self::file_path(ucd_dir)) |
258 | } |
259 | } |
260 | |
/// Describes a single UCD file where every record in the file is associated
/// with one or more codepoints.
///
/// This extends [`UcdFile`], so implementors can also be read with the
/// plain row-oriented helpers.
pub trait UcdFileByCodepoint: UcdFile {
    /// Returns the codepoints associated with this record.
    ///
    /// The codepoints are produced through a [`CodepointIter`].
    fn codepoints(&self) -> CodepointIter;
}
267 | |
/// A line oriented parser for a particular UCD file.
///
/// Callers can build a line parser via the
/// [`UcdFile::from_dir`](trait.UcdFile.html) method.
///
/// The `R` type parameter refers to the underlying `io::Read` implementation
/// from which the UCD data is read.
///
/// The `D` type parameter refers to the type of the record parsed out of each
/// line.
#[derive (Debug)]
pub struct UcdLineParser<R, D> {
    // Path of the data being read, if known; attached to I/O errors.
    path: Option<PathBuf>,
    // Buffered wrapper around the caller-supplied reader.
    rdr: io::BufReader<R>,
    // Buffer holding the most recently read line; cleared before each read.
    line: String,
    // Starts at 0 and is bumped before every read, so it is the 1-based
    // number of the most recently read line.
    line_number: u64,
    // Zero-sized marker tying the parser to the record type `D`.
    _data: PhantomData<D>,
}
286 | |
287 | impl<D> UcdLineParser<File, D> { |
288 | /// Create a new parser from the given file path. |
289 | pub(crate) fn from_path<P: AsRef<Path>>( |
290 | path: P, |
291 | ) -> Result<UcdLineParser<File, D>, Error> { |
292 | let path: &Path = path.as_ref(); |
293 | let file: File = File::open(path).map_err(|e: Error| Error { |
294 | kind: ErrorKind::Io(e), |
295 | line: None, |
296 | path: Some(path.to_path_buf()), |
297 | })?; |
298 | Ok(UcdLineParser::new(path:Some(path.to_path_buf()), rdr:file)) |
299 | } |
300 | } |
301 | |
302 | impl<R: io::Read, D> UcdLineParser<R, D> { |
303 | /// Create a new parser that parses the reader given. |
304 | /// |
305 | /// The type of data parsed is determined when the `parse_next` function |
306 | /// is called by virtue of the type requested. |
307 | /// |
308 | /// Note that the reader is buffered internally, so the caller does not |
309 | /// need to provide their own buffering. |
310 | pub(crate) fn new(path: Option<PathBuf>, rdr: R) -> UcdLineParser<R, D> { |
311 | UcdLineParser { |
312 | path, |
313 | rdr: io::BufReader::new(inner:rdr), |
314 | line: String::new(), |
315 | line_number: 0, |
316 | _data: PhantomData, |
317 | } |
318 | } |
319 | } |
320 | |
321 | impl<R: io::Read, D: FromStr<Err = Error>> Iterator for UcdLineParser<R, D> { |
322 | type Item = Result<D, Error>; |
323 | |
324 | fn next(&mut self) -> Option<Result<D, Error>> { |
325 | loop { |
326 | self.line_number += 1; |
327 | self.line.clear(); |
328 | let n = match self.rdr.read_line(&mut self.line) { |
329 | Err(err) => { |
330 | return Some(Err(Error { |
331 | kind: ErrorKind::Io(err), |
332 | line: None, |
333 | path: self.path.clone(), |
334 | })) |
335 | } |
336 | Ok(n) => n, |
337 | }; |
338 | if n == 0 { |
339 | return None; |
340 | } |
341 | if !self.line.starts_with('#' ) && !self.line.trim().is_empty() { |
342 | break; |
343 | } |
344 | } |
345 | let line_number = self.line_number; |
346 | Some(self.line.parse().map_err(|mut err: Error| { |
347 | err.line = Some(line_number); |
348 | err |
349 | })) |
350 | } |
351 | } |
352 | |
/// A representation of either a single codepoint or a range of codepoints.
///
/// This is the parsed form of a codepoint field, which is written either as
/// one hexadecimal codepoint or as `start..end`.
#[derive (Clone, Copy, Debug, Eq, Hash, PartialEq, PartialOrd, Ord)]
pub enum Codepoints {
    /// A single codepoint.
    Single(Codepoint),
    /// A range of codepoints.
    Range(CodepointRange),
}
361 | |
362 | impl Default for Codepoints { |
363 | fn default() -> Codepoints { |
364 | Codepoints::Single(Codepoint::default()) |
365 | } |
366 | } |
367 | |
368 | impl IntoIterator for Codepoints { |
369 | type IntoIter = CodepointIter; |
370 | type Item = Codepoint; |
371 | |
372 | fn into_iter(self) -> CodepointIter { |
373 | match self { |
374 | Codepoints::Single(x: Codepoint) => x.into_iter(), |
375 | Codepoints::Range(x: CodepointRange) => x.into_iter(), |
376 | } |
377 | } |
378 | } |
379 | |
380 | impl FromStr for Codepoints { |
381 | type Err = Error; |
382 | |
383 | fn from_str(s: &str) -> Result<Codepoints, Error> { |
384 | if s.contains(".." ) { |
385 | CodepointRange::from_str(s).map(op:Codepoints::Range) |
386 | } else { |
387 | Codepoint::from_str(s).map(op:Codepoints::Single) |
388 | } |
389 | } |
390 | } |
391 | |
392 | impl fmt::Display for Codepoints { |
393 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
394 | match *self { |
395 | Codepoints::Single(ref x: &Codepoint) => x.fmt(f), |
396 | Codepoints::Range(ref x: &CodepointRange) => x.fmt(f), |
397 | } |
398 | } |
399 | } |
400 | |
401 | impl PartialEq<u32> for Codepoints { |
402 | fn eq(&self, other: &u32) -> bool { |
403 | match *self { |
404 | Codepoints::Single(ref x: &Codepoint) => x == other, |
405 | Codepoints::Range(ref x: &CodepointRange) => x == &(*other, *other), |
406 | } |
407 | } |
408 | } |
409 | |
410 | impl PartialEq<Codepoint> for Codepoints { |
411 | fn eq(&self, other: &Codepoint) -> bool { |
412 | match *self { |
413 | Codepoints::Single(ref x: &Codepoint) => x == other, |
414 | Codepoints::Range(ref x: &CodepointRange) => x == &(*other, *other), |
415 | } |
416 | } |
417 | } |
418 | |
419 | impl PartialEq<(u32, u32)> for Codepoints { |
420 | fn eq(&self, other: &(u32, u32)) -> bool { |
421 | match *self { |
422 | Codepoints::Single(ref x: &Codepoint) => &(x.value(), x.value()) == other, |
423 | Codepoints::Range(ref x: &CodepointRange) => x == other, |
424 | } |
425 | } |
426 | } |
427 | |
428 | impl PartialEq<(Codepoint, Codepoint)> for Codepoints { |
429 | fn eq(&self, other: &(Codepoint, Codepoint)) -> bool { |
430 | match *self { |
431 | Codepoints::Single(ref x: &Codepoint) => &(*x, *x) == other, |
432 | Codepoints::Range(ref x: &CodepointRange) => x == other, |
433 | } |
434 | } |
435 | } |
436 | |
/// A range of Unicode codepoints. The range is inclusive; both ends of the
/// range are guaranteed to be valid codepoints.
///
/// The textual form is `start..end`, with each end written as a hexadecimal
/// codepoint.
#[derive (
Clone, Copy, Debug, Default, Eq, Hash, PartialEq, PartialOrd, Ord,
)]
pub struct CodepointRange {
    /// The start of the codepoint range.
    pub start: Codepoint,
    /// The end of the codepoint range.
    pub end: Codepoint,
}
448 | |
449 | impl IntoIterator for CodepointRange { |
450 | type IntoIter = CodepointIter; |
451 | type Item = Codepoint; |
452 | |
453 | fn into_iter(self) -> CodepointIter { |
454 | CodepointIter { next: self.start.value(), range: self } |
455 | } |
456 | } |
457 | |
458 | impl FromStr for CodepointRange { |
459 | type Err = Error; |
460 | |
461 | fn from_str(s: &str) -> Result<CodepointRange, Error> { |
462 | static PARTS: Lazy<Regex> = Lazy::new(|| { |
463 | RegexResult::new(re:r"^(?P<start>[A-Z0-9]+)\.\.(?P<end>[A-Z0-9]+)$" ) |
464 | .unwrap() |
465 | }); |
466 | let caps: Captures<'_> = match PARTS.captures(text:s) { |
467 | Some(caps: Captures<'_>) => caps, |
468 | None => return err!("invalid codepoint range: ' {}'" , s), |
469 | }; |
470 | let start: Codepoint = caps["start" ].parse().or_else(|err: Error| { |
471 | err!("failed to parse ' {}' as a codepoint range: {}" , s, err) |
472 | })?; |
473 | let end: Codepoint = caps["end" ].parse().or_else(|err: Error| { |
474 | err!("failed to parse ' {}' as a codepoint range: {}" , s, err) |
475 | })?; |
476 | Ok(CodepointRange { start, end }) |
477 | } |
478 | } |
479 | |
480 | impl fmt::Display for CodepointRange { |
481 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
482 | write!(f, " {}.. {}" , self.start, self.end) |
483 | } |
484 | } |
485 | |
486 | impl PartialEq<(u32, u32)> for CodepointRange { |
487 | fn eq(&self, other: &(u32, u32)) -> bool { |
488 | &(self.start.value(), self.end.value()) == other |
489 | } |
490 | } |
491 | |
492 | impl PartialEq<(Codepoint, Codepoint)> for CodepointRange { |
493 | fn eq(&self, other: &(Codepoint, Codepoint)) -> bool { |
494 | &(self.start, self.end) == other |
495 | } |
496 | } |
497 | |
/// A single Unicode codepoint.
///
/// This type's string representation is a hexadecimal number. It is guaranteed
/// to be in the range `[0, 10FFFF]`.
///
/// Note that unlike Rust's `char` type, this may be a surrogate codepoint.
///
/// The wrapped value is private; values are created through
/// `Codepoint::from_u32` or by parsing a hexadecimal string.
#[derive (
Clone, Copy, Debug, Default, Eq, Hash, PartialEq, PartialOrd, Ord,
)]
pub struct Codepoint(u32);
508 | |
509 | impl Codepoint { |
510 | /// Create a new codepoint from a `u32`. |
511 | /// |
512 | /// If the given number is not a valid codepoint, then this returns an |
513 | /// error. |
514 | pub fn from_u32(n: u32) -> Result<Codepoint, Error> { |
515 | if n > 0x10FFFF { |
516 | err!(" {:x} is not a valid Unicode codepoint" , n) |
517 | } else { |
518 | Ok(Codepoint(n)) |
519 | } |
520 | } |
521 | |
522 | /// Return the underlying `u32` codepoint value. |
523 | pub fn value(self) -> u32 { |
524 | self.0 |
525 | } |
526 | |
527 | /// Attempt to convert this codepoint to a Unicode scalar value. |
528 | /// |
529 | /// If this is a surrogate codepoint, then this returns `None`. |
530 | pub fn scalar(self) -> Option<char> { |
531 | char::from_u32(self.0) |
532 | } |
533 | } |
534 | |
535 | impl IntoIterator for Codepoint { |
536 | type IntoIter = CodepointIter; |
537 | type Item = Codepoint; |
538 | |
539 | fn into_iter(self) -> CodepointIter { |
540 | let range: CodepointRange = CodepointRange { start: self, end: self }; |
541 | CodepointIter { next: self.value(), range } |
542 | } |
543 | } |
544 | |
545 | impl FromStr for Codepoint { |
546 | type Err = Error; |
547 | |
548 | fn from_str(s: &str) -> Result<Codepoint, Error> { |
549 | match u32::from_str_radix(src:s, radix:16) { |
550 | Ok(n: u32) => Codepoint::from_u32(n), |
551 | Err(err: ParseIntError) => { |
552 | return err!( |
553 | "failed to parse ' {}' as a hexadecimal codepoint: {}" , |
554 | s, |
555 | err |
556 | ); |
557 | } |
558 | } |
559 | } |
560 | } |
561 | |
562 | impl fmt::Display for Codepoint { |
563 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
564 | write!(f, " {:04X}" , self.0) |
565 | } |
566 | } |
567 | |
568 | impl PartialEq<u32> for Codepoint { |
569 | fn eq(&self, other: &u32) -> bool { |
570 | self.0 == *other |
571 | } |
572 | } |
573 | |
574 | impl PartialEq<Codepoint> for u32 { |
575 | fn eq(&self, other: &Codepoint) -> bool { |
576 | *self == other.0 |
577 | } |
578 | } |
579 | |
/// An iterator over a range of Unicode codepoints.
///
/// Codepoints are yielded in increasing order, up to and including the end
/// of the range.
#[derive (Debug)]
pub struct CodepointIter {
    // Raw value of the next codepoint to yield; iteration stops once this
    // exceeds the end of `range`.
    next: u32,
    // The range being iterated over.
    range: CodepointRange,
}
586 | |
587 | impl Iterator for CodepointIter { |
588 | type Item = Codepoint; |
589 | |
590 | fn next(&mut self) -> Option<Codepoint> { |
591 | if self.next > self.range.end.value() { |
592 | return None; |
593 | } |
594 | let current: u32 = self.next; |
595 | self.next += 1; |
596 | Some(Codepoint::from_u32(current).unwrap()) |
597 | } |
598 | } |
599 | |