1use std::char;
2use std::collections::BTreeMap;
3use std::fmt;
4use std::fs::File;
5use std::io::{self, BufRead};
6use std::marker::PhantomData;
7use std::path::{Path, PathBuf};
8use std::str::FromStr;
9
10use once_cell::sync::Lazy;
11use regex::Regex;
12
13use crate::error::{Error, ErrorKind};
14
15/// Parse a particular file in the UCD into a sequence of rows.
16///
17/// The given directory should be the directory to the UCD.
18pub fn parse<P, D>(ucd_dir: P) -> Result<Vec<D>, Error>
19where
20 P: AsRef<Path>,
21 D: UcdFile,
22{
23 let mut xs: Vec = vec![];
24 for result: Result in D::from_dir(ucd_dir)? {
25 let x: D = result?;
26 xs.push(x);
27 }
28 Ok(xs)
29}
30
31/// Parse a particular file in the UCD into a map from codepoint to the record.
32///
33/// The given directory should be the directory to the UCD.
34pub fn parse_by_codepoint<P, D>(
35 ucd_dir: P,
36) -> Result<BTreeMap<Codepoint, D>, Error>
37where
38 P: AsRef<Path>,
39 D: UcdFileByCodepoint,
40{
41 let mut map: BTreeMap = BTreeMap::new();
42 for result: Result in D::from_dir(ucd_dir)? {
43 let x: D = result?;
44 for cp: Codepoint in x.codepoints() {
45 map.insert(key:cp, value:x.clone());
46 }
47 }
48 Ok(map)
49}
50
51/// Parse a particular file in the UCD into a map from codepoint to all
52/// records associated with that codepoint.
53///
54/// This is useful for files that have multiple records for each codepoint.
55/// For example, the `NameAliases.txt` file lists multiple aliases for some
56/// codepoints.
57///
58/// The given directory should be the directory to the UCD.
59pub fn parse_many_by_codepoint<P, D>(
60 ucd_dir: P,
61) -> Result<BTreeMap<Codepoint, Vec<D>>, Error>
62where
63 P: AsRef<Path>,
64 D: UcdFileByCodepoint,
65{
66 let mut map: BTreeMap> = BTreeMap::new();
67 for result: Result in D::from_dir(ucd_dir)? {
68 let x: D = result?;
69 for cp: Codepoint in x.codepoints() {
70 map.entry(cp).or_insert(default:vec![]).push(x.clone());
71 }
72 }
73 Ok(map)
74}
75
76/// Given a path pointing at the root of the `ucd_dir`, attempts to determine
77/// it's unicode version.
78///
79/// This just checks the readme and the very first line of PropList.txt -- in
80/// practice this works for all versions of UCD since 4.1.0.
81pub fn ucd_directory_version<D: ?Sized + AsRef<Path>>(
82 ucd_dir: &D,
83) -> Result<(u64, u64, u64), Error> {
84 // Avoid duplication from generic path parameter.
85 fn ucd_directory_version_inner(
86 ucd_dir: &Path,
87 ) -> Result<(u64, u64, u64), Error> {
88 static VERSION_RX: Lazy<Regex> = Lazy::new(|| {
89 Regex::new(r"-([0-9]+).([0-9]+).([0-9]+).txt").unwrap()
90 });
91
92 let proplist = ucd_dir.join("PropList.txt");
93 let contents = first_line(&proplist)?;
94 let caps = match VERSION_RX.captures(&contents) {
95 Some(c) => c,
96 None => {
97 return err!("Failed to find version in line {:?}", contents)
98 }
99 };
100
101 let capture_to_num = |n| {
102 caps.get(n).unwrap().as_str().parse::<u64>().map_err(|e| Error {
103 kind: ErrorKind::Parse(format!(
104 "Failed to parse version from {:?} in PropList.txt: {}",
105 contents, e
106 )),
107 line: Some(0),
108 path: Some(proplist.clone()),
109 })
110 };
111 let major = capture_to_num(1)?;
112 let minor = capture_to_num(2)?;
113 let patch = capture_to_num(3)?;
114
115 Ok((major, minor, patch))
116 }
117 ucd_directory_version_inner(ucd_dir.as_ref())
118}
119
120fn first_line(path: &Path) -> Result<String, Error> {
121 let file: File = std::fs::File::open(path).map_err(|e: Error| Error {
122 kind: ErrorKind::Io(e),
123 line: None,
124 path: Some(path.into()),
125 })?;
126
127 let mut reader: BufReader = std::io::BufReader::new(inner:file);
128 let mut line_contents: String = String::new();
129 reader.read_line(&mut line_contents).map_err(|e: Error| Error {
130 kind: ErrorKind::Io(e),
131 line: None,
132 path: Some(path.into()),
133 })?;
134 Ok(line_contents)
135}
136
137/// A helper function for parsing a common record format that associates one
138/// or more codepoints with a string value.
139pub fn parse_codepoint_association<'a>(
140 line: &'a str,
141) -> Result<(Codepoints, &'a str), Error> {
142 static PARTS: Lazy<Regex> = Lazy::new(|| {
143 Regex::new(
144 r"(?x)
145 ^
146 \s*(?P<codepoints>[^\s;]+)\s*;
147 \s*(?P<property>[^;\x23]+)\s*
148 ",
149 )
150 .unwrap()
151 });
152
153 let caps = match PARTS.captures(line.trim()) {
154 Some(caps) => caps,
155 None => return err!("invalid PropList line: '{}'", line),
156 };
157 let property = match caps.name("property") {
158 Some(property) => property.as_str().trim(),
159 None => {
160 return err!(
161 "could not find property name in PropList line: '{}'",
162 line
163 )
164 }
165 };
166 Ok((caps["codepoints"].parse()?, property))
167}
168
169/// A helper function for parsing a sequence of space separated codepoints.
170/// The sequence is permitted to be empty.
171pub fn parse_codepoint_sequence(s: &str) -> Result<Vec<Codepoint>, Error> {
172 let mut cps: Vec = vec![];
173 for cp: &str in s.trim().split_whitespace() {
174 cps.push(cp.parse()?);
175 }
176 Ok(cps)
177}
178
179/// A helper function for parsing a single test for the various break
180/// algorithms.
181///
182/// Upon success, this returns the UTF-8 encoded groups of codepoints along
183/// with the comment associated with the test. The comment is a human readable
184/// description of the test that may prove useful for debugging.
185pub fn parse_break_test(line: &str) -> Result<(Vec<String>, String), Error> {
186 static PARTS: Lazy<Regex> = Lazy::new(|| {
187 Regex::new(
188 r"(?x)
189 ^
190 (?:÷|×)
191 (?P<groups>(?:\s[0-9A-Fa-f]{4,5}\s(?:÷|×))+)
192 \s+
193 \#(?P<comment>.+)
194 $
195 ",
196 )
197 .unwrap()
198 });
199 static GROUP: Lazy<Regex> = Lazy::new(|| {
200 Regex::new(
201 r"(?x)
202 (?P<codepoint>[0-9A-Fa-f]{4,5})\s(?P<kind>÷|×)
203 ",
204 )
205 .unwrap()
206 });
207
208 let caps = match PARTS.captures(line.trim()) {
209 Some(caps) => caps,
210 None => return err!("invalid break test line: '{}'", line),
211 };
212 let comment = caps["comment"].trim().to_string();
213
214 let mut groups = vec![];
215 let mut cur = String::new();
216 for cap in GROUP.captures_iter(&caps["groups"]) {
217 let cp: Codepoint = cap["codepoint"].parse()?;
218 let ch = match cp.scalar() {
219 Some(ch) => ch,
220 None => {
221 return err!(
222 "invalid codepoint '{:X}' in line: '{}'",
223 cp.value(),
224 line
225 )
226 }
227 };
228 cur.push(ch);
229 if &cap["kind"] == "÷" {
230 groups.push(cur);
231 cur = String::new();
232 }
233 }
234 Ok((groups, comment))
235}
236
237/// Describes a single UCD file.
238pub trait UcdFile:
239 Clone + fmt::Debug + Default + Eq + FromStr<Err = Error> + PartialEq
240{
241 /// The file path corresponding to this file, relative to the UCD
242 /// directory.
243 fn relative_file_path() -> &'static Path;
244
245 /// The full file path corresponding to this file given the UCD directory
246 /// path.
247 fn file_path<P: AsRef<Path>>(ucd_dir: P) -> PathBuf {
248 ucd_dir.as_ref().join(Self::relative_file_path())
249 }
250
251 /// Create an iterator over each record in this UCD file.
252 ///
253 /// The parameter should correspond to the directory containing the UCD.
254 fn from_dir<P: AsRef<Path>>(
255 ucd_dir: P,
256 ) -> Result<UcdLineParser<File, Self>, Error> {
257 UcdLineParser::from_path(Self::file_path(ucd_dir))
258 }
259}
260
261/// Describes a single UCD file where every record in the file is associated
262/// with one or more codepoints.
263pub trait UcdFileByCodepoint: UcdFile {
264 /// Returns the codepoints associated with this record.
265 fn codepoints(&self) -> CodepointIter;
266}
267
/// Parses a particular UCD file one line at a time.
///
/// A line parser is normally obtained through the
/// [`UcdFile::from_dir`](trait.UcdFile.html) method.
///
/// `R` is the underlying `io::Read` implementation that supplies the UCD
/// data, and `D` is the type of the record parsed out of each line.
#[derive(Debug)]
pub struct UcdLineParser<R, D> {
    /// The path of the data being read, if known; attached to errors.
    path: Option<PathBuf>,
    /// The buffered source of lines.
    rdr: io::BufReader<R>,
    /// A buffer holding the most recently read line, reused across reads.
    line: String,
    /// The 1-based number of the most recently read line; 0 before any read.
    line_number: u64,
    /// Ties the parser to its record type without storing a record.
    _data: PhantomData<D>,
}
286
287impl<D> UcdLineParser<File, D> {
288 /// Create a new parser from the given file path.
289 pub(crate) fn from_path<P: AsRef<Path>>(
290 path: P,
291 ) -> Result<UcdLineParser<File, D>, Error> {
292 let path: &Path = path.as_ref();
293 let file: File = File::open(path).map_err(|e: Error| Error {
294 kind: ErrorKind::Io(e),
295 line: None,
296 path: Some(path.to_path_buf()),
297 })?;
298 Ok(UcdLineParser::new(path:Some(path.to_path_buf()), rdr:file))
299 }
300}
301
302impl<R: io::Read, D> UcdLineParser<R, D> {
303 /// Create a new parser that parses the reader given.
304 ///
305 /// The type of data parsed is determined when the `parse_next` function
306 /// is called by virtue of the type requested.
307 ///
308 /// Note that the reader is buffered internally, so the caller does not
309 /// need to provide their own buffering.
310 pub(crate) fn new(path: Option<PathBuf>, rdr: R) -> UcdLineParser<R, D> {
311 UcdLineParser {
312 path,
313 rdr: io::BufReader::new(inner:rdr),
314 line: String::new(),
315 line_number: 0,
316 _data: PhantomData,
317 }
318 }
319}
320
321impl<R: io::Read, D: FromStr<Err = Error>> Iterator for UcdLineParser<R, D> {
322 type Item = Result<D, Error>;
323
324 fn next(&mut self) -> Option<Result<D, Error>> {
325 loop {
326 self.line_number += 1;
327 self.line.clear();
328 let n = match self.rdr.read_line(&mut self.line) {
329 Err(err) => {
330 return Some(Err(Error {
331 kind: ErrorKind::Io(err),
332 line: None,
333 path: self.path.clone(),
334 }))
335 }
336 Ok(n) => n,
337 };
338 if n == 0 {
339 return None;
340 }
341 if !self.line.starts_with('#') && !self.line.trim().is_empty() {
342 break;
343 }
344 }
345 let line_number = self.line_number;
346 Some(self.line.parse().map_err(|mut err: Error| {
347 err.line = Some(line_number);
348 err
349 }))
350 }
351}
352
353/// A representation of either a single codepoint or a range of codepoints.
354#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, PartialOrd, Ord)]
355pub enum Codepoints {
356 /// A single codepoint.
357 Single(Codepoint),
358 /// A range of codepoints.
359 Range(CodepointRange),
360}
361
362impl Default for Codepoints {
363 fn default() -> Codepoints {
364 Codepoints::Single(Codepoint::default())
365 }
366}
367
368impl IntoIterator for Codepoints {
369 type IntoIter = CodepointIter;
370 type Item = Codepoint;
371
372 fn into_iter(self) -> CodepointIter {
373 match self {
374 Codepoints::Single(x: Codepoint) => x.into_iter(),
375 Codepoints::Range(x: CodepointRange) => x.into_iter(),
376 }
377 }
378}
379
380impl FromStr for Codepoints {
381 type Err = Error;
382
383 fn from_str(s: &str) -> Result<Codepoints, Error> {
384 if s.contains("..") {
385 CodepointRange::from_str(s).map(op:Codepoints::Range)
386 } else {
387 Codepoint::from_str(s).map(op:Codepoints::Single)
388 }
389 }
390}
391
392impl fmt::Display for Codepoints {
393 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
394 match *self {
395 Codepoints::Single(ref x: &Codepoint) => x.fmt(f),
396 Codepoints::Range(ref x: &CodepointRange) => x.fmt(f),
397 }
398 }
399}
400
401impl PartialEq<u32> for Codepoints {
402 fn eq(&self, other: &u32) -> bool {
403 match *self {
404 Codepoints::Single(ref x: &Codepoint) => x == other,
405 Codepoints::Range(ref x: &CodepointRange) => x == &(*other, *other),
406 }
407 }
408}
409
410impl PartialEq<Codepoint> for Codepoints {
411 fn eq(&self, other: &Codepoint) -> bool {
412 match *self {
413 Codepoints::Single(ref x: &Codepoint) => x == other,
414 Codepoints::Range(ref x: &CodepointRange) => x == &(*other, *other),
415 }
416 }
417}
418
419impl PartialEq<(u32, u32)> for Codepoints {
420 fn eq(&self, other: &(u32, u32)) -> bool {
421 match *self {
422 Codepoints::Single(ref x: &Codepoint) => &(x.value(), x.value()) == other,
423 Codepoints::Range(ref x: &CodepointRange) => x == other,
424 }
425 }
426}
427
428impl PartialEq<(Codepoint, Codepoint)> for Codepoints {
429 fn eq(&self, other: &(Codepoint, Codepoint)) -> bool {
430 match *self {
431 Codepoints::Single(ref x: &Codepoint) => &(*x, *x) == other,
432 Codepoints::Range(ref x: &CodepointRange) => x == other,
433 }
434 }
435}
436
437/// A range of Unicode codepoints. The range is inclusive; both ends of the
438/// range are guaranteed to be valid codepoints.
439#[derive(
440 Clone, Copy, Debug, Default, Eq, Hash, PartialEq, PartialOrd, Ord,
441)]
442pub struct CodepointRange {
443 /// The start of the codepoint range.
444 pub start: Codepoint,
445 /// The end of the codepoint range.
446 pub end: Codepoint,
447}
448
449impl IntoIterator for CodepointRange {
450 type IntoIter = CodepointIter;
451 type Item = Codepoint;
452
453 fn into_iter(self) -> CodepointIter {
454 CodepointIter { next: self.start.value(), range: self }
455 }
456}
457
458impl FromStr for CodepointRange {
459 type Err = Error;
460
461 fn from_str(s: &str) -> Result<CodepointRange, Error> {
462 static PARTS: Lazy<Regex> = Lazy::new(|| {
463 RegexResult::new(re:r"^(?P<start>[A-Z0-9]+)\.\.(?P<end>[A-Z0-9]+)$")
464 .unwrap()
465 });
466 let caps: Captures<'_> = match PARTS.captures(text:s) {
467 Some(caps: Captures<'_>) => caps,
468 None => return err!("invalid codepoint range: '{}'", s),
469 };
470 let start: Codepoint = caps["start"].parse().or_else(|err: Error| {
471 err!("failed to parse '{}' as a codepoint range: {}", s, err)
472 })?;
473 let end: Codepoint = caps["end"].parse().or_else(|err: Error| {
474 err!("failed to parse '{}' as a codepoint range: {}", s, err)
475 })?;
476 Ok(CodepointRange { start, end })
477 }
478}
479
480impl fmt::Display for CodepointRange {
481 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
482 write!(f, "{}..{}", self.start, self.end)
483 }
484}
485
486impl PartialEq<(u32, u32)> for CodepointRange {
487 fn eq(&self, other: &(u32, u32)) -> bool {
488 &(self.start.value(), self.end.value()) == other
489 }
490}
491
492impl PartialEq<(Codepoint, Codepoint)> for CodepointRange {
493 fn eq(&self, other: &(Codepoint, Codepoint)) -> bool {
494 &(self.start, self.end) == other
495 }
496}
497
/// One Unicode codepoint.
///
/// The string representation of this type is a hexadecimal number. The value
/// is guaranteed to be in the range `[0, 10FFFF]`.
///
/// In contrast to Rust's `char` type, a surrogate codepoint is a permitted
/// value.
#[derive(
    Clone, Copy, Debug, Default, PartialEq, Eq, PartialOrd, Ord, Hash,
)]
pub struct Codepoint(u32);
508
509impl Codepoint {
510 /// Create a new codepoint from a `u32`.
511 ///
512 /// If the given number is not a valid codepoint, then this returns an
513 /// error.
514 pub fn from_u32(n: u32) -> Result<Codepoint, Error> {
515 if n > 0x10FFFF {
516 err!("{:x} is not a valid Unicode codepoint", n)
517 } else {
518 Ok(Codepoint(n))
519 }
520 }
521
522 /// Return the underlying `u32` codepoint value.
523 pub fn value(self) -> u32 {
524 self.0
525 }
526
527 /// Attempt to convert this codepoint to a Unicode scalar value.
528 ///
529 /// If this is a surrogate codepoint, then this returns `None`.
530 pub fn scalar(self) -> Option<char> {
531 char::from_u32(self.0)
532 }
533}
534
535impl IntoIterator for Codepoint {
536 type IntoIter = CodepointIter;
537 type Item = Codepoint;
538
539 fn into_iter(self) -> CodepointIter {
540 let range: CodepointRange = CodepointRange { start: self, end: self };
541 CodepointIter { next: self.value(), range }
542 }
543}
544
545impl FromStr for Codepoint {
546 type Err = Error;
547
548 fn from_str(s: &str) -> Result<Codepoint, Error> {
549 match u32::from_str_radix(src:s, radix:16) {
550 Ok(n: u32) => Codepoint::from_u32(n),
551 Err(err: ParseIntError) => {
552 return err!(
553 "failed to parse '{}' as a hexadecimal codepoint: {}",
554 s,
555 err
556 );
557 }
558 }
559 }
560}
561
562impl fmt::Display for Codepoint {
563 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
564 write!(f, "{:04X}", self.0)
565 }
566}
567
568impl PartialEq<u32> for Codepoint {
569 fn eq(&self, other: &u32) -> bool {
570 self.0 == *other
571 }
572}
573
574impl PartialEq<Codepoint> for u32 {
575 fn eq(&self, other: &Codepoint) -> bool {
576 *self == other.0
577 }
578}
579
580/// An iterator over a range of Unicode codepoints.
581#[derive(Debug)]
582pub struct CodepointIter {
583 next: u32,
584 range: CodepointRange,
585}
586
587impl Iterator for CodepointIter {
588 type Item = Codepoint;
589
590 fn next(&mut self) -> Option<Codepoint> {
591 if self.next > self.range.end.value() {
592 return None;
593 }
594 let current: u32 = self.next;
595 self.next += 1;
596 Some(Codepoint::from_u32(current).unwrap())
597 }
598}
599