common.rs source code [crates/ucd-parse/src/common.rs]

1	use std::char;
2	use std::collections::BTreeMap;
3	use std::fmt;
4	use std::fs::File;
5	use std::io::{self, BufRead};
6	use std::marker::PhantomData;
7	use std::path::{Path, PathBuf};
8	use std::str::FromStr;
9
10	use once_cell::sync::Lazy;
11	use regex::Regex;
12
13	use crate::error::{Error, ErrorKind};
14
15	/// Parse a particular file in the UCD into a sequence of rows.
16	///
17	/// The given directory should be the directory to the UCD.
18	pub fn parse<P, D>(ucd_dir: P) -> Result<Vec<D>, Error>
19	where
20	P: AsRef<Path>,
21	D: UcdFile,
22	{
23	let mut xs: Vec = vec![];
24	for result: Result in D::from_dir(ucd_dir)? {
25	let x: D = result?;
26	xs.push(x);
27	}
28	Ok(xs)
29	}
30
31	/// Parse a particular file in the UCD into a map from codepoint to the record.
32	///
33	/// The given directory should be the directory to the UCD.
34	pub fn parse_by_codepoint<P, D>(
35	ucd_dir: P,
36	) -> Result<BTreeMap<Codepoint, D>, Error>
37	where
38	P: AsRef<Path>,
39	D: UcdFileByCodepoint,
40	{
41	let mut map: BTreeMap = BTreeMap::new();
42	for result: Result in D::from_dir(ucd_dir)? {
43	let x: D = result?;
44	for cp: Codepoint in x.codepoints() {
45	map.insert(key:cp, value:x.clone());
46	}
47	}
48	Ok(map)
49	}
50
51	/// Parse a particular file in the UCD into a map from codepoint to all
52	/// records associated with that codepoint.
53	///
54	/// This is useful for files that have multiple records for each codepoint.
55	/// For example, the `NameAliases.txt` file lists multiple aliases for some
56	/// codepoints.
57	///
58	/// The given directory should be the directory to the UCD.
59	pub fn parse_many_by_codepoint<P, D>(
60	ucd_dir: P,
61	) -> Result<BTreeMap<Codepoint, Vec<D>>, Error>
62	where
63	P: AsRef<Path>,
64	D: UcdFileByCodepoint,
65	{
66	let mut map: BTreeMap> = BTreeMap::new();
67	for result: Result in D::from_dir(ucd_dir)? {
68	let x: D = result?;
69	for cp: Codepoint in x.codepoints() {
70	map.entry(cp).or_insert(default:vec![]).push(x.clone());
71	}
72	}
73	Ok(map)
74	}
75
76	/// Given a path pointing at the root of the `ucd_dir`, attempts to determine
77	/// it's unicode version.
78	///
79	/// This just checks the readme and the very first line of PropList.txt -- in
80	/// practice this works for all versions of UCD since 4.1.0.
81	pub fn ucd_directory_version<D: ?Sized + AsRef<Path>>(
82	ucd_dir: &D,
83	) -> Result<(u64, u64, u64), Error> {
84	// Avoid duplication from generic path parameter.
85	fn ucd_directory_version_inner(
86	ucd_dir: &Path,
87	) -> Result<(u64, u64, u64), Error> {
88	static VERSION_RX: Lazy<Regex> = Lazy::new(\|\| {
89	Regex::new(r"-([0-9]+).([0-9]+).([0-9]+).txt").unwrap()
90	});
91
92	let proplist = ucd_dir.join("PropList.txt");
93	let contents = first_line(&proplist)?;
94	let caps = match VERSION_RX.captures(&contents) {
95	Some(c) => c,
96	None => {
97	return err!("Failed to find version in line {:?}", contents)
98	}
99	};
100
101	let capture_to_num = \|n\| {
102	caps.get(n).unwrap().as_str().parse::<u64>().map_err(\|e\| Error {
103	kind: ErrorKind::Parse(format!(
104	"Failed to parse version from {:?} in PropList.txt: {}",
105	contents, e
106	)),
107	line: Some(`0`),
108	path: Some(proplist.clone()),
109	})
110	};
111	let major = capture_to_num(`1`)?;
112	let minor = capture_to_num(`2`)?;
113	let patch = capture_to_num(`3`)?;
114
115	Ok((major, minor, patch))
116	}
117	ucd_directory_version_inner(ucd_dir.as_ref())
118	}
119
120	fn first_line(path: &Path) -> Result<String, Error> {
121	let file: File = std::fs::File::open(path).map_err(\|e: Error\| Error {
122	kind: ErrorKind::Io(e),
123	line: None,
124	path: Some(path.into()),
125	})?;
126
127	let mut reader: BufReader = std::io::BufReader::new(inner:file);
128	let mut line_contents: String = String::new();
129	reader.read_line(&mut line_contents).map_err(\|e: Error\| Error {
130	kind: ErrorKind::Io(e),
131	line: None,
132	path: Some(path.into()),
133	})?;
134	Ok(line_contents)
135	}
136
137	/// A helper function for parsing a common record format that associates one
138	/// or more codepoints with a string value.
139	pub fn parse_codepoint_association<'a>(
140	line: &'a str,
141	) -> Result<(Codepoints, &'a str), Error> {
142	static PARTS: Lazy<Regex> = Lazy::new(\|\| {
143	Regex::new(
144	r"(?x)
145	^
146	\s(?P<codepoints>[^\s;]+)\s;
147	\s(?P<property>[^;\x23]+)\s
148	",
149	)
150	.unwrap()
151	});
152
153	let caps = match PARTS.captures(line.trim()) {
154	Some(caps) => caps,
155	None => return err!("invalid PropList line: '{}'", line),
156	};
157	let property = match caps.name("property") {
158	Some(property) => property.as_str().trim(),
159	None => {
160	return err!(
161	"could not find property name in PropList line: '{}'",
162	line
163	)
164	}
165	};
166	Ok((caps["codepoints"].parse()?, property))
167	}
168
169	/// A helper function for parsing a sequence of space separated codepoints.
170	/// The sequence is permitted to be empty.
171	pub fn parse_codepoint_sequence(s: &str) -> Result<Vec<Codepoint>, Error> {
172	let mut cps: Vec = vec![];
173	for cp: &str in s.trim().split_whitespace() {
174	cps.push(cp.parse()?);
175	}
176	Ok(cps)
177	}
178
179	/// A helper function for parsing a single test for the various break
180	/// algorithms.
181	///
182	/// Upon success, this returns the UTF-8 encoded groups of codepoints along
183	/// with the comment associated with the test. The comment is a human readable
184	/// description of the test that may prove useful for debugging.
185	pub fn parse_break_test(line: &str) -> Result<(Vec<String>, String), Error> {
186	static PARTS: Lazy<Regex> = Lazy::new(\|\| {
187	Regex::new(
188	r"(?x)
189	^
190	(?:÷\|×)
191	(?P<groups>(?:\s[0-9A-Fa-f]{4,5}\s(?:÷\|×))+)
192	\s+
193	\#(?P<comment>.+)
194	$
195	",
196	)
197	.unwrap()
198	});
199	static GROUP: Lazy<Regex> = Lazy::new(\|\| {
200	Regex::new(
201	r"(?x)
202	(?P<codepoint>[0-9A-Fa-f]{4,5})\s(?P<kind>÷\|×)
203	",
204	)
205	.unwrap()
206	});
207
208	let caps = match PARTS.captures(line.trim()) {
209	Some(caps) => caps,
210	None => return err!("invalid break test line: '{}'", line),
211	};
212	let comment = caps["comment"].trim().to_string();
213
214	let mut groups = vec![];
215	let mut cur = String::new();
216	for cap in GROUP.captures_iter(&caps["groups"]) {
217	let cp: Codepoint = cap["codepoint"].parse()?;
218	let ch = match cp.scalar() {
219	Some(ch) => ch,
220	None => {
221	return err!(
222	"invalid codepoint '{:X}' in line: '{}'",
223	cp.value(),
224	line
225	)
226	}
227	};
228	cur.push(ch);
229	if &cap["kind"] == "÷" {
230	groups.push(cur);
231	cur = String::new();
232	}
233	}
234	Ok((groups, comment))
235	}
236
237	/// Describes a single UCD file.
238	pub trait UcdFile:
239	Clone + fmt::Debug + Default + Eq + FromStr<Err = Error> + PartialEq
240	{
241	/// The file path corresponding to this file, relative to the UCD
242	/// directory.
243	fn relative_file_path() -> &'static Path;
244
245	/// The full file path corresponding to this file given the UCD directory
246	/// path.
247	fn file_path<P: AsRef<Path>>(ucd_dir: P) -> PathBuf {
248	ucd_dir.as_ref().join(Self::relative_file_path())
249	}
250
251	/// Create an iterator over each record in this UCD file.
252	///
253	/// The parameter should correspond to the directory containing the UCD.
254	fn from_dir<P: AsRef<Path>>(
255	ucd_dir: P,
256	) -> Result<UcdLineParser<File, Self>, Error> {
257	UcdLineParser::from_path(Self::file_path(ucd_dir))
258	}
259	}
260
261	/// Describes a single UCD file where every record in the file is associated
262	/// with one or more codepoints.
263	pub trait UcdFileByCodepoint: UcdFile {
264	/// Returns the codepoints associated with this record.
265	fn codepoints(&self) -> CodepointIter;
266	}
267
/// Parses a UCD file one line at a time.
///
/// A parser is normally obtained through the
/// [`UcdFile::from_dir`](trait.UcdFile.html) method.
///
/// `R` is the underlying `io::Read` implementation that supplies the UCD
/// data.
///
/// `D` is the record type that each line is parsed into.
#[derive(Debug)]
pub struct UcdLineParser<R, D> {
    // Path of the file being read, if known; attached to reported errors.
    path: Option<PathBuf>,
    // Buffered wrapper around the caller's reader.
    rdr: io::BufReader<R>,
    // Scratch buffer holding the line currently being examined.
    line: String,
    // Number of lines read so far.
    line_number: u64,
    // Ties the parser to its record type without storing a record.
    _data: PhantomData<D>,
}
286
287	impl<D> UcdLineParser<File, D> {
288	/// Create a new parser from the given file path.
289	pub(crate) fn from_path<P: AsRef<Path>>(
290	path: P,
291	) -> Result<UcdLineParser<File, D>, Error> {
292	let path: &Path = path.as_ref();
293	let file: File = File::open(path).map_err(\|e: Error\| Error {
294	kind: ErrorKind::Io(e),
295	line: None,
296	path: Some(path.to_path_buf()),
297	})?;
298	Ok(UcdLineParser::new(path:Some(path.to_path_buf()), rdr:file))
299	}
300	}
301
302	impl<R: io::Read, D> UcdLineParser<R, D> {
303	/// Create a new parser that parses the reader given.
304	///
305	/// The type of data parsed is determined when the `parse_next` function
306	/// is called by virtue of the type requested.
307	///
308	/// Note that the reader is buffered internally, so the caller does not
309	/// need to provide their own buffering.
310	pub(crate) fn new(path: Option<PathBuf>, rdr: R) -> UcdLineParser<R, D> {
311	UcdLineParser {
312	path,
313	rdr: io::BufReader::new(inner:rdr),
314	line: String::new(),
315	line_number: `0`,
316	_data: PhantomData,
317	}
318	}
319	}
320
321	impl<R: io::Read, D: FromStr<Err = Error>> Iterator for UcdLineParser<R, D> {
322	type Item = Result<D, Error>;
323
324	fn next(&mut self) -> Option<Result<D, Error>> {
325	loop {
326	self.line_number += `1`;
327	self.line.clear();
328	let n = match self.rdr.read_line(&mut self.line) {
329	Err(err) => {
330	return Some(Err(Error {
331	kind: ErrorKind::Io(err),
332	line: None,
333	path: self.path.clone(),
334	}))
335	}
336	Ok(n) => n,
337	};
338	if n == `0` {
339	return None;
340	}
341	if !self.line.starts_with('#') && !self.line.trim().is_empty() {
342	break;
343	}
344	}
345	let line_number = self.line_number;
346	Some(self.line.parse().map_err(\|mut err: Error\| {
347	err.line = Some(line_number);
348	err
349	}))
350	}
351	}
352
353	/// A representation of either a single codepoint or a range of codepoints.
354	#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, PartialOrd, Ord)]
355	pub enum Codepoints {
356	/// A single codepoint.
357	Single(Codepoint),
358	/// A range of codepoints.
359	Range(CodepointRange),
360	}
361
362	impl Default for Codepoints {
363	fn default() -> Codepoints {
364	Codepoints::Single(Codepoint::default())
365	}
366	}
367
368	impl IntoIterator for Codepoints {
369	type IntoIter = CodepointIter;
370	type Item = Codepoint;
371
372	fn into_iter(self) -> CodepointIter {
373	match self {
374	Codepoints::Single(x: Codepoint) => x.into_iter(),
375	Codepoints::Range(x: CodepointRange) => x.into_iter(),
376	}
377	}
378	}
379
380	impl FromStr for Codepoints {
381	type Err = Error;
382
383	fn from_str(s: &str) -> Result<Codepoints, Error> {
384	if s.contains("..") {
385	CodepointRange::from_str(s).map(op:Codepoints::Range)
386	} else {
387	Codepoint::from_str(s).map(op:Codepoints::Single)
388	}
389	}
390	}
391
392	impl fmt::Display for Codepoints {
393	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
394	match *self {
395	Codepoints::Single(ref x: &Codepoint) => x.fmt(f),
396	Codepoints::Range(ref x: &CodepointRange) => x.fmt(f),
397	}
398	}
399	}
400
401	impl PartialEq<u32> for Codepoints {
402	fn eq(&self, other: &u32) -> bool {
403	match *self {
404	Codepoints::Single(ref x: &Codepoint) => x == other,
405	Codepoints::Range(ref x: &CodepointRange) => x == &(other, other),
406	}
407	}
408	}
409
410	impl PartialEq<Codepoint> for Codepoints {
411	fn eq(&self, other: &Codepoint) -> bool {
412	match *self {
413	Codepoints::Single(ref x: &Codepoint) => x == other,
414	Codepoints::Range(ref x: &CodepointRange) => x == &(other, other),
415	}
416	}
417	}
418
419	impl PartialEq<(u32, u32)> for Codepoints {
420	fn eq(&self, other: &(u32, u32)) -> bool {
421	match *self {
422	Codepoints::Single(ref x: &Codepoint) => &(x.value(), x.value()) == other,
423	Codepoints::Range(ref x: &CodepointRange) => x == other,
424	}
425	}
426	}
427
428	impl PartialEq<(Codepoint, Codepoint)> for Codepoints {
429	fn eq(&self, other: &(Codepoint, Codepoint)) -> bool {
430	match *self {
431	Codepoints::Single(ref x: &Codepoint) => &(x, x) == other,
432	Codepoints::Range(ref x: &CodepointRange) => x == other,
433	}
434	}
435	}
436
437	/// A range of Unicode codepoints. The range is inclusive; both ends of the
438	/// range are guaranteed to be valid codepoints.
439	#[derive(
440	Clone, Copy, Debug, Default, Eq, Hash, PartialEq, PartialOrd, Ord,
441	)]
442	pub struct CodepointRange {
443	/// The start of the codepoint range.
444	pub start: Codepoint,
445	/// The end of the codepoint range.
446	pub end: Codepoint,
447	}
448
449	impl IntoIterator for CodepointRange {
450	type IntoIter = CodepointIter;
451	type Item = Codepoint;
452
453	fn into_iter(self) -> CodepointIter {
454	CodepointIter { next: self.start.value(), range: self }
455	}
456	}
457
458	impl FromStr for CodepointRange {
459	type Err = Error;
460
461	fn from_str(s: &str) -> Result<CodepointRange, Error> {
462	static PARTS: Lazy<Regex> = Lazy::new(\|\| {
463	RegexResult::new(re:r"^(?P<start>[A-Z0-9]+)\.\.(?P<end>[A-Z0-9]+)$")
464	.unwrap()
465	});
466	let caps: Captures<'_> = match PARTS.captures(text:s) {
467	Some(caps: Captures<'_>) => caps,
468	None => return err!("invalid codepoint range: '{}'", s),
469	};
470	let start: Codepoint = caps["start"].parse().or_else(\|err: Error\| {
471	err!("failed to parse '{}' as a codepoint range: {}", s, err)
472	})?;
473	let end: Codepoint = caps["end"].parse().or_else(\|err: Error\| {
474	err!("failed to parse '{}' as a codepoint range: {}", s, err)
475	})?;
476	Ok(CodepointRange { start, end })
477	}
478	}
479
480	impl fmt::Display for CodepointRange {
481	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
482	write!(f, "{}..{}", self.start, self.end)
483	}
484	}
485
486	impl PartialEq<(u32, u32)> for CodepointRange {
487	fn eq(&self, other: &(u32, u32)) -> bool {
488	&(self.start.value(), self.end.value()) == other
489	}
490	}
491
492	impl PartialEq<(Codepoint, Codepoint)> for CodepointRange {
493	fn eq(&self, other: &(Codepoint, Codepoint)) -> bool {
494	&(self.start, self.end) == other
495	}
496	}
497
/// One Unicode codepoint.
///
/// The string representation of this type is a hexadecimal number, and the
/// value is guaranteed to lie in the range `[0, 10FFFF]`.
///
/// Unlike Rust's `char` type, a value of this type may be a surrogate
/// codepoint.
#[derive(
    Clone, Copy, Debug, Default, Eq, Hash, PartialEq, PartialOrd, Ord,
)]
pub struct Codepoint(u32);
508
509	impl Codepoint {
510	/// Create a new codepoint from a `u32`.
511	///
512	/// If the given number is not a valid codepoint, then this returns an
513	/// error.
514	pub fn from_u32(n: u32) -> Result<Codepoint, Error> {
515	if n > `0x10FFFF` {
516	err!("{:x} is not a valid Unicode codepoint", n)
517	} else {
518	Ok(Codepoint(n))
519	}
520	}
521
522	/// Return the underlying `u32` codepoint value.
523	pub fn value(self) -> u32 {
524	self.0
525	}
526
527	/// Attempt to convert this codepoint to a Unicode scalar value.
528	///
529	/// If this is a surrogate codepoint, then this returns `None`.
530	pub fn scalar(self) -> Option<char> {
531	char::from_u32(self.0)
532	}
533	}
534
535	impl IntoIterator for Codepoint {
536	type IntoIter = CodepointIter;
537	type Item = Codepoint;
538
539	fn into_iter(self) -> CodepointIter {
540	let range: CodepointRange = CodepointRange { start: self, end: self };
541	CodepointIter { next: self.value(), range }
542	}
543	}
544
545	impl FromStr for Codepoint {
546	type Err = Error;
547
548	fn from_str(s: &str) -> Result<Codepoint, Error> {
549	match u32::from_str_radix(src:s, radix:`16`) {
550	Ok(n: u32) => Codepoint::from_u32(n),
551	Err(err: ParseIntError) => {
552	return err!(
553	"failed to parse '{}' as a hexadecimal codepoint: {}",
554	s,
555	err
556	);
557	}
558	}
559	}
560	}
561
562	impl fmt::Display for Codepoint {
563	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
564	write!(f, "{:`04`X}", self.0)
565	}
566	}
567
568	impl PartialEq<u32> for Codepoint {
569	fn eq(&self, other: &u32) -> bool {
570	self.0 == *other
571	}
572	}
573
574	impl PartialEq<Codepoint> for u32 {
575	fn eq(&self, other: &Codepoint) -> bool {
576	*self == other.0
577	}
578	}
579
580	/// An iterator over a range of Unicode codepoints.
581	#[derive(Debug)]
582	pub struct CodepointIter {
583	next: u32,
584	range: CodepointRange,
585	}
586
587	impl Iterator for CodepointIter {
588	type Item = Codepoint;
589
590	fn next(&mut self) -> Option<Codepoint> {
591	if self.next > self.range.end.value() {
592	return None;
593	}
594	let current: u32 = self.next;
595	self.next += `1`;
596	Some(Codepoint::from_u32(current).unwrap())
597	}
598	}
599