common.rs source code [crates/ucd_parse/src/common.rs]

1	use std::{
2	collections::BTreeMap,
3	fmt,
4	fs::File,
5	io::{self, BufRead},
6	path::{Path, PathBuf},
7	str::FromStr,
8	};
9
10	use crate::error::{Error, ErrorKind};
11
12	/// Parse a particular file in the UCD into a sequence of rows.
13	///
14	/// The given directory should be the directory to the UCD.
15	pub fn parse<P, D>(ucd_dir: P) -> Result<Vec<D>, Error>
16	where
17	P: AsRef<Path>,
18	D: UcdFile,
19	{
20	let mut xs: Vec = vec![];
21	for result: Result in D::from_dir(ucd_dir)? {
22	let x: D = result?;
23	xs.push(x);
24	}
25	Ok(xs)
26	}
27
28	/// Parse a particular file in the UCD into a map from codepoint to the record.
29	///
30	/// The given directory should be the directory to the UCD.
31	pub fn parse_by_codepoint<P, D>(
32	ucd_dir: P,
33	) -> Result<BTreeMap<Codepoint, D>, Error>
34	where
35	P: AsRef<Path>,
36	D: UcdFileByCodepoint,
37	{
38	let mut map: BTreeMap = BTreeMap::new();
39	for result: Result in D::from_dir(ucd_dir)? {
40	let x: D = result?;
41	for cp: Codepoint in x.codepoints() {
42	map.insert(key:cp, value:x.clone());
43	}
44	}
45	Ok(map)
46	}
47
48	/// Parse a particular file in the UCD into a map from codepoint to all
49	/// records associated with that codepoint.
50	///
51	/// This is useful for files that have multiple records for each codepoint.
52	/// For example, the `NameAliases.txt` file lists multiple aliases for some
53	/// codepoints.
54	///
55	/// The given directory should be the directory to the UCD.
56	pub fn parse_many_by_codepoint<P, D>(
57	ucd_dir: P,
58	) -> Result<BTreeMap<Codepoint, Vec<D>>, Error>
59	where
60	P: AsRef<Path>,
61	D: UcdFileByCodepoint,
62	{
63	let mut map: BTreeMap> = BTreeMap::new();
64	for result: Result in D::from_dir(ucd_dir)? {
65	let x: D = result?;
66	for cp: Codepoint in x.codepoints() {
67	map.entry(cp).or_insert(default:vec![]).push(x.clone());
68	}
69	}
70	Ok(map)
71	}
72
73	/// Given a path pointing at the root of the `ucd_dir`, attempts to determine
74	/// it's unicode version.
75	///
76	/// This just checks the readme and the very first line of PropList.txt -- in
77	/// practice this works for all versions of UCD since 4.1.0.
78	pub fn ucd_directory_version<D: ?Sized + AsRef<Path>>(
79	ucd_dir: &D,
80	) -> Result<(u64, u64, u64), Error> {
81	// Avoid duplication from generic path parameter.
82	fn ucd_directory_version_inner(
83	ucd_dir: &Path,
84	) -> Result<(u64, u64, u64), Error> {
85	let re_version_rx = regex!(r"-([0-9]+).([0-9]+).([0-9]+).txt");
86
87	let proplist = ucd_dir.join("PropList.txt");
88	let contents = first_line(&proplist)?;
89	let caps = match re_version_rx.captures(&contents) {
90	Some(c) => c,
91	None => {
92	return err!("Failed to find version in line {:?}", contents)
93	}
94	};
95
96	let capture_to_num = \|n\| {
97	caps.get(n).unwrap().as_str().parse::<u64>().map_err(\|e\| Error {
98	kind: ErrorKind::Parse(format!(
99	"Failed to parse version from {:?} in PropList.txt: {}",
100	contents, e
101	)),
102	line: Some(`0`),
103	path: Some(proplist.clone()),
104	})
105	};
106	let major = capture_to_num(`1`)?;
107	let minor = capture_to_num(`2`)?;
108	let patch = capture_to_num(`3`)?;
109
110	Ok((major, minor, patch))
111	}
112	ucd_directory_version_inner(ucd_dir.as_ref())
113	}
114
115	fn first_line(path: &Path) -> Result<String, Error> {
116	let file: File = std::fs::File::open(path).map_err(\|e: Error\| Error {
117	kind: ErrorKind::Io(e),
118	line: None,
119	path: Some(path.into()),
120	})?;
121
122	let mut reader: BufReader = std::io::BufReader::new(inner:file);
123	let mut line_contents: String = String::new();
124	reader.read_line(&mut line_contents).map_err(\|e: Error\| Error {
125	kind: ErrorKind::Io(e),
126	line: None,
127	path: Some(path.into()),
128	})?;
129	Ok(line_contents)
130	}
131
132	/// A helper function for parsing a common record format that associates one
133	/// or more codepoints with a string value.
134	pub fn parse_codepoint_association<'a>(
135	line: &'a str,
136	) -> Result<(Codepoints, &'a str), Error> {
137	let re_parts: &Regex = regex!(
138	r"(?x)
139	^
140	\s(?P<codepoints>[^\s;]+)\s;
141	\s(?P<property>[^;\x23]+)\s
142	",
143	);
144
145	let caps: Captures<'_> = match re_parts.captures(haystack:line.trim()) {
146	Some(caps: Captures<'_>) => caps,
147	None => return err!("invalid PropList line: '{}'", line),
148	};
149	let property: &str = match caps.name("property") {
150	Some(property: Match<'_>) => property.as_str().trim(),
151	None => {
152	return err!(
153	"could not find property name in PropList line: '{}'",
154	line
155	)
156	}
157	};
158	Ok((caps["codepoints"].parse()?, property))
159	}
160
161	/// A helper function for parsing a sequence of space separated codepoints.
162	/// The sequence is permitted to be empty.
163	pub fn parse_codepoint_sequence(s: &str) -> Result<Vec<Codepoint>, Error> {
164	let mut cps: Vec = vec![];
165	for cp: &str in s.trim().split_whitespace() {
166	cps.push(cp.parse()?);
167	}
168	Ok(cps)
169	}
170
171	/// A helper function for parsing a single test for the various break
172	/// algorithms.
173	///
174	/// Upon success, this returns the UTF-8 encoded groups of codepoints along
175	/// with the comment associated with the test. The comment is a human readable
176	/// description of the test that may prove useful for debugging.
177	pub fn parse_break_test(line: &str) -> Result<(Vec<String>, String), Error> {
178	let re_parts = regex!(
179	r"(?x)
180	^
181	(?:÷\|×)
182	(?P<groups>(?:\s[0-9A-Fa-f]{4,5}\s(?:÷\|×))+)
183	\s+
184	\#(?P<comment>.+)
185	$
186	",
187	);
188	let re_group = regex!(
189	r"(?x)
190	(?P<codepoint>[0-9A-Fa-f]{4,5})\s(?P<kind>÷\|×)
191	",
192	);
193
194	let caps = match re_parts.captures(line.trim()) {
195	Some(caps) => caps,
196	None => return err!("invalid break test line: '{}'", line),
197	};
198	let comment = caps["comment"].trim().to_string();
199
200	let mut groups = vec![];
201	let mut cur = String::new();
202	for cap in re_group.captures_iter(&caps["groups"]) {
203	let cp: Codepoint = cap["codepoint"].parse()?;
204	let ch = match cp.scalar() {
205	Some(ch) => ch,
206	None => {
207	return err!(
208	"invalid codepoint '{:X}' in line: '{}'",
209	cp.value(),
210	line
211	)
212	}
213	};
214	cur.push(ch);
215	if &cap["kind"] == "÷" {
216	groups.push(cur);
217	cur = String::new();
218	}
219	}
220	Ok((groups, comment))
221	}
222
223	/// Describes a single UCD file.
224	pub trait UcdFile:
225	Clone + fmt::Debug + Default + Eq + FromStr<Err = Error> + PartialEq
226	{
227	/// The file path corresponding to this file, relative to the UCD
228	/// directory.
229	fn relative_file_path() -> &'static Path;
230
231	/// The full file path corresponding to this file given the UCD directory
232	/// path.
233	fn file_path<P: AsRef<Path>>(ucd_dir: P) -> PathBuf {
234	ucd_dir.as_ref().join(Self::relative_file_path())
235	}
236
237	/// Create an iterator over each record in this UCD file.
238	///
239	/// The parameter should correspond to the directory containing the UCD.
240	fn from_dir<P: AsRef<Path>>(
241	ucd_dir: P,
242	) -> Result<UcdLineParser<File, Self>, Error> {
243	UcdLineParser::from_path(Self::file_path(ucd_dir))
244	}
245	}
246
247	/// Describes a single UCD file where every record in the file is associated
248	/// with one or more codepoints.
249	pub trait UcdFileByCodepoint: UcdFile {
250	/// Returns the codepoints associated with this record.
251	fn codepoints(&self) -> CodepointIter;
252	}
253
/// A parser that reads a UCD file one line at a time.
///
/// Callers can build a line parser via the
/// [`UcdFile::from_dir`](trait.UcdFile.html) method.
///
/// `R` is the underlying `io::Read` implementation that supplies the UCD
/// data, and `D` is the type of the record parsed out of each line.
#[derive(Debug)]
pub struct UcdLineParser<R, D> {
    /// The path being read, if known. It is attached to I/O errors.
    path: Option<PathBuf>,
    /// The buffered source of UCD data.
    rdr: io::BufReader<R>,
    /// A buffer holding the most recently read line; reused between lines.
    line: String,
    /// The number of lines read so far.
    line_number: u64,
    /// Ties the record type to the parser without storing a record.
    _data: std::marker::PhantomData<D>,
}
272
273	impl<D> UcdLineParser<File, D> {
274	/// Create a new parser from the given file path.
275	pub(crate) fn from_path<P: AsRef<Path>>(
276	path: P,
277	) -> Result<UcdLineParser<File, D>, Error> {
278	let path: &Path = path.as_ref();
279	let file: File = File::open(path).map_err(\|e: Error\| Error {
280	kind: ErrorKind::Io(e),
281	line: None,
282	path: Some(path.to_path_buf()),
283	})?;
284	Ok(UcdLineParser::new(path:Some(path.to_path_buf()), rdr:file))
285	}
286	}
287
288	impl<R: io::Read, D> UcdLineParser<R, D> {
289	/// Create a new parser that parses the reader given.
290	///
291	/// The type of data parsed is determined when the `parse_next` function
292	/// is called by virtue of the type requested.
293	///
294	/// Note that the reader is buffered internally, so the caller does not
295	/// need to provide their own buffering.
296	pub(crate) fn new(path: Option<PathBuf>, rdr: R) -> UcdLineParser<R, D> {
297	UcdLineParser {
298	path,
299	rdr: io::BufReader::new(inner:rdr),
300	line: String::new(),
301	line_number: `0`,
302	_data: std::marker::PhantomData,
303	}
304	}
305	}
306
307	impl<R: io::Read, D: FromStr<Err = Error>> Iterator for UcdLineParser<R, D> {
308	type Item = Result<D, Error>;
309
310	fn next(&mut self) -> Option<Result<D, Error>> {
311	loop {
312	self.line_number += `1`;
313	self.line.clear();
314	let n = match self.rdr.read_line(&mut self.line) {
315	Err(err) => {
316	return Some(Err(Error {
317	kind: ErrorKind::Io(err),
318	line: None,
319	path: self.path.clone(),
320	}))
321	}
322	Ok(n) => n,
323	};
324	if n == `0` {
325	return None;
326	}
327	if !self.line.starts_with('#') && !self.line.trim().is_empty() {
328	break;
329	}
330	}
331	let line_number = self.line_number;
332	Some(self.line.parse().map_err(\|mut err: Error\| {
333	err.line = Some(line_number);
334	err
335	}))
336	}
337	}
338
339	/// A representation of either a single codepoint or a range of codepoints.
340	#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, PartialOrd, Ord)]
341	pub enum Codepoints {
342	/// A single codepoint.
343	Single(Codepoint),
344	/// A range of codepoints.
345	Range(CodepointRange),
346	}
347
348	impl Default for Codepoints {
349	fn default() -> Codepoints {
350	Codepoints::Single(Codepoint::default())
351	}
352	}
353
354	impl IntoIterator for Codepoints {
355	type IntoIter = CodepointIter;
356	type Item = Codepoint;
357
358	fn into_iter(self) -> CodepointIter {
359	match self {
360	Codepoints::Single(x: Codepoint) => x.into_iter(),
361	Codepoints::Range(x: CodepointRange) => x.into_iter(),
362	}
363	}
364	}
365
366	impl FromStr for Codepoints {
367	type Err = Error;
368
369	fn from_str(s: &str) -> Result<Codepoints, Error> {
370	if s.contains("..") {
371	CodepointRange::from_str(s).map(op:Codepoints::Range)
372	} else {
373	Codepoint::from_str(s).map(op:Codepoints::Single)
374	}
375	}
376	}
377
378	impl fmt::Display for Codepoints {
379	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
380	match *self {
381	Codepoints::Single(ref x: &Codepoint) => x.fmt(f),
382	Codepoints::Range(ref x: &CodepointRange) => x.fmt(f),
383	}
384	}
385	}
386
387	impl PartialEq<u32> for Codepoints {
388	fn eq(&self, other: &u32) -> bool {
389	match *self {
390	Codepoints::Single(ref x: &Codepoint) => x == other,
391	Codepoints::Range(ref x: &CodepointRange) => x == &(other, other),
392	}
393	}
394	}
395
396	impl PartialEq<Codepoint> for Codepoints {
397	fn eq(&self, other: &Codepoint) -> bool {
398	match *self {
399	Codepoints::Single(ref x: &Codepoint) => x == other,
400	Codepoints::Range(ref x: &CodepointRange) => x == &(other, other),
401	}
402	}
403	}
404
405	impl PartialEq<(u32, u32)> for Codepoints {
406	fn eq(&self, other: &(u32, u32)) -> bool {
407	match *self {
408	Codepoints::Single(ref x: &Codepoint) => &(x.value(), x.value()) == other,
409	Codepoints::Range(ref x: &CodepointRange) => x == other,
410	}
411	}
412	}
413
414	impl PartialEq<(Codepoint, Codepoint)> for Codepoints {
415	fn eq(&self, other: &(Codepoint, Codepoint)) -> bool {
416	match *self {
417	Codepoints::Single(ref x: &Codepoint) => &(x, x) == other,
418	Codepoints::Range(ref x: &CodepointRange) => x == other,
419	}
420	}
421	}
422
423	/// A range of Unicode codepoints. The range is inclusive; both ends of the
424	/// range are guaranteed to be valid codepoints.
425	#[derive(
426	Clone, Copy, Debug, Default, Eq, Hash, PartialEq, PartialOrd, Ord,
427	)]
428	pub struct CodepointRange {
429	/// The start of the codepoint range.
430	pub start: Codepoint,
431	/// The end of the codepoint range.
432	pub end: Codepoint,
433	}
434
435	impl IntoIterator for CodepointRange {
436	type IntoIter = CodepointIter;
437	type Item = Codepoint;
438
439	fn into_iter(self) -> CodepointIter {
440	CodepointIter { next: self.start.value(), range: self }
441	}
442	}
443
444	impl FromStr for CodepointRange {
445	type Err = Error;
446
447	fn from_str(s: &str) -> Result<CodepointRange, Error> {
448	let re_parts: &Regex = regex!(r"^(?P<start>[A-Z0-9]+)\.\.(?P<end>[A-Z0-9]+)$");
449	let caps: Captures<'_> = match re_parts.captures(haystack:s) {
450	Some(caps: Captures<'_>) => caps,
451	None => return err!("invalid codepoint range: '{}'", s),
452	};
453	let start: Codepoint = caps["start"].parse().or_else(\|err: Error\| {
454	err!("failed to parse '{}' as a codepoint range: {}", s, err)
455	})?;
456	let end: Codepoint = caps["end"].parse().or_else(\|err: Error\| {
457	err!("failed to parse '{}' as a codepoint range: {}", s, err)
458	})?;
459	Ok(CodepointRange { start, end })
460	}
461	}
462
463	impl fmt::Display for CodepointRange {
464	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
465	write!(f, "{}..{}", self.start, self.end)
466	}
467	}
468
469	impl PartialEq<(u32, u32)> for CodepointRange {
470	fn eq(&self, other: &(u32, u32)) -> bool {
471	&(self.start.value(), self.end.value()) == other
472	}
473	}
474
475	impl PartialEq<(Codepoint, Codepoint)> for CodepointRange {
476	fn eq(&self, other: &(Codepoint, Codepoint)) -> bool {
477	&(self.start, self.end) == other
478	}
479	}
480
/// One Unicode codepoint.
///
/// The value is guaranteed to be in the range `[0, 10FFFF]`, and its string
/// representation is a hexadecimal number.
///
/// Unlike Rust's `char` type, a `Codepoint` may be a surrogate codepoint.
#[derive(
    Clone, Copy, Debug, Default, Eq, Hash, PartialEq, PartialOrd, Ord,
)]
pub struct Codepoint(u32);
491
492	impl Codepoint {
493	/// Create a new codepoint from a `u32`.
494	///
495	/// If the given number is not a valid codepoint, then this returns an
496	/// error.
497	pub fn from_u32(n: u32) -> Result<Codepoint, Error> {
498	if n > `0x10FFFF` {
499	err!("{:x} is not a valid Unicode codepoint", n)
500	} else {
501	Ok(Codepoint(n))
502	}
503	}
504
505	/// Return the underlying `u32` codepoint value.
506	pub fn value(self) -> u32 {
507	self.0
508	}
509
510	/// Attempt to convert this codepoint to a Unicode scalar value.
511	///
512	/// If this is a surrogate codepoint, then this returns `None`.
513	pub fn scalar(self) -> Option<char> {
514	char::from_u32(self.0)
515	}
516	}
517
518	impl IntoIterator for Codepoint {
519	type IntoIter = CodepointIter;
520	type Item = Codepoint;
521
522	fn into_iter(self) -> CodepointIter {
523	let range: CodepointRange = CodepointRange { start: self, end: self };
524	CodepointIter { next: self.value(), range }
525	}
526	}
527
528	impl FromStr for Codepoint {
529	type Err = Error;
530
531	fn from_str(s: &str) -> Result<Codepoint, Error> {
532	match u32::from_str_radix(src:s, radix:`16`) {
533	Ok(n: u32) => Codepoint::from_u32(n),
534	Err(err: ParseIntError) => {
535	return err!(
536	"failed to parse '{}' as a hexadecimal codepoint: {}",
537	s,
538	err
539	);
540	}
541	}
542	}
543	}
544
545	impl fmt::Display for Codepoint {
546	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
547	write!(f, "{:`04`X}", self.0)
548	}
549	}
550
551	impl PartialEq<u32> for Codepoint {
552	fn eq(&self, other: &u32) -> bool {
553	self.0 == *other
554	}
555	}
556
557	impl PartialEq<Codepoint> for u32 {
558	fn eq(&self, other: &Codepoint) -> bool {
559	*self == other.0
560	}
561	}
562
563	/// An iterator over a range of Unicode codepoints.
564	#[derive(Debug)]
565	pub struct CodepointIter {
566	next: u32,
567	range: CodepointRange,
568	}
569
570	impl Iterator for CodepointIter {
571	type Item = Codepoint;
572
573	fn next(&mut self) -> Option<Codepoint> {
574	if self.next > self.range.end.value() {
575	return None;
576	}
577	let current: u32 = self.next;
578	self.next += `1`;
579	Some(Codepoint::from_u32(current).unwrap())
580	}
581	}
582