sentence_break.rs source code [crates/ucd_parse/src/sentence_break.rs]

1	use std::path::Path;
2
3	use crate::{
4	common::{
5	parse_break_test, parse_codepoint_association, CodepointIter,
6	Codepoints, UcdFile, UcdFileByCodepoint,
7	},
8	error::Error,
9	};
10
11	/// A single row in the `auxiliary/SentenceBreakProperty.txt` file.
12	#[derive(Clone, Debug, Default, Eq, PartialEq)]
13	pub struct SentenceBreak {
14	/// The codepoint or codepoint range for this entry.
15	pub codepoints: Codepoints,
16	/// The property value assigned to the codepoints in this entry.
17	pub value: String,
18	}
19
20	impl UcdFile for SentenceBreak {
21	fn relative_file_path() -> &'static Path {
22	Path::new("auxiliary/SentenceBreakProperty.txt")
23	}
24	}
25
26	impl UcdFileByCodepoint for SentenceBreak {
27	fn codepoints(&self) -> CodepointIter {
28	self.codepoints.into_iter()
29	}
30	}
31
32	impl std::str::FromStr for SentenceBreak {
33	type Err = Error;
34
35	fn from_str(line: &str) -> Result<SentenceBreak, Error> {
36	let (codepoints: Codepoints, value) = parse_codepoint_association(line)?;
37	Ok(SentenceBreak { codepoints, value: value.to_string() })
38	}
39	}
40
/// One parsed entry from the `auxiliary/SentenceBreakTest.txt` file.
///
/// The entries of that file are test cases for the sentence break algorithm.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct SentenceBreakTest {
    /// The sentences of this test case, where each string is the UTF-8
    /// encoding of the group of codepoints forming a single sentence.
    pub sentences: Vec<String>,
    /// A human readable description of this test case.
    pub comment: String,
}
52
53	impl UcdFile for SentenceBreakTest {
54	fn relative_file_path() -> &'static Path {
55	Path::new("auxiliary/SentenceBreakTest.txt")
56	}
57	}
58
59	impl std::str::FromStr for SentenceBreakTest {
60	type Err = Error;
61
62	fn from_str(line: &str) -> Result<SentenceBreakTest, Error> {
63	let (groups, comment) = parse_break_test(line)?;
64	Ok(SentenceBreakTest { sentences: groups, comment })
65	}
66	}
67
#[cfg(test)]
mod tests {
    use super::{SentenceBreak, SentenceBreakTest};

    /// A property row naming a single codepoint yields that codepoint and
    /// its property value.
    #[test]
    fn parse_single() {
        let line = "11445 ; Extend # Mc NEWA SIGN VISARGA\n";
        let row: SentenceBreak = line.parse().unwrap();
        assert_eq!(row.codepoints, 0x11445);
        assert_eq!(row.value, "Extend");
    }

    /// A property row naming a codepoint range yields both endpoints of the
    /// range and its property value.
    #[test]
    fn parse_range() {
        let line = "FE31..FE32 ; SContinue # Pd [2] PRESENTATION FORM FOR VERTICAL EM DASH..PRESENTATION FORM FOR VERTICAL EN DASH\n";
        let row: SentenceBreak = line.parse().unwrap();
        assert_eq!(row.codepoints, (0xFE31, 0xFE32));
        assert_eq!(row.value, "SContinue");
    }

    /// A break test row is split into one string per sentence, and the
    /// trailing description is kept as the comment.
    #[test]
    fn parse_test() {
        let line = "÷ 2060 × 5B57 × 2060 × 002E × 2060 ÷ 5B57 × 2060 × 2060 ÷ # ÷ [0.2] WORD JOINER (Format_FE) × [998.0] CJK UNIFIED IDEOGRAPH-5B57 (OLetter) × [5.0] WORD JOINER (Format_FE) × [998.0] FULL STOP (ATerm) × [5.0] WORD JOINER (Format_FE) ÷ [11.0] CJK UNIFIED IDEOGRAPH-5B57 (OLetter) × [5.0] WORD JOINER (Format_FE) × [5.0] WORD JOINER (Format_FE) ÷ [0.3]";

        let row: SentenceBreakTest = line.parse().unwrap();
        assert_eq!(
            row.sentences,
            vec![
                "\u{2060}\u{5B57}\u{2060}\u{002E}\u{2060}",
                "\u{5B57}\u{2060}\u{2060}",
            ]
        );
        assert!(row.comment.contains("[5.0] WORD JOINER (Format_FE)"));
    }
}
103