lib.rs source code [crates/unicode_linebreak/src/lib.rs]

1	//! Implementation of the Line Breaking Algorithm described in [Unicode Standard Annex #14][UAX14].
2	//!
3	//! Given an input text, locates "line break opportunities", or positions appropriate for wrapping
4	//! lines when displaying text.
5	//!
6	//! # Example
7	//!
8	//! ```
9	//! use unicode_linebreak::{linebreaks, BreakOpportunity::{Mandatory, Allowed}};
10	//!
11	//! let text = "a b `\n`c";
12	//! assert!(linebreaks(text).eq([
13	//! (`2`, Allowed), // May break after first space
14	//! (`5`, Mandatory), // Must break after line feed
15	//! (`6`, Mandatory) // Must break at end of text, so that there always is at least one LB
16	//! ]));
17	//! ```
18	//!
19	//! [UAX14]: https://www.unicode.org/reports/tr14/
20
21	#![no_std]
22	#![deny(missing_docs, missing_debug_implementations)]
23
24	use core::iter::once;
25
26	/// The [Unicode version](https://www.unicode.org/versions/) conformed to.
27	pub const UNICODE_VERSION: (u8, u8, u8) = (`15`, `0`, `0`);
28
29	include!("shared.rs");
30	include!("tables.rs");
31
32	/// Returns the line break property of the specified code point.
33	///
34	/// # Examples
35	///
36	/// ```
37	/// use unicode_linebreak::{BreakClass, break_property};
38	/// assert_eq!(break_property(`0x2CF3`), BreakClass::Alphabetic);
39	/// ```
40	#[inline(always)]
41	pub fn break_property(codepoint: u32) -> BreakClass {
42	const BMP_INDEX_LENGTH: u32 = BMP_LIMIT >> BMP_SHIFT;
43	const OMITTED_BMP_INDEX_1_LENGTH: u32 = BMP_LIMIT >> SHIFT_1;
44
45	let data_pos: u16 = if codepoint < BMP_LIMIT {
46	let i: u32 = codepoint >> BMP_SHIFT;
47	BREAK_PROP_TRIE_INDEX[i as usize] + (codepoint & (BMP_DATA_BLOCK_LENGTH - `1`)) as u16
48	} else if codepoint < BREAK_PROP_TRIE_HIGH_START {
49	let i1: u32 = codepoint >> SHIFT_1;
50	let i2: u16 = BREAK_PROP_TRIE_INDEX
51	[(i1 + BMP_INDEX_LENGTH - OMITTED_BMP_INDEX_1_LENGTH) as usize]
52	+ ((codepoint >> SHIFT_2) & (INDEX_2_BLOCK_LENGTH - `1`)) as u16;
53	let i3_block: u16 = BREAK_PROP_TRIE_INDEX[i2 as usize];
54	let i3_pos: u16 = ((codepoint >> SHIFT_3) & (INDEX_3_BLOCK_LENGTH - `1`)) as u16;
55
56	debug_assert!(i3_block & `0x8000` == `0`, "18-bit indices are unexpected");
57	let data_block: u16 = BREAK_PROP_TRIE_INDEX[(i3_block + i3_pos) as usize];
58	data_block + (codepoint & (SMALL_DATA_BLOCK_LENGTH - `1`)) as u16
59	} else {
60	return XX;
61	};
62	BREAK_PROP_TRIE_DATA[data_pos as usize]
63	}
64
65	/// Break opportunity type.
66	#[derive(Copy, Clone, PartialEq, Eq, Debug)]
67	pub enum BreakOpportunity {
68	/// A line must break at this spot.
69	Mandatory,
70	/// A line is allowed to end at this spot.
71	Allowed,
72	}
73
74	/// Returns an iterator over line break opportunities in the specified string.
75	///
76	/// Break opportunities are given as tuples of the byte index of the character succeeding the break
77	/// and the type.
78	///
79	/// Uses the default Line Breaking Algorithm with the tailoring that Complex-Context Dependent
80	/// (SA) characters get resolved to Ordinary Alphabetic and Symbol Characters (AL) regardless of
81	/// General_Category.
82	///
83	/// # Examples
84	///
85	/// ```
86	/// use unicode_linebreak::{linebreaks, BreakOpportunity::{Mandatory, Allowed}};
87	/// assert!(linebreaks("Hello world!").eq(vec![(`6`, Allowed), (`12`, Mandatory)]));
88	/// ```
89	pub fn linebreaks(s: &str) -> impl Iterator<Item = (usize, BreakOpportunity)> + Clone + '_ {
90	use BreakOpportunity::{Allowed, Mandatory};
91
92	s.char_indices()
93	.map(\|(i, c)\| (i, break_property(c as u32) as u8))
94	.chain(once((s.len(), eot)))
95	.scan((sot, `false`), \|state, (i, cls)\| {
96	// ZWJ is handled outside the table to reduce its size
97	let val = PAIR_TABLE[state.0 as usize][cls as usize];
98	let is_mandatory = val & MANDATORY_BREAK_BIT != `0`;
99	let is_break = val & ALLOWED_BREAK_BIT != `0` && (!state.1 \|\| is_mandatory);
100	*state = (
101	val & !(ALLOWED_BREAK_BIT \| MANDATORY_BREAK_BIT),
102	cls == BreakClass::ZeroWidthJoiner as u8,
103	);
104
105	Some((i, is_break, is_mandatory))
106	})
107	.filter_map(\|(i, is_break, is_mandatory)\| {
108	if is_break {
109	Some((i, if is_mandatory { Mandatory } else { Allowed }))
110	} else {
111	None
112	}
113	})
114	}
115
116	/// Divides the string at the last index where further breaks do not depend on prior context.
117	///
118	/// The trivial index at `eot` is excluded.
119	///
120	/// A common optimization is to determine only the nearest line break opportunity before the first
121	/// character that would cause the line to become overfull, requiring backward traversal, of which
122	/// there are two approaches:
123	///
124	/// Cache breaks from forward traversals*
125	/// Step backward and with `split_at_safe` find a pos to safely search forward from, repeatedly*
126	///
127	/// # Examples
128	///
129	/// ```
130	/// use unicode_linebreak::{linebreaks, split_at_safe};
131	/// let s = "Not allowed to break within em dashes: — —";
132	/// let (prev, safe) = split_at_safe(s);
133	/// let n = prev.len();
134	/// assert!(linebreaks(safe).eq(linebreaks(s).filter_map(\|(i, x)\| i.checked_sub(n).map(\|i\| (i, x)))));
135	/// ```
136	pub fn split_at_safe(s: &str) -> (&str, &str) {
137	let mut chars: impl Iterator = s.char_indices().rev().scan(initial_state:None, \|state: &mut Option, (i: usize, c: char)\| {
138	let cls: BreakClass = break_property(codepoint:c as u32);
139	let is_safe_pair: bool = state
140	.replace(cls)
141	.map_or(default:`false`, \|prev: BreakClass\| is_safe_pair(a:cls, b:prev)); // Reversed since iterating backwards
142	Some((i, is_safe_pair))
143	});
144	chars.find(\|&(_, is_safe_pair: bool)\| is_safe_pair);
145	// Include preceding char for `linebreaks` to pick up break before match (disallowed after sot)
146	s.split_at(mid:chars.next().map_or(default:`0`, \|(i: usize, _)\| i))
147	}
148
149	#[cfg(test)]
150	mod tests {
151	use super::*;
152
153	#[test]
154	fn it_works() {
155	assert_eq!(break_property(`0xA`), BreakClass::LineFeed);
156	assert_eq!(break_property(`0xDB80`), BreakClass::Surrogate);
157	assert_eq!(break_property(`0xe01ef`), BreakClass::CombiningMark);
158	assert_eq!(break_property(`0x10ffff`), BreakClass::Unknown);
159	}
160	}
161