1//! Implementation of the Line Breaking Algorithm described in [Unicode Standard Annex #14][UAX14].
2//!
3//! Given an input text, locates "line break opportunities", or positions appropriate for wrapping
4//! lines when displaying text.
5//!
6//! # Example
7//!
8//! ```
9//! use unicode_linebreak::{linebreaks, BreakOpportunity::{Mandatory, Allowed}};
10//!
11//! let text = "a b \nc";
12//! assert!(linebreaks(text).eq([
13//! (2, Allowed), // May break after first space
14//! (5, Mandatory), // Must break after line feed
15//! (6, Mandatory) // Must break at end of text, so that there always is at least one LB
16//! ]));
17//! ```
18//!
19//! [UAX14]: https://www.unicode.org/reports/tr14/
20
21#![no_std]
22#![deny(missing_docs, missing_debug_implementations)]
23
24use core::iter::once;
25
26/// The [Unicode version](https://www.unicode.org/versions/) conformed to.
27pub const UNICODE_VERSION: (u8, u8, u8) = (15, 0, 0);
28
29include!("shared.rs");
30include!("tables.rs");
31
32/// Returns the line break property of the specified code point.
33///
34/// # Examples
35///
36/// ```
37/// use unicode_linebreak::{BreakClass, break_property};
38/// assert_eq!(break_property(0x2CF3), BreakClass::Alphabetic);
39/// ```
40#[inline(always)]
41pub fn break_property(codepoint: u32) -> BreakClass {
42 const BMP_INDEX_LENGTH: u32 = BMP_LIMIT >> BMP_SHIFT;
43 const OMITTED_BMP_INDEX_1_LENGTH: u32 = BMP_LIMIT >> SHIFT_1;
44
45 let data_pos: u16 = if codepoint < BMP_LIMIT {
46 let i: u32 = codepoint >> BMP_SHIFT;
47 BREAK_PROP_TRIE_INDEX[i as usize] + (codepoint & (BMP_DATA_BLOCK_LENGTH - 1)) as u16
48 } else if codepoint < BREAK_PROP_TRIE_HIGH_START {
49 let i1: u32 = codepoint >> SHIFT_1;
50 let i2: u16 = BREAK_PROP_TRIE_INDEX
51 [(i1 + BMP_INDEX_LENGTH - OMITTED_BMP_INDEX_1_LENGTH) as usize]
52 + ((codepoint >> SHIFT_2) & (INDEX_2_BLOCK_LENGTH - 1)) as u16;
53 let i3_block: u16 = BREAK_PROP_TRIE_INDEX[i2 as usize];
54 let i3_pos: u16 = ((codepoint >> SHIFT_3) & (INDEX_3_BLOCK_LENGTH - 1)) as u16;
55
56 debug_assert!(i3_block & 0x8000 == 0, "18-bit indices are unexpected");
57 let data_block: u16 = BREAK_PROP_TRIE_INDEX[(i3_block + i3_pos) as usize];
58 data_block + (codepoint & (SMALL_DATA_BLOCK_LENGTH - 1)) as u16
59 } else {
60 return XX;
61 };
62 BREAK_PROP_TRIE_DATA[data_pos as usize]
63}
64
65/// Break opportunity type.
66#[derive(Copy, Clone, PartialEq, Eq, Debug)]
67pub enum BreakOpportunity {
68 /// A line must break at this spot.
69 Mandatory,
70 /// A line is allowed to end at this spot.
71 Allowed,
72}
73
74/// Returns an iterator over line break opportunities in the specified string.
75///
76/// Break opportunities are given as tuples of the byte index of the character succeeding the break
77/// and the type.
78///
79/// Uses the default Line Breaking Algorithm with the tailoring that Complex-Context Dependent
80/// (SA) characters get resolved to Ordinary Alphabetic and Symbol Characters (AL) regardless of
81/// General_Category.
82///
83/// # Examples
84///
85/// ```
86/// use unicode_linebreak::{linebreaks, BreakOpportunity::{Mandatory, Allowed}};
87/// assert!(linebreaks("Hello world!").eq(vec![(6, Allowed), (12, Mandatory)]));
88/// ```
89pub fn linebreaks(s: &str) -> impl Iterator<Item = (usize, BreakOpportunity)> + Clone + '_ {
90 use BreakOpportunity::{Allowed, Mandatory};
91
92 s.char_indices()
93 .map(|(i, c)| (i, break_property(c as u32) as u8))
94 .chain(once((s.len(), eot)))
95 .scan((sot, false), |state, (i, cls)| {
96 // ZWJ is handled outside the table to reduce its size
97 let val = PAIR_TABLE[state.0 as usize][cls as usize];
98 let is_mandatory = val & MANDATORY_BREAK_BIT != 0;
99 let is_break = val & ALLOWED_BREAK_BIT != 0 && (!state.1 || is_mandatory);
100 *state = (
101 val & !(ALLOWED_BREAK_BIT | MANDATORY_BREAK_BIT),
102 cls == BreakClass::ZeroWidthJoiner as u8,
103 );
104
105 Some((i, is_break, is_mandatory))
106 })
107 .filter_map(|(i, is_break, is_mandatory)| {
108 if is_break {
109 Some((i, if is_mandatory { Mandatory } else { Allowed }))
110 } else {
111 None
112 }
113 })
114}
115
116/// Divides the string at the last index where further breaks do not depend on prior context.
117///
118/// The trivial index at `eot` is excluded.
119///
120/// A common optimization is to determine only the nearest line break opportunity before the first
121/// character that would cause the line to become overfull, requiring backward traversal, of which
122/// there are two approaches:
123///
124/// * Cache breaks from forward traversals
125/// * Step backward and with `split_at_safe` find a pos to safely search forward from, repeatedly
126///
127/// # Examples
128///
129/// ```
130/// use unicode_linebreak::{linebreaks, split_at_safe};
131/// let s = "Not allowed to break within em dashes: — —";
132/// let (prev, safe) = split_at_safe(s);
133/// let n = prev.len();
134/// assert!(linebreaks(safe).eq(linebreaks(s).filter_map(|(i, x)| i.checked_sub(n).map(|i| (i, x)))));
135/// ```
136pub fn split_at_safe(s: &str) -> (&str, &str) {
137 let mut chars: impl Iterator = s.char_indices().rev().scan(initial_state:None, |state: &mut Option, (i: usize, c: char)| {
138 let cls: BreakClass = break_property(codepoint:c as u32);
139 let is_safe_pair: bool = state
140 .replace(cls)
141 .map_or(default:false, |prev: BreakClass| is_safe_pair(a:cls, b:prev)); // Reversed since iterating backwards
142 Some((i, is_safe_pair))
143 });
144 chars.find(|&(_, is_safe_pair: bool)| is_safe_pair);
145 // Include preceding char for `linebreaks` to pick up break before match (disallowed after sot)
146 s.split_at(mid:chars.next().map_or(default:0, |(i: usize, _)| i))
147}
148
149#[cfg(test)]
150mod tests {
151 use super::*;
152
153 #[test]
154 fn it_works() {
155 assert_eq!(break_property(0xA), BreakClass::LineFeed);
156 assert_eq!(break_property(0xDB80), BreakClass::Surrogate);
157 assert_eq!(break_property(0xe01ef), BreakClass::CombiningMark);
158 assert_eq!(break_property(0x10ffff), BreakClass::Unknown);
159 }
160}
161