1 | //! Implementation of the Line Breaking Algorithm described in [Unicode Standard Annex #14][UAX14]. |
2 | //! |
3 | //! Given an input text, locates "line break opportunities", or positions appropriate for wrapping |
4 | //! lines when displaying text. |
5 | //! |
6 | //! # Example |
7 | //! |
8 | //! ``` |
9 | //! use unicode_linebreak::{linebreaks, BreakOpportunity::{Mandatory, Allowed}}; |
10 | //! |
11 | //! let text = "a b \nc" ; |
12 | //! assert!(linebreaks(text).eq([ |
13 | //! (2, Allowed), // May break after first space |
14 | //! (5, Mandatory), // Must break after line feed |
15 | //! (6, Mandatory) // Must break at end of text, so that there always is at least one LB |
16 | //! ])); |
17 | //! ``` |
18 | //! |
19 | //! [UAX14]: https://www.unicode.org/reports/tr14/ |
20 | |
21 | #![no_std ] |
22 | #![deny (missing_docs, missing_debug_implementations)] |
23 | |
24 | use core::iter::once; |
25 | |
26 | /// The [Unicode version](https://www.unicode.org/versions/) conformed to. |
27 | pub const UNICODE_VERSION: (u8, u8, u8) = (15, 0, 0); |
28 | |
29 | include!("shared.rs" ); |
30 | include!("tables.rs" ); |
31 | |
32 | /// Returns the line break property of the specified code point. |
33 | /// |
34 | /// # Examples |
35 | /// |
36 | /// ``` |
37 | /// use unicode_linebreak::{BreakClass, break_property}; |
38 | /// assert_eq!(break_property(0x2CF3), BreakClass::Alphabetic); |
39 | /// ``` |
40 | #[inline (always)] |
41 | pub fn break_property(codepoint: u32) -> BreakClass { |
42 | const BMP_INDEX_LENGTH: u32 = BMP_LIMIT >> BMP_SHIFT; |
43 | const OMITTED_BMP_INDEX_1_LENGTH: u32 = BMP_LIMIT >> SHIFT_1; |
44 | |
45 | let data_pos: u16 = if codepoint < BMP_LIMIT { |
46 | let i: u32 = codepoint >> BMP_SHIFT; |
47 | BREAK_PROP_TRIE_INDEX[i as usize] + (codepoint & (BMP_DATA_BLOCK_LENGTH - 1)) as u16 |
48 | } else if codepoint < BREAK_PROP_TRIE_HIGH_START { |
49 | let i1: u32 = codepoint >> SHIFT_1; |
50 | let i2: u16 = BREAK_PROP_TRIE_INDEX |
51 | [(i1 + BMP_INDEX_LENGTH - OMITTED_BMP_INDEX_1_LENGTH) as usize] |
52 | + ((codepoint >> SHIFT_2) & (INDEX_2_BLOCK_LENGTH - 1)) as u16; |
53 | let i3_block: u16 = BREAK_PROP_TRIE_INDEX[i2 as usize]; |
54 | let i3_pos: u16 = ((codepoint >> SHIFT_3) & (INDEX_3_BLOCK_LENGTH - 1)) as u16; |
55 | |
56 | debug_assert!(i3_block & 0x8000 == 0, "18-bit indices are unexpected" ); |
57 | let data_block: u16 = BREAK_PROP_TRIE_INDEX[(i3_block + i3_pos) as usize]; |
58 | data_block + (codepoint & (SMALL_DATA_BLOCK_LENGTH - 1)) as u16 |
59 | } else { |
60 | return XX; |
61 | }; |
62 | BREAK_PROP_TRIE_DATA[data_pos as usize] |
63 | } |
64 | |
65 | /// Break opportunity type. |
66 | #[derive (Copy, Clone, PartialEq, Eq, Debug)] |
67 | pub enum BreakOpportunity { |
68 | /// A line must break at this spot. |
69 | Mandatory, |
70 | /// A line is allowed to end at this spot. |
71 | Allowed, |
72 | } |
73 | |
74 | /// Returns an iterator over line break opportunities in the specified string. |
75 | /// |
76 | /// Break opportunities are given as tuples of the byte index of the character succeeding the break |
77 | /// and the type. |
78 | /// |
79 | /// Uses the default Line Breaking Algorithm with the tailoring that Complex-Context Dependent |
80 | /// (SA) characters get resolved to Ordinary Alphabetic and Symbol Characters (AL) regardless of |
81 | /// General_Category. |
82 | /// |
83 | /// # Examples |
84 | /// |
85 | /// ``` |
86 | /// use unicode_linebreak::{linebreaks, BreakOpportunity::{Mandatory, Allowed}}; |
87 | /// assert!(linebreaks("Hello world!" ).eq(vec![(6, Allowed), (12, Mandatory)])); |
88 | /// ``` |
89 | pub fn linebreaks(s: &str) -> impl Iterator<Item = (usize, BreakOpportunity)> + Clone + '_ { |
90 | use BreakOpportunity::{Allowed, Mandatory}; |
91 | |
92 | s.char_indices() |
93 | .map(|(i, c)| (i, break_property(c as u32) as u8)) |
94 | .chain(once((s.len(), eot))) |
95 | .scan((sot, false), |state, (i, cls)| { |
96 | // ZWJ is handled outside the table to reduce its size |
97 | let val = PAIR_TABLE[state.0 as usize][cls as usize]; |
98 | let is_mandatory = val & MANDATORY_BREAK_BIT != 0; |
99 | let is_break = val & ALLOWED_BREAK_BIT != 0 && (!state.1 || is_mandatory); |
100 | *state = ( |
101 | val & !(ALLOWED_BREAK_BIT | MANDATORY_BREAK_BIT), |
102 | cls == BreakClass::ZeroWidthJoiner as u8, |
103 | ); |
104 | |
105 | Some((i, is_break, is_mandatory)) |
106 | }) |
107 | .filter_map(|(i, is_break, is_mandatory)| { |
108 | if is_break { |
109 | Some((i, if is_mandatory { Mandatory } else { Allowed })) |
110 | } else { |
111 | None |
112 | } |
113 | }) |
114 | } |
115 | |
116 | /// Divides the string at the last index where further breaks do not depend on prior context. |
117 | /// |
118 | /// The trivial index at `eot` is excluded. |
119 | /// |
120 | /// A common optimization is to determine only the nearest line break opportunity before the first |
121 | /// character that would cause the line to become overfull, requiring backward traversal, of which |
122 | /// there are two approaches: |
123 | /// |
124 | /// * Cache breaks from forward traversals |
125 | /// * Step backward and with `split_at_safe` find a pos to safely search forward from, repeatedly |
126 | /// |
127 | /// # Examples |
128 | /// |
129 | /// ``` |
130 | /// use unicode_linebreak::{linebreaks, split_at_safe}; |
131 | /// let s = "Not allowed to break within em dashes: — —" ; |
132 | /// let (prev, safe) = split_at_safe(s); |
133 | /// let n = prev.len(); |
134 | /// assert!(linebreaks(safe).eq(linebreaks(s).filter_map(|(i, x)| i.checked_sub(n).map(|i| (i, x))))); |
135 | /// ``` |
136 | pub fn split_at_safe(s: &str) -> (&str, &str) { |
137 | let mut chars: impl Iterator = s.char_indices().rev().scan(initial_state:None, |state: &mut Option, (i: usize, c: char)| { |
138 | let cls: BreakClass = break_property(codepoint:c as u32); |
139 | let is_safe_pair: bool = state |
140 | .replace(cls) |
141 | .map_or(default:false, |prev: BreakClass| is_safe_pair(a:cls, b:prev)); // Reversed since iterating backwards |
142 | Some((i, is_safe_pair)) |
143 | }); |
144 | chars.find(|&(_, is_safe_pair: bool)| is_safe_pair); |
145 | // Include preceding char for `linebreaks` to pick up break before match (disallowed after sot) |
146 | s.split_at(mid:chars.next().map_or(default:0, |(i: usize, _)| i)) |
147 | } |
148 | |
149 | #[cfg (test)] |
150 | mod tests { |
151 | use super::*; |
152 | |
153 | #[test ] |
154 | fn it_works() { |
155 | assert_eq!(break_property(0xA), BreakClass::LineFeed); |
156 | assert_eq!(break_property(0xDB80), BreakClass::Surrogate); |
157 | assert_eq!(break_property(0xe01ef), BreakClass::CombiningMark); |
158 | assert_eq!(break_property(0x10ffff), BreakClass::Unknown); |
159 | } |
160 | } |
161 | |