| 1 | //! Implementation of the Line Breaking Algorithm described in [Unicode Standard Annex #14][UAX14]. |
| 2 | //! |
| 3 | //! Given an input text, locates "line break opportunities", or positions appropriate for wrapping |
| 4 | //! lines when displaying text. |
| 5 | //! |
| 6 | //! # Example |
| 7 | //! |
| 8 | //! ``` |
| 9 | //! use unicode_linebreak::{linebreaks, BreakOpportunity::{Mandatory, Allowed}}; |
| 10 | //! |
| 11 | //! let text = "a b \nc" ; |
| 12 | //! assert!(linebreaks(text).eq([ |
| 13 | //! (2, Allowed), // May break after first space |
| 14 | //! (5, Mandatory), // Must break after line feed |
| 15 | //! (6, Mandatory) // Must break at end of text, so that there always is at least one LB |
| 16 | //! ])); |
| 17 | //! ``` |
| 18 | //! |
| 19 | //! [UAX14]: https://www.unicode.org/reports/tr14/ |
| 20 | |
| 21 | #![no_std ] |
| 22 | #![deny (missing_docs, missing_debug_implementations)] |
| 23 | |
| 24 | use core::iter::once; |
| 25 | |
| 26 | /// The [Unicode version](https://www.unicode.org/versions/) conformed to. |
| 27 | pub const UNICODE_VERSION: (u8, u8, u8) = (15, 0, 0); |
| 28 | |
| 29 | include!("shared.rs" ); |
| 30 | include!("tables.rs" ); |
| 31 | |
| 32 | /// Returns the line break property of the specified code point. |
| 33 | /// |
| 34 | /// # Examples |
| 35 | /// |
| 36 | /// ``` |
| 37 | /// use unicode_linebreak::{BreakClass, break_property}; |
| 38 | /// assert_eq!(break_property(0x2CF3), BreakClass::Alphabetic); |
| 39 | /// ``` |
| 40 | #[inline (always)] |
| 41 | pub fn break_property(codepoint: u32) -> BreakClass { |
| 42 | const BMP_INDEX_LENGTH: u32 = BMP_LIMIT >> BMP_SHIFT; |
| 43 | const OMITTED_BMP_INDEX_1_LENGTH: u32 = BMP_LIMIT >> SHIFT_1; |
| 44 | |
| 45 | let data_pos: u16 = if codepoint < BMP_LIMIT { |
| 46 | let i: u32 = codepoint >> BMP_SHIFT; |
| 47 | BREAK_PROP_TRIE_INDEX[i as usize] + (codepoint & (BMP_DATA_BLOCK_LENGTH - 1)) as u16 |
| 48 | } else if codepoint < BREAK_PROP_TRIE_HIGH_START { |
| 49 | let i1: u32 = codepoint >> SHIFT_1; |
| 50 | let i2: u16 = BREAK_PROP_TRIE_INDEX |
| 51 | [(i1 + BMP_INDEX_LENGTH - OMITTED_BMP_INDEX_1_LENGTH) as usize] |
| 52 | + ((codepoint >> SHIFT_2) & (INDEX_2_BLOCK_LENGTH - 1)) as u16; |
| 53 | let i3_block: u16 = BREAK_PROP_TRIE_INDEX[i2 as usize]; |
| 54 | let i3_pos: u16 = ((codepoint >> SHIFT_3) & (INDEX_3_BLOCK_LENGTH - 1)) as u16; |
| 55 | |
| 56 | debug_assert!(i3_block & 0x8000 == 0, "18-bit indices are unexpected" ); |
| 57 | let data_block: u16 = BREAK_PROP_TRIE_INDEX[(i3_block + i3_pos) as usize]; |
| 58 | data_block + (codepoint & (SMALL_DATA_BLOCK_LENGTH - 1)) as u16 |
| 59 | } else { |
| 60 | return XX; |
| 61 | }; |
| 62 | BREAK_PROP_TRIE_DATA[data_pos as usize] |
| 63 | } |
| 64 | |
| 65 | /// Break opportunity type. |
| 66 | #[derive (Copy, Clone, PartialEq, Eq, Debug)] |
| 67 | pub enum BreakOpportunity { |
| 68 | /// A line must break at this spot. |
| 69 | Mandatory, |
| 70 | /// A line is allowed to end at this spot. |
| 71 | Allowed, |
| 72 | } |
| 73 | |
| 74 | /// Returns an iterator over line break opportunities in the specified string. |
| 75 | /// |
| 76 | /// Break opportunities are given as tuples of the byte index of the character succeeding the break |
| 77 | /// and the type. |
| 78 | /// |
| 79 | /// Uses the default Line Breaking Algorithm with the tailoring that Complex-Context Dependent |
| 80 | /// (SA) characters get resolved to Ordinary Alphabetic and Symbol Characters (AL) regardless of |
| 81 | /// General_Category. |
| 82 | /// |
| 83 | /// # Examples |
| 84 | /// |
| 85 | /// ``` |
| 86 | /// use unicode_linebreak::{linebreaks, BreakOpportunity::{Mandatory, Allowed}}; |
| 87 | /// assert!(linebreaks("Hello world!" ).eq(vec![(6, Allowed), (12, Mandatory)])); |
| 88 | /// ``` |
| 89 | pub fn linebreaks(s: &str) -> impl Iterator<Item = (usize, BreakOpportunity)> + Clone + '_ { |
| 90 | use BreakOpportunity::{Allowed, Mandatory}; |
| 91 | |
| 92 | s.char_indices() |
| 93 | .map(|(i, c)| (i, break_property(c as u32) as u8)) |
| 94 | .chain(once((s.len(), eot))) |
| 95 | .scan((sot, false), |state, (i, cls)| { |
| 96 | // ZWJ is handled outside the table to reduce its size |
| 97 | let val = PAIR_TABLE[state.0 as usize][cls as usize]; |
| 98 | let is_mandatory = val & MANDATORY_BREAK_BIT != 0; |
| 99 | let is_break = val & ALLOWED_BREAK_BIT != 0 && (!state.1 || is_mandatory); |
| 100 | *state = ( |
| 101 | val & !(ALLOWED_BREAK_BIT | MANDATORY_BREAK_BIT), |
| 102 | cls == BreakClass::ZeroWidthJoiner as u8, |
| 103 | ); |
| 104 | |
| 105 | Some((i, is_break, is_mandatory)) |
| 106 | }) |
| 107 | .filter_map(|(i, is_break, is_mandatory)| { |
| 108 | if is_break { |
| 109 | Some((i, if is_mandatory { Mandatory } else { Allowed })) |
| 110 | } else { |
| 111 | None |
| 112 | } |
| 113 | }) |
| 114 | } |
| 115 | |
| 116 | /// Divides the string at the last index where further breaks do not depend on prior context. |
| 117 | /// |
| 118 | /// The trivial index at `eot` is excluded. |
| 119 | /// |
| 120 | /// A common optimization is to determine only the nearest line break opportunity before the first |
| 121 | /// character that would cause the line to become overfull, requiring backward traversal, of which |
| 122 | /// there are two approaches: |
| 123 | /// |
| 124 | /// * Cache breaks from forward traversals |
| 125 | /// * Step backward and with `split_at_safe` find a pos to safely search forward from, repeatedly |
| 126 | /// |
| 127 | /// # Examples |
| 128 | /// |
| 129 | /// ``` |
| 130 | /// use unicode_linebreak::{linebreaks, split_at_safe}; |
| 131 | /// let s = "Not allowed to break within em dashes: — —" ; |
| 132 | /// let (prev, safe) = split_at_safe(s); |
| 133 | /// let n = prev.len(); |
| 134 | /// assert!(linebreaks(safe).eq(linebreaks(s).filter_map(|(i, x)| i.checked_sub(n).map(|i| (i, x))))); |
| 135 | /// ``` |
| 136 | pub fn split_at_safe(s: &str) -> (&str, &str) { |
| 137 | let mut chars: impl Iterator = s.char_indices().rev().scan(initial_state:None, |state: &mut Option, (i: usize, c: char)| { |
| 138 | let cls: BreakClass = break_property(codepoint:c as u32); |
| 139 | let is_safe_pair: bool = state |
| 140 | .replace(cls) |
| 141 | .map_or(default:false, |prev: BreakClass| is_safe_pair(a:cls, b:prev)); // Reversed since iterating backwards |
| 142 | Some((i, is_safe_pair)) |
| 143 | }); |
| 144 | chars.find(|&(_, is_safe_pair: bool)| is_safe_pair); |
| 145 | // Include preceding char for `linebreaks` to pick up break before match (disallowed after sot) |
| 146 | s.split_at(mid:chars.next().map_or(default:0, |(i: usize, _)| i)) |
| 147 | } |
| 148 | |
| 149 | #[cfg (test)] |
| 150 | mod tests { |
| 151 | use super::*; |
| 152 | |
| 153 | #[test ] |
| 154 | fn it_works() { |
| 155 | assert_eq!(break_property(0xA), BreakClass::LineFeed); |
| 156 | assert_eq!(break_property(0xDB80), BreakClass::Surrogate); |
| 157 | assert_eq!(break_property(0xe01ef), BreakClass::CombiningMark); |
| 158 | assert_eq!(break_property(0x10ffff), BreakClass::Unknown); |
| 159 | } |
| 160 | } |
| 161 | |