1// Copyright 2015 The Servo Project Developers. See the
2// COPYRIGHT file at the top-level directory of this distribution.
3//
4// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7// option. This file may not be copied, modified, or distributed
8// except according to those terms.
9
10//! This crate implements the [Unicode Bidirectional Algorithm][tr9] for display of mixed
11//! right-to-left and left-to-right text. It is written in safe Rust, compatible with the
12//! current stable release.
13//!
14//! ## Example
15//!
16//! ```rust
17//! # #[cfg(feature = "hardcoded-data")] {
18//! use unicode_bidi::BidiInfo;
19//!
20//! // This example text is defined using `concat!` because some browsers
21//! // and text editors have trouble displaying bidi strings.
22//! let text = concat![
23//! "א",
24//! "ב",
25//! "ג",
26//! "a",
27//! "b",
28//! "c",
29//! ];
30//!
31//! // Resolve embedding levels within the text. Pass `None` to detect the
32//! // paragraph level automatically.
33//! let bidi_info = BidiInfo::new(&text, None);
34//!
35//! // This paragraph has embedding level 1 because its first strong character is RTL.
36//! assert_eq!(bidi_info.paragraphs.len(), 1);
37//! let para = &bidi_info.paragraphs[0];
38//! assert_eq!(para.level.number(), 1);
39//! assert_eq!(para.level.is_rtl(), true);
40//!
41//! // Re-ordering is done after wrapping each paragraph into a sequence of
42//! // lines. For this example, I'll just use a single line that spans the
43//! // entire paragraph.
44//! let line = para.range.clone();
45//!
46//! let display = bidi_info.reorder_line(para, line);
47//! assert_eq!(display, concat![
48//! "a",
49//! "b",
50//! "c",
51//! "ג",
52//! "ב",
53//! "א",
54//! ]);
55//! # } // feature = "hardcoded-data"
56//! ```
57//!
58//! # Features
59//!
60//! - `std`: Enabled by default, but can be disabled to make `unicode_bidi`
61//! `#![no_std]` + `alloc` compatible.
62//! - `hardcoded-data`: Enabled by default. Includes hardcoded Unicode bidi data and more convenient APIs.
63//! - `serde`: Adds [`serde::Serialize`] and [`serde::Deserialize`]
64//! implementations to relevant types.
65//!
66//! [tr9]: <http://www.unicode.org/reports/tr9/>
67
68#![no_std]
69// We need to link to std to make doc tests work on older Rust versions
70#[cfg(feature = "std")]
71extern crate std;
72#[macro_use]
73extern crate alloc;
74#[cfg(feature = "smallvec")]
75extern crate smallvec;
76
77pub mod data_source;
78pub mod deprecated;
79pub mod format_chars;
80pub mod level;
81pub mod utf16;
82
83mod char_data;
84mod explicit;
85mod implicit;
86mod prepare;
87
88pub use crate::char_data::{BidiClass, UNICODE_VERSION};
89pub use crate::data_source::BidiDataSource;
90pub use crate::level::{Level, LTR_LEVEL, RTL_LEVEL};
91pub use crate::prepare::{LevelRun, LevelRunVec};
92
93#[cfg(feature = "hardcoded-data")]
94pub use crate::char_data::{bidi_class, HardcodedBidiData};
95
96use alloc::borrow::Cow;
97use alloc::string::String;
98use alloc::vec::Vec;
99use core::char;
100use core::cmp;
101use core::iter::repeat;
102use core::ops::Range;
103use core::str::CharIndices;
104#[cfg(feature = "smallvec")]
105use smallvec::SmallVec;
106
107use crate::format_chars as chars;
108use crate::BidiClass::*;
109
/// Trait that abstracts over a text source for use by the bidi algorithms.
///
/// We implement this for `str` (UTF-8) and for `[u16]` (UTF-16, native-endian).
/// All indices and lengths exposed by this trait are measured in *code units* of the
/// underlying encoding, not in characters.
///
/// (For internal unicode-bidi use; API may be unstable.)
/// This trait is sealed and cannot be implemented for types outside this crate.
pub trait TextSource<'text>: private::Sealed {
    /// Iterator type returned by [`TextSource::chars`].
    type CharIter: Iterator<Item = char>;
    /// Iterator type returned by [`TextSource::char_indices`]; yields `(index, char)` pairs.
    type CharIndexIter: Iterator<Item = (usize, char)>;
    /// Iterator type returned by [`TextSource::indices_lengths`]; yields `(index, length)` pairs.
    type IndexLenIter: Iterator<Item = (usize, usize)>;

    /// Return the length of the text in code units.
    #[doc(hidden)]
    fn len(&self) -> usize;

    /// Get the character at a given code unit index, along with its length in code units.
    ///
    /// Returns `None` if `index` is out of range, or points inside a multi-code-unit character.
    /// Returns REPLACEMENT_CHARACTER for any unpaired surrogates in UTF-16.
    #[doc(hidden)]
    fn char_at(&self, index: usize) -> Option<(char, usize)>;

    /// Return a subrange of the text, indexed by code units.
    ///
    /// (We don't implement all of the `Index` trait, just the minimum we use.)
    #[doc(hidden)]
    fn subrange(&self, range: Range<usize>) -> &Self;

    /// An iterator over the text returning Unicode characters,
    /// with REPLACEMENT_CHAR substituted for invalid code units.
    #[doc(hidden)]
    fn chars(&'text self) -> Self::CharIter;

    /// An iterator over the text returning `(index, char)` tuples,
    /// where `index` is the starting code-unit index of the character,
    /// and `char` is its Unicode value (or REPLACEMENT_CHAR if invalid).
    #[doc(hidden)]
    fn char_indices(&'text self) -> Self::CharIndexIter;

    /// An iterator over the text returning `(index, length)` tuples,
    /// where `index` is the starting code-unit index of the character,
    /// and `length` is its length in code units.
    #[doc(hidden)]
    fn indices_lengths(&'text self) -> Self::IndexLenIter;

    /// Number of code units the given character uses in this encoding.
    ///
    /// This is an associated function (no `self`): the answer depends only on the
    /// encoding and the character, not on any particular text.
    #[doc(hidden)]
    fn char_len(ch: char) -> usize;
}
155
/// Private module backing the sealed-trait pattern used by `TextSource`.
///
/// The module itself is not `pub`, so code outside this crate cannot name `Sealed`
/// and therefore cannot implement any trait that has it as a supertrait.
mod private {
    /// Marker supertrait; it has no methods and exists only to restrict implementors.
    pub trait Sealed {}

    // Implement for str and [u16] only.
    impl Sealed for str {}
    impl Sealed for [u16] {}
}
163
/// Overall direction of a piece of text, as reported by
/// `ParagraphBidiInfo::direction()`.
///
/// NOTE(review): the exact rule that picks a variant lives in `para_direction`, which
/// is defined elsewhere; the variant descriptions below are the presumed meaning and
/// should be confirmed against it.
#[derive(PartialEq, Debug)]
pub enum Direction {
    /// Left-to-right (presumably: every resolved level is LTR).
    Ltr,
    /// Right-to-left (presumably: every resolved level is RTL).
    Rtl,
    /// A mixture of left-to-right and right-to-left (presumably: both kinds of level occur).
    Mixed,
}
170
/// Bidi information about a single paragraph: where it lies in the source text and
/// its paragraph embedding level.
#[derive(Clone, Debug, PartialEq)]
pub struct ParagraphInfo {
    /// The paragraphs boundaries within the text, as byte indices.
    ///
    /// The range is half-open (`start..end`). When paragraphs are produced by splitting
    /// on paragraph separators, the separator is kept with the paragraph it terminates,
    /// so it falls inside this range.
    ///
    /// TODO: Shrink this to only include the starting index?
    pub range: Range<usize>,

    /// The paragraph embedding level.
    ///
    /// <http://www.unicode.org/reports/tr9/#BD4>
    pub level: Level,
}
184
185impl ParagraphInfo {
186 /// Gets the length of the paragraph in the source text.
187 pub fn len(&self) -> usize {
188 self.range.end - self.range.start
189 }
190}
191
/// Initial bidi information of the text.
///
/// Contains the text paragraphs and `BidiClass` of its characters. This is the result
/// of the first pass over the text only; it does not contain resolved embedding levels
/// (see `BidiInfo` for those).
#[derive(PartialEq, Debug)]
pub struct InitialInfo<'text> {
    /// The text
    pub text: &'text str,

    /// The BidiClass of the character at each byte in the text.
    /// If a character is multiple bytes, its class will appear multiple times in the vector.
    ///
    /// The vector therefore has the same length as `text` in bytes and can be indexed
    /// with byte offsets into `text`.
    pub original_classes: Vec<BidiClass>,

    /// The boundaries and level of each paragraph within the text.
    pub paragraphs: Vec<ParagraphInfo>,
}
207
208impl<'text> InitialInfo<'text> {
209 /// Find the paragraphs and BidiClasses in a string of text.
210 ///
211 /// <http://www.unicode.org/reports/tr9/#The_Paragraph_Level>
212 ///
213 /// Also sets the class for each First Strong Isolate initiator (FSI) to LRI or RLI if a strong
214 /// character is found before the matching PDI. If no strong character is found, the class will
215 /// remain FSI, and it's up to later stages to treat these as LRI when needed.
216 ///
217 /// The `hardcoded-data` Cargo feature (enabled by default) must be enabled to use this.
218 #[cfg_attr(feature = "flame_it", flamer::flame)]
219 #[cfg(feature = "hardcoded-data")]
220 pub fn new(text: &str, default_para_level: Option<Level>) -> InitialInfo<'_> {
221 Self::new_with_data_source(&HardcodedBidiData, text, default_para_level)
222 }
223
224 /// Find the paragraphs and BidiClasses in a string of text, with a custom [`BidiDataSource`]
225 /// for Bidi data. If you just wish to use the hardcoded Bidi data, please use [`InitialInfo::new()`]
226 /// instead (enabled with tbe default `hardcoded-data` Cargo feature)
227 ///
228 /// <http://www.unicode.org/reports/tr9/#The_Paragraph_Level>
229 ///
230 /// Also sets the class for each First Strong Isolate initiator (FSI) to LRI or RLI if a strong
231 /// character is found before the matching PDI. If no strong character is found, the class will
232 /// remain FSI, and it's up to later stages to treat these as LRI when needed.
233 #[cfg_attr(feature = "flame_it", flamer::flame)]
234 pub fn new_with_data_source<'a, D: BidiDataSource>(
235 data_source: &D,
236 text: &'a str,
237 default_para_level: Option<Level>,
238 ) -> InitialInfo<'a> {
239 InitialInfoExt::new_with_data_source(data_source, text, default_para_level).base
240 }
241}
242
/// Extended version of InitialInfo (not public API).
///
/// Pairs the public `InitialInfo` with per-paragraph flags gathered during the same
/// scan, so that `BidiInfo` construction can pass them on to the per-paragraph
/// level computation.
#[derive(PartialEq, Debug)]
struct InitialInfoExt<'text> {
    /// The base InitialInfo for the text, recording its paragraphs and bidi classes.
    base: InitialInfo<'text>,

    /// Parallel to base.paragraphs, records whether each paragraph is "pure LTR" that
    /// requires no further bidi processing (i.e. there are no RTL characters or bidi
    /// control codes present), and whether any bidi isolation controls are present.
    ///
    /// `flags[i]` describes `base.paragraphs[i]`; the two vectors have equal length.
    flags: Vec<ParagraphInfoFlags>,
}
254
/// Per-paragraph facts collected while scanning the text in `compute_initial_info`.
#[derive(PartialEq, Debug)]
struct ParagraphInfoFlags {
    /// `true` while the scan has seen no R/AL character, no Arabic number (AN), and no
    /// explicit embedding, override, or isolate initiator in the paragraph.
    is_pure_ltr: bool,
    /// `true` once an isolate initiator (LRI, RLI or FSI) has been seen in the paragraph.
    has_isolate_controls: bool,
}
260
261impl<'text> InitialInfoExt<'text> {
262 /// Find the paragraphs and BidiClasses in a string of text, with a custom [`BidiDataSource`]
263 /// for Bidi data. If you just wish to use the hardcoded Bidi data, please use [`InitialInfo::new()`]
264 /// instead (enabled with tbe default `hardcoded-data` Cargo feature)
265 ///
266 /// <http://www.unicode.org/reports/tr9/#The_Paragraph_Level>
267 ///
268 /// Also sets the class for each First Strong Isolate initiator (FSI) to LRI or RLI if a strong
269 /// character is found before the matching PDI. If no strong character is found, the class will
270 /// remain FSI, and it's up to later stages to treat these as LRI when needed.
271 #[cfg_attr(feature = "flame_it", flamer::flame)]
272 pub fn new_with_data_source<'a, D: BidiDataSource>(
273 data_source: &D,
274 text: &'a str,
275 default_para_level: Option<Level>,
276 ) -> InitialInfoExt<'a> {
277 let mut paragraphs = Vec::<ParagraphInfo>::new();
278 let mut flags = Vec::<ParagraphInfoFlags>::new();
279 let (original_classes, _, _, _) = compute_initial_info(
280 data_source,
281 text,
282 default_para_level,
283 Some((&mut paragraphs, &mut flags)),
284 );
285
286 InitialInfoExt {
287 base: InitialInfo {
288 text,
289 original_classes,
290 paragraphs,
291 },
292 flags,
293 }
294 }
295}
296
/// Implementation of initial-info computation for both BidiInfo and ParagraphBidiInfo.
///
/// Makes a single pass over `text`, looking up the `BidiClass` of each character via
/// `data_source` and tracking the paragraph level and per-paragraph flags.
///
/// To treat the text as (potentially) multiple paragraphs, the caller should pass the
/// pair of optional outparam arrays to receive the ParagraphInfo and pure-ltr flags
/// for each paragraph. Both arrays are expected to be empty on entry (checked with a
/// debug assertion). Passing None for split_paragraphs will ignore any paragraph-
/// separator characters in the text, treating it just as a single paragraph.
///
/// `default_para_level`, when `Some`, is used as the level of every paragraph; when
/// `None`, each paragraph's level is taken from its first strong character that is
/// not inside an isolate, falling back to LTR if there is none.
///
/// Returns the array of BidiClass values for each code unit of the text, along with
/// the embedding level, pure-ltr flag and has-isolate-controls flag for the *last*
/// (or only) paragraph.
fn compute_initial_info<'a, D: BidiDataSource, T: TextSource<'a> + ?Sized>(
    data_source: &D,
    text: &'a T,
    default_para_level: Option<Level>,
    mut split_paragraphs: Option<(&mut Vec<ParagraphInfo>, &mut Vec<ParagraphInfoFlags>)>,
) -> (Vec<BidiClass>, Level, bool, bool) {
    // One entry per code unit; multi-unit characters repeat their class.
    let mut original_classes = Vec::with_capacity(text.len());

    // The stack contains the starting code unit index for each nested isolate we're inside.
    #[cfg(feature = "smallvec")]
    let mut isolate_stack = SmallVec::<[usize; 8]>::new();
    #[cfg(not(feature = "smallvec"))]
    let mut isolate_stack = Vec::new();

    // The out-vectors, if supplied, must start out empty.
    debug_assert!(
        if let Some((ref paragraphs, ref flags)) = split_paragraphs {
            paragraphs.is_empty() && flags.is_empty()
        } else {
            true
        }
    );

    // Code-unit index at which the current paragraph begins.
    let mut para_start = 0;
    // Level of the current paragraph; `None` means "not determined yet".
    let mut para_level = default_para_level;

    // Per-paragraph flag: can subsequent processing be skipped? Set to false if any
    // RTL characters or bidi control characters are encountered in the paragraph.
    let mut is_pure_ltr = true;
    // Set to true if any bidi isolation controls are present in the paragraph.
    let mut has_isolate_controls = false;

    #[cfg(feature = "flame_it")]
    flame::start("compute_initial_info(): iter text.char_indices()");

    for (i, c) in text.char_indices() {
        let class = data_source.bidi_class(c);

        #[cfg(feature = "flame_it")]
        flame::start("original_classes.extend()");

        // Record the class once for every code unit the character occupies.
        let len = T::char_len(c);
        original_classes.extend(repeat(class).take(len));

        #[cfg(feature = "flame_it")]
        flame::end("original_classes.extend()");

        match class {
            B => {
                // Paragraph separators only matter when the caller asked for splitting;
                // otherwise they are recorded in `original_classes` and nothing else happens.
                if let Some((ref mut paragraphs, ref mut flags)) = split_paragraphs {
                    // P1. Split the text into separate paragraphs. The paragraph separator is kept
                    // with the previous paragraph.
                    let para_end = i + len;
                    paragraphs.push(ParagraphInfo {
                        range: para_start..para_end,
                        // P3. If no character is found in p2, set the paragraph level to zero.
                        level: para_level.unwrap_or(LTR_LEVEL),
                    });
                    flags.push(ParagraphInfoFlags {
                        is_pure_ltr,
                        has_isolate_controls,
                    });
                    // Reset state for the start of the next paragraph.
                    para_start = para_end;
                    // TODO: Support defaulting to direction of previous paragraph
                    //
                    // <http://www.unicode.org/reports/tr9/#HL1>
                    para_level = default_para_level;
                    is_pure_ltr = true;
                    has_isolate_controls = false;
                    isolate_stack.clear();
                }
            }

            L | R | AL => {
                // Any strong RTL character rules out the pure-LTR fast path.
                if class != L {
                    is_pure_ltr = false;
                }
                match isolate_stack.last() {
                    // Inside an isolate: the character may resolve the innermost FSI,
                    // but it never contributes to the paragraph level.
                    Some(&start) => {
                        if original_classes[start] == FSI {
                            // X5c. If the first strong character between FSI and its matching
                            // PDI is R or AL, treat it as RLI. Otherwise, treat it as LRI.
                            // Rewrite every code unit of the FSI character. Once rewritten,
                            // the class is no longer FSI, so later strong characters leave it alone.
                            for j in 0..T::char_len(chars::FSI) {
                                original_classes[start + j] = if class == L { LRI } else { RLI };
                            }
                        }
                    }

                    // Not inside any isolate: candidate for determining the paragraph level.
                    None => {
                        if para_level.is_none() {
                            // P2. Find the first character of type L, AL, or R, while skipping
                            // any characters between an isolate initiator and its matching
                            // PDI.
                            para_level = Some(if class != L { RTL_LEVEL } else { LTR_LEVEL });
                        }
                    }
                }
            }

            // Arabic numbers and explicit embeddings/overrides require full processing.
            AN | LRE | RLE | LRO | RLO => {
                is_pure_ltr = false;
            }

            // Isolate initiators: remember where the isolate starts so that a later
            // strong character can resolve an FSI, and so that P2 skips its contents.
            RLI | LRI | FSI => {
                is_pure_ltr = false;
                has_isolate_controls = true;
                isolate_stack.push(i);
            }

            // Closes the innermost open isolate; popping an empty stack is a no-op.
            PDI => {
                isolate_stack.pop();
            }

            _ => {}
        }
    }

    if let Some((paragraphs, flags)) = split_paragraphs {
        // Record the trailing paragraph if the text did not end with a separator.
        // (If it did, the final paragraph was already pushed inside the loop.)
        if para_start < text.len() {
            paragraphs.push(ParagraphInfo {
                range: para_start..text.len(),
                level: para_level.unwrap_or(LTR_LEVEL),
            });
            flags.push(ParagraphInfoFlags {
                is_pure_ltr,
                has_isolate_controls,
            });
        }
        debug_assert_eq!(paragraphs.len(), flags.len());
    }
    debug_assert_eq!(original_classes.len(), text.len());

    #[cfg(feature = "flame_it")]
    flame::end("compute_initial_info(): iter text.char_indices()");

    // The scalar results reflect whatever state is current at the end of the scan,
    // i.e. the last (or only) paragraph.
    (
        original_classes,
        para_level.unwrap_or(LTR_LEVEL),
        is_pure_ltr,
        has_isolate_controls,
    )
}
446
/// Bidi information of the text.
///
/// The `original_classes` and `levels` vectors are indexed by byte offsets into the text. If a
/// character is multiple bytes wide, then its class and level will appear multiple times in these
/// vectors.
///
/// Unlike `ParagraphBidiInfo`, the text is split into paragraphs, each with its own
/// paragraph embedding level.
// TODO: Impl `struct StringProperty<T> { values: Vec<T> }` and use instead of Vec<T>
#[derive(Debug, PartialEq)]
pub struct BidiInfo<'text> {
    /// The text
    pub text: &'text str,

    /// The BidiClass of the character at each byte in the text.
    pub original_classes: Vec<BidiClass>,

    /// The directional embedding level of each byte in the text.
    pub levels: Vec<Level>,

    /// The boundaries and paragraph embedding level of each paragraph within the text.
    ///
    /// TODO: Use SmallVec or similar to avoid overhead when there are only one or two paragraphs?
    /// Or just don't include the first paragraph, which always starts at 0?
    pub paragraphs: Vec<ParagraphInfo>,
}
470
471impl<'text> BidiInfo<'text> {
472 /// Split the text into paragraphs and determine the bidi embedding levels for each paragraph.
473 ///
474 ///
475 /// The `hardcoded-data` Cargo feature (enabled by default) must be enabled to use this.
476 ///
477 /// TODO: In early steps, check for special cases that allow later steps to be skipped. like
478 /// text that is entirely LTR. See the `nsBidi` class from Gecko for comparison.
479 ///
480 /// TODO: Support auto-RTL base direction
481 #[cfg_attr(feature = "flame_it", flamer::flame)]
482 #[cfg(feature = "hardcoded-data")]
483 #[inline]
484 pub fn new(text: &str, default_para_level: Option<Level>) -> BidiInfo<'_> {
485 Self::new_with_data_source(&HardcodedBidiData, text, default_para_level)
486 }
487
488 /// Split the text into paragraphs and determine the bidi embedding levels for each paragraph, with a custom [`BidiDataSource`]
489 /// for Bidi data. If you just wish to use the hardcoded Bidi data, please use [`BidiInfo::new()`]
490 /// instead (enabled with tbe default `hardcoded-data` Cargo feature).
491 ///
492 /// TODO: In early steps, check for special cases that allow later steps to be skipped. like
493 /// text that is entirely LTR. See the `nsBidi` class from Gecko for comparison.
494 ///
495 /// TODO: Support auto-RTL base direction
496 #[cfg_attr(feature = "flame_it", flamer::flame)]
497 pub fn new_with_data_source<'a, D: BidiDataSource>(
498 data_source: &D,
499 text: &'a str,
500 default_para_level: Option<Level>,
501 ) -> BidiInfo<'a> {
502 let InitialInfoExt { base, flags, .. } =
503 InitialInfoExt::new_with_data_source(data_source, text, default_para_level);
504
505 let mut levels = Vec::<Level>::with_capacity(text.len());
506 let mut processing_classes = base.original_classes.clone();
507
508 for (para, flags) in base.paragraphs.iter().zip(flags.iter()) {
509 let text = &text[para.range.clone()];
510 let original_classes = &base.original_classes[para.range.clone()];
511
512 compute_bidi_info_for_para(
513 data_source,
514 para,
515 flags.is_pure_ltr,
516 flags.has_isolate_controls,
517 text,
518 original_classes,
519 &mut processing_classes,
520 &mut levels,
521 );
522 }
523
524 BidiInfo {
525 text,
526 original_classes: base.original_classes,
527 paragraphs: base.paragraphs,
528 levels,
529 }
530 }
531
532 /// Produce the levels for this paragraph as needed for reordering, one level per *byte*
533 /// in the paragraph. The returned vector includes bytes that are not included
534 /// in the `line`, but will not adjust them.
535 ///
536 /// This runs [Rule L1], you can run
537 /// [Rule L2] by calling [`Self::reorder_visual()`].
538 /// If doing so, you may prefer to use [`Self::reordered_levels_per_char()`] instead
539 /// to avoid non-byte indices.
540 ///
541 /// For an all-in-one reordering solution, consider using [`Self::reorder_visual()`].
542 ///
543 /// [Rule L1]: https://www.unicode.org/reports/tr9/#L1
544 /// [Rule L2]: https://www.unicode.org/reports/tr9/#L2
545 #[cfg_attr(feature = "flame_it", flamer::flame)]
546 pub fn reordered_levels(&self, para: &ParagraphInfo, line: Range<usize>) -> Vec<Level> {
547 assert!(line.start <= self.levels.len());
548 assert!(line.end <= self.levels.len());
549
550 let mut levels = self.levels.clone();
551 let line_classes = &self.original_classes[line.clone()];
552 let line_levels = &mut levels[line.clone()];
553
554 reorder_levels(
555 line_classes,
556 line_levels,
557 self.text.subrange(line),
558 para.level,
559 );
560
561 levels
562 }
563
564 /// Produce the levels for this paragraph as needed for reordering, one level per *character*
565 /// in the paragraph. The returned vector includes characters that are not included
566 /// in the `line`, but will not adjust them.
567 ///
568 /// This runs [Rule L1], you can run
569 /// [Rule L2] by calling [`Self::reorder_visual()`].
570 /// If doing so, you may prefer to use [`Self::reordered_levels_per_char()`] instead
571 /// to avoid non-byte indices.
572 ///
573 /// For an all-in-one reordering solution, consider using [`Self::reorder_visual()`].
574 ///
575 /// [Rule L1]: https://www.unicode.org/reports/tr9/#L1
576 /// [Rule L2]: https://www.unicode.org/reports/tr9/#L2
577 #[cfg_attr(feature = "flame_it", flamer::flame)]
578 pub fn reordered_levels_per_char(
579 &self,
580 para: &ParagraphInfo,
581 line: Range<usize>,
582 ) -> Vec<Level> {
583 let levels = self.reordered_levels(para, line);
584 self.text.char_indices().map(|(i, _)| levels[i]).collect()
585 }
586
587 /// Re-order a line based on resolved levels and return the line in display order.
588 ///
589 /// This does not apply [Rule L3] or [Rule L4] around combining characters or mirroring.
590 ///
591 /// [Rule L3]: https://www.unicode.org/reports/tr9/#L3
592 /// [Rule L4]: https://www.unicode.org/reports/tr9/#L4
593 #[cfg_attr(feature = "flame_it", flamer::flame)]
594 pub fn reorder_line(&self, para: &ParagraphInfo, line: Range<usize>) -> Cow<'text, str> {
595 if !level::has_rtl(&self.levels[line.clone()]) {
596 return self.text[line].into();
597 }
598 let (levels, runs) = self.visual_runs(para, line.clone());
599 reorder_line(self.text, line, levels, runs)
600 }
601
602 /// Reorders pre-calculated levels of a sequence of characters.
603 ///
604 /// NOTE: This is a convenience method that does not use a `Paragraph` object. It is
605 /// intended to be used when an application has determined the levels of the objects (character sequences)
606 /// and just needs to have them reordered.
607 ///
608 /// the index map will result in `indexMap[visualIndex]==logicalIndex`.
609 ///
610 /// This only runs [Rule L2](http://www.unicode.org/reports/tr9/#L2) as it does not have
611 /// information about the actual text.
612 ///
613 /// Furthermore, if `levels` is an array that is aligned with code units, bytes within a codepoint may be
614 /// reversed. You may need to fix up the map to deal with this. Alternatively, only pass in arrays where each `Level`
615 /// is for a single code point.
616 ///
617 ///
618 /// # # Example
619 /// ```
620 /// use unicode_bidi::BidiInfo;
621 /// use unicode_bidi::Level;
622 ///
623 /// let l0 = Level::from(0);
624 /// let l1 = Level::from(1);
625 /// let l2 = Level::from(2);
626 ///
627 /// let levels = vec![l0, l0, l0, l0];
628 /// let index_map = BidiInfo::reorder_visual(&levels);
629 /// assert_eq!(levels.len(), index_map.len());
630 /// assert_eq!(index_map, [0, 1, 2, 3]);
631 ///
632 /// let levels: Vec<Level> = vec![l0, l0, l0, l1, l1, l1, l2, l2];
633 /// let index_map = BidiInfo::reorder_visual(&levels);
634 /// assert_eq!(levels.len(), index_map.len());
635 /// assert_eq!(index_map, [0, 1, 2, 6, 7, 5, 4, 3]);
636 /// ```
637 #[cfg_attr(feature = "flame_it", flamer::flame)]
638 #[inline]
639 pub fn reorder_visual(levels: &[Level]) -> Vec<usize> {
640 reorder_visual(levels)
641 }
642
643 /// Find the level runs within a line and return them in visual order.
644 ///
645 /// `line` is a range of bytes indices within `levels`.
646 ///
647 /// The first return value is a vector of levels used by the reordering algorithm,
648 /// i.e. the result of [Rule L1]. The second return value is a vector of level runs,
649 /// the result of [Rule L2], showing the visual order that each level run (a run of text with the
650 /// same level) should be displayed. Within each run, the display order can be checked
651 /// against the Level vector.
652 ///
653 /// This does not handle [Rule L3] (combining characters) or [Rule L4] (mirroring),
654 /// as that should be handled by the engine using this API.
655 ///
656 /// Conceptually, this is the same as running [`Self::reordered_levels()`] followed by
657 /// [`Self::reorder_visual()`], however it returns the result as a list of level runs instead
658 /// of producing a level map, since one may wish to deal with the fact that this is operating on
659 /// byte rather than character indices.
660 ///
661 /// <http://www.unicode.org/reports/tr9/#Reordering_Resolved_Levels>
662 ///
663 /// [Rule L1]: https://www.unicode.org/reports/tr9/#L1
664 /// [Rule L2]: https://www.unicode.org/reports/tr9/#L2
665 /// [Rule L3]: https://www.unicode.org/reports/tr9/#L3
666 /// [Rule L4]: https://www.unicode.org/reports/tr9/#L4
667 #[cfg_attr(feature = "flame_it", flamer::flame)]
668 #[inline]
669 pub fn visual_runs(
670 &self,
671 para: &ParagraphInfo,
672 line: Range<usize>,
673 ) -> (Vec<Level>, Vec<LevelRun>) {
674 let levels = self.reordered_levels(para, line.clone());
675 visual_runs_for_line(levels, &line)
676 }
677
678 /// If processed text has any computed RTL levels
679 ///
680 /// This information is usually used to skip re-ordering of text when no RTL level is present
681 #[inline]
682 pub fn has_rtl(&self) -> bool {
683 level::has_rtl(&self.levels)
684 }
685}
686
/// Bidi information of text treated as a single paragraph.
///
/// Paragraph-separator characters in the text do not split it: the whole text is
/// analysed as one paragraph with a single paragraph level.
///
/// The `original_classes` and `levels` vectors are indexed by byte offsets into the text. If a
/// character is multiple bytes wide, then its class and level will appear multiple times in these
/// vectors.
#[derive(Debug, PartialEq)]
pub struct ParagraphBidiInfo<'text> {
    /// The text
    pub text: &'text str,

    /// The BidiClass of the character at each byte in the text.
    pub original_classes: Vec<BidiClass>,

    /// The directional embedding level of each byte in the text.
    pub levels: Vec<Level>,

    /// The paragraph embedding level.
    pub paragraph_level: Level,

    /// Whether the paragraph is purely LTR.
    ///
    /// This is the flag computed by the initial scan: it is `false` as soon as the text
    /// contains an RTL character or a bidi control character.
    pub is_pure_ltr: bool,
}
709
710impl<'text> ParagraphBidiInfo<'text> {
711 /// Determine the bidi embedding level.
712 ///
713 ///
714 /// The `hardcoded-data` Cargo feature (enabled by default) must be enabled to use this.
715 ///
716 /// TODO: In early steps, check for special cases that allow later steps to be skipped. like
717 /// text that is entirely LTR. See the `nsBidi` class from Gecko for comparison.
718 ///
719 /// TODO: Support auto-RTL base direction
720 #[cfg_attr(feature = "flame_it", flamer::flame)]
721 #[cfg(feature = "hardcoded-data")]
722 #[inline]
723 pub fn new(text: &str, default_para_level: Option<Level>) -> ParagraphBidiInfo<'_> {
724 Self::new_with_data_source(&HardcodedBidiData, text, default_para_level)
725 }
726
727 /// Determine the bidi embedding level, with a custom [`BidiDataSource`]
728 /// for Bidi data. If you just wish to use the hardcoded Bidi data, please use [`BidiInfo::new()`]
729 /// instead (enabled with tbe default `hardcoded-data` Cargo feature).
730 ///
731 /// (This is the single-paragraph equivalent of BidiInfo::new_with_data_source,
732 /// and should be kept in sync with it.
733 #[cfg_attr(feature = "flame_it", flamer::flame)]
734 pub fn new_with_data_source<'a, D: BidiDataSource>(
735 data_source: &D,
736 text: &'a str,
737 default_para_level: Option<Level>,
738 ) -> ParagraphBidiInfo<'a> {
739 // Here we could create a ParagraphInitialInfo struct to parallel the one
740 // used by BidiInfo, but there doesn't seem any compelling reason for it.
741 let (original_classes, paragraph_level, is_pure_ltr, has_isolate_controls) =
742 compute_initial_info(data_source, text, default_para_level, None);
743
744 let mut levels = Vec::<Level>::with_capacity(text.len());
745 let mut processing_classes = original_classes.clone();
746
747 let para_info = ParagraphInfo {
748 range: Range {
749 start: 0,
750 end: text.len(),
751 },
752 level: paragraph_level,
753 };
754
755 compute_bidi_info_for_para(
756 data_source,
757 &para_info,
758 is_pure_ltr,
759 has_isolate_controls,
760 text,
761 &original_classes,
762 &mut processing_classes,
763 &mut levels,
764 );
765
766 ParagraphBidiInfo {
767 text,
768 original_classes,
769 levels,
770 paragraph_level,
771 is_pure_ltr,
772 }
773 }
774
775 /// Produce the levels for this paragraph as needed for reordering, one level per *byte*
776 /// in the paragraph. The returned vector includes bytes that are not included
777 /// in the `line`, but will not adjust them.
778 ///
779 /// See BidiInfo::reordered_levels for details.
780 ///
781 /// (This should be kept in sync with BidiInfo::reordered_levels.)
782 #[cfg_attr(feature = "flame_it", flamer::flame)]
783 pub fn reordered_levels(&self, line: Range<usize>) -> Vec<Level> {
784 assert!(line.start <= self.levels.len());
785 assert!(line.end <= self.levels.len());
786
787 let mut levels = self.levels.clone();
788 let line_classes = &self.original_classes[line.clone()];
789 let line_levels = &mut levels[line.clone()];
790
791 reorder_levels(
792 line_classes,
793 line_levels,
794 self.text.subrange(line),
795 self.paragraph_level,
796 );
797
798 levels
799 }
800
801 /// Produce the levels for this paragraph as needed for reordering, one level per *character*
802 /// in the paragraph. The returned vector includes characters that are not included
803 /// in the `line`, but will not adjust them.
804 ///
805 /// See BidiInfo::reordered_levels_per_char for details.
806 ///
807 /// (This should be kept in sync with BidiInfo::reordered_levels_per_char.)
808 #[cfg_attr(feature = "flame_it", flamer::flame)]
809 pub fn reordered_levels_per_char(&self, line: Range<usize>) -> Vec<Level> {
810 let levels = self.reordered_levels(line);
811 self.text.char_indices().map(|(i, _)| levels[i]).collect()
812 }
813
814 /// Re-order a line based on resolved levels and return the line in display order.
815 ///
816 /// See BidiInfo::reorder_line for details.
817 ///
818 /// (This should be kept in sync with BidiInfo::reorder_line.)
819 #[cfg_attr(feature = "flame_it", flamer::flame)]
820 pub fn reorder_line(&self, line: Range<usize>) -> Cow<'text, str> {
821 if !level::has_rtl(&self.levels[line.clone()]) {
822 return self.text[line].into();
823 }
824
825 let (levels, runs) = self.visual_runs(line.clone());
826
827 reorder_line(self.text, line, levels, runs)
828 }
829
830 /// Reorders pre-calculated levels of a sequence of characters.
831 ///
832 /// See BidiInfo::reorder_visual for details.
833 #[cfg_attr(feature = "flame_it", flamer::flame)]
834 #[inline]
835 pub fn reorder_visual(levels: &[Level]) -> Vec<usize> {
836 reorder_visual(levels)
837 }
838
839 /// Find the level runs within a line and return them in visual order.
840 ///
841 /// `line` is a range of bytes indices within `levels`.
842 ///
843 /// See BidiInfo::visual_runs for details.
844 ///
845 /// (This should be kept in sync with BidiInfo::visual_runs.)
846 #[cfg_attr(feature = "flame_it", flamer::flame)]
847 #[inline]
848 pub fn visual_runs(&self, line: Range<usize>) -> (Vec<Level>, Vec<LevelRun>) {
849 let levels = self.reordered_levels(line.clone());
850 visual_runs_for_line(levels, &line)
851 }
852
853 /// If processed text has any computed RTL levels
854 ///
855 /// This information is usually used to skip re-ordering of text when no RTL level is present
856 #[inline]
857 pub fn has_rtl(&self) -> bool {
858 !self.is_pure_ltr
859 }
860
861 /// Return the paragraph's Direction (Ltr, Rtl, or Mixed) based on its levels.
862 #[inline]
863 pub fn direction(&self) -> Direction {
864 para_direction(&self.levels)
865 }
866}
867
868/// Return a line of the text in display order based on resolved levels.
869///
870/// `text` the full text passed to the `BidiInfo` or `ParagraphBidiInfo` for analysis
871/// `line` a range of byte indices within `text` corresponding to one line
872/// `levels` array of `Level` values, with `line`'s levels reordered into visual order
873/// `runs` array of `LevelRun`s in visual order
874///
875/// (`levels` and `runs` are the result of calling `BidiInfo::visual_runs()` or
876/// `ParagraphBidiInfo::visual_runs()` for the line of interest.)
877///
878/// Returns: the reordered text of the line.
879///
880/// This does not apply [Rule L3] or [Rule L4] around combining characters or mirroring.
881///
882/// [Rule L3]: https://www.unicode.org/reports/tr9/#L3
883/// [Rule L4]: https://www.unicode.org/reports/tr9/#L4
884fn reorder_line(
885 text: &str,
886 line: Range<usize>,
887 levels: Vec<Level>,
888 runs: Vec<LevelRun>,
889) -> Cow<'_, str> {
890 // If all isolating run sequences are LTR, no reordering is needed
891 if runs.iter().all(|run: &Range| levels[run.start].is_ltr()) {
892 return text[line].into();
893 }
894
895 let mut result: String = String::with_capacity(line.len());
896 for run: Range in runs {
897 if levels[run.start].is_rtl() {
898 result.extend(iter:text[run].chars().rev());
899 } else {
900 result.push_str(&text[run]);
901 }
902 }
903 result.into()
904}
905
906/// Find the level runs within a line and return them in visual order.
907///
908/// `line` is a range of code-unit indices within `levels`.
909///
910/// The first return value is a vector of levels used by the reordering algorithm,
911/// i.e. the result of [Rule L1]. The second return value is a vector of level runs,
912/// the result of [Rule L2], showing the visual order that each level run (a run of text with the
913/// same level) should be displayed. Within each run, the display order can be checked
914/// against the Level vector.
915///
916/// This does not handle [Rule L3] (combining characters) or [Rule L4] (mirroring),
917/// as that should be handled by the engine using this API.
918///
919/// Conceptually, this is the same as running [`reordered_levels()`] followed by
920/// [`reorder_visual()`], however it returns the result as a list of level runs instead
921/// of producing a level map, since one may wish to deal with the fact that this is operating on
922/// byte rather than character indices.
923///
924/// <http://www.unicode.org/reports/tr9/#Reordering_Resolved_Levels>
925///
926/// [Rule L1]: https://www.unicode.org/reports/tr9/#L1
927/// [Rule L2]: https://www.unicode.org/reports/tr9/#L2
928/// [Rule L3]: https://www.unicode.org/reports/tr9/#L3
929/// [Rule L4]: https://www.unicode.org/reports/tr9/#L4
930fn visual_runs_for_line(levels: Vec<Level>, line: &Range<usize>) -> (Vec<Level>, Vec<LevelRun>) {
931 // Find consecutive level runs.
932 let mut runs = Vec::new();
933 let mut start = line.start;
934 let mut run_level = levels[start];
935 let mut min_level = run_level;
936 let mut max_level = run_level;
937
938 for (i, &new_level) in levels.iter().enumerate().take(line.end).skip(start + 1) {
939 if new_level != run_level {
940 // End of the previous run, start of a new one.
941 runs.push(start..i);
942 start = i;
943 run_level = new_level;
944 min_level = cmp::min(run_level, min_level);
945 max_level = cmp::max(run_level, max_level);
946 }
947 }
948 runs.push(start..line.end);
949
950 let run_count = runs.len();
951
952 // Re-order the odd runs.
953 // <http://www.unicode.org/reports/tr9/#L2>
954
955 // Stop at the lowest *odd* level.
956 min_level = min_level.new_lowest_ge_rtl().expect("Level error");
957 // This loop goes through contiguous chunks of level runs that have a level
958 // ≥ max_level and reverses their contents, reducing max_level by 1 each time.
959 while max_level >= min_level {
960 // Look for the start of a sequence of consecutive runs of max_level or higher.
961 let mut seq_start = 0;
962 while seq_start < run_count {
963 if levels[runs[seq_start].start] < max_level {
964 seq_start += 1;
965 continue;
966 }
967
968 // Found the start of a sequence. Now find the end.
969 let mut seq_end = seq_start + 1;
970 while seq_end < run_count {
971 if levels[runs[seq_end].start] < max_level {
972 break;
973 }
974 seq_end += 1;
975 }
976 // Reverse the runs within this sequence.
977 runs[seq_start..seq_end].reverse();
978
979 seq_start = seq_end;
980 }
981 max_level
982 .lower(1)
983 .expect("Lowering embedding level below zero");
984 }
985 (levels, runs)
986}
987
988/// Reorders pre-calculated levels of a sequence of characters.
989///
990/// NOTE: This is a convenience method that does not use a `Paragraph` object. It is
991/// intended to be used when an application has determined the levels of the objects (character sequences)
992/// and just needs to have them reordered.
993///
994/// the index map will result in `indexMap[visualIndex]==logicalIndex`.
995///
996/// This only runs [Rule L2](http://www.unicode.org/reports/tr9/#L2) as it does not have
997/// information about the actual text.
998///
999/// Furthermore, if `levels` is an array that is aligned with code units, bytes within a codepoint may be
1000/// reversed. You may need to fix up the map to deal with this. Alternatively, only pass in arrays where each `Level`
1001/// is for a single code point.
1002fn reorder_visual(levels: &[Level]) -> Vec<usize> {
1003 // Gets the next range of characters after start_index with a level greater
1004 // than or equal to `max`
1005 fn next_range(levels: &[level::Level], mut start_index: usize, max: Level) -> Range<usize> {
1006 if levels.is_empty() || start_index >= levels.len() {
1007 return start_index..start_index;
1008 }
1009 while let Some(l) = levels.get(start_index) {
1010 if *l >= max {
1011 break;
1012 }
1013 start_index += 1;
1014 }
1015
1016 if levels.get(start_index).is_none() {
1017 // If at the end of the array, adding one will
1018 // produce an out-of-range end element
1019 return start_index..start_index;
1020 }
1021
1022 let mut end_index = start_index + 1;
1023 while let Some(l) = levels.get(end_index) {
1024 if *l < max {
1025 return start_index..end_index;
1026 }
1027 end_index += 1;
1028 }
1029
1030 start_index..end_index
1031 }
1032
1033 // This implementation is similar to the L2 implementation in `visual_runs()`
1034 // but it cannot benefit from a precalculated LevelRun vector so needs to be different.
1035
1036 if levels.is_empty() {
1037 return vec![];
1038 }
1039
1040 // Get the min and max levels
1041 let (mut min, mut max) = levels
1042 .iter()
1043 .fold((levels[0], levels[0]), |(min, max), &l| {
1044 (cmp::min(min, l), cmp::max(max, l))
1045 });
1046
1047 // Initialize an index map
1048 let mut result: Vec<usize> = (0..levels.len()).collect();
1049
1050 if min == max && min.is_ltr() {
1051 // Everything is LTR and at the same level, do nothing
1052 return result;
1053 }
1054
1055 // Stop at the lowest *odd* level, since everything below that
1056 // is LTR and does not need further reordering
1057 min = min.new_lowest_ge_rtl().expect("Level error");
1058
1059 // For each max level, take all contiguous chunks of
1060 // levels ≥ max and reverse them
1061 //
1062 // We can do this check with the original levels instead of checking reorderings because all
1063 // prior reorderings will have been for contiguous chunks of levels >> max, which will
1064 // be a subset of these chunks anyway.
1065 while min <= max {
1066 let mut range = 0..0;
1067 loop {
1068 range = next_range(levels, range.end, max);
1069 result[range.clone()].reverse();
1070
1071 if range.end >= levels.len() {
1072 break;
1073 }
1074 }
1075
1076 max.lower(1).expect("Level error");
1077 }
1078
1079 result
1080}
1081
/// The core of BidiInfo initialization, factored out into a function that both
/// the utf-8 and utf-16 versions of BidiInfo can use.
///
/// Appends `para.range.len()` entries to `levels`, each initialised to `para.level`.
/// Unless the paragraph level is `LTR_LEVEL` and `is_pure_ltr` is set, it then runs the
/// explicit pass, builds the isolating run sequences, resolves weak and neutral types
/// for each sequence, resolves the implicit levels, and finally assigns levels to the
/// characters removed by rule X9. All of these operate on the `para.range` portion of
/// `processing_classes` and `levels`.
///
/// NOTE(review): `text` and `original_classes` are handed to the passes without being
/// sliced by `para.range` here; presumably the caller has already narrowed them to
/// this paragraph. Confirm against the call sites.
fn compute_bidi_info_for_para<'a, D: BidiDataSource, T: TextSource<'a> + ?Sized>(
    data_source: &D,
    para: &ParagraphInfo,
    is_pure_ltr: bool,
    has_isolate_controls: bool,
    text: &'a T,
    original_classes: &[BidiClass],
    processing_classes: &mut [BidiClass],
    levels: &mut Vec<Level>,
) {
    // Grow `levels` by one entry per code unit of the paragraph, defaulting to the
    // paragraph level.
    let new_len = levels.len() + para.range.len();
    levels.resize(new_len, para.level);
    // Fast path: the entries just appended already hold `para.level`, and no further
    // resolution is run for an LTR paragraph flagged as pure LTR.
    if para.level == LTR_LEVEL && is_pure_ltr {
        return;
    }

    // From here on, work only on this paragraph's portion of the buffers.
    // NOTE(review): slicing `levels` by `para.range` selects the entries appended above
    // only if `levels.len() == para.range.start` held on entry; verify with callers.
    let processing_classes = &mut processing_classes[para.range.clone()];
    let levels = &mut levels[para.range.clone()];
    let mut level_runs = LevelRunVec::new();

    // Explicit pass; it also fills `level_runs`, which is consumed just below.
    explicit::compute(
        text,
        para.level,
        original_classes,
        levels,
        processing_classes,
        &mut level_runs,
    );

    let mut sequences = prepare::IsolatingRunSequenceVec::new();
    prepare::isolating_run_sequences(
        para.level,
        original_classes,
        levels,
        level_runs,
        has_isolate_controls,
        &mut sequences,
    );
    // Weak types are resolved before neutral types, one sequence at a time.
    for sequence in &sequences {
        implicit::resolve_weak(text, sequence, processing_classes);
        implicit::resolve_neutral(
            text,
            data_source,
            sequence,
            levels,
            original_classes,
            processing_classes,
        );
    }

    implicit::resolve_levels(processing_classes, levels);

    // Last step: give the characters removed by rule X9 a level as well.
    assign_levels_to_removed_chars(para.level, original_classes, levels);
}
1138
1139/// Produce the levels for this paragraph as needed for reordering, one level per *code unit*
1140/// in the paragraph. The returned vector includes code units that are not included
1141/// in the `line`, but will not adjust them.
1142///
1143/// This runs [Rule L1]
1144///
1145/// [Rule L1]: https://www.unicode.org/reports/tr9/#L1
1146fn reorder_levels<'a, T: TextSource<'a> + ?Sized>(
1147 line_classes: &[BidiClass],
1148 line_levels: &mut [Level],
1149 line_text: &'a T,
1150 para_level: Level,
1151) {
1152 // Reset some whitespace chars to paragraph level.
1153 // <http://www.unicode.org/reports/tr9/#L1>
1154 let mut reset_from: Option<usize> = Some(0);
1155 let mut reset_to: Option<usize> = None;
1156 let mut prev_level = para_level;
1157 for ((i, c), (_, length)) in line_text.char_indices().zip(line_text.indices_lengths()) {
1158 match line_classes[i] {
1159 // Segment separator, Paragraph separator
1160 B | S => {
1161 assert_eq!(reset_to, None);
1162 reset_to = Some(i + T::char_len(c));
1163 if reset_from.is_none() {
1164 reset_from = Some(i);
1165 }
1166 }
1167 // Whitespace, isolate formatting
1168 WS | FSI | LRI | RLI | PDI => {
1169 if reset_from.is_none() {
1170 reset_from = Some(i);
1171 }
1172 }
1173 // <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters>
1174 // same as above + set the level
1175 RLE | LRE | RLO | LRO | PDF | BN => {
1176 if reset_from.is_none() {
1177 reset_from = Some(i);
1178 }
1179 // also set the level to previous
1180 for level in &mut line_levels[i..i + length] {
1181 *level = prev_level;
1182 }
1183 }
1184 _ => {
1185 reset_from = None;
1186 }
1187 }
1188 if let (Some(from), Some(to)) = (reset_from, reset_to) {
1189 for level in &mut line_levels[from..to] {
1190 *level = para_level;
1191 }
1192 reset_from = None;
1193 reset_to = None;
1194 }
1195 prev_level = line_levels[i];
1196 }
1197 if let Some(from) = reset_from {
1198 for level in &mut line_levels[from..] {
1199 *level = para_level;
1200 }
1201 }
1202}
1203
/// Holds a reference to a `BidiInfo` together with a reference to one of its
/// `paragraphs`.
///
/// It supports the `Paragraph` operations that also need the owning `BidiInfo`,
/// such as `direction`.
#[derive(Debug)]
pub struct Paragraph<'a, 'text> {
    /// The analysis result; its `levels` are what the methods of this type index into.
    pub info: &'a BidiInfo<'text>,
    /// The paragraph of interest; its `range` selects this paragraph's part of
    /// `info.levels`.
    pub para: &'a ParagraphInfo,
}
1212
1213impl<'a, 'text> Paragraph<'a, 'text> {
1214 #[inline]
1215 pub fn new(info: &'a BidiInfo<'text>, para: &'a ParagraphInfo) -> Paragraph<'a, 'text> {
1216 Paragraph { info, para }
1217 }
1218
1219 /// Returns if the paragraph is Left direction, right direction or mixed.
1220 #[inline]
1221 pub fn direction(&self) -> Direction {
1222 para_direction(&self.info.levels[self.para.range.clone()])
1223 }
1224
1225 /// Returns the `Level` of a certain character in the paragraph.
1226 #[inline]
1227 pub fn level_at(&self, pos: usize) -> Level {
1228 let actual_position: usize = self.para.range.start + pos;
1229 self.info.levels[actual_position]
1230 }
1231}
1232
1233/// Return the directionality of the paragraph (Left, Right or Mixed) from its levels.
1234#[cfg_attr(feature = "flame_it", flamer::flame)]
1235fn para_direction(levels: &[Level]) -> Direction {
1236 let mut ltr = false;
1237 let mut rtl = false;
1238 for level in levels {
1239 if level.is_ltr() {
1240 ltr = true;
1241 if rtl {
1242 return Direction::Mixed;
1243 }
1244 }
1245
1246 if level.is_rtl() {
1247 rtl = true;
1248 if ltr {
1249 return Direction::Mixed;
1250 }
1251 }
1252 }
1253
1254 if ltr {
1255 return Direction::Ltr;
1256 }
1257
1258 Direction::Rtl
1259}
1260
1261/// Assign levels to characters removed by rule X9.
1262///
1263/// The levels assigned to these characters are not specified by the algorithm. This function
1264/// assigns each one the level of the previous character, to avoid breaking level runs.
1265#[cfg_attr(feature = "flame_it", flamer::flame)]
1266fn assign_levels_to_removed_chars(para_level: Level, classes: &[BidiClass], levels: &mut [Level]) {
1267 for i: usize in 0..levels.len() {
1268 if prepare::removed_by_x9(class:classes[i]) {
1269 levels[i] = if i > 0 { levels[i - 1] } else { para_level };
1270 }
1271 }
1272}
1273
/// Get the base direction of the text provided according to the Unicode Bidirectional Algorithm.
///
/// See rules P2 and P3.
///
/// The base direction is derived from the first character in the string with bidi character type
/// L, R, or AL. If the first such character has type L, Direction::Ltr is returned. If the first
/// such character has type R or AL, Direction::Rtl is returned.
///
/// If the string does not contain any character of these types (outside of embedded isolate runs),
/// then Direction::Mixed is returned (but should be considered as meaning "neutral" or "unknown",
/// not in fact mixed directions).
///
/// This is a lightweight function for use when only the base direction is needed and no further
/// bidi processing of the text is needed.
///
/// If the text contains paragraph separators, this function considers only the first paragraph.
///
/// This is `get_base_direction_with_data_source` called with `HardcodedBidiData` as the
/// data source, and is therefore only available with the `hardcoded-data` feature.
#[cfg(feature = "hardcoded-data")]
#[inline]
pub fn get_base_direction<'a, T: TextSource<'a> + ?Sized>(text: &'a T) -> Direction {
    get_base_direction_with_data_source(&HardcodedBidiData, text)
}
1295
/// Get the base direction of the text provided according to the Unicode Bidirectional Algorithm,
/// considering the full text if the first paragraph is all-neutral.
///
/// This is the same as get_base_direction except that it does not stop at the first block
/// separator, but just resets the isolate nesting count and continues to look for a strongly-
/// directional character. So the result will be the base direction of the first paragraph
/// that is not purely neutral characters.
///
/// This is `get_base_direction_full_with_data_source` called with `HardcodedBidiData` as
/// the data source, and is therefore only available with the `hardcoded-data` feature.
#[cfg(feature = "hardcoded-data")]
#[inline]
pub fn get_base_direction_full<'a, T: TextSource<'a> + ?Sized>(text: &'a T) -> Direction {
    get_base_direction_full_with_data_source(&HardcodedBidiData, text)
}
1308
1309#[inline]
1310pub fn get_base_direction_with_data_source<'a, D: BidiDataSource, T: TextSource<'a> + ?Sized>(
1311 data_source: &D,
1312 text: &'a T,
1313) -> Direction {
1314 get_base_direction_impl(data_source, text, use_full_text:false)
1315}
1316
1317#[inline]
1318pub fn get_base_direction_full_with_data_source<
1319 'a,
1320 D: BidiDataSource,
1321 T: TextSource<'a> + ?Sized,
1322>(
1323 data_source: &D,
1324 text: &'a T,
1325) -> Direction {
1326 get_base_direction_impl(data_source, text, use_full_text:true)
1327}
1328
1329fn get_base_direction_impl<'a, D: BidiDataSource, T: TextSource<'a> + ?Sized>(
1330 data_source: &D,
1331 text: &'a T,
1332 use_full_text: bool,
1333) -> Direction {
1334 let mut isolate_level: i32 = 0;
1335 for c: char in text.chars() {
1336 match data_source.bidi_class(c) {
1337 LRI | RLI | FSI => isolate_level += 1,
1338 PDI if isolate_level > 0 => isolate_level -= 1,
1339 L if isolate_level == 0 => return Direction::Ltr,
1340 R | AL if isolate_level == 0 => return Direction::Rtl,
1341 B if !use_full_text => break,
1342 B if use_full_text => isolate_level = 0,
1343 _ => (),
1344 }
1345 }
1346 // If no strong char was found, return Mixed. Normally this will be treated as Ltr by callers
1347 // (see rule P3), but we don't map this to Ltr here so that a caller that wants to apply other
1348 // heuristics to an all-neutral paragraph can tell the difference.
1349 Direction::Mixed
1350}
1351
1352/// Implementation of TextSource for UTF-8 text (a string slice).
1353impl<'text> TextSource<'text> for str {
1354 type CharIter = core::str::Chars<'text>;
1355 type CharIndexIter = core::str::CharIndices<'text>;
1356 type IndexLenIter = Utf8IndexLenIter<'text>;
1357
1358 #[inline]
1359 fn len(&self) -> usize {
1360 (self as &str).len()
1361 }
1362 #[inline]
1363 fn char_at(&self, index: usize) -> Option<(char, usize)> {
1364 if let Some(slice) = self.get(index..) {
1365 if let Some(ch) = slice.chars().next() {
1366 return Some((ch, ch.len_utf8()));
1367 }
1368 }
1369 None
1370 }
1371 #[inline]
1372 fn subrange(&self, range: Range<usize>) -> &Self {
1373 &(self as &str)[range]
1374 }
1375 #[inline]
1376 fn chars(&'text self) -> Self::CharIter {
1377 (self as &str).chars()
1378 }
1379 #[inline]
1380 fn char_indices(&'text self) -> Self::CharIndexIter {
1381 (self as &str).char_indices()
1382 }
1383 #[inline]
1384 fn indices_lengths(&'text self) -> Self::IndexLenIter {
1385 Utf8IndexLenIter::new(self)
1386 }
1387 #[inline]
1388 fn char_len(ch: char) -> usize {
1389 ch.len_utf8()
1390 }
1391}
1392
/// Iterator over a (UTF-8) string slice that yields one `(index, char_len)` tuple per
/// character: the byte offset at which the character starts and the number of bytes it
/// occupies.
#[derive(Debug)]
pub struct Utf8IndexLenIter<'text> {
    /// Underlying `(byte offset, char)` iterator.
    iter: CharIndices<'text>,
}

impl<'text> Utf8IndexLenIter<'text> {
    /// Creates an iterator positioned at the start of `text`.
    #[inline]
    pub fn new(text: &'text str) -> Self {
        Utf8IndexLenIter {
            iter: text.char_indices(),
        }
    }
}

impl Iterator for Utf8IndexLenIter<'_> {
    type Item = (usize, usize);

    /// Yields the next `(byte offset, UTF-8 length)` pair, or `None` at the end.
    #[inline]
    fn next(&mut self) -> Option<Self::Item> {
        self.iter.next().map(|(pos, ch)| (pos, ch.len_utf8()))
    }

    /// One item is produced per character, so the bounds of the wrapped iterator apply.
    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        self.iter.size_hint()
    }
}
1419
/// Test helper: encode `s` as a vector of UTF-16 code units.
#[cfg(test)]
fn to_utf16(s: &str) -> Vec<u16> {
    let mut units = Vec::new();
    units.extend(s.encode_utf16());
    units
}
1424
1425#[cfg(test)]
1426#[cfg(feature = "hardcoded-data")]
1427mod tests {
1428 use super::*;
1429
1430 use utf16::{
1431 BidiInfo as BidiInfoU16, InitialInfo as InitialInfoU16, Paragraph as ParagraphU16,
1432 ParagraphBidiInfo as ParagraphBidiInfoU16,
1433 };
1434
1435 #[test]
1436 fn test_utf16_text_source() {
1437 let text: &[u16] =
1438 &[0x41, 0xD801, 0xDC01, 0x20, 0xD800, 0x20, 0xDFFF, 0x20, 0xDC00, 0xD800];
1439 assert_eq!(text.char_at(0), Some(('A', 1)));
1440 assert_eq!(text.char_at(1), Some(('\u{10401}', 2)));
1441 assert_eq!(text.char_at(2), None);
1442 assert_eq!(text.char_at(3), Some((' ', 1)));
1443 assert_eq!(text.char_at(4), Some((char::REPLACEMENT_CHARACTER, 1)));
1444 assert_eq!(text.char_at(5), Some((' ', 1)));
1445 assert_eq!(text.char_at(6), Some((char::REPLACEMENT_CHARACTER, 1)));
1446 assert_eq!(text.char_at(7), Some((' ', 1)));
1447 assert_eq!(text.char_at(8), Some((char::REPLACEMENT_CHARACTER, 1)));
1448 assert_eq!(text.char_at(9), Some((char::REPLACEMENT_CHARACTER, 1)));
1449 assert_eq!(text.char_at(10), None);
1450 }
1451
1452 #[test]
1453 fn test_utf16_char_iter() {
1454 let text: &[u16] =
1455 &[0x41, 0xD801, 0xDC01, 0x20, 0xD800, 0x20, 0xDFFF, 0x20, 0xDC00, 0xD800];
1456 assert_eq!(text.len(), 10);
1457 assert_eq!(text.chars().count(), 9);
1458 let mut chars = text.chars();
1459 assert_eq!(chars.next(), Some('A'));
1460 assert_eq!(chars.next(), Some('\u{10401}'));
1461 assert_eq!(chars.next(), Some(' '));
1462 assert_eq!(chars.next(), Some('\u{FFFD}'));
1463 assert_eq!(chars.next(), Some(' '));
1464 assert_eq!(chars.next(), Some('\u{FFFD}'));
1465 assert_eq!(chars.next(), Some(' '));
1466 assert_eq!(chars.next(), Some('\u{FFFD}'));
1467 assert_eq!(chars.next(), Some('\u{FFFD}'));
1468 assert_eq!(chars.next(), None);
1469 }
1470
1471 #[test]
1472 fn test_initial_text_info() {
1473 let tests = vec![
1474 (
1475 // text
1476 "a1",
1477 // expected bidi classes per utf-8 byte
1478 vec![L, EN],
1479 // expected paragraph-info for utf-8
1480 vec![ParagraphInfo {
1481 range: 0..2,
1482 level: LTR_LEVEL,
1483 }],
1484 // expected bidi classes per utf-16 code unit
1485 vec![L, EN],
1486 // expected paragraph-info for utf-16
1487 vec![ParagraphInfo {
1488 range: 0..2,
1489 level: LTR_LEVEL,
1490 }],
1491 ),
1492 (
1493 // Arabic, space, Hebrew
1494 "\u{0639} \u{05D0}",
1495 vec![AL, AL, WS, R, R],
1496 vec![ParagraphInfo {
1497 range: 0..5,
1498 level: RTL_LEVEL,
1499 }],
1500 vec![AL, WS, R],
1501 vec![ParagraphInfo {
1502 range: 0..3,
1503 level: RTL_LEVEL,
1504 }],
1505 ),
1506 (
1507 // SMP characters from Kharoshthi, Cuneiform, Adlam:
1508 "\u{10A00}\u{12000}\u{1E900}",
1509 vec![R, R, R, R, L, L, L, L, R, R, R, R],
1510 vec![ParagraphInfo {
1511 range: 0..12,
1512 level: RTL_LEVEL,
1513 }],
1514 vec![R, R, L, L, R, R],
1515 vec![ParagraphInfo {
1516 range: 0..6,
1517 level: RTL_LEVEL,
1518 }],
1519 ),
1520 (
1521 "a\u{2029}b",
1522 vec![L, B, B, B, L],
1523 vec![
1524 ParagraphInfo {
1525 range: 0..4,
1526 level: LTR_LEVEL,
1527 },
1528 ParagraphInfo {
1529 range: 4..5,
1530 level: LTR_LEVEL,
1531 },
1532 ],
1533 vec![L, B, L],
1534 vec![
1535 ParagraphInfo {
1536 range: 0..2,
1537 level: LTR_LEVEL,
1538 },
1539 ParagraphInfo {
1540 range: 2..3,
1541 level: LTR_LEVEL,
1542 },
1543 ],
1544 ),
1545 (
1546 "\u{2068}א\u{2069}a", // U+2068 FSI, U+2069 PDI
1547 vec![RLI, RLI, RLI, R, R, PDI, PDI, PDI, L],
1548 vec![ParagraphInfo {
1549 range: 0..9,
1550 level: LTR_LEVEL,
1551 }],
1552 vec![RLI, R, PDI, L],
1553 vec![ParagraphInfo {
1554 range: 0..4,
1555 level: LTR_LEVEL,
1556 }],
1557 ),
1558 ];
1559
1560 for t in tests {
1561 assert_eq!(
1562 InitialInfo::new(t.0, None),
1563 InitialInfo {
1564 text: t.0,
1565 original_classes: t.1,
1566 paragraphs: t.2,
1567 }
1568 );
1569 let text = &to_utf16(t.0);
1570 assert_eq!(
1571 InitialInfoU16::new(text, None),
1572 InitialInfoU16 {
1573 text,
1574 original_classes: t.3,
1575 paragraphs: t.4,
1576 }
1577 );
1578 }
1579 }
1580
1581 #[test]
1582 #[cfg(feature = "hardcoded-data")]
1583 fn test_process_text() {
1584 let tests = vec![
1585 (
1586 // text
1587 "",
1588 // base level
1589 Some(RTL_LEVEL),
1590 // levels
1591 Level::vec(&[]),
1592 // original_classes
1593 vec![],
1594 // paragraphs
1595 vec![],
1596 // levels_u16
1597 Level::vec(&[]),
1598 // original_classes_u16
1599 vec![],
1600 // paragraphs_u16
1601 vec![],
1602 ),
1603 (
1604 // text
1605 "abc123",
1606 // base level
1607 Some(LTR_LEVEL),
1608 // levels
1609 Level::vec(&[0, 0, 0, 0, 0, 0]),
1610 // original_classes
1611 vec![L, L, L, EN, EN, EN],
1612 // paragraphs
1613 vec![ParagraphInfo {
1614 range: 0..6,
1615 level: LTR_LEVEL,
1616 }],
1617 // levels_u16
1618 Level::vec(&[0, 0, 0, 0, 0, 0]),
1619 // original_classes_u16
1620 vec![L, L, L, EN, EN, EN],
1621 // paragraphs_u16
1622 vec![ParagraphInfo {
1623 range: 0..6,
1624 level: LTR_LEVEL,
1625 }],
1626 ),
1627 (
1628 "abc \u{05D0}\u{05D1}\u{05D2}",
1629 Some(LTR_LEVEL),
1630 Level::vec(&[0, 0, 0, 0, 1, 1, 1, 1, 1, 1]),
1631 vec![L, L, L, WS, R, R, R, R, R, R],
1632 vec![ParagraphInfo {
1633 range: 0..10,
1634 level: LTR_LEVEL,
1635 }],
1636 Level::vec(&[0, 0, 0, 0, 1, 1, 1]),
1637 vec![L, L, L, WS, R, R, R],
1638 vec![ParagraphInfo {
1639 range: 0..7,
1640 level: LTR_LEVEL,
1641 }],
1642 ),
1643 (
1644 "abc \u{05D0}\u{05D1}\u{05D2}",
1645 Some(RTL_LEVEL),
1646 Level::vec(&[2, 2, 2, 1, 1, 1, 1, 1, 1, 1]),
1647 vec![L, L, L, WS, R, R, R, R, R, R],
1648 vec![ParagraphInfo {
1649 range: 0..10,
1650 level: RTL_LEVEL,
1651 }],
1652 Level::vec(&[2, 2, 2, 1, 1, 1, 1]),
1653 vec![L, L, L, WS, R, R, R],
1654 vec![ParagraphInfo {
1655 range: 0..7,
1656 level: RTL_LEVEL,
1657 }],
1658 ),
1659 (
1660 "\u{05D0}\u{05D1}\u{05D2} abc",
1661 Some(LTR_LEVEL),
1662 Level::vec(&[1, 1, 1, 1, 1, 1, 0, 0, 0, 0]),
1663 vec![R, R, R, R, R, R, WS, L, L, L],
1664 vec![ParagraphInfo {
1665 range: 0..10,
1666 level: LTR_LEVEL,
1667 }],
1668 Level::vec(&[1, 1, 1, 0, 0, 0, 0]),
1669 vec![R, R, R, WS, L, L, L],
1670 vec![ParagraphInfo {
1671 range: 0..7,
1672 level: LTR_LEVEL,
1673 }],
1674 ),
1675 (
1676 "\u{05D0}\u{05D1}\u{05D2} abc",
1677 None,
1678 Level::vec(&[1, 1, 1, 1, 1, 1, 1, 2, 2, 2]),
1679 vec![R, R, R, R, R, R, WS, L, L, L],
1680 vec![ParagraphInfo {
1681 range: 0..10,
1682 level: RTL_LEVEL,
1683 }],
1684 Level::vec(&[1, 1, 1, 1, 2, 2, 2]),
1685 vec![R, R, R, WS, L, L, L],
1686 vec![ParagraphInfo {
1687 range: 0..7,
1688 level: RTL_LEVEL,
1689 }],
1690 ),
1691 (
1692 "\u{063A}2\u{0638} \u{05D0}2\u{05D2}",
1693 Some(LTR_LEVEL),
1694 Level::vec(&[1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1]),
1695 vec![AL, AL, EN, AL, AL, WS, R, R, EN, R, R],
1696 vec![ParagraphInfo {
1697 range: 0..11,
1698 level: LTR_LEVEL,
1699 }],
1700 Level::vec(&[1, 2, 1, 1, 1, 2, 1]),
1701 vec![AL, EN, AL, WS, R, EN, R],
1702 vec![ParagraphInfo {
1703 range: 0..7,
1704 level: LTR_LEVEL,
1705 }],
1706 ),
1707 (
1708 "a א.\nג",
1709 None,
1710 Level::vec(&[0, 0, 1, 1, 0, 0, 1, 1]),
1711 vec![L, WS, R, R, CS, B, R, R],
1712 vec![
1713 ParagraphInfo {
1714 range: 0..6,
1715 level: LTR_LEVEL,
1716 },
1717 ParagraphInfo {
1718 range: 6..8,
1719 level: RTL_LEVEL,
1720 },
1721 ],
1722 Level::vec(&[0, 0, 1, 0, 0, 1]),
1723 vec![L, WS, R, CS, B, R],
1724 vec![
1725 ParagraphInfo {
1726 range: 0..5,
1727 level: LTR_LEVEL,
1728 },
1729 ParagraphInfo {
1730 range: 5..6,
1731 level: RTL_LEVEL,
1732 },
1733 ],
1734 ),
1735 // BidiTest:69635 (AL ET EN)
1736 (
1737 "\u{060B}\u{20CF}\u{06F9}",
1738 None,
1739 Level::vec(&[1, 1, 1, 1, 1, 2, 2]),
1740 vec![AL, AL, ET, ET, ET, EN, EN],
1741 vec![ParagraphInfo {
1742 range: 0..7,
1743 level: RTL_LEVEL,
1744 }],
1745 Level::vec(&[1, 1, 2]),
1746 vec![AL, ET, EN],
1747 vec![ParagraphInfo {
1748 range: 0..3,
1749 level: RTL_LEVEL,
1750 }],
1751 ),
1752 ];
1753
1754 for t in tests {
1755 assert_eq!(
1756 BidiInfo::new(t.0, t.1),
1757 BidiInfo {
1758 text: t.0,
1759 levels: t.2.clone(),
1760 original_classes: t.3.clone(),
1761 paragraphs: t.4.clone(),
1762 }
1763 );
1764 // If it was empty, also test that ParagraphBidiInfo handles it safely.
1765 if t.4.len() == 0 {
1766 assert_eq!(
1767 ParagraphBidiInfo::new(t.0, t.1),
1768 ParagraphBidiInfo {
1769 text: t.0,
1770 original_classes: t.3.clone(),
1771 levels: t.2.clone(),
1772 paragraph_level: RTL_LEVEL,
1773 is_pure_ltr: true,
1774 }
1775 )
1776 }
1777 // If it was a single paragraph, also test ParagraphBidiInfo.
1778 if t.4.len() == 1 {
1779 assert_eq!(
1780 ParagraphBidiInfo::new(t.0, t.1),
1781 ParagraphBidiInfo {
1782 text: t.0,
1783 original_classes: t.3,
1784 levels: t.2.clone(),
1785 paragraph_level: t.4[0].level,
1786 is_pure_ltr: !level::has_rtl(&t.2),
1787 }
1788 )
1789 }
1790 let text = &to_utf16(t.0);
1791 assert_eq!(
1792 BidiInfoU16::new(text, t.1),
1793 BidiInfoU16 {
1794 text,
1795 levels: t.5.clone(),
1796 original_classes: t.6.clone(),
1797 paragraphs: t.7.clone(),
1798 }
1799 );
1800 if t.7.len() == 1 {
1801 assert_eq!(
1802 ParagraphBidiInfoU16::new(text, t.1),
1803 ParagraphBidiInfoU16 {
1804 text: text,
1805 original_classes: t.6.clone(),
1806 levels: t.5.clone(),
1807 paragraph_level: t.7[0].level,
1808 is_pure_ltr: !level::has_rtl(&t.5),
1809 }
1810 )
1811 }
1812 }
1813 }
1814
1815 #[test]
1816 #[cfg(feature = "hardcoded-data")]
1817 fn test_paragraph_bidi_info() {
1818 // Passing text that includes a paragraph break to the ParagraphBidiInfo API:
1819 // this is a misuse of the API by the client, but our behavior is safe &
1820 // consistent. The embedded paragraph break acts like a separator (tab) would.
1821 let tests = vec![
1822 (
1823 "a א.\nג",
1824 None,
1825 // utf-8 results:
1826 vec![L, WS, R, R, CS, B, R, R],
1827 Level::vec(&[0, 0, 1, 1, 1, 1, 1, 1]),
1828 // utf-16 results:
1829 vec![L, WS, R, CS, B, R],
1830 Level::vec(&[0, 0, 1, 1, 1, 1]),
1831 // paragraph level; is_pure_ltr
1832 LTR_LEVEL,
1833 false,
1834 ),
1835 (
1836 "\u{5d1} a.\nb.",
1837 None,
1838 // utf-8 results:
1839 vec![R, R, WS, L, CS, B, L, CS],
1840 Level::vec(&[1, 1, 1, 2, 2, 2, 2, 1]),
1841 // utf-16 results:
1842 vec![R, WS, L, CS, B, L, CS],
1843 Level::vec(&[1, 1, 2, 2, 2, 2, 1]),
1844 // paragraph level; is_pure_ltr
1845 RTL_LEVEL,
1846 false,
1847 ),
1848 (
1849 "a א.\tג",
1850 None,
1851 // utf-8 results:
1852 vec![L, WS, R, R, CS, S, R, R],
1853 Level::vec(&[0, 0, 1, 1, 1, 1, 1, 1]),
1854 // utf-16 results:
1855 vec![L, WS, R, CS, S, R],
1856 Level::vec(&[0, 0, 1, 1, 1, 1]),
1857 // paragraph level; is_pure_ltr
1858 LTR_LEVEL,
1859 false,
1860 ),
1861 (
1862 "\u{5d1} a.\tb.",
1863 None,
1864 // utf-8 results:
1865 vec![R, R, WS, L, CS, S, L, CS],
1866 Level::vec(&[1, 1, 1, 2, 2, 2, 2, 1]),
1867 // utf-16 results:
1868 vec![R, WS, L, CS, S, L, CS],
1869 Level::vec(&[1, 1, 2, 2, 2, 2, 1]),
1870 // paragraph level; is_pure_ltr
1871 RTL_LEVEL,
1872 false,
1873 ),
1874 ];
1875
1876 for t in tests {
1877 assert_eq!(
1878 ParagraphBidiInfo::new(t.0, t.1),
1879 ParagraphBidiInfo {
1880 text: t.0,
1881 original_classes: t.2,
1882 levels: t.3,
1883 paragraph_level: t.6,
1884 is_pure_ltr: t.7,
1885 }
1886 );
1887 let text = &to_utf16(t.0);
1888 assert_eq!(
1889 ParagraphBidiInfoU16::new(text, t.1),
1890 ParagraphBidiInfoU16 {
1891 text: text,
1892 original_classes: t.4,
1893 levels: t.5,
1894 paragraph_level: t.6,
1895 is_pure_ltr: t.7,
1896 }
1897 );
1898 }
1899 }
1900
1901 #[test]
1902 #[cfg(feature = "hardcoded-data")]
1903 fn test_bidi_info_has_rtl() {
1904 let tests = vec![
1905 // ASCII only
1906 ("123", None, false),
1907 ("123", Some(LTR_LEVEL), false),
1908 ("123", Some(RTL_LEVEL), false),
1909 ("abc", None, false),
1910 ("abc", Some(LTR_LEVEL), false),
1911 ("abc", Some(RTL_LEVEL), false),
1912 ("abc 123", None, false),
1913 ("abc\n123", None, false),
1914 // With Hebrew
1915 ("\u{05D0}\u{05D1}\u{05BC}\u{05D2}", None, true),
1916 ("\u{05D0}\u{05D1}\u{05BC}\u{05D2}", Some(LTR_LEVEL), true),
1917 ("\u{05D0}\u{05D1}\u{05BC}\u{05D2}", Some(RTL_LEVEL), true),
1918 ("abc \u{05D0}\u{05D1}\u{05BC}\u{05D2}", None, true),
1919 ("abc\n\u{05D0}\u{05D1}\u{05BC}\u{05D2}", None, true),
1920 ("\u{05D0}\u{05D1}\u{05BC}\u{05D2} abc", None, true),
1921 ("\u{05D0}\u{05D1}\u{05BC}\u{05D2}\nabc", None, true),
1922 ("\u{05D0}\u{05D1}\u{05BC}\u{05D2} 123", None, true),
1923 ("\u{05D0}\u{05D1}\u{05BC}\u{05D2}\n123", None, true),
1924 ];
1925
1926 for t in tests {
1927 assert_eq!(BidiInfo::new(t.0, t.1).has_rtl(), t.2);
1928 assert_eq!(BidiInfoU16::new(&to_utf16(t.0), t.1).has_rtl(), t.2);
1929 }
1930 }
1931
1932 #[cfg(feature = "hardcoded-data")]
1933 fn reorder_paras(text: &str) -> Vec<Cow<'_, str>> {
1934 let bidi_info = BidiInfo::new(text, None);
1935 bidi_info
1936 .paragraphs
1937 .iter()
1938 .map(|para| bidi_info.reorder_line(para, para.range.clone()))
1939 .collect()
1940 }
1941
1942 #[cfg(feature = "hardcoded-data")]
1943 fn reorder_paras_u16(text: &[u16]) -> Vec<Cow<'_, [u16]>> {
1944 let bidi_info = BidiInfoU16::new(text, None);
1945 bidi_info
1946 .paragraphs
1947 .iter()
1948 .map(|para| bidi_info.reorder_line(para, para.range.clone()))
1949 .collect()
1950 }
1951
/// Checks display reordering of whole paragraphs, through both the UTF-8 and
/// the UTF-16 helpers, against hand-written expectations.
///
/// Each case pairs an input text with the expected reordered string of every
/// paragraph it contains.
#[test]
#[cfg(feature = "hardcoded-data")]
fn test_reorder_line() {
    let cases = vec![
        // Bidi_Class: L L L B L L L B L L L
        ("abc\ndef\nghi", vec!["abc\n", "def\n", "ghi"]),
        // Bidi_Class: L L EN B L L EN B L L EN
        ("ab1\nde2\ngh3", vec!["ab1\n", "de2\n", "gh3"]),
        // Bidi_Class: L L L B AL AL AL
        ("abc\nابج", vec!["abc\n", "جبا"]),
        // Bidi_Class: AL AL AL B L L L
        (
            "\u{0627}\u{0628}\u{062C}\nabc",
            vec!["\n\u{062C}\u{0628}\u{0627}", "abc"],
        ),
        ("1.-2", vec!["1.-2"]),
        ("1-.2", vec!["1-.2"]),
        ("abc אבג", vec!["abc גבא"]),
        // Numbers are weak LTR characters and cannot reorder strong RTL
        ("123 \u{05D0}\u{05D1}\u{05D2}", vec!["גבא 123"]),
        ("abc\u{202A}def", vec!["abc\u{202A}def"]),
        (
            "abc\u{202A}def\u{202C}ghi",
            vec!["abc\u{202A}def\u{202C}ghi"],
        ),
        (
            "abc\u{2066}def\u{2069}ghi",
            vec!["abc\u{2066}def\u{2069}ghi"],
        ),
        // RLE character
        ("\u{202B}abc אבג\u{202C}", vec!["\u{202b}גבא abc\u{202c}"]),
        // Neutral characters
        ("\u{05D0}בג? אבג", vec!["גבא ?גבא"]),
        // Neutral characters, special case
        ("A אבג?", vec!["A גבא?"]),
        // Neutral characters with an implicit RTL marker
        ("A אבג?\u{200F}", vec!["A \u{200F}?גבא"]),
        ("\u{05D0}בג abc", vec!["abc גבא"]),
        ("abc\u{2067}.-\u{2069}ghi", vec!["abc\u{2067}-.\u{2069}ghi"]),
        (
            "Hello, \u{2068}\u{202E}world\u{202C}\u{2069}!",
            vec!["Hello, \u{2068}\u{202E}\u{202C}dlrow\u{2069}!"],
        ),
        // Mirrorable characters inside an RTL run
        ("\u{05D0}(ב)ג.", vec![".ג)ב(א"]),
        // Mirrorable characters on a level boundary
        ("\u{05D0}ב(גד[&ef].)gh", vec!["gh).]ef&[דג(בא"]),
    ];

    for (input, expected) in cases {
        assert_eq!(reorder_paras(input), expected);

        // Same expectations, re-encoded as UTF-16 code units.
        let expected_u16: Vec<_> = expected.iter().map(|line| to_utf16(line)).collect();
        assert_eq!(reorder_paras_u16(&to_utf16(input)), expected_u16);
    }
}
2007
/// Returns the result of `BidiInfo::reordered_levels` for every paragraph of
/// `text`, using each paragraph's full range as the line.
///
/// The paragraph level is auto-detected (`None` is passed as the default).
///
/// Gated on `hardcoded-data`, like `reorder_paras`: the helper goes through
/// `BidiInfo::new`, which the rest of this file only uses behind that
/// feature, and its caller `test_reordered_levels` is gated the same way.
#[cfg(feature = "hardcoded-data")]
fn reordered_levels_for_paras(text: &str) -> Vec<Vec<Level>> {
    let bidi_info = BidiInfo::new(text, None);
    bidi_info
        .paragraphs
        .iter()
        .map(|para| bidi_info.reordered_levels(para, para.range.clone()))
        .collect()
}
2016
/// Returns the result of `BidiInfo::reordered_levels_per_char` for every
/// paragraph of `text`, using each paragraph's full range as the line.
///
/// The paragraph level is auto-detected (`None` is passed as the default).
///
/// Gated on `hardcoded-data`, like `reorder_paras`: the helper goes through
/// `BidiInfo::new`, which the rest of this file only uses behind that
/// feature, and its caller `test_reordered_levels` is gated the same way.
#[cfg(feature = "hardcoded-data")]
fn reordered_levels_per_char_for_paras(text: &str) -> Vec<Vec<Level>> {
    let bidi_info = BidiInfo::new(text, None);
    bidi_info
        .paragraphs
        .iter()
        .map(|para| bidi_info.reordered_levels_per_char(para, para.range.clone()))
        .collect()
}
2025
/// UTF-16 counterpart of `reordered_levels_for_paras`: returns the result of
/// `BidiInfoU16::reordered_levels` for every paragraph of `text`, using each
/// paragraph's full range as the line.
///
/// The paragraph level is auto-detected (`None` is passed as the default).
///
/// Gated on `hardcoded-data`, like `reorder_paras_u16`: the helper goes
/// through `BidiInfoU16::new`, and its caller `test_reordered_levels` is
/// gated the same way.
#[cfg(feature = "hardcoded-data")]
fn reordered_levels_for_paras_u16(text: &[u16]) -> Vec<Vec<Level>> {
    let bidi_info = BidiInfoU16::new(text, None);
    bidi_info
        .paragraphs
        .iter()
        .map(|para| bidi_info.reordered_levels(para, para.range.clone()))
        .collect()
}
2034
/// UTF-16 counterpart of `reordered_levels_per_char_for_paras`: returns the
/// result of `BidiInfoU16::reordered_levels_per_char` for every paragraph of
/// `text`, using each paragraph's full range as the line.
///
/// The paragraph level is auto-detected (`None` is passed as the default).
///
/// Gated on `hardcoded-data`, like `reorder_paras_u16`: the helper goes
/// through `BidiInfoU16::new`, and its caller `test_reordered_levels` is
/// gated the same way.
#[cfg(feature = "hardcoded-data")]
fn reordered_levels_per_char_for_paras_u16(text: &[u16]) -> Vec<Vec<Level>> {
    let bidi_info = BidiInfoU16::new(text, None);
    bidi_info
        .paragraphs
        .iter()
        .map(|para| bidi_info.reordered_levels_per_char(para, para.range.clone()))
        .collect()
}
2043
/// Regression test for issue #138: every run returned by `visual_runs` for a
/// line that covers only part of the paragraph must be a valid slice of the
/// text (slicing panics otherwise).
#[test]
#[cfg(feature = "hardcoded-data")]
fn test_reordered_levels_range() {
    // Each formatting character below is 3 bytes in UTF-8, so the line
    // 4..11 runs from the first U+202C up to (not including) the last one.
    let text = "\u{202a}A\u{202c}\u{202a}A\u{202c}";
    let line = 4..11;
    // Sanity check: the line itself falls on character boundaries.
    assert!(text.get(line.clone()).is_some());

    let info = BidiInfo::new(text, None);
    let (_, runs) = info.visual_runs(&info.paragraphs[0], line);

    runs.into_iter().for_each(|run| {
        // Indexing panics if `run` is out of bounds or splits a character.
        let _ = &text[run];
    });
}
2060
/// Checks `reordered_levels` and `reordered_levels_per_char` per paragraph,
/// through both the UTF-8 and the UTF-16 APIs.
#[test]
#[cfg(feature = "hardcoded-data")]
fn test_reordered_levels() {
    // Each case: (text, levels per UTF-8 byte, levels per char, levels per
    // UTF-16 code unit).
    let numeric_cases = vec![
        // BidiTest:946 (LRI PDI)
        // NOTE(review): U+2067 is labelled RLI in the BidiTest:291284 case
        // below, so the "LRI" label here looks off; confirm against
        // BidiTest.txt.
        (
            "\u{2067}\u{2069}",
            vec![Level::vec(&[0, 0, 0, 0, 0, 0])],
            vec![Level::vec(&[0, 0])],
            vec![Level::vec(&[0, 0])],
        ),
        // BidiTest:69635 (AL ET EN)
        (
            "\u{060B}\u{20CF}\u{06F9}",
            vec![Level::vec(&[1, 1, 1, 1, 1, 2, 2])],
            vec![Level::vec(&[1, 1, 2])],
            vec![Level::vec(&[1, 1, 2])],
        ),
    ];

    for (text, per_byte, per_char, per_unit) in numeric_cases {
        assert_eq!(reordered_levels_for_paras(text), per_byte);
        assert_eq!(reordered_levels_per_char_for_paras(text), per_char);

        let utf16 = to_utf16(text);
        assert_eq!(reordered_levels_for_paras_u16(&utf16), per_unit);
        assert_eq!(reordered_levels_per_char_for_paras_u16(&utf16), per_char);
    }

    // Same tuple layout, but with levels written as strings.
    // NOTE(review): "x" presumably stands for a "don't care" entry; confirm
    // against how `Level` compares with `&str`.
    let textual_cases = vec![
        // BidiTest:291284 (AN RLI PDF R)
        (
            "\u{0605}\u{2067}\u{202C}\u{0590}",
            vec![&["2", "2", "0", "0", "0", "x", "x", "x", "1", "1"]],
            vec![&["2", "0", "x", "1"]],
            vec![&["2", "0", "x", "1"]],
        ),
    ];

    for (text, per_byte, per_char, per_unit) in textual_cases {
        assert_eq!(reordered_levels_for_paras(text), per_byte);
        assert_eq!(reordered_levels_per_char_for_paras(text), per_char);

        let utf16 = to_utf16(text);
        assert_eq!(reordered_levels_for_paras_u16(&utf16), per_unit);
        assert_eq!(reordered_levels_per_char_for_paras_u16(&utf16), per_char);
    }

    // A line narrower than its paragraph: the expected vectors still have one
    // entry per byte (7) / per code unit (5) of the whole paragraph.
    let sample = "aa טֶ";
    let info = BidiInfo::new(sample, None);
    let first_paragraph = &info.paragraphs[0];
    assert_eq!(
        info.reordered_levels(first_paragraph, 3..7),
        Level::vec(&[0, 0, 0, 1, 1, 1, 1]),
    );

    let sample_u16 = &to_utf16(sample);
    let info_u16 = BidiInfoU16::new(sample_u16, None);
    let first_paragraph = &info_u16.paragraphs[0];
    assert_eq!(
        info_u16.reordered_levels(first_paragraph, 1..4),
        Level::vec(&[0, 0, 0, 1, 1]),
    );
}
2121
/// Checks the `len()` of each paragraph for a one-paragraph and a
/// two-paragraph text, through both the UTF-8 and the UTF-16 APIs.
///
/// Gated on `hardcoded-data` like the other tests in this module that build
/// a `BidiInfo` with `BidiInfo::new`.
#[test]
#[cfg(feature = "hardcoded-data")]
fn test_paragraph_info_len() {
    let text = "hello world";
    let bidi_info = BidiInfo::new(text, None);
    assert_eq!(bidi_info.paragraphs.len(), 1);
    assert_eq!(bidi_info.paragraphs[0].len(), text.len());

    let text2 = "How are you";
    let whole_text = format!("{}\n{}", text, text2);
    let bidi_info = BidiInfo::new(&whole_text, None);
    assert_eq!(bidi_info.paragraphs.len(), 2);

    // The first paragraph includes the paragraph separator.
    // TODO: investigate if the paragraph separator character
    // should not be part of any paragraph.
    assert_eq!(bidi_info.paragraphs[0].len(), text.len() + 1);
    assert_eq!(bidi_info.paragraphs[1].len(), text2.len());

    // Same checks on the UTF-16 encoding; lengths are now in code units.
    let text = &to_utf16(text);
    let bidi_info = BidiInfoU16::new(text, None);
    assert_eq!(bidi_info.paragraphs.len(), 1);
    assert_eq!(bidi_info.paragraphs[0].len(), text.len());

    let text2 = &to_utf16(text2);
    let whole_text = &to_utf16(&whole_text);
    // `whole_text` is already a reference; no extra borrow is needed.
    let bidi_info = BidiInfoU16::new(whole_text, None);
    assert_eq!(bidi_info.paragraphs.len(), 2);

    assert_eq!(bidi_info.paragraphs[0].len(), text.len() + 1);
    assert_eq!(bidi_info.paragraphs[1].len(), text2.len());
}
2153
/// Checks `direction()` for an all-LTR paragraph, an all-RTL paragraph and a
/// paragraph mixing both, through the UTF-8 and the UTF-16 `Paragraph` APIs.
///
/// Gated on `hardcoded-data` like the other tests in this module that build
/// a `BidiInfo` with `BidiInfo::new`.
#[test]
#[cfg(feature = "hardcoded-data")]
fn test_direction() {
    let ltr_text = "hello world";
    let rtl_text = "أهلا بكم";
    // Three paragraphs: LTR only, RTL only, then LTR followed by RTL.
    let all_paragraphs = format!("{}\n{}\n{}{}", ltr_text, rtl_text, ltr_text, rtl_text);
    let bidi_info = BidiInfo::new(&all_paragraphs, None);
    assert_eq!(bidi_info.paragraphs.len(), 3);
    let p_ltr = Paragraph::new(&bidi_info, &bidi_info.paragraphs[0]);
    let p_rtl = Paragraph::new(&bidi_info, &bidi_info.paragraphs[1]);
    let p_mixed = Paragraph::new(&bidi_info, &bidi_info.paragraphs[2]);
    assert_eq!(p_ltr.direction(), Direction::Ltr);
    assert_eq!(p_rtl.direction(), Direction::Rtl);
    assert_eq!(p_mixed.direction(), Direction::Mixed);

    // Same checks on the UTF-16 encoding of the text.
    let all_paragraphs = &to_utf16(&all_paragraphs);
    // `all_paragraphs` is already a reference; no extra borrow is needed.
    let bidi_info = BidiInfoU16::new(all_paragraphs, None);
    assert_eq!(bidi_info.paragraphs.len(), 3);
    let p_ltr = ParagraphU16::new(&bidi_info, &bidi_info.paragraphs[0]);
    let p_rtl = ParagraphU16::new(&bidi_info, &bidi_info.paragraphs[1]);
    let p_mixed = ParagraphU16::new(&bidi_info, &bidi_info.paragraphs[2]);
    assert_eq!(p_ltr.direction(), Direction::Ltr);
    assert_eq!(p_rtl.direction(), Direction::Rtl);
    assert_eq!(p_mixed.direction(), Direction::Mixed);
}
2178
/// Checks two edge cases: empty text yields no paragraphs, and a text made of
/// a lone paragraph separator takes its direction from the default level.
///
/// Gated on `hardcoded-data` like the other tests in this module that build
/// a `BidiInfo` with `BidiInfo::new`.
#[test]
#[cfg(feature = "hardcoded-data")]
fn test_edge_cases_direction() {
    // No paragraphs for empty text.
    let empty = "";
    let bidi_info = BidiInfo::new(empty, Some(RTL_LEVEL));
    assert_eq!(bidi_info.paragraphs.len(), 0);

    let empty = &to_utf16(empty);
    let bidi_info = BidiInfoU16::new(empty, Some(RTL_LEVEL));
    assert_eq!(bidi_info.paragraphs.len(), 0);

    // Each case: (text, default paragraph level, expected direction).
    let tests = vec![
        // The paragraph separator will take the value of the default direction
        // which is left to right.
        ("\n", None, Direction::Ltr),
        // The paragraph separator will take the value of the given initial direction
        // which is left to right.
        ("\n", Some(LTR_LEVEL), Direction::Ltr),
        // The paragraph separator will take the value of the given initial direction
        // which is right to left.
        ("\n", Some(RTL_LEVEL), Direction::Rtl),
    ];

    for (text, default_level, expected) in tests {
        let bidi_info = BidiInfo::new(text, default_level);
        assert_eq!(bidi_info.paragraphs.len(), 1);
        let p = Paragraph::new(&bidi_info, &bidi_info.paragraphs[0]);
        assert_eq!(p.direction(), expected);

        let text = &to_utf16(text);
        let bidi_info = BidiInfoU16::new(text, default_level);
        // Mirror the UTF-8 check so a wrong paragraph count fails with a clear
        // message instead of an index panic on the next line.
        assert_eq!(bidi_info.paragraphs.len(), 1);
        let p = ParagraphU16::new(&bidi_info, &bidi_info.paragraphs[0]);
        assert_eq!(p.direction(), expected);
    }
}
2213
/// Checks `level_at()` on an all-LTR, an all-RTL and a mixed paragraph,
/// through the UTF-8 and the UTF-16 `Paragraph` APIs.
///
/// Gated on `hardcoded-data` like the other tests in this module that build
/// a `BidiInfo` with `BidiInfo::new`.
#[test]
#[cfg(feature = "hardcoded-data")]
fn test_level_at() {
    let ltr_text = "hello world";
    let rtl_text = "أهلا بكم";
    // Three paragraphs: LTR only, RTL only, then LTR followed by RTL.
    let all_paragraphs = format!("{}\n{}\n{}{}", ltr_text, rtl_text, ltr_text, rtl_text);
    let bidi_info = BidiInfo::new(&all_paragraphs, None);
    assert_eq!(bidi_info.paragraphs.len(), 3);

    let p_ltr = Paragraph::new(&bidi_info, &bidi_info.paragraphs[0]);
    let p_rtl = Paragraph::new(&bidi_info, &bidi_info.paragraphs[1]);
    let p_mixed = Paragraph::new(&bidi_info, &bidi_info.paragraphs[2]);

    assert_eq!(p_ltr.level_at(0), LTR_LEVEL);
    assert_eq!(p_rtl.level_at(0), RTL_LEVEL);
    assert_eq!(p_mixed.level_at(0), LTR_LEVEL);
    // In UTF-8 the LTR text is 11 bytes and the RTL text 15 bytes, so the
    // whole text is 11 + 1 + 15 + 1 + 11 + 15 = 54 bytes and the third
    // paragraph starts at byte 11 + 1 + 15 + 1 = 28.
    assert_eq!(p_mixed.info.levels.len(), 54);
    assert_eq!(p_mixed.para.range.start, 28);
    // The first position after the LTR prefix of the mixed paragraph is RTL.
    assert_eq!(p_mixed.level_at(ltr_text.len()), RTL_LEVEL);

    let all_paragraphs = &to_utf16(&all_paragraphs);
    // `all_paragraphs` is already a reference; no extra borrow is needed.
    let bidi_info = BidiInfoU16::new(all_paragraphs, None);
    assert_eq!(bidi_info.paragraphs.len(), 3);

    let p_ltr = ParagraphU16::new(&bidi_info, &bidi_info.paragraphs[0]);
    let p_rtl = ParagraphU16::new(&bidi_info, &bidi_info.paragraphs[1]);
    let p_mixed = ParagraphU16::new(&bidi_info, &bidi_info.paragraphs[2]);

    assert_eq!(p_ltr.level_at(0), LTR_LEVEL);
    assert_eq!(p_rtl.level_at(0), RTL_LEVEL);
    assert_eq!(p_mixed.level_at(0), LTR_LEVEL);
    // In UTF-16 the LTR text is 11 code units and the RTL text 8, so the
    // whole text is 11 + 1 + 8 + 1 + 11 + 8 = 40 code units and the third
    // paragraph starts at code unit 11 + 1 + 8 + 1 = 21.
    assert_eq!(p_mixed.info.levels.len(), 40);
    assert_eq!(p_mixed.para.range.start, 21);
    // `ltr_text` is ASCII, so its byte length equals its UTF-16 length.
    assert_eq!(p_mixed.level_at(ltr_text.len()), RTL_LEVEL);
}
2248
/// Checks `get_base_direction` on `&str` input and on the same text encoded
/// as UTF-16.
///
/// NOTE(review): sibling tests are gated on the `hardcoded-data` feature;
/// confirm whether `get_base_direction` needs the same gate.
#[test]
fn test_get_base_direction() {
    let cases = vec![
        // Mixed is returned if no strong character is found.
        ("", Direction::Mixed),
        ("123[]-+\u{2019}\u{2060}\u{00bf}?", Direction::Mixed),
        // Only the first paragraph is considered.
        ("3.14\npi", Direction::Mixed),
        ("[123 'abc']", Direction::Ltr),
        ("[123 '\u{0628}' abc", Direction::Rtl),
        // The embedded isolate is ignored.
        ("[123 '\u{2066}abc\u{2069}'\u{0628}]", Direction::Rtl),
        ("[123 '\u{2066}abc\u{2068}'\u{0628}]", Direction::Mixed),
    ];

    for (input, expected) in cases {
        assert_eq!(get_base_direction(input), expected);

        let utf16 = to_utf16(input);
        assert_eq!(get_base_direction(utf16.as_slice()), expected);
    }
}
2267
/// Checks `get_base_direction_full` on `&str` input and on the same text
/// encoded as UTF-16.
///
/// NOTE(review): sibling tests are gated on the `hardcoded-data` feature;
/// confirm whether `get_base_direction_full` needs the same gate.
#[test]
fn test_get_base_direction_full() {
    let cases = vec![
        // Mixed is returned if no strong character is found.
        ("", Direction::Mixed),
        ("123[]-+\u{2019}\u{2060}\u{00bf}?", Direction::Mixed),
        // Direction is taken from the second paragraph.
        ("3.14\npi", Direction::Ltr),
        // Direction is taken from the second paragraph.
        ("3.14\n\u{05D0}", Direction::Rtl),
        ("[123 'abc']", Direction::Ltr),
        ("[123 '\u{0628}' abc", Direction::Rtl),
        // The embedded isolate is ignored.
        ("[123 '\u{2066}abc\u{2069}'\u{0628}]", Direction::Rtl),
        ("[123 '\u{2066}abc\u{2068}'\u{0628}]", Direction::Mixed),
        // \n resets the embedding level.
        ("[123 '\u{2066}abc\u{2068}'\n\u{0628}]", Direction::Rtl),
    ];

    for (input, expected) in cases {
        assert_eq!(get_base_direction_full(input), expected);

        let utf16 = to_utf16(input);
        assert_eq!(get_base_direction_full(utf16.as_slice()), expected);
    }
}
2288}
2289
/// Serde round-trip tests; compiled only when both the `serde` and the
/// `hardcoded-data` features are enabled.
#[cfg(all(feature = "serde", feature = "hardcoded-data", test))]
mod serde_tests {
    use super::*;
    use serde_test::{assert_tokens, Token};

    /// Checks that the resolved levels of a short mixed-direction text map to
    /// a sequence of `Level` newtype structs, each wrapping a `u8`.
    #[test]
    fn test_levels() {
        let text = "abc אבג";
        let info = BidiInfo::new(text, None);
        let levels = info.levels;
        assert_eq!(text.len(), 10);
        assert_eq!(levels.len(), 10);

        // One level per UTF-8 byte: four bytes at level 0, then six at level 1.
        let expected_numbers: [u8; 10] = [0, 0, 0, 0, 1, 1, 1, 1, 1, 1];

        let mut expected_tokens = vec![Token::Seq { len: Some(10) }];
        for &number in expected_numbers.iter() {
            expected_tokens.push(Token::NewtypeStruct { name: "Level" });
            expected_tokens.push(Token::U8(number));
        }
        expected_tokens.push(Token::SeqEnd);

        assert_tokens(&levels, &expected_tokens);
    }
}
2331