1// Copyright 2015 The Servo Project Developers. See the
2// COPYRIGHT file at the top-level directory of this distribution.
3//
4// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7// option. This file may not be copied, modified, or distributed
8// except according to those terms.
9
10//! This crate implements the [Unicode Bidirectional Algorithm][tr9] for display of mixed
11//! right-to-left and left-to-right text. It is written in safe Rust, compatible with the
12//! current stable release.
13//!
14//! ## Example
15//!
16//! ```rust
17//! # #[cfg(feature = "hardcoded-data")] {
18//! use unicode_bidi::BidiInfo;
19//!
20//! // This example text is defined using `concat!` because some browsers
21//! // and text editors have trouble displaying bidi strings.
22//! let text = concat![
23//! "א",
24//! "ב",
25//! "ג",
26//! "a",
27//! "b",
28//! "c",
29//! ];
30//!
31//! // Resolve embedding levels within the text. Pass `None` to detect the
32//! // paragraph level automatically.
33//! let bidi_info = BidiInfo::new(&text, None);
34//!
35//! // This paragraph has embedding level 1 because its first strong character is RTL.
36//! assert_eq!(bidi_info.paragraphs.len(), 1);
37//! let para = &bidi_info.paragraphs[0];
38//! assert_eq!(para.level.number(), 1);
39//! assert_eq!(para.level.is_rtl(), true);
40//!
41//! // Re-ordering is done after wrapping each paragraph into a sequence of
42//! // lines. For this example, I'll just use a single line that spans the
43//! // entire paragraph.
44//! let line = para.range.clone();
45//!
46//! let display = bidi_info.reorder_line(para, line);
47//! assert_eq!(display, concat![
48//! "a",
49//! "b",
50//! "c",
51//! "ג",
52//! "ב",
53//! "א",
54//! ]);
55//! # } // feature = "hardcoded-data"
56//! ```
57//!
58//! # Features
59//!
60//! - `std`: Enabled by default, but can be disabled to make `unicode_bidi`
61//! `#![no_std]` + `alloc` compatible.
62//! - `hardcoded-data`: Enabled by default. Includes hardcoded Unicode bidi data and more convenient APIs.
63//! - `serde`: Adds [`serde::Serialize`] and [`serde::Deserialize`]
64//! implementations to relevant types.
65//!
66//! [tr9]: <http://www.unicode.org/reports/tr9/>
67
68#![no_std]
69// We need to link to std to make doc tests work on older Rust versions
70#[cfg(feature = "std")]
71extern crate std;
72#[macro_use]
73extern crate alloc;
74
75pub mod data_source;
76pub mod deprecated;
77pub mod format_chars;
78pub mod level;
79pub mod utf16;
80
81mod char_data;
82mod explicit;
83mod implicit;
84mod prepare;
85
86pub use crate::char_data::{BidiClass, UNICODE_VERSION};
87pub use crate::data_source::BidiDataSource;
88pub use crate::level::{Level, LTR_LEVEL, RTL_LEVEL};
89pub use crate::prepare::LevelRun;
90
91#[cfg(feature = "hardcoded-data")]
92pub use crate::char_data::{bidi_class, HardcodedBidiData};
93
94use alloc::borrow::Cow;
95use alloc::string::String;
96use alloc::vec::Vec;
97use core::char;
98use core::cmp;
99use core::iter::repeat;
100use core::ops::Range;
101use core::str::CharIndices;
102
103use crate::format_chars as chars;
104use crate::BidiClass::*;
105
/// Trait that abstracts over a text source for use by the bidi algorithms.
///
/// We implement this for `str` (UTF-8) and for `[u16]` (UTF-16, native-endian).
/// (For internal unicode-bidi use; API may be unstable.)
///
/// All indices and lengths exchanged through this trait are expressed in code units
/// of the underlying text.
///
/// This trait is sealed (its supertrait is `private::Sealed`) and cannot be implemented
/// for types outside this crate.
pub trait TextSource<'text>: private::Sealed {
    /// Iterator type returned by `chars()`.
    type CharIter: Iterator<Item = char>;
    /// Iterator type returned by `char_indices()`; yields `(index, char)` pairs.
    type CharIndexIter: Iterator<Item = (usize, char)>;
    /// Iterator type returned by `indices_lengths()`; yields `(index, length)` pairs.
    type IndexLenIter: Iterator<Item = (usize, usize)>;

    /// Return the length of the text in code units.
    #[doc(hidden)]
    fn len(&self) -> usize;

    /// Get the character at a given code unit index, along with its length in code units.
    /// Returns None if index is out of range, or points inside a multi-code-unit character.
    /// Returns REPLACEMENT_CHARACTER for any unpaired surrogates in UTF-16.
    #[doc(hidden)]
    fn char_at(&self, index: usize) -> Option<(char, usize)>;

    /// Return a subrange of the text, indexed by code units.
    /// (We don't implement all of the Index trait, just the minimum we use.)
    #[doc(hidden)]
    fn subrange(&self, range: Range<usize>) -> &Self;

    /// An iterator over the text returning Unicode characters,
    /// REPLACEMENT_CHAR for invalid code units.
    #[doc(hidden)]
    fn chars(&'text self) -> Self::CharIter;

    /// An iterator over the text returning (index, char) tuples,
    /// where index is the starting code-unit index of the character,
    /// and char is its Unicode value (or REPLACEMENT_CHAR if invalid).
    #[doc(hidden)]
    fn char_indices(&'text self) -> Self::CharIndexIter;

    /// An iterator over the text returning (index, length) tuples,
    /// where index is the starting code-unit index of the character,
    /// and length is its length in code units.
    #[doc(hidden)]
    fn indices_lengths(&'text self) -> Self::IndexLenIter;

    /// Number of code units the given character uses in this text encoding.
    ///
    /// This is an associated function (no `self`): the answer depends only on the
    /// implementing type and the character.
    #[doc(hidden)]
    fn char_len(ch: char) -> usize;
}
151
/// Private module backing the sealed-trait pattern used by `TextSource`.
mod private {
    /// Supertrait required by `TextSource`. Because this module is private, code outside
    /// the crate cannot name this trait and therefore cannot implement `TextSource`.
    pub trait Sealed {}

    // Implement for str and [u16] only.
    impl Sealed for str {}
    impl Sealed for [u16] {}
}
159
/// Overall direction of a paragraph, as reported by `ParagraphBidiInfo::direction()`
/// from the paragraph's resolved levels.
///
/// This is a plain fieldless enum, so it is cheap to copy and compare.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum Direction {
    /// Left-to-right.
    Ltr,
    /// Right-to-left.
    Rtl,
    /// Neither purely left-to-right nor purely right-to-left.
    Mixed,
}
166
/// Bidi information about a single paragraph.
#[derive(Clone, Debug, PartialEq)]
pub struct ParagraphInfo {
    /// The paragraph's boundaries within the text, as code-unit indices
    /// (byte indices when the text is a `str`).
    ///
    /// TODO: Shrink this to only include the starting index?
    pub range: Range<usize>,

    /// The paragraph embedding level.
    ///
    /// <http://www.unicode.org/reports/tr9/#BD4>
    pub level: Level,
}
180
181impl ParagraphInfo {
182 /// Gets the length of the paragraph in the source text.
183 pub fn len(&self) -> usize {
184 self.range.end - self.range.start
185 }
186}
187
/// Initial bidi information of the text.
///
/// Contains the text paragraphs and `BidiClass` of its characters.
#[derive(PartialEq, Debug)]
pub struct InitialInfo<'text> {
    /// The text that was analysed.
    pub text: &'text str,

    /// The BidiClass of the character at each byte in the text.
    /// If a character is multiple bytes, its class will appear multiple times in the vector,
    /// so this vector has one entry per byte of `text`.
    pub original_classes: Vec<BidiClass>,

    /// The boundaries and level of each paragraph within the text.
    pub paragraphs: Vec<ParagraphInfo>,
}
203
impl<'text> InitialInfo<'text> {
    /// Find the paragraphs and BidiClasses in a string of text.
    ///
    /// <http://www.unicode.org/reports/tr9/#The_Paragraph_Level>
    ///
    /// Also sets the class for each First Strong Isolate initiator (FSI) to LRI or RLI if a strong
    /// character is found before the matching PDI. If no strong character is found, the class will
    /// remain FSI, and it's up to later stages to treat these as LRI when needed.
    ///
    /// When `default_para_level` is `Some`, that level is used for every paragraph; pass `None`
    /// to have each paragraph's level detected from its text.
    ///
    /// The `hardcoded-data` Cargo feature (enabled by default) must be enabled to use this.
    #[cfg_attr(feature = "flame_it", flamer::flame)]
    #[cfg(feature = "hardcoded-data")]
    pub fn new(text: &str, default_para_level: Option<Level>) -> InitialInfo<'_> {
        Self::new_with_data_source(&HardcodedBidiData, text, default_para_level)
    }

    /// Find the paragraphs and BidiClasses in a string of text, with a custom [`BidiDataSource`]
    /// for Bidi data. If you just wish to use the hardcoded Bidi data, please use [`InitialInfo::new()`]
    /// instead (enabled with the default `hardcoded-data` Cargo feature).
    ///
    /// <http://www.unicode.org/reports/tr9/#The_Paragraph_Level>
    ///
    /// Also sets the class for each First Strong Isolate initiator (FSI) to LRI or RLI if a strong
    /// character is found before the matching PDI. If no strong character is found, the class will
    /// remain FSI, and it's up to later stages to treat these as LRI when needed.
    ///
    /// When `default_para_level` is `Some`, that level is used for every paragraph; pass `None`
    /// to have each paragraph's level detected from its text.
    #[cfg_attr(feature = "flame_it", flamer::flame)]
    pub fn new_with_data_source<'a, D: BidiDataSource>(
        data_source: &D,
        text: &'a str,
        default_para_level: Option<Level>,
    ) -> InitialInfo<'a> {
        // The extended computation also tracks per-paragraph "pure LTR" flags; the public
        // type only exposes the base information, so the extra data is dropped here.
        InitialInfoExt::new_with_data_source(data_source, text, default_para_level).base
    }
}
238
/// Extended version of InitialInfo (not public API).
///
/// Wraps the public `InitialInfo` together with extra per-paragraph data that is only
/// needed internally while computing levels.
#[derive(PartialEq, Debug)]
struct InitialInfoExt<'text> {
    /// The base InitialInfo for the text, recording its paragraphs and bidi classes.
    base: InitialInfo<'text>,

    /// Parallel to base.paragraphs, records whether each paragraph is "pure LTR" that
    /// requires no further bidi processing (i.e. there are no RTL characters or bidi
    /// control codes present).
    pure_ltr: Vec<bool>,
}
250
impl<'text> InitialInfoExt<'text> {
    /// Find the paragraphs and BidiClasses in a string of text, with a custom [`BidiDataSource`]
    /// for Bidi data. If you just wish to use the hardcoded Bidi data, please use [`InitialInfo::new()`]
    /// instead (enabled with the default `hardcoded-data` Cargo feature).
    ///
    /// <http://www.unicode.org/reports/tr9/#The_Paragraph_Level>
    ///
    /// Also sets the class for each First Strong Isolate initiator (FSI) to LRI or RLI if a strong
    /// character is found before the matching PDI. If no strong character is found, the class will
    /// remain FSI, and it's up to later stages to treat these as LRI when needed.
    ///
    /// In addition to the base information, the result records one "pure LTR" flag per paragraph.
    #[cfg_attr(feature = "flame_it", flamer::flame)]
    pub fn new_with_data_source<'a, D: BidiDataSource>(
        data_source: &D,
        text: &'a str,
        default_para_level: Option<Level>,
    ) -> InitialInfoExt<'a> {
        // Output vectors filled in by `compute_initial_info`; passing them requests that the
        // text be split into paragraphs.
        let mut paragraphs = Vec::<ParagraphInfo>::new();
        let mut pure_ltr = Vec::<bool>::new();
        // The last paragraph's level and pure-LTR flag are also returned directly, but they are
        // already recorded in the two vectors above, so they are ignored here.
        let (original_classes, _, _) = compute_initial_info(
            data_source,
            text,
            default_para_level,
            Some((&mut paragraphs, &mut pure_ltr)),
        );

        InitialInfoExt {
            base: InitialInfo {
                text,
                original_classes,
                paragraphs,
            },
            pure_ltr,
        }
    }
}
286
287/// Implementation of initial-info computation for both BidiInfo and ParagraphBidiInfo.
288/// To treat the text as (potentially) multiple paragraphs, the caller should pass the
289/// pair of optional outparam arrays to receive the ParagraphInfo and pure-ltr flags
290/// for each paragraph. Passing None for split_paragraphs will ignore any paragraph-
291/// separator characters in the text, treating it just as a single paragraph.
292/// Returns the array of BidiClass values for each code unit of the text, along with
293/// the embedding level and pure-ltr flag for the *last* (or only) paragraph.
294fn compute_initial_info<'a, D: BidiDataSource, T: TextSource<'a> + ?Sized>(
295 data_source: &D,
296 text: &'a T,
297 default_para_level: Option<Level>,
298 mut split_paragraphs: Option<(&mut Vec<ParagraphInfo>, &mut Vec<bool>)>,
299) -> (Vec<BidiClass>, Level, bool) {
300 let mut original_classes = Vec::with_capacity(text.len());
301
302 // The stack contains the starting code unit index for each nested isolate we're inside.
303 let mut isolate_stack = Vec::new();
304
305 debug_assert!(
306 if let Some((ref paragraphs, ref pure_ltr)) = split_paragraphs {
307 paragraphs.is_empty() && pure_ltr.is_empty()
308 } else {
309 true
310 }
311 );
312
313 let mut para_start = 0;
314 let mut para_level = default_para_level;
315
316 // Per-paragraph flag: can subsequent processing be skipped? Set to false if any
317 // RTL characters or bidi control characters are encountered in the paragraph.
318 let mut is_pure_ltr = true;
319
320 #[cfg(feature = "flame_it")]
321 flame::start("compute_initial_info(): iter text.char_indices()");
322
323 for (i, c) in text.char_indices() {
324 let class = data_source.bidi_class(c);
325
326 #[cfg(feature = "flame_it")]
327 flame::start("original_classes.extend()");
328
329 let len = T::char_len(c);
330 original_classes.extend(repeat(class).take(len));
331
332 #[cfg(feature = "flame_it")]
333 flame::end("original_classes.extend()");
334
335 match class {
336 B => {
337 if let Some((ref mut paragraphs, ref mut pure_ltr)) = split_paragraphs {
338 // P1. Split the text into separate paragraphs. The paragraph separator is kept
339 // with the previous paragraph.
340 let para_end = i + len;
341 paragraphs.push(ParagraphInfo {
342 range: para_start..para_end,
343 // P3. If no character is found in p2, set the paragraph level to zero.
344 level: para_level.unwrap_or(LTR_LEVEL),
345 });
346 pure_ltr.push(is_pure_ltr);
347 // Reset state for the start of the next paragraph.
348 para_start = para_end;
349 // TODO: Support defaulting to direction of previous paragraph
350 //
351 // <http://www.unicode.org/reports/tr9/#HL1>
352 para_level = default_para_level;
353 is_pure_ltr = true;
354 isolate_stack.clear();
355 }
356 }
357
358 L | R | AL => {
359 if class != L {
360 is_pure_ltr = false;
361 }
362 match isolate_stack.last() {
363 Some(&start) => {
364 if original_classes[start] == FSI {
365 // X5c. If the first strong character between FSI and its matching
366 // PDI is R or AL, treat it as RLI. Otherwise, treat it as LRI.
367 for j in 0..T::char_len(chars::FSI) {
368 original_classes[start + j] = if class == L { LRI } else { RLI };
369 }
370 }
371 }
372
373 None => {
374 if para_level.is_none() {
375 // P2. Find the first character of type L, AL, or R, while skipping
376 // any characters between an isolate initiator and its matching
377 // PDI.
378 para_level = Some(if class != L { RTL_LEVEL } else { LTR_LEVEL });
379 }
380 }
381 }
382 }
383
384 AN | LRE | RLE | LRO | RLO => {
385 is_pure_ltr = false;
386 }
387
388 RLI | LRI | FSI => {
389 is_pure_ltr = false;
390 isolate_stack.push(i);
391 }
392
393 PDI => {
394 isolate_stack.pop();
395 }
396
397 _ => {}
398 }
399 }
400
401 if let Some((paragraphs, pure_ltr)) = split_paragraphs {
402 if para_start < text.len() {
403 paragraphs.push(ParagraphInfo {
404 range: para_start..text.len(),
405 level: para_level.unwrap_or(LTR_LEVEL),
406 });
407 pure_ltr.push(is_pure_ltr);
408 }
409 debug_assert_eq!(paragraphs.len(), pure_ltr.len());
410 }
411 debug_assert_eq!(original_classes.len(), text.len());
412
413 #[cfg(feature = "flame_it")]
414 flame::end("compute_initial_info(): iter text.char_indices()");
415
416 (
417 original_classes,
418 para_level.unwrap_or(LTR_LEVEL),
419 is_pure_ltr,
420 )
421}
422
/// Bidi information of the text.
///
/// The `original_classes` and `levels` vectors are indexed by byte offsets into the text. If a
/// character is multiple bytes wide, then its class and level will appear multiple times in these
/// vectors.
// TODO: Impl `struct StringProperty<T> { values: Vec<T> }` and use instead of Vec<T>
#[derive(Debug, PartialEq)]
pub struct BidiInfo<'text> {
    /// The text that was analysed.
    pub text: &'text str,

    /// The BidiClass of the character at each byte in the text.
    pub original_classes: Vec<BidiClass>,

    /// The directional embedding level of each byte in the text.
    pub levels: Vec<Level>,

    /// The boundaries and paragraph embedding level of each paragraph within the text.
    ///
    /// TODO: Use SmallVec or similar to avoid overhead when there are only one or two paragraphs?
    /// Or just don't include the first paragraph, which always starts at 0?
    pub paragraphs: Vec<ParagraphInfo>,
}
446
impl<'text> BidiInfo<'text> {
    /// Split the text into paragraphs and determine the bidi embedding levels for each paragraph.
    ///
    /// Pass `None` as `default_para_level` to detect each paragraph's level automatically.
    ///
    /// The `hardcoded-data` Cargo feature (enabled by default) must be enabled to use this.
    ///
    /// TODO: In early steps, check for special cases that allow later steps to be skipped. like
    /// text that is entirely LTR. See the `nsBidi` class from Gecko for comparison.
    ///
    /// TODO: Support auto-RTL base direction
    #[cfg_attr(feature = "flame_it", flamer::flame)]
    #[cfg(feature = "hardcoded-data")]
    #[inline]
    pub fn new(text: &str, default_para_level: Option<Level>) -> BidiInfo<'_> {
        Self::new_with_data_source(&HardcodedBidiData, text, default_para_level)
    }

    /// Split the text into paragraphs and determine the bidi embedding levels for each paragraph, with a custom [`BidiDataSource`]
    /// for Bidi data. If you just wish to use the hardcoded Bidi data, please use [`BidiInfo::new()`]
    /// instead (enabled with the default `hardcoded-data` Cargo feature).
    ///
    /// TODO: In early steps, check for special cases that allow later steps to be skipped. like
    /// text that is entirely LTR. See the `nsBidi` class from Gecko for comparison.
    ///
    /// TODO: Support auto-RTL base direction
    #[cfg_attr(feature = "flame_it", flamer::flame)]
    pub fn new_with_data_source<'a, D: BidiDataSource>(
        data_source: &D,
        text: &'a str,
        default_para_level: Option<Level>,
    ) -> BidiInfo<'a> {
        // First pass: paragraph boundaries, per-byte classes and per-paragraph pure-LTR flags.
        let InitialInfoExt { base, pure_ltr, .. } =
            InitialInfoExt::new_with_data_source(data_source, text, default_para_level);

        let mut levels = Vec::<Level>::with_capacity(text.len());
        // Scratch copy handed mutably to the per-paragraph processing; `base.original_classes`
        // itself is kept untouched because it is returned in the result.
        let mut processing_classes = base.original_classes.clone();

        for (para, is_pure_ltr) in base.paragraphs.iter().zip(pure_ltr.iter()) {
            // Restrict the text and classes to the current paragraph (shadows the outer `text`
            // only inside this loop body).
            let text = &text[para.range.clone()];
            let original_classes = &base.original_classes[para.range.clone()];

            compute_bidi_info_for_para(
                data_source,
                para,
                *is_pure_ltr,
                text,
                original_classes,
                &mut processing_classes,
                &mut levels,
            );
        }

        BidiInfo {
            text,
            original_classes: base.original_classes,
            paragraphs: base.paragraphs,
            levels,
        }
    }

    /// Produce the levels for this paragraph as needed for reordering, one level per *byte*
    /// in the paragraph. The returned vector includes bytes that are not included
    /// in the `line`, but will not adjust them.
    ///
    /// This runs [Rule L1], you can run
    /// [Rule L2] by calling [`Self::reorder_visual()`].
    /// If doing so, you may prefer to use [`Self::reordered_levels_per_char()`] instead
    /// to avoid non-byte indices.
    ///
    /// For an all-in-one reordering solution, consider using [`Self::reorder_line()`].
    ///
    /// # Panics
    ///
    /// Panics if `line.start` or `line.end` is greater than the number of levels.
    ///
    /// [Rule L1]: https://www.unicode.org/reports/tr9/#L1
    /// [Rule L2]: https://www.unicode.org/reports/tr9/#L2
    #[cfg_attr(feature = "flame_it", flamer::flame)]
    pub fn reordered_levels(&self, para: &ParagraphInfo, line: Range<usize>) -> Vec<Level> {
        assert!(line.start <= self.levels.len());
        assert!(line.end <= self.levels.len());

        // Work on a copy so the stored levels stay unchanged; only the `line` part of the
        // copy is handed to `reorder_levels` for adjustment.
        let mut levels = self.levels.clone();
        let line_classes = &self.original_classes[line.clone()];
        let line_levels = &mut levels[line.clone()];

        reorder_levels(
            line_classes,
            line_levels,
            self.text.subrange(line),
            para.level,
        );

        levels
    }

    /// Produce the levels as needed for reordering, one level per *character*.
    /// The returned vector includes characters that are not included
    /// in the `line`, but will not adjust them.
    ///
    /// This runs [Rule L1], you can run
    /// [Rule L2] by calling [`Self::reorder_visual()`] on the result; because there is exactly
    /// one level per character, the resulting index map refers to characters rather than bytes.
    ///
    /// For an all-in-one reordering solution, consider using [`Self::reorder_line()`].
    ///
    /// # Panics
    ///
    /// Panics under the same conditions as [`Self::reordered_levels()`].
    ///
    /// [Rule L1]: https://www.unicode.org/reports/tr9/#L1
    /// [Rule L2]: https://www.unicode.org/reports/tr9/#L2
    #[cfg_attr(feature = "flame_it", flamer::flame)]
    pub fn reordered_levels_per_char(
        &self,
        para: &ParagraphInfo,
        line: Range<usize>,
    ) -> Vec<Level> {
        let levels = self.reordered_levels(para, line);
        // Keep only the level stored at the first byte of each character.
        self.text.char_indices().map(|(i, _)| levels[i]).collect()
    }

    /// Re-order a line based on resolved levels and return the line in display order.
    ///
    /// The line is returned borrowed, without allocating, when it contains no RTL levels.
    ///
    /// This does not apply [Rule L3] or [Rule L4] around combining characters or mirroring.
    ///
    /// [Rule L3]: https://www.unicode.org/reports/tr9/#L3
    /// [Rule L4]: https://www.unicode.org/reports/tr9/#L4
    #[cfg_attr(feature = "flame_it", flamer::flame)]
    pub fn reorder_line(&self, para: &ParagraphInfo, line: Range<usize>) -> Cow<'text, str> {
        // Fast path: nothing to reorder when the line has no RTL level at all.
        if !level::has_rtl(&self.levels[line.clone()]) {
            return self.text[line].into();
        }
        let (levels, runs) = self.visual_runs(para, line.clone());
        reorder_line(self.text, line, levels, runs)
    }

    /// Reorders pre-calculated levels of a sequence of characters.
    ///
    /// NOTE: This is a convenience method that does not use a `Paragraph` object. It is
    /// intended to be used when an application has determined the levels of the objects (character sequences)
    /// and just needs to have them reordered.
    ///
    /// The index map will result in `indexMap[visualIndex]==logicalIndex`.
    ///
    /// This only runs [Rule L2](http://www.unicode.org/reports/tr9/#L2) as it does not have
    /// information about the actual text.
    ///
    /// Furthermore, if `levels` is an array that is aligned with code units, bytes within a codepoint may be
    /// reversed. You may need to fix up the map to deal with this. Alternatively, only pass in arrays where each `Level`
    /// is for a single code point.
    ///
    ///
    /// # Example
    /// ```
    /// use unicode_bidi::BidiInfo;
    /// use unicode_bidi::Level;
    ///
    /// let l0 = Level::from(0);
    /// let l1 = Level::from(1);
    /// let l2 = Level::from(2);
    ///
    /// let levels = vec![l0, l0, l0, l0];
    /// let index_map = BidiInfo::reorder_visual(&levels);
    /// assert_eq!(levels.len(), index_map.len());
    /// assert_eq!(index_map, [0, 1, 2, 3]);
    ///
    /// let levels: Vec<Level> = vec![l0, l0, l0, l1, l1, l1, l2, l2];
    /// let index_map = BidiInfo::reorder_visual(&levels);
    /// assert_eq!(levels.len(), index_map.len());
    /// assert_eq!(index_map, [0, 1, 2, 6, 7, 5, 4, 3]);
    /// ```
    #[cfg_attr(feature = "flame_it", flamer::flame)]
    #[inline]
    pub fn reorder_visual(levels: &[Level]) -> Vec<usize> {
        reorder_visual(levels)
    }

    /// Find the level runs within a line and return them in visual order.
    ///
    /// `line` is a range of byte indices within `levels`.
    ///
    /// The first return value is a vector of levels used by the reordering algorithm,
    /// i.e. the result of [Rule L1]. The second return value is a vector of level runs,
    /// the result of [Rule L2], showing the visual order that each level run (a run of text with the
    /// same level) should be displayed. Within each run, the display order can be checked
    /// against the Level vector.
    ///
    /// This does not handle [Rule L3] (combining characters) or [Rule L4] (mirroring),
    /// as that should be handled by the engine using this API.
    ///
    /// Conceptually, this is the same as running [`Self::reordered_levels()`] followed by
    /// [`Self::reorder_visual()`], however it returns the result as a list of level runs instead
    /// of producing a level map, since one may wish to deal with the fact that this is operating on
    /// byte rather than character indices.
    ///
    /// <http://www.unicode.org/reports/tr9/#Reordering_Resolved_Levels>
    ///
    /// [Rule L1]: https://www.unicode.org/reports/tr9/#L1
    /// [Rule L2]: https://www.unicode.org/reports/tr9/#L2
    /// [Rule L3]: https://www.unicode.org/reports/tr9/#L3
    /// [Rule L4]: https://www.unicode.org/reports/tr9/#L4
    #[cfg_attr(feature = "flame_it", flamer::flame)]
    #[inline]
    pub fn visual_runs(
        &self,
        para: &ParagraphInfo,
        line: Range<usize>,
    ) -> (Vec<Level>, Vec<LevelRun>) {
        let levels = self.reordered_levels(para, line.clone());
        visual_runs_for_line(levels, &line)
    }

    /// If processed text has any computed RTL levels
    ///
    /// This information is usually used to skip re-ordering of text when no RTL level is present
    #[inline]
    pub fn has_rtl(&self) -> bool {
        level::has_rtl(&self.levels)
    }
}
661
/// Bidi information of text treated as a single paragraph.
///
/// The `original_classes` and `levels` vectors are indexed by byte offsets into the text. If a
/// character is multiple bytes wide, then its class and level will appear multiple times in these
/// vectors.
#[derive(Debug, PartialEq)]
pub struct ParagraphBidiInfo<'text> {
    /// The text that was analysed.
    pub text: &'text str,

    /// The BidiClass of the character at each byte in the text.
    pub original_classes: Vec<BidiClass>,

    /// The directional embedding level of each byte in the text.
    pub levels: Vec<Level>,

    /// The paragraph embedding level.
    pub paragraph_level: Level,

    /// Whether the paragraph is purely LTR, i.e. no RTL characters or bidi control
    /// characters were encountered while classifying the text.
    pub is_pure_ltr: bool,
}
684
685impl<'text> ParagraphBidiInfo<'text> {
686 /// Determine the bidi embedding level.
687 ///
688 ///
689 /// The `hardcoded-data` Cargo feature (enabled by default) must be enabled to use this.
690 ///
691 /// TODO: In early steps, check for special cases that allow later steps to be skipped. like
692 /// text that is entirely LTR. See the `nsBidi` class from Gecko for comparison.
693 ///
694 /// TODO: Support auto-RTL base direction
695 #[cfg_attr(feature = "flame_it", flamer::flame)]
696 #[cfg(feature = "hardcoded-data")]
697 #[inline]
698 pub fn new(text: &str, default_para_level: Option<Level>) -> ParagraphBidiInfo<'_> {
699 Self::new_with_data_source(&HardcodedBidiData, text, default_para_level)
700 }
701
702 /// Determine the bidi embedding level, with a custom [`BidiDataSource`]
703 /// for Bidi data. If you just wish to use the hardcoded Bidi data, please use [`BidiInfo::new()`]
704 /// instead (enabled with tbe default `hardcoded-data` Cargo feature).
705 ///
706 /// (This is the single-paragraph equivalent of BidiInfo::new_with_data_source,
707 /// and should be kept in sync with it.
708 #[cfg_attr(feature = "flame_it", flamer::flame)]
709 pub fn new_with_data_source<'a, D: BidiDataSource>(
710 data_source: &D,
711 text: &'a str,
712 default_para_level: Option<Level>,
713 ) -> ParagraphBidiInfo<'a> {
714 // Here we could create a ParagraphInitialInfo struct to parallel the one
715 // used by BidiInfo, but there doesn't seem any compelling reason for it.
716 let (original_classes, paragraph_level, is_pure_ltr) =
717 compute_initial_info(data_source, text, default_para_level, None);
718
719 let mut levels = Vec::<Level>::with_capacity(text.len());
720 let mut processing_classes = original_classes.clone();
721
722 let para_info = ParagraphInfo {
723 range: Range {
724 start: 0,
725 end: text.len(),
726 },
727 level: paragraph_level,
728 };
729
730 compute_bidi_info_for_para(
731 data_source,
732 &para_info,
733 is_pure_ltr,
734 text,
735 &original_classes,
736 &mut processing_classes,
737 &mut levels,
738 );
739
740 ParagraphBidiInfo {
741 text,
742 original_classes,
743 levels,
744 paragraph_level,
745 is_pure_ltr,
746 }
747 }
748
749 /// Produce the levels for this paragraph as needed for reordering, one level per *byte*
750 /// in the paragraph. The returned vector includes bytes that are not included
751 /// in the `line`, but will not adjust them.
752 ///
753 /// See BidiInfo::reordered_levels for details.
754 ///
755 /// (This should be kept in sync with BidiInfo::reordered_levels.)
756 #[cfg_attr(feature = "flame_it", flamer::flame)]
757 pub fn reordered_levels(&self, line: Range<usize>) -> Vec<Level> {
758 assert!(line.start <= self.levels.len());
759 assert!(line.end <= self.levels.len());
760
761 let mut levels = self.levels.clone();
762 let line_classes = &self.original_classes[line.clone()];
763 let line_levels = &mut levels[line.clone()];
764
765 reorder_levels(
766 line_classes,
767 line_levels,
768 self.text.subrange(line),
769 self.paragraph_level,
770 );
771
772 levels
773 }
774
775 /// Produce the levels for this paragraph as needed for reordering, one level per *character*
776 /// in the paragraph. The returned vector includes characters that are not included
777 /// in the `line`, but will not adjust them.
778 ///
779 /// See BidiInfo::reordered_levels_per_char for details.
780 ///
781 /// (This should be kept in sync with BidiInfo::reordered_levels_per_char.)
782 #[cfg_attr(feature = "flame_it", flamer::flame)]
783 pub fn reordered_levels_per_char(&self, line: Range<usize>) -> Vec<Level> {
784 let levels = self.reordered_levels(line);
785 self.text.char_indices().map(|(i, _)| levels[i]).collect()
786 }
787
788 /// Re-order a line based on resolved levels and return the line in display order.
789 ///
790 /// See BidiInfo::reorder_line for details.
791 ///
792 /// (This should be kept in sync with BidiInfo::reorder_line.)
793 #[cfg_attr(feature = "flame_it", flamer::flame)]
794 pub fn reorder_line(&self, line: Range<usize>) -> Cow<'text, str> {
795 if !level::has_rtl(&self.levels[line.clone()]) {
796 return self.text[line].into();
797 }
798
799 let (levels, runs) = self.visual_runs(line.clone());
800
801 reorder_line(self.text, line, levels, runs)
802 }
803
804 /// Reorders pre-calculated levels of a sequence of characters.
805 ///
806 /// See BidiInfo::reorder_visual for details.
807 #[cfg_attr(feature = "flame_it", flamer::flame)]
808 #[inline]
809 pub fn reorder_visual(levels: &[Level]) -> Vec<usize> {
810 reorder_visual(levels)
811 }
812
813 /// Find the level runs within a line and return them in visual order.
814 ///
815 /// `line` is a range of bytes indices within `levels`.
816 ///
817 /// See BidiInfo::visual_runs for details.
818 ///
819 /// (This should be kept in sync with BidiInfo::visual_runs.)
820 #[cfg_attr(feature = "flame_it", flamer::flame)]
821 #[inline]
822 pub fn visual_runs(&self, line: Range<usize>) -> (Vec<Level>, Vec<LevelRun>) {
823 let levels = self.reordered_levels(line.clone());
824 visual_runs_for_line(levels, &line)
825 }
826
827 /// If processed text has any computed RTL levels
828 ///
829 /// This information is usually used to skip re-ordering of text when no RTL level is present
830 #[inline]
831 pub fn has_rtl(&self) -> bool {
832 !self.is_pure_ltr
833 }
834
835 /// Return the paragraph's Direction (Ltr, Rtl, or Mixed) based on its levels.
836 #[inline]
837 pub fn direction(&self) -> Direction {
838 para_direction(&self.levels)
839 }
840}
841
842/// Return a line of the text in display order based on resolved levels.
843///
844/// `text` the full text passed to the `BidiInfo` or `ParagraphBidiInfo` for analysis
845/// `line` a range of byte indices within `text` corresponding to one line
846/// `levels` array of `Level` values, with `line`'s levels reordered into visual order
847/// `runs` array of `LevelRun`s in visual order
848///
849/// (`levels` and `runs` are the result of calling `BidiInfo::visual_runs()` or
850/// `ParagraphBidiInfo::visual_runs()` for the line of interest.)
851///
852/// Returns: the reordered text of the line.
853///
854/// This does not apply [Rule L3] or [Rule L4] around combining characters or mirroring.
855///
856/// [Rule L3]: https://www.unicode.org/reports/tr9/#L3
857/// [Rule L4]: https://www.unicode.org/reports/tr9/#L4
858fn reorder_line<'text>(
859 text: &'text str,
860 line: Range<usize>,
861 levels: Vec<Level>,
862 runs: Vec<LevelRun>,
863) -> Cow<'text, str> {
864 // If all isolating run sequences are LTR, no reordering is needed
865 if runs.iter().all(|run: &Range| levels[run.start].is_ltr()) {
866 return text[line].into();
867 }
868
869 let mut result: String = String::with_capacity(line.len());
870 for run: Range in runs {
871 if levels[run.start].is_rtl() {
872 result.extend(iter:text[run].chars().rev());
873 } else {
874 result.push_str(&text[run]);
875 }
876 }
877 result.into()
878}
879
880/// Find the level runs within a line and return them in visual order.
881///
882/// `line` is a range of code-unit indices within `levels`.
883///
884/// The first return value is a vector of levels used by the reordering algorithm,
885/// i.e. the result of [Rule L1]. The second return value is a vector of level runs,
886/// the result of [Rule L2], showing the visual order that each level run (a run of text with the
887/// same level) should be displayed. Within each run, the display order can be checked
888/// against the Level vector.
889///
890/// This does not handle [Rule L3] (combining characters) or [Rule L4] (mirroring),
891/// as that should be handled by the engine using this API.
892///
893/// Conceptually, this is the same as running [`reordered_levels()`] followed by
894/// [`reorder_visual()`], however it returns the result as a list of level runs instead
895/// of producing a level map, since one may wish to deal with the fact that this is operating on
896/// byte rather than character indices.
897///
898/// <http://www.unicode.org/reports/tr9/#Reordering_Resolved_Levels>
899///
900/// [Rule L1]: https://www.unicode.org/reports/tr9/#L1
901/// [Rule L2]: https://www.unicode.org/reports/tr9/#L2
902/// [Rule L3]: https://www.unicode.org/reports/tr9/#L3
903/// [Rule L4]: https://www.unicode.org/reports/tr9/#L4
904fn visual_runs_for_line(levels: Vec<Level>, line: &Range<usize>) -> (Vec<Level>, Vec<LevelRun>) {
905 // Find consecutive level runs.
906 let mut runs = Vec::new();
907 let mut start = line.start;
908 let mut run_level = levels[start];
909 let mut min_level = run_level;
910 let mut max_level = run_level;
911
912 for (i, &new_level) in levels.iter().enumerate().take(line.end).skip(start + 1) {
913 if new_level != run_level {
914 // End of the previous run, start of a new one.
915 runs.push(start..i);
916 start = i;
917 run_level = new_level;
918 min_level = cmp::min(run_level, min_level);
919 max_level = cmp::max(run_level, max_level);
920 }
921 }
922 runs.push(start..line.end);
923
924 let run_count = runs.len();
925
926 // Re-order the odd runs.
927 // <http://www.unicode.org/reports/tr9/#L2>
928
929 // Stop at the lowest *odd* level.
930 min_level = min_level.new_lowest_ge_rtl().expect("Level error");
931 // This loop goes through contiguous chunks of level runs that have a level
932 // ≥ max_level and reverses their contents, reducing max_level by 1 each time.
933 while max_level >= min_level {
934 // Look for the start of a sequence of consecutive runs of max_level or higher.
935 let mut seq_start = 0;
936 while seq_start < run_count {
937 if levels[runs[seq_start].start] < max_level {
938 seq_start += 1;
939 continue;
940 }
941
942 // Found the start of a sequence. Now find the end.
943 let mut seq_end = seq_start + 1;
944 while seq_end < run_count {
945 if levels[runs[seq_end].start] < max_level {
946 break;
947 }
948 seq_end += 1;
949 }
950 // Reverse the runs within this sequence.
951 runs[seq_start..seq_end].reverse();
952
953 seq_start = seq_end;
954 }
955 max_level
956 .lower(1)
957 .expect("Lowering embedding level below zero");
958 }
959 (levels, runs)
960}
961
962/// Reorders pre-calculated levels of a sequence of characters.
963///
964/// NOTE: This is a convenience method that does not use a `Paragraph` object. It is
965/// intended to be used when an application has determined the levels of the objects (character sequences)
966/// and just needs to have them reordered.
967///
968/// the index map will result in `indexMap[visualIndex]==logicalIndex`.
969///
970/// This only runs [Rule L2](http://www.unicode.org/reports/tr9/#L2) as it does not have
971/// information about the actual text.
972///
973/// Furthermore, if `levels` is an array that is aligned with code units, bytes within a codepoint may be
974/// reversed. You may need to fix up the map to deal with this. Alternatively, only pass in arrays where each `Level`
975/// is for a single code point.
976fn reorder_visual(levels: &[Level]) -> Vec<usize> {
977 // Gets the next range of characters after start_index with a level greater
978 // than or equal to `max`
979 fn next_range(levels: &[level::Level], mut start_index: usize, max: Level) -> Range<usize> {
980 if levels.is_empty() || start_index >= levels.len() {
981 return start_index..start_index;
982 }
983 while let Some(l) = levels.get(start_index) {
984 if *l >= max {
985 break;
986 }
987 start_index += 1;
988 }
989
990 if levels.get(start_index).is_none() {
991 // If at the end of the array, adding one will
992 // produce an out-of-range end element
993 return start_index..start_index;
994 }
995
996 let mut end_index = start_index + 1;
997 while let Some(l) = levels.get(end_index) {
998 if *l < max {
999 return start_index..end_index;
1000 }
1001 end_index += 1;
1002 }
1003
1004 start_index..end_index
1005 }
1006
1007 // This implementation is similar to the L2 implementation in `visual_runs()`
1008 // but it cannot benefit from a precalculated LevelRun vector so needs to be different.
1009
1010 if levels.is_empty() {
1011 return vec![];
1012 }
1013
1014 // Get the min and max levels
1015 let (mut min, mut max) = levels
1016 .iter()
1017 .fold((levels[0], levels[0]), |(min, max), &l| {
1018 (cmp::min(min, l), cmp::max(max, l))
1019 });
1020
1021 // Initialize an index map
1022 let mut result: Vec<usize> = (0..levels.len()).collect();
1023
1024 if min == max && min.is_ltr() {
1025 // Everything is LTR and at the same level, do nothing
1026 return result;
1027 }
1028
1029 // Stop at the lowest *odd* level, since everything below that
1030 // is LTR and does not need further reordering
1031 min = min.new_lowest_ge_rtl().expect("Level error");
1032
1033 // For each max level, take all contiguous chunks of
1034 // levels ≥ max and reverse them
1035 //
1036 // We can do this check with the original levels instead of checking reorderings because all
1037 // prior reorderings will have been for contiguous chunks of levels >> max, which will
1038 // be a subset of these chunks anyway.
1039 while min <= max {
1040 let mut range = 0..0;
1041 loop {
1042 range = next_range(levels, range.end, max);
1043 result[range.clone()].reverse();
1044
1045 if range.end >= levels.len() {
1046 break;
1047 }
1048 }
1049
1050 max.lower(1).expect("Level error");
1051 }
1052
1053 result
1054}
1055
1056/// The core of BidiInfo initialization, factored out into a function that both
1057/// the utf-8 and utf-16 versions of BidiInfo can use.
1058fn compute_bidi_info_for_para<'a, D: BidiDataSource, T: TextSource<'a> + ?Sized>(
1059 data_source: &D,
1060 para: &ParagraphInfo,
1061 is_pure_ltr: bool,
1062 text: &'a T,
1063 original_classes: &[BidiClass],
1064 processing_classes: &mut [BidiClass],
1065 levels: &mut Vec<Level>,
1066) {
1067 let new_len = levels.len() + para.range.len();
1068 levels.resize(new_len, para.level);
1069 if para.level == LTR_LEVEL && is_pure_ltr {
1070 return;
1071 }
1072
1073 let processing_classes = &mut processing_classes[para.range.clone()];
1074 let levels = &mut levels[para.range.clone()];
1075
1076 explicit::compute(
1077 text,
1078 para.level,
1079 original_classes,
1080 levels,
1081 processing_classes,
1082 );
1083
1084 let sequences = prepare::isolating_run_sequences(para.level, original_classes, levels);
1085 for sequence in &sequences {
1086 implicit::resolve_weak(text, sequence, processing_classes);
1087 implicit::resolve_neutral(
1088 text,
1089 data_source,
1090 sequence,
1091 levels,
1092 original_classes,
1093 processing_classes,
1094 );
1095 }
1096 implicit::resolve_levels(processing_classes, levels);
1097
1098 assign_levels_to_removed_chars(para.level, original_classes, levels);
1099}
1100
1101/// Produce the levels for this paragraph as needed for reordering, one level per *code unit*
1102/// in the paragraph. The returned vector includes code units that are not included
1103/// in the `line`, but will not adjust them.
1104///
1105/// This runs [Rule L1]
1106///
1107/// [Rule L1]: https://www.unicode.org/reports/tr9/#L1
1108fn reorder_levels<'a, T: TextSource<'a> + ?Sized>(
1109 line_classes: &[BidiClass],
1110 line_levels: &mut [Level],
1111 line_text: &'a T,
1112 para_level: Level,
1113) {
1114 // Reset some whitespace chars to paragraph level.
1115 // <http://www.unicode.org/reports/tr9/#L1>
1116 let mut reset_from: Option<usize> = Some(0);
1117 let mut reset_to: Option<usize> = None;
1118 let mut prev_level = para_level;
1119 for (i, c) in line_text.char_indices() {
1120 match line_classes[i] {
1121 // Segment separator, Paragraph separator
1122 B | S => {
1123 assert_eq!(reset_to, None);
1124 reset_to = Some(i + T::char_len(c));
1125 if reset_from == None {
1126 reset_from = Some(i);
1127 }
1128 }
1129 // Whitespace, isolate formatting
1130 WS | FSI | LRI | RLI | PDI => {
1131 if reset_from == None {
1132 reset_from = Some(i);
1133 }
1134 }
1135 // <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters>
1136 // same as above + set the level
1137 RLE | LRE | RLO | LRO | PDF | BN => {
1138 if reset_from == None {
1139 reset_from = Some(i);
1140 }
1141 // also set the level to previous
1142 line_levels[i] = prev_level;
1143 }
1144 _ => {
1145 reset_from = None;
1146 }
1147 }
1148 if let (Some(from), Some(to)) = (reset_from, reset_to) {
1149 for level in &mut line_levels[from..to] {
1150 *level = para_level;
1151 }
1152 reset_from = None;
1153 reset_to = None;
1154 }
1155 prev_level = line_levels[i];
1156 }
1157 if let Some(from) = reset_from {
1158 for level in &mut line_levels[from..] {
1159 *level = para_level;
1160 }
1161 }
1162}
1163
1164/// Contains a reference of `BidiInfo` and one of its `paragraphs`.
1165/// And it supports all operation in the `Paragraph` that needs also its
1166/// `BidiInfo` such as `direction`.
1167#[derive(Debug)]
1168pub struct Paragraph<'a, 'text> {
1169 pub info: &'a BidiInfo<'text>,
1170 pub para: &'a ParagraphInfo,
1171}
1172
1173impl<'a, 'text> Paragraph<'a, 'text> {
1174 #[inline]
1175 pub fn new(info: &'a BidiInfo<'text>, para: &'a ParagraphInfo) -> Paragraph<'a, 'text> {
1176 Paragraph { info, para }
1177 }
1178
1179 /// Returns if the paragraph is Left direction, right direction or mixed.
1180 #[inline]
1181 pub fn direction(&self) -> Direction {
1182 para_direction(&self.info.levels[self.para.range.clone()])
1183 }
1184
1185 /// Returns the `Level` of a certain character in the paragraph.
1186 #[inline]
1187 pub fn level_at(&self, pos: usize) -> Level {
1188 let actual_position: usize = self.para.range.start + pos;
1189 self.info.levels[actual_position]
1190 }
1191}
1192
1193/// Return the directionality of the paragraph (Left, Right or Mixed) from its levels.
1194#[cfg_attr(feature = "flame_it", flamer::flame)]
1195fn para_direction(levels: &[Level]) -> Direction {
1196 let mut ltr = false;
1197 let mut rtl = false;
1198 for level in levels {
1199 if level.is_ltr() {
1200 ltr = true;
1201 if rtl {
1202 return Direction::Mixed;
1203 }
1204 }
1205
1206 if level.is_rtl() {
1207 rtl = true;
1208 if ltr {
1209 return Direction::Mixed;
1210 }
1211 }
1212 }
1213
1214 if ltr {
1215 return Direction::Ltr;
1216 }
1217
1218 Direction::Rtl
1219}
1220
1221/// Assign levels to characters removed by rule X9.
1222///
1223/// The levels assigned to these characters are not specified by the algorithm. This function
1224/// assigns each one the level of the previous character, to avoid breaking level runs.
1225#[cfg_attr(feature = "flame_it", flamer::flame)]
1226fn assign_levels_to_removed_chars(para_level: Level, classes: &[BidiClass], levels: &mut [Level]) {
1227 for i: usize in 0..levels.len() {
1228 if prepare::removed_by_x9(class:classes[i]) {
1229 levels[i] = if i > 0 { levels[i - 1] } else { para_level };
1230 }
1231 }
1232}
1233
/// Determine the base direction of `text` according to the Unicode Bidirectional Algorithm.
///
/// See rules P2 and P3.
///
/// The first character with bidi character type L, R, or AL decides the result:
/// `Direction::Ltr` for type L, `Direction::Rtl` for type R or AL.
///
/// If no character of these types is found (characters inside embedded isolate runs are
/// not considered), `Direction::Mixed` is returned. In that case it should be read as
/// "neutral" or "unknown" rather than as actually mixed directions.
///
/// This is a lightweight function for callers that only need the base direction and no
/// further bidi processing of the text.
///
/// If the text contains paragraph separators, only the first paragraph is considered.
#[cfg(feature = "hardcoded-data")]
#[inline]
pub fn get_base_direction<'a, T: TextSource<'a> + ?Sized>(text: &'a T) -> Direction {
    let data_source = HardcodedBidiData;
    get_base_direction_with_data_source(&data_source, text)
}
1255
/// Determine the base direction of `text` according to the Unicode Bidirectional Algorithm,
/// looking at the full text if the first paragraph is all-neutral.
///
/// This behaves like `get_base_direction`, except that a block separator does not end
/// the search: it only resets the isolate nesting and the search for a strongly
/// directional character continues. The result is therefore the base direction of the
/// first paragraph that is not made up purely of neutral characters.
#[cfg(feature = "hardcoded-data")]
#[inline]
pub fn get_base_direction_full<'a, T: TextSource<'a> + ?Sized>(text: &'a T) -> Direction {
    let data_source = HardcodedBidiData;
    get_base_direction_full_with_data_source(&data_source, text)
}
1268
1269#[inline]
1270pub fn get_base_direction_with_data_source<'a, D: BidiDataSource, T: TextSource<'a> + ?Sized>(
1271 data_source: &D,
1272 text: &'a T,
1273) -> Direction {
1274 get_base_direction_impl(data_source, text, use_full_text:false)
1275}
1276
1277#[inline]
1278pub fn get_base_direction_full_with_data_source<
1279 'a,
1280 D: BidiDataSource,
1281 T: TextSource<'a> + ?Sized,
1282>(
1283 data_source: &D,
1284 text: &'a T,
1285) -> Direction {
1286 get_base_direction_impl(data_source, text, use_full_text:true)
1287}
1288
1289fn get_base_direction_impl<'a, D: BidiDataSource, T: TextSource<'a> + ?Sized>(
1290 data_source: &D,
1291 text: &'a T,
1292 use_full_text: bool,
1293) -> Direction {
1294 let mut isolate_level: i32 = 0;
1295 for c: char in text.chars() {
1296 match data_source.bidi_class(c) {
1297 LRI | RLI | FSI => isolate_level = isolate_level + 1,
1298 PDI if isolate_level > 0 => isolate_level = isolate_level - 1,
1299 L if isolate_level == 0 => return Direction::Ltr,
1300 R | AL if isolate_level == 0 => return Direction::Rtl,
1301 B if !use_full_text => break,
1302 B if use_full_text => isolate_level = 0,
1303 _ => (),
1304 }
1305 }
1306 // If no strong char was found, return Mixed. Normally this will be treated as Ltr by callers
1307 // (see rule P3), but we don't map this to Ltr here so that a caller that wants to apply other
1308 // heuristics to an all-neutral paragraph can tell the difference.
1309 Direction::Mixed
1310}
1311
1312/// Implementation of TextSource for UTF-8 text (a string slice).
1313impl<'text> TextSource<'text> for str {
1314 type CharIter = core::str::Chars<'text>;
1315 type CharIndexIter = core::str::CharIndices<'text>;
1316 type IndexLenIter = Utf8IndexLenIter<'text>;
1317
1318 #[inline]
1319 fn len(&self) -> usize {
1320 (self as &str).len()
1321 }
1322 #[inline]
1323 fn char_at(&self, index: usize) -> Option<(char, usize)> {
1324 if let Some(slice) = self.get(index..) {
1325 if let Some(ch) = slice.chars().next() {
1326 return Some((ch, ch.len_utf8()));
1327 }
1328 }
1329 None
1330 }
1331 #[inline]
1332 fn subrange(&self, range: Range<usize>) -> &Self {
1333 &(self as &str)[range]
1334 }
1335 #[inline]
1336 fn chars(&'text self) -> Self::CharIter {
1337 (self as &str).chars()
1338 }
1339 #[inline]
1340 fn char_indices(&'text self) -> Self::CharIndexIter {
1341 (self as &str).char_indices()
1342 }
1343 #[inline]
1344 fn indices_lengths(&'text self) -> Self::IndexLenIter {
1345 Utf8IndexLenIter::new(&self)
1346 }
1347 #[inline]
1348 fn char_len(ch: char) -> usize {
1349 ch.len_utf8()
1350 }
1351}
1352
/// Iterator over a (UTF-8) string slice that yields one `(index, char_len)` tuple per
/// character: the byte index at which the character starts and its length in bytes.
#[derive(Debug)]
pub struct Utf8IndexLenIter<'text> {
    /// Underlying `(byte index, char)` iterator over the source text.
    iter: core::str::CharIndices<'text>,
}
1358
1359impl<'text> Utf8IndexLenIter<'text> {
1360 #[inline]
1361 pub fn new(text: &'text str) -> Self {
1362 Utf8IndexLenIter {
1363 iter: text.char_indices(),
1364 }
1365 }
1366}
1367
1368impl Iterator for Utf8IndexLenIter<'_> {
1369 type Item = (usize, usize);
1370
1371 #[inline]
1372 fn next(&mut self) -> Option<Self::Item> {
1373 if let Some((pos: usize, ch: char)) = self.iter.next() {
1374 return Some((pos, ch.len_utf8()));
1375 }
1376 None
1377 }
1378}
1379
#[cfg(test)]
fn to_utf16(s: &str) -> Vec<u16> {
    // Test helper: the UTF-16 code units of `s`. A string never has more UTF-16 code
    // units than UTF-8 bytes, so `s.len()` is a sufficient capacity.
    let mut units = Vec::with_capacity(s.len());
    units.extend(s.encode_utf16());
    units
}
1384
1385#[cfg(test)]
1386#[cfg(feature = "hardcoded-data")]
1387mod tests {
1388 use super::*;
1389
1390 use utf16::{
1391 BidiInfo as BidiInfoU16, InitialInfo as InitialInfoU16, Paragraph as ParagraphU16,
1392 ParagraphBidiInfo as ParagraphBidiInfoU16,
1393 };
1394
1395 #[test]
1396 fn test_utf16_text_source() {
1397 let text: &[u16] =
1398 &[0x41, 0xD801, 0xDC01, 0x20, 0xD800, 0x20, 0xDFFF, 0x20, 0xDC00, 0xD800];
1399 assert_eq!(text.char_at(0), Some(('A', 1)));
1400 assert_eq!(text.char_at(1), Some(('\u{10401}', 2)));
1401 assert_eq!(text.char_at(2), None);
1402 assert_eq!(text.char_at(3), Some((' ', 1)));
1403 assert_eq!(text.char_at(4), Some((char::REPLACEMENT_CHARACTER, 1)));
1404 assert_eq!(text.char_at(5), Some((' ', 1)));
1405 assert_eq!(text.char_at(6), Some((char::REPLACEMENT_CHARACTER, 1)));
1406 assert_eq!(text.char_at(7), Some((' ', 1)));
1407 assert_eq!(text.char_at(8), Some((char::REPLACEMENT_CHARACTER, 1)));
1408 assert_eq!(text.char_at(9), Some((char::REPLACEMENT_CHARACTER, 1)));
1409 assert_eq!(text.char_at(10), None);
1410 }
1411
1412 #[test]
1413 fn test_utf16_char_iter() {
1414 let text: &[u16] =
1415 &[0x41, 0xD801, 0xDC01, 0x20, 0xD800, 0x20, 0xDFFF, 0x20, 0xDC00, 0xD800];
1416 assert_eq!(text.len(), 10);
1417 assert_eq!(text.chars().count(), 9);
1418 let mut chars = text.chars();
1419 assert_eq!(chars.next(), Some('A'));
1420 assert_eq!(chars.next(), Some('\u{10401}'));
1421 assert_eq!(chars.next(), Some(' '));
1422 assert_eq!(chars.next(), Some('\u{FFFD}'));
1423 assert_eq!(chars.next(), Some(' '));
1424 assert_eq!(chars.next(), Some('\u{FFFD}'));
1425 assert_eq!(chars.next(), Some(' '));
1426 assert_eq!(chars.next(), Some('\u{FFFD}'));
1427 assert_eq!(chars.next(), Some('\u{FFFD}'));
1428 assert_eq!(chars.next(), None);
1429 }
1430
1431 #[test]
1432 fn test_initial_text_info() {
1433 let tests = vec![
1434 (
1435 // text
1436 "a1",
1437 // expected bidi classes per utf-8 byte
1438 vec![L, EN],
1439 // expected paragraph-info for utf-8
1440 vec![ParagraphInfo {
1441 range: 0..2,
1442 level: LTR_LEVEL,
1443 }],
1444 // expected bidi classes per utf-16 code unit
1445 vec![L, EN],
1446 // expected paragraph-info for utf-16
1447 vec![ParagraphInfo {
1448 range: 0..2,
1449 level: LTR_LEVEL,
1450 }],
1451 ),
1452 (
1453 // Arabic, space, Hebrew
1454 "\u{0639} \u{05D0}",
1455 vec![AL, AL, WS, R, R],
1456 vec![ParagraphInfo {
1457 range: 0..5,
1458 level: RTL_LEVEL,
1459 }],
1460 vec![AL, WS, R],
1461 vec![ParagraphInfo {
1462 range: 0..3,
1463 level: RTL_LEVEL,
1464 }],
1465 ),
1466 (
1467 // SMP characters from Kharoshthi, Cuneiform, Adlam:
1468 "\u{10A00}\u{12000}\u{1E900}",
1469 vec![R, R, R, R, L, L, L, L, R, R, R, R],
1470 vec![ParagraphInfo {
1471 range: 0..12,
1472 level: RTL_LEVEL,
1473 }],
1474 vec![R, R, L, L, R, R],
1475 vec![ParagraphInfo {
1476 range: 0..6,
1477 level: RTL_LEVEL,
1478 }],
1479 ),
1480 (
1481 "a\u{2029}b",
1482 vec![L, B, B, B, L],
1483 vec![
1484 ParagraphInfo {
1485 range: 0..4,
1486 level: LTR_LEVEL,
1487 },
1488 ParagraphInfo {
1489 range: 4..5,
1490 level: LTR_LEVEL,
1491 },
1492 ],
1493 vec![L, B, L],
1494 vec![
1495 ParagraphInfo {
1496 range: 0..2,
1497 level: LTR_LEVEL,
1498 },
1499 ParagraphInfo {
1500 range: 2..3,
1501 level: LTR_LEVEL,
1502 },
1503 ],
1504 ),
1505 (
1506 "\u{2068}א\u{2069}a", // U+2068 FSI, U+2069 PDI
1507 vec![RLI, RLI, RLI, R, R, PDI, PDI, PDI, L],
1508 vec![ParagraphInfo {
1509 range: 0..9,
1510 level: LTR_LEVEL,
1511 }],
1512 vec![RLI, R, PDI, L],
1513 vec![ParagraphInfo {
1514 range: 0..4,
1515 level: LTR_LEVEL,
1516 }],
1517 ),
1518 ];
1519
1520 for t in tests {
1521 assert_eq!(
1522 InitialInfo::new(t.0, None),
1523 InitialInfo {
1524 text: t.0,
1525 original_classes: t.1,
1526 paragraphs: t.2,
1527 }
1528 );
1529 let text = &to_utf16(t.0);
1530 assert_eq!(
1531 InitialInfoU16::new(text, None),
1532 InitialInfoU16 {
1533 text,
1534 original_classes: t.3,
1535 paragraphs: t.4,
1536 }
1537 );
1538 }
1539 }
1540
1541 #[test]
1542 #[cfg(feature = "hardcoded-data")]
1543 fn test_process_text() {
1544 let tests = vec![
1545 (
1546 // text
1547 "abc123",
1548 // base level
1549 Some(LTR_LEVEL),
1550 // levels
1551 Level::vec(&[0, 0, 0, 0, 0, 0]),
1552 // original_classes
1553 vec![L, L, L, EN, EN, EN],
1554 // paragraphs
1555 vec![ParagraphInfo {
1556 range: 0..6,
1557 level: LTR_LEVEL,
1558 }],
1559 // levels_u16
1560 Level::vec(&[0, 0, 0, 0, 0, 0]),
1561 // original_classes_u16
1562 vec![L, L, L, EN, EN, EN],
1563 // paragraphs_u16
1564 vec![ParagraphInfo {
1565 range: 0..6,
1566 level: LTR_LEVEL,
1567 }],
1568 ),
1569 (
1570 "abc \u{05D0}\u{05D1}\u{05D2}",
1571 Some(LTR_LEVEL),
1572 Level::vec(&[0, 0, 0, 0, 1, 1, 1, 1, 1, 1]),
1573 vec![L, L, L, WS, R, R, R, R, R, R],
1574 vec![ParagraphInfo {
1575 range: 0..10,
1576 level: LTR_LEVEL,
1577 }],
1578 Level::vec(&[0, 0, 0, 0, 1, 1, 1]),
1579 vec![L, L, L, WS, R, R, R],
1580 vec![ParagraphInfo {
1581 range: 0..7,
1582 level: LTR_LEVEL,
1583 }],
1584 ),
1585 (
1586 "abc \u{05D0}\u{05D1}\u{05D2}",
1587 Some(RTL_LEVEL),
1588 Level::vec(&[2, 2, 2, 1, 1, 1, 1, 1, 1, 1]),
1589 vec![L, L, L, WS, R, R, R, R, R, R],
1590 vec![ParagraphInfo {
1591 range: 0..10,
1592 level: RTL_LEVEL,
1593 }],
1594 Level::vec(&[2, 2, 2, 1, 1, 1, 1]),
1595 vec![L, L, L, WS, R, R, R],
1596 vec![ParagraphInfo {
1597 range: 0..7,
1598 level: RTL_LEVEL,
1599 }],
1600 ),
1601 (
1602 "\u{05D0}\u{05D1}\u{05D2} abc",
1603 Some(LTR_LEVEL),
1604 Level::vec(&[1, 1, 1, 1, 1, 1, 0, 0, 0, 0]),
1605 vec![R, R, R, R, R, R, WS, L, L, L],
1606 vec![ParagraphInfo {
1607 range: 0..10,
1608 level: LTR_LEVEL,
1609 }],
1610 Level::vec(&[1, 1, 1, 0, 0, 0, 0]),
1611 vec![R, R, R, WS, L, L, L],
1612 vec![ParagraphInfo {
1613 range: 0..7,
1614 level: LTR_LEVEL,
1615 }],
1616 ),
1617 (
1618 "\u{05D0}\u{05D1}\u{05D2} abc",
1619 None,
1620 Level::vec(&[1, 1, 1, 1, 1, 1, 1, 2, 2, 2]),
1621 vec![R, R, R, R, R, R, WS, L, L, L],
1622 vec![ParagraphInfo {
1623 range: 0..10,
1624 level: RTL_LEVEL,
1625 }],
1626 Level::vec(&[1, 1, 1, 1, 2, 2, 2]),
1627 vec![R, R, R, WS, L, L, L],
1628 vec![ParagraphInfo {
1629 range: 0..7,
1630 level: RTL_LEVEL,
1631 }],
1632 ),
1633 (
1634 "\u{063A}2\u{0638} \u{05D0}2\u{05D2}",
1635 Some(LTR_LEVEL),
1636 Level::vec(&[1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1]),
1637 vec![AL, AL, EN, AL, AL, WS, R, R, EN, R, R],
1638 vec![ParagraphInfo {
1639 range: 0..11,
1640 level: LTR_LEVEL,
1641 }],
1642 Level::vec(&[1, 2, 1, 1, 1, 2, 1]),
1643 vec![AL, EN, AL, WS, R, EN, R],
1644 vec![ParagraphInfo {
1645 range: 0..7,
1646 level: LTR_LEVEL,
1647 }],
1648 ),
1649 (
1650 "a א.\nג",
1651 None,
1652 Level::vec(&[0, 0, 1, 1, 0, 0, 1, 1]),
1653 vec![L, WS, R, R, CS, B, R, R],
1654 vec![
1655 ParagraphInfo {
1656 range: 0..6,
1657 level: LTR_LEVEL,
1658 },
1659 ParagraphInfo {
1660 range: 6..8,
1661 level: RTL_LEVEL,
1662 },
1663 ],
1664 Level::vec(&[0, 0, 1, 0, 0, 1]),
1665 vec![L, WS, R, CS, B, R],
1666 vec![
1667 ParagraphInfo {
1668 range: 0..5,
1669 level: LTR_LEVEL,
1670 },
1671 ParagraphInfo {
1672 range: 5..6,
1673 level: RTL_LEVEL,
1674 },
1675 ],
1676 ),
1677 // BidiTest:69635 (AL ET EN)
1678 (
1679 "\u{060B}\u{20CF}\u{06F9}",
1680 None,
1681 Level::vec(&[1, 1, 1, 1, 1, 2, 2]),
1682 vec![AL, AL, ET, ET, ET, EN, EN],
1683 vec![ParagraphInfo {
1684 range: 0..7,
1685 level: RTL_LEVEL,
1686 }],
1687 Level::vec(&[1, 1, 2]),
1688 vec![AL, ET, EN],
1689 vec![ParagraphInfo {
1690 range: 0..3,
1691 level: RTL_LEVEL,
1692 }],
1693 ),
1694 ];
1695
1696 for t in tests {
1697 assert_eq!(
1698 BidiInfo::new(t.0, t.1),
1699 BidiInfo {
1700 text: t.0,
1701 levels: t.2.clone(),
1702 original_classes: t.3.clone(),
1703 paragraphs: t.4.clone(),
1704 }
1705 );
1706 // If it was a single paragraph, also test ParagraphBidiInfo.
1707 if t.4.len() == 1 {
1708 assert_eq!(
1709 ParagraphBidiInfo::new(t.0, t.1),
1710 ParagraphBidiInfo {
1711 text: t.0,
1712 original_classes: t.3,
1713 levels: t.2.clone(),
1714 paragraph_level: t.4[0].level,
1715 is_pure_ltr: !level::has_rtl(&t.2),
1716 }
1717 )
1718 }
1719 let text = &to_utf16(t.0);
1720 assert_eq!(
1721 BidiInfoU16::new(text, t.1),
1722 BidiInfoU16 {
1723 text,
1724 levels: t.5.clone(),
1725 original_classes: t.6.clone(),
1726 paragraphs: t.7.clone(),
1727 }
1728 );
1729 if t.7.len() == 1 {
1730 assert_eq!(
1731 ParagraphBidiInfoU16::new(text, t.1),
1732 ParagraphBidiInfoU16 {
1733 text: text,
1734 original_classes: t.6.clone(),
1735 levels: t.5.clone(),
1736 paragraph_level: t.7[0].level,
1737 is_pure_ltr: !level::has_rtl(&t.5),
1738 }
1739 )
1740 }
1741 }
1742 }
1743
1744 #[test]
1745 #[cfg(feature = "hardcoded-data")]
1746 fn test_paragraph_bidi_info() {
1747 // Passing text that includes a paragraph break to the ParagraphBidiInfo API:
1748 // this is a misuse of the API by the client, but our behavior is safe &
1749 // consistent. The embedded paragraph break acts like a separator (tab) would.
1750 let tests = vec![
1751 (
1752 "a א.\nג",
1753 None,
1754 // utf-8 results:
1755 vec![L, WS, R, R, CS, B, R, R],
1756 Level::vec(&[0, 0, 1, 1, 1, 1, 1, 1]),
1757 // utf-16 results:
1758 vec![L, WS, R, CS, B, R],
1759 Level::vec(&[0, 0, 1, 1, 1, 1]),
1760 // paragraph level; is_pure_ltr
1761 LTR_LEVEL,
1762 false,
1763 ),
1764 (
1765 "\u{5d1} a.\nb.",
1766 None,
1767 // utf-8 results:
1768 vec![R, R, WS, L, CS, B, L, CS],
1769 Level::vec(&[1, 1, 1, 2, 2, 2, 2, 1]),
1770 // utf-16 results:
1771 vec![R, WS, L, CS, B, L, CS],
1772 Level::vec(&[1, 1, 2, 2, 2, 2, 1]),
1773 // paragraph level; is_pure_ltr
1774 RTL_LEVEL,
1775 false,
1776 ),
1777 (
1778 "a א.\tג",
1779 None,
1780 // utf-8 results:
1781 vec![L, WS, R, R, CS, S, R, R],
1782 Level::vec(&[0, 0, 1, 1, 1, 1, 1, 1]),
1783 // utf-16 results:
1784 vec![L, WS, R, CS, S, R],
1785 Level::vec(&[0, 0, 1, 1, 1, 1]),
1786 // paragraph level; is_pure_ltr
1787 LTR_LEVEL,
1788 false,
1789 ),
1790 (
1791 "\u{5d1} a.\tb.",
1792 None,
1793 // utf-8 results:
1794 vec![R, R, WS, L, CS, S, L, CS],
1795 Level::vec(&[1, 1, 1, 2, 2, 2, 2, 1]),
1796 // utf-16 results:
1797 vec![R, WS, L, CS, S, L, CS],
1798 Level::vec(&[1, 1, 2, 2, 2, 2, 1]),
1799 // paragraph level; is_pure_ltr
1800 RTL_LEVEL,
1801 false,
1802 ),
1803 ];
1804
1805 for t in tests {
1806 assert_eq!(
1807 ParagraphBidiInfo::new(t.0, t.1),
1808 ParagraphBidiInfo {
1809 text: t.0,
1810 original_classes: t.2,
1811 levels: t.3,
1812 paragraph_level: t.6,
1813 is_pure_ltr: t.7,
1814 }
1815 );
1816 let text = &to_utf16(t.0);
1817 assert_eq!(
1818 ParagraphBidiInfoU16::new(text, t.1),
1819 ParagraphBidiInfoU16 {
1820 text: text,
1821 original_classes: t.4,
1822 levels: t.5,
1823 paragraph_level: t.6,
1824 is_pure_ltr: t.7,
1825 }
1826 );
1827 }
1828 }
1829
1830 #[test]
1831 #[cfg(feature = "hardcoded-data")]
1832 fn test_bidi_info_has_rtl() {
1833 let tests = vec![
1834 // ASCII only
1835 ("123", None, false),
1836 ("123", Some(LTR_LEVEL), false),
1837 ("123", Some(RTL_LEVEL), false),
1838 ("abc", None, false),
1839 ("abc", Some(LTR_LEVEL), false),
1840 ("abc", Some(RTL_LEVEL), false),
1841 ("abc 123", None, false),
1842 ("abc\n123", None, false),
1843 // With Hebrew
1844 ("\u{05D0}\u{05D1}\u{05BC}\u{05D2}", None, true),
1845 ("\u{05D0}\u{05D1}\u{05BC}\u{05D2}", Some(LTR_LEVEL), true),
1846 ("\u{05D0}\u{05D1}\u{05BC}\u{05D2}", Some(RTL_LEVEL), true),
1847 ("abc \u{05D0}\u{05D1}\u{05BC}\u{05D2}", None, true),
1848 ("abc\n\u{05D0}\u{05D1}\u{05BC}\u{05D2}", None, true),
1849 ("\u{05D0}\u{05D1}\u{05BC}\u{05D2} abc", None, true),
1850 ("\u{05D0}\u{05D1}\u{05BC}\u{05D2}\nabc", None, true),
1851 ("\u{05D0}\u{05D1}\u{05BC}\u{05D2} 123", None, true),
1852 ("\u{05D0}\u{05D1}\u{05BC}\u{05D2}\n123", None, true),
1853 ];
1854
1855 for t in tests {
1856 assert_eq!(BidiInfo::new(t.0, t.1).has_rtl(), t.2);
1857 assert_eq!(BidiInfoU16::new(&to_utf16(t.0), t.1).has_rtl(), t.2);
1858 }
1859 }
1860
1861 #[cfg(feature = "hardcoded-data")]
1862 fn reorder_paras(text: &str) -> Vec<Cow<'_, str>> {
1863 let bidi_info = BidiInfo::new(text, None);
1864 bidi_info
1865 .paragraphs
1866 .iter()
1867 .map(|para| bidi_info.reorder_line(para, para.range.clone()))
1868 .collect()
1869 }
1870
1871 #[cfg(feature = "hardcoded-data")]
1872 fn reorder_paras_u16(text: &[u16]) -> Vec<Cow<'_, [u16]>> {
1873 let bidi_info = BidiInfoU16::new(text, None);
1874 bidi_info
1875 .paragraphs
1876 .iter()
1877 .map(|para| bidi_info.reorder_line(para, para.range.clone()))
1878 .collect()
1879 }
1880
1881 #[test]
1882 #[cfg(feature = "hardcoded-data")]
1883 fn test_reorder_line() {
1884 let tests = vec![
1885 // Bidi_Class: L L L B L L L B L L L
1886 ("abc\ndef\nghi", vec!["abc\n", "def\n", "ghi"]),
1887 // Bidi_Class: L L EN B L L EN B L L EN
1888 ("ab1\nde2\ngh3", vec!["ab1\n", "de2\n", "gh3"]),
1889 // Bidi_Class: L L L B AL AL AL
1890 ("abc\nابج", vec!["abc\n", "جبا"]),
1891 // Bidi_Class: AL AL AL B L L L
1892 (
1893 "\u{0627}\u{0628}\u{062C}\nabc",
1894 vec!["\n\u{062C}\u{0628}\u{0627}", "abc"],
1895 ),
1896 ("1.-2", vec!["1.-2"]),
1897 ("1-.2", vec!["1-.2"]),
1898 ("abc אבג", vec!["abc גבא"]),
1899 // Numbers being weak LTR characters, cannot reorder strong RTL
1900 ("123 \u{05D0}\u{05D1}\u{05D2}", vec!["גבא 123"]),
1901 ("abc\u{202A}def", vec!["abc\u{202A}def"]),
1902 (
1903 "abc\u{202A}def\u{202C}ghi",
1904 vec!["abc\u{202A}def\u{202C}ghi"],
1905 ),
1906 (
1907 "abc\u{2066}def\u{2069}ghi",
1908 vec!["abc\u{2066}def\u{2069}ghi"],
1909 ),
1910 // Testing for RLE Character
1911 ("\u{202B}abc אבג\u{202C}", vec!["\u{202b}גבא abc\u{202c}"]),
1912 // Testing neutral characters
1913 ("\u{05D0}בג? אבג", vec!["גבא ?גבא"]),
1914 // Testing neutral characters with special case
1915 ("A אבג?", vec!["A גבא?"]),
1916 // Testing neutral characters with Implicit RTL Marker
1917 ("A אבג?\u{200F}", vec!["A \u{200F}?גבא"]),
1918 ("\u{05D0}בג abc", vec!["abc גבא"]),
1919 ("abc\u{2067}.-\u{2069}ghi", vec!["abc\u{2067}-.\u{2069}ghi"]),
1920 (
1921 "Hello, \u{2068}\u{202E}world\u{202C}\u{2069}!",
1922 vec!["Hello, \u{2068}\u{202E}\u{202C}dlrow\u{2069}!"],
1923 ),
1924 // With mirrorable characters in RTL run
1925 ("\u{05D0}(ב)ג.", vec![".ג)ב(א"]),
1926 // With mirrorable characters on level boundary
1927 ("\u{05D0}ב(גד[&ef].)gh", vec!["gh).]ef&[דג(בא"]),
1928 ];
1929
1930 for t in tests {
1931 assert_eq!(reorder_paras(t.0), t.1);
1932 let expect_utf16 = t.1.iter().map(|v| to_utf16(v)).collect::<Vec<_>>();
1933 assert_eq!(reorder_paras_u16(&to_utf16(t.0)), expect_utf16);
1934 }
1935 }
1936
1937 fn reordered_levels_for_paras(text: &str) -> Vec<Vec<Level>> {
1938 let bidi_info = BidiInfo::new(text, None);
1939 bidi_info
1940 .paragraphs
1941 .iter()
1942 .map(|para| bidi_info.reordered_levels(para, para.range.clone()))
1943 .collect()
1944 }
1945
1946 fn reordered_levels_per_char_for_paras(text: &str) -> Vec<Vec<Level>> {
1947 let bidi_info = BidiInfo::new(text, None);
1948 bidi_info
1949 .paragraphs
1950 .iter()
1951 .map(|para| bidi_info.reordered_levels_per_char(para, para.range.clone()))
1952 .collect()
1953 }
1954
1955 fn reordered_levels_for_paras_u16(text: &[u16]) -> Vec<Vec<Level>> {
1956 let bidi_info = BidiInfoU16::new(text, None);
1957 bidi_info
1958 .paragraphs
1959 .iter()
1960 .map(|para| bidi_info.reordered_levels(para, para.range.clone()))
1961 .collect()
1962 }
1963
1964 fn reordered_levels_per_char_for_paras_u16(text: &[u16]) -> Vec<Vec<Level>> {
1965 let bidi_info = BidiInfoU16::new(text, None);
1966 bidi_info
1967 .paragraphs
1968 .iter()
1969 .map(|para| bidi_info.reordered_levels_per_char(para, para.range.clone()))
1970 .collect()
1971 }
1972
/// Exercises `reordered_levels` and `reordered_levels_per_char` through both
/// the UTF-8 (`BidiInfo`) and UTF-16 (`BidiInfoU16`) entry points.
#[test]
#[cfg(feature = "hardcoded-data")]
fn test_reordered_levels() {
    // Case layout: (input, expected `reordered_levels` output for UTF-8,
    // expected per-char levels, expected `reordered_levels` output for UTF-16).
    // The per-char expectation is shared by both encodings.
    let numeric_cases = vec![
        // BidiTest:946 (LRI PDI)
        // NOTE(review): the literal starts with U+2067, which Unicode names
        // RLI rather than LRI -- confirm the label against BidiTest.txt.
        (
            "\u{2067}\u{2069}",
            vec![Level::vec(&[0, 0, 0, 0, 0, 0])],
            vec![Level::vec(&[0, 0])],
            vec![Level::vec(&[0, 0])],
        ),
        // BidiTest:69635 (AL ET EN)
        (
            "\u{060B}\u{20CF}\u{06F9}",
            vec![Level::vec(&[1, 1, 1, 1, 1, 2, 2])],
            vec![Level::vec(&[1, 1, 2])],
            vec![Level::vec(&[1, 1, 2])],
        ),
    ];

    for (input, utf8_levels, char_levels, utf16_levels) in numeric_cases {
        assert_eq!(reordered_levels_for_paras(input), utf8_levels);
        assert_eq!(reordered_levels_per_char_for_paras(input), char_levels);
        let units = to_utf16(input);
        assert_eq!(reordered_levels_for_paras_u16(&units), utf16_levels);
        assert_eq!(reordered_levels_per_char_for_paras_u16(&units), char_levels);
    }

    // Same layout, but the expectations are spelled as strings.
    // NOTE(review): "x" is presumably a placeholder for a position without a
    // meaningful level -- confirm against `Level`'s comparison with `&str`.
    let symbolic_cases = vec![
        // BidiTest:291284 (AN RLI PDF R)
        (
            "\u{0605}\u{2067}\u{202C}\u{0590}",
            vec![&["2", "2", "0", "0", "0", "x", "x", "x", "1", "1"]],
            vec![&["2", "0", "x", "1"]],
            vec![&["2", "0", "x", "1"]],
        ),
    ];

    for (input, utf8_levels, char_levels, utf16_levels) in symbolic_cases {
        assert_eq!(reordered_levels_for_paras(input), utf8_levels);
        assert_eq!(reordered_levels_per_char_for_paras(input), char_levels);
        let units = to_utf16(input);
        assert_eq!(reordered_levels_for_paras_u16(&units), utf16_levels);
        assert_eq!(reordered_levels_per_char_for_paras_u16(&units), char_levels);
    }

    // A line range that starts inside the paragraph: the expected vector
    // still has one entry for every code unit of the paragraph.
    let mixed = "aa טֶ";
    let info = BidiInfo::new(mixed, None);
    let only_para = &info.paragraphs[0];
    assert_eq!(
        info.reordered_levels(only_para, 3..7),
        Level::vec(&[0, 0, 0, 1, 1, 1, 1]),
    );

    let mixed_units = to_utf16(mixed);
    let info = BidiInfoU16::new(&mixed_units, None);
    let only_para = &info.paragraphs[0];
    assert_eq!(
        info.reordered_levels(only_para, 1..4),
        Level::vec(&[0, 0, 0, 1, 1]),
    );
}
2033
/// Checks the `len()` of each paragraph entry for a one-paragraph and a
/// two-paragraph input, first in UTF-8 and then in UTF-16.
#[test]
fn test_paragraph_info_len() {
    let first = "hello world";
    let second = "How are you";
    let joined = format!("{}\n{}", first, second);

    // UTF-8: paragraph lengths are compared against `str::len`.
    {
        let info = BidiInfo::new(first, None);
        assert_eq!(info.paragraphs.len(), 1);
        assert_eq!(info.paragraphs[0].len(), first.len());

        let info = BidiInfo::new(&joined, None);
        assert_eq!(info.paragraphs.len(), 2);

        // The first paragraph includes the paragraph separator.
        // TODO: investigate if the paragraph separator character
        // should not be part of any paragraph.
        assert_eq!(info.paragraphs[0].len(), first.len() + 1);
        assert_eq!(info.paragraphs[1].len(), second.len());
    }

    // UTF-16: the same checks, with lengths taken from the converted buffers.
    {
        let first = to_utf16(first);
        let info = BidiInfoU16::new(&first, None);
        assert_eq!(info.paragraphs.len(), 1);
        assert_eq!(info.paragraphs[0].len(), first.len());

        let second = to_utf16(second);
        let joined = to_utf16(&joined);
        let info = BidiInfoU16::new(&joined, None);
        assert_eq!(info.paragraphs.len(), 2);

        // As above, the separator is counted in the first paragraph.
        assert_eq!(info.paragraphs[0].len(), first.len() + 1);
        assert_eq!(info.paragraphs[1].len(), second.len());
    }
}
2065
/// Checks `direction()` for a purely LTR paragraph, a purely RTL paragraph and
/// a paragraph containing both, via `Paragraph` and `ParagraphU16`.
#[test]
fn test_direction() {
    let ltr_text = "hello world";
    let rtl_text = "أهلا بكم";
    // Three paragraphs: LTR only, RTL only, then LTR immediately followed by RTL.
    let combined = format!("{}\n{}\n{}{}", ltr_text, rtl_text, ltr_text, rtl_text);
    // Expected direction of each paragraph, in order.
    let expected = [Direction::Ltr, Direction::Rtl, Direction::Mixed];

    let info = BidiInfo::new(&combined, None);
    assert_eq!(info.paragraphs.len(), 3);
    for (para, want) in info.paragraphs.iter().zip(expected.iter()) {
        assert_eq!(&Paragraph::new(&info, para).direction(), want);
    }

    let combined_units = to_utf16(&combined);
    let info = BidiInfoU16::new(&combined_units, None);
    assert_eq!(info.paragraphs.len(), 3);
    for (para, want) in info.paragraphs.iter().zip(expected.iter()) {
        assert_eq!(&ParagraphU16::new(&info, para).direction(), want);
    }
}
2090
/// Covers degenerate inputs: empty text (no paragraphs at all) and text that
/// consists solely of a paragraph separator, whose direction follows the
/// requested base level, or the left-to-right default when none is given.
/// Every case is checked for both the UTF-8 and the UTF-16 front end.
#[test]
fn test_edge_cases_direction() {
    // No paragraphs for empty text.
    let empty = "";
    let bidi_info = BidiInfo::new(empty, Some(RTL_LEVEL));
    assert_eq!(bidi_info.paragraphs.len(), 0);

    let empty = &to_utf16(empty);
    let bidi_info = BidiInfoU16::new(empty, Some(RTL_LEVEL));
    assert_eq!(bidi_info.paragraphs.len(), 0);

    // Case layout: (input, requested base level, expected direction).
    let tests = vec![
        // The paragraph separator will take the value of the default direction
        // which is left to right.
        ("\n", None, Direction::Ltr),
        // The paragraph separator will take the value of the given initial direction
        // which is left to right.
        ("\n", Some(LTR_LEVEL), Direction::Ltr),
        // The paragraph separator will take the value of the given initial direction
        // which is right to left.
        ("\n", Some(RTL_LEVEL), Direction::Rtl),
    ];

    for (text, base_level, expected) in tests {
        let bidi_info = BidiInfo::new(text, base_level);
        assert_eq!(bidi_info.paragraphs.len(), 1);
        let p = Paragraph::new(&bidi_info, &bidi_info.paragraphs[0]);
        assert_eq!(p.direction(), expected);

        let utf16 = to_utf16(text);
        let bidi_info = BidiInfoU16::new(&utf16, base_level);
        // Mirror the UTF-8 check: make sure exactly one paragraph was produced
        // before indexing into the list.
        assert_eq!(bidi_info.paragraphs.len(), 1);
        let p = ParagraphU16::new(&bidi_info, &bidi_info.paragraphs[0]);
        assert_eq!(p.direction(), expected);
    }
}
2125
/// Checks `level_at` on an LTR-only, an RTL-only and a mixed paragraph, plus
/// the size of the level buffer and the start offset of the third paragraph,
/// via `Paragraph` and `ParagraphU16`.
#[test]
fn test_level_at() {
    let ltr_text = "hello world";
    let rtl_text = "أهلا بكم";
    // Three paragraphs: LTR only, RTL only, then LTR immediately followed by RTL.
    let combined = format!("{}\n{}\n{}{}", ltr_text, rtl_text, ltr_text, rtl_text);

    // UTF-8: 54 is the byte length of `combined`, 28 the byte offset at which
    // the third paragraph starts.
    {
        let info = BidiInfo::new(&combined, None);
        assert_eq!(info.paragraphs.len(), 3);

        let ltr_para = Paragraph::new(&info, &info.paragraphs[0]);
        let rtl_para = Paragraph::new(&info, &info.paragraphs[1]);
        let mixed_para = Paragraph::new(&info, &info.paragraphs[2]);

        assert_eq!(ltr_para.level_at(0), LTR_LEVEL);
        assert_eq!(rtl_para.level_at(0), RTL_LEVEL);
        assert_eq!(mixed_para.level_at(0), LTR_LEVEL);
        assert_eq!(mixed_para.info.levels.len(), 54);
        assert_eq!(mixed_para.para.range.start, 28);
        // The index just past the LTR prefix should land on RTL text
        // (presumably `level_at` counts from the paragraph start).
        assert_eq!(mixed_para.level_at(ltr_text.len()), RTL_LEVEL);
    }

    // UTF-16: the same text is 40 code units long and the third paragraph
    // starts at unit 21.
    {
        let combined_units = to_utf16(&combined);
        let info = BidiInfoU16::new(&combined_units, None);
        assert_eq!(info.paragraphs.len(), 3);

        let ltr_para = ParagraphU16::new(&info, &info.paragraphs[0]);
        let rtl_para = ParagraphU16::new(&info, &info.paragraphs[1]);
        let mixed_para = ParagraphU16::new(&info, &info.paragraphs[2]);

        assert_eq!(ltr_para.level_at(0), LTR_LEVEL);
        assert_eq!(rtl_para.level_at(0), RTL_LEVEL);
        assert_eq!(mixed_para.level_at(0), LTR_LEVEL);
        assert_eq!(mixed_para.info.levels.len(), 40);
        assert_eq!(mixed_para.para.range.start, 21);
        // `ltr_text` is ASCII, so its byte length equals its UTF-16 length.
        assert_eq!(mixed_para.level_at(ltr_text.len()), RTL_LEVEL);
    }
}
2160
/// Table-driven check of `get_base_direction` on `str` input and on the same
/// text converted to a UTF-16 slice.
#[test]
fn test_get_base_direction() {
    // Case layout: (input, expected direction).
    let cases = vec![
        ("", Direction::Mixed), // return Mixed if no strong character found
        ("123[]-+\u{2019}\u{2060}\u{00bf}?", Direction::Mixed),
        ("3.14\npi", Direction::Mixed), // only first paragraph is considered
        ("[123 'abc']", Direction::Ltr),
        ("[123 '\u{0628}' abc", Direction::Rtl),
        ("[123 '\u{2066}abc\u{2069}'\u{0628}]", Direction::Rtl), // embedded isolate is ignored
        ("[123 '\u{2066}abc\u{2068}'\u{0628}]", Direction::Mixed),
    ];

    for (input, expected) in cases {
        assert_eq!(get_base_direction(input), expected);
        let units = to_utf16(input);
        assert_eq!(get_base_direction(&units[..]), expected);
    }
}
2179
/// Table-driven check of `get_base_direction_full` on `str` input and on the
/// same text converted to a UTF-16 slice. Unlike the cases for
/// `get_base_direction`, several of these expect the answer to come from a
/// later paragraph.
#[test]
fn test_get_base_direction_full() {
    // Case layout: (input, expected direction).
    let cases = vec![
        ("", Direction::Mixed), // return Mixed if no strong character found
        ("123[]-+\u{2019}\u{2060}\u{00bf}?", Direction::Mixed),
        ("3.14\npi", Direction::Ltr), // direction taken from the second paragraph
        ("3.14\n\u{05D0}", Direction::Rtl), // direction taken from the second paragraph
        ("[123 'abc']", Direction::Ltr),
        ("[123 '\u{0628}' abc", Direction::Rtl),
        ("[123 '\u{2066}abc\u{2069}'\u{0628}]", Direction::Rtl), // embedded isolate is ignored
        ("[123 '\u{2066}abc\u{2068}'\u{0628}]", Direction::Mixed),
        ("[123 '\u{2066}abc\u{2068}'\n\u{0628}]", Direction::Rtl), // \n resets embedding level
    ];

    for (input, expected) in cases {
        assert_eq!(get_base_direction_full(input), expected);
        let units = to_utf16(input);
        assert_eq!(get_base_direction_full(&units[..]), expected);
    }
}
2200}
2201
#[cfg(all(feature = "serde", feature = "hardcoded-data", test))]
mod serde_tests {
    use super::*;
    use serde_test::{assert_tokens, Token};

    /// The resolved levels of a 10-byte string must (de)serialize as a
    /// sequence of ten `Level` newtype structs, each wrapping a `u8`.
    #[test]
    fn test_levels() {
        let text = "abc אבג";
        let resolved = BidiInfo::new(text, None);
        let levels = resolved.levels;
        assert_eq!(text.len(), 10);
        assert_eq!(levels.len(), 10);

        // Four single-byte characters at level 0, then three two-byte
        // characters at level 1: one level per UTF-8 byte.
        let byte_levels: [u8; 10] = [0, 0, 0, 0, 1, 1, 1, 1, 1, 1];

        // Expected stream: sequence header, one (newtype marker, u8) pair per
        // level, sequence terminator.
        let mut expected = Vec::with_capacity(byte_levels.len() * 2 + 2);
        expected.push(Token::Seq {
            len: Some(byte_levels.len()),
        });
        for &value in byte_levels.iter() {
            expected.push(Token::NewtypeStruct { name: "Level" });
            expected.push(Token::U8(value));
        }
        expected.push(Token::SeqEnd);

        assert_tokens(&levels, &expected);
    }
}
2243