1// Copyright 2015 The Servo Project Developers. See the
2// COPYRIGHT file at the top-level directory of this distribution.
3//
4// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7// option. This file may not be copied, modified, or distributed
8// except according to those terms.
9
10//! 3.3.3 Preparations for Implicit Processing
11//!
12//! <http://www.unicode.org/reports/tr9/#Preparations_for_Implicit_Processing>
13
14use alloc::vec::Vec;
15use core::cmp::max;
16use core::ops::Range;
17#[cfg(feature = "smallvec")]
18use smallvec::{smallvec, SmallVec};
19
20use super::level::Level;
21use super::BidiClass::{self, *};
22
23/// A maximal substring of characters with the same embedding level.
24///
25/// Represented as a range of byte indices.
26pub type LevelRun = Range<usize>;
27
28#[cfg(feature = "smallvec")]
29pub type LevelRunVec = SmallVec<[LevelRun; 8]>;
30#[cfg(not(feature = "smallvec"))]
31pub type LevelRunVec = Vec<LevelRun>;
32
33/// Output of `isolating_run_sequences` (steps X9-X10)
34#[derive(Debug, PartialEq)]
35pub struct IsolatingRunSequence {
36 pub runs: Vec<LevelRun>,
37 pub sos: BidiClass, // Start-of-sequence type.
38 pub eos: BidiClass, // End-of-sequence type.
39}
40
41#[cfg(feature = "smallvec")]
42pub type IsolatingRunSequenceVec = SmallVec<[IsolatingRunSequence; 8]>;
43#[cfg(not(feature = "smallvec"))]
44pub type IsolatingRunSequenceVec = Vec<IsolatingRunSequence>;
45
46/// Compute the set of isolating run sequences.
47///
48/// An isolating run sequence is a maximal sequence of level runs such that for all level runs
49/// except the last one in the sequence, the last character of the run is an isolate initiator
50/// whose matching PDI is the first character of the next level run in the sequence.
51///
52/// Note: This function does *not* return the sequences in order by their first characters.
53#[cfg_attr(feature = "flame_it", flamer::flame)]
54pub fn isolating_run_sequences(
55 para_level: Level,
56 original_classes: &[BidiClass],
57 levels: &[Level],
58 runs: LevelRunVec,
59 has_isolate_controls: bool,
60 isolating_run_sequences: &mut IsolatingRunSequenceVec,
61) {
62 // Per http://www.unicode.org/reports/tr9/#BD13:
63 // "In the absence of isolate initiators, each isolating run sequence in a paragraph
64 // consists of exactly one level run, and each level run constitutes a separate
65 // isolating run sequence."
66 // We can take a simplified path to handle this case.
67 if !has_isolate_controls {
68 isolating_run_sequences.reserve_exact(runs.len());
69 for run in runs {
70 // Determine the `sos` and `eos` class for the sequence.
71 // <http://www.unicode.org/reports/tr9/#X10>
72
73 let run_levels = &levels[run.clone()];
74 let run_classes = &original_classes[run.clone()];
75 let seq_level = run_levels[run_classes
76 .iter()
77 .position(|c| not_removed_by_x9(c))
78 .unwrap_or(0)];
79
80 let end_level = run_levels[run_classes
81 .iter()
82 .rposition(|c| not_removed_by_x9(c))
83 .unwrap_or(run.end - run.start - 1)];
84
85 // Get the level of the last non-removed char before the run.
86 let pred_level = match original_classes[..run.start]
87 .iter()
88 .rposition(not_removed_by_x9)
89 {
90 Some(idx) => levels[idx],
91 None => para_level,
92 };
93
94 // Get the level of the next non-removed char after the run.
95 let succ_level = match original_classes[run.end..]
96 .iter()
97 .position(not_removed_by_x9)
98 {
99 Some(idx) => levels[run.end + idx],
100 None => para_level,
101 };
102
103 isolating_run_sequences.push(IsolatingRunSequence {
104 runs: vec![run],
105 sos: max(seq_level, pred_level).bidi_class(),
106 eos: max(end_level, succ_level).bidi_class(),
107 });
108 }
109 return;
110 }
111
112 // Compute the set of isolating run sequences.
113 // <http://www.unicode.org/reports/tr9/#BD13>
114 let mut sequences = Vec::with_capacity(runs.len());
115
116 // When we encounter an isolate initiator, we push the current sequence onto the
117 // stack so we can resume it after the matching PDI.
118 #[cfg(feature = "smallvec")]
119 let mut stack: SmallVec<[Vec<Range<usize>>; 8]> = smallvec![vec![]];
120 #[cfg(not(feature = "smallvec"))]
121 let mut stack = vec![vec![]];
122
123 for run in runs {
124 assert!(!run.is_empty());
125 assert!(!stack.is_empty());
126
127 let start_class = original_classes[run.start];
128 // > In rule X10, [..] skip over any BNs when [..].
129 // > Do the same when determining if the last character of the sequence is an isolate initiator.
130 //
131 // <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters>
132 let end_class = original_classes[run.start..run.end]
133 .iter()
134 .copied()
135 .rev()
136 .find(not_removed_by_x9)
137 .unwrap_or(start_class);
138
139 let mut sequence = if start_class == PDI && stack.len() > 1 {
140 // Continue a previous sequence interrupted by an isolate.
141 stack.pop().unwrap()
142 } else {
143 // Start a new sequence.
144 Vec::new()
145 };
146
147 sequence.push(run);
148
149 if matches!(end_class, RLI | LRI | FSI) {
150 // Resume this sequence after the isolate.
151 stack.push(sequence);
152 } else {
153 // This sequence is finished.
154 sequences.push(sequence);
155 }
156 }
157 // Pop any remaining sequences off the stack.
158 sequences.extend(stack.into_iter().rev().filter(|seq| !seq.is_empty()));
159
160 // Determine the `sos` and `eos` class for each sequence.
161 // <http://www.unicode.org/reports/tr9/#X10>
162 for sequence in sequences {
163 assert!(!sequence.is_empty());
164
165 let start_of_seq = sequence[0].start;
166 let runs_len = sequence.len();
167 let end_of_seq = sequence[runs_len - 1].end;
168
169 let mut result = IsolatingRunSequence {
170 runs: sequence,
171 sos: L,
172 eos: L,
173 };
174
175 // > (not counting characters removed by X9)
176 let seq_level = levels[result
177 .iter_forwards_from(start_of_seq, 0)
178 .find(|i| not_removed_by_x9(&original_classes[*i]))
179 .unwrap_or(start_of_seq)];
180
181 // XXXManishearth the spec talks of a start and end level,
182 // but for a given IRS the two should be equivalent, yes?
183 let end_level = levels[result
184 .iter_backwards_from(end_of_seq, runs_len - 1)
185 .find(|i| not_removed_by_x9(&original_classes[*i]))
186 .unwrap_or(end_of_seq - 1)];
187
188 #[cfg(test)]
189 for idx in result.runs.clone().into_iter().flatten() {
190 if not_removed_by_x9(&original_classes[idx]) {
191 assert_eq!(seq_level, levels[idx]);
192 }
193 }
194
195 // Get the level of the last non-removed char before the runs.
196 let pred_level = match original_classes[..start_of_seq]
197 .iter()
198 .rposition(not_removed_by_x9)
199 {
200 Some(idx) => levels[idx],
201 None => para_level,
202 };
203
204 // Get the last non-removed character to check if it is an isolate initiator.
205 // The spec calls for an unmatched one, but matched isolate initiators
206 // will never be at the end of a level run (otherwise there would be more to the run).
207 // We unwrap_or(BN) because BN marks removed classes and it won't matter for the check.
208 let last_non_removed = original_classes[..end_of_seq]
209 .iter()
210 .copied()
211 .rev()
212 .find(not_removed_by_x9)
213 .unwrap_or(BN);
214
215 // Get the level of the next non-removed char after the runs.
216 let succ_level = if matches!(last_non_removed, RLI | LRI | FSI) {
217 para_level
218 } else {
219 match original_classes[end_of_seq..]
220 .iter()
221 .position(not_removed_by_x9)
222 {
223 Some(idx) => levels[end_of_seq + idx],
224 None => para_level,
225 }
226 };
227
228 result.sos = max(seq_level, pred_level).bidi_class();
229 result.eos = max(end_level, succ_level).bidi_class();
230
231 isolating_run_sequences.push(result);
232 }
233}
234
235impl IsolatingRunSequence {
236 /// Given a text-relative position `pos` and an index of the level run it is in,
237 /// produce an iterator of all characters after and pos (`pos..`) that are in this
238 /// run sequence
239 pub(crate) fn iter_forwards_from(
240 &self,
241 pos: usize,
242 level_run_index: usize,
243 ) -> impl Iterator<Item = usize> + '_ {
244 let runs = &self.runs[level_run_index..];
245
246 // Check that it is in range
247 // (we can't use contains() since we want an inclusive range)
248 #[cfg(feature = "std")]
249 debug_assert!(runs[0].start <= pos && pos <= runs[0].end);
250
251 (pos..runs[0].end).chain(runs[1..].iter().flat_map(Clone::clone))
252 }
253
254 /// Given a text-relative position `pos` and an index of the level run it is in,
255 /// produce an iterator of all characters before and excludingpos (`..pos`) that are in this
256 /// run sequence
257 pub(crate) fn iter_backwards_from(
258 &self,
259 pos: usize,
260 level_run_index: usize,
261 ) -> impl Iterator<Item = usize> + '_ {
262 let prev_runs = &self.runs[..level_run_index];
263 let current = &self.runs[level_run_index];
264
265 // Check that it is in range
266 // (we can't use contains() since we want an inclusive range)
267 #[cfg(feature = "std")]
268 debug_assert!(current.start <= pos && pos <= current.end);
269
270 (current.start..pos)
271 .rev()
272 .chain(prev_runs.iter().rev().flat_map(Clone::clone))
273 }
274}
275
276/// Finds the level runs in a paragraph.
277///
278/// <http://www.unicode.org/reports/tr9/#BD7>
279///
280/// This is only used by tests; normally level runs are identified during explicit::compute.
281#[cfg(test)]
282fn level_runs(levels: &[Level], original_classes: &[BidiClass]) -> Vec<LevelRun> {
283 assert_eq!(levels.len(), original_classes.len());
284
285 let mut runs = Vec::new();
286 if levels.is_empty() {
287 return runs;
288 }
289
290 let mut current_run_level = levels[0];
291 let mut current_run_start = 0;
292 for i in 1..levels.len() {
293 if !removed_by_x9(original_classes[i]) && levels[i] != current_run_level {
294 // End the last run and start a new one.
295 runs.push(current_run_start..i);
296 current_run_level = levels[i];
297 current_run_start = i;
298 }
299 }
300 runs.push(current_run_start..levels.len());
301
302 runs
303}
304
305/// Should this character be ignored in steps after X9?
306///
307/// <http://www.unicode.org/reports/tr9/#X9>
308pub fn removed_by_x9(class: BidiClass) -> bool {
309 matches!(class, RLE | LRE | RLO | LRO | PDF | BN)
310}
311
312// For use as a predicate for `position` / `rposition`
313pub fn not_removed_by_x9(class: &BidiClass) -> bool {
314 !removed_by_x9(*class)
315}
316
317#[cfg(test)]
318mod tests {
319 use super::*;
320
321 #[test]
322 fn test_level_runs() {
323 assert_eq!(level_runs(&Level::vec(&[]), &[]), &[]);
324 assert_eq!(
325 level_runs(&Level::vec(&[0, 0, 0, 1, 1, 2, 0, 0]), &[L; 8]),
326 &[0..3, 3..5, 5..6, 6..8]
327 );
328 }
329
330 // From <http://www.unicode.org/reports/tr9/#BD13>
331 #[rustfmt::skip]
332 #[test]
333 fn test_isolating_run_sequences() {
334
335 // == Example 1 ==
336 // text1·RLE·text2·PDF·RLE·text3·PDF·text4
337 // index 0 1 2 3 4 5 6 7
338 let classes = &[L, RLE, L, PDF, RLE, L, PDF, L];
339 let levels = &[0, 1, 1, 1, 1, 1, 1, 0];
340 let para_level = Level::ltr();
341 let mut sequences = IsolatingRunSequenceVec::new();
342 isolating_run_sequences(
343 para_level,
344 classes,
345 &Level::vec(levels),
346 level_runs(&Level::vec(levels), classes).into(),
347 false,
348 &mut sequences);
349 sequences.sort_by(|a, b| a.runs[0].clone().cmp(b.runs[0].clone()));
350 assert_eq!(
351 sequences.iter().map(|s| s.runs.clone()).collect::<Vec<_>>(),
352 vec![vec![0..2], vec![2..7], vec![7..8]]
353 );
354
355 // == Example 2 ==
356 // text1·RLI·text2·PDI·RLI·text3·PDI·text4
357 // index 0 1 2 3 4 5 6 7
358 let classes = &[L, RLI, L, PDI, RLI, L, PDI, L];
359 let levels = &[0, 0, 1, 0, 0, 1, 0, 0];
360 let para_level = Level::ltr();
361 let mut sequences = IsolatingRunSequenceVec::new();
362 isolating_run_sequences(
363 para_level,
364 classes,
365 &Level::vec(levels),
366 level_runs(&Level::vec(levels), classes).into(),
367 true,
368 &mut sequences);
369 sequences.sort_by(|a, b| a.runs[0].clone().cmp(b.runs[0].clone()));
370 assert_eq!(
371 sequences.iter().map(|s| s.runs.clone()).collect::<Vec<_>>(),
372 vec![vec![0..2, 3..5, 6..8], vec![2..3], vec![5..6]]
373 );
374
375 // == Example 3 ==
376 // text1·RLI·text2·LRI·text3·RLE·text4·PDF·text5·PDI·text6·PDI·text7
377 // index 0 1 2 3 4 5 6 7 8 9 10 11 12
378 let classes = &[L, RLI, L, LRI, L, RLE, L, PDF, L, PDI, L, PDI, L];
379 let levels = &[0, 0, 1, 1, 2, 3, 3, 3, 2, 1, 1, 0, 0];
380 let para_level = Level::ltr();
381 let mut sequences = IsolatingRunSequenceVec::new();
382 isolating_run_sequences(
383 para_level,
384 classes,
385 &Level::vec(levels),
386 level_runs(&Level::vec(levels), classes).into(),
387 true,
388 &mut sequences);
389 sequences.sort_by(|a, b| a.runs[0].clone().cmp(b.runs[0].clone()));
390 assert_eq!(
391 sequences.iter().map(|s| s.runs.clone()).collect::<Vec<_>>(),
392 vec![vec![0..2, 11..13], vec![2..4, 9..11], vec![4..6], vec![6..8], vec![8..9]]
393 );
394 }
395
396 // From <http://www.unicode.org/reports/tr9/#X10>
397 #[rustfmt::skip]
398 #[test]
399 fn test_isolating_run_sequences_sos_and_eos() {
400
401 // == Example 1 ==
402 // text1·RLE·text2·LRE·text3·PDF·text4·PDF·RLE·text5·PDF·text6
403 // index 0 1 2 3 4 5 6 7 8 9 10 11
404 let classes = &[L, RLE, L, LRE, L, PDF, L, PDF, RLE, L, PDF, L];
405 let levels = &[0, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 0];
406 let para_level = Level::ltr();
407 let mut sequences = IsolatingRunSequenceVec::new();
408 isolating_run_sequences(
409 para_level,
410 classes,
411 &Level::vec(levels),
412 level_runs(&Level::vec(levels), classes).into(),
413 false,
414 &mut sequences);
415 sequences.sort_by(|a, b| a.runs[0].clone().cmp(b.runs[0].clone()));
416
417 // text1
418 assert_eq!(
419 &sequences[0],
420 &IsolatingRunSequence {
421 runs: vec![0..2],
422 sos: L,
423 eos: R,
424 }
425 );
426
427 // text2
428 assert_eq!(
429 &sequences[1],
430 &IsolatingRunSequence {
431 runs: vec![2..4],
432 sos: R,
433 eos: L,
434 }
435 );
436
437 // text3
438 assert_eq!(
439 &sequences[2],
440 &IsolatingRunSequence {
441 runs: vec![4..6],
442 sos: L,
443 eos: L,
444 }
445 );
446
447 // text4 text5
448 assert_eq!(
449 &sequences[3],
450 &IsolatingRunSequence {
451 runs: vec![6..11],
452 sos: L,
453 eos: R,
454 }
455 );
456
457 // text6
458 assert_eq!(
459 &sequences[4],
460 &IsolatingRunSequence {
461 runs: vec![11..12],
462 sos: R,
463 eos: L,
464 }
465 );
466
467 // == Example 2 ==
468 // text1·RLI·text2·LRI·text3·PDI·text4·PDI·RLI·text5·PDI·text6
469 // index 0 1 2 3 4 5 6 7 8 9 10 11
470 let classes = &[L, RLI, L, LRI, L, PDI, L, PDI, RLI, L, PDI, L];
471 let levels = &[0, 0, 1, 1, 2, 1, 1, 0, 0, 1, 0, 0];
472 let para_level = Level::ltr();
473 let mut sequences = IsolatingRunSequenceVec::new();
474 isolating_run_sequences(
475 para_level,
476 classes,
477 &Level::vec(levels),
478 level_runs(&Level::vec(levels), classes).into(),
479 true,
480 &mut sequences);
481 sequences.sort_by(|a, b| a.runs[0].clone().cmp(b.runs[0].clone()));
482
483 // text1·RLI·PDI·RLI·PDI·text6
484 assert_eq!(
485 &sequences[0],
486 &IsolatingRunSequence {
487 runs: vec![0..2, 7..9, 10..12],
488 sos: L,
489 eos: L,
490 }
491 );
492
493 // text2·LRI·PDI·text4
494 assert_eq!(
495 &sequences[1],
496 &IsolatingRunSequence {
497 runs: vec![2..4, 5..7],
498 sos: R,
499 eos: R,
500 }
501 );
502
503 // text3
504 assert_eq!(
505 &sequences[2],
506 &IsolatingRunSequence {
507 runs: vec![4..5],
508 sos: L,
509 eos: L,
510 }
511 );
512
513 // text5
514 assert_eq!(
515 &sequences[3],
516 &IsolatingRunSequence {
517 runs: vec![9..10],
518 sos: R,
519 eos: R,
520 }
521 );
522 }
523
524 #[test]
525 fn test_removed_by_x9() {
526 let rem_classes = &[RLE, LRE, RLO, LRO, PDF, BN];
527 let not_classes = &[L, RLI, AL, LRI, PDI];
528 for x in rem_classes {
529 assert_eq!(removed_by_x9(*x), true);
530 }
531 for x in not_classes {
532 assert_eq!(removed_by_x9(*x), false);
533 }
534 }
535
536 #[test]
537 fn test_not_removed_by_x9() {
538 let non_x9_classes = &[L, R, AL, EN, ES, ET, AN, CS, NSM, B, S, WS, ON, LRI, RLI, FSI, PDI];
539 for x in non_x9_classes {
540 assert_eq!(not_removed_by_x9(&x), true);
541 }
542 }
543}
544