1// Copyright 2015 The Servo Project Developers. See the
2// COPYRIGHT file at the top-level directory of this distribution.
3//
4// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7// option. This file may not be copied, modified, or distributed
8// except according to those terms.
9
10//! 3.3.3 Preparations for Implicit Processing
11//!
12//! <http://www.unicode.org/reports/tr9/#Preparations_for_Implicit_Processing>
13
14use alloc::vec::Vec;
15use core::cmp::max;
16use core::ops::Range;
17
18use super::level::Level;
19use super::BidiClass::{self, *};
20
21/// A maximal substring of characters with the same embedding level.
22///
23/// Represented as a range of byte indices.
24pub type LevelRun = Range<usize>;
25
26/// Output of `isolating_run_sequences` (steps X9-X10)
27#[derive(Debug, PartialEq)]
28pub struct IsolatingRunSequence {
29 pub runs: Vec<LevelRun>,
30 pub sos: BidiClass, // Start-of-sequence type.
31 pub eos: BidiClass, // End-of-sequence type.
32}
33
34/// Compute the set of isolating run sequences.
35///
36/// An isolating run sequence is a maximal sequence of level runs such that for all level runs
37/// except the last one in the sequence, the last character of the run is an isolate initiator
38/// whose matching PDI is the first character of the next level run in the sequence.
39///
40/// Note: This function does *not* return the sequences in order by their first characters.
41#[cfg_attr(feature = "flame_it", flamer::flame)]
42pub fn isolating_run_sequences(
43 para_level: Level,
44 original_classes: &[BidiClass],
45 levels: &[Level],
46) -> Vec<IsolatingRunSequence> {
47 let runs = level_runs(levels, original_classes);
48
49 // Compute the set of isolating run sequences.
50 // <http://www.unicode.org/reports/tr9/#BD13>
51 let mut sequences = Vec::with_capacity(runs.len());
52
53 // When we encounter an isolate initiator, we push the current sequence onto the
54 // stack so we can resume it after the matching PDI.
55 let mut stack = vec![Vec::new()];
56
57 for run in runs {
58 assert!(run.len() > 0);
59 assert!(!stack.is_empty());
60
61 let start_class = original_classes[run.start];
62 // > In rule X10, [..] skip over any BNs when [..].
63 // > Do the same when determining if the last character of the sequence is an isolate initiator.
64 //
65 // <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters>
66 let end_class = original_classes[run.start..run.end]
67 .iter()
68 .copied()
69 .rev()
70 .filter(not_removed_by_x9)
71 .next()
72 .unwrap_or(start_class);
73
74 let mut sequence = if start_class == PDI && stack.len() > 1 {
75 // Continue a previous sequence interrupted by an isolate.
76 stack.pop().unwrap()
77 } else {
78 // Start a new sequence.
79 Vec::new()
80 };
81
82 sequence.push(run);
83
84 if let RLI | LRI | FSI = end_class {
85 // Resume this sequence after the isolate.
86 stack.push(sequence);
87 } else {
88 // This sequence is finished.
89 sequences.push(sequence);
90 }
91 }
92 // Pop any remaning sequences off the stack.
93 sequences.extend(stack.into_iter().rev().filter(|seq| !seq.is_empty()));
94
95 // Determine the `sos` and `eos` class for each sequence.
96 // <http://www.unicode.org/reports/tr9/#X10>
97 sequences
98 .into_iter()
99 .map(|sequence: Vec<LevelRun>| {
100 assert!(!sequence.is_empty());
101
102 let mut result = IsolatingRunSequence {
103 runs: sequence,
104 sos: L,
105 eos: L,
106 };
107
108 let start_of_seq = result.runs[0].start;
109 let runs_len = result.runs.len();
110 let end_of_seq = result.runs[runs_len - 1].end;
111
112 // > (not counting characters removed by X9)
113 let seq_level = result
114 .iter_forwards_from(start_of_seq, 0)
115 .filter(|i| not_removed_by_x9(&original_classes[*i]))
116 .map(|i| levels[i])
117 .next()
118 .unwrap_or(levels[start_of_seq]);
119
120 // XXXManishearth the spec talks of a start and end level,
121 // but for a given IRS the two should be equivalent, yes?
122 let end_level = result
123 .iter_backwards_from(end_of_seq, runs_len - 1)
124 .filter(|i| not_removed_by_x9(&original_classes[*i]))
125 .map(|i| levels[i])
126 .next()
127 .unwrap_or(levels[end_of_seq - 1]);
128
129 #[cfg(test)]
130 for run in result.runs.clone() {
131 for idx in run {
132 if not_removed_by_x9(&original_classes[idx]) {
133 assert_eq!(seq_level, levels[idx]);
134 }
135 }
136 }
137
138 // Get the level of the last non-removed char before the runs.
139 let pred_level = match original_classes[..start_of_seq]
140 .iter()
141 .rposition(not_removed_by_x9)
142 {
143 Some(idx) => levels[idx],
144 None => para_level,
145 };
146
147 // Get the last non-removed character to check if it is an isolate initiator.
148 // The spec calls for an unmatched one, but matched isolate initiators
149 // will never be at the end of a level run (otherwise there would be more to the run).
150 // We unwrap_or(BN) because BN marks removed classes and it won't matter for the check.
151 let last_non_removed = original_classes[..end_of_seq]
152 .iter()
153 .copied()
154 .rev()
155 .find(not_removed_by_x9)
156 .unwrap_or(BN);
157
158 // Get the level of the next non-removed char after the runs.
159 let succ_level = if let RLI | LRI | FSI = last_non_removed {
160 para_level
161 } else {
162 match original_classes[end_of_seq..]
163 .iter()
164 .position(not_removed_by_x9)
165 {
166 Some(idx) => levels[end_of_seq + idx],
167 None => para_level,
168 }
169 };
170
171 result.sos = max(seq_level, pred_level).bidi_class();
172 result.eos = max(end_level, succ_level).bidi_class();
173 result
174 })
175 .collect()
176}
177
178impl IsolatingRunSequence {
179 /// Given a text-relative position `pos` and an index of the level run it is in,
180 /// produce an iterator of all characters after and pos (`pos..`) that are in this
181 /// run sequence
182 pub(crate) fn iter_forwards_from(
183 &self,
184 pos: usize,
185 level_run_index: usize,
186 ) -> impl Iterator<Item = usize> + '_ {
187 let runs = &self.runs[level_run_index..];
188
189 // Check that it is in range
190 // (we can't use contains() since we want an inclusive range)
191 #[cfg(feature = "std")]
192 debug_assert!(runs[0].start <= pos && pos <= runs[0].end);
193
194 (pos..runs[0].end).chain(runs[1..].iter().flat_map(Clone::clone))
195 }
196
197 /// Given a text-relative position `pos` and an index of the level run it is in,
198 /// produce an iterator of all characters before and excludingpos (`..pos`) that are in this
199 /// run sequence
200 pub(crate) fn iter_backwards_from(
201 &self,
202 pos: usize,
203 level_run_index: usize,
204 ) -> impl Iterator<Item = usize> + '_ {
205 let prev_runs = &self.runs[..level_run_index];
206 let current = &self.runs[level_run_index];
207
208 // Check that it is in range
209 // (we can't use contains() since we want an inclusive range)
210 #[cfg(feature = "std")]
211 debug_assert!(current.start <= pos && pos <= current.end);
212
213 (current.start..pos)
214 .rev()
215 .chain(prev_runs.iter().rev().flat_map(Clone::clone))
216 }
217}
218
219/// Finds the level runs in a paragraph.
220///
221/// <http://www.unicode.org/reports/tr9/#BD7>
222fn level_runs(levels: &[Level], original_classes: &[BidiClass]) -> Vec<LevelRun> {
223 assert_eq!(levels.len(), original_classes.len());
224
225 let mut runs: Vec> = Vec::new();
226 if levels.is_empty() {
227 return runs;
228 }
229
230 let mut current_run_level: Level = levels[0];
231 let mut current_run_start: usize = 0;
232 for i: usize in 1..levels.len() {
233 if !removed_by_x9(class:original_classes[i]) && levels[i] != current_run_level {
234 // End the last run and start a new one.
235 runs.push(current_run_start..i);
236 current_run_level = levels[i];
237 current_run_start = i;
238 }
239 }
240 runs.push(current_run_start..levels.len());
241
242 runs
243}
244
245/// Should this character be ignored in steps after X9?
246///
247/// <http://www.unicode.org/reports/tr9/#X9>
248pub fn removed_by_x9(class: BidiClass) -> bool {
249 match class {
250 RLE | LRE | RLO | LRO | PDF | BN => true,
251 _ => false,
252 }
253}
254
255// For use as a predicate for `position` / `rposition`
256pub fn not_removed_by_x9(class: &BidiClass) -> bool {
257 !removed_by_x9(*class)
258}
259
260#[cfg(test)]
261mod tests {
262 use super::*;
263
264 #[test]
265 fn test_level_runs() {
266 assert_eq!(level_runs(&Level::vec(&[]), &[]), &[]);
267 assert_eq!(
268 level_runs(&Level::vec(&[0, 0, 0, 1, 1, 2, 0, 0]), &[L; 8]),
269 &[0..3, 3..5, 5..6, 6..8]
270 );
271 }
272
273 // From <http://www.unicode.org/reports/tr9/#BD13>
274 #[rustfmt::skip]
275 #[test]
276 fn test_isolating_run_sequences() {
277
278 // == Example 1 ==
279 // text1·RLE·text2·PDF·RLE·text3·PDF·text4
280 // index 0 1 2 3 4 5 6 7
281 let classes = &[L, RLE, L, PDF, RLE, L, PDF, L];
282 let levels = &[0, 1, 1, 1, 1, 1, 1, 0];
283 let para_level = Level::ltr();
284 let mut sequences = isolating_run_sequences(para_level, classes, &Level::vec(levels));
285 sequences.sort_by(|a, b| a.runs[0].clone().cmp(b.runs[0].clone()));
286 assert_eq!(
287 sequences.iter().map(|s| s.runs.clone()).collect::<Vec<_>>(),
288 vec![vec![0..2], vec![2..7], vec![7..8]]
289 );
290
291 // == Example 2 ==
292 // text1·RLI·text2·PDI·RLI·text3·PDI·text4
293 // index 0 1 2 3 4 5 6 7
294 let classes = &[L, RLI, L, PDI, RLI, L, PDI, L];
295 let levels = &[0, 0, 1, 0, 0, 1, 0, 0];
296 let para_level = Level::ltr();
297 let mut sequences = isolating_run_sequences(para_level, classes, &Level::vec(levels));
298 sequences.sort_by(|a, b| a.runs[0].clone().cmp(b.runs[0].clone()));
299 assert_eq!(
300 sequences.iter().map(|s| s.runs.clone()).collect::<Vec<_>>(),
301 vec![vec![0..2, 3..5, 6..8], vec![2..3], vec![5..6]]
302 );
303
304 // == Example 3 ==
305 // text1·RLI·text2·LRI·text3·RLE·text4·PDF·text5·PDI·text6·PDI·text7
306 // index 0 1 2 3 4 5 6 7 8 9 10 11 12
307 let classes = &[L, RLI, L, LRI, L, RLE, L, PDF, L, PDI, L, PDI, L];
308 let levels = &[0, 0, 1, 1, 2, 3, 3, 3, 2, 1, 1, 0, 0];
309 let para_level = Level::ltr();
310 let mut sequences = isolating_run_sequences(para_level, classes, &Level::vec(levels));
311 sequences.sort_by(|a, b| a.runs[0].clone().cmp(b.runs[0].clone()));
312 assert_eq!(
313 sequences.iter().map(|s| s.runs.clone()).collect::<Vec<_>>(),
314 vec![vec![0..2, 11..13], vec![2..4, 9..11], vec![4..6], vec![6..8], vec![8..9]]
315 );
316 }
317
318 // From <http://www.unicode.org/reports/tr9/#X10>
319 #[rustfmt::skip]
320 #[test]
321 fn test_isolating_run_sequences_sos_and_eos() {
322
323 // == Example 1 ==
324 // text1·RLE·text2·LRE·text3·PDF·text4·PDF·RLE·text5·PDF·text6
325 // index 0 1 2 3 4 5 6 7 8 9 10 11
326 let classes = &[L, RLE, L, LRE, L, PDF, L, PDF, RLE, L, PDF, L];
327 let levels = &[0, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 0];
328 let para_level = Level::ltr();
329 let mut sequences = isolating_run_sequences(para_level, classes, &Level::vec(levels));
330 sequences.sort_by(|a, b| a.runs[0].clone().cmp(b.runs[0].clone()));
331
332 // text1
333 assert_eq!(
334 &sequences[0],
335 &IsolatingRunSequence {
336 runs: vec![0..2],
337 sos: L,
338 eos: R,
339 }
340 );
341
342 // text2
343 assert_eq!(
344 &sequences[1],
345 &IsolatingRunSequence {
346 runs: vec![2..4],
347 sos: R,
348 eos: L,
349 }
350 );
351
352 // text3
353 assert_eq!(
354 &sequences[2],
355 &IsolatingRunSequence {
356 runs: vec![4..6],
357 sos: L,
358 eos: L,
359 }
360 );
361
362 // text4 text5
363 assert_eq!(
364 &sequences[3],
365 &IsolatingRunSequence {
366 runs: vec![6..11],
367 sos: L,
368 eos: R,
369 }
370 );
371
372 // text6
373 assert_eq!(
374 &sequences[4],
375 &IsolatingRunSequence {
376 runs: vec![11..12],
377 sos: R,
378 eos: L,
379 }
380 );
381
382 // == Example 2 ==
383 // text1·RLI·text2·LRI·text3·PDI·text4·PDI·RLI·text5·PDI·text6
384 // index 0 1 2 3 4 5 6 7 8 9 10 11
385 let classes = &[L, RLI, L, LRI, L, PDI, L, PDI, RLI, L, PDI, L];
386 let levels = &[0, 0, 1, 1, 2, 1, 1, 0, 0, 1, 0, 0];
387 let para_level = Level::ltr();
388 let mut sequences = isolating_run_sequences(para_level, classes, &Level::vec(levels));
389 sequences.sort_by(|a, b| a.runs[0].clone().cmp(b.runs[0].clone()));
390
391 // text1·RLI·PDI·RLI·PDI·text6
392 assert_eq!(
393 &sequences[0],
394 &IsolatingRunSequence {
395 runs: vec![0..2, 7..9, 10..12],
396 sos: L,
397 eos: L,
398 }
399 );
400
401 // text2·LRI·PDI·text4
402 assert_eq!(
403 &sequences[1],
404 &IsolatingRunSequence {
405 runs: vec![2..4, 5..7],
406 sos: R,
407 eos: R,
408 }
409 );
410
411 // text3
412 assert_eq!(
413 &sequences[2],
414 &IsolatingRunSequence {
415 runs: vec![4..5],
416 sos: L,
417 eos: L,
418 }
419 );
420
421 // text5
422 assert_eq!(
423 &sequences[3],
424 &IsolatingRunSequence {
425 runs: vec![9..10],
426 sos: R,
427 eos: R,
428 }
429 );
430 }
431
432 #[test]
433 fn test_removed_by_x9() {
434 let rem_classes = &[RLE, LRE, RLO, LRO, PDF, BN];
435 let not_classes = &[L, RLI, AL, LRI, PDI];
436 for x in rem_classes {
437 assert_eq!(removed_by_x9(*x), true);
438 }
439 for x in not_classes {
440 assert_eq!(removed_by_x9(*x), false);
441 }
442 }
443
444 #[test]
445 fn test_not_removed_by_x9() {
446 let non_x9_classes = &[L, R, AL, EN, ES, ET, AN, CS, NSM, B, S, WS, ON, LRI, RLI, FSI, PDI];
447 for x in non_x9_classes {
448 assert_eq!(not_removed_by_x9(&x), true);
449 }
450 }
451}
452