1// Copyright 2015 The Servo Project Developers. See the
2// COPYRIGHT file at the top-level directory of this distribution.
3//
4// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7// option. This file may not be copied, modified, or distributed
8// except according to those terms.
9
10//! 3.3.3 Preparations for Implicit Processing
11//!
12//! <http://www.unicode.org/reports/tr9/#Preparations_for_Implicit_Processing>
13
14use alloc::vec::Vec;
15use core::cmp::max;
16use core::ops::Range;
17
18use super::level::Level;
19use super::BidiClass::{self, *};
20
21/// A maximal substring of characters with the same embedding level.
22///
23/// Represented as a range of byte indices.
24pub type LevelRun = Range<usize>;
25
26/// Output of `isolating_run_sequences` (steps X9-X10)
27#[derive(Debug, PartialEq)]
28pub struct IsolatingRunSequence {
29 pub runs: Vec<LevelRun>,
30 pub sos: BidiClass, // Start-of-sequence type.
31 pub eos: BidiClass, // End-of-sequence type.
32}
33
34/// Compute the set of isolating run sequences.
35///
36/// An isolating run sequence is a maximal sequence of level runs such that for all level runs
37/// except the last one in the sequence, the last character of the run is an isolate initiator
38/// whose matching PDI is the first character of the next level run in the sequence.
39///
40/// Note: This function does *not* return the sequences in order by their first characters.
41#[cfg_attr(feature = "flame_it", flamer::flame)]
42pub fn isolating_run_sequences(
43 para_level: Level,
44 original_classes: &[BidiClass],
45 levels: &[Level],
46) -> Vec<IsolatingRunSequence> {
47 let runs = level_runs(levels, original_classes);
48
49 // Compute the set of isolating run sequences.
50 // <http://www.unicode.org/reports/tr9/#BD13>
51 let mut sequences = Vec::with_capacity(runs.len());
52
53 // When we encounter an isolate initiator, we push the current sequence onto the
54 // stack so we can resume it after the matching PDI.
55 let mut stack = vec![Vec::new()];
56
57 for run in runs {
58 assert!(run.len() > 0);
59 assert!(!stack.is_empty());
60
61 let start_class = original_classes[run.start];
62 let end_class = original_classes[run.end - 1];
63
64 let mut sequence = if start_class == PDI && stack.len() > 1 {
65 // Continue a previous sequence interrupted by an isolate.
66 stack.pop().unwrap()
67 } else {
68 // Start a new sequence.
69 Vec::new()
70 };
71
72 sequence.push(run);
73
74 if let RLI | LRI | FSI = end_class {
75 // Resume this sequence after the isolate.
76 stack.push(sequence);
77 } else {
78 // This sequence is finished.
79 sequences.push(sequence);
80 }
81 }
82 // Pop any remaning sequences off the stack.
83 sequences.extend(stack.into_iter().rev().filter(|seq| !seq.is_empty()));
84
85 // Determine the `sos` and `eos` class for each sequence.
86 // <http://www.unicode.org/reports/tr9/#X10>
87 sequences
88 .into_iter()
89 .map(|sequence: Vec<LevelRun>| {
90 assert!(!sequence.is_empty());
91
92 let mut result = IsolatingRunSequence {
93 runs: sequence,
94 sos: L,
95 eos: L,
96 };
97
98 let start_of_seq = result.runs[0].start;
99 let runs_len = result.runs.len();
100 let end_of_seq = result.runs[runs_len - 1].end;
101
102 // > (not counting characters removed by X9)
103 let seq_level = result
104 .iter_forwards_from(start_of_seq, 0)
105 .filter(|i| not_removed_by_x9(&original_classes[*i]))
106 .map(|i| levels[i])
107 .next()
108 .unwrap_or(levels[start_of_seq]);
109
110 // XXXManishearth the spec talks of a start and end level,
111 // but for a given IRS the two should be equivalent, yes?
112 let end_level = result
113 .iter_backwards_from(end_of_seq, runs_len - 1)
114 .filter(|i| not_removed_by_x9(&original_classes[*i]))
115 .map(|i| levels[i])
116 .next()
117 .unwrap_or(levels[end_of_seq - 1]);
118
119 #[cfg(test)]
120 for run in result.runs.clone() {
121 for idx in run {
122 if not_removed_by_x9(&original_classes[idx]) {
123 assert_eq!(seq_level, levels[idx]);
124 }
125 }
126 }
127
128 // Get the level of the last non-removed char before the runs.
129 let pred_level = match original_classes[..start_of_seq]
130 .iter()
131 .rposition(not_removed_by_x9)
132 {
133 Some(idx) => levels[idx],
134 None => para_level,
135 };
136
137 // Get the last non-removed character to check if it is an isolate initiator.
138 // The spec calls for an unmatched one, but matched isolate initiators
139 // will never be at the end of a level run (otherwise there would be more to the run).
140 // We unwrap_or(BN) because BN marks removed classes and it won't matter for the check.
141 let last_non_removed = original_classes[..end_of_seq]
142 .iter()
143 .copied()
144 .rev()
145 .find(not_removed_by_x9)
146 .unwrap_or(BN);
147
148 // Get the level of the next non-removed char after the runs.
149 let succ_level = if let RLI | LRI | FSI = last_non_removed {
150 para_level
151 } else {
152 match original_classes[end_of_seq..]
153 .iter()
154 .position(not_removed_by_x9)
155 {
156 Some(idx) => levels[end_of_seq + idx],
157 None => para_level,
158 }
159 };
160
161 result.sos = max(seq_level, pred_level).bidi_class();
162 result.eos = max(end_level, succ_level).bidi_class();
163 result
164 })
165 .collect()
166}
167
168impl IsolatingRunSequence {
169 /// Returns the full range of text represented by this isolating run sequence
170 pub(crate) fn text_range(&self) -> Range<usize> {
171 if let (Some(start), Some(end)) = (self.runs.first(), self.runs.last()) {
172 start.start..end.end
173 } else {
174 return 0..0;
175 }
176 }
177
178 /// Given a text-relative position `pos` and an index of the level run it is in,
179 /// produce an iterator of all characters after and pos (`pos..`) that are in this
180 /// run sequence
181 pub(crate) fn iter_forwards_from(
182 &self,
183 pos: usize,
184 level_run_index: usize,
185 ) -> impl Iterator<Item = usize> + '_ {
186 let runs = &self.runs[level_run_index..];
187
188 // Check that it is in range
189 // (we can't use contains() since we want an inclusive range)
190 #[cfg(feature = "std")]
191 debug_assert!(runs[0].start <= pos && pos <= runs[0].end);
192
193 (pos..runs[0].end).chain(runs[1..].iter().flat_map(Clone::clone))
194 }
195
196 /// Given a text-relative position `pos` and an index of the level run it is in,
197 /// produce an iterator of all characters before and excludingpos (`..pos`) that are in this
198 /// run sequence
199 pub(crate) fn iter_backwards_from(
200 &self,
201 pos: usize,
202 level_run_index: usize,
203 ) -> impl Iterator<Item = usize> + '_ {
204 let prev_runs = &self.runs[..level_run_index];
205 let current = &self.runs[level_run_index];
206
207 // Check that it is in range
208 // (we can't use contains() since we want an inclusive range)
209 #[cfg(feature = "std")]
210 debug_assert!(current.start <= pos && pos <= current.end);
211
212 (current.start..pos)
213 .rev()
214 .chain(prev_runs.iter().rev().flat_map(Clone::clone))
215 }
216}
217
218/// Finds the level runs in a paragraph.
219///
220/// <http://www.unicode.org/reports/tr9/#BD7>
221fn level_runs(levels: &[Level], original_classes: &[BidiClass]) -> Vec<LevelRun> {
222 assert_eq!(levels.len(), original_classes.len());
223
224 let mut runs: Vec> = Vec::new();
225 if levels.is_empty() {
226 return runs;
227 }
228
229 let mut current_run_level: Level = levels[0];
230 let mut current_run_start: usize = 0;
231 for i: usize in 1..levels.len() {
232 if !removed_by_x9(class:original_classes[i]) && levels[i] != current_run_level {
233 // End the last run and start a new one.
234 runs.push(current_run_start..i);
235 current_run_level = levels[i];
236 current_run_start = i;
237 }
238 }
239 runs.push(current_run_start..levels.len());
240
241 runs
242}
243
244/// Should this character be ignored in steps after X9?
245///
246/// <http://www.unicode.org/reports/tr9/#X9>
247pub fn removed_by_x9(class: BidiClass) -> bool {
248 match class {
249 RLE | LRE | RLO | LRO | PDF | BN => true,
250 _ => false,
251 }
252}
253
254// For use as a predicate for `position` / `rposition`
255pub fn not_removed_by_x9(class: &BidiClass) -> bool {
256 !removed_by_x9(*class)
257}
258
259#[cfg(test)]
260mod tests {
261 use super::*;
262
263 #[test]
264 fn test_level_runs() {
265 assert_eq!(level_runs(&Level::vec(&[]), &[]), &[]);
266 assert_eq!(
267 level_runs(&Level::vec(&[0, 0, 0, 1, 1, 2, 0, 0]), &[L; 8]),
268 &[0..3, 3..5, 5..6, 6..8]
269 );
270 }
271
272 // From <http://www.unicode.org/reports/tr9/#BD13>
273 #[rustfmt::skip]
274 #[test]
275 fn test_isolating_run_sequences() {
276
277 // == Example 1 ==
278 // text1·RLE·text2·PDF·RLE·text3·PDF·text4
279 // index 0 1 2 3 4 5 6 7
280 let classes = &[L, RLE, L, PDF, RLE, L, PDF, L];
281 let levels = &[0, 1, 1, 1, 1, 1, 1, 0];
282 let para_level = Level::ltr();
283 let mut sequences = isolating_run_sequences(para_level, classes, &Level::vec(levels));
284 sequences.sort_by(|a, b| a.runs[0].clone().cmp(b.runs[0].clone()));
285 assert_eq!(
286 sequences.iter().map(|s| s.runs.clone()).collect::<Vec<_>>(),
287 vec![vec![0..2], vec![2..7], vec![7..8]]
288 );
289
290 // == Example 2 ==
291 // text1·RLI·text2·PDI·RLI·text3·PDI·text4
292 // index 0 1 2 3 4 5 6 7
293 let classes = &[L, RLI, L, PDI, RLI, L, PDI, L];
294 let levels = &[0, 0, 1, 0, 0, 1, 0, 0];
295 let para_level = Level::ltr();
296 let mut sequences = isolating_run_sequences(para_level, classes, &Level::vec(levels));
297 sequences.sort_by(|a, b| a.runs[0].clone().cmp(b.runs[0].clone()));
298 assert_eq!(
299 sequences.iter().map(|s| s.runs.clone()).collect::<Vec<_>>(),
300 vec![vec![0..2, 3..5, 6..8], vec![2..3], vec![5..6]]
301 );
302
303 // == Example 3 ==
304 // text1·RLI·text2·LRI·text3·RLE·text4·PDF·text5·PDI·text6·PDI·text7
305 // index 0 1 2 3 4 5 6 7 8 9 10 11 12
306 let classes = &[L, RLI, L, LRI, L, RLE, L, PDF, L, PDI, L, PDI, L];
307 let levels = &[0, 0, 1, 1, 2, 3, 3, 3, 2, 1, 1, 0, 0];
308 let para_level = Level::ltr();
309 let mut sequences = isolating_run_sequences(para_level, classes, &Level::vec(levels));
310 sequences.sort_by(|a, b| a.runs[0].clone().cmp(b.runs[0].clone()));
311 assert_eq!(
312 sequences.iter().map(|s| s.runs.clone()).collect::<Vec<_>>(),
313 vec![vec![0..2, 11..13], vec![2..4, 9..11], vec![4..6], vec![6..8], vec![8..9]]
314 );
315 }
316
317 // From <http://www.unicode.org/reports/tr9/#X10>
318 #[rustfmt::skip]
319 #[test]
320 fn test_isolating_run_sequences_sos_and_eos() {
321
322 // == Example 1 ==
323 // text1·RLE·text2·LRE·text3·PDF·text4·PDF·RLE·text5·PDF·text6
324 // index 0 1 2 3 4 5 6 7 8 9 10 11
325 let classes = &[L, RLE, L, LRE, L, PDF, L, PDF, RLE, L, PDF, L];
326 let levels = &[0, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 0];
327 let para_level = Level::ltr();
328 let mut sequences = isolating_run_sequences(para_level, classes, &Level::vec(levels));
329 sequences.sort_by(|a, b| a.runs[0].clone().cmp(b.runs[0].clone()));
330
331 // text1
332 assert_eq!(
333 &sequences[0],
334 &IsolatingRunSequence {
335 runs: vec![0..2],
336 sos: L,
337 eos: R,
338 }
339 );
340
341 // text2
342 assert_eq!(
343 &sequences[1],
344 &IsolatingRunSequence {
345 runs: vec![2..4],
346 sos: R,
347 eos: L,
348 }
349 );
350
351 // text3
352 assert_eq!(
353 &sequences[2],
354 &IsolatingRunSequence {
355 runs: vec![4..6],
356 sos: L,
357 eos: L,
358 }
359 );
360
361 // text4 text5
362 assert_eq!(
363 &sequences[3],
364 &IsolatingRunSequence {
365 runs: vec![6..11],
366 sos: L,
367 eos: R,
368 }
369 );
370
371 // text6
372 assert_eq!(
373 &sequences[4],
374 &IsolatingRunSequence {
375 runs: vec![11..12],
376 sos: R,
377 eos: L,
378 }
379 );
380
381 // == Example 2 ==
382 // text1·RLI·text2·LRI·text3·PDI·text4·PDI·RLI·text5·PDI·text6
383 // index 0 1 2 3 4 5 6 7 8 9 10 11
384 let classes = &[L, RLI, L, LRI, L, PDI, L, PDI, RLI, L, PDI, L];
385 let levels = &[0, 0, 1, 1, 2, 1, 1, 0, 0, 1, 0, 0];
386 let para_level = Level::ltr();
387 let mut sequences = isolating_run_sequences(para_level, classes, &Level::vec(levels));
388 sequences.sort_by(|a, b| a.runs[0].clone().cmp(b.runs[0].clone()));
389
390 // text1·RLI·PDI·RLI·PDI·text6
391 assert_eq!(
392 &sequences[0],
393 &IsolatingRunSequence {
394 runs: vec![0..2, 7..9, 10..12],
395 sos: L,
396 eos: L,
397 }
398 );
399
400 // text2·LRI·PDI·text4
401 assert_eq!(
402 &sequences[1],
403 &IsolatingRunSequence {
404 runs: vec![2..4, 5..7],
405 sos: R,
406 eos: R,
407 }
408 );
409
410 // text3
411 assert_eq!(
412 &sequences[2],
413 &IsolatingRunSequence {
414 runs: vec![4..5],
415 sos: L,
416 eos: L,
417 }
418 );
419
420 // text5
421 assert_eq!(
422 &sequences[3],
423 &IsolatingRunSequence {
424 runs: vec![9..10],
425 sos: R,
426 eos: R,
427 }
428 );
429 }
430
431 #[test]
432 fn test_removed_by_x9() {
433 let rem_classes = &[RLE, LRE, RLO, LRO, PDF, BN];
434 let not_classes = &[L, RLI, AL, LRI, PDI];
435 for x in rem_classes {
436 assert_eq!(removed_by_x9(*x), true);
437 }
438 for x in not_classes {
439 assert_eq!(removed_by_x9(*x), false);
440 }
441 }
442
443 #[test]
444 fn test_not_removed_by_x9() {
445 let non_x9_classes = &[L, R, AL, EN, ES, ET, AN, CS, NSM, B, S, WS, ON, LRI, RLI, FSI, PDI];
446 for x in non_x9_classes {
447 assert_eq!(not_removed_by_x9(&x), true);
448 }
449 }
450}
451