1 | // Copyright 2015 The Servo Project Developers. See the |
2 | // COPYRIGHT file at the top-level directory of this distribution. |
3 | // |
4 | // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or |
5 | // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license |
6 | // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your |
7 | // option. This file may not be copied, modified, or distributed |
8 | // except according to those terms. |
9 | |
10 | //! 3.3.3 Preparations for Implicit Processing |
11 | //! |
12 | //! <http://www.unicode.org/reports/tr9/#Preparations_for_Implicit_Processing> |
13 | |
14 | use alloc::vec::Vec; |
15 | use core::cmp::max; |
16 | use core::ops::Range; |
17 | #[cfg (feature = "smallvec" )] |
18 | use smallvec::{smallvec, SmallVec}; |
19 | |
20 | use super::level::Level; |
21 | use super::BidiClass::{self, *}; |
22 | |
23 | /// A maximal substring of characters with the same embedding level. |
24 | /// |
25 | /// Represented as a range of byte indices. |
26 | pub type LevelRun = Range<usize>; |
27 | |
28 | #[cfg (feature = "smallvec" )] |
29 | pub type LevelRunVec = SmallVec<[LevelRun; 8]>; |
30 | #[cfg (not(feature = "smallvec" ))] |
31 | pub type LevelRunVec = Vec<LevelRun>; |
32 | |
33 | /// Output of `isolating_run_sequences` (steps X9-X10) |
34 | #[derive (Debug, PartialEq)] |
35 | pub struct IsolatingRunSequence { |
36 | pub runs: Vec<LevelRun>, |
37 | pub sos: BidiClass, // Start-of-sequence type. |
38 | pub eos: BidiClass, // End-of-sequence type. |
39 | } |
40 | |
41 | #[cfg (feature = "smallvec" )] |
42 | pub type IsolatingRunSequenceVec = SmallVec<[IsolatingRunSequence; 8]>; |
43 | #[cfg (not(feature = "smallvec" ))] |
44 | pub type IsolatingRunSequenceVec = Vec<IsolatingRunSequence>; |
45 | |
46 | /// Compute the set of isolating run sequences. |
47 | /// |
48 | /// An isolating run sequence is a maximal sequence of level runs such that for all level runs |
49 | /// except the last one in the sequence, the last character of the run is an isolate initiator |
50 | /// whose matching PDI is the first character of the next level run in the sequence. |
51 | /// |
52 | /// Note: This function does *not* return the sequences in order by their first characters. |
53 | #[cfg_attr (feature = "flame_it" , flamer::flame)] |
54 | pub fn isolating_run_sequences( |
55 | para_level: Level, |
56 | original_classes: &[BidiClass], |
57 | levels: &[Level], |
58 | runs: LevelRunVec, |
59 | has_isolate_controls: bool, |
60 | isolating_run_sequences: &mut IsolatingRunSequenceVec, |
61 | ) { |
62 | // Per http://www.unicode.org/reports/tr9/#BD13: |
63 | // "In the absence of isolate initiators, each isolating run sequence in a paragraph |
64 | // consists of exactly one level run, and each level run constitutes a separate |
65 | // isolating run sequence." |
66 | // We can take a simplified path to handle this case. |
67 | if !has_isolate_controls { |
68 | isolating_run_sequences.reserve_exact(runs.len()); |
69 | for run in runs { |
70 | // Determine the `sos` and `eos` class for the sequence. |
71 | // <http://www.unicode.org/reports/tr9/#X10> |
72 | |
73 | let run_levels = &levels[run.clone()]; |
74 | let run_classes = &original_classes[run.clone()]; |
75 | let seq_level = run_levels[run_classes |
76 | .iter() |
77 | .position(|c| not_removed_by_x9(c)) |
78 | .unwrap_or(0)]; |
79 | |
80 | let end_level = run_levels[run_classes |
81 | .iter() |
82 | .rposition(|c| not_removed_by_x9(c)) |
83 | .unwrap_or(run.end - run.start - 1)]; |
84 | |
85 | // Get the level of the last non-removed char before the run. |
86 | let pred_level = match original_classes[..run.start] |
87 | .iter() |
88 | .rposition(not_removed_by_x9) |
89 | { |
90 | Some(idx) => levels[idx], |
91 | None => para_level, |
92 | }; |
93 | |
94 | // Get the level of the next non-removed char after the run. |
95 | let succ_level = match original_classes[run.end..] |
96 | .iter() |
97 | .position(not_removed_by_x9) |
98 | { |
99 | Some(idx) => levels[run.end + idx], |
100 | None => para_level, |
101 | }; |
102 | |
103 | isolating_run_sequences.push(IsolatingRunSequence { |
104 | runs: vec![run], |
105 | sos: max(seq_level, pred_level).bidi_class(), |
106 | eos: max(end_level, succ_level).bidi_class(), |
107 | }); |
108 | } |
109 | return; |
110 | } |
111 | |
112 | // Compute the set of isolating run sequences. |
113 | // <http://www.unicode.org/reports/tr9/#BD13> |
114 | let mut sequences = Vec::with_capacity(runs.len()); |
115 | |
116 | // When we encounter an isolate initiator, we push the current sequence onto the |
117 | // stack so we can resume it after the matching PDI. |
118 | #[cfg (feature = "smallvec" )] |
119 | let mut stack: SmallVec<[Vec<Range<usize>>; 8]> = smallvec![vec![]]; |
120 | #[cfg (not(feature = "smallvec" ))] |
121 | let mut stack = vec![vec![]]; |
122 | |
123 | for run in runs { |
124 | assert!(!run.is_empty()); |
125 | assert!(!stack.is_empty()); |
126 | |
127 | let start_class = original_classes[run.start]; |
128 | // > In rule X10, [..] skip over any BNs when [..]. |
129 | // > Do the same when determining if the last character of the sequence is an isolate initiator. |
130 | // |
131 | // <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters> |
132 | let end_class = original_classes[run.start..run.end] |
133 | .iter() |
134 | .copied() |
135 | .rev() |
136 | .find(not_removed_by_x9) |
137 | .unwrap_or(start_class); |
138 | |
139 | let mut sequence = if start_class == PDI && stack.len() > 1 { |
140 | // Continue a previous sequence interrupted by an isolate. |
141 | stack.pop().unwrap() |
142 | } else { |
143 | // Start a new sequence. |
144 | Vec::new() |
145 | }; |
146 | |
147 | sequence.push(run); |
148 | |
149 | if matches!(end_class, RLI | LRI | FSI) { |
150 | // Resume this sequence after the isolate. |
151 | stack.push(sequence); |
152 | } else { |
153 | // This sequence is finished. |
154 | sequences.push(sequence); |
155 | } |
156 | } |
157 | // Pop any remaining sequences off the stack. |
158 | sequences.extend(stack.into_iter().rev().filter(|seq| !seq.is_empty())); |
159 | |
160 | // Determine the `sos` and `eos` class for each sequence. |
161 | // <http://www.unicode.org/reports/tr9/#X10> |
162 | for sequence in sequences { |
163 | assert!(!sequence.is_empty()); |
164 | |
165 | let start_of_seq = sequence[0].start; |
166 | let runs_len = sequence.len(); |
167 | let end_of_seq = sequence[runs_len - 1].end; |
168 | |
169 | let mut result = IsolatingRunSequence { |
170 | runs: sequence, |
171 | sos: L, |
172 | eos: L, |
173 | }; |
174 | |
175 | // > (not counting characters removed by X9) |
176 | let seq_level = levels[result |
177 | .iter_forwards_from(start_of_seq, 0) |
178 | .find(|i| not_removed_by_x9(&original_classes[*i])) |
179 | .unwrap_or(start_of_seq)]; |
180 | |
181 | // XXXManishearth the spec talks of a start and end level, |
182 | // but for a given IRS the two should be equivalent, yes? |
183 | let end_level = levels[result |
184 | .iter_backwards_from(end_of_seq, runs_len - 1) |
185 | .find(|i| not_removed_by_x9(&original_classes[*i])) |
186 | .unwrap_or(end_of_seq - 1)]; |
187 | |
188 | #[cfg (test)] |
189 | for idx in result.runs.clone().into_iter().flatten() { |
190 | if not_removed_by_x9(&original_classes[idx]) { |
191 | assert_eq!(seq_level, levels[idx]); |
192 | } |
193 | } |
194 | |
195 | // Get the level of the last non-removed char before the runs. |
196 | let pred_level = match original_classes[..start_of_seq] |
197 | .iter() |
198 | .rposition(not_removed_by_x9) |
199 | { |
200 | Some(idx) => levels[idx], |
201 | None => para_level, |
202 | }; |
203 | |
204 | // Get the last non-removed character to check if it is an isolate initiator. |
205 | // The spec calls for an unmatched one, but matched isolate initiators |
206 | // will never be at the end of a level run (otherwise there would be more to the run). |
207 | // We unwrap_or(BN) because BN marks removed classes and it won't matter for the check. |
208 | let last_non_removed = original_classes[..end_of_seq] |
209 | .iter() |
210 | .copied() |
211 | .rev() |
212 | .find(not_removed_by_x9) |
213 | .unwrap_or(BN); |
214 | |
215 | // Get the level of the next non-removed char after the runs. |
216 | let succ_level = if matches!(last_non_removed, RLI | LRI | FSI) { |
217 | para_level |
218 | } else { |
219 | match original_classes[end_of_seq..] |
220 | .iter() |
221 | .position(not_removed_by_x9) |
222 | { |
223 | Some(idx) => levels[end_of_seq + idx], |
224 | None => para_level, |
225 | } |
226 | }; |
227 | |
228 | result.sos = max(seq_level, pred_level).bidi_class(); |
229 | result.eos = max(end_level, succ_level).bidi_class(); |
230 | |
231 | isolating_run_sequences.push(result); |
232 | } |
233 | } |
234 | |
235 | impl IsolatingRunSequence { |
236 | /// Given a text-relative position `pos` and an index of the level run it is in, |
237 | /// produce an iterator of all characters after and pos (`pos..`) that are in this |
238 | /// run sequence |
239 | pub(crate) fn iter_forwards_from( |
240 | &self, |
241 | pos: usize, |
242 | level_run_index: usize, |
243 | ) -> impl Iterator<Item = usize> + '_ { |
244 | let runs = &self.runs[level_run_index..]; |
245 | |
246 | // Check that it is in range |
247 | // (we can't use contains() since we want an inclusive range) |
248 | #[cfg (feature = "std" )] |
249 | debug_assert!(runs[0].start <= pos && pos <= runs[0].end); |
250 | |
251 | (pos..runs[0].end).chain(runs[1..].iter().flat_map(Clone::clone)) |
252 | } |
253 | |
254 | /// Given a text-relative position `pos` and an index of the level run it is in, |
255 | /// produce an iterator of all characters before and excludingpos (`..pos`) that are in this |
256 | /// run sequence |
257 | pub(crate) fn iter_backwards_from( |
258 | &self, |
259 | pos: usize, |
260 | level_run_index: usize, |
261 | ) -> impl Iterator<Item = usize> + '_ { |
262 | let prev_runs = &self.runs[..level_run_index]; |
263 | let current = &self.runs[level_run_index]; |
264 | |
265 | // Check that it is in range |
266 | // (we can't use contains() since we want an inclusive range) |
267 | #[cfg (feature = "std" )] |
268 | debug_assert!(current.start <= pos && pos <= current.end); |
269 | |
270 | (current.start..pos) |
271 | .rev() |
272 | .chain(prev_runs.iter().rev().flat_map(Clone::clone)) |
273 | } |
274 | } |
275 | |
276 | /// Finds the level runs in a paragraph. |
277 | /// |
278 | /// <http://www.unicode.org/reports/tr9/#BD7> |
279 | /// |
280 | /// This is only used by tests; normally level runs are identified during explicit::compute. |
281 | #[cfg (test)] |
282 | fn level_runs(levels: &[Level], original_classes: &[BidiClass]) -> Vec<LevelRun> { |
283 | assert_eq!(levels.len(), original_classes.len()); |
284 | |
285 | let mut runs = Vec::new(); |
286 | if levels.is_empty() { |
287 | return runs; |
288 | } |
289 | |
290 | let mut current_run_level = levels[0]; |
291 | let mut current_run_start = 0; |
292 | for i in 1..levels.len() { |
293 | if !removed_by_x9(original_classes[i]) && levels[i] != current_run_level { |
294 | // End the last run and start a new one. |
295 | runs.push(current_run_start..i); |
296 | current_run_level = levels[i]; |
297 | current_run_start = i; |
298 | } |
299 | } |
300 | runs.push(current_run_start..levels.len()); |
301 | |
302 | runs |
303 | } |
304 | |
305 | /// Should this character be ignored in steps after X9? |
306 | /// |
307 | /// <http://www.unicode.org/reports/tr9/#X9> |
308 | pub fn removed_by_x9(class: BidiClass) -> bool { |
309 | matches!(class, RLE | LRE | RLO | LRO | PDF | BN) |
310 | } |
311 | |
312 | // For use as a predicate for `position` / `rposition` |
313 | pub fn not_removed_by_x9(class: &BidiClass) -> bool { |
314 | !removed_by_x9(*class) |
315 | } |
316 | |
317 | #[cfg (test)] |
318 | mod tests { |
319 | use super::*; |
320 | |
321 | #[test ] |
322 | fn test_level_runs() { |
323 | assert_eq!(level_runs(&Level::vec(&[]), &[]), &[]); |
324 | assert_eq!( |
325 | level_runs(&Level::vec(&[0, 0, 0, 1, 1, 2, 0, 0]), &[L; 8]), |
326 | &[0..3, 3..5, 5..6, 6..8] |
327 | ); |
328 | } |
329 | |
330 | // From <http://www.unicode.org/reports/tr9/#BD13> |
331 | #[rustfmt::skip] |
332 | #[test ] |
333 | fn test_isolating_run_sequences() { |
334 | |
335 | // == Example 1 == |
336 | // text1·RLE·text2·PDF·RLE·text3·PDF·text4 |
337 | // index 0 1 2 3 4 5 6 7 |
338 | let classes = &[L, RLE, L, PDF, RLE, L, PDF, L]; |
339 | let levels = &[0, 1, 1, 1, 1, 1, 1, 0]; |
340 | let para_level = Level::ltr(); |
341 | let mut sequences = IsolatingRunSequenceVec::new(); |
342 | isolating_run_sequences( |
343 | para_level, |
344 | classes, |
345 | &Level::vec(levels), |
346 | level_runs(&Level::vec(levels), classes).into(), |
347 | false, |
348 | &mut sequences); |
349 | sequences.sort_by(|a, b| a.runs[0].clone().cmp(b.runs[0].clone())); |
350 | assert_eq!( |
351 | sequences.iter().map(|s| s.runs.clone()).collect::<Vec<_>>(), |
352 | vec![vec![0..2], vec![2..7], vec![7..8]] |
353 | ); |
354 | |
355 | // == Example 2 == |
356 | // text1·RLI·text2·PDI·RLI·text3·PDI·text4 |
357 | // index 0 1 2 3 4 5 6 7 |
358 | let classes = &[L, RLI, L, PDI, RLI, L, PDI, L]; |
359 | let levels = &[0, 0, 1, 0, 0, 1, 0, 0]; |
360 | let para_level = Level::ltr(); |
361 | let mut sequences = IsolatingRunSequenceVec::new(); |
362 | isolating_run_sequences( |
363 | para_level, |
364 | classes, |
365 | &Level::vec(levels), |
366 | level_runs(&Level::vec(levels), classes).into(), |
367 | true, |
368 | &mut sequences); |
369 | sequences.sort_by(|a, b| a.runs[0].clone().cmp(b.runs[0].clone())); |
370 | assert_eq!( |
371 | sequences.iter().map(|s| s.runs.clone()).collect::<Vec<_>>(), |
372 | vec![vec![0..2, 3..5, 6..8], vec![2..3], vec![5..6]] |
373 | ); |
374 | |
375 | // == Example 3 == |
376 | // text1·RLI·text2·LRI·text3·RLE·text4·PDF·text5·PDI·text6·PDI·text7 |
377 | // index 0 1 2 3 4 5 6 7 8 9 10 11 12 |
378 | let classes = &[L, RLI, L, LRI, L, RLE, L, PDF, L, PDI, L, PDI, L]; |
379 | let levels = &[0, 0, 1, 1, 2, 3, 3, 3, 2, 1, 1, 0, 0]; |
380 | let para_level = Level::ltr(); |
381 | let mut sequences = IsolatingRunSequenceVec::new(); |
382 | isolating_run_sequences( |
383 | para_level, |
384 | classes, |
385 | &Level::vec(levels), |
386 | level_runs(&Level::vec(levels), classes).into(), |
387 | true, |
388 | &mut sequences); |
389 | sequences.sort_by(|a, b| a.runs[0].clone().cmp(b.runs[0].clone())); |
390 | assert_eq!( |
391 | sequences.iter().map(|s| s.runs.clone()).collect::<Vec<_>>(), |
392 | vec![vec![0..2, 11..13], vec![2..4, 9..11], vec![4..6], vec![6..8], vec![8..9]] |
393 | ); |
394 | } |
395 | |
396 | // From <http://www.unicode.org/reports/tr9/#X10> |
397 | #[rustfmt::skip] |
398 | #[test ] |
399 | fn test_isolating_run_sequences_sos_and_eos() { |
400 | |
401 | // == Example 1 == |
402 | // text1·RLE·text2·LRE·text3·PDF·text4·PDF·RLE·text5·PDF·text6 |
403 | // index 0 1 2 3 4 5 6 7 8 9 10 11 |
404 | let classes = &[L, RLE, L, LRE, L, PDF, L, PDF, RLE, L, PDF, L]; |
405 | let levels = &[0, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 0]; |
406 | let para_level = Level::ltr(); |
407 | let mut sequences = IsolatingRunSequenceVec::new(); |
408 | isolating_run_sequences( |
409 | para_level, |
410 | classes, |
411 | &Level::vec(levels), |
412 | level_runs(&Level::vec(levels), classes).into(), |
413 | false, |
414 | &mut sequences); |
415 | sequences.sort_by(|a, b| a.runs[0].clone().cmp(b.runs[0].clone())); |
416 | |
417 | // text1 |
418 | assert_eq!( |
419 | &sequences[0], |
420 | &IsolatingRunSequence { |
421 | runs: vec![0..2], |
422 | sos: L, |
423 | eos: R, |
424 | } |
425 | ); |
426 | |
427 | // text2 |
428 | assert_eq!( |
429 | &sequences[1], |
430 | &IsolatingRunSequence { |
431 | runs: vec![2..4], |
432 | sos: R, |
433 | eos: L, |
434 | } |
435 | ); |
436 | |
437 | // text3 |
438 | assert_eq!( |
439 | &sequences[2], |
440 | &IsolatingRunSequence { |
441 | runs: vec![4..6], |
442 | sos: L, |
443 | eos: L, |
444 | } |
445 | ); |
446 | |
447 | // text4 text5 |
448 | assert_eq!( |
449 | &sequences[3], |
450 | &IsolatingRunSequence { |
451 | runs: vec![6..11], |
452 | sos: L, |
453 | eos: R, |
454 | } |
455 | ); |
456 | |
457 | // text6 |
458 | assert_eq!( |
459 | &sequences[4], |
460 | &IsolatingRunSequence { |
461 | runs: vec![11..12], |
462 | sos: R, |
463 | eos: L, |
464 | } |
465 | ); |
466 | |
467 | // == Example 2 == |
468 | // text1·RLI·text2·LRI·text3·PDI·text4·PDI·RLI·text5·PDI·text6 |
469 | // index 0 1 2 3 4 5 6 7 8 9 10 11 |
470 | let classes = &[L, RLI, L, LRI, L, PDI, L, PDI, RLI, L, PDI, L]; |
471 | let levels = &[0, 0, 1, 1, 2, 1, 1, 0, 0, 1, 0, 0]; |
472 | let para_level = Level::ltr(); |
473 | let mut sequences = IsolatingRunSequenceVec::new(); |
474 | isolating_run_sequences( |
475 | para_level, |
476 | classes, |
477 | &Level::vec(levels), |
478 | level_runs(&Level::vec(levels), classes).into(), |
479 | true, |
480 | &mut sequences); |
481 | sequences.sort_by(|a, b| a.runs[0].clone().cmp(b.runs[0].clone())); |
482 | |
483 | // text1·RLI·PDI·RLI·PDI·text6 |
484 | assert_eq!( |
485 | &sequences[0], |
486 | &IsolatingRunSequence { |
487 | runs: vec![0..2, 7..9, 10..12], |
488 | sos: L, |
489 | eos: L, |
490 | } |
491 | ); |
492 | |
493 | // text2·LRI·PDI·text4 |
494 | assert_eq!( |
495 | &sequences[1], |
496 | &IsolatingRunSequence { |
497 | runs: vec![2..4, 5..7], |
498 | sos: R, |
499 | eos: R, |
500 | } |
501 | ); |
502 | |
503 | // text3 |
504 | assert_eq!( |
505 | &sequences[2], |
506 | &IsolatingRunSequence { |
507 | runs: vec![4..5], |
508 | sos: L, |
509 | eos: L, |
510 | } |
511 | ); |
512 | |
513 | // text5 |
514 | assert_eq!( |
515 | &sequences[3], |
516 | &IsolatingRunSequence { |
517 | runs: vec![9..10], |
518 | sos: R, |
519 | eos: R, |
520 | } |
521 | ); |
522 | } |
523 | |
524 | #[test ] |
525 | fn test_removed_by_x9() { |
526 | let rem_classes = &[RLE, LRE, RLO, LRO, PDF, BN]; |
527 | let not_classes = &[L, RLI, AL, LRI, PDI]; |
528 | for x in rem_classes { |
529 | assert_eq!(removed_by_x9(*x), true); |
530 | } |
531 | for x in not_classes { |
532 | assert_eq!(removed_by_x9(*x), false); |
533 | } |
534 | } |
535 | |
536 | #[test ] |
537 | fn test_not_removed_by_x9() { |
538 | let non_x9_classes = &[L, R, AL, EN, ES, ET, AN, CS, NSM, B, S, WS, ON, LRI, RLI, FSI, PDI]; |
539 | for x in non_x9_classes { |
540 | assert_eq!(not_removed_by_x9(&x), true); |
541 | } |
542 | } |
543 | } |
544 | |