1 | // Copyright 2015 The Servo Project Developers. See the |
2 | // COPYRIGHT file at the top-level directory of this distribution. |
3 | // |
4 | // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or |
5 | // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license |
6 | // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your |
7 | // option. This file may not be copied, modified, or distributed |
8 | // except according to those terms. |
9 | |
10 | //! 3.3.3 Preparations for Implicit Processing |
11 | //! |
12 | //! <http://www.unicode.org/reports/tr9/#Preparations_for_Implicit_Processing> |
13 | |
14 | use alloc::vec::Vec; |
15 | use core::cmp::max; |
16 | use core::ops::Range; |
17 | |
18 | use super::level::Level; |
19 | use super::BidiClass::{self, *}; |
20 | |
21 | /// A maximal substring of characters with the same embedding level. |
22 | /// |
23 | /// Represented as a range of byte indices. |
24 | pub type LevelRun = Range<usize>; |
25 | |
26 | /// Output of `isolating_run_sequences` (steps X9-X10) |
27 | #[derive (Debug, PartialEq)] |
28 | pub struct IsolatingRunSequence { |
29 | pub runs: Vec<LevelRun>, |
30 | pub sos: BidiClass, // Start-of-sequence type. |
31 | pub eos: BidiClass, // End-of-sequence type. |
32 | } |
33 | |
34 | /// Compute the set of isolating run sequences. |
35 | /// |
36 | /// An isolating run sequence is a maximal sequence of level runs such that for all level runs |
37 | /// except the last one in the sequence, the last character of the run is an isolate initiator |
38 | /// whose matching PDI is the first character of the next level run in the sequence. |
39 | /// |
40 | /// Note: This function does *not* return the sequences in order by their first characters. |
41 | #[cfg_attr (feature = "flame_it" , flamer::flame)] |
42 | pub fn isolating_run_sequences( |
43 | para_level: Level, |
44 | original_classes: &[BidiClass], |
45 | levels: &[Level], |
46 | ) -> Vec<IsolatingRunSequence> { |
47 | let runs = level_runs(levels, original_classes); |
48 | |
49 | // Compute the set of isolating run sequences. |
50 | // <http://www.unicode.org/reports/tr9/#BD13> |
51 | let mut sequences = Vec::with_capacity(runs.len()); |
52 | |
53 | // When we encounter an isolate initiator, we push the current sequence onto the |
54 | // stack so we can resume it after the matching PDI. |
55 | let mut stack = vec![Vec::new()]; |
56 | |
57 | for run in runs { |
58 | assert!(run.len() > 0); |
59 | assert!(!stack.is_empty()); |
60 | |
61 | let start_class = original_classes[run.start]; |
62 | // > In rule X10, [..] skip over any BNs when [..]. |
63 | // > Do the same when determining if the last character of the sequence is an isolate initiator. |
64 | // |
65 | // <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters> |
66 | let end_class = original_classes[run.start..run.end] |
67 | .iter() |
68 | .copied() |
69 | .rev() |
70 | .filter(not_removed_by_x9) |
71 | .next() |
72 | .unwrap_or(start_class); |
73 | |
74 | let mut sequence = if start_class == PDI && stack.len() > 1 { |
75 | // Continue a previous sequence interrupted by an isolate. |
76 | stack.pop().unwrap() |
77 | } else { |
78 | // Start a new sequence. |
79 | Vec::new() |
80 | }; |
81 | |
82 | sequence.push(run); |
83 | |
84 | if let RLI | LRI | FSI = end_class { |
85 | // Resume this sequence after the isolate. |
86 | stack.push(sequence); |
87 | } else { |
88 | // This sequence is finished. |
89 | sequences.push(sequence); |
90 | } |
91 | } |
92 | // Pop any remaning sequences off the stack. |
93 | sequences.extend(stack.into_iter().rev().filter(|seq| !seq.is_empty())); |
94 | |
95 | // Determine the `sos` and `eos` class for each sequence. |
96 | // <http://www.unicode.org/reports/tr9/#X10> |
97 | sequences |
98 | .into_iter() |
99 | .map(|sequence: Vec<LevelRun>| { |
100 | assert!(!sequence.is_empty()); |
101 | |
102 | let mut result = IsolatingRunSequence { |
103 | runs: sequence, |
104 | sos: L, |
105 | eos: L, |
106 | }; |
107 | |
108 | let start_of_seq = result.runs[0].start; |
109 | let runs_len = result.runs.len(); |
110 | let end_of_seq = result.runs[runs_len - 1].end; |
111 | |
112 | // > (not counting characters removed by X9) |
113 | let seq_level = result |
114 | .iter_forwards_from(start_of_seq, 0) |
115 | .filter(|i| not_removed_by_x9(&original_classes[*i])) |
116 | .map(|i| levels[i]) |
117 | .next() |
118 | .unwrap_or(levels[start_of_seq]); |
119 | |
120 | // XXXManishearth the spec talks of a start and end level, |
121 | // but for a given IRS the two should be equivalent, yes? |
122 | let end_level = result |
123 | .iter_backwards_from(end_of_seq, runs_len - 1) |
124 | .filter(|i| not_removed_by_x9(&original_classes[*i])) |
125 | .map(|i| levels[i]) |
126 | .next() |
127 | .unwrap_or(levels[end_of_seq - 1]); |
128 | |
129 | #[cfg (test)] |
130 | for run in result.runs.clone() { |
131 | for idx in run { |
132 | if not_removed_by_x9(&original_classes[idx]) { |
133 | assert_eq!(seq_level, levels[idx]); |
134 | } |
135 | } |
136 | } |
137 | |
138 | // Get the level of the last non-removed char before the runs. |
139 | let pred_level = match original_classes[..start_of_seq] |
140 | .iter() |
141 | .rposition(not_removed_by_x9) |
142 | { |
143 | Some(idx) => levels[idx], |
144 | None => para_level, |
145 | }; |
146 | |
147 | // Get the last non-removed character to check if it is an isolate initiator. |
148 | // The spec calls for an unmatched one, but matched isolate initiators |
149 | // will never be at the end of a level run (otherwise there would be more to the run). |
150 | // We unwrap_or(BN) because BN marks removed classes and it won't matter for the check. |
151 | let last_non_removed = original_classes[..end_of_seq] |
152 | .iter() |
153 | .copied() |
154 | .rev() |
155 | .find(not_removed_by_x9) |
156 | .unwrap_or(BN); |
157 | |
158 | // Get the level of the next non-removed char after the runs. |
159 | let succ_level = if let RLI | LRI | FSI = last_non_removed { |
160 | para_level |
161 | } else { |
162 | match original_classes[end_of_seq..] |
163 | .iter() |
164 | .position(not_removed_by_x9) |
165 | { |
166 | Some(idx) => levels[end_of_seq + idx], |
167 | None => para_level, |
168 | } |
169 | }; |
170 | |
171 | result.sos = max(seq_level, pred_level).bidi_class(); |
172 | result.eos = max(end_level, succ_level).bidi_class(); |
173 | result |
174 | }) |
175 | .collect() |
176 | } |
177 | |
178 | impl IsolatingRunSequence { |
179 | /// Given a text-relative position `pos` and an index of the level run it is in, |
180 | /// produce an iterator of all characters after and pos (`pos..`) that are in this |
181 | /// run sequence |
182 | pub(crate) fn iter_forwards_from( |
183 | &self, |
184 | pos: usize, |
185 | level_run_index: usize, |
186 | ) -> impl Iterator<Item = usize> + '_ { |
187 | let runs = &self.runs[level_run_index..]; |
188 | |
189 | // Check that it is in range |
190 | // (we can't use contains() since we want an inclusive range) |
191 | #[cfg (feature = "std" )] |
192 | debug_assert!(runs[0].start <= pos && pos <= runs[0].end); |
193 | |
194 | (pos..runs[0].end).chain(runs[1..].iter().flat_map(Clone::clone)) |
195 | } |
196 | |
197 | /// Given a text-relative position `pos` and an index of the level run it is in, |
198 | /// produce an iterator of all characters before and excludingpos (`..pos`) that are in this |
199 | /// run sequence |
200 | pub(crate) fn iter_backwards_from( |
201 | &self, |
202 | pos: usize, |
203 | level_run_index: usize, |
204 | ) -> impl Iterator<Item = usize> + '_ { |
205 | let prev_runs = &self.runs[..level_run_index]; |
206 | let current = &self.runs[level_run_index]; |
207 | |
208 | // Check that it is in range |
209 | // (we can't use contains() since we want an inclusive range) |
210 | #[cfg (feature = "std" )] |
211 | debug_assert!(current.start <= pos && pos <= current.end); |
212 | |
213 | (current.start..pos) |
214 | .rev() |
215 | .chain(prev_runs.iter().rev().flat_map(Clone::clone)) |
216 | } |
217 | } |
218 | |
219 | /// Finds the level runs in a paragraph. |
220 | /// |
221 | /// <http://www.unicode.org/reports/tr9/#BD7> |
222 | fn level_runs(levels: &[Level], original_classes: &[BidiClass]) -> Vec<LevelRun> { |
223 | assert_eq!(levels.len(), original_classes.len()); |
224 | |
225 | let mut runs: Vec> = Vec::new(); |
226 | if levels.is_empty() { |
227 | return runs; |
228 | } |
229 | |
230 | let mut current_run_level: Level = levels[0]; |
231 | let mut current_run_start: usize = 0; |
232 | for i: usize in 1..levels.len() { |
233 | if !removed_by_x9(class:original_classes[i]) && levels[i] != current_run_level { |
234 | // End the last run and start a new one. |
235 | runs.push(current_run_start..i); |
236 | current_run_level = levels[i]; |
237 | current_run_start = i; |
238 | } |
239 | } |
240 | runs.push(current_run_start..levels.len()); |
241 | |
242 | runs |
243 | } |
244 | |
245 | /// Should this character be ignored in steps after X9? |
246 | /// |
247 | /// <http://www.unicode.org/reports/tr9/#X9> |
248 | pub fn removed_by_x9(class: BidiClass) -> bool { |
249 | match class { |
250 | RLE | LRE | RLO | LRO | PDF | BN => true, |
251 | _ => false, |
252 | } |
253 | } |
254 | |
255 | // For use as a predicate for `position` / `rposition` |
256 | pub fn not_removed_by_x9(class: &BidiClass) -> bool { |
257 | !removed_by_x9(*class) |
258 | } |
259 | |
260 | #[cfg (test)] |
261 | mod tests { |
262 | use super::*; |
263 | |
264 | #[test ] |
265 | fn test_level_runs() { |
266 | assert_eq!(level_runs(&Level::vec(&[]), &[]), &[]); |
267 | assert_eq!( |
268 | level_runs(&Level::vec(&[0, 0, 0, 1, 1, 2, 0, 0]), &[L; 8]), |
269 | &[0..3, 3..5, 5..6, 6..8] |
270 | ); |
271 | } |
272 | |
273 | // From <http://www.unicode.org/reports/tr9/#BD13> |
274 | #[rustfmt::skip] |
275 | #[test ] |
276 | fn test_isolating_run_sequences() { |
277 | |
278 | // == Example 1 == |
279 | // text1·RLE·text2·PDF·RLE·text3·PDF·text4 |
280 | // index 0 1 2 3 4 5 6 7 |
281 | let classes = &[L, RLE, L, PDF, RLE, L, PDF, L]; |
282 | let levels = &[0, 1, 1, 1, 1, 1, 1, 0]; |
283 | let para_level = Level::ltr(); |
284 | let mut sequences = isolating_run_sequences(para_level, classes, &Level::vec(levels)); |
285 | sequences.sort_by(|a, b| a.runs[0].clone().cmp(b.runs[0].clone())); |
286 | assert_eq!( |
287 | sequences.iter().map(|s| s.runs.clone()).collect::<Vec<_>>(), |
288 | vec![vec![0..2], vec![2..7], vec![7..8]] |
289 | ); |
290 | |
291 | // == Example 2 == |
292 | // text1·RLI·text2·PDI·RLI·text3·PDI·text4 |
293 | // index 0 1 2 3 4 5 6 7 |
294 | let classes = &[L, RLI, L, PDI, RLI, L, PDI, L]; |
295 | let levels = &[0, 0, 1, 0, 0, 1, 0, 0]; |
296 | let para_level = Level::ltr(); |
297 | let mut sequences = isolating_run_sequences(para_level, classes, &Level::vec(levels)); |
298 | sequences.sort_by(|a, b| a.runs[0].clone().cmp(b.runs[0].clone())); |
299 | assert_eq!( |
300 | sequences.iter().map(|s| s.runs.clone()).collect::<Vec<_>>(), |
301 | vec![vec![0..2, 3..5, 6..8], vec![2..3], vec![5..6]] |
302 | ); |
303 | |
304 | // == Example 3 == |
305 | // text1·RLI·text2·LRI·text3·RLE·text4·PDF·text5·PDI·text6·PDI·text7 |
306 | // index 0 1 2 3 4 5 6 7 8 9 10 11 12 |
307 | let classes = &[L, RLI, L, LRI, L, RLE, L, PDF, L, PDI, L, PDI, L]; |
308 | let levels = &[0, 0, 1, 1, 2, 3, 3, 3, 2, 1, 1, 0, 0]; |
309 | let para_level = Level::ltr(); |
310 | let mut sequences = isolating_run_sequences(para_level, classes, &Level::vec(levels)); |
311 | sequences.sort_by(|a, b| a.runs[0].clone().cmp(b.runs[0].clone())); |
312 | assert_eq!( |
313 | sequences.iter().map(|s| s.runs.clone()).collect::<Vec<_>>(), |
314 | vec![vec![0..2, 11..13], vec![2..4, 9..11], vec![4..6], vec![6..8], vec![8..9]] |
315 | ); |
316 | } |
317 | |
318 | // From <http://www.unicode.org/reports/tr9/#X10> |
319 | #[rustfmt::skip] |
320 | #[test ] |
321 | fn test_isolating_run_sequences_sos_and_eos() { |
322 | |
323 | // == Example 1 == |
324 | // text1·RLE·text2·LRE·text3·PDF·text4·PDF·RLE·text5·PDF·text6 |
325 | // index 0 1 2 3 4 5 6 7 8 9 10 11 |
326 | let classes = &[L, RLE, L, LRE, L, PDF, L, PDF, RLE, L, PDF, L]; |
327 | let levels = &[0, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 0]; |
328 | let para_level = Level::ltr(); |
329 | let mut sequences = isolating_run_sequences(para_level, classes, &Level::vec(levels)); |
330 | sequences.sort_by(|a, b| a.runs[0].clone().cmp(b.runs[0].clone())); |
331 | |
332 | // text1 |
333 | assert_eq!( |
334 | &sequences[0], |
335 | &IsolatingRunSequence { |
336 | runs: vec![0..2], |
337 | sos: L, |
338 | eos: R, |
339 | } |
340 | ); |
341 | |
342 | // text2 |
343 | assert_eq!( |
344 | &sequences[1], |
345 | &IsolatingRunSequence { |
346 | runs: vec![2..4], |
347 | sos: R, |
348 | eos: L, |
349 | } |
350 | ); |
351 | |
352 | // text3 |
353 | assert_eq!( |
354 | &sequences[2], |
355 | &IsolatingRunSequence { |
356 | runs: vec![4..6], |
357 | sos: L, |
358 | eos: L, |
359 | } |
360 | ); |
361 | |
362 | // text4 text5 |
363 | assert_eq!( |
364 | &sequences[3], |
365 | &IsolatingRunSequence { |
366 | runs: vec![6..11], |
367 | sos: L, |
368 | eos: R, |
369 | } |
370 | ); |
371 | |
372 | // text6 |
373 | assert_eq!( |
374 | &sequences[4], |
375 | &IsolatingRunSequence { |
376 | runs: vec![11..12], |
377 | sos: R, |
378 | eos: L, |
379 | } |
380 | ); |
381 | |
382 | // == Example 2 == |
383 | // text1·RLI·text2·LRI·text3·PDI·text4·PDI·RLI·text5·PDI·text6 |
384 | // index 0 1 2 3 4 5 6 7 8 9 10 11 |
385 | let classes = &[L, RLI, L, LRI, L, PDI, L, PDI, RLI, L, PDI, L]; |
386 | let levels = &[0, 0, 1, 1, 2, 1, 1, 0, 0, 1, 0, 0]; |
387 | let para_level = Level::ltr(); |
388 | let mut sequences = isolating_run_sequences(para_level, classes, &Level::vec(levels)); |
389 | sequences.sort_by(|a, b| a.runs[0].clone().cmp(b.runs[0].clone())); |
390 | |
391 | // text1·RLI·PDI·RLI·PDI·text6 |
392 | assert_eq!( |
393 | &sequences[0], |
394 | &IsolatingRunSequence { |
395 | runs: vec![0..2, 7..9, 10..12], |
396 | sos: L, |
397 | eos: L, |
398 | } |
399 | ); |
400 | |
401 | // text2·LRI·PDI·text4 |
402 | assert_eq!( |
403 | &sequences[1], |
404 | &IsolatingRunSequence { |
405 | runs: vec![2..4, 5..7], |
406 | sos: R, |
407 | eos: R, |
408 | } |
409 | ); |
410 | |
411 | // text3 |
412 | assert_eq!( |
413 | &sequences[2], |
414 | &IsolatingRunSequence { |
415 | runs: vec![4..5], |
416 | sos: L, |
417 | eos: L, |
418 | } |
419 | ); |
420 | |
421 | // text5 |
422 | assert_eq!( |
423 | &sequences[3], |
424 | &IsolatingRunSequence { |
425 | runs: vec![9..10], |
426 | sos: R, |
427 | eos: R, |
428 | } |
429 | ); |
430 | } |
431 | |
432 | #[test ] |
433 | fn test_removed_by_x9() { |
434 | let rem_classes = &[RLE, LRE, RLO, LRO, PDF, BN]; |
435 | let not_classes = &[L, RLI, AL, LRI, PDI]; |
436 | for x in rem_classes { |
437 | assert_eq!(removed_by_x9(*x), true); |
438 | } |
439 | for x in not_classes { |
440 | assert_eq!(removed_by_x9(*x), false); |
441 | } |
442 | } |
443 | |
444 | #[test ] |
445 | fn test_not_removed_by_x9() { |
446 | let non_x9_classes = &[L, R, AL, EN, ES, ET, AN, CS, NSM, B, S, WS, ON, LRI, RLI, FSI, PDI]; |
447 | for x in non_x9_classes { |
448 | assert_eq!(not_removed_by_x9(&x), true); |
449 | } |
450 | } |
451 | } |
452 | |