1 | // Copyright 2015 The Servo Project Developers. See the |
2 | // COPYRIGHT file at the top-level directory of this distribution. |
3 | // |
4 | // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or |
5 | // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license |
6 | // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your |
7 | // option. This file may not be copied, modified, or distributed |
8 | // except according to those terms. |
9 | |
10 | //! 3.3.3 Preparations for Implicit Processing |
11 | //! |
12 | //! <http://www.unicode.org/reports/tr9/#Preparations_for_Implicit_Processing> |
13 | |
14 | use alloc::vec::Vec; |
15 | use core::cmp::max; |
16 | use core::ops::Range; |
17 | |
18 | use super::level::Level; |
19 | use super::BidiClass::{self, *}; |
20 | |
21 | /// A maximal substring of characters with the same embedding level. |
22 | /// |
23 | /// Represented as a range of byte indices. |
24 | pub type LevelRun = Range<usize>; |
25 | |
26 | /// Output of `isolating_run_sequences` (steps X9-X10) |
27 | #[derive (Debug, PartialEq)] |
28 | pub struct IsolatingRunSequence { |
29 | pub runs: Vec<LevelRun>, |
30 | pub sos: BidiClass, // Start-of-sequence type. |
31 | pub eos: BidiClass, // End-of-sequence type. |
32 | } |
33 | |
34 | /// Compute the set of isolating run sequences. |
35 | /// |
36 | /// An isolating run sequence is a maximal sequence of level runs such that for all level runs |
37 | /// except the last one in the sequence, the last character of the run is an isolate initiator |
38 | /// whose matching PDI is the first character of the next level run in the sequence. |
39 | /// |
40 | /// Note: This function does *not* return the sequences in order by their first characters. |
41 | #[cfg_attr (feature = "flame_it" , flamer::flame)] |
42 | pub fn isolating_run_sequences( |
43 | para_level: Level, |
44 | original_classes: &[BidiClass], |
45 | levels: &[Level], |
46 | ) -> Vec<IsolatingRunSequence> { |
47 | let runs = level_runs(levels, original_classes); |
48 | |
49 | // Compute the set of isolating run sequences. |
50 | // <http://www.unicode.org/reports/tr9/#BD13> |
51 | let mut sequences = Vec::with_capacity(runs.len()); |
52 | |
53 | // When we encounter an isolate initiator, we push the current sequence onto the |
54 | // stack so we can resume it after the matching PDI. |
55 | let mut stack = vec![Vec::new()]; |
56 | |
57 | for run in runs { |
58 | assert!(run.len() > 0); |
59 | assert!(!stack.is_empty()); |
60 | |
61 | let start_class = original_classes[run.start]; |
62 | let end_class = original_classes[run.end - 1]; |
63 | |
64 | let mut sequence = if start_class == PDI && stack.len() > 1 { |
65 | // Continue a previous sequence interrupted by an isolate. |
66 | stack.pop().unwrap() |
67 | } else { |
68 | // Start a new sequence. |
69 | Vec::new() |
70 | }; |
71 | |
72 | sequence.push(run); |
73 | |
74 | if let RLI | LRI | FSI = end_class { |
75 | // Resume this sequence after the isolate. |
76 | stack.push(sequence); |
77 | } else { |
78 | // This sequence is finished. |
79 | sequences.push(sequence); |
80 | } |
81 | } |
82 | // Pop any remaning sequences off the stack. |
83 | sequences.extend(stack.into_iter().rev().filter(|seq| !seq.is_empty())); |
84 | |
85 | // Determine the `sos` and `eos` class for each sequence. |
86 | // <http://www.unicode.org/reports/tr9/#X10> |
87 | sequences |
88 | .into_iter() |
89 | .map(|sequence: Vec<LevelRun>| { |
90 | assert!(!sequence.is_empty()); |
91 | |
92 | let mut result = IsolatingRunSequence { |
93 | runs: sequence, |
94 | sos: L, |
95 | eos: L, |
96 | }; |
97 | |
98 | let start_of_seq = result.runs[0].start; |
99 | let runs_len = result.runs.len(); |
100 | let end_of_seq = result.runs[runs_len - 1].end; |
101 | |
102 | // > (not counting characters removed by X9) |
103 | let seq_level = result |
104 | .iter_forwards_from(start_of_seq, 0) |
105 | .filter(|i| not_removed_by_x9(&original_classes[*i])) |
106 | .map(|i| levels[i]) |
107 | .next() |
108 | .unwrap_or(levels[start_of_seq]); |
109 | |
110 | // XXXManishearth the spec talks of a start and end level, |
111 | // but for a given IRS the two should be equivalent, yes? |
112 | let end_level = result |
113 | .iter_backwards_from(end_of_seq, runs_len - 1) |
114 | .filter(|i| not_removed_by_x9(&original_classes[*i])) |
115 | .map(|i| levels[i]) |
116 | .next() |
117 | .unwrap_or(levels[end_of_seq - 1]); |
118 | |
119 | #[cfg (test)] |
120 | for run in result.runs.clone() { |
121 | for idx in run { |
122 | if not_removed_by_x9(&original_classes[idx]) { |
123 | assert_eq!(seq_level, levels[idx]); |
124 | } |
125 | } |
126 | } |
127 | |
128 | // Get the level of the last non-removed char before the runs. |
129 | let pred_level = match original_classes[..start_of_seq] |
130 | .iter() |
131 | .rposition(not_removed_by_x9) |
132 | { |
133 | Some(idx) => levels[idx], |
134 | None => para_level, |
135 | }; |
136 | |
137 | // Get the last non-removed character to check if it is an isolate initiator. |
138 | // The spec calls for an unmatched one, but matched isolate initiators |
139 | // will never be at the end of a level run (otherwise there would be more to the run). |
140 | // We unwrap_or(BN) because BN marks removed classes and it won't matter for the check. |
141 | let last_non_removed = original_classes[..end_of_seq] |
142 | .iter() |
143 | .copied() |
144 | .rev() |
145 | .find(not_removed_by_x9) |
146 | .unwrap_or(BN); |
147 | |
148 | // Get the level of the next non-removed char after the runs. |
149 | let succ_level = if let RLI | LRI | FSI = last_non_removed { |
150 | para_level |
151 | } else { |
152 | match original_classes[end_of_seq..] |
153 | .iter() |
154 | .position(not_removed_by_x9) |
155 | { |
156 | Some(idx) => levels[end_of_seq + idx], |
157 | None => para_level, |
158 | } |
159 | }; |
160 | |
161 | result.sos = max(seq_level, pred_level).bidi_class(); |
162 | result.eos = max(end_level, succ_level).bidi_class(); |
163 | result |
164 | }) |
165 | .collect() |
166 | } |
167 | |
168 | impl IsolatingRunSequence { |
169 | /// Returns the full range of text represented by this isolating run sequence |
170 | pub(crate) fn text_range(&self) -> Range<usize> { |
171 | if let (Some(start), Some(end)) = (self.runs.first(), self.runs.last()) { |
172 | start.start..end.end |
173 | } else { |
174 | return 0..0; |
175 | } |
176 | } |
177 | |
178 | /// Given a text-relative position `pos` and an index of the level run it is in, |
179 | /// produce an iterator of all characters after and pos (`pos..`) that are in this |
180 | /// run sequence |
181 | pub(crate) fn iter_forwards_from( |
182 | &self, |
183 | pos: usize, |
184 | level_run_index: usize, |
185 | ) -> impl Iterator<Item = usize> + '_ { |
186 | let runs = &self.runs[level_run_index..]; |
187 | |
188 | // Check that it is in range |
189 | // (we can't use contains() since we want an inclusive range) |
190 | #[cfg (feature = "std" )] |
191 | debug_assert!(runs[0].start <= pos && pos <= runs[0].end); |
192 | |
193 | (pos..runs[0].end).chain(runs[1..].iter().flat_map(Clone::clone)) |
194 | } |
195 | |
196 | /// Given a text-relative position `pos` and an index of the level run it is in, |
197 | /// produce an iterator of all characters before and excludingpos (`..pos`) that are in this |
198 | /// run sequence |
199 | pub(crate) fn iter_backwards_from( |
200 | &self, |
201 | pos: usize, |
202 | level_run_index: usize, |
203 | ) -> impl Iterator<Item = usize> + '_ { |
204 | let prev_runs = &self.runs[..level_run_index]; |
205 | let current = &self.runs[level_run_index]; |
206 | |
207 | // Check that it is in range |
208 | // (we can't use contains() since we want an inclusive range) |
209 | #[cfg (feature = "std" )] |
210 | debug_assert!(current.start <= pos && pos <= current.end); |
211 | |
212 | (current.start..pos) |
213 | .rev() |
214 | .chain(prev_runs.iter().rev().flat_map(Clone::clone)) |
215 | } |
216 | } |
217 | |
218 | /// Finds the level runs in a paragraph. |
219 | /// |
220 | /// <http://www.unicode.org/reports/tr9/#BD7> |
221 | fn level_runs(levels: &[Level], original_classes: &[BidiClass]) -> Vec<LevelRun> { |
222 | assert_eq!(levels.len(), original_classes.len()); |
223 | |
224 | let mut runs: Vec> = Vec::new(); |
225 | if levels.is_empty() { |
226 | return runs; |
227 | } |
228 | |
229 | let mut current_run_level: Level = levels[0]; |
230 | let mut current_run_start: usize = 0; |
231 | for i: usize in 1..levels.len() { |
232 | if !removed_by_x9(class:original_classes[i]) && levels[i] != current_run_level { |
233 | // End the last run and start a new one. |
234 | runs.push(current_run_start..i); |
235 | current_run_level = levels[i]; |
236 | current_run_start = i; |
237 | } |
238 | } |
239 | runs.push(current_run_start..levels.len()); |
240 | |
241 | runs |
242 | } |
243 | |
244 | /// Should this character be ignored in steps after X9? |
245 | /// |
246 | /// <http://www.unicode.org/reports/tr9/#X9> |
247 | pub fn removed_by_x9(class: BidiClass) -> bool { |
248 | match class { |
249 | RLE | LRE | RLO | LRO | PDF | BN => true, |
250 | _ => false, |
251 | } |
252 | } |
253 | |
254 | // For use as a predicate for `position` / `rposition` |
255 | pub fn not_removed_by_x9(class: &BidiClass) -> bool { |
256 | !removed_by_x9(*class) |
257 | } |
258 | |
259 | #[cfg (test)] |
260 | mod tests { |
261 | use super::*; |
262 | |
263 | #[test ] |
264 | fn test_level_runs() { |
265 | assert_eq!(level_runs(&Level::vec(&[]), &[]), &[]); |
266 | assert_eq!( |
267 | level_runs(&Level::vec(&[0, 0, 0, 1, 1, 2, 0, 0]), &[L; 8]), |
268 | &[0..3, 3..5, 5..6, 6..8] |
269 | ); |
270 | } |
271 | |
272 | // From <http://www.unicode.org/reports/tr9/#BD13> |
273 | #[rustfmt::skip] |
274 | #[test ] |
275 | fn test_isolating_run_sequences() { |
276 | |
277 | // == Example 1 == |
278 | // text1·RLE·text2·PDF·RLE·text3·PDF·text4 |
279 | // index 0 1 2 3 4 5 6 7 |
280 | let classes = &[L, RLE, L, PDF, RLE, L, PDF, L]; |
281 | let levels = &[0, 1, 1, 1, 1, 1, 1, 0]; |
282 | let para_level = Level::ltr(); |
283 | let mut sequences = isolating_run_sequences(para_level, classes, &Level::vec(levels)); |
284 | sequences.sort_by(|a, b| a.runs[0].clone().cmp(b.runs[0].clone())); |
285 | assert_eq!( |
286 | sequences.iter().map(|s| s.runs.clone()).collect::<Vec<_>>(), |
287 | vec![vec![0..2], vec![2..7], vec![7..8]] |
288 | ); |
289 | |
290 | // == Example 2 == |
291 | // text1·RLI·text2·PDI·RLI·text3·PDI·text4 |
292 | // index 0 1 2 3 4 5 6 7 |
293 | let classes = &[L, RLI, L, PDI, RLI, L, PDI, L]; |
294 | let levels = &[0, 0, 1, 0, 0, 1, 0, 0]; |
295 | let para_level = Level::ltr(); |
296 | let mut sequences = isolating_run_sequences(para_level, classes, &Level::vec(levels)); |
297 | sequences.sort_by(|a, b| a.runs[0].clone().cmp(b.runs[0].clone())); |
298 | assert_eq!( |
299 | sequences.iter().map(|s| s.runs.clone()).collect::<Vec<_>>(), |
300 | vec![vec![0..2, 3..5, 6..8], vec![2..3], vec![5..6]] |
301 | ); |
302 | |
303 | // == Example 3 == |
304 | // text1·RLI·text2·LRI·text3·RLE·text4·PDF·text5·PDI·text6·PDI·text7 |
305 | // index 0 1 2 3 4 5 6 7 8 9 10 11 12 |
306 | let classes = &[L, RLI, L, LRI, L, RLE, L, PDF, L, PDI, L, PDI, L]; |
307 | let levels = &[0, 0, 1, 1, 2, 3, 3, 3, 2, 1, 1, 0, 0]; |
308 | let para_level = Level::ltr(); |
309 | let mut sequences = isolating_run_sequences(para_level, classes, &Level::vec(levels)); |
310 | sequences.sort_by(|a, b| a.runs[0].clone().cmp(b.runs[0].clone())); |
311 | assert_eq!( |
312 | sequences.iter().map(|s| s.runs.clone()).collect::<Vec<_>>(), |
313 | vec![vec![0..2, 11..13], vec![2..4, 9..11], vec![4..6], vec![6..8], vec![8..9]] |
314 | ); |
315 | } |
316 | |
317 | // From <http://www.unicode.org/reports/tr9/#X10> |
318 | #[rustfmt::skip] |
319 | #[test ] |
320 | fn test_isolating_run_sequences_sos_and_eos() { |
321 | |
322 | // == Example 1 == |
323 | // text1·RLE·text2·LRE·text3·PDF·text4·PDF·RLE·text5·PDF·text6 |
324 | // index 0 1 2 3 4 5 6 7 8 9 10 11 |
325 | let classes = &[L, RLE, L, LRE, L, PDF, L, PDF, RLE, L, PDF, L]; |
326 | let levels = &[0, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 0]; |
327 | let para_level = Level::ltr(); |
328 | let mut sequences = isolating_run_sequences(para_level, classes, &Level::vec(levels)); |
329 | sequences.sort_by(|a, b| a.runs[0].clone().cmp(b.runs[0].clone())); |
330 | |
331 | // text1 |
332 | assert_eq!( |
333 | &sequences[0], |
334 | &IsolatingRunSequence { |
335 | runs: vec![0..2], |
336 | sos: L, |
337 | eos: R, |
338 | } |
339 | ); |
340 | |
341 | // text2 |
342 | assert_eq!( |
343 | &sequences[1], |
344 | &IsolatingRunSequence { |
345 | runs: vec![2..4], |
346 | sos: R, |
347 | eos: L, |
348 | } |
349 | ); |
350 | |
351 | // text3 |
352 | assert_eq!( |
353 | &sequences[2], |
354 | &IsolatingRunSequence { |
355 | runs: vec![4..6], |
356 | sos: L, |
357 | eos: L, |
358 | } |
359 | ); |
360 | |
361 | // text4 text5 |
362 | assert_eq!( |
363 | &sequences[3], |
364 | &IsolatingRunSequence { |
365 | runs: vec![6..11], |
366 | sos: L, |
367 | eos: R, |
368 | } |
369 | ); |
370 | |
371 | // text6 |
372 | assert_eq!( |
373 | &sequences[4], |
374 | &IsolatingRunSequence { |
375 | runs: vec![11..12], |
376 | sos: R, |
377 | eos: L, |
378 | } |
379 | ); |
380 | |
381 | // == Example 2 == |
382 | // text1·RLI·text2·LRI·text3·PDI·text4·PDI·RLI·text5·PDI·text6 |
383 | // index 0 1 2 3 4 5 6 7 8 9 10 11 |
384 | let classes = &[L, RLI, L, LRI, L, PDI, L, PDI, RLI, L, PDI, L]; |
385 | let levels = &[0, 0, 1, 1, 2, 1, 1, 0, 0, 1, 0, 0]; |
386 | let para_level = Level::ltr(); |
387 | let mut sequences = isolating_run_sequences(para_level, classes, &Level::vec(levels)); |
388 | sequences.sort_by(|a, b| a.runs[0].clone().cmp(b.runs[0].clone())); |
389 | |
390 | // text1·RLI·PDI·RLI·PDI·text6 |
391 | assert_eq!( |
392 | &sequences[0], |
393 | &IsolatingRunSequence { |
394 | runs: vec![0..2, 7..9, 10..12], |
395 | sos: L, |
396 | eos: L, |
397 | } |
398 | ); |
399 | |
400 | // text2·LRI·PDI·text4 |
401 | assert_eq!( |
402 | &sequences[1], |
403 | &IsolatingRunSequence { |
404 | runs: vec![2..4, 5..7], |
405 | sos: R, |
406 | eos: R, |
407 | } |
408 | ); |
409 | |
410 | // text3 |
411 | assert_eq!( |
412 | &sequences[2], |
413 | &IsolatingRunSequence { |
414 | runs: vec![4..5], |
415 | sos: L, |
416 | eos: L, |
417 | } |
418 | ); |
419 | |
420 | // text5 |
421 | assert_eq!( |
422 | &sequences[3], |
423 | &IsolatingRunSequence { |
424 | runs: vec![9..10], |
425 | sos: R, |
426 | eos: R, |
427 | } |
428 | ); |
429 | } |
430 | |
431 | #[test ] |
432 | fn test_removed_by_x9() { |
433 | let rem_classes = &[RLE, LRE, RLO, LRO, PDF, BN]; |
434 | let not_classes = &[L, RLI, AL, LRI, PDI]; |
435 | for x in rem_classes { |
436 | assert_eq!(removed_by_x9(*x), true); |
437 | } |
438 | for x in not_classes { |
439 | assert_eq!(removed_by_x9(*x), false); |
440 | } |
441 | } |
442 | |
443 | #[test ] |
444 | fn test_not_removed_by_x9() { |
445 | let non_x9_classes = &[L, R, AL, EN, ES, ET, AN, CS, NSM, B, S, WS, ON, LRI, RLI, FSI, PDI]; |
446 | for x in non_x9_classes { |
447 | assert_eq!(not_removed_by_x9(&x), true); |
448 | } |
449 | } |
450 | } |
451 | |