prepare.rs source code [crates/unicode_bidi/src/prepare.rs]

// Copyright 2015 The Servo Project Developers. See the
// COPYRIGHT file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

//! 3.3.3 Preparations for Implicit Processing
//!
//! <http://www.unicode.org/reports/tr9/#Preparations_for_Implicit_Processing>
13
14	use alloc::vec::Vec;
15	use core::cmp::max;
16	use core::ops::Range;
17	#[cfg(feature = "smallvec")]
18	use smallvec::{smallvec, SmallVec};
19
20	use super::level::Level;
21	use super::BidiClass::{self, *};
22
/// A maximal substring of characters with the same embedding level.
///
/// Represented as a range of byte indices.
pub type LevelRun = Range<usize>;

/// Collection of [`LevelRun`]s.
///
/// With the `smallvec` feature enabled this is a `SmallVec` with inline room for 8 runs.
#[cfg(feature = "smallvec")]
pub type LevelRunVec = SmallVec<[LevelRun; 8]>;
/// Collection of [`LevelRun`]s.
///
/// Without the `smallvec` feature this is a plain `Vec`.
#[cfg(not(feature = "smallvec"))]
pub type LevelRunVec = Vec<LevelRun>;
32
33	/// Output of `isolating_run_sequences` (steps X9-X10)
34	#[derive(Debug, PartialEq)]
35	pub struct IsolatingRunSequence {
36	pub runs: Vec<LevelRun>,
37	pub sos: BidiClass, // Start-of-sequence type.
38	pub eos: BidiClass, // End-of-sequence type.
39	}
40
41	#[cfg(feature = "smallvec")]
42	pub type IsolatingRunSequenceVec = SmallVec<[IsolatingRunSequence; `8`]>;
43	#[cfg(not(feature = "smallvec"))]
44	pub type IsolatingRunSequenceVec = Vec<IsolatingRunSequence>;
45
46	/// Compute the set of isolating run sequences.
47	///
48	/// An isolating run sequence is a maximal sequence of level runs such that for all level runs
49	/// except the last one in the sequence, the last character of the run is an isolate initiator
50	/// whose matching PDI is the first character of the next level run in the sequence.
51	///
52	/// Note: This function does not* return the sequences in order by their first characters.*
53	#[cfg_attr(feature = "flame_it", flamer::flame)]
54	pub fn isolating_run_sequences(
55	para_level: Level,
56	original_classes: &[BidiClass],
57	levels: &[Level],
58	runs: LevelRunVec,
59	has_isolate_controls: bool,
60	isolating_run_sequences: &mut IsolatingRunSequenceVec,
61	) {
62	// Per http://www.unicode.org/reports/tr9/#BD13:
63	// "In the absence of isolate initiators, each isolating run sequence in a paragraph
64	// consists of exactly one level run, and each level run constitutes a separate
65	// isolating run sequence."
66	// We can take a simplified path to handle this case.
67	if !has_isolate_controls {
68	isolating_run_sequences.reserve_exact(runs.len());
69	for run in runs {
70	// Determine the `sos` and `eos` class for the sequence.
71	// <http://www.unicode.org/reports/tr9/#X10>
72
73	let run_levels = &levels[run.clone()];
74	let run_classes = &original_classes[run.clone()];
75	let seq_level = run_levels[run_classes
76	.iter()
77	.position(\|c\| not_removed_by_x9(c))
78	.unwrap_or(`0`)];
79
80	let end_level = run_levels[run_classes
81	.iter()
82	.rposition(\|c\| not_removed_by_x9(c))
83	.unwrap_or(run.end - run.start - `1`)];
84
85	// Get the level of the last non-removed char before the run.
86	let pred_level = match original_classes[..run.start]
87	.iter()
88	.rposition(not_removed_by_x9)
89	{
90	Some(idx) => levels[idx],
91	None => para_level,
92	};
93
94	// Get the level of the next non-removed char after the run.
95	let succ_level = match original_classes[run.end..]
96	.iter()
97	.position(not_removed_by_x9)
98	{
99	Some(idx) => levels[run.end + idx],
100	None => para_level,
101	};
102
103	isolating_run_sequences.push(IsolatingRunSequence {
104	runs: vec![run],
105	sos: max(seq_level, pred_level).bidi_class(),
106	eos: max(end_level, succ_level).bidi_class(),
107	});
108	}
109	return;
110	}
111
112	// Compute the set of isolating run sequences.
113	// <http://www.unicode.org/reports/tr9/#BD13>
114	let mut sequences = Vec::with_capacity(runs.len());
115
116	// When we encounter an isolate initiator, we push the current sequence onto the
117	// stack so we can resume it after the matching PDI.
118	#[cfg(feature = "smallvec")]
119	let mut stack: SmallVec<[Vec<Range<usize>>; `8`]> = smallvec![vec![]];
120	#[cfg(not(feature = "smallvec"))]
121	let mut stack = vec![vec![]];
122
123	for run in runs {
124	assert!(!run.is_empty());
125	assert!(!stack.is_empty());
126
127	let start_class = original_classes[run.start];
128	// > In rule X10, [..] skip over any BNs when [..].
129	// > Do the same when determining if the last character of the sequence is an isolate initiator.
130	//
131	// <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters>
132	let end_class = original_classes[run.start..run.end]
133	.iter()
134	.copied()
135	.rev()
136	.find(not_removed_by_x9)
137	.unwrap_or(start_class);
138
139	let mut sequence = if start_class == PDI && stack.len() > `1` {
140	// Continue a previous sequence interrupted by an isolate.
141	stack.pop().unwrap()
142	} else {
143	// Start a new sequence.
144	Vec::new()
145	};
146
147	sequence.push(run);
148
149	if matches!(end_class, RLI \| LRI \| FSI) {
150	// Resume this sequence after the isolate.
151	stack.push(sequence);
152	} else {
153	// This sequence is finished.
154	sequences.push(sequence);
155	}
156	}
157	// Pop any remaining sequences off the stack.
158	sequences.extend(stack.into_iter().rev().filter(\|seq\| !seq.is_empty()));
159
160	// Determine the `sos` and `eos` class for each sequence.
161	// <http://www.unicode.org/reports/tr9/#X10>
162	for sequence in sequences {
163	assert!(!sequence.is_empty());
164
165	let start_of_seq = sequence[`0`].start;
166	let runs_len = sequence.len();
167	let end_of_seq = sequence[runs_len - `1`].end;
168
169	let mut result = IsolatingRunSequence {
170	runs: sequence,
171	sos: L,
172	eos: L,
173	};
174
175	// > (not counting characters removed by X9)
176	let seq_level = levels[result
177	.iter_forwards_from(start_of_seq, `0`)
178	.find(\|i\| not_removed_by_x9(&original_classes[*i]))
179	.unwrap_or(start_of_seq)];
180
181	// XXXManishearth the spec talks of a start and end level,
182	// but for a given IRS the two should be equivalent, yes?
183	let end_level = levels[result
184	.iter_backwards_from(end_of_seq, runs_len - `1`)
185	.find(\|i\| not_removed_by_x9(&original_classes[*i]))
186	.unwrap_or(end_of_seq - `1`)];
187
188	#[cfg(test)]
189	for idx in result.runs.clone().into_iter().flatten() {
190	if not_removed_by_x9(&original_classes[idx]) {
191	assert_eq!(seq_level, levels[idx]);
192	}
193	}
194
195	// Get the level of the last non-removed char before the runs.
196	let pred_level = match original_classes[..start_of_seq]
197	.iter()
198	.rposition(not_removed_by_x9)
199	{
200	Some(idx) => levels[idx],
201	None => para_level,
202	};
203
204	// Get the last non-removed character to check if it is an isolate initiator.
205	// The spec calls for an unmatched one, but matched isolate initiators
206	// will never be at the end of a level run (otherwise there would be more to the run).
207	// We unwrap_or(BN) because BN marks removed classes and it won't matter for the check.
208	let last_non_removed = original_classes[..end_of_seq]
209	.iter()
210	.copied()
211	.rev()
212	.find(not_removed_by_x9)
213	.unwrap_or(BN);
214
215	// Get the level of the next non-removed char after the runs.
216	let succ_level = if matches!(last_non_removed, RLI \| LRI \| FSI) {
217	para_level
218	} else {
219	match original_classes[end_of_seq..]
220	.iter()
221	.position(not_removed_by_x9)
222	{
223	Some(idx) => levels[end_of_seq + idx],
224	None => para_level,
225	}
226	};
227
228	result.sos = max(seq_level, pred_level).bidi_class();
229	result.eos = max(end_level, succ_level).bidi_class();
230
231	isolating_run_sequences.push(result);
232	}
233	}
234
235	impl IsolatingRunSequence {
236	/// Given a text-relative position `pos` and an index of the level run it is in,
237	/// produce an iterator of all characters after and pos (`pos..`) that are in this
238	/// run sequence
239	pub(crate) fn iter_forwards_from(
240	&self,
241	pos: usize,
242	level_run_index: usize,
243	) -> impl Iterator<Item = usize> + '_ {
244	let runs = &self.runs[level_run_index..];
245
246	// Check that it is in range
247	// (we can't use contains() since we want an inclusive range)
248	#[cfg(feature = "std")]
249	debug_assert!(runs[`0`].start <= pos && pos <= runs[`0`].end);
250
251	(pos..runs[`0`].end).chain(runs[`1`..].iter().flat_map(Clone::clone))
252	}
253
254	/// Given a text-relative position `pos` and an index of the level run it is in,
255	/// produce an iterator of all characters before and excludingpos (`..pos`) that are in this
256	/// run sequence
257	pub(crate) fn iter_backwards_from(
258	&self,
259	pos: usize,
260	level_run_index: usize,
261	) -> impl Iterator<Item = usize> + '_ {
262	let prev_runs = &self.runs[..level_run_index];
263	let current = &self.runs[level_run_index];
264
265	// Check that it is in range
266	// (we can't use contains() since we want an inclusive range)
267	#[cfg(feature = "std")]
268	debug_assert!(current.start <= pos && pos <= current.end);
269
270	(current.start..pos)
271	.rev()
272	.chain(prev_runs.iter().rev().flat_map(Clone::clone))
273	}
274	}
275
/// Finds the level runs in a paragraph.
///
/// <http://www.unicode.org/reports/tr9/#BD7>
///
/// Characters removed by X9 never start a new run; they stay in the run that is open when
/// they are encountered.
///
/// This is only used by tests; normally level runs are identified during explicit::compute.
///
/// Panics if `levels` and `original_classes` differ in length.
#[cfg(test)]
fn level_runs(levels: &[Level], original_classes: &[BidiClass]) -> Vec<LevelRun> {
    assert_eq!(levels.len(), original_classes.len());

    let mut runs = Vec::new();
    let mut current_run_level = match levels.first() {
        Some(&level) => level,
        None => return runs,
    };
    let mut current_run_start = 0;

    for (i, (&level, &class)) in levels.iter().zip(original_classes).enumerate().skip(1) {
        if !removed_by_x9(class) && level != current_run_level {
            // End the last run and start a new one.
            runs.push(current_run_start..i);
            current_run_level = level;
            current_run_start = i;
        }
    }
    // Close the run that is still open at the end of the paragraph.
    runs.push(current_run_start..levels.len());

    runs
}
304
305	/// Should this character be ignored in steps after X9?
306	///
307	/// <http://www.unicode.org/reports/tr9/#X9>
308	pub fn removed_by_x9(class: BidiClass) -> bool {
309	matches!(class, RLE \| LRE \| RLO \| LRO \| PDF \| BN)
310	}
311
312	// For use as a predicate for `position` / `rposition`
313	pub fn not_removed_by_x9(class: &BidiClass) -> bool {
314	!removed_by_x9(*class)
315	}
316
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs `isolating_run_sequences` with an LTR paragraph level over the level runs that
    /// `level_runs` derives from `levels`, and returns the result sorted by each sequence's
    /// first run (the function itself does not promise any particular output order).
    fn sorted_sequences(
        classes: &[BidiClass],
        levels: &[Level],
        has_isolate_controls: bool,
    ) -> IsolatingRunSequenceVec {
        let mut sequences = IsolatingRunSequenceVec::new();
        isolating_run_sequences(
            Level::ltr(),
            classes,
            levels,
            level_runs(levels, classes).into(),
            has_isolate_controls,
            &mut sequences,
        );
        sequences.sort_by(|a, b| a.runs[0].clone().cmp(b.runs[0].clone()));
        sequences
    }

    /// Keeps only the level runs of each sequence.
    fn runs_of(sequences: &[IsolatingRunSequence]) -> Vec<Vec<LevelRun>> {
        sequences.iter().map(|s| s.runs.clone()).collect()
    }

    #[test]
    fn test_level_runs() {
        assert_eq!(level_runs(&Level::vec(&[]), &[]), &[]);
        assert_eq!(
            level_runs(&Level::vec(&[0, 0, 0, 1, 1, 2, 0, 0]), &[L; 8]),
            &[0..3, 3..5, 5..6, 6..8]
        );
    }

    // From <http://www.unicode.org/reports/tr9/#BD13>
    #[rustfmt::skip]
    #[test]
    fn test_isolating_run_sequences() {
        // == Example 1 ==
        // chars: text1·RLE·text2·PDF·RLE·text3·PDF·text4
        // index: 0     1   2     3   4   5     6   7
        let sequences = sorted_sequences(
            &[L, RLE, L, PDF, RLE, L, PDF, L],
            &Level::vec(&[0, 1, 1, 1, 1, 1, 1, 0]),
            false,
        );
        assert_eq!(
            runs_of(&sequences),
            vec![vec![0..2], vec![2..7], vec![7..8]]
        );

        // == Example 2 ==
        // chars: text1·RLI·text2·PDI·RLI·text3·PDI·text4
        // index: 0     1   2     3   4   5     6   7
        let sequences = sorted_sequences(
            &[L, RLI, L, PDI, RLI, L, PDI, L],
            &Level::vec(&[0, 0, 1, 0, 0, 1, 0, 0]),
            true,
        );
        assert_eq!(
            runs_of(&sequences),
            vec![vec![0..2, 3..5, 6..8], vec![2..3], vec![5..6]]
        );

        // == Example 3 ==
        // chars: text1·RLI·text2·LRI·text3·RLE·text4·PDF·text5·PDI·text6·PDI·text7
        // index: 0     1   2     3   4     5   6     7   8     9   10    11  12
        let sequences = sorted_sequences(
            &[L, RLI, L, LRI, L, RLE, L, PDF, L, PDI, L, PDI, L],
            &Level::vec(&[0, 0, 1, 1, 2, 3, 3, 3, 2, 1, 1, 0, 0]),
            true,
        );
        assert_eq!(
            runs_of(&sequences),
            vec![vec![0..2, 11..13], vec![2..4, 9..11], vec![4..6], vec![6..8], vec![8..9]]
        );
    }

    // From <http://www.unicode.org/reports/tr9/#X10>
    #[rustfmt::skip]
    #[test]
    fn test_isolating_run_sequences_sos_and_eos() {
        // == Example 1 ==
        // chars: text1·RLE·text2·LRE·text3·PDF·text4·PDF·RLE·text5·PDF·text6
        // index: 0     1   2     3   4     5   6     7   8   9     10  11
        let sequences = sorted_sequences(
            &[L, RLE, L, LRE, L, PDF, L, PDF, RLE, L, PDF, L],
            &Level::vec(&[0, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 0]),
            false,
        );
        let expected = [
            // text1
            IsolatingRunSequence { runs: vec![0..2], sos: L, eos: R },
            // text2
            IsolatingRunSequence { runs: vec![2..4], sos: R, eos: L },
            // text3
            IsolatingRunSequence { runs: vec![4..6], sos: L, eos: L },
            // text4 text5
            IsolatingRunSequence { runs: vec![6..11], sos: L, eos: R },
            // text6
            IsolatingRunSequence { runs: vec![11..12], sos: R, eos: L },
        ];
        assert_eq!(&sequences[..], &expected[..]);

        // == Example 2 ==
        // chars: text1·RLI·text2·LRI·text3·PDI·text4·PDI·RLI·text5·PDI·text6
        // index: 0     1   2     3   4     5   6     7   8   9     10  11
        let sequences = sorted_sequences(
            &[L, RLI, L, LRI, L, PDI, L, PDI, RLI, L, PDI, L],
            &Level::vec(&[0, 0, 1, 1, 2, 1, 1, 0, 0, 1, 0, 0]),
            true,
        );
        let expected = [
            // text1·RLI·PDI·RLI·PDI·text6
            IsolatingRunSequence { runs: vec![0..2, 7..9, 10..12], sos: L, eos: L },
            // text2·LRI·PDI·text4
            IsolatingRunSequence { runs: vec![2..4, 5..7], sos: R, eos: R },
            // text3
            IsolatingRunSequence { runs: vec![4..5], sos: L, eos: L },
            // text5
            IsolatingRunSequence { runs: vec![9..10], sos: R, eos: R },
        ];
        assert_eq!(&sequences[..], &expected[..]);
    }

    #[test]
    fn test_removed_by_x9() {
        let rem_classes = &[RLE, LRE, RLO, LRO, PDF, BN];
        let not_classes = &[L, RLI, AL, LRI, PDI];
        for x in rem_classes {
            assert!(removed_by_x9(*x));
        }
        for x in not_classes {
            assert!(!removed_by_x9(*x));
        }
    }

    #[test]
    fn test_not_removed_by_x9() {
        let non_x9_classes = &[L, R, AL, EN, ES, ET, AN, CS, NSM, B, S, WS, ON, LRI, RLI, FSI, PDI];
        for x in non_x9_classes {
            assert!(not_removed_by_x9(x));
        }
    }
}
544