1 | //! The first pass resolves all block structure, generating an AST. Within a block, items |
2 | //! are in a linear chain with potential inline markup identified. |
3 | |
4 | use std::cmp::max; |
5 | use std::ops::Range; |
6 | |
7 | use crate::parse::{ |
8 | scan_containers, Allocations, FootnoteDef, HeadingAttributes, Item, ItemBody, LinkDef, |
9 | LINK_MAX_NESTED_PARENS, |
10 | }; |
11 | use crate::strings::CowStr; |
12 | use crate::tree::{Tree, TreeIndex}; |
13 | use crate::Options; |
14 | use crate::{ |
15 | linklabel::{scan_link_label_rest, LinkLabel}, |
16 | HeadingLevel, |
17 | }; |
18 | use crate::{scanners::*, MetadataBlockKind}; |
19 | |
20 | use unicase::UniCase; |
21 | |
22 | /// Runs the first pass, which resolves the block structure of the document, |
23 | /// and returns the resulting tree. |
24 | pub(crate) fn run_first_pass(text: &str, options: Options) -> (Tree<Item>, Allocations<'_>) { |
25 | // This is a very naive heuristic for the number of nodes |
26 | // we'll need. |
27 | let start_capacity: usize = max(v1:128, v2:text.len() / 32); |
28 | let lookup_table: &[bool; 256] = &create_lut(&options); |
29 | let first_pass: FirstPass<'_, '_> = FirstPass { |
30 | text, |
31 | tree: Tree::with_capacity(cap:start_capacity), |
32 | begin_list_item: None, |
33 | last_line_blank: false, |
34 | allocs: Allocations::new(), |
35 | options, |
36 | lookup_table, |
37 | next_paragraph_task: None, |
38 | brace_context_next: 0, |
39 | brace_context_stack: Vec::new(), |
40 | }; |
41 | first_pass.run() |
42 | } |
43 | |
44 | // Each level of brace nesting adds another entry to a hash table. |
45 | // To limit the amount of memory consumed, do not create a new brace |
46 | // context beyond some amount deep. |
47 | // |
48 | // There are actually two limits at play here: this one, |
49 | // and the one where the maximum amount of distinct contexts passes |
50 | // the 255 item limit imposed by using `u8`. When over 255 distinct |
51 | // contexts are created, it wraps around, while this one instead makes it |
52 | // saturate, which is a better behavior. |
53 | const MATH_BRACE_CONTEXT_MAX_NESTING: usize = 25; |
54 | |
55 | /// State for the first parsing pass. |
56 | struct FirstPass<'a, 'b> { |
57 | text: &'a str, |
58 | tree: Tree<Item>, |
59 | begin_list_item: Option<usize>, |
60 | last_line_blank: bool, |
61 | allocs: Allocations<'a>, |
62 | options: Options, |
63 | lookup_table: &'b LookupTable, |
64 | /// This is `Some(item)` when the next paragraph |
65 | /// starts with a task list marker. |
66 | next_paragraph_task: Option<Item>, |
67 | /// Math environment brace nesting. |
68 | brace_context_stack: Vec<u8>, |
69 | brace_context_next: usize, |
70 | } |
71 | |
72 | impl<'a, 'b> FirstPass<'a, 'b> { |
73 | fn run(mut self) -> (Tree<Item>, Allocations<'a>) { |
74 | let mut ix = 0; |
75 | while ix < self.text.len() { |
76 | ix = self.parse_block(ix); |
77 | } |
78 | for _ in 0..self.tree.spine_len() { |
79 | self.pop(ix); |
80 | } |
81 | (self.tree, self.allocs) |
82 | } |
83 | |
84 | /// Returns offset after block. |
85 | fn parse_block(&mut self, mut start_ix: usize) -> usize { |
86 | let bytes = self.text.as_bytes(); |
87 | let mut line_start = LineStart::new(&bytes[start_ix..]); |
88 | |
89 | // math spans and their braces are tracked only within a single block |
90 | self.brace_context_stack.clear(); |
91 | self.brace_context_next = 0; |
92 | |
93 | let i = scan_containers( |
94 | &self.tree, |
95 | &mut line_start, |
96 | self.options.has_gfm_footnotes(), |
97 | ); |
98 | for _ in i..self.tree.spine_len() { |
99 | self.pop(start_ix); |
100 | } |
101 | |
102 | if self.options.contains(Options::ENABLE_OLD_FOOTNOTES) { |
103 | // finish footnote if it's still open and was preceded by blank line |
104 | if let Some(node_ix) = self.tree.peek_up() { |
105 | if let ItemBody::FootnoteDefinition(..) = self.tree[node_ix].item.body { |
106 | if self.last_line_blank { |
107 | self.pop(start_ix); |
108 | } |
109 | } |
110 | } |
111 | |
112 | // Footnote definitions of the form |
113 | // [^bar]: |
114 | // * anything really |
115 | let container_start = start_ix + line_start.bytes_scanned(); |
116 | if let Some(bytecount) = self.parse_footnote(container_start) { |
117 | start_ix = container_start + bytecount; |
118 | start_ix += scan_blank_line(&bytes[start_ix..]).unwrap_or(0); |
119 | line_start = LineStart::new(&bytes[start_ix..]); |
120 | } |
121 | } |
122 | |
123 | // Process new containers |
124 | loop { |
125 | if self.options.has_gfm_footnotes() |
126 | || self.options.contains(Options::ENABLE_OLD_FOOTNOTES) |
127 | { |
128 | // Footnote definitions of the form |
129 | // [^bar]: |
130 | // * anything really |
131 | let save = line_start.clone(); |
132 | let indent = line_start.scan_space_upto(4); |
133 | if indent < 4 { |
134 | let container_start = start_ix + line_start.bytes_scanned(); |
135 | if let Some(bytecount) = self.parse_footnote(container_start) { |
136 | start_ix = container_start + bytecount; |
137 | line_start = LineStart::new(&bytes[start_ix..]); |
138 | continue; |
139 | } else { |
140 | line_start = save; |
141 | } |
142 | } else { |
143 | line_start = save; |
144 | } |
145 | } |
146 | let container_start = start_ix + line_start.bytes_scanned(); |
147 | if let Some((ch, index, indent)) = line_start.scan_list_marker() { |
148 | let after_marker_index = start_ix + line_start.bytes_scanned(); |
149 | self.continue_list(container_start, ch, index); |
150 | self.tree.append(Item { |
151 | start: container_start, |
152 | end: after_marker_index, // will get updated later if item not empty |
153 | body: ItemBody::ListItem(indent), |
154 | }); |
155 | self.tree.push(); |
156 | if let Some(n) = scan_blank_line(&bytes[after_marker_index..]) { |
157 | self.begin_list_item = Some(after_marker_index + n); |
158 | return after_marker_index + n; |
159 | } |
160 | if self.options.contains(Options::ENABLE_TASKLISTS) { |
161 | let task_list_marker = |
162 | line_start.scan_task_list_marker().map(|is_checked| Item { |
163 | start: after_marker_index, |
164 | end: start_ix + line_start.bytes_scanned(), |
165 | body: ItemBody::TaskListMarker(is_checked), |
166 | }); |
167 | if let Some(task_list_marker) = task_list_marker { |
168 | if let Some(n) = scan_blank_line(&bytes[task_list_marker.end..]) { |
169 | self.tree.append(task_list_marker); |
170 | self.begin_list_item = Some(task_list_marker.end + n); |
171 | return task_list_marker.end + n; |
172 | } else { |
173 | self.next_paragraph_task = Some(task_list_marker); |
174 | } |
175 | } |
176 | } |
177 | } else if line_start.scan_blockquote_marker() { |
178 | let kind = if self.options.contains(Options::ENABLE_GFM) { |
179 | line_start.scan_blockquote_tag() |
180 | } else { |
181 | None |
182 | }; |
183 | self.finish_list(start_ix); |
184 | self.tree.append(Item { |
185 | start: container_start, |
186 | end: 0, // will get set later |
187 | body: ItemBody::BlockQuote(kind), |
188 | }); |
189 | self.tree.push(); |
190 | if kind.is_some() { |
191 | // blockquote tag leaves us at the end of the line |
192 | // we need to scan through all the container syntax for the next line |
193 | // and break out if we can't re-scan all of them |
194 | let ix = start_ix + line_start.bytes_scanned(); |
195 | let mut lazy_line_start = LineStart::new(&bytes[ix..]); |
196 | let current_container = scan_containers( |
197 | &self.tree, |
198 | &mut lazy_line_start, |
199 | self.options.has_gfm_footnotes(), |
200 | ) == self.tree.spine_len(); |
201 | if !lazy_line_start.scan_space(4) |
202 | && self.scan_paragraph_interrupt( |
203 | &bytes[ix + lazy_line_start.bytes_scanned()..], |
204 | current_container, |
205 | ) |
206 | { |
207 | return ix; |
208 | } else { |
209 | // blockquote tags act as if they were nested in a paragraph |
210 | // so you can lazily continue the imaginary paragraph off of them |
211 | line_start = lazy_line_start; |
212 | line_start.scan_all_space(); |
213 | start_ix = ix; |
214 | break; |
215 | } |
216 | } |
217 | } else { |
218 | break; |
219 | } |
220 | } |
221 | |
222 | let ix = start_ix + line_start.bytes_scanned(); |
223 | |
224 | if let Some(n) = scan_blank_line(&bytes[ix..]) { |
225 | if let Some(node_ix) = self.tree.peek_up() { |
226 | match &mut self.tree[node_ix].item.body { |
227 | ItemBody::BlockQuote(..) => (), |
228 | ItemBody::ListItem(indent) if self.begin_list_item.is_some() => { |
229 | self.last_line_blank = true; |
230 | // This is a blank list item. |
231 | // While the list itself can be continued no matter how many blank lines |
232 | // there are between this one and the next one, it cannot nest anything |
233 | // else, so its indentation should not be subtracted from the line start. |
234 | *indent = 0; |
235 | } |
236 | _ => { |
237 | self.last_line_blank = true; |
238 | } |
239 | } |
240 | } |
241 | return ix + n; |
242 | } |
243 | |
244 | self.finish_list(start_ix); |
245 | |
246 | // Save `remaining_space` here to avoid needing to backtrack `line_start` for HTML blocks |
247 | let remaining_space = line_start.remaining_space(); |
248 | |
249 | let indent = line_start.scan_space_upto(4); |
250 | if indent == 4 { |
251 | let ix = start_ix + line_start.bytes_scanned(); |
252 | let remaining_space = line_start.remaining_space(); |
253 | return self.parse_indented_code_block(ix, remaining_space); |
254 | } |
255 | |
256 | let ix = start_ix + line_start.bytes_scanned(); |
257 | |
258 | // metadata blocks cannot be indented |
259 | if indent == 0 { |
260 | if let Some((_n, metadata_block_ch)) = scan_metadata_block( |
261 | &bytes[ix..], |
262 | self.options |
263 | .contains(Options::ENABLE_YAML_STYLE_METADATA_BLOCKS), |
264 | self.options |
265 | .contains(Options::ENABLE_PLUSES_DELIMITED_METADATA_BLOCKS), |
266 | ) { |
267 | return self.parse_metadata_block(ix, metadata_block_ch); |
268 | } |
269 | } |
270 | |
271 | // HTML Blocks |
272 | if bytes[ix] == b'<' { |
273 | // Types 1-5 are all detected by one function and all end with the same |
274 | // pattern |
275 | if let Some(html_end_tag) = get_html_end_tag(&bytes[(ix + 1)..]) { |
276 | return self.parse_html_block_type_1_to_5( |
277 | ix, |
278 | html_end_tag, |
279 | remaining_space, |
280 | indent, |
281 | ); |
282 | } |
283 | |
284 | // Detect type 6 |
285 | if starts_html_block_type_6(&bytes[(ix + 1)..]) { |
286 | return self.parse_html_block_type_6_or_7(ix, remaining_space, indent); |
287 | } |
288 | |
289 | // Detect type 7 |
290 | if let Some(_html_bytes) = scan_html_type_7(&bytes[ix..]) { |
291 | return self.parse_html_block_type_6_or_7(ix, remaining_space, indent); |
292 | } |
293 | } |
294 | |
295 | if let Ok(n) = scan_hrule(&bytes[ix..]) { |
296 | return self.parse_hrule(n, ix); |
297 | } |
298 | |
299 | if let Some(atx_size) = scan_atx_heading(&bytes[ix..]) { |
300 | return self.parse_atx_heading(ix, atx_size); |
301 | } |
302 | |
303 | if let Some((n, fence_ch)) = scan_code_fence(&bytes[ix..]) { |
304 | return self.parse_fenced_code_block(ix, indent, fence_ch, n); |
305 | } |
306 | |
307 | // parse refdef |
308 | while let Some((bytecount, label, link_def)) = |
309 | self.parse_refdef_total(start_ix + line_start.bytes_scanned()) |
310 | { |
311 | self.allocs.refdefs.0.entry(label).or_insert(link_def); |
312 | let container_start = start_ix + line_start.bytes_scanned(); |
313 | let mut ix = container_start + bytecount; |
314 | // Refdefs act as if they were contained within a paragraph, for purposes of lazy |
315 | // continuations. For example: |
316 | // |
317 | // ``` |
318 | // > [foo]: http://example.com |
319 | // bar |
320 | // ``` |
321 | // |
322 | // is equivalent to |
323 | // |
324 | // ``` |
325 | // > bar |
326 | // |
327 | // [foo]: http://example.com |
328 | // ``` |
329 | if let Some(nl) = scan_blank_line(&bytes[ix..]) { |
330 | ix += nl; |
331 | let mut lazy_line_start = LineStart::new(&bytes[ix..]); |
332 | let current_container = scan_containers( |
333 | &self.tree, |
334 | &mut lazy_line_start, |
335 | self.options.has_gfm_footnotes(), |
336 | ) == self.tree.spine_len(); |
337 | if !lazy_line_start.scan_space(4) |
338 | && self.scan_paragraph_interrupt( |
339 | &bytes[ix + lazy_line_start.bytes_scanned()..], |
340 | current_container, |
341 | ) |
342 | { |
343 | return ix; |
344 | } else { |
345 | line_start = lazy_line_start; |
346 | line_start.scan_all_space(); |
347 | start_ix = ix; |
348 | } |
349 | } else { |
350 | return ix; |
351 | } |
352 | } |
353 | |
354 | let ix = start_ix + line_start.bytes_scanned(); |
355 | |
356 | self.parse_paragraph(ix) |
357 | } |
358 | |
359 | /// Returns the offset of the first line after the table. |
360 | /// Assumptions: current focus is a table element and the table header |
361 | /// matches the separator line (same number of columns). |
362 | fn parse_table( |
363 | &mut self, |
364 | table_cols: usize, |
365 | head_start: usize, |
366 | body_start: usize, |
367 | ) -> Option<usize> { |
368 | // filled empty cells are limited to protect against quadratic growth |
369 | // https://github.com/raphlinus/pulldown-cmark/issues/832 |
370 | let mut missing_empty_cells = 0; |
371 | // parse header. this shouldn't fail because we made sure the table header is ok |
372 | let (_sep_start, thead_ix) = |
373 | self.parse_table_row_inner(head_start, table_cols, &mut missing_empty_cells)?; |
374 | self.tree[thead_ix].item.body = ItemBody::TableHead; |
375 | |
376 | // parse body |
377 | let mut ix = body_start; |
378 | while let Some((next_ix, _row_ix)) = |
379 | self.parse_table_row(ix, table_cols, &mut missing_empty_cells) |
380 | { |
381 | ix = next_ix; |
382 | } |
383 | |
384 | self.pop(ix); |
385 | Some(ix) |
386 | } |
387 | |
388 | /// Call this when containers are taken care of. |
389 | /// Returns bytes scanned, row_ix |
390 | fn parse_table_row_inner( |
391 | &mut self, |
392 | mut ix: usize, |
393 | row_cells: usize, |
394 | missing_empty_cells: &mut usize, |
395 | ) -> Option<(usize, TreeIndex)> { |
396 | // Limit to prevent a malicious input from causing a denial of service. |
397 | const MAX_AUTOCOMPLETED_CELLS: usize = 1 << 18; // = 0x40000 |
398 | |
399 | let bytes = self.text.as_bytes(); |
400 | let mut cells = 0; |
401 | let mut final_cell_ix = None; |
402 | |
403 | let old_cur = self.tree.cur(); |
404 | let row_ix = self.tree.append(Item { |
405 | start: ix, |
406 | end: 0, // set at end of this function |
407 | body: ItemBody::TableRow, |
408 | }); |
409 | self.tree.push(); |
410 | |
411 | loop { |
412 | ix += scan_ch(&bytes[ix..], b'|' ); |
413 | let start_ix = ix; |
414 | ix += scan_whitespace_no_nl(&bytes[ix..]); |
415 | |
416 | if let Some(eol_bytes) = scan_eol(&bytes[ix..]) { |
417 | ix += eol_bytes; |
418 | break; |
419 | } |
420 | |
421 | let cell_ix = self.tree.append(Item { |
422 | start: start_ix, |
423 | end: ix, |
424 | body: ItemBody::TableCell, |
425 | }); |
426 | self.tree.push(); |
427 | let (next_ix, _brk) = self.parse_line(ix, None, TableParseMode::Active); |
428 | |
429 | self.tree[cell_ix].item.end = next_ix; |
430 | self.tree.pop(); |
431 | |
432 | ix = next_ix; |
433 | cells += 1; |
434 | |
435 | if cells == row_cells { |
436 | final_cell_ix = Some(cell_ix); |
437 | } |
438 | } |
439 | |
440 | if let (Some(cur), 0) = (old_cur, cells) { |
441 | self.pop(ix); |
442 | self.tree[cur].next = None; |
443 | return None; |
444 | } |
445 | |
446 | // fill empty cells if needed |
447 | // note: this is where GFM and commonmark-extra diverge. we follow |
448 | // GFM here |
449 | for _ in cells..row_cells { |
450 | if *missing_empty_cells >= MAX_AUTOCOMPLETED_CELLS { |
451 | return None; |
452 | } |
453 | *missing_empty_cells += 1; |
454 | self.tree.append(Item { |
455 | start: ix, |
456 | end: ix, |
457 | body: ItemBody::TableCell, |
458 | }); |
459 | } |
460 | |
461 | // drop excess cells |
462 | if let Some(cell_ix) = final_cell_ix { |
463 | self.tree[cell_ix].next = None; |
464 | } |
465 | |
466 | self.pop(ix); |
467 | |
468 | Some((ix, row_ix)) |
469 | } |
470 | |
471 | /// Returns first offset after the row and the tree index of the row. |
472 | fn parse_table_row( |
473 | &mut self, |
474 | mut ix: usize, |
475 | row_cells: usize, |
476 | missing_empty_cells: &mut usize, |
477 | ) -> Option<(usize, TreeIndex)> { |
478 | let bytes = self.text.as_bytes(); |
479 | let mut line_start = LineStart::new(&bytes[ix..]); |
480 | let current_container = scan_containers( |
481 | &self.tree, |
482 | &mut line_start, |
483 | self.options.has_gfm_footnotes(), |
484 | ) == self.tree.spine_len(); |
485 | if !current_container { |
486 | return None; |
487 | } |
488 | line_start.scan_all_space(); |
489 | ix += line_start.bytes_scanned(); |
490 | if scan_paragraph_interrupt_no_table( |
491 | &bytes[ix..], |
492 | current_container, |
493 | self.options.contains(Options::ENABLE_FOOTNOTES), |
494 | &self.tree, |
495 | ) { |
496 | return None; |
497 | } |
498 | |
499 | let (ix, row_ix) = self.parse_table_row_inner(ix, row_cells, missing_empty_cells)?; |
500 | Some((ix, row_ix)) |
501 | } |
502 | |
503 | /// Returns offset of line start after paragraph. |
504 | fn parse_paragraph(&mut self, start_ix: usize) -> usize { |
505 | let node_ix = self.tree.append(Item { |
506 | start: start_ix, |
507 | end: 0, // will get set later |
508 | body: ItemBody::Paragraph, |
509 | }); |
510 | self.tree.push(); |
511 | |
512 | if let Some(item) = self.next_paragraph_task { |
513 | self.tree.append(item); |
514 | self.next_paragraph_task = None; |
515 | } |
516 | |
517 | let bytes = self.text.as_bytes(); |
518 | let mut ix = start_ix; |
519 | loop { |
520 | let scan_mode = if self.options.contains(Options::ENABLE_TABLES) && ix == start_ix { |
521 | TableParseMode::Scan |
522 | } else { |
523 | TableParseMode::Disabled |
524 | }; |
525 | let (next_ix, brk) = self.parse_line(ix, None, scan_mode); |
526 | |
527 | // break out when we find a table |
528 | if let Some(Item { |
529 | body: ItemBody::Table(alignment_ix), |
530 | .. |
531 | }) = brk |
532 | { |
533 | let table_cols = self.allocs[alignment_ix].len(); |
534 | self.tree[node_ix].item.body = ItemBody::Table(alignment_ix); |
535 | // this clears out any stuff we may have appended - but there may |
536 | // be a cleaner way |
537 | self.tree[node_ix].child = None; |
538 | self.tree.pop(); |
539 | self.tree.push(); |
540 | if let Some(ix) = self.parse_table(table_cols, ix, next_ix) { |
541 | return ix; |
542 | } |
543 | } |
544 | |
545 | ix = next_ix; |
546 | let mut line_start = LineStart::new(&bytes[ix..]); |
547 | let current_container = scan_containers( |
548 | &self.tree, |
549 | &mut line_start, |
550 | self.options.has_gfm_footnotes(), |
551 | ) == self.tree.spine_len(); |
552 | let trailing_backslash_pos = match brk { |
553 | Some(Item { |
554 | start, |
555 | body: ItemBody::HardBreak(true), |
556 | .. |
557 | }) if bytes[start] == b' \\' => Some(start), |
558 | _ => None, |
559 | }; |
560 | if !line_start.scan_space(4) { |
561 | let ix_new = ix + line_start.bytes_scanned(); |
562 | if current_container { |
563 | if let Some(ix_setext) = |
564 | self.parse_setext_heading(ix_new, node_ix, trailing_backslash_pos.is_some()) |
565 | { |
566 | if let Some(pos) = trailing_backslash_pos { |
567 | self.tree.append_text(pos, pos + 1, false); |
568 | } |
569 | ix = ix_setext; |
570 | break; |
571 | } |
572 | } |
573 | // first check for non-empty lists, then for other interrupts |
574 | let suffix = &bytes[ix_new..]; |
575 | if self.scan_paragraph_interrupt(suffix, current_container) { |
576 | if let Some(pos) = trailing_backslash_pos { |
577 | self.tree.append_text(pos, pos + 1, false); |
578 | } |
579 | break; |
580 | } |
581 | } |
582 | line_start.scan_all_space(); |
583 | if line_start.is_at_eol() { |
584 | if let Some(pos) = trailing_backslash_pos { |
585 | self.tree.append_text(pos, pos + 1, false); |
586 | } |
587 | break; |
588 | } |
589 | ix = next_ix + line_start.bytes_scanned(); |
590 | if let Some(item) = brk { |
591 | self.tree.append(item); |
592 | } |
593 | } |
594 | |
595 | self.pop(ix); |
596 | ix |
597 | } |
598 | |
599 | /// Returns end ix of setext_heading on success. |
600 | fn parse_setext_heading( |
601 | &mut self, |
602 | ix: usize, |
603 | node_ix: TreeIndex, |
604 | has_trailing_content: bool, |
605 | ) -> Option<usize> { |
606 | let bytes = self.text.as_bytes(); |
607 | let (n, level) = scan_setext_heading(&bytes[ix..])?; |
608 | let mut attrs = None; |
609 | |
610 | if let Some(cur_ix) = self.tree.cur() { |
611 | let parent_ix = self.tree.peek_up().unwrap(); |
612 | let header_start = self.tree[parent_ix].item.start; |
613 | // Note that `self.tree[parent_ix].item.end` might be zero at this point. |
614 | // Use the end position of the current node (i.e. the last known child |
615 | // of the parent) instead. |
616 | let header_end = self.tree[cur_ix].item.end; |
617 | |
618 | // extract the trailing attribute block |
619 | let (content_end, attrs_) = |
620 | self.extract_and_parse_heading_attribute_block(header_start, header_end); |
621 | attrs = attrs_; |
622 | |
623 | // strip trailing whitespace |
624 | let new_end = if has_trailing_content { |
625 | content_end |
626 | } else { |
627 | let trailing_ws = |
628 | scan_rev_while(&bytes[header_start..content_end], is_ascii_whitespace_no_nl); |
629 | content_end - trailing_ws |
630 | }; |
631 | |
632 | if attrs.is_some() { |
633 | // remove trailing block attributes |
634 | self.tree.truncate_siblings(new_end); |
635 | } |
636 | |
637 | if let Some(cur_ix) = self.tree.cur() { |
638 | self.tree[cur_ix].item.end = new_end; |
639 | } |
640 | } |
641 | |
642 | self.tree[node_ix].item.body = ItemBody::Heading( |
643 | level, |
644 | attrs.map(|attrs| self.allocs.allocate_heading(attrs)), |
645 | ); |
646 | |
647 | Some(ix + n) |
648 | } |
649 | |
650 | /// Parse a line of input, appending text and items to tree. |
651 | /// |
652 | /// Returns: index after line and an item representing the break. |
653 | fn parse_line( |
654 | &mut self, |
655 | start: usize, |
656 | end: Option<usize>, |
657 | mode: TableParseMode, |
658 | ) -> (usize, Option<Item>) { |
659 | let bytes = self.text.as_bytes(); |
660 | let bytes = match end { |
661 | Some(end) => &bytes[..end], |
662 | None => bytes, |
663 | }; |
664 | let bytes_len = bytes.len(); |
665 | let mut pipes = 0; |
666 | let mut last_pipe_ix = start; |
667 | let mut begin_text = start; |
668 | let mut backslash_escaped = false; |
669 | |
670 | let (final_ix, brk) = iterate_special_bytes(self.lookup_table, bytes, start, |ix, byte| { |
671 | match byte { |
672 | b' \n' | b' \r' => { |
673 | if let TableParseMode::Active = mode { |
674 | return LoopInstruction::BreakAtWith(ix, None); |
675 | } |
676 | |
677 | let mut i = ix; |
678 | let eol_bytes = scan_eol(&bytes[ix..]).unwrap(); |
679 | |
680 | let end_ix = ix + eol_bytes; |
681 | let trailing_backslashes = scan_rev_while(&bytes[..ix], |b| b == b' \\' ); |
682 | if trailing_backslashes % 2 == 1 && end_ix < bytes_len { |
683 | i -= 1; |
684 | self.tree.append_text(begin_text, i, backslash_escaped); |
685 | backslash_escaped = false; |
686 | return LoopInstruction::BreakAtWith( |
687 | end_ix, |
688 | Some(Item { |
689 | start: i, |
690 | end: end_ix, |
691 | body: ItemBody::HardBreak(true), |
692 | }), |
693 | ); |
694 | } |
695 | |
696 | if mode == TableParseMode::Scan && pipes > 0 { |
697 | // check if we may be parsing a table |
698 | let next_line_ix = ix + eol_bytes; |
699 | let mut line_start = LineStart::new(&bytes[next_line_ix..]); |
700 | if scan_containers( |
701 | &self.tree, |
702 | &mut line_start, |
703 | self.options.has_gfm_footnotes(), |
704 | ) == self.tree.spine_len() |
705 | { |
706 | let table_head_ix = next_line_ix + line_start.bytes_scanned(); |
707 | let (table_head_bytes, alignment) = |
708 | scan_table_head(&bytes[table_head_ix..]); |
709 | |
710 | if table_head_bytes > 0 { |
711 | // computing header count from number of pipes |
712 | let header_count = |
713 | count_header_cols(bytes, pipes, start, last_pipe_ix); |
714 | |
715 | // make sure they match the number of columns we find in separator line |
716 | if alignment.len() == header_count { |
717 | let alignment_ix = self.allocs.allocate_alignment(alignment); |
718 | let end_ix = table_head_ix + table_head_bytes; |
719 | return LoopInstruction::BreakAtWith( |
720 | end_ix, |
721 | Some(Item { |
722 | start: i, |
723 | end: end_ix, // must update later |
724 | body: ItemBody::Table(alignment_ix), |
725 | }), |
726 | ); |
727 | } |
728 | } |
729 | } |
730 | } |
731 | |
732 | let trailing_whitespace = |
733 | scan_rev_while(&bytes[..ix], is_ascii_whitespace_no_nl); |
734 | if trailing_whitespace >= 2 { |
735 | i -= trailing_whitespace; |
736 | self.tree.append_text(begin_text, i, backslash_escaped); |
737 | backslash_escaped = false; |
738 | return LoopInstruction::BreakAtWith( |
739 | end_ix, |
740 | Some(Item { |
741 | start: i, |
742 | end: end_ix, |
743 | body: ItemBody::HardBreak(false), |
744 | }), |
745 | ); |
746 | } |
747 | |
748 | self.tree |
749 | .append_text(begin_text, ix - trailing_whitespace, backslash_escaped); |
750 | backslash_escaped = false; |
751 | |
752 | LoopInstruction::BreakAtWith( |
753 | end_ix, |
754 | Some(Item { |
755 | start: i, |
756 | end: end_ix, |
757 | body: ItemBody::SoftBreak, |
758 | }), |
759 | ) |
760 | } |
761 | b' \\' => { |
762 | if ix + 1 < bytes_len && is_ascii_punctuation(bytes[ix + 1]) { |
763 | self.tree.append_text(begin_text, ix, backslash_escaped); |
764 | if bytes[ix + 1] == b'`' { |
765 | let count = 1 + scan_ch_repeat(&bytes[(ix + 2)..], b'`' ); |
766 | self.tree.append(Item { |
767 | start: ix + 1, |
768 | end: ix + count + 1, |
769 | body: ItemBody::MaybeCode(count, true), |
770 | }); |
771 | begin_text = ix + 1 + count; |
772 | backslash_escaped = false; |
773 | LoopInstruction::ContinueAndSkip(count) |
774 | } else if bytes[ix + 1] == b'|' && TableParseMode::Active == mode { |
775 | // Yeah, it's super weird that backslash escaped pipes in tables aren't "real" |
776 | // backslash escapes. |
777 | // |
778 | // This tree structure is intended for the benefit of inline analysis, and it |
779 | // is supposed to operate as-if backslash escaped pipes were stripped out in a |
780 | // separate pass. |
781 | begin_text = ix + 1; |
782 | backslash_escaped = false; |
783 | LoopInstruction::ContinueAndSkip(1) |
784 | } else if ix + 2 < bytes_len |
785 | && bytes[ix + 1] == b' \\' |
786 | && bytes[ix + 2] == b'|' |
787 | && TableParseMode::Active == mode |
788 | { |
789 | // To parse `\\|`, discard the backslashes and parse the `|` that follows it. |
790 | begin_text = ix + 2; |
791 | backslash_escaped = true; |
792 | LoopInstruction::ContinueAndSkip(2) |
793 | } else { |
794 | begin_text = ix + 1; |
795 | backslash_escaped = true; |
796 | LoopInstruction::ContinueAndSkip(1) |
797 | } |
798 | } else { |
799 | LoopInstruction::ContinueAndSkip(0) |
800 | } |
801 | } |
802 | c @ b'*' | c @ b'_' | c @ b'~' => { |
803 | let string_suffix = &self.text[ix..]; |
804 | let count = 1 + scan_ch_repeat(&string_suffix.as_bytes()[1..], c); |
805 | let can_open = delim_run_can_open( |
806 | &self.text[start..], |
807 | string_suffix, |
808 | count, |
809 | ix - start, |
810 | mode, |
811 | ); |
812 | let can_close = delim_run_can_close( |
813 | &self.text[start..], |
814 | string_suffix, |
815 | count, |
816 | ix - start, |
817 | mode, |
818 | ); |
819 | let is_valid_seq = c != b'~' || count <= 2; |
820 | |
821 | if (can_open || can_close) && is_valid_seq { |
822 | self.tree.append_text(begin_text, ix, backslash_escaped); |
823 | backslash_escaped = false; |
824 | for i in 0..count { |
825 | self.tree.append(Item { |
826 | start: ix + i, |
827 | end: ix + i + 1, |
828 | body: ItemBody::MaybeEmphasis(count - i, can_open, can_close), |
829 | }); |
830 | } |
831 | begin_text = ix + count; |
832 | } |
833 | LoopInstruction::ContinueAndSkip(count - 1) |
834 | } |
835 | b'$' => { |
836 | let string_suffix = &self.text[ix..]; |
837 | let can_open = !string_suffix[1..] |
838 | .as_bytes() |
839 | .first() |
840 | .copied() |
841 | .map_or(true, is_ascii_whitespace); |
842 | let can_close = ix > start |
843 | && !self.text[..ix] |
844 | .as_bytes() |
845 | .last() |
846 | .copied() |
847 | .map_or(true, is_ascii_whitespace); |
848 | |
849 | // 0xFFFF_FFFF... represents the root brace context. Using None would require |
850 | // storing Option<u8>, which is bigger than u8. |
851 | // |
852 | // These shouldn't conflict unless you have 255 levels of nesting, which is |
853 | // past the intended limit anyway. |
854 | // |
855 | // Unbalanced braces will cause the root to be changed, which is why it gets |
856 | // stored here. |
857 | let brace_context = |
858 | if self.brace_context_stack.len() > MATH_BRACE_CONTEXT_MAX_NESTING { |
859 | self.brace_context_next as u8 |
860 | } else { |
861 | self.brace_context_stack.last().copied().unwrap_or_else(|| { |
862 | self.brace_context_stack.push(!0); |
863 | !0 |
864 | }) |
865 | }; |
866 | |
867 | self.tree.append_text(begin_text, ix, backslash_escaped); |
868 | self.tree.append(Item { |
869 | start: ix, |
870 | end: ix + 1, |
871 | body: ItemBody::MaybeMath(can_open, can_close, brace_context), |
872 | }); |
873 | begin_text = ix + 1; |
874 | LoopInstruction::ContinueAndSkip(0) |
875 | } |
876 | b'{' => { |
877 | if self.brace_context_stack.len() == MATH_BRACE_CONTEXT_MAX_NESTING { |
878 | self.brace_context_stack.push(self.brace_context_next as u8); |
879 | self.brace_context_next = MATH_BRACE_CONTEXT_MAX_NESTING; |
880 | } else if self.brace_context_stack.len() > MATH_BRACE_CONTEXT_MAX_NESTING { |
881 | // When we reach the limit of nesting, switch from actually matching |
882 | // braces to just counting them. |
883 | self.brace_context_next += 1; |
884 | } else if !self.brace_context_stack.is_empty() { |
885 | // Store nothing if no math environment has been reached yet. |
886 | self.brace_context_stack.push(self.brace_context_next as u8); |
887 | self.brace_context_next += 1; |
888 | } |
889 | LoopInstruction::ContinueAndSkip(0) |
890 | } |
891 | b'}' => { |
892 | if let &mut [ref mut top_level_context] = &mut self.brace_context_stack[..] { |
893 | // Unbalanced Braces |
894 | // |
895 | // The initial, root top-level brace context is -1, but this is changed whenever an unbalanced |
896 | // close brace is encountered: |
897 | // |
898 | // This is not a math environment: $}$ |
899 | // ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^|^ |
900 | // -1 |-2 |
901 | // |
902 | // To ensure this can't get parsed as math, each side of the unbalanced |
903 | // brace is an irreversibly separate brace context. As long as the math |
904 | // environment itself contains balanced braces, they should share a top level context. |
905 | // |
906 | // Math environment contains 2+2: $}$2+2$ |
907 | // ^^^ this is a math environment |
908 | *top_level_context = top_level_context.wrapping_sub(1); |
909 | } else if self.brace_context_stack.len() > MATH_BRACE_CONTEXT_MAX_NESTING { |
910 | // When we exceed 25 levels of nesting, switch from accurately balancing braces |
911 | // to just counting them. When we dip back below the limit, switch back. |
912 | if self.brace_context_next <= MATH_BRACE_CONTEXT_MAX_NESTING { |
913 | self.brace_context_stack.pop(); |
914 | } else { |
915 | self.brace_context_next -= 1; |
916 | } |
917 | } else { |
918 | self.brace_context_stack.pop(); |
919 | } |
920 | LoopInstruction::ContinueAndSkip(0) |
921 | } |
922 | b'`' => { |
923 | self.tree.append_text(begin_text, ix, backslash_escaped); |
924 | backslash_escaped = false; |
925 | let count = 1 + scan_ch_repeat(&bytes[(ix + 1)..], b'`' ); |
926 | self.tree.append(Item { |
927 | start: ix, |
928 | end: ix + count, |
929 | body: ItemBody::MaybeCode(count, false), |
930 | }); |
931 | begin_text = ix + count; |
932 | LoopInstruction::ContinueAndSkip(count - 1) |
933 | } |
934 | b'<' if bytes.get(ix + 1) != Some(&b' \\' ) => { |
935 | // Note: could detect some non-HTML cases and early escape here, but not |
936 | // clear that's a win. |
937 | self.tree.append_text(begin_text, ix, backslash_escaped); |
938 | backslash_escaped = false; |
939 | self.tree.append(Item { |
940 | start: ix, |
941 | end: ix + 1, |
942 | body: ItemBody::MaybeHtml, |
943 | }); |
944 | begin_text = ix + 1; |
945 | LoopInstruction::ContinueAndSkip(0) |
946 | } |
947 | b'!' => { |
948 | if ix + 1 < bytes_len && bytes[ix + 1] == b'[' { |
949 | self.tree.append_text(begin_text, ix, backslash_escaped); |
950 | backslash_escaped = false; |
951 | self.tree.append(Item { |
952 | start: ix, |
953 | end: ix + 2, |
954 | body: ItemBody::MaybeImage, |
955 | }); |
956 | begin_text = ix + 2; |
957 | LoopInstruction::ContinueAndSkip(1) |
958 | } else { |
959 | LoopInstruction::ContinueAndSkip(0) |
960 | } |
961 | } |
962 | b'[' => { |
963 | self.tree.append_text(begin_text, ix, backslash_escaped); |
964 | backslash_escaped = false; |
965 | self.tree.append(Item { |
966 | start: ix, |
967 | end: ix + 1, |
968 | body: ItemBody::MaybeLinkOpen, |
969 | }); |
970 | begin_text = ix + 1; |
971 | LoopInstruction::ContinueAndSkip(0) |
972 | } |
973 | b']' => { |
974 | self.tree.append_text(begin_text, ix, backslash_escaped); |
975 | backslash_escaped = false; |
976 | self.tree.append(Item { |
977 | start: ix, |
978 | end: ix + 1, |
979 | body: ItemBody::MaybeLinkClose(true), |
980 | }); |
981 | begin_text = ix + 1; |
982 | LoopInstruction::ContinueAndSkip(0) |
983 | } |
984 | b'&' => match scan_entity(&bytes[ix..]) { |
985 | (n, Some(value)) => { |
986 | self.tree.append_text(begin_text, ix, backslash_escaped); |
987 | backslash_escaped = false; |
988 | self.tree.append(Item { |
989 | start: ix, |
990 | end: ix + n, |
991 | body: ItemBody::SynthesizeText(self.allocs.allocate_cow(value)), |
992 | }); |
993 | begin_text = ix + n; |
994 | LoopInstruction::ContinueAndSkip(n - 1) |
995 | } |
996 | _ => LoopInstruction::ContinueAndSkip(0), |
997 | }, |
998 | b'|' => { |
999 | if ix != 0 && bytes[ix - 1] == b' \\' { |
1000 | LoopInstruction::ContinueAndSkip(0) |
1001 | } else if let TableParseMode::Active = mode { |
1002 | LoopInstruction::BreakAtWith(ix, None) |
1003 | } else { |
1004 | last_pipe_ix = ix; |
1005 | pipes += 1; |
1006 | LoopInstruction::ContinueAndSkip(0) |
1007 | } |
1008 | } |
1009 | b'.' => { |
1010 | if ix + 2 < bytes.len() && bytes[ix + 1] == b'.' && bytes[ix + 2] == b'.' { |
1011 | self.tree.append_text(begin_text, ix, backslash_escaped); |
1012 | backslash_escaped = false; |
1013 | self.tree.append(Item { |
1014 | start: ix, |
1015 | end: ix + 3, |
1016 | body: ItemBody::SynthesizeChar('…' ), |
1017 | }); |
1018 | begin_text = ix + 3; |
1019 | LoopInstruction::ContinueAndSkip(2) |
1020 | } else { |
1021 | LoopInstruction::ContinueAndSkip(0) |
1022 | } |
1023 | } |
1024 | b'-' => { |
1025 | let count = 1 + scan_ch_repeat(&bytes[(ix + 1)..], b'-' ); |
1026 | if count == 1 { |
1027 | LoopInstruction::ContinueAndSkip(0) |
1028 | } else { |
1029 | let itembody = if count == 2 { |
1030 | ItemBody::SynthesizeChar('–' ) |
1031 | } else if count == 3 { |
1032 | ItemBody::SynthesizeChar('—' ) |
1033 | } else { |
1034 | let (ems, ens) = match count % 6 { |
1035 | 0 | 3 => (count / 3, 0), |
1036 | 2 | 4 => (0, count / 2), |
1037 | 1 => (count / 3 - 1, 2), |
1038 | _ => (count / 3, 1), |
1039 | }; |
1040 | // – and — are 3 bytes each in utf8 |
1041 | let mut buf = String::with_capacity(3 * (ems + ens)); |
1042 | for _ in 0..ems { |
1043 | buf.push('—' ); |
1044 | } |
1045 | for _ in 0..ens { |
1046 | buf.push('–' ); |
1047 | } |
1048 | ItemBody::SynthesizeText(self.allocs.allocate_cow(buf.into())) |
1049 | }; |
1050 | |
1051 | self.tree.append_text(begin_text, ix, backslash_escaped); |
1052 | backslash_escaped = false; |
1053 | self.tree.append(Item { |
1054 | start: ix, |
1055 | end: ix + count, |
1056 | body: itembody, |
1057 | }); |
1058 | begin_text = ix + count; |
1059 | LoopInstruction::ContinueAndSkip(count - 1) |
1060 | } |
1061 | } |
1062 | c @ b' \'' | c @ b'"' => { |
1063 | let string_suffix = &self.text[ix..]; |
1064 | let can_open = |
1065 | delim_run_can_open(&self.text[start..], string_suffix, 1, ix - start, mode); |
1066 | let can_close = delim_run_can_close( |
1067 | &self.text[start..], |
1068 | string_suffix, |
1069 | 1, |
1070 | ix - start, |
1071 | mode, |
1072 | ); |
1073 | |
1074 | self.tree.append_text(begin_text, ix, backslash_escaped); |
1075 | backslash_escaped = false; |
1076 | self.tree.append(Item { |
1077 | start: ix, |
1078 | end: ix + 1, |
1079 | body: ItemBody::MaybeSmartQuote(c, can_open, can_close), |
1080 | }); |
1081 | begin_text = ix + 1; |
1082 | |
1083 | LoopInstruction::ContinueAndSkip(0) |
1084 | } |
1085 | _ => LoopInstruction::ContinueAndSkip(0), |
1086 | } |
1087 | }); |
1088 | |
1089 | if brk.is_none() { |
1090 | let trailing_whitespace = |
1091 | scan_rev_while(&bytes[begin_text..final_ix], is_ascii_whitespace_no_nl); |
1092 | // need to close text at eof |
1093 | self.tree.append_text( |
1094 | begin_text, |
1095 | final_ix - trailing_whitespace, |
1096 | backslash_escaped, |
1097 | ); |
1098 | } |
1099 | (final_ix, brk) |
1100 | } |
1101 | |
1102 | /// When start_ix is at the beginning of an HTML block of type 1 to 5, |
1103 | /// this will find the end of the block, adding the block itself to the |
1104 | /// tree and also keeping track of the lines of HTML within the block. |
1105 | /// |
1106 | /// The html_end_tag is the tag that must be found on a line to end the block. |
1107 | fn parse_html_block_type_1_to_5( |
1108 | &mut self, |
1109 | start_ix: usize, |
1110 | html_end_tag: &str, |
1111 | mut remaining_space: usize, |
1112 | mut indent: usize, |
1113 | ) -> usize { |
1114 | self.tree.append(Item { |
1115 | start: start_ix, |
1116 | end: 0, // set later |
1117 | body: ItemBody::HtmlBlock, |
1118 | }); |
1119 | self.tree.push(); |
1120 | |
1121 | let bytes = self.text.as_bytes(); |
1122 | let mut ix = start_ix; |
1123 | let end_ix; |
1124 | loop { |
1125 | let line_start_ix = ix; |
1126 | ix += scan_nextline(&bytes[ix..]); |
1127 | self.append_html_line(remaining_space.max(indent), line_start_ix, ix); |
1128 | |
1129 | let mut line_start = LineStart::new(&bytes[ix..]); |
1130 | let n_containers = scan_containers( |
1131 | &self.tree, |
1132 | &mut line_start, |
1133 | self.options.has_gfm_footnotes(), |
1134 | ); |
1135 | if n_containers < self.tree.spine_len() { |
1136 | end_ix = ix; |
1137 | break; |
1138 | } |
1139 | |
1140 | if self.text[line_start_ix..ix].contains(html_end_tag) { |
1141 | end_ix = ix; |
1142 | break; |
1143 | } |
1144 | |
1145 | let next_line_ix = ix + line_start.bytes_scanned(); |
1146 | if next_line_ix == self.text.len() { |
1147 | end_ix = next_line_ix; |
1148 | break; |
1149 | } |
1150 | ix = next_line_ix; |
1151 | remaining_space = line_start.remaining_space(); |
1152 | indent = 0; |
1153 | } |
1154 | self.pop(end_ix); |
1155 | ix |
1156 | } |
1157 | |
1158 | /// When start_ix is at the beginning of an HTML block of type 6 or 7, |
1159 | /// this will consume lines until there is a blank line and keep track of |
1160 | /// the HTML within the block. |
1161 | fn parse_html_block_type_6_or_7( |
1162 | &mut self, |
1163 | start_ix: usize, |
1164 | mut remaining_space: usize, |
1165 | mut indent: usize, |
1166 | ) -> usize { |
1167 | self.tree.append(Item { |
1168 | start: start_ix, |
1169 | end: 0, // set later |
1170 | body: ItemBody::HtmlBlock, |
1171 | }); |
1172 | self.tree.push(); |
1173 | |
1174 | let bytes = self.text.as_bytes(); |
1175 | let mut ix = start_ix; |
1176 | let end_ix; |
1177 | loop { |
1178 | let line_start_ix = ix; |
1179 | ix += scan_nextline(&bytes[ix..]); |
1180 | self.append_html_line(remaining_space.max(indent), line_start_ix, ix); |
1181 | |
1182 | let mut line_start = LineStart::new(&bytes[ix..]); |
1183 | let n_containers = scan_containers( |
1184 | &self.tree, |
1185 | &mut line_start, |
1186 | self.options.has_gfm_footnotes(), |
1187 | ); |
1188 | if n_containers < self.tree.spine_len() || line_start.is_at_eol() { |
1189 | end_ix = ix; |
1190 | break; |
1191 | } |
1192 | |
1193 | let next_line_ix = ix + line_start.bytes_scanned(); |
1194 | if next_line_ix == self.text.len() || scan_blank_line(&bytes[next_line_ix..]).is_some() |
1195 | { |
1196 | end_ix = next_line_ix; |
1197 | break; |
1198 | } |
1199 | ix = next_line_ix; |
1200 | remaining_space = line_start.remaining_space(); |
1201 | indent = 0; |
1202 | } |
1203 | self.pop(end_ix); |
1204 | ix |
1205 | } |
1206 | |
1207 | fn parse_indented_code_block(&mut self, start_ix: usize, mut remaining_space: usize) -> usize { |
1208 | self.tree.append(Item { |
1209 | start: start_ix, |
1210 | end: 0, // will get set later |
1211 | body: ItemBody::IndentCodeBlock, |
1212 | }); |
1213 | self.tree.push(); |
1214 | let bytes = self.text.as_bytes(); |
1215 | let mut last_nonblank_child = None; |
1216 | let mut last_nonblank_ix = 0; |
1217 | let mut end_ix = 0; |
1218 | self.last_line_blank = false; |
1219 | |
1220 | let mut ix = start_ix; |
1221 | loop { |
1222 | let line_start_ix = ix; |
1223 | ix += scan_nextline(&bytes[ix..]); |
1224 | self.append_code_text(remaining_space, line_start_ix, ix); |
1225 | // TODO(spec clarification): should we synthesize newline at EOF? |
1226 | |
1227 | if !self.last_line_blank { |
1228 | last_nonblank_child = self.tree.cur(); |
1229 | last_nonblank_ix = ix; |
1230 | end_ix = ix; |
1231 | } |
1232 | |
1233 | let mut line_start = LineStart::new(&bytes[ix..]); |
1234 | let n_containers = scan_containers( |
1235 | &self.tree, |
1236 | &mut line_start, |
1237 | self.options.has_gfm_footnotes(), |
1238 | ); |
1239 | if n_containers < self.tree.spine_len() |
1240 | || !(line_start.scan_space(4) || line_start.is_at_eol()) |
1241 | { |
1242 | break; |
1243 | } |
1244 | let next_line_ix = ix + line_start.bytes_scanned(); |
1245 | if next_line_ix == self.text.len() { |
1246 | break; |
1247 | } |
1248 | ix = next_line_ix; |
1249 | remaining_space = line_start.remaining_space(); |
1250 | self.last_line_blank = scan_blank_line(&bytes[ix..]).is_some(); |
1251 | } |
1252 | |
1253 | // Trim trailing blank lines. |
1254 | if let Some(child) = last_nonblank_child { |
1255 | self.tree[child].next = None; |
1256 | self.tree[child].item.end = last_nonblank_ix; |
1257 | } |
1258 | self.pop(end_ix); |
1259 | ix |
1260 | } |
1261 | |
1262 | fn parse_fenced_code_block( |
1263 | &mut self, |
1264 | start_ix: usize, |
1265 | indent: usize, |
1266 | fence_ch: u8, |
1267 | n_fence_char: usize, |
1268 | ) -> usize { |
1269 | let bytes = self.text.as_bytes(); |
1270 | let mut info_start = start_ix + n_fence_char; |
1271 | info_start += scan_whitespace_no_nl(&bytes[info_start..]); |
1272 | // TODO: info strings are typically very short. wouldn't it be faster |
1273 | // to just do a forward scan here? |
1274 | let mut ix = info_start + scan_nextline(&bytes[info_start..]); |
1275 | let info_end = ix - scan_rev_while(&bytes[info_start..ix], is_ascii_whitespace); |
1276 | let info_string = unescape(&self.text[info_start..info_end], self.tree.is_in_table()); |
1277 | self.tree.append(Item { |
1278 | start: start_ix, |
1279 | end: 0, // will get set later |
1280 | body: ItemBody::FencedCodeBlock(self.allocs.allocate_cow(info_string)), |
1281 | }); |
1282 | self.tree.push(); |
1283 | loop { |
1284 | let mut line_start = LineStart::new(&bytes[ix..]); |
1285 | let n_containers = scan_containers( |
1286 | &self.tree, |
1287 | &mut line_start, |
1288 | self.options.has_gfm_footnotes(), |
1289 | ); |
1290 | if n_containers < self.tree.spine_len() { |
1291 | // this line will get parsed again as not being part of the code |
1292 | // if it's blank, it should be parsed as a blank line |
1293 | self.pop(ix); |
1294 | return ix; |
1295 | } |
1296 | line_start.scan_space(indent); |
1297 | let mut close_line_start = line_start.clone(); |
1298 | if !close_line_start.scan_space(4 - indent) { |
1299 | let close_ix = ix + close_line_start.bytes_scanned(); |
1300 | if let Some(n) = scan_closing_code_fence(&bytes[close_ix..], fence_ch, n_fence_char) |
1301 | { |
1302 | ix = close_ix + n; |
1303 | self.pop(ix); |
1304 | // try to read trailing whitespace or it will register as a completely blank line |
1305 | return ix + scan_blank_line(&bytes[ix..]).unwrap_or(0); |
1306 | } |
1307 | } |
1308 | let remaining_space = line_start.remaining_space(); |
1309 | ix += line_start.bytes_scanned(); |
1310 | let next_ix = ix + scan_nextline(&bytes[ix..]); |
1311 | self.append_code_text(remaining_space, ix, next_ix); |
1312 | ix = next_ix; |
1313 | } |
1314 | } |
1315 | |
1316 | fn parse_metadata_block(&mut self, start_ix: usize, metadata_block_ch: u8) -> usize { |
1317 | let bytes = self.text.as_bytes(); |
1318 | let metadata_block_kind = match metadata_block_ch { |
1319 | b'-' => MetadataBlockKind::YamlStyle, |
1320 | b'+' => MetadataBlockKind::PlusesStyle, |
1321 | _ => panic!("Erroneous metadata block character when parsing metadata block" ), |
1322 | }; |
1323 | // 3 delimiter characters |
1324 | let mut ix = start_ix + 3 + scan_nextline(&bytes[start_ix + 3..]); |
1325 | self.tree.append(Item { |
1326 | start: start_ix, |
1327 | end: 0, // will get set later |
1328 | body: ItemBody::MetadataBlock(metadata_block_kind), |
1329 | }); |
1330 | self.tree.push(); |
1331 | loop { |
1332 | let mut line_start = LineStart::new(&bytes[ix..]); |
1333 | let n_containers = scan_containers( |
1334 | &self.tree, |
1335 | &mut line_start, |
1336 | self.options.has_gfm_footnotes(), |
1337 | ); |
1338 | if n_containers < self.tree.spine_len() { |
1339 | break; |
1340 | } |
1341 | if let (_, 0) = calc_indent(&bytes[ix..], 4) { |
1342 | if let Some(n) = scan_closing_metadata_block(&bytes[ix..], metadata_block_ch) { |
1343 | ix += n; |
1344 | break; |
1345 | } |
1346 | } |
1347 | let remaining_space = line_start.remaining_space(); |
1348 | ix += line_start.bytes_scanned(); |
1349 | let next_ix = ix + scan_nextline(&bytes[ix..]); |
1350 | self.append_code_text(remaining_space, ix, next_ix); |
1351 | ix = next_ix; |
1352 | } |
1353 | |
1354 | self.pop(ix); |
1355 | |
1356 | // try to read trailing whitespace or it will register as a completely blank line |
1357 | ix + scan_blank_line(&bytes[ix..]).unwrap_or(0) |
1358 | } |
1359 | |
1360 | fn append_code_text(&mut self, remaining_space: usize, start: usize, end: usize) { |
1361 | if remaining_space > 0 { |
1362 | let cow_ix = self.allocs.allocate_cow(" " [..remaining_space].into()); |
1363 | self.tree.append(Item { |
1364 | start, |
1365 | end: start, |
1366 | body: ItemBody::SynthesizeText(cow_ix), |
1367 | }); |
1368 | } |
1369 | if self.text.as_bytes()[end - 2] == b' \r' { |
1370 | // Normalize CRLF to LF |
1371 | self.tree.append_text(start, end - 2, false); |
1372 | self.tree.append_text(end - 1, end, false); |
1373 | } else { |
1374 | self.tree.append_text(start, end, false); |
1375 | } |
1376 | } |
1377 | |
1378 | /// Appends a line of HTML to the tree. |
1379 | fn append_html_line(&mut self, remaining_space: usize, start: usize, end: usize) { |
1380 | if remaining_space > 0 { |
1381 | let cow_ix = self.allocs.allocate_cow(" " [..remaining_space].into()); |
1382 | self.tree.append(Item { |
1383 | start, |
1384 | end: start, |
1385 | body: ItemBody::SynthesizeText(cow_ix), |
1386 | }); |
1387 | } |
1388 | if self.text.as_bytes()[end - 2] == b' \r' { |
1389 | // Normalize CRLF to LF |
1390 | self.tree.append(Item { |
1391 | start, |
1392 | end: end - 2, |
1393 | body: ItemBody::Html, |
1394 | }); |
1395 | self.tree.append(Item { |
1396 | start: end - 1, |
1397 | end, |
1398 | body: ItemBody::Html, |
1399 | }); |
1400 | } else { |
1401 | self.tree.append(Item { |
1402 | start, |
1403 | end, |
1404 | body: ItemBody::Html, |
1405 | }); |
1406 | } |
1407 | } |
1408 | |
1409 | /// Pop a container, setting its end. |
1410 | fn pop(&mut self, ix: usize) { |
1411 | let cur_ix = self.tree.pop().unwrap(); |
1412 | self.tree[cur_ix].item.end = ix; |
1413 | if let ItemBody::List(true, _, _) = self.tree[cur_ix].item.body { |
1414 | surgerize_tight_list(&mut self.tree, cur_ix); |
1415 | self.begin_list_item = None; |
1416 | } |
1417 | } |
1418 | |
1419 | /// Close a list if it's open. Also set loose if last line was blank |
1420 | /// and end current list if it's a lone, empty item |
1421 | fn finish_list(&mut self, ix: usize) { |
1422 | self.finish_empty_list_item(); |
1423 | if let Some(node_ix) = self.tree.peek_up() { |
1424 | if let ItemBody::List(_, _, _) = self.tree[node_ix].item.body { |
1425 | self.pop(ix); |
1426 | } |
1427 | } |
1428 | if self.last_line_blank { |
1429 | if let Some(node_ix) = self.tree.peek_grandparent() { |
1430 | if let ItemBody::List(ref mut is_tight, _, _) = self.tree[node_ix].item.body { |
1431 | *is_tight = false; |
1432 | } |
1433 | } |
1434 | self.last_line_blank = false; |
1435 | } |
1436 | } |
1437 | |
1438 | fn finish_empty_list_item(&mut self) { |
1439 | if let Some(begin_list_item) = self.begin_list_item { |
1440 | if self.last_line_blank { |
1441 | // A list item can begin with at most one blank line. |
1442 | if let Some(node_ix) = self.tree.peek_up() { |
1443 | if let ItemBody::ListItem(_) = self.tree[node_ix].item.body { |
1444 | self.pop(begin_list_item); |
1445 | } |
1446 | } |
1447 | } |
1448 | } |
1449 | self.begin_list_item = None; |
1450 | } |
1451 | |
1452 | /// Continue an existing list or start a new one if there's not an open |
1453 | /// list that matches. |
1454 | fn continue_list(&mut self, start: usize, ch: u8, index: u64) { |
1455 | self.finish_empty_list_item(); |
1456 | if let Some(node_ix) = self.tree.peek_up() { |
1457 | if let ItemBody::List(ref mut is_tight, existing_ch, _) = self.tree[node_ix].item.body { |
1458 | if existing_ch == ch { |
1459 | if self.last_line_blank { |
1460 | *is_tight = false; |
1461 | self.last_line_blank = false; |
1462 | } |
1463 | return; |
1464 | } |
1465 | } |
1466 | // TODO: this is not the best choice for end; maybe get end from last list item. |
1467 | self.finish_list(start); |
1468 | } |
1469 | self.tree.append(Item { |
1470 | start, |
1471 | end: 0, // will get set later |
1472 | body: ItemBody::List(true, ch, index), |
1473 | }); |
1474 | self.tree.push(); |
1475 | self.last_line_blank = false; |
1476 | } |
1477 | |
1478 | /// Parse a thematic break. |
1479 | /// |
1480 | /// Returns index of start of next line. |
1481 | fn parse_hrule(&mut self, hrule_size: usize, ix: usize) -> usize { |
1482 | self.tree.append(Item { |
1483 | start: ix, |
1484 | end: ix + hrule_size, |
1485 | body: ItemBody::Rule, |
1486 | }); |
1487 | ix + hrule_size |
1488 | } |
1489 | |
1490 | /// Parse an ATX heading. |
1491 | /// |
1492 | /// Returns index of start of next line. |
1493 | fn parse_atx_heading(&mut self, start: usize, atx_level: HeadingLevel) -> usize { |
1494 | let mut ix = start; |
1495 | let heading_ix = self.tree.append(Item { |
1496 | start, |
1497 | end: 0, // set later |
1498 | body: ItemBody::default(), // set later |
1499 | }); |
1500 | ix += atx_level as usize; |
1501 | // next char is space or eol (guaranteed by scan_atx_heading) |
1502 | let bytes = self.text.as_bytes(); |
1503 | if let Some(eol_bytes) = scan_eol(&bytes[ix..]) { |
1504 | self.tree[heading_ix].item.end = ix + eol_bytes; |
1505 | self.tree[heading_ix].item.body = ItemBody::Heading(atx_level, None); |
1506 | return ix + eol_bytes; |
1507 | } |
1508 | // skip leading spaces |
1509 | let skip_spaces = scan_whitespace_no_nl(&bytes[ix..]); |
1510 | ix += skip_spaces; |
1511 | |
1512 | // now handle the header text |
1513 | let header_start = ix; |
1514 | let header_node_idx = self.tree.push(); // so that we can set the endpoint later |
1515 | |
1516 | // trim the trailing attribute block before parsing the entire line, if necessary |
1517 | let (end, content_end, attrs) = if self.options.contains(Options::ENABLE_HEADING_ATTRIBUTES) |
1518 | { |
1519 | // the start of the next line is the end of the header since the |
1520 | // header cannot have line breaks |
1521 | let header_end = header_start + scan_nextline(&bytes[header_start..]); |
1522 | let (content_end, attrs) = |
1523 | self.extract_and_parse_heading_attribute_block(header_start, header_end); |
1524 | self.parse_line(ix, Some(content_end), TableParseMode::Disabled); |
1525 | (header_end, content_end, attrs) |
1526 | } else { |
1527 | let (line_ix, line_brk) = self.parse_line(ix, None, TableParseMode::Disabled); |
1528 | ix = line_ix; |
1529 | // Backslash at end is actually hard line break |
1530 | if let Some(Item { |
1531 | start, |
1532 | end, |
1533 | body: ItemBody::HardBreak(true), |
1534 | }) = line_brk |
1535 | { |
1536 | self.tree.append_text(start, end, false); |
1537 | } |
1538 | (ix, ix, None) |
1539 | }; |
1540 | self.tree[header_node_idx].item.end = end; |
1541 | |
1542 | // remove trailing matter from header text |
1543 | let mut empty_text_node = false; |
1544 | if let Some(cur_ix) = self.tree.cur() { |
1545 | // remove closing of the ATX heading |
1546 | let header_text = &bytes[header_start..content_end]; |
1547 | let mut limit = header_text |
1548 | .iter() |
1549 | .rposition(|&b| !(b == b' \n' || b == b' \r' || b == b' ' )) |
1550 | .map_or(0, |i| i + 1); |
1551 | let closer = header_text[..limit] |
1552 | .iter() |
1553 | .rposition(|&b| b != b'#' ) |
1554 | .map_or(0, |i| i + 1); |
1555 | if closer == 0 { |
1556 | limit = closer; |
1557 | } else { |
1558 | let spaces = scan_rev_while(&header_text[..closer], |b| b == b' ' ); |
1559 | if spaces > 0 { |
1560 | limit = closer - spaces; |
1561 | } |
1562 | } |
1563 | // if text is only spaces, then remove them |
1564 | self.tree[cur_ix].item.end = limit + header_start; |
1565 | |
1566 | // limit = 0 when text is empty after removing spaces |
1567 | if limit == 0 { |
1568 | empty_text_node = true; |
1569 | } |
1570 | } |
1571 | |
1572 | if empty_text_node { |
1573 | self.tree.remove_node(); |
1574 | } else { |
1575 | self.tree.pop(); |
1576 | } |
1577 | self.tree[heading_ix].item.body = ItemBody::Heading( |
1578 | atx_level, |
1579 | attrs.map(|attrs| self.allocs.allocate_heading(attrs)), |
1580 | ); |
1581 | |
1582 | end |
1583 | } |
1584 | |
1585 | /// Returns the number of bytes scanned on success. |
1586 | fn parse_footnote(&mut self, start: usize) -> Option<usize> { |
1587 | let bytes = &self.text.as_bytes()[start..]; |
1588 | if !bytes.starts_with(b"[^" ) { |
1589 | return None; |
1590 | } |
1591 | let (mut i, label) = if self.options.has_gfm_footnotes() { |
1592 | // GitHub doesn't allow footnote definition labels to contain line breaks. |
1593 | // It actually does allow this for link definitions under certain circumstances, |
1594 | // but for this it's simpler to avoid it. |
1595 | scan_link_label_rest(&self.text[start + 2..], &|_| None, self.tree.is_in_table())? |
1596 | } else { |
1597 | self.parse_refdef_label(start + 2)? |
1598 | }; |
1599 | if self.options.has_gfm_footnotes() && label.bytes().any(|b| b == b' \r' || b == b' \n' ) { |
1600 | // GitHub doesn't allow footnote definition labels to contain line breaks, |
1601 | // even if they're escaped. |
1602 | return None; |
1603 | } |
1604 | i += 2; |
1605 | if scan_ch(&bytes[i..], b':' ) == 0 { |
1606 | return None; |
1607 | } |
1608 | i += 1; |
1609 | self.finish_list(start); |
1610 | if let Some(node_ix) = self.tree.peek_up() { |
1611 | if let ItemBody::FootnoteDefinition(..) = self.tree[node_ix].item.body { |
1612 | // finish previous footnote if it's still open |
1613 | self.pop(start); |
1614 | } |
1615 | } |
1616 | if self.options.has_gfm_footnotes() { |
1617 | i += scan_whitespace_no_nl(&bytes[i..]); |
1618 | } |
1619 | self.allocs.footdefs.0.insert( |
1620 | UniCase::new(label.clone()), |
1621 | FootnoteDef { use_count: 0 }, |
1622 | ); |
1623 | self.tree.append(Item { |
1624 | start, |
1625 | end: 0, // will get set later |
1626 | // TODO: check whether the label here is strictly necessary |
1627 | body: ItemBody::FootnoteDefinition(self.allocs.allocate_cow(label)), |
1628 | }); |
1629 | self.tree.push(); |
1630 | Some(i) |
1631 | } |
1632 | |
1633 | /// Tries to parse a reference label, which can be interrupted by new blocks. |
1634 | /// On success, returns the number of bytes of the label and the label itself. |
1635 | fn parse_refdef_label(&self, start: usize) -> Option<(usize, CowStr<'a>)> { |
1636 | scan_link_label_rest( |
1637 | &self.text[start..], |
1638 | &|bytes| { |
1639 | let mut line_start = LineStart::new(bytes); |
1640 | let current_container = scan_containers( |
1641 | &self.tree, |
1642 | &mut line_start, |
1643 | self.options.has_gfm_footnotes(), |
1644 | ) == self.tree.spine_len(); |
1645 | if line_start.scan_space(4) { |
1646 | return Some(line_start.bytes_scanned()); |
1647 | } |
1648 | let bytes_scanned = line_start.bytes_scanned(); |
1649 | let suffix = &bytes[bytes_scanned..]; |
1650 | if self.scan_paragraph_interrupt(suffix, current_container) |
1651 | || (current_container && scan_setext_heading(suffix).is_some()) |
1652 | { |
1653 | None |
1654 | } else { |
1655 | Some(bytes_scanned) |
1656 | } |
1657 | }, |
1658 | self.tree.is_in_table(), |
1659 | ) |
1660 | } |
1661 | |
1662 | /// Returns number of bytes scanned, label and definition on success. |
1663 | fn parse_refdef_total(&mut self, start: usize) -> Option<(usize, LinkLabel<'a>, LinkDef<'a>)> { |
1664 | let bytes = &self.text.as_bytes()[start..]; |
1665 | if scan_ch(bytes, b'[' ) == 0 { |
1666 | return None; |
1667 | } |
1668 | let (mut i, label) = self.parse_refdef_label(start + 1)?; |
1669 | i += 1; |
1670 | if scan_ch(&bytes[i..], b':' ) == 0 { |
1671 | return None; |
1672 | } |
1673 | i += 1; |
1674 | let (bytecount, link_def) = self.scan_refdef(start, start + i)?; |
1675 | Some((bytecount + i, UniCase::new(label), link_def)) |
1676 | } |
1677 | |
1678 | /// Returns number of bytes and number of newlines |
1679 | fn scan_refdef_space(&self, bytes: &[u8], mut i: usize) -> Option<(usize, usize)> { |
1680 | let mut newlines = 0; |
1681 | loop { |
1682 | let whitespaces = scan_whitespace_no_nl(&bytes[i..]); |
1683 | i += whitespaces; |
1684 | if let Some(eol_bytes) = scan_eol(&bytes[i..]) { |
1685 | i += eol_bytes; |
1686 | newlines += 1; |
1687 | if newlines > 1 { |
1688 | return None; |
1689 | } |
1690 | } else { |
1691 | break; |
1692 | } |
1693 | let mut line_start = LineStart::new(&bytes[i..]); |
1694 | let current_container = scan_containers( |
1695 | &self.tree, |
1696 | &mut line_start, |
1697 | self.options.has_gfm_footnotes(), |
1698 | ) == self.tree.spine_len(); |
1699 | if !line_start.scan_space(4) { |
1700 | let suffix = &bytes[i + line_start.bytes_scanned()..]; |
1701 | if self.scan_paragraph_interrupt(suffix, current_container) |
1702 | || scan_setext_heading(suffix).is_some() |
1703 | { |
1704 | return None; |
1705 | } |
1706 | } |
1707 | i += line_start.bytes_scanned(); |
1708 | } |
1709 | Some((i, newlines)) |
1710 | } |
1711 | |
1712 | // returns (bytelength, title_str) |
1713 | fn scan_refdef_title<'t>(&self, text: &'t str) -> Option<(usize, CowStr<'t>)> { |
1714 | let bytes = text.as_bytes(); |
1715 | let closing_delim = match bytes.first()? { |
1716 | b' \'' => b' \'' , |
1717 | b'"' => b'"' , |
1718 | b'(' => b')' , |
1719 | _ => return None, |
1720 | }; |
1721 | let mut bytecount = 1; |
1722 | let mut linestart = 1; |
1723 | |
1724 | let mut linebuf = None; |
1725 | |
1726 | while let Some(&c) = bytes.get(bytecount) { |
1727 | match c { |
1728 | b'(' if closing_delim == b')' => { |
1729 | // https://spec.commonmark.org/0.30/#link-title |
1730 | // a sequence of zero or more characters between matching parentheses ((...)), |
1731 | // including a ( or ) character only if it is backslash-escaped. |
1732 | return None; |
1733 | } |
1734 | b' \n' | b' \r' => { |
1735 | // push text to line buffer |
1736 | // this is used to strip the block formatting: |
1737 | // |
1738 | // > [first]: http://example.com " |
1739 | // > second" |
1740 | // |
1741 | // should get turned into `<a href="example.com" title=" second">first</a>` |
1742 | let linebuf = if let Some(linebuf) = &mut linebuf { |
1743 | linebuf |
1744 | } else { |
1745 | linebuf = Some(String::new()); |
1746 | linebuf.as_mut().unwrap() |
1747 | }; |
1748 | linebuf.push_str(&text[linestart..bytecount]); |
1749 | linebuf.push(' \n' ); // normalize line breaks |
1750 | // skip line break |
1751 | bytecount += 1; |
1752 | if c == b' \r' && bytes.get(bytecount) == Some(&b' \n' ) { |
1753 | bytecount += 1; |
1754 | } |
1755 | let mut line_start = LineStart::new(&bytes[bytecount..]); |
1756 | let current_container = scan_containers( |
1757 | &self.tree, |
1758 | &mut line_start, |
1759 | self.options.has_gfm_footnotes(), |
1760 | ) == self.tree.spine_len(); |
1761 | if !line_start.scan_space(4) { |
1762 | let suffix = &bytes[bytecount + line_start.bytes_scanned()..]; |
1763 | if self.scan_paragraph_interrupt(suffix, current_container) |
1764 | || scan_setext_heading(suffix).is_some() |
1765 | { |
1766 | return None; |
1767 | } |
1768 | } |
1769 | line_start.scan_all_space(); |
1770 | bytecount += line_start.bytes_scanned(); |
1771 | linestart = bytecount; |
1772 | if scan_blank_line(&bytes[bytecount..]).is_some() { |
1773 | // blank line - not allowed |
1774 | return None; |
1775 | } |
1776 | } |
1777 | b' \\' => { |
1778 | bytecount += 1; |
1779 | if let Some(c) = bytes.get(bytecount) { |
1780 | if c != &b' \r' && c != &b' \n' { |
1781 | bytecount += 1; |
1782 | } |
1783 | } |
1784 | } |
1785 | c if c == closing_delim => { |
1786 | let cow = if let Some(mut linebuf) = linebuf { |
1787 | linebuf.push_str(&text[linestart..bytecount]); |
1788 | CowStr::from(linebuf) |
1789 | } else { |
1790 | CowStr::from(&text[linestart..bytecount]) |
1791 | }; |
1792 | return Some((bytecount + 1, cow)); |
1793 | } |
1794 | _ => { |
1795 | bytecount += 1; |
1796 | } |
1797 | } |
1798 | } |
1799 | None |
1800 | } |
1801 | |
1802 | /// Returns # of bytes and definition. |
1803 | /// Assumes the label of the reference including colon has already been scanned. |
1804 | fn scan_refdef(&self, span_start: usize, start: usize) -> Option<(usize, LinkDef<'a>)> { |
1805 | let bytes = self.text.as_bytes(); |
1806 | |
1807 | // whitespace between label and url (including up to one newline) |
1808 | let (mut i, _newlines) = self.scan_refdef_space(bytes, start)?; |
1809 | |
1810 | // scan link dest |
1811 | let (dest_length, dest) = scan_link_dest(self.text, i, LINK_MAX_NESTED_PARENS)?; |
1812 | if dest_length == 0 { |
1813 | return None; |
1814 | } |
1815 | let dest = unescape(dest, self.tree.is_in_table()); |
1816 | i += dest_length; |
1817 | |
1818 | // no title |
1819 | let mut backup = ( |
1820 | i - start, |
1821 | LinkDef { |
1822 | dest, |
1823 | title: None, |
1824 | span: span_start..i, |
1825 | }, |
1826 | ); |
1827 | |
1828 | // scan whitespace between dest and label |
1829 | let (mut i, newlines) = |
1830 | if let Some((new_i, mut newlines)) = self.scan_refdef_space(bytes, i) { |
1831 | if i == self.text.len() { |
1832 | newlines += 1; |
1833 | } |
1834 | if new_i == i && newlines == 0 { |
1835 | return None; |
1836 | } |
1837 | if newlines > 1 { |
1838 | return Some(backup); |
1839 | }; |
1840 | (new_i, newlines) |
1841 | } else { |
1842 | return Some(backup); |
1843 | }; |
1844 | |
1845 | // scan title |
1846 | // if this fails but newline == 1, return also a refdef without title |
1847 | if let Some((title_length, title)) = self.scan_refdef_title(&self.text[i..]) { |
1848 | i += title_length; |
1849 | if scan_blank_line(&bytes[i..]).is_some() { |
1850 | backup.0 = i - start; |
1851 | backup.1.span = span_start..i; |
1852 | backup.1.title = Some(unescape(title, self.tree.is_in_table())); |
1853 | return Some(backup); |
1854 | } |
1855 | } |
1856 | if newlines > 0 { |
1857 | Some(backup) |
1858 | } else { |
1859 | None |
1860 | } |
1861 | } |
1862 | |
1863 | /// Checks whether we should break a paragraph on the given input. |
1864 | fn scan_paragraph_interrupt(&self, bytes: &[u8], current_container: bool) -> bool { |
1865 | let has_footnote = self.options.contains(Options::ENABLE_FOOTNOTES); |
1866 | if scan_paragraph_interrupt_no_table(bytes, current_container, has_footnote, &self.tree) { |
1867 | return true; |
1868 | } |
1869 | // pulldown-cmark allows heavy tables, that have a `|` on the header row, |
1870 | // to interrupt paragraphs. |
1871 | // |
1872 | // ```markdown |
1873 | // This is a table |
1874 | // | a | b | c | |
1875 | // |---|---|---| |
1876 | // | d | e | f | |
1877 | // |
1878 | // This is not a table |
1879 | // a | b | c |
1880 | // ---|---|--- |
1881 | // d | e | f |
1882 | // ``` |
1883 | if !self.options.contains(Options::ENABLE_TABLES) || !bytes.starts_with(b"|" ) { |
1884 | return false; |
1885 | } |
1886 | |
1887 | // Checking if something's a valid table or not requires looking at two lines. |
1888 | // First line, count unescaped pipes. |
1889 | let mut pipes = 0; |
1890 | let mut next_line_ix = 0; |
1891 | let mut bsesc = false; |
1892 | let mut last_pipe_ix = 0; |
1893 | for (i, &byte) in bytes.iter().enumerate() { |
1894 | match byte { |
1895 | b' \\' => { |
1896 | bsesc = true; |
1897 | continue; |
1898 | } |
1899 | b'|' if !bsesc => { |
1900 | pipes += 1; |
1901 | last_pipe_ix = i; |
1902 | } |
1903 | b' \r' | b' \n' => { |
1904 | next_line_ix = i + scan_eol(&bytes[i..]).unwrap(); |
1905 | break; |
1906 | } |
1907 | _ => {} |
1908 | } |
1909 | bsesc = false; |
1910 | } |
1911 | |
1912 | // scan_eol can't return 0, so this can't be zero |
1913 | if next_line_ix == 0 { |
1914 | return false; |
1915 | } |
1916 | |
1917 | // Scan the table head. The part that looks like: |
1918 | // |
1919 | // |---|---|---| |
1920 | // |
1921 | // Also scan any containing items, since it's on its own line, and |
1922 | // might be nested inside a block quote or something |
1923 | // |
1924 | // > Table: First |
1925 | // > | first col | second col | |
1926 | // > |-----------|------------| |
1927 | // ^ |
1928 | // | need to skip over the `>` when checking for the table |
1929 | let mut line_start = LineStart::new(&bytes[next_line_ix..]); |
1930 | if scan_containers( |
1931 | &self.tree, |
1932 | &mut line_start, |
1933 | self.options.has_gfm_footnotes(), |
1934 | ) != self.tree.spine_len() |
1935 | { |
1936 | return false; |
1937 | } |
1938 | let table_head_ix = next_line_ix + line_start.bytes_scanned(); |
1939 | let (table_head_bytes, alignment) = scan_table_head(&bytes[table_head_ix..]); |
1940 | |
1941 | if table_head_bytes == 0 { |
1942 | return false; |
1943 | } |
1944 | |
1945 | // computing header count from number of pipes |
1946 | let header_count = count_header_cols(bytes, pipes, 0, last_pipe_ix); |
1947 | |
1948 | // make sure they match the number of columns we find in separator line |
1949 | alignment.len() == header_count |
1950 | } |
1951 | |
1952 | /// Extracts and parses a heading attribute block if exists. |
1953 | /// |
1954 | /// Returns `(end_offset_of_heading_content, (id, classes))`. |
1955 | /// |
1956 | /// If `header_end` is less than or equal to `header_start`, the given |
1957 | /// input is considered as empty. |
1958 | fn extract_and_parse_heading_attribute_block( |
1959 | &mut self, |
1960 | header_start: usize, |
1961 | header_end: usize, |
1962 | ) -> (usize, Option<HeadingAttributes<'a>>) { |
1963 | if !self.options.contains(Options::ENABLE_HEADING_ATTRIBUTES) { |
1964 | return (header_end, None); |
1965 | } |
1966 | |
1967 | // extract the trailing attribute block |
1968 | let header_bytes = &self.text.as_bytes()[header_start..header_end]; |
1969 | let (content_len, attr_block_range_rel) = |
1970 | extract_attribute_block_content_from_header_text(header_bytes); |
1971 | let content_end = header_start + content_len; |
1972 | let attrs = attr_block_range_rel.and_then(|r| { |
1973 | parse_inside_attribute_block( |
1974 | &self.text[(header_start + r.start)..(header_start + r.end)], |
1975 | ) |
1976 | }); |
1977 | (content_end, attrs) |
1978 | } |
1979 | } |
1980 | |
1981 | /// Scanning modes for `Parser`'s `parse_line` method. |
1982 | #[derive (PartialEq, Eq, Copy, Clone)] |
1983 | enum TableParseMode { |
1984 | /// Inside a paragraph, scanning for table headers. |
1985 | Scan, |
1986 | /// Inside a table. |
1987 | Active, |
1988 | /// Inside a paragraph, not scanning for table headers. |
1989 | Disabled, |
1990 | } |
1991 | |
1992 | /// Computes the number of header columns in a table line by computing the number of dividing pipes |
1993 | /// that aren't followed or preceded by whitespace. |
1994 | fn count_header_cols( |
1995 | bytes: &[u8], |
1996 | mut pipes: usize, |
1997 | mut start: usize, |
1998 | last_pipe_ix: usize, |
1999 | ) -> usize { |
2000 | // was first pipe preceded by whitespace? if so, subtract one |
2001 | start += scan_whitespace_no_nl(&bytes[start..]); |
2002 | if bytes[start] == b'|' { |
2003 | pipes -= 1; |
2004 | } |
2005 | |
2006 | // was last pipe followed by whitespace? if so, sub one |
2007 | if scan_blank_line(&bytes[(last_pipe_ix + 1)..]).is_some() { |
2008 | pipes |
2009 | } else { |
2010 | pipes + 1 |
2011 | } |
2012 | } |
2013 | |
2014 | /// Checks whether we should break a paragraph on the given input. |
2015 | /// |
2016 | /// Use `FirstPass::scan_paragraph_interrupt` in any context that allows |
2017 | /// tables to interrupt the paragraph. |
2018 | fn scan_paragraph_interrupt_no_table( |
2019 | bytes: &[u8], |
2020 | current_container: bool, |
2021 | has_footnote: bool, |
2022 | tree: &Tree<Item>, |
2023 | ) -> bool { |
2024 | scan_eol(bytes).is_some() |
2025 | || scan_hrule(bytes).is_ok() |
2026 | || scan_atx_heading(bytes).is_some() |
2027 | || scan_code_fence(bytes).is_some() |
2028 | || scan_blockquote_start(bytes).is_some() |
2029 | || scan_listitem(bytes).map_or(false, |(ix, delim, index, _)| { |
2030 | ! current_container || |
2031 | tree.is_in_table() || |
2032 | // we don't allow interruption by either empty lists or |
2033 | // numbered lists starting at an index other than 1 |
2034 | (delim == b'*' || delim == b'-' || delim == b'+' || index == 1) |
2035 | && (scan_blank_line(&bytes[ix..]).is_none()) |
2036 | }) |
2037 | || bytes.starts_with(b"<" ) |
2038 | && (get_html_end_tag(&bytes[1..]).is_some() || starts_html_block_type_6(&bytes[1..])) |
2039 | || (has_footnote |
2040 | && bytes.starts_with(b"[^" ) |
2041 | && scan_link_label_rest( |
2042 | std::str::from_utf8(&bytes[2..]).unwrap(), |
2043 | &|_| None, |
2044 | tree.is_in_table(), |
2045 | ) |
2046 | .map_or(false, |(len, _)| bytes.get(2 + len) == Some(&b':' ))) |
2047 | } |
2048 | |
2049 | /// Assumes `text_bytes` is preceded by `<`. |
2050 | fn get_html_end_tag(text_bytes: &[u8]) -> Option<&'static str> { |
2051 | static BEGIN_TAGS: &[&[u8]; 4] = &[b"pre" , b"style" , b"script" , b"textarea" ]; |
2052 | static ST_BEGIN_TAGS: &[&[u8]; 3] = &[b"!--" , b"?" , b"![CDATA[" ]; |
2053 | |
2054 | for (beg_tag, end_tag) in BEGIN_TAGS |
2055 | .iter() |
2056 | .zip(["</pre>" , "</style>" , "</script>" , "</textarea>" ].iter()) |
2057 | { |
2058 | let tag_len = beg_tag.len(); |
2059 | |
2060 | if text_bytes.len() < tag_len { |
2061 | // begin tags are increasing in size |
2062 | break; |
2063 | } |
2064 | |
2065 | if !text_bytes[..tag_len].eq_ignore_ascii_case(beg_tag) { |
2066 | continue; |
2067 | } |
2068 | |
2069 | // Must either be the end of the line... |
2070 | if text_bytes.len() == tag_len { |
2071 | return Some(end_tag); |
2072 | } |
2073 | |
2074 | // ...or be followed by whitespace, newline, or '>'. |
2075 | let s = text_bytes[tag_len]; |
2076 | if is_ascii_whitespace(s) || s == b'>' { |
2077 | return Some(end_tag); |
2078 | } |
2079 | } |
2080 | |
2081 | for (beg_tag, end_tag) in ST_BEGIN_TAGS.iter().zip(["-->" , "?>" , "]]>" ].iter()) { |
2082 | if text_bytes.starts_with(beg_tag) { |
2083 | return Some(end_tag); |
2084 | } |
2085 | } |
2086 | |
2087 | if text_bytes.len() > 1 && text_bytes[0] == b'!' && text_bytes[1].is_ascii_alphabetic() { |
2088 | Some(">" ) |
2089 | } else { |
2090 | None |
2091 | } |
2092 | } |
2093 | |
2094 | // https://english.stackexchange.com/a/285573 |
2095 | fn surgerize_tight_list(tree: &mut Tree<Item>, list_ix: TreeIndex) { |
2096 | let mut list_item = tree[list_ix].child; |
2097 | while let Some(listitem_ix) = list_item { |
2098 | // first child is special, controls how we repoint list_item.child |
2099 | let list_item_firstborn = tree[listitem_ix].child; |
2100 | |
2101 | // Check that list item has children - this is not necessarily the case! |
2102 | if let Some(firstborn_ix) = list_item_firstborn { |
2103 | if let ItemBody::Paragraph = tree[firstborn_ix].item.body { |
2104 | tree[listitem_ix].child = tree[firstborn_ix].child; |
2105 | } |
2106 | |
2107 | let mut list_item_child = Some(firstborn_ix); |
2108 | let mut node_to_repoint = None; |
2109 | while let Some(child_ix) = list_item_child { |
2110 | // surgerize paragraphs |
2111 | let repoint_ix = if let ItemBody::Paragraph = tree[child_ix].item.body { |
2112 | if let Some(child_firstborn) = tree[child_ix].child { |
2113 | if let Some(repoint_ix) = node_to_repoint { |
2114 | tree[repoint_ix].next = Some(child_firstborn); |
2115 | } |
2116 | let mut child_lastborn = child_firstborn; |
2117 | while let Some(lastborn_next_ix) = tree[child_lastborn].next { |
2118 | child_lastborn = lastborn_next_ix; |
2119 | } |
2120 | child_lastborn |
2121 | } else { |
2122 | child_ix |
2123 | } |
2124 | } else { |
2125 | child_ix |
2126 | }; |
2127 | |
2128 | node_to_repoint = Some(repoint_ix); |
2129 | tree[repoint_ix].next = tree[child_ix].next; |
2130 | list_item_child = tree[child_ix].next; |
2131 | } |
2132 | } |
2133 | |
2134 | list_item = tree[listitem_ix].next; |
2135 | } |
2136 | } |
2137 | |
2138 | /// Determines whether the delimiter run starting at given index is |
2139 | /// left-flanking, as defined by the commonmark spec (and isn't intraword |
2140 | /// for _ delims). |
2141 | /// suffix is &s[ix..], which is passed in as an optimization, since taking |
2142 | /// a string subslice is O(n). |
2143 | fn delim_run_can_open( |
2144 | s: &str, |
2145 | suffix: &str, |
2146 | run_len: usize, |
2147 | ix: usize, |
2148 | mode: TableParseMode, |
2149 | ) -> bool { |
2150 | let next_char = if let Some(c) = suffix.chars().nth(run_len) { |
2151 | c |
2152 | } else { |
2153 | return false; |
2154 | }; |
2155 | if next_char.is_whitespace() { |
2156 | return false; |
2157 | } |
2158 | if ix == 0 { |
2159 | return true; |
2160 | } |
2161 | if mode == TableParseMode::Active { |
2162 | if s[..ix].ends_with('|' ) && !s[..ix].ends_with(r"\|" ) { |
2163 | return true; |
2164 | } |
2165 | if next_char == '|' { |
2166 | return false; |
2167 | } |
2168 | } |
2169 | let delim = suffix.chars().next().unwrap(); |
2170 | // `*` and `~~` can be intraword, `_` and `~` cannot |
2171 | if delim == '*' && !is_punctuation(next_char) { |
2172 | return true; |
2173 | } |
2174 | if delim == '~' && run_len > 1 { |
2175 | return true; |
2176 | } |
2177 | let prev_char = s[..ix].chars().last().unwrap(); |
2178 | if delim == '~' && prev_char == '~' && !is_punctuation(next_char) { |
2179 | return true; |
2180 | } |
2181 | |
2182 | prev_char.is_whitespace() |
2183 | || is_punctuation(prev_char) && (delim != ' \'' || ![']' , ')' ].contains(&prev_char)) |
2184 | } |
2185 | |
2186 | /// Determines whether the delimiter run starting at given index is |
2187 | /// right-flanking, as defined by the commonmark spec (and isn't intraword |
2188 | /// for _ delims) |
2189 | fn delim_run_can_close( |
2190 | s: &str, |
2191 | suffix: &str, |
2192 | run_len: usize, |
2193 | ix: usize, |
2194 | mode: TableParseMode, |
2195 | ) -> bool { |
2196 | if ix == 0 { |
2197 | return false; |
2198 | } |
2199 | let prev_char = s[..ix].chars().last().unwrap(); |
2200 | if prev_char.is_whitespace() { |
2201 | return false; |
2202 | } |
2203 | let next_char = if let Some(c) = suffix.chars().nth(run_len) { |
2204 | c |
2205 | } else { |
2206 | return true; |
2207 | }; |
2208 | if mode == TableParseMode::Active { |
2209 | if s[..ix].ends_with('|' ) && !s[..ix].ends_with(r"\|" ) { |
2210 | return false; |
2211 | } |
2212 | if next_char == '|' { |
2213 | return true; |
2214 | } |
2215 | } |
2216 | let delim = suffix.chars().next().unwrap(); |
2217 | // `*` and `~~` can be intraword, `_` and `~` cannot |
2218 | if (delim == '*' || (delim == '~' && run_len > 1)) && !is_punctuation(prev_char) { |
2219 | return true; |
2220 | } |
2221 | if delim == '~' && prev_char == '~' { |
2222 | return true; |
2223 | } |
2224 | |
2225 | next_char.is_whitespace() || is_punctuation(next_char) |
2226 | } |
2227 | |
2228 | fn create_lut(options: &Options) -> LookupTable { |
2229 | #[cfg (all(target_arch = "x86_64" , feature = "simd" ))] |
2230 | { |
2231 | LookupTable { |
2232 | simd: simd::compute_lookup(options), |
2233 | scalar: special_bytes(options), |
2234 | } |
2235 | } |
2236 | #[cfg (not(all(target_arch = "x86_64" , feature = "simd" )))] |
2237 | { |
2238 | special_bytes(options) |
2239 | } |
2240 | } |
2241 | |
2242 | fn special_bytes(options: &Options) -> [bool; 256] { |
2243 | let mut bytes = [false; 256]; |
2244 | let standard_bytes = [ |
2245 | b' \n' , b' \r' , b'*' , b'_' , b'&' , b' \\' , b'[' , b']' , b'<' , b'!' , b'`' , |
2246 | ]; |
2247 | |
2248 | for &byte in &standard_bytes { |
2249 | bytes[byte as usize] = true; |
2250 | } |
2251 | if options.contains(Options::ENABLE_TABLES) { |
2252 | bytes[b'|' as usize] = true; |
2253 | } |
2254 | if options.contains(Options::ENABLE_STRIKETHROUGH) { |
2255 | bytes[b'~' as usize] = true; |
2256 | } |
2257 | if options.contains(Options::ENABLE_MATH) { |
2258 | bytes[b'$' as usize] = true; |
2259 | bytes[b'{' as usize] = true; |
2260 | bytes[b'}' as usize] = true; |
2261 | } |
2262 | if options.contains(Options::ENABLE_SMART_PUNCTUATION) { |
2263 | for &byte in &[b'.' , b'-' , b'"' , b' \'' ] { |
2264 | bytes[byte as usize] = true; |
2265 | } |
2266 | } |
2267 | |
2268 | bytes |
2269 | } |
2270 | |
2271 | enum LoopInstruction<T> { |
2272 | /// Continue looking for more special bytes, but skip next few bytes. |
2273 | ContinueAndSkip(usize), |
2274 | /// Break looping immediately, returning with the given index and value. |
2275 | BreakAtWith(usize, T), |
2276 | } |
2277 | |
2278 | #[cfg (all(target_arch = "x86_64" , feature = "simd" ))] |
2279 | struct LookupTable { |
2280 | simd: [u8; 16], |
2281 | scalar: [bool; 256], |
2282 | } |
2283 | |
2284 | #[cfg (not(all(target_arch = "x86_64" , feature = "simd" )))] |
2285 | type LookupTable = [bool; 256]; |
2286 | |
2287 | /// This function walks the byte slices from the given index and |
2288 | /// calls the callback function on all bytes (and their indices) that are in the following set: |
2289 | /// `` ` ``, `\`, `&`, `*`, `_`, `~`, `!`, `<`, `[`, `]`, `|`, `\r`, `\n` |
2290 | /// It is guaranteed not call the callback on other bytes. |
2291 | /// Whenever `callback(ix, byte)` returns a `ContinueAndSkip(n)` value, the callback |
2292 | /// will not be called with an index that is less than `ix + n + 1`. |
2293 | /// When the callback returns a `BreakAtWith(end_ix, opt+val)`, no more callbacks will be |
2294 | /// called and the function returns immediately with the return value `(end_ix, opt_val)`. |
2295 | /// If `BreakAtWith(..)` is never returned, this function will return the first |
2296 | /// index that is outside the byteslice bound and a `None` value. |
2297 | fn iterate_special_bytes<F, T>( |
2298 | lut: &LookupTable, |
2299 | bytes: &[u8], |
2300 | ix: usize, |
2301 | callback: F, |
2302 | ) -> (usize, Option<T>) |
2303 | where |
2304 | F: FnMut(usize, u8) -> LoopInstruction<Option<T>>, |
2305 | { |
2306 | #[cfg (all(target_arch = "x86_64" , feature = "simd" ))] |
2307 | { |
2308 | simd::iterate_special_bytes(lut, bytes, ix, callback) |
2309 | } |
2310 | #[cfg (not(all(target_arch = "x86_64" , feature = "simd" )))] |
2311 | { |
2312 | scalar_iterate_special_bytes(lut, bytes, ix, callback) |
2313 | } |
2314 | } |
2315 | |
2316 | fn scalar_iterate_special_bytes<F, T>( |
2317 | lut: &[bool; 256], |
2318 | bytes: &[u8], |
2319 | mut ix: usize, |
2320 | mut callback: F, |
2321 | ) -> (usize, Option<T>) |
2322 | where |
2323 | F: FnMut(usize, u8) -> LoopInstruction<Option<T>>, |
2324 | { |
2325 | while ix < bytes.len() { |
2326 | let b: u8 = bytes[ix]; |
2327 | if lut[b as usize] { |
2328 | match callback(ix, b) { |
2329 | LoopInstruction::ContinueAndSkip(skip: usize) => { |
2330 | ix += skip; |
2331 | } |
2332 | LoopInstruction::BreakAtWith(ix: usize, val: Option) => { |
2333 | return (ix, val); |
2334 | } |
2335 | } |
2336 | } |
2337 | ix += 1; |
2338 | } |
2339 | |
2340 | (ix, None) |
2341 | } |
2342 | |
2343 | /// Split the usual heading content range and the content inside the trailing attribute block. |
2344 | /// |
2345 | /// Returns `(leading_content_len, Option<trailing_attr_block_range>)`. |
2346 | /// |
2347 | /// Note that `trailing_attr_block_range` will be empty range when the block |
2348 | /// is `{}`, since the range is content inside the wrapping `{` and `}`. |
2349 | /// |
2350 | /// The closing `}` of an attribute block can have trailing whitespaces. |
2351 | /// They are automatically trimmed when the attribute block is being searched. |
2352 | /// |
2353 | /// However, this method does not trim the trailing whitespaces of heading content. |
2354 | /// It is callers' responsibility to trim them if necessary. |
2355 | fn extract_attribute_block_content_from_header_text( |
2356 | heading: &[u8], |
2357 | ) -> (usize, Option<Range<usize>>) { |
2358 | let heading_len = heading.len(); |
2359 | let mut ix = heading_len; |
2360 | ix -= scan_rev_while(heading, |b| { |
2361 | b == b' \n' || b == b' \r' || b == b' ' || b == b' \t' |
2362 | }); |
2363 | if ix == 0 { |
2364 | return (heading_len, None); |
2365 | } |
2366 | |
2367 | let attr_block_close = ix - 1; |
2368 | if heading.get(attr_block_close) != Some(&b'}' ) { |
2369 | // The last character is not `}`. No attribute blocks found. |
2370 | return (heading_len, None); |
2371 | } |
2372 | // move cursor before the closing right brace (`}`) |
2373 | ix -= 1; |
2374 | |
2375 | ix -= scan_rev_while(&heading[..ix], |b| { |
2376 | // Characters to be excluded: |
2377 | // * `{` and `}`: special characters to open and close an attribute block. |
2378 | // * `\\`: a special character to escape many characters and disable some syntaxes. |
2379 | // + Handling of this escape character differs among markdown processors. |
2380 | // + Escaped characters will be separate text node from neighbors, so |
2381 | // it is not easy to handle unescaped string and trim the trailing block. |
2382 | // * `<` and `>`: special characters to start and end HTML tag. |
2383 | // + No known processors converts `{#<i>foo</i>}` into |
2384 | // `id="<i>foo</>"` as of this writing, so hopefully |
2385 | // this restriction won't cause compatibility issues. |
2386 | // * `\n` and `\r`: a newline character. |
2387 | // + Setext heading can have multiple lines. However it is hard to support |
2388 | // attribute blocks that have newline inside, since the parsing proceeds line by |
2389 | // line and lines will be separate nodes even they are logically a single text. |
2390 | !matches!(b, b'{' | b'}' | b'<' | b'>' | b' \\' | b' \n' | b' \r' ) |
2391 | }); |
2392 | if ix == 0 { |
2393 | // `{` is not found. No attribute blocks available. |
2394 | return (heading_len, None); |
2395 | } |
2396 | let attr_block_open = ix - 1; |
2397 | if heading[attr_block_open] != b'{' { |
2398 | // `{` is not found. No attribute blocks available. |
2399 | return (heading_len, None); |
2400 | } |
2401 | |
2402 | (attr_block_open, Some(ix..attr_block_close)) |
2403 | } |
2404 | |
2405 | /// Parses an attribute block content, such as `.class1 #id .class2`. |
2406 | /// |
2407 | /// Returns `(id, classes)`. |
2408 | /// |
2409 | /// It is callers' responsibility to find opening and closing characters of the attribute |
2410 | /// block. Usually [`extract_attribute_block_content_from_header_text`] function does it for you. |
2411 | /// |
2412 | /// Note that this parsing requires explicit whitespace separators between |
2413 | /// attributes. This is intentional design with the reasons below: |
2414 | /// |
2415 | /// * to keep conversion simple and easy to understand for any possible input, |
2416 | /// * to avoid adding less obvious conversion rule that can reduce compatibility |
2417 | /// with other implementations more, and |
2418 | /// * to follow the major design of implementations with the support for the |
2419 | /// attribute blocks extension (as of this writing). |
2420 | /// |
2421 | /// See also: [`Options::ENABLE_HEADING_ATTRIBUTES`]. |
2422 | /// |
2423 | /// [`Options::ENABLE_HEADING_ATTRIBUTES`]: `crate::Options::ENABLE_HEADING_ATTRIBUTES` |
2424 | fn parse_inside_attribute_block(inside_attr_block: &str) -> Option<HeadingAttributes<'_>> { |
2425 | let mut id = None; |
2426 | let mut classes = Vec::new(); |
2427 | let mut attrs = Vec::new(); |
2428 | |
2429 | for attr in inside_attr_block.split_ascii_whitespace() { |
2430 | // iterator returned by `str::split_ascii_whitespace` never emits empty |
2431 | // strings, so taking first byte won't panic. |
2432 | if attr.len() > 1 { |
2433 | let first_byte = attr.as_bytes()[0]; |
2434 | if first_byte == b'#' { |
2435 | id = Some(attr[1..].into()); |
2436 | } else if first_byte == b'.' { |
2437 | classes.push(attr[1..].into()); |
2438 | } else { |
2439 | let split = attr.split_once('=' ); |
2440 | if let Some((key, value)) = split { |
2441 | attrs.push((key.into(), Some(value.into()))); |
2442 | } else { |
2443 | attrs.push((attr.into(), None)); |
2444 | } |
2445 | } |
2446 | } |
2447 | } |
2448 | |
2449 | Some(HeadingAttributes { id, classes, attrs }) |
2450 | } |
2451 | |
2452 | #[cfg (all(target_arch = "x86_64" , feature = "simd" ))] |
2453 | mod simd { |
2454 | //! SIMD byte scanning logic. |
2455 | //! |
2456 | //! This module provides functions that allow walking through byteslices, calling |
2457 | //! provided callback functions on special bytes and their indices using SIMD. |
2458 | //! The byteset is defined in `compute_lookup`. |
2459 | //! |
2460 | //! The idea is to load in a chunk of 16 bytes and perform a lookup into a set of |
2461 | //! bytes on all the bytes in this chunk simultaneously. We produce a 16 bit bitmask |
2462 | //! from this and call the callback on every index corresponding to a 1 in this mask |
2463 | //! before moving on to the next chunk. This allows us to move quickly when there |
2464 | //! are no or few matches. |
2465 | //! |
2466 | //! The table lookup is inspired by this [great overview]. However, since all of the |
2467 | //! bytes we're interested in are ASCII, we don't quite need the full generality of |
2468 | //! the universal algorithm and are hence able to skip a few instructions. |
2469 | //! |
2470 | //! [great overview]: http://0x80.pl/articles/simd-byte-lookup.html |
2471 | |
2472 | use super::{LookupTable, LoopInstruction}; |
2473 | use crate::Options; |
2474 | use core::arch::x86_64::*; |
2475 | |
2476 | const VECTOR_SIZE: usize = std::mem::size_of::<__m128i>(); |
2477 | |
2478 | /// Generates a lookup table containing the bitmaps for our |
2479 | /// special marker bytes. This is effectively a 128 element 2d bitvector, |
2480 | /// that can be indexed by a four bit row index (the lower nibble) |
2481 | /// and a three bit column index (upper nibble). |
2482 | pub(super) fn compute_lookup(options: &Options) -> [u8; 16] { |
2483 | let mut lookup = [0u8; 16]; |
2484 | let standard_bytes = [ |
2485 | b' \n' , b' \r' , b'*' , b'_' , b'&' , b' \\' , b'[' , b']' , b'<' , b'!' , b'`' , |
2486 | ]; |
2487 | |
2488 | for &byte in &standard_bytes { |
2489 | add_lookup_byte(&mut lookup, byte); |
2490 | } |
2491 | if options.contains(Options::ENABLE_TABLES) { |
2492 | add_lookup_byte(&mut lookup, b'|' ); |
2493 | } |
2494 | if options.contains(Options::ENABLE_STRIKETHROUGH) { |
2495 | add_lookup_byte(&mut lookup, b'~' ); |
2496 | } |
2497 | if options.contains(Options::ENABLE_MATH) { |
2498 | add_lookup_byte(&mut lookup, b'$' ); |
2499 | add_lookup_byte(&mut lookup, b'{' ); |
2500 | add_lookup_byte(&mut lookup, b'}' ); |
2501 | } |
2502 | if options.contains(Options::ENABLE_SMART_PUNCTUATION) { |
2503 | for &byte in &[b'.' , b'-' , b'"' , b' \'' ] { |
2504 | add_lookup_byte(&mut lookup, byte); |
2505 | } |
2506 | } |
2507 | |
2508 | lookup |
2509 | } |
2510 | |
2511 | fn add_lookup_byte(lookup: &mut [u8; 16], byte: u8) { |
2512 | lookup[(byte & 0x0f) as usize] |= 1 << (byte >> 4); |
2513 | } |
2514 | |
2515 | /// Computes a bit mask for the given byteslice starting from the given index, |
2516 | /// where the 16 least significant bits indicate (by value of 1) whether or not |
2517 | /// there is a special character at that byte position. The least significant bit |
2518 | /// corresponds to `bytes[ix]` and the most significant bit corresponds to |
2519 | /// `bytes[ix + 15]`. |
2520 | /// It is only safe to call this function when `bytes.len() >= ix + VECTOR_SIZE`. |
2521 | #[target_feature (enable = "ssse3" )] |
2522 | #[inline ] |
2523 | unsafe fn compute_mask(lut: &[u8; 16], bytes: &[u8], ix: usize) -> i32 { |
2524 | debug_assert!(bytes.len() >= ix + VECTOR_SIZE); |
2525 | |
2526 | let bitmap = _mm_loadu_si128(lut.as_ptr() as *const __m128i); |
2527 | // Small lookup table to compute single bit bitshifts |
2528 | // for 16 bytes at once. |
2529 | let bitmask_lookup = |
2530 | _mm_setr_epi8(1, 2, 4, 8, 16, 32, 64, -128, -1, -1, -1, -1, -1, -1, -1, -1); |
2531 | |
2532 | // Load input from memory. |
2533 | let raw_ptr = bytes.as_ptr().add(ix) as *const __m128i; |
2534 | let input = _mm_loadu_si128(raw_ptr); |
2535 | // Compute the bitmap using the bottom nibble as an index |
2536 | // into the lookup table. Note that non-ascii bytes will have |
2537 | // their most significant bit set and will map to lookup[0]. |
2538 | let bitset = _mm_shuffle_epi8(bitmap, input); |
2539 | // Compute the high nibbles of the input using a 16-bit rightshift of four |
2540 | // and a mask to prevent most-significant bit issues. |
2541 | let higher_nibbles = _mm_and_si128(_mm_srli_epi16(input, 4), _mm_set1_epi8(0x0f)); |
2542 | // Create a bitmask for the bitmap by perform a left shift of the value |
2543 | // of the higher nibble. Bytes with their most significant set are mapped |
2544 | // to -1 (all ones). |
2545 | let bitmask = _mm_shuffle_epi8(bitmask_lookup, higher_nibbles); |
2546 | // Test the bit of the bitmap by AND'ing the bitmap and the mask together. |
2547 | let tmp = _mm_and_si128(bitset, bitmask); |
2548 | // Check whether the result was not null. NEQ is not a SIMD intrinsic, |
2549 | // but comparing to the bitmask is logically equivalent. This also prevents us |
2550 | // from matching any non-ASCII bytes since none of the bitmaps were all ones |
2551 | // (-1). |
2552 | let result = _mm_cmpeq_epi8(tmp, bitmask); |
2553 | |
2554 | // Return the resulting bitmask. |
2555 | _mm_movemask_epi8(result) |
2556 | } |
2557 | |
2558 | /// Calls callback on byte indices and their value. |
2559 | /// Breaks when callback returns LoopInstruction::BreakAtWith(ix, val). And skips the |
2560 | /// number of bytes in callback return value otherwise. |
2561 | /// Returns the final index and a possible break value. |
2562 | pub(super) fn iterate_special_bytes<F, T>( |
2563 | lut: &LookupTable, |
2564 | bytes: &[u8], |
2565 | ix: usize, |
2566 | callback: F, |
2567 | ) -> (usize, Option<T>) |
2568 | where |
2569 | F: FnMut(usize, u8) -> LoopInstruction<Option<T>>, |
2570 | { |
2571 | if is_x86_feature_detected!("ssse3" ) && bytes.len() >= VECTOR_SIZE { |
2572 | unsafe { simd_iterate_special_bytes(&lut.simd, bytes, ix, callback) } |
2573 | } else { |
2574 | super::scalar_iterate_special_bytes(&lut.scalar, bytes, ix, callback) |
2575 | } |
2576 | } |
2577 | |
2578 | /// Calls the callback function for every 1 in the given bitmask with |
2579 | /// the index `offset + ix`, where `ix` is the position of the 1 in the mask. |
2580 | /// Returns `Ok(ix)` to continue from index `ix`, `Err((end_ix, opt_val)` to break with |
2581 | /// final index `end_ix` and optional value `opt_val`. |
2582 | unsafe fn process_mask<F, T>( |
2583 | mut mask: i32, |
2584 | bytes: &[u8], |
2585 | mut offset: usize, |
2586 | callback: &mut F, |
2587 | ) -> Result<usize, (usize, Option<T>)> |
2588 | where |
2589 | F: FnMut(usize, u8) -> LoopInstruction<Option<T>>, |
2590 | { |
2591 | while mask != 0 { |
2592 | let mask_ix = mask.trailing_zeros() as usize; |
2593 | offset += mask_ix; |
2594 | match callback(offset, *bytes.get_unchecked(offset)) { |
2595 | LoopInstruction::ContinueAndSkip(skip) => { |
2596 | offset += skip + 1; |
2597 | mask = mask.wrapping_shr((skip + 1 + mask_ix) as u32); |
2598 | } |
2599 | LoopInstruction::BreakAtWith(ix, val) => return Err((ix, val)), |
2600 | } |
2601 | } |
2602 | Ok(offset) |
2603 | } |
2604 | |
2605 | #[target_feature (enable = "ssse3" )] |
2606 | /// Important: only call this function when `bytes.len() >= 16`. Doing |
2607 | /// so otherwise may exhibit undefined behaviour. |
2608 | unsafe fn simd_iterate_special_bytes<F, T>( |
2609 | lut: &[u8; 16], |
2610 | bytes: &[u8], |
2611 | mut ix: usize, |
2612 | mut callback: F, |
2613 | ) -> (usize, Option<T>) |
2614 | where |
2615 | F: FnMut(usize, u8) -> LoopInstruction<Option<T>>, |
2616 | { |
2617 | debug_assert!(bytes.len() >= VECTOR_SIZE); |
2618 | let upperbound = bytes.len() - VECTOR_SIZE; |
2619 | |
2620 | while ix < upperbound { |
2621 | let mask = compute_mask(lut, bytes, ix); |
2622 | let block_start = ix; |
2623 | ix = match process_mask(mask, bytes, ix, &mut callback) { |
2624 | Ok(ix) => std::cmp::max(ix, VECTOR_SIZE + block_start), |
2625 | Err((end_ix, val)) => return (end_ix, val), |
2626 | }; |
2627 | } |
2628 | |
2629 | if bytes.len() > ix { |
2630 | // shift off the bytes at start we have already scanned |
2631 | let mask = compute_mask(lut, bytes, upperbound) >> ix - upperbound; |
2632 | if let Err((end_ix, val)) = process_mask(mask, bytes, ix, &mut callback) { |
2633 | return (end_ix, val); |
2634 | } |
2635 | } |
2636 | |
2637 | (bytes.len(), None) |
2638 | } |
2639 | |
2640 | #[cfg (test)] |
2641 | mod simd_test { |
2642 | use super::super::create_lut; |
2643 | use super::{iterate_special_bytes, LoopInstruction}; |
2644 | use crate::Options; |
2645 | |
2646 | fn check_expected_indices(bytes: &[u8], expected: &[usize], skip: usize) { |
2647 | let mut opts = Options::empty(); |
2648 | opts.insert(Options::ENABLE_MATH); |
2649 | opts.insert(Options::ENABLE_TABLES); |
2650 | opts.insert(Options::ENABLE_FOOTNOTES); |
2651 | opts.insert(Options::ENABLE_STRIKETHROUGH); |
2652 | opts.insert(Options::ENABLE_TASKLISTS); |
2653 | |
2654 | let lut = create_lut(&opts); |
2655 | let mut indices = vec![]; |
2656 | |
2657 | iterate_special_bytes::<_, i32>(&lut, bytes, 0, |ix, _byte_ty| { |
2658 | indices.push(ix); |
2659 | LoopInstruction::ContinueAndSkip(skip) |
2660 | }); |
2661 | |
2662 | assert_eq!(&indices[..], expected); |
2663 | } |
2664 | |
2665 | #[test ] |
2666 | fn simple_no_match() { |
2667 | check_expected_indices("abcdef0123456789" .as_bytes(), &[], 0); |
2668 | } |
2669 | |
2670 | #[test ] |
2671 | fn simple_match() { |
2672 | check_expected_indices("*bcd&f0123456789" .as_bytes(), &[0, 4], 0); |
2673 | } |
2674 | |
2675 | #[test ] |
2676 | fn single_open_fish() { |
2677 | check_expected_indices("<" .as_bytes(), &[0], 0); |
2678 | } |
2679 | |
2680 | #[test ] |
2681 | fn long_match() { |
2682 | check_expected_indices("0123456789abcde~*bcd&f0" .as_bytes(), &[15, 16, 20], 0); |
2683 | } |
2684 | |
2685 | #[test ] |
2686 | fn border_skip() { |
2687 | check_expected_indices("0123456789abcde~~~~d&f0" .as_bytes(), &[15, 20], 3); |
2688 | } |
2689 | |
2690 | #[test ] |
2691 | fn exhaustive_search() { |
2692 | let chars = [ |
2693 | b' \n' , b' \r' , b'*' , b'_' , b'~' , b'|' , b'&' , b' \\' , b'[' , b']' , b'<' , b'!' , b'`' , |
2694 | b'$' , b'{' , b'}' , |
2695 | ]; |
2696 | |
2697 | for &c in &chars { |
2698 | for i in 0u8..=255 { |
2699 | if !chars.contains(&i) { |
2700 | // full match |
2701 | let mut buf = [i; 18]; |
2702 | buf[3] = c; |
2703 | buf[6] = c; |
2704 | |
2705 | check_expected_indices(&buf[..], &[3, 6], 0); |
2706 | } |
2707 | } |
2708 | } |
2709 | } |
2710 | } |
2711 | } |
2712 | |