1//! The first pass resolves all block structure, generating an AST. Within a block, items
2//! are in a linear chain with potential inline markup identified.
3
4use std::cmp::max;
5use std::ops::Range;
6
7use crate::parse::{scan_containers, Allocations, HeadingAttributes, Item, ItemBody, LinkDef};
8use crate::scanners::*;
9use crate::strings::CowStr;
10use crate::tree::{Tree, TreeIndex};
11use crate::Options;
12use crate::{
13 linklabel::{scan_link_label_rest, LinkLabel},
14 HeadingLevel,
15};
16
17use unicase::UniCase;
18
19/// Runs the first pass, which resolves the block structure of the document,
20/// and returns the resulting tree.
21pub(crate) fn run_first_pass(text: &str, options: Options) -> (Tree<Item>, Allocations) {
22 // This is a very naive heuristic for the number of nodes
23 // we'll need.
24 let start_capacity: usize = max(v1:128, v2:text.len() / 32);
25 let lookup_table: &[bool; 256] = &create_lut(&options);
26 let first_pass: FirstPass<'_, '_> = FirstPass {
27 text,
28 tree: Tree::with_capacity(cap:start_capacity),
29 begin_list_item: false,
30 last_line_blank: false,
31 allocs: Allocations::new(),
32 options,
33 lookup_table,
34 };
35 first_pass.run()
36}
37
38/// State for the first parsing pass.
39struct FirstPass<'a, 'b> {
40 text: &'a str,
41 tree: Tree<Item>,
42 begin_list_item: bool,
43 last_line_blank: bool,
44 allocs: Allocations<'a>,
45 options: Options,
46 lookup_table: &'b LookupTable,
47}
48
49impl<'a, 'b> FirstPass<'a, 'b> {
50 fn run(mut self) -> (Tree<Item>, Allocations<'a>) {
51 let mut ix = 0;
52 while ix < self.text.len() {
53 ix = self.parse_block(ix);
54 }
55 for _ in 0..self.tree.spine_len() {
56 self.pop(ix);
57 }
58 (self.tree, self.allocs)
59 }
60
61 /// Returns offset after block.
62 fn parse_block(&mut self, mut start_ix: usize) -> usize {
63 let bytes = self.text.as_bytes();
64 let mut line_start = LineStart::new(&bytes[start_ix..]);
65
66 let i = scan_containers(&self.tree, &mut line_start);
67 for _ in i..self.tree.spine_len() {
68 self.pop(start_ix);
69 }
70
71 if self.options.contains(Options::ENABLE_FOOTNOTES) {
72 // finish footnote if it's still open and was preceded by blank line
73 if let Some(node_ix) = self.tree.peek_up() {
74 if let ItemBody::FootnoteDefinition(..) = self.tree[node_ix].item.body {
75 if self.last_line_blank {
76 self.pop(start_ix);
77 }
78 }
79 }
80
81 // Footnote definitions of the form
82 // [^bar]:
83 // * anything really
84 let container_start = start_ix + line_start.bytes_scanned();
85 if let Some(bytecount) = self.parse_footnote(container_start) {
86 start_ix = container_start + bytecount;
87 start_ix += scan_blank_line(&bytes[start_ix..]).unwrap_or(0);
88 line_start = LineStart::new(&bytes[start_ix..]);
89 }
90 }
91
92 // Process new containers
93 loop {
94 let container_start = start_ix + line_start.bytes_scanned();
95 if let Some((ch, index, indent)) = line_start.scan_list_marker() {
96 let after_marker_index = start_ix + line_start.bytes_scanned();
97 self.continue_list(container_start, ch, index);
98 self.tree.append(Item {
99 start: container_start,
100 end: after_marker_index, // will get updated later if item not empty
101 body: ItemBody::ListItem(indent),
102 });
103 self.tree.push();
104 if let Some(n) = scan_blank_line(&bytes[after_marker_index..]) {
105 self.begin_list_item = true;
106 return after_marker_index + n;
107 }
108 if self.options.contains(Options::ENABLE_TASKLISTS) {
109 if let Some(is_checked) = line_start.scan_task_list_marker() {
110 self.tree.append(Item {
111 start: after_marker_index,
112 end: start_ix + line_start.bytes_scanned(),
113 body: ItemBody::TaskListMarker(is_checked),
114 });
115 }
116 }
117 } else if line_start.scan_blockquote_marker() {
118 self.finish_list(start_ix);
119 self.tree.append(Item {
120 start: container_start,
121 end: 0, // will get set later
122 body: ItemBody::BlockQuote,
123 });
124 self.tree.push();
125 } else {
126 break;
127 }
128 }
129
130 let ix = start_ix + line_start.bytes_scanned();
131
132 if let Some(n) = scan_blank_line(&bytes[ix..]) {
133 if let Some(node_ix) = self.tree.peek_up() {
134 match self.tree[node_ix].item.body {
135 ItemBody::BlockQuote => (),
136 _ => {
137 if self.begin_list_item {
138 // A list item can begin with at most one blank line.
139 self.pop(start_ix);
140 }
141 self.last_line_blank = true;
142 }
143 }
144 }
145 return ix + n;
146 }
147
148 self.begin_list_item = false;
149 self.finish_list(start_ix);
150
151 // Save `remaining_space` here to avoid needing to backtrack `line_start` for HTML blocks
152 let remaining_space = line_start.remaining_space();
153
154 let indent = line_start.scan_space_upto(4);
155 if indent == 4 {
156 let ix = start_ix + line_start.bytes_scanned();
157 let remaining_space = line_start.remaining_space();
158 return self.parse_indented_code_block(ix, remaining_space);
159 }
160
161 let ix = start_ix + line_start.bytes_scanned();
162
163 // HTML Blocks
164 if bytes[ix] == b'<' {
165 // Types 1-5 are all detected by one function and all end with the same
166 // pattern
167 if let Some(html_end_tag) = get_html_end_tag(&bytes[(ix + 1)..]) {
168 return self.parse_html_block_type_1_to_5(ix, html_end_tag, remaining_space);
169 }
170
171 // Detect type 6
172 if starts_html_block_type_6(&bytes[(ix + 1)..]) {
173 return self.parse_html_block_type_6_or_7(ix, remaining_space);
174 }
175
176 // Detect type 7
177 if let Some(_html_bytes) = scan_html_type_7(&bytes[ix..]) {
178 return self.parse_html_block_type_6_or_7(ix, remaining_space);
179 }
180 }
181
182 if let Ok(n) = scan_hrule(&bytes[ix..]) {
183 return self.parse_hrule(n, ix);
184 }
185
186 if let Some(atx_size) = scan_atx_heading(&bytes[ix..]) {
187 return self.parse_atx_heading(ix, atx_size);
188 }
189
190 // parse refdef
191 if let Some((bytecount, label, link_def)) = self.parse_refdef_total(ix) {
192 self.allocs.refdefs.0.entry(label).or_insert(link_def);
193 let ix = ix + bytecount;
194 // try to read trailing whitespace or it will register as a completely blank line
195 // TODO: shouldn't we do this for all block level items?
196 return ix + scan_blank_line(&bytes[ix..]).unwrap_or(0);
197 }
198
199 if let Some((n, fence_ch)) = scan_code_fence(&bytes[ix..]) {
200 return self.parse_fenced_code_block(ix, indent, fence_ch, n);
201 }
202 self.parse_paragraph(ix)
203 }
204
205 /// Returns the offset of the first line after the table.
206 /// Assumptions: current focus is a table element and the table header
207 /// matches the separator line (same number of columns).
208 fn parse_table(&mut self, table_cols: usize, head_start: usize, body_start: usize) -> usize {
209 // parse header. this shouldn't fail because we made sure the table header is ok
210 let (_sep_start, thead_ix) = self.parse_table_row_inner(head_start, table_cols);
211 self.tree[thead_ix].item.body = ItemBody::TableHead;
212
213 // parse body
214 let mut ix = body_start;
215 while let Some((next_ix, _row_ix)) = self.parse_table_row(ix, table_cols) {
216 ix = next_ix;
217 }
218
219 self.pop(ix);
220 ix
221 }
222
223 /// Call this when containers are taken care of.
224 /// Returns bytes scanned, row_ix
225 fn parse_table_row_inner(&mut self, mut ix: usize, row_cells: usize) -> (usize, TreeIndex) {
226 let bytes = self.text.as_bytes();
227 let mut cells = 0;
228 let mut final_cell_ix = None;
229
230 let row_ix = self.tree.append(Item {
231 start: ix,
232 end: 0, // set at end of this function
233 body: ItemBody::TableRow,
234 });
235 self.tree.push();
236
237 loop {
238 ix += scan_ch(&bytes[ix..], b'|');
239 let start_ix = ix;
240 ix += scan_whitespace_no_nl(&bytes[ix..]);
241
242 if let Some(eol_bytes) = scan_eol(&bytes[ix..]) {
243 ix += eol_bytes;
244 break;
245 }
246
247 let cell_ix = self.tree.append(Item {
248 start: start_ix,
249 end: ix,
250 body: ItemBody::TableCell,
251 });
252 self.tree.push();
253 let (next_ix, _brk) = self.parse_line(ix, None, TableParseMode::Active);
254
255 if let Some(cur_ix) = self.tree.cur() {
256 let trailing_whitespace = scan_rev_while(&bytes[..next_ix], is_ascii_whitespace);
257 self.tree[cur_ix].item.end -= trailing_whitespace;
258 }
259
260 self.tree[cell_ix].item.end = next_ix;
261 self.tree.pop();
262
263 ix = next_ix;
264 cells += 1;
265
266 if cells == row_cells {
267 final_cell_ix = Some(cell_ix);
268 }
269 }
270
271 // fill empty cells if needed
272 // note: this is where GFM and commonmark-extra diverge. we follow
273 // GFM here
274 for _ in cells..row_cells {
275 self.tree.append(Item {
276 start: ix,
277 end: ix,
278 body: ItemBody::TableCell,
279 });
280 }
281
282 // drop excess cells
283 if let Some(cell_ix) = final_cell_ix {
284 self.tree[cell_ix].next = None;
285 }
286
287 self.pop(ix);
288
289 (ix, row_ix)
290 }
291
292 /// Returns first offset after the row and the tree index of the row.
293 fn parse_table_row(&mut self, mut ix: usize, row_cells: usize) -> Option<(usize, TreeIndex)> {
294 let bytes = self.text.as_bytes();
295 let mut line_start = LineStart::new(&bytes[ix..]);
296 let current_container =
297 scan_containers(&self.tree, &mut line_start) == self.tree.spine_len();
298 if !current_container {
299 return None;
300 }
301 line_start.scan_all_space();
302 ix += line_start.bytes_scanned();
303 if scan_paragraph_interrupt(&bytes[ix..], current_container) {
304 return None;
305 }
306
307 let (ix, row_ix) = self.parse_table_row_inner(ix, row_cells);
308 Some((ix, row_ix))
309 }
310
311 /// Returns offset of line start after paragraph.
312 fn parse_paragraph(&mut self, start_ix: usize) -> usize {
313 let node_ix = self.tree.append(Item {
314 start: start_ix,
315 end: 0, // will get set later
316 body: ItemBody::Paragraph,
317 });
318 self.tree.push();
319 let bytes = self.text.as_bytes();
320
321 let mut ix = start_ix;
322 loop {
323 if self.options.contains(Options::ENABLE_FOOTNOTES) && self.get_footnote(ix).is_some() {
324 self.tree[node_ix].item.end = if ix > start_ix { ix - 1 } else { start_ix };
325 self.tree.pop();
326 if let Some(node_ix) = self.tree.peek_up() {
327 if let ItemBody::FootnoteDefinition(..) = self.tree[node_ix].item.body {
328 self.pop(ix);
329 }
330 }
331 return ix;
332 }
333
334 let scan_mode = if self.options.contains(Options::ENABLE_TABLES) && ix == start_ix {
335 TableParseMode::Scan
336 } else {
337 TableParseMode::Disabled
338 };
339 let (next_ix, brk) = self.parse_line(ix, None, scan_mode);
340
341 // break out when we find a table
342 if let Some(Item {
343 body: ItemBody::Table(alignment_ix),
344 ..
345 }) = brk
346 {
347 let table_cols = self.allocs[alignment_ix].len();
348 self.tree[node_ix].item.body = ItemBody::Table(alignment_ix);
349 // this clears out any stuff we may have appended - but there may
350 // be a cleaner way
351 self.tree[node_ix].child = None;
352 self.tree.pop();
353 self.tree.push();
354 return self.parse_table(table_cols, ix, next_ix);
355 }
356
357 ix = next_ix;
358 let mut line_start = LineStart::new(&bytes[ix..]);
359 let current_container =
360 scan_containers(&self.tree, &mut line_start) == self.tree.spine_len();
361 if !line_start.scan_space(4) {
362 let ix_new = ix + line_start.bytes_scanned();
363 if current_container {
364 let trailing_backslash_pos = match brk {
365 Some(Item {
366 start,
367 body: ItemBody::HardBreak,
368 ..
369 }) if bytes[start] == b'\\' => Some(start),
370 _ => None,
371 };
372 if let Some(ix_setext) =
373 self.parse_setext_heading(ix_new, node_ix, trailing_backslash_pos.is_some())
374 {
375 if let Some(pos) = trailing_backslash_pos {
376 self.tree.append_text(pos, pos + 1);
377 }
378 ix = ix_setext;
379 break;
380 }
381 }
382 // first check for non-empty lists, then for other interrupts
383 let suffix = &bytes[ix_new..];
384 if scan_paragraph_interrupt(suffix, current_container) {
385 break;
386 }
387 }
388 line_start.scan_all_space();
389 if line_start.is_at_eol() {
390 break;
391 }
392 ix = next_ix + line_start.bytes_scanned();
393 if let Some(item) = brk {
394 self.tree.append(item);
395 }
396 }
397
398 self.pop(ix);
399 ix
400 }
401
402 /// Returns end ix of setext_heading on success.
403 fn parse_setext_heading(
404 &mut self,
405 ix: usize,
406 node_ix: TreeIndex,
407 has_trailing_content: bool,
408 ) -> Option<usize> {
409 let bytes = self.text.as_bytes();
410 let (n, level) = scan_setext_heading(&bytes[ix..])?;
411 let mut attrs = None;
412
413 if let Some(cur_ix) = self.tree.cur() {
414 let parent_ix = self.tree.peek_up().unwrap();
415 let header_start = self.tree[parent_ix].item.start;
416 // Note that `self.tree[parent_ix].item.end` might be zero at this point.
417 // Use the end position of the current node (i.e. the last known child
418 // of the parent) instead.
419 let header_end = self.tree[cur_ix].item.end;
420
421 // extract the trailing attribute block
422 let (content_end, attrs_) =
423 self.extract_and_parse_heading_attribute_block(header_start, header_end);
424 attrs = attrs_;
425
426 // strip trailing whitespace
427 let new_end = if has_trailing_content {
428 content_end
429 } else {
430 let trailing_ws =
431 scan_rev_while(&bytes[header_start..content_end], is_ascii_whitespace_no_nl);
432 content_end - trailing_ws
433 };
434
435 if attrs.is_some() {
436 // remove trailing block attributes
437 self.tree.truncate_siblings(self.text.as_bytes(), new_end);
438 }
439
440 if let Some(cur_ix) = self.tree.cur() {
441 self.tree[cur_ix].item.end = new_end;
442 }
443 }
444
445 self.tree[node_ix].item.body = ItemBody::Heading(
446 level,
447 attrs.map(|attrs| self.allocs.allocate_heading(attrs)),
448 );
449
450 Some(ix + n)
451 }
452
453 /// Parse a line of input, appending text and items to tree.
454 ///
455 /// Returns: index after line and an item representing the break.
456 fn parse_line(
457 &mut self,
458 start: usize,
459 end: Option<usize>,
460 mode: TableParseMode,
461 ) -> (usize, Option<Item>) {
462 let bytes = self.text.as_bytes();
463 let bytes = match end {
464 Some(end) => &bytes[..end],
465 None => bytes,
466 };
467 let bytes_len = bytes.len();
468 let mut pipes = 0;
469 let mut last_pipe_ix = start;
470 let mut begin_text = start;
471
472 let (final_ix, brk) = iterate_special_bytes(self.lookup_table, bytes, start, |ix, byte| {
473 match byte {
474 b'\n' | b'\r' => {
475 if let TableParseMode::Active = mode {
476 return LoopInstruction::BreakAtWith(ix, None);
477 }
478
479 let mut i = ix;
480 let eol_bytes = scan_eol(&bytes[ix..]).unwrap();
481 if mode == TableParseMode::Scan && pipes > 0 {
482 // check if we may be parsing a table
483 let next_line_ix = ix + eol_bytes;
484 let mut line_start = LineStart::new(&bytes[next_line_ix..]);
485 if scan_containers(&self.tree, &mut line_start) == self.tree.spine_len() {
486 let table_head_ix = next_line_ix + line_start.bytes_scanned();
487 let (table_head_bytes, alignment) =
488 scan_table_head(&bytes[table_head_ix..]);
489
490 if table_head_bytes > 0 {
491 // computing header count from number of pipes
492 let header_count =
493 count_header_cols(bytes, pipes, start, last_pipe_ix);
494
495 // make sure they match the number of columns we find in separator line
496 if alignment.len() == header_count {
497 let alignment_ix = self.allocs.allocate_alignment(alignment);
498 let end_ix = table_head_ix + table_head_bytes;
499 return LoopInstruction::BreakAtWith(
500 end_ix,
501 Some(Item {
502 start: i,
503 end: end_ix, // must update later
504 body: ItemBody::Table(alignment_ix),
505 }),
506 );
507 }
508 }
509 }
510 }
511
512 let end_ix = ix + eol_bytes;
513 let trailing_backslashes = scan_rev_while(&bytes[..ix], |b| b == b'\\');
514 if trailing_backslashes % 2 == 1 && end_ix < bytes_len {
515 i -= 1;
516 self.tree.append_text(begin_text, i);
517 return LoopInstruction::BreakAtWith(
518 end_ix,
519 Some(Item {
520 start: i,
521 end: end_ix,
522 body: ItemBody::HardBreak,
523 }),
524 );
525 }
526 let trailing_whitespace =
527 scan_rev_while(&bytes[..ix], is_ascii_whitespace_no_nl);
528 if trailing_whitespace >= 2 {
529 i -= trailing_whitespace;
530 self.tree.append_text(begin_text, i);
531 return LoopInstruction::BreakAtWith(
532 end_ix,
533 Some(Item {
534 start: i,
535 end: end_ix,
536 body: ItemBody::HardBreak,
537 }),
538 );
539 }
540
541 self.tree.append_text(begin_text, ix);
542 LoopInstruction::BreakAtWith(
543 end_ix,
544 Some(Item {
545 start: i,
546 end: end_ix,
547 body: ItemBody::SoftBreak,
548 }),
549 )
550 }
551 b'\\' => {
552 if ix + 1 < bytes_len && is_ascii_punctuation(bytes[ix + 1]) {
553 self.tree.append_text(begin_text, ix);
554 if bytes[ix + 1] == b'`' {
555 let count = 1 + scan_ch_repeat(&bytes[(ix + 2)..], b'`');
556 self.tree.append(Item {
557 start: ix + 1,
558 end: ix + count + 1,
559 body: ItemBody::MaybeCode(count, true),
560 });
561 begin_text = ix + 1 + count;
562 LoopInstruction::ContinueAndSkip(count)
563 } else {
564 begin_text = ix + 1;
565 LoopInstruction::ContinueAndSkip(1)
566 }
567 } else {
568 LoopInstruction::ContinueAndSkip(0)
569 }
570 }
571 c @ b'*' | c @ b'_' | c @ b'~' => {
572 let string_suffix = &self.text[ix..];
573 let count = 1 + scan_ch_repeat(&string_suffix.as_bytes()[1..], c);
574 let can_open = delim_run_can_open(self.text, string_suffix, count, ix);
575 let can_close = delim_run_can_close(self.text, string_suffix, count, ix);
576 let is_valid_seq = c != b'~' || count <= 2;
577
578 if (can_open || can_close) && is_valid_seq {
579 self.tree.append_text(begin_text, ix);
580 for i in 0..count {
581 self.tree.append(Item {
582 start: ix + i,
583 end: ix + i + 1,
584 body: ItemBody::MaybeEmphasis(count - i, can_open, can_close),
585 });
586 }
587 begin_text = ix + count;
588 }
589 LoopInstruction::ContinueAndSkip(count - 1)
590 }
591 b'`' => {
592 self.tree.append_text(begin_text, ix);
593 let count = 1 + scan_ch_repeat(&bytes[(ix + 1)..], b'`');
594 self.tree.append(Item {
595 start: ix,
596 end: ix + count,
597 body: ItemBody::MaybeCode(count, false),
598 });
599 begin_text = ix + count;
600 LoopInstruction::ContinueAndSkip(count - 1)
601 }
602 b'<' => {
603 // Note: could detect some non-HTML cases and early escape here, but not
604 // clear that's a win.
605 self.tree.append_text(begin_text, ix);
606 self.tree.append(Item {
607 start: ix,
608 end: ix + 1,
609 body: ItemBody::MaybeHtml,
610 });
611 begin_text = ix + 1;
612 LoopInstruction::ContinueAndSkip(0)
613 }
614 b'!' => {
615 if ix + 1 < bytes_len && bytes[ix + 1] == b'[' {
616 self.tree.append_text(begin_text, ix);
617 self.tree.append(Item {
618 start: ix,
619 end: ix + 2,
620 body: ItemBody::MaybeImage,
621 });
622 begin_text = ix + 2;
623 LoopInstruction::ContinueAndSkip(1)
624 } else {
625 LoopInstruction::ContinueAndSkip(0)
626 }
627 }
628 b'[' => {
629 self.tree.append_text(begin_text, ix);
630 self.tree.append(Item {
631 start: ix,
632 end: ix + 1,
633 body: ItemBody::MaybeLinkOpen,
634 });
635 begin_text = ix + 1;
636 LoopInstruction::ContinueAndSkip(0)
637 }
638 b']' => {
639 self.tree.append_text(begin_text, ix);
640 self.tree.append(Item {
641 start: ix,
642 end: ix + 1,
643 body: ItemBody::MaybeLinkClose(true),
644 });
645 begin_text = ix + 1;
646 LoopInstruction::ContinueAndSkip(0)
647 }
648 b'&' => match scan_entity(&bytes[ix..]) {
649 (n, Some(value)) => {
650 self.tree.append_text(begin_text, ix);
651 self.tree.append(Item {
652 start: ix,
653 end: ix + n,
654 body: ItemBody::SynthesizeText(self.allocs.allocate_cow(value)),
655 });
656 begin_text = ix + n;
657 LoopInstruction::ContinueAndSkip(n - 1)
658 }
659 _ => LoopInstruction::ContinueAndSkip(0),
660 },
661 b'|' => {
662 if let TableParseMode::Active = mode {
663 LoopInstruction::BreakAtWith(ix, None)
664 } else {
665 last_pipe_ix = ix;
666 pipes += 1;
667 LoopInstruction::ContinueAndSkip(0)
668 }
669 }
670 b'.' => {
671 if ix + 2 < bytes.len() && bytes[ix + 1] == b'.' && bytes[ix + 2] == b'.' {
672 self.tree.append_text(begin_text, ix);
673 self.tree.append(Item {
674 start: ix,
675 end: ix + 3,
676 body: ItemBody::SynthesizeChar('…'),
677 });
678 begin_text = ix + 3;
679 LoopInstruction::ContinueAndSkip(2)
680 } else {
681 LoopInstruction::ContinueAndSkip(0)
682 }
683 }
684 b'-' => {
685 let count = 1 + scan_ch_repeat(&bytes[(ix + 1)..], b'-');
686 if count == 1 {
687 LoopInstruction::ContinueAndSkip(0)
688 } else {
689 let itembody = if count == 2 {
690 ItemBody::SynthesizeChar('–')
691 } else if count == 3 {
692 ItemBody::SynthesizeChar('—')
693 } else {
694 let (ems, ens) = match count % 6 {
695 0 | 3 => (count / 3, 0),
696 2 | 4 => (0, count / 2),
697 1 => (count / 3 - 1, 2),
698 _ => (count / 3, 1),
699 };
700 // – and — are 3 bytes each in utf8
701 let mut buf = String::with_capacity(3 * (ems + ens));
702 for _ in 0..ems {
703 buf.push('—');
704 }
705 for _ in 0..ens {
706 buf.push('–');
707 }
708 ItemBody::SynthesizeText(self.allocs.allocate_cow(buf.into()))
709 };
710
711 self.tree.append_text(begin_text, ix);
712 self.tree.append(Item {
713 start: ix,
714 end: ix + count,
715 body: itembody,
716 });
717 begin_text = ix + count;
718 LoopInstruction::ContinueAndSkip(count - 1)
719 }
720 }
721 c @ b'\'' | c @ b'"' => {
722 let string_suffix = &self.text[ix..];
723 let can_open = delim_run_can_open(self.text, string_suffix, 1, ix);
724 let can_close = delim_run_can_close(self.text, string_suffix, 1, ix);
725
726 self.tree.append_text(begin_text, ix);
727 self.tree.append(Item {
728 start: ix,
729 end: ix + 1,
730 body: ItemBody::MaybeSmartQuote(c, can_open, can_close),
731 });
732 begin_text = ix + 1;
733
734 LoopInstruction::ContinueAndSkip(0)
735 }
736 _ => LoopInstruction::ContinueAndSkip(0),
737 }
738 });
739
740 if brk.is_none() {
741 // need to close text at eof
742 self.tree.append_text(begin_text, final_ix);
743 }
744 (final_ix, brk)
745 }
746
747 /// When start_ix is at the beginning of an HTML block of type 1 to 5,
748 /// this will find the end of the block, adding the block itself to the
749 /// tree and also keeping track of the lines of HTML within the block.
750 ///
751 /// The html_end_tag is the tag that must be found on a line to end the block.
752 fn parse_html_block_type_1_to_5(
753 &mut self,
754 start_ix: usize,
755 html_end_tag: &str,
756 mut remaining_space: usize,
757 ) -> usize {
758 let bytes = self.text.as_bytes();
759 let mut ix = start_ix;
760 loop {
761 let line_start_ix = ix;
762 ix += scan_nextline(&bytes[ix..]);
763 self.append_html_line(remaining_space, line_start_ix, ix);
764
765 let mut line_start = LineStart::new(&bytes[ix..]);
766 let n_containers = scan_containers(&self.tree, &mut line_start);
767 if n_containers < self.tree.spine_len() {
768 break;
769 }
770
771 if (&self.text[line_start_ix..ix]).contains(html_end_tag) {
772 break;
773 }
774
775 let next_line_ix = ix + line_start.bytes_scanned();
776 if next_line_ix == self.text.len() {
777 break;
778 }
779 ix = next_line_ix;
780 remaining_space = line_start.remaining_space();
781 }
782 ix
783 }
784
785 /// When start_ix is at the beginning of an HTML block of type 6 or 7,
786 /// this will consume lines until there is a blank line and keep track of
787 /// the HTML within the block.
788 fn parse_html_block_type_6_or_7(
789 &mut self,
790 start_ix: usize,
791 mut remaining_space: usize,
792 ) -> usize {
793 let bytes = self.text.as_bytes();
794 let mut ix = start_ix;
795 loop {
796 let line_start_ix = ix;
797 ix += scan_nextline(&bytes[ix..]);
798 self.append_html_line(remaining_space, line_start_ix, ix);
799
800 let mut line_start = LineStart::new(&bytes[ix..]);
801 let n_containers = scan_containers(&self.tree, &mut line_start);
802 if n_containers < self.tree.spine_len() || line_start.is_at_eol() {
803 break;
804 }
805
806 let next_line_ix = ix + line_start.bytes_scanned();
807 if next_line_ix == self.text.len() || scan_blank_line(&bytes[next_line_ix..]).is_some()
808 {
809 break;
810 }
811 ix = next_line_ix;
812 remaining_space = line_start.remaining_space();
813 }
814 ix
815 }
816
817 fn parse_indented_code_block(&mut self, start_ix: usize, mut remaining_space: usize) -> usize {
818 self.tree.append(Item {
819 start: start_ix,
820 end: 0, // will get set later
821 body: ItemBody::IndentCodeBlock,
822 });
823 self.tree.push();
824 let bytes = self.text.as_bytes();
825 let mut last_nonblank_child = None;
826 let mut last_nonblank_ix = 0;
827 let mut end_ix = 0;
828 let mut last_line_blank = false;
829
830 let mut ix = start_ix;
831 loop {
832 let line_start_ix = ix;
833 ix += scan_nextline(&bytes[ix..]);
834 self.append_code_text(remaining_space, line_start_ix, ix);
835 // TODO(spec clarification): should we synthesize newline at EOF?
836
837 if !last_line_blank {
838 last_nonblank_child = self.tree.cur();
839 last_nonblank_ix = ix;
840 end_ix = ix;
841 }
842
843 let mut line_start = LineStart::new(&bytes[ix..]);
844 let n_containers = scan_containers(&self.tree, &mut line_start);
845 if n_containers < self.tree.spine_len()
846 || !(line_start.scan_space(4) || line_start.is_at_eol())
847 {
848 break;
849 }
850 let next_line_ix = ix + line_start.bytes_scanned();
851 if next_line_ix == self.text.len() {
852 break;
853 }
854 ix = next_line_ix;
855 remaining_space = line_start.remaining_space();
856 last_line_blank = scan_blank_line(&bytes[ix..]).is_some();
857 }
858
859 // Trim trailing blank lines.
860 if let Some(child) = last_nonblank_child {
861 self.tree[child].next = None;
862 self.tree[child].item.end = last_nonblank_ix;
863 }
864 self.pop(end_ix);
865 ix
866 }
867
868 fn parse_fenced_code_block(
869 &mut self,
870 start_ix: usize,
871 indent: usize,
872 fence_ch: u8,
873 n_fence_char: usize,
874 ) -> usize {
875 let bytes = self.text.as_bytes();
876 let mut info_start = start_ix + n_fence_char;
877 info_start += scan_whitespace_no_nl(&bytes[info_start..]);
878 // TODO: info strings are typically very short. wouldn't it be faster
879 // to just do a forward scan here?
880 let mut ix = info_start + scan_nextline(&bytes[info_start..]);
881 let info_end = ix - scan_rev_while(&bytes[info_start..ix], is_ascii_whitespace);
882 let info_string = unescape(&self.text[info_start..info_end]);
883 self.tree.append(Item {
884 start: start_ix,
885 end: 0, // will get set later
886 body: ItemBody::FencedCodeBlock(self.allocs.allocate_cow(info_string)),
887 });
888 self.tree.push();
889 loop {
890 let mut line_start = LineStart::new(&bytes[ix..]);
891 let n_containers = scan_containers(&self.tree, &mut line_start);
892 if n_containers < self.tree.spine_len() {
893 break;
894 }
895 line_start.scan_space(indent);
896 let mut close_line_start = line_start.clone();
897 if !close_line_start.scan_space(4) {
898 let close_ix = ix + close_line_start.bytes_scanned();
899 if let Some(n) = scan_closing_code_fence(&bytes[close_ix..], fence_ch, n_fence_char)
900 {
901 ix = close_ix + n;
902 break;
903 }
904 }
905 let remaining_space = line_start.remaining_space();
906 ix += line_start.bytes_scanned();
907 let next_ix = ix + scan_nextline(&bytes[ix..]);
908 self.append_code_text(remaining_space, ix, next_ix);
909 ix = next_ix;
910 }
911
912 self.pop(ix);
913
914 // try to read trailing whitespace or it will register as a completely blank line
915 ix + scan_blank_line(&bytes[ix..]).unwrap_or(0)
916 }
917
918 fn append_code_text(&mut self, remaining_space: usize, start: usize, end: usize) {
919 if remaining_space > 0 {
920 let cow_ix = self.allocs.allocate_cow(" "[..remaining_space].into());
921 self.tree.append(Item {
922 start,
923 end: start,
924 body: ItemBody::SynthesizeText(cow_ix),
925 });
926 }
927 if self.text.as_bytes()[end - 2] == b'\r' {
928 // Normalize CRLF to LF
929 self.tree.append_text(start, end - 2);
930 self.tree.append_text(end - 1, end);
931 } else {
932 self.tree.append_text(start, end);
933 }
934 }
935
936 /// Appends a line of HTML to the tree.
937 fn append_html_line(&mut self, remaining_space: usize, start: usize, end: usize) {
938 if remaining_space > 0 {
939 let cow_ix = self.allocs.allocate_cow(" "[..remaining_space].into());
940 self.tree.append(Item {
941 start,
942 end: start,
943 // TODO: maybe this should synthesize to html rather than text?
944 body: ItemBody::SynthesizeText(cow_ix),
945 });
946 }
947 if self.text.as_bytes()[end - 2] == b'\r' {
948 // Normalize CRLF to LF
949 self.tree.append(Item {
950 start,
951 end: end - 2,
952 body: ItemBody::Html,
953 });
954 self.tree.append(Item {
955 start: end - 1,
956 end,
957 body: ItemBody::Html,
958 });
959 } else {
960 self.tree.append(Item {
961 start,
962 end,
963 body: ItemBody::Html,
964 });
965 }
966 }
967
968 /// Pop a container, setting its end.
969 fn pop(&mut self, ix: usize) {
970 let cur_ix = self.tree.pop().unwrap();
971 self.tree[cur_ix].item.end = ix;
972 if let ItemBody::List(true, _, _) = self.tree[cur_ix].item.body {
973 surgerize_tight_list(&mut self.tree, cur_ix);
974 }
975 }
976
977 /// Close a list if it's open. Also set loose if last line was blank
978 fn finish_list(&mut self, ix: usize) {
979 if let Some(node_ix) = self.tree.peek_up() {
980 if let ItemBody::List(_, _, _) = self.tree[node_ix].item.body {
981 self.pop(ix);
982 }
983 }
984 if self.last_line_blank {
985 if let Some(node_ix) = self.tree.peek_grandparent() {
986 if let ItemBody::List(ref mut is_tight, _, _) = self.tree[node_ix].item.body {
987 *is_tight = false;
988 }
989 }
990 self.last_line_blank = false;
991 }
992 }
993
994 /// Continue an existing list or start a new one if there's not an open
995 /// list that matches.
996 fn continue_list(&mut self, start: usize, ch: u8, index: u64) {
997 if let Some(node_ix) = self.tree.peek_up() {
998 if let ItemBody::List(ref mut is_tight, existing_ch, _) = self.tree[node_ix].item.body {
999 if existing_ch == ch {
1000 if self.last_line_blank {
1001 *is_tight = false;
1002 self.last_line_blank = false;
1003 }
1004 return;
1005 }
1006 }
1007 // TODO: this is not the best choice for end; maybe get end from last list item.
1008 self.finish_list(start);
1009 }
1010 self.tree.append(Item {
1011 start,
1012 end: 0, // will get set later
1013 body: ItemBody::List(true, ch, index),
1014 });
1015 self.tree.push();
1016 self.last_line_blank = false;
1017 }
1018
1019 /// Parse a thematic break.
1020 ///
1021 /// Returns index of start of next line.
1022 fn parse_hrule(&mut self, hrule_size: usize, ix: usize) -> usize {
1023 self.tree.append(Item {
1024 start: ix,
1025 end: ix + hrule_size,
1026 body: ItemBody::Rule,
1027 });
1028 ix + hrule_size
1029 }
1030
1031 /// Parse an ATX heading.
1032 ///
1033 /// Returns index of start of next line.
1034 fn parse_atx_heading(&mut self, start: usize, atx_level: HeadingLevel) -> usize {
1035 let mut ix = start;
1036 let heading_ix = self.tree.append(Item {
1037 start,
1038 end: 0, // set later
1039 body: ItemBody::default(), // set later
1040 });
1041 ix += atx_level as usize;
1042 // next char is space or eol (guaranteed by scan_atx_heading)
1043 let bytes = self.text.as_bytes();
1044 if let Some(eol_bytes) = scan_eol(&bytes[ix..]) {
1045 self.tree[heading_ix].item.end = ix + eol_bytes;
1046 self.tree[heading_ix].item.body = ItemBody::Heading(atx_level, None);
1047 return ix + eol_bytes;
1048 }
1049 // skip leading spaces
1050 let skip_spaces = scan_whitespace_no_nl(&bytes[ix..]);
1051 ix += skip_spaces;
1052
1053 // now handle the header text
1054 let header_start = ix;
1055 let header_node_idx = self.tree.push(); // so that we can set the endpoint later
1056
1057 // trim the trailing attribute block before parsing the entire line, if necessary
1058 let (end, content_end, attrs) = if self.options.contains(Options::ENABLE_HEADING_ATTRIBUTES)
1059 {
1060 // the start of the next line is the end of the header since the
1061 // header cannot have line breaks
1062 let header_end = header_start + scan_nextline(&bytes[header_start..]);
1063 let (content_end, attrs) =
1064 self.extract_and_parse_heading_attribute_block(header_start, header_end);
1065 self.parse_line(ix, Some(content_end), TableParseMode::Disabled);
1066 (header_end, content_end, attrs)
1067 } else {
1068 ix = self.parse_line(ix, None, TableParseMode::Disabled).0;
1069 (ix, ix, None)
1070 };
1071 self.tree[header_node_idx].item.end = end;
1072
1073 // remove trailing matter from header text
1074 if let Some(cur_ix) = self.tree.cur() {
1075 // remove closing of the ATX heading
1076 let header_text = &bytes[header_start..content_end];
1077 let mut limit = header_text
1078 .iter()
1079 .rposition(|&b| !(b == b'\n' || b == b'\r' || b == b' '))
1080 .map_or(0, |i| i + 1);
1081 let closer = header_text[..limit]
1082 .iter()
1083 .rposition(|&b| b != b'#')
1084 .map_or(0, |i| i + 1);
1085 if closer == 0 {
1086 limit = closer;
1087 } else {
1088 let spaces = scan_rev_while(&header_text[..closer], |b| b == b' ');
1089 if spaces > 0 {
1090 limit = closer - spaces;
1091 }
1092 }
1093 self.tree[cur_ix].item.end = limit + header_start;
1094 }
1095
1096 self.tree.pop();
1097 self.tree[heading_ix].item.body = ItemBody::Heading(
1098 atx_level,
1099 attrs.map(|attrs| self.allocs.allocate_heading(attrs)),
1100 );
1101 end
1102 }
1103
1104 fn get_footnote(&mut self, start: usize) -> Option<(usize, CowStr<'a>)> {
1105 let bytes = &self.text.as_bytes()[start..];
1106 if !bytes.starts_with(b"[^") {
1107 return None;
1108 }
1109 let (mut i, label) = self.parse_refdef_label(start + 2)?;
1110 i += 2;
1111 if scan_ch(&bytes[i..], b':') == 0 {
1112 return None;
1113 }
1114 i += 1;
1115 Some((i, label))
1116 }
1117
1118 /// Returns the number of bytes scanned on success.
1119 fn parse_footnote(&mut self, start: usize) -> Option<usize> {
1120 let (i, label) = self.get_footnote(start)?;
1121 self.finish_list(start);
1122 self.tree.append(Item {
1123 start,
1124 end: 0, // will get set later
1125 // TODO: check whether the label here is strictly necessary
1126 body: ItemBody::FootnoteDefinition(self.allocs.allocate_cow(label)),
1127 });
1128 self.tree.push();
1129 Some(i)
1130 }
1131
1132 /// Tries to parse a reference label, which can be interrupted by new blocks.
1133 /// On success, returns the number of bytes of the label and the label itself.
1134 fn parse_refdef_label(&self, start: usize) -> Option<(usize, CowStr<'a>)> {
1135 scan_link_label_rest(&self.text[start..], &|bytes| {
1136 let mut line_start = LineStart::new(bytes);
1137 let current_container =
1138 scan_containers(&self.tree, &mut line_start) == self.tree.spine_len();
1139 let bytes_scanned = line_start.bytes_scanned();
1140 let suffix = &bytes[bytes_scanned..];
1141 if scan_paragraph_interrupt(suffix, current_container) {
1142 None
1143 } else {
1144 Some(bytes_scanned)
1145 }
1146 })
1147 }
1148
1149 /// Returns number of bytes scanned, label and definition on success.
1150 fn parse_refdef_total(&mut self, start: usize) -> Option<(usize, LinkLabel<'a>, LinkDef<'a>)> {
1151 let bytes = &self.text.as_bytes()[start..];
1152 if scan_ch(bytes, b'[') == 0 {
1153 return None;
1154 }
1155 let (mut i, label) = self.parse_refdef_label(start + 1)?;
1156 i += 1;
1157 if scan_ch(&bytes[i..], b':') == 0 {
1158 return None;
1159 }
1160 i += 1;
1161 let (bytecount, link_def) = self.scan_refdef(start, start + i)?;
1162 Some((bytecount + i, UniCase::new(label), link_def))
1163 }
1164
1165 /// Returns number of bytes and number of newlines
1166 fn scan_refdef_space(&self, bytes: &[u8], mut i: usize) -> Option<(usize, usize)> {
1167 let mut newlines = 0;
1168 loop {
1169 let whitespaces = scan_whitespace_no_nl(&bytes[i..]);
1170 i += whitespaces;
1171 if let Some(eol_bytes) = scan_eol(&bytes[i..]) {
1172 i += eol_bytes;
1173 newlines += 1;
1174 if newlines > 1 {
1175 return None;
1176 }
1177 } else {
1178 break;
1179 }
1180 let mut line_start = LineStart::new(&bytes[i..]);
1181 if self.tree.spine_len() != scan_containers(&self.tree, &mut line_start) {
1182 return None;
1183 }
1184 i += line_start.bytes_scanned();
1185 }
1186 Some((i, newlines))
1187 }
1188
1189 /// Returns # of bytes and definition.
1190 /// Assumes the label of the reference including colon has already been scanned.
1191 fn scan_refdef(&self, span_start: usize, start: usize) -> Option<(usize, LinkDef<'a>)> {
1192 let bytes = self.text.as_bytes();
1193
1194 // whitespace between label and url (including up to one newline)
1195 let (mut i, _newlines) = self.scan_refdef_space(bytes, start)?;
1196
1197 // scan link dest
1198 let (dest_length, dest) = scan_link_dest(self.text, i, 1)?;
1199 if dest_length == 0 {
1200 return None;
1201 }
1202 let dest = unescape(dest);
1203 i += dest_length;
1204
1205 // no title
1206 let mut backup = (
1207 i - start,
1208 LinkDef {
1209 dest,
1210 title: None,
1211 span: span_start..i,
1212 },
1213 );
1214
1215 // scan whitespace between dest and label
1216 let (mut i, newlines) =
1217 if let Some((new_i, mut newlines)) = self.scan_refdef_space(bytes, i) {
1218 if i == self.text.len() {
1219 newlines += 1;
1220 }
1221 if new_i == i && newlines == 0 {
1222 return None;
1223 }
1224 if newlines > 1 {
1225 return Some(backup);
1226 };
1227 (new_i, newlines)
1228 } else {
1229 return Some(backup);
1230 };
1231
1232 // scan title
1233 // if this fails but newline == 1, return also a refdef without title
1234 if let Some((title_length, title)) = scan_refdef_title(&self.text[i..]) {
1235 i += title_length;
1236 backup.1.span = span_start..i;
1237 backup.1.title = Some(unescape(title));
1238 } else if newlines > 0 {
1239 return Some(backup);
1240 } else {
1241 return None;
1242 };
1243
1244 // scan EOL
1245 if let Some(bytes) = scan_blank_line(&bytes[i..]) {
1246 backup.0 = i + bytes - start;
1247 Some(backup)
1248 } else if newlines > 0 {
1249 Some(backup)
1250 } else {
1251 None
1252 }
1253 }
1254
1255 /// Extracts and parses a heading attribute block if exists.
1256 ///
1257 /// Returns `(end_offset_of_heading_content, (id, classes))`.
1258 ///
1259 /// If `header_end` is less than or equal to `header_start`, the given
1260 /// input is considered as empty.
1261 fn extract_and_parse_heading_attribute_block(
1262 &mut self,
1263 header_start: usize,
1264 header_end: usize,
1265 ) -> (usize, Option<HeadingAttributes<'a>>) {
1266 if !self.options.contains(Options::ENABLE_HEADING_ATTRIBUTES) {
1267 return (header_end, None);
1268 }
1269
1270 // extract the trailing attribute block
1271 let header_bytes = &self.text.as_bytes()[header_start..header_end];
1272 let (content_len, attr_block_range_rel) =
1273 extract_attribute_block_content_from_header_text(header_bytes);
1274 let content_end = header_start + content_len;
1275 let attrs = attr_block_range_rel.and_then(|r| {
1276 parse_inside_attribute_block(
1277 &self.text[(header_start + r.start)..(header_start + r.end)],
1278 )
1279 });
1280 (content_end, attrs)
1281 }
1282}
1283
1284/// Scanning modes for `Parser`'s `parse_line` method.
1285#[derive(PartialEq, Eq, Copy, Clone)]
1286enum TableParseMode {
1287 /// Inside a paragraph, scanning for table headers.
1288 Scan,
1289 /// Inside a table.
1290 Active,
1291 /// Inside a paragraph, not scanning for table headers.
1292 Disabled,
1293}
1294
1295/// Computes the number of header columns in a table line by computing the number of dividing pipes
1296/// that aren't followed or preceded by whitespace.
1297fn count_header_cols(
1298 bytes: &[u8],
1299 mut pipes: usize,
1300 mut start: usize,
1301 last_pipe_ix: usize,
1302) -> usize {
1303 // was first pipe preceded by whitespace? if so, subtract one
1304 start += scan_whitespace_no_nl(&bytes[start..]);
1305 if bytes[start] == b'|' {
1306 pipes -= 1;
1307 }
1308
1309 // was last pipe followed by whitespace? if so, sub one
1310 if scan_blank_line(&bytes[(last_pipe_ix + 1)..]).is_some() {
1311 pipes
1312 } else {
1313 pipes + 1
1314 }
1315}
1316
1317/// Checks whether we should break a paragraph on the given input.
1318fn scan_paragraph_interrupt(bytes: &[u8], current_container: bool) -> bool {
1319 scan_eol(bytes).is_some()
1320 || scan_hrule(bytes).is_ok()
1321 || scan_atx_heading(data:bytes).is_some()
1322 || scan_code_fence(data:bytes).is_some()
1323 || scan_blockquote_start(data:bytes).is_some()
1324 || scan_listitem(bytes).map_or(default:false, |(ix: usize, delim: u8, index: usize, _)| {
1325 ! current_container ||
1326 // we don't allow interruption by either empty lists or
1327 // numbered lists starting at an index other than 1
1328 (delim == b'*' || delim == b'-' || delim == b'+' || index == 1)
1329 && !scan_empty_list(&bytes[ix..])
1330 })
1331 || bytes.starts_with(needle:b"<")
1332 && (get_html_end_tag(&bytes[1..]).is_some() || starts_html_block_type_6(&bytes[1..]))
1333}
1334
1335/// Assumes `text_bytes` is preceded by `<`.
1336fn get_html_end_tag(text_bytes: &[u8]) -> Option<&'static str> {
1337 static BEGIN_TAGS: &[&[u8]; 4] = &[b"pre", b"style", b"script", b"textarea"];
1338 static ST_BEGIN_TAGS: &[&[u8]; 3] = &[b"!--", b"?", b"![CDATA["];
1339
1340 for (beg_tag, end_tag) in BEGIN_TAGS
1341 .iter()
1342 .zip(["</pre>", "</style>", "</script>", "</textarea>"].iter())
1343 {
1344 let tag_len = beg_tag.len();
1345
1346 if text_bytes.len() < tag_len {
1347 // begin tags are increasing in size
1348 break;
1349 }
1350
1351 if !text_bytes[..tag_len].eq_ignore_ascii_case(beg_tag) {
1352 continue;
1353 }
1354
1355 // Must either be the end of the line...
1356 if text_bytes.len() == tag_len {
1357 return Some(end_tag);
1358 }
1359
1360 // ...or be followed by whitespace, newline, or '>'.
1361 let s = text_bytes[tag_len];
1362 if is_ascii_whitespace(s) || s == b'>' {
1363 return Some(end_tag);
1364 }
1365 }
1366
1367 for (beg_tag, end_tag) in ST_BEGIN_TAGS.iter().zip(["-->", "?>", "]]>"].iter()) {
1368 if text_bytes.starts_with(beg_tag) {
1369 return Some(end_tag);
1370 }
1371 }
1372
1373 if text_bytes.len() > 1
1374 && text_bytes[0] == b'!'
1375 && text_bytes[1] >= b'A'
1376 && text_bytes[1] <= b'Z'
1377 {
1378 Some(">")
1379 } else {
1380 None
1381 }
1382}
1383
1384// https://english.stackexchange.com/a/285573
1385fn surgerize_tight_list(tree: &mut Tree<Item>, list_ix: TreeIndex) {
1386 let mut list_item = tree[list_ix].child;
1387 while let Some(listitem_ix) = list_item {
1388 // first child is special, controls how we repoint list_item.child
1389 let list_item_firstborn = tree[listitem_ix].child;
1390
1391 // Check that list item has children - this is not necessarily the case!
1392 if let Some(firstborn_ix) = list_item_firstborn {
1393 if let ItemBody::Paragraph = tree[firstborn_ix].item.body {
1394 tree[listitem_ix].child = tree[firstborn_ix].child;
1395 }
1396
1397 let mut list_item_child = Some(firstborn_ix);
1398 let mut node_to_repoint = None;
1399 while let Some(child_ix) = list_item_child {
1400 // surgerize paragraphs
1401 let repoint_ix = if let ItemBody::Paragraph = tree[child_ix].item.body {
1402 if let Some(child_firstborn) = tree[child_ix].child {
1403 if let Some(repoint_ix) = node_to_repoint {
1404 tree[repoint_ix].next = Some(child_firstborn);
1405 }
1406 let mut child_lastborn = child_firstborn;
1407 while let Some(lastborn_next_ix) = tree[child_lastborn].next {
1408 child_lastborn = lastborn_next_ix;
1409 }
1410 child_lastborn
1411 } else {
1412 child_ix
1413 }
1414 } else {
1415 child_ix
1416 };
1417
1418 node_to_repoint = Some(repoint_ix);
1419 tree[repoint_ix].next = tree[child_ix].next;
1420 list_item_child = tree[child_ix].next;
1421 }
1422 }
1423
1424 list_item = tree[listitem_ix].next;
1425 }
1426}
1427
1428/// Determines whether the delimiter run starting at given index is
1429/// left-flanking, as defined by the commonmark spec (and isn't intraword
1430/// for _ delims).
1431/// suffix is &s[ix..], which is passed in as an optimization, since taking
1432/// a string subslice is O(n).
1433fn delim_run_can_open(s: &str, suffix: &str, run_len: usize, ix: usize) -> bool {
1434 let next_char: char = if let Some(c: char) = suffix.chars().nth(run_len) {
1435 c
1436 } else {
1437 return false;
1438 };
1439 if next_char.is_whitespace() {
1440 return false;
1441 }
1442 if ix == 0 {
1443 return true;
1444 }
1445 let delim: char = suffix.chars().next().unwrap();
1446 if delim == '*' && !is_punctuation(next_char) {
1447 return true;
1448 }
1449
1450 let prev_char: char = s[..ix].chars().last().unwrap();
1451
1452 prev_char.is_whitespace()
1453 || is_punctuation(prev_char) && (delim != '\'' || ![']', ')'].contains(&prev_char))
1454}
1455
1456/// Determines whether the delimiter run starting at given index is
1457/// left-flanking, as defined by the commonmark spec (and isn't intraword
1458/// for _ delims)
1459fn delim_run_can_close(s: &str, suffix: &str, run_len: usize, ix: usize) -> bool {
1460 if ix == 0 {
1461 return false;
1462 }
1463 let prev_char: char = s[..ix].chars().last().unwrap();
1464 if prev_char.is_whitespace() {
1465 return false;
1466 }
1467 let next_char: char = if let Some(c: char) = suffix.chars().nth(run_len) {
1468 c
1469 } else {
1470 return true;
1471 };
1472 let delim: char = suffix.chars().next().unwrap();
1473 if delim == '*' && !is_punctuation(prev_char) {
1474 return true;
1475 }
1476
1477 next_char.is_whitespace() || is_punctuation(next_char)
1478}
1479
1480fn create_lut(options: &Options) -> LookupTable {
1481 #[cfg(all(target_arch = "x86_64", feature = "simd"))]
1482 {
1483 LookupTable {
1484 simd: simd::compute_lookup(options),
1485 scalar: special_bytes(options),
1486 }
1487 }
1488 #[cfg(not(all(target_arch = "x86_64", feature = "simd")))]
1489 {
1490 special_bytes(options)
1491 }
1492}
1493
1494fn special_bytes(options: &Options) -> [bool; 256] {
1495 let mut bytes: [bool; 256] = [false; 256];
1496 let standard_bytes: [u8; 11] = [
1497 b'\n', b'\r', b'*', b'_', b'&', b'\\', b'[', b']', b'<', b'!', b'`',
1498 ];
1499
1500 for &byte: u8 in &standard_bytes {
1501 bytes[byte as usize] = true;
1502 }
1503 if options.contains(Options::ENABLE_TABLES) {
1504 bytes[b'|' as usize] = true;
1505 }
1506 if options.contains(Options::ENABLE_STRIKETHROUGH) {
1507 bytes[b'~' as usize] = true;
1508 }
1509 if options.contains(Options::ENABLE_SMART_PUNCTUATION) {
1510 for &byte: u8 in &[b'.', b'-', b'"', b'\''] {
1511 bytes[byte as usize] = true;
1512 }
1513 }
1514
1515 bytes
1516}
1517
1518enum LoopInstruction<T> {
1519 /// Continue looking for more special bytes, but skip next few bytes.
1520 ContinueAndSkip(usize),
1521 /// Break looping immediately, returning with the given index and value.
1522 BreakAtWith(usize, T),
1523}
1524
1525#[cfg(all(target_arch = "x86_64", feature = "simd"))]
1526struct LookupTable {
1527 simd: [u8; 16],
1528 scalar: [bool; 256],
1529}
1530
1531#[cfg(not(all(target_arch = "x86_64", feature = "simd")))]
1532type LookupTable = [bool; 256];
1533
1534/// This function walks the byte slices from the given index and
1535/// calls the callback function on all bytes (and their indices) that are in the following set:
1536/// `` ` ``, `\`, `&`, `*`, `_`, `~`, `!`, `<`, `[`, `]`, `|`, `\r`, `\n`
1537/// It is guaranteed not call the callback on other bytes.
1538/// Whenever `callback(ix, byte)` returns a `ContinueAndSkip(n)` value, the callback
1539/// will not be called with an index that is less than `ix + n + 1`.
1540/// When the callback returns a `BreakAtWith(end_ix, opt+val)`, no more callbacks will be
1541/// called and the function returns immediately with the return value `(end_ix, opt_val)`.
1542/// If `BreakAtWith(..)` is never returned, this function will return the first
1543/// index that is outside the byteslice bound and a `None` value.
1544fn iterate_special_bytes<F, T>(
1545 lut: &LookupTable,
1546 bytes: &[u8],
1547 ix: usize,
1548 callback: F,
1549) -> (usize, Option<T>)
1550where
1551 F: FnMut(usize, u8) -> LoopInstruction<Option<T>>,
1552{
1553 #[cfg(all(target_arch = "x86_64", feature = "simd"))]
1554 {
1555 simd::iterate_special_bytes(lut, bytes, ix, callback)
1556 }
1557 #[cfg(not(all(target_arch = "x86_64", feature = "simd")))]
1558 {
1559 scalar_iterate_special_bytes(lut, bytes, ix, callback)
1560 }
1561}
1562
1563fn scalar_iterate_special_bytes<F, T>(
1564 lut: &[bool; 256],
1565 bytes: &[u8],
1566 mut ix: usize,
1567 mut callback: F,
1568) -> (usize, Option<T>)
1569where
1570 F: FnMut(usize, u8) -> LoopInstruction<Option<T>>,
1571{
1572 while ix < bytes.len() {
1573 let b: u8 = bytes[ix];
1574 if lut[b as usize] {
1575 match callback(ix, b) {
1576 LoopInstruction::ContinueAndSkip(skip: usize) => {
1577 ix += skip;
1578 }
1579 LoopInstruction::BreakAtWith(ix: usize, val: Option) => {
1580 return (ix, val);
1581 }
1582 }
1583 }
1584 ix += 1;
1585 }
1586
1587 (ix, None)
1588}
1589
1590/// Split the usual heading content range and the content inside the trailing attribute block.
1591///
1592/// Returns `(leading_content_len, Option<trailing_attr_block_range>)`.
1593///
1594/// Note that `trailing_attr_block_range` will be empty range when the block
1595/// is `{}`, since the range is content inside the wrapping `{` and `}`.
1596///
1597/// The closing `}` of an attribute block can have trailing whitespaces.
1598/// They are automatically trimmed when the attribute block is being searched.
1599///
1600/// However, this method does not trim the trailing whitespaces of heading content.
1601/// It is callers' responsibility to trim them if necessary.
1602fn extract_attribute_block_content_from_header_text(
1603 heading: &[u8],
1604) -> (usize, Option<Range<usize>>) {
1605 let heading_len = heading.len();
1606 let mut ix = heading_len;
1607 ix -= scan_rev_while(heading, |b| {
1608 b == b'\n' || b == b'\r' || b == b' ' || b == b'\t'
1609 });
1610 if ix == 0 {
1611 return (heading_len, None);
1612 }
1613
1614 let attr_block_close = ix - 1;
1615 if heading.get(attr_block_close) != Some(&b'}') {
1616 // The last character is not `}`. No attribute blocks found.
1617 return (heading_len, None);
1618 }
1619 // move cursor before the closing right brace (`}`)
1620 ix -= 1;
1621
1622 ix -= scan_rev_while(&heading[..ix], |b| {
1623 // Characters to be excluded:
1624 // * `{` and `}`: special characters to open and close an attribute block.
1625 // * `\\`: a special character to escape many characters and disable some syntaxes.
1626 // + Handling of this escape character differs among markdown processors.
1627 // + Escaped characters will be separate text node from neighbors, so
1628 // it is not easy to handle unescaped string and trim the trailing block.
1629 // * `<` and `>`: special characters to start and end HTML tag.
1630 // + No known processors converts `{#<i>foo</i>}` into
1631 // `id="&lt;i&gt;foo&lt;/&gt;"` as of this writing, so hopefully
1632 // this restriction won't cause compatibility issues.
1633 // * `\n` and `\r`: a newline character.
1634 // + Setext heading can have multiple lines. However it is hard to support
1635 // attribute blocks that have newline inside, since the parsing proceeds line by
1636 // line and lines will be separate nodes even they are logically a single text.
1637 !matches!(b, b'{' | b'}' | b'<' | b'>' | b'\\' | b'\n' | b'\r')
1638 });
1639 if ix == 0 {
1640 // `{` is not found. No attribute blocks available.
1641 return (heading_len, None);
1642 }
1643 let attr_block_open = ix - 1;
1644 if heading[attr_block_open] != b'{' {
1645 // `{` is not found. No attribute blocks available.
1646 return (heading_len, None);
1647 }
1648
1649 (attr_block_open, Some(ix..attr_block_close))
1650}
1651
1652/// Parses an attribute block content, such as `.class1 #id .class2`.
1653///
1654/// Returns `(id, classes)`.
1655///
1656/// It is callers' responsibility to find opening and closing characters of the attribute
1657/// block. Usually [`extract_attribute_block_content_from_header_text`] function does it for you.
1658///
1659/// Note that this parsing requires explicit whitespace separators between
1660/// attributes. This is intentional design with the reasons below:
1661///
1662/// * to keep conversion simple and easy to understand for any possible input,
1663/// * to avoid adding less obvious conversion rule that can reduce compatibility
1664/// with other implementations more, and
1665/// * to follow the major design of implementations with the support for the
1666/// attribute blocks extension (as of this writing).
1667///
1668/// See also: [`Options::ENABLE_HEADING_ATTRIBUTES`].
1669///
1670/// [`Options::ENABLE_HEADING_ATTRIBUTES`]: `crate::Options::ENABLE_HEADING_ATTRIBUTES`
1671fn parse_inside_attribute_block(inside_attr_block: &str) -> Option<HeadingAttributes> {
1672 let mut id: Option<&str> = None;
1673 let mut classes: Vec<&str> = Vec::new();
1674
1675 for attr: &str in inside_attr_block.split_ascii_whitespace() {
1676 // iterator returned by `str::split_ascii_whitespace` never emits empty
1677 // strings, so taking first byte won't panic.
1678 if attr.len() > 1 {
1679 let first_byte: u8 = attr.as_bytes()[0];
1680 if first_byte == b'#' {
1681 id = Some(&attr[1..]);
1682 } else if first_byte == b'.' {
1683 classes.push(&attr[1..]);
1684 }
1685 }
1686 }
1687
1688 Some(HeadingAttributes { id, classes })
1689}
1690
1691#[cfg(all(target_arch = "x86_64", feature = "simd"))]
1692mod simd {
1693 //! SIMD byte scanning logic.
1694 //!
1695 //! This module provides functions that allow walking through byteslices, calling
1696 //! provided callback functions on special bytes and their indices using SIMD.
1697 //! The byteset is defined in `compute_lookup`.
1698 //!
1699 //! The idea is to load in a chunk of 16 bytes and perform a lookup into a set of
1700 //! bytes on all the bytes in this chunk simultaneously. We produce a 16 bit bitmask
1701 //! from this and call the callback on every index corresponding to a 1 in this mask
1702 //! before moving on to the next chunk. This allows us to move quickly when there
1703 //! are no or few matches.
1704 //!
1705 //! The table lookup is inspired by this [great overview]. However, since all of the
1706 //! bytes we're interested in are ASCII, we don't quite need the full generality of
1707 //! the universal algorithm and are hence able to skip a few instructions.
1708 //!
1709 //! [great overview]: http://0x80.pl/articles/simd-byte-lookup.html
1710
1711 use super::{LookupTable, LoopInstruction};
1712 use crate::Options;
1713 use core::arch::x86_64::*;
1714
1715 const VECTOR_SIZE: usize = std::mem::size_of::<__m128i>();
1716
1717 /// Generates a lookup table containing the bitmaps for our
1718 /// special marker bytes. This is effectively a 128 element 2d bitvector,
1719 /// that can be indexed by a four bit row index (the lower nibble)
1720 /// and a three bit column index (upper nibble).
1721 pub(super) fn compute_lookup(options: &Options) -> [u8; 16] {
1722 let mut lookup = [0u8; 16];
1723 let standard_bytes = [
1724 b'\n', b'\r', b'*', b'_', b'&', b'\\', b'[', b']', b'<', b'!', b'`',
1725 ];
1726
1727 for &byte in &standard_bytes {
1728 add_lookup_byte(&mut lookup, byte);
1729 }
1730 if options.contains(Options::ENABLE_TABLES) {
1731 add_lookup_byte(&mut lookup, b'|');
1732 }
1733 if options.contains(Options::ENABLE_STRIKETHROUGH) {
1734 add_lookup_byte(&mut lookup, b'~');
1735 }
1736 if options.contains(Options::ENABLE_SMART_PUNCTUATION) {
1737 for &byte in &[b'.', b'-', b'"', b'\''] {
1738 add_lookup_byte(&mut lookup, byte);
1739 }
1740 }
1741
1742 lookup
1743 }
1744
1745 fn add_lookup_byte(lookup: &mut [u8; 16], byte: u8) {
1746 lookup[(byte & 0x0f) as usize] |= 1 << (byte >> 4);
1747 }
1748
1749 /// Computes a bit mask for the given byteslice starting from the given index,
1750 /// where the 16 least significant bits indicate (by value of 1) whether or not
1751 /// there is a special character at that byte position. The least significant bit
1752 /// corresponds to `bytes[ix]` and the most significant bit corresponds to
1753 /// `bytes[ix + 15]`.
1754 /// It is only safe to call this function when `bytes.len() >= ix + VECTOR_SIZE`.
1755 #[target_feature(enable = "ssse3")]
1756 #[inline]
1757 unsafe fn compute_mask(lut: &[u8; 16], bytes: &[u8], ix: usize) -> i32 {
1758 debug_assert!(bytes.len() >= ix + VECTOR_SIZE);
1759
1760 let bitmap = _mm_loadu_si128(lut.as_ptr() as *const __m128i);
1761 // Small lookup table to compute single bit bitshifts
1762 // for 16 bytes at once.
1763 let bitmask_lookup =
1764 _mm_setr_epi8(1, 2, 4, 8, 16, 32, 64, -128, -1, -1, -1, -1, -1, -1, -1, -1);
1765
1766 // Load input from memory.
1767 let raw_ptr = bytes.as_ptr().add(ix) as *const __m128i;
1768 let input = _mm_loadu_si128(raw_ptr);
1769 // Compute the bitmap using the bottom nibble as an index
1770 // into the lookup table. Note that non-ascii bytes will have
1771 // their most significant bit set and will map to lookup[0].
1772 let bitset = _mm_shuffle_epi8(bitmap, input);
1773 // Compute the high nibbles of the input using a 16-bit rightshift of four
1774 // and a mask to prevent most-significant bit issues.
1775 let higher_nibbles = _mm_and_si128(_mm_srli_epi16(input, 4), _mm_set1_epi8(0x0f));
1776 // Create a bitmask for the bitmap by perform a left shift of the value
1777 // of the higher nibble. Bytes with their most significant set are mapped
1778 // to -1 (all ones).
1779 let bitmask = _mm_shuffle_epi8(bitmask_lookup, higher_nibbles);
1780 // Test the bit of the bitmap by AND'ing the bitmap and the mask together.
1781 let tmp = _mm_and_si128(bitset, bitmask);
1782 // Check whether the result was not null. NEQ is not a SIMD intrinsic,
1783 // but comparing to the bitmask is logically equivalent. This also prevents us
1784 // from matching any non-ASCII bytes since none of the bitmaps were all ones
1785 // (-1).
1786 let result = _mm_cmpeq_epi8(tmp, bitmask);
1787
1788 // Return the resulting bitmask.
1789 _mm_movemask_epi8(result)
1790 }
1791
1792 /// Calls callback on byte indices and their value.
1793 /// Breaks when callback returns LoopInstruction::BreakAtWith(ix, val). And skips the
1794 /// number of bytes in callback return value otherwise.
1795 /// Returns the final index and a possible break value.
1796 pub(super) fn iterate_special_bytes<F, T>(
1797 lut: &LookupTable,
1798 bytes: &[u8],
1799 ix: usize,
1800 callback: F,
1801 ) -> (usize, Option<T>)
1802 where
1803 F: FnMut(usize, u8) -> LoopInstruction<Option<T>>,
1804 {
1805 if is_x86_feature_detected!("ssse3") && bytes.len() >= VECTOR_SIZE {
1806 unsafe { simd_iterate_special_bytes(&lut.simd, bytes, ix, callback) }
1807 } else {
1808 super::scalar_iterate_special_bytes(&lut.scalar, bytes, ix, callback)
1809 }
1810 }
1811
1812 /// Calls the callback function for every 1 in the given bitmask with
1813 /// the index `offset + ix`, where `ix` is the position of the 1 in the mask.
1814 /// Returns `Ok(ix)` to continue from index `ix`, `Err((end_ix, opt_val)` to break with
1815 /// final index `end_ix` and optional value `opt_val`.
1816 unsafe fn process_mask<F, T>(
1817 mut mask: i32,
1818 bytes: &[u8],
1819 mut offset: usize,
1820 callback: &mut F,
1821 ) -> Result<usize, (usize, Option<T>)>
1822 where
1823 F: FnMut(usize, u8) -> LoopInstruction<Option<T>>,
1824 {
1825 while mask != 0 {
1826 let mask_ix = mask.trailing_zeros() as usize;
1827 offset += mask_ix;
1828 match callback(offset, *bytes.get_unchecked(offset)) {
1829 LoopInstruction::ContinueAndSkip(skip) => {
1830 offset += skip + 1;
1831 mask >>= skip + 1 + mask_ix;
1832 }
1833 LoopInstruction::BreakAtWith(ix, val) => return Err((ix, val)),
1834 }
1835 }
1836 Ok(offset)
1837 }
1838
1839 #[target_feature(enable = "ssse3")]
1840 /// Important: only call this function when `bytes.len() >= 16`. Doing
1841 /// so otherwise may exhibit undefined behaviour.
1842 unsafe fn simd_iterate_special_bytes<F, T>(
1843 lut: &[u8; 16],
1844 bytes: &[u8],
1845 mut ix: usize,
1846 mut callback: F,
1847 ) -> (usize, Option<T>)
1848 where
1849 F: FnMut(usize, u8) -> LoopInstruction<Option<T>>,
1850 {
1851 debug_assert!(bytes.len() >= VECTOR_SIZE);
1852 let upperbound = bytes.len() - VECTOR_SIZE;
1853
1854 while ix < upperbound {
1855 let mask = compute_mask(lut, bytes, ix);
1856 let block_start = ix;
1857 ix = match process_mask(mask, bytes, ix, &mut callback) {
1858 Ok(ix) => std::cmp::max(ix, VECTOR_SIZE + block_start),
1859 Err((end_ix, val)) => return (end_ix, val),
1860 };
1861 }
1862
1863 if bytes.len() > ix {
1864 // shift off the bytes at start we have already scanned
1865 let mask = compute_mask(lut, bytes, upperbound) >> ix - upperbound;
1866 if let Err((end_ix, val)) = process_mask(mask, bytes, ix, &mut callback) {
1867 return (end_ix, val);
1868 }
1869 }
1870
1871 (bytes.len(), None)
1872 }
1873
1874 #[cfg(test)]
1875 mod simd_test {
1876 use super::super::create_lut;
1877 use super::{iterate_special_bytes, LoopInstruction};
1878 use crate::Options;
1879
1880 fn check_expected_indices(bytes: &[u8], expected: &[usize], skip: usize) {
1881 let mut opts = Options::empty();
1882 opts.insert(Options::ENABLE_TABLES);
1883 opts.insert(Options::ENABLE_FOOTNOTES);
1884 opts.insert(Options::ENABLE_STRIKETHROUGH);
1885 opts.insert(Options::ENABLE_TASKLISTS);
1886
1887 let lut = create_lut(&opts);
1888 let mut indices = vec![];
1889
1890 iterate_special_bytes::<_, i32>(&lut, bytes, 0, |ix, _byte_ty| {
1891 indices.push(ix);
1892 LoopInstruction::ContinueAndSkip(skip)
1893 });
1894
1895 assert_eq!(&indices[..], expected);
1896 }
1897
1898 #[test]
1899 fn simple_no_match() {
1900 check_expected_indices("abcdef0123456789".as_bytes(), &[], 0);
1901 }
1902
1903 #[test]
1904 fn simple_match() {
1905 check_expected_indices("*bcd&f0123456789".as_bytes(), &[0, 4], 0);
1906 }
1907
1908 #[test]
1909 fn single_open_fish() {
1910 check_expected_indices("<".as_bytes(), &[0], 0);
1911 }
1912
1913 #[test]
1914 fn long_match() {
1915 check_expected_indices("0123456789abcde~*bcd&f0".as_bytes(), &[15, 16, 20], 0);
1916 }
1917
1918 #[test]
1919 fn border_skip() {
1920 check_expected_indices("0123456789abcde~~~~d&f0".as_bytes(), &[15, 20], 3);
1921 }
1922
1923 #[test]
1924 fn exhaustive_search() {
1925 let chars = [
1926 b'\n', b'\r', b'*', b'_', b'~', b'|', b'&', b'\\', b'[', b']', b'<', b'!', b'`',
1927 ];
1928
1929 for &c in &chars {
1930 for i in 0u8..=255 {
1931 if !chars.contains(&i) {
1932 // full match
1933 let mut buf = [i; 18];
1934 buf[3] = c;
1935 buf[6] = c;
1936
1937 check_expected_indices(&buf[..], &[3, 6], 0);
1938 }
1939 }
1940 }
1941 }
1942 }
1943}
1944