1// Copyright 2017 Google Inc. All rights reserved.
2//
3// Permission is hereby granted, free of charge, to any person obtaining a copy
4// of this software and associated documentation files (the "Software"), to deal
5// in the Software without restriction, including without limitation the rights
6// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7// copies of the Software, and to permit persons to whom the Software is
8// furnished to do so, subject to the following conditions:
9//
10// The above copyright notice and this permission notice shall be included in
11// all copies or substantial portions of the Software.
12//
13// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19// THE SOFTWARE.
20
21//! Tree-based two pass parser.
22
23use std::cmp::{max, min};
24use std::collections::{HashMap, VecDeque};
25use std::iter::FusedIterator;
26use std::num::NonZeroUsize;
27use std::ops::{Index, Range};
28
29use unicase::UniCase;
30
31use crate::firstpass::run_first_pass;
32use crate::linklabel::{scan_link_label_rest, LinkLabel, ReferenceLabel};
33use crate::scanners::*;
34use crate::strings::CowStr;
35use crate::tree::{Tree, TreeIndex};
36use crate::{Alignment, CodeBlockKind, Event, HeadingLevel, LinkType, Options, Tag};
37
38// Allowing arbitrary depth nested parentheses inside link destinations
39// can create denial of service vulnerabilities if we're not careful.
40// The simplest countermeasure is to limit their depth, which is
41// explicitly allowed by the spec as long as the limit is at least 3:
42// https://spec.commonmark.org/0.29/#link-destination
43const LINK_MAX_NESTED_PARENS: usize = 5;
44
45#[derive(Debug, Default, Clone, Copy)]
46pub(crate) struct Item {
47 pub start: usize,
48 pub end: usize,
49 pub body: ItemBody,
50}
51
52#[derive(Debug, PartialEq, Clone, Copy)]
53pub(crate) enum ItemBody {
54 Paragraph,
55 Text,
56 SoftBreak,
57 HardBreak,
58
59 // These are possible inline items, need to be resolved in second pass.
60
61 // repeats, can_open, can_close
62 MaybeEmphasis(usize, bool, bool),
63 // quote byte, can_open, can_close
64 MaybeSmartQuote(u8, bool, bool),
65 MaybeCode(usize, bool), // number of backticks, preceded by backslash
66 MaybeHtml,
67 MaybeLinkOpen,
68 // bool indicates whether or not the preceding section could be a reference
69 MaybeLinkClose(bool),
70 MaybeImage,
71
72 // These are inline items after resolution.
73 Emphasis,
74 Strong,
75 Strikethrough,
76 Code(CowIndex),
77 Link(LinkIndex),
78 Image(LinkIndex),
79 FootnoteReference(CowIndex),
80 TaskListMarker(bool), // true for checked
81
82 Rule,
83 Heading(HeadingLevel, Option<HeadingIndex>), // heading level
84 FencedCodeBlock(CowIndex),
85 IndentCodeBlock,
86 Html,
87 OwnedHtml(CowIndex),
88 BlockQuote,
89 List(bool, u8, u64), // is_tight, list character, list start index
90 ListItem(usize), // indent level
91 SynthesizeText(CowIndex),
92 SynthesizeChar(char),
93 FootnoteDefinition(CowIndex),
94
95 // Tables
96 Table(AlignmentIndex),
97 TableHead,
98 TableRow,
99 TableCell,
100
101 // Dummy node at the top of the tree - should not be used otherwise!
102 Root,
103}
104
105impl<'a> ItemBody {
106 fn is_inline(&self) -> bool {
107 matches!(
108 *self,
109 ItemBody::MaybeEmphasis(..)
110 | ItemBody::MaybeSmartQuote(..)
111 | ItemBody::MaybeHtml
112 | ItemBody::MaybeCode(..)
113 | ItemBody::MaybeLinkOpen
114 | ItemBody::MaybeLinkClose(..)
115 | ItemBody::MaybeImage
116 )
117 }
118}
119
120impl<'a> Default for ItemBody {
121 fn default() -> Self {
122 ItemBody::Root
123 }
124}
125
126#[derive(Debug)]
127pub struct BrokenLink<'a> {
128 pub span: std::ops::Range<usize>,
129 pub link_type: LinkType,
130 pub reference: CowStr<'a>,
131}
132
133/// Markdown event iterator.
134pub struct Parser<'input, 'callback> {
135 text: &'input str,
136 options: Options,
137 tree: Tree<Item>,
138 allocs: Allocations<'input>,
139 broken_link_callback: BrokenLinkCallback<'input, 'callback>,
140 html_scan_guard: HtmlScanGuard,
141
142 // used by inline passes. store them here for reuse
143 inline_stack: InlineStack,
144 link_stack: LinkStack,
145}
146
147impl<'input, 'callback> std::fmt::Debug for Parser<'input, 'callback> {
148 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
149 // Only print the fileds that have public types.
150 f&mut DebugStruct<'_, '_>.debug_struct("Parser")
151 .field("text", &self.text)
152 .field("options", &self.options)
153 .field(
154 name:"broken_link_callback",
155 &self.broken_link_callback.as_ref().map(|_| ..),
156 )
157 .finish()
158 }
159}
160
161impl<'input, 'callback> Parser<'input, 'callback> {
162 /// Creates a new event iterator for a markdown string without any options enabled.
163 pub fn new(text: &'input str) -> Self {
164 Parser::new_ext(text, Options::empty())
165 }
166
167 /// Creates a new event iterator for a markdown string with given options.
168 pub fn new_ext(text: &'input str, options: Options) -> Self {
169 Parser::new_with_broken_link_callback(text, options, None)
170 }
171
172 /// In case the parser encounters any potential links that have a broken
173 /// reference (e.g `[foo]` when there is no `[foo]: ` entry at the bottom)
174 /// the provided callback will be called with the reference name,
175 /// and the returned pair will be used as the link name and title if it is not
176 /// `None`.
177 pub fn new_with_broken_link_callback(
178 text: &'input str,
179 options: Options,
180 broken_link_callback: BrokenLinkCallback<'input, 'callback>,
181 ) -> Self {
182 let (mut tree, allocs) = run_first_pass(text, options);
183 tree.reset();
184 let inline_stack = Default::default();
185 let link_stack = Default::default();
186 let html_scan_guard = Default::default();
187 Parser {
188 text,
189 options,
190 tree,
191 allocs,
192 broken_link_callback,
193 inline_stack,
194 link_stack,
195 html_scan_guard,
196 }
197 }
198
199 /// Returns a reference to the internal `RefDefs` object, which provides access
200 /// to the internal map of reference definitions.
201 pub fn reference_definitions(&self) -> &RefDefs {
202 &self.allocs.refdefs
203 }
204
205 /// Handle inline markup.
206 ///
207 /// When the parser encounters any item indicating potential inline markup, all
208 /// inline markup passes are run on the remainder of the chain.
209 ///
210 /// Note: there's some potential for optimization here, but that's future work.
211 fn handle_inline(&mut self) {
212 self.handle_inline_pass1();
213 self.handle_emphasis();
214 }
215
216 /// Handle inline HTML, code spans, and links.
217 ///
218 /// This function handles both inline HTML and code spans, because they have
219 /// the same precedence. It also handles links, even though they have lower
220 /// precedence, because the URL of links must not be processed.
221 fn handle_inline_pass1(&mut self) {
222 let mut code_delims = CodeDelims::new();
223 let mut cur = self.tree.cur();
224 let mut prev = None;
225
226 let block_end = self.tree[self.tree.peek_up().unwrap()].item.end;
227 let block_text = &self.text[..block_end];
228
229 while let Some(mut cur_ix) = cur {
230 match self.tree[cur_ix].item.body {
231 ItemBody::MaybeHtml => {
232 let next = self.tree[cur_ix].next;
233 let autolink = if let Some(next_ix) = next {
234 scan_autolink(block_text, self.tree[next_ix].item.start)
235 } else {
236 None
237 };
238
239 if let Some((ix, uri, link_type)) = autolink {
240 let node = scan_nodes_to_ix(&self.tree, next, ix);
241 let text_node = self.tree.create_node(Item {
242 start: self.tree[cur_ix].item.start + 1,
243 end: ix - 1,
244 body: ItemBody::Text,
245 });
246 let link_ix = self.allocs.allocate_link(link_type, uri, "".into());
247 self.tree[cur_ix].item.body = ItemBody::Link(link_ix);
248 self.tree[cur_ix].item.end = ix;
249 self.tree[cur_ix].next = node;
250 self.tree[cur_ix].child = Some(text_node);
251 prev = cur;
252 cur = node;
253 if let Some(node_ix) = cur {
254 self.tree[node_ix].item.start = max(self.tree[node_ix].item.start, ix);
255 }
256 continue;
257 } else {
258 let inline_html = next.and_then(|next_ix| {
259 self.scan_inline_html(
260 block_text.as_bytes(),
261 self.tree[next_ix].item.start,
262 )
263 });
264 if let Some((span, ix)) = inline_html {
265 let node = scan_nodes_to_ix(&self.tree, next, ix);
266 self.tree[cur_ix].item.body = if !span.is_empty() {
267 let converted_string =
268 String::from_utf8(span).expect("invalid utf8");
269 ItemBody::OwnedHtml(
270 self.allocs.allocate_cow(converted_string.into()),
271 )
272 } else {
273 ItemBody::Html
274 };
275 self.tree[cur_ix].item.end = ix;
276 self.tree[cur_ix].next = node;
277 prev = cur;
278 cur = node;
279 if let Some(node_ix) = cur {
280 self.tree[node_ix].item.start =
281 max(self.tree[node_ix].item.start, ix);
282 }
283 continue;
284 }
285 }
286 self.tree[cur_ix].item.body = ItemBody::Text;
287 }
288 ItemBody::MaybeCode(mut search_count, preceded_by_backslash) => {
289 if preceded_by_backslash {
290 search_count -= 1;
291 if search_count == 0 {
292 self.tree[cur_ix].item.body = ItemBody::Text;
293 prev = cur;
294 cur = self.tree[cur_ix].next;
295 continue;
296 }
297 }
298
299 if code_delims.is_populated() {
300 // we have previously scanned all codeblock delimiters,
301 // so we can reuse that work
302 if let Some(scan_ix) = code_delims.find(cur_ix, search_count) {
303 self.make_code_span(cur_ix, scan_ix, preceded_by_backslash);
304 } else {
305 self.tree[cur_ix].item.body = ItemBody::Text;
306 }
307 } else {
308 // we haven't previously scanned all codeblock delimiters,
309 // so walk the AST
310 let mut scan = if search_count > 0 {
311 self.tree[cur_ix].next
312 } else {
313 None
314 };
315 while let Some(scan_ix) = scan {
316 if let ItemBody::MaybeCode(delim_count, _) =
317 self.tree[scan_ix].item.body
318 {
319 if search_count == delim_count {
320 self.make_code_span(cur_ix, scan_ix, preceded_by_backslash);
321 code_delims.clear();
322 break;
323 } else {
324 code_delims.insert(delim_count, scan_ix);
325 }
326 }
327 scan = self.tree[scan_ix].next;
328 }
329 if scan == None {
330 self.tree[cur_ix].item.body = ItemBody::Text;
331 }
332 }
333 }
334 ItemBody::MaybeLinkOpen => {
335 self.tree[cur_ix].item.body = ItemBody::Text;
336 self.link_stack.push(LinkStackEl {
337 node: cur_ix,
338 ty: LinkStackTy::Link,
339 });
340 }
341 ItemBody::MaybeImage => {
342 self.tree[cur_ix].item.body = ItemBody::Text;
343 self.link_stack.push(LinkStackEl {
344 node: cur_ix,
345 ty: LinkStackTy::Image,
346 });
347 }
348 ItemBody::MaybeLinkClose(could_be_ref) => {
349 self.tree[cur_ix].item.body = ItemBody::Text;
350 if let Some(tos) = self.link_stack.pop() {
351 if tos.ty == LinkStackTy::Disabled {
352 continue;
353 }
354 let next = self.tree[cur_ix].next;
355 if let Some((next_ix, url, title)) =
356 self.scan_inline_link(block_text, self.tree[cur_ix].item.end, next)
357 {
358 let next_node = scan_nodes_to_ix(&self.tree, next, next_ix);
359 if let Some(prev_ix) = prev {
360 self.tree[prev_ix].next = None;
361 }
362 cur = Some(tos.node);
363 cur_ix = tos.node;
364 let link_ix = self.allocs.allocate_link(LinkType::Inline, url, title);
365 self.tree[cur_ix].item.body = if tos.ty == LinkStackTy::Image {
366 ItemBody::Image(link_ix)
367 } else {
368 ItemBody::Link(link_ix)
369 };
370 self.tree[cur_ix].child = self.tree[cur_ix].next;
371 self.tree[cur_ix].next = next_node;
372 self.tree[cur_ix].item.end = next_ix;
373 if let Some(next_node_ix) = next_node {
374 self.tree[next_node_ix].item.start =
375 max(self.tree[next_node_ix].item.start, next_ix);
376 }
377
378 if tos.ty == LinkStackTy::Link {
379 self.link_stack.disable_all_links();
380 }
381 } else {
382 // ok, so its not an inline link. maybe it is a reference
383 // to a defined link?
384 let scan_result = scan_reference(
385 &self.tree,
386 block_text,
387 next,
388 self.options.contains(Options::ENABLE_FOOTNOTES),
389 );
390 let (node_after_link, link_type) = match scan_result {
391 // [label][reference]
392 RefScan::LinkLabel(_, end_ix) => {
393 // Toggle reference viability of the last closing bracket,
394 // so that we can skip it on future iterations in case
395 // it fails in this one. In particular, we won't call
396 // the broken link callback twice on one reference.
397 let reference_close_node = if let Some(node) =
398 scan_nodes_to_ix(&self.tree, next, end_ix - 1)
399 {
400 node
401 } else {
402 continue;
403 };
404 self.tree[reference_close_node].item.body =
405 ItemBody::MaybeLinkClose(false);
406 let next_node = self.tree[reference_close_node].next;
407
408 (next_node, LinkType::Reference)
409 }
410 // [reference][]
411 RefScan::Collapsed(next_node) => {
412 // This reference has already been tried, and it's not
413 // valid. Skip it.
414 if !could_be_ref {
415 continue;
416 }
417 (next_node, LinkType::Collapsed)
418 }
419 // [shortcut]
420 //
421 // [shortcut]: /blah
422 RefScan::Failed => {
423 if !could_be_ref {
424 continue;
425 }
426 (next, LinkType::Shortcut)
427 }
428 };
429
430 // FIXME: references and labels are mixed in the naming of variables
431 // below. Disambiguate!
432
433 // (label, source_ix end)
434 let label: Option<(ReferenceLabel<'input>, usize)> = match scan_result {
435 RefScan::LinkLabel(l, end_ix) => {
436 Some((ReferenceLabel::Link(l), end_ix))
437 }
438 RefScan::Collapsed(..) | RefScan::Failed => {
439 // No label? maybe it is a shortcut reference
440 let label_start = self.tree[tos.node].item.end - 1;
441 let label_end = self.tree[cur_ix].item.end;
442 scan_link_label(
443 &self.tree,
444 &self.text[label_start..label_end],
445 self.options.contains(Options::ENABLE_FOOTNOTES),
446 )
447 .map(|(ix, label)| (label, label_start + ix))
448 .filter(|(_, end)| *end == label_end)
449 }
450 };
451
452 // see if it's a footnote reference
453 if let Some((ReferenceLabel::Footnote(l), end)) = label {
454 self.tree[tos.node].next = node_after_link;
455 self.tree[tos.node].child = None;
456 self.tree[tos.node].item.body =
457 ItemBody::FootnoteReference(self.allocs.allocate_cow(l));
458 self.tree[tos.node].item.end = end;
459 prev = Some(tos.node);
460 cur = node_after_link;
461 self.link_stack.clear();
462 continue;
463 } else if let Some((ReferenceLabel::Link(link_label), end)) = label {
464 let type_url_title = self
465 .allocs
466 .refdefs
467 .get(link_label.as_ref())
468 .map(|matching_def| {
469 // found a matching definition!
470 let title = matching_def
471 .title
472 .as_ref()
473 .cloned()
474 .unwrap_or_else(|| "".into());
475 let url = matching_def.dest.clone();
476 (link_type, url, title)
477 })
478 .or_else(|| {
479 match self.broken_link_callback.as_mut() {
480 Some(callback) => {
481 // Construct a BrokenLink struct, which will be passed to the callback
482 let broken_link = BrokenLink {
483 span: (self.tree[tos.node].item.start)..end,
484 link_type,
485 reference: link_label,
486 };
487
488 callback(broken_link).map(|(url, title)| {
489 (link_type.to_unknown(), url, title)
490 })
491 }
492 None => None,
493 }
494 });
495
496 if let Some((def_link_type, url, title)) = type_url_title {
497 let link_ix =
498 self.allocs.allocate_link(def_link_type, url, title);
499 self.tree[tos.node].item.body = if tos.ty == LinkStackTy::Image
500 {
501 ItemBody::Image(link_ix)
502 } else {
503 ItemBody::Link(link_ix)
504 };
505 let label_node = self.tree[tos.node].next;
506
507 // lets do some tree surgery to add the link to the tree
508 // 1st: skip the label node and close node
509 self.tree[tos.node].next = node_after_link;
510
511 // then, if it exists, add the label node as a child to the link node
512 if label_node != cur {
513 self.tree[tos.node].child = label_node;
514
515 // finally: disconnect list of children
516 if let Some(prev_ix) = prev {
517 self.tree[prev_ix].next = None;
518 }
519 }
520
521 self.tree[tos.node].item.end = end;
522
523 // set up cur so next node will be node_after_link
524 cur = Some(tos.node);
525 cur_ix = tos.node;
526
527 if tos.ty == LinkStackTy::Link {
528 self.link_stack.disable_all_links();
529 }
530 }
531 }
532 }
533 }
534 }
535 _ => (),
536 }
537 prev = cur;
538 cur = self.tree[cur_ix].next;
539 }
540 self.link_stack.clear();
541 }
542
543 fn handle_emphasis(&mut self) {
544 let mut prev = None;
545 let mut prev_ix: TreeIndex;
546 let mut cur = self.tree.cur();
547
548 let mut single_quote_open: Option<TreeIndex> = None;
549 let mut double_quote_open: bool = false;
550
551 while let Some(mut cur_ix) = cur {
552 match self.tree[cur_ix].item.body {
553 ItemBody::MaybeEmphasis(mut count, can_open, can_close) => {
554 let c = self.text.as_bytes()[self.tree[cur_ix].item.start];
555 let both = can_open && can_close;
556 if can_close {
557 while let Some(el) =
558 self.inline_stack.find_match(&mut self.tree, c, count, both)
559 {
560 // have a match!
561 if let Some(prev_ix) = prev {
562 self.tree[prev_ix].next = None;
563 }
564 let match_count = min(count, el.count);
565 // start, end are tree node indices
566 let mut end = cur_ix - 1;
567 let mut start = el.start + el.count;
568
569 // work from the inside out
570 while start > el.start + el.count - match_count {
571 let inc = if start > el.start + el.count - match_count + 1 {
572 2
573 } else {
574 1
575 };
576 let ty = if c == b'~' {
577 ItemBody::Strikethrough
578 } else if inc == 2 {
579 ItemBody::Strong
580 } else {
581 ItemBody::Emphasis
582 };
583
584 let root = start - inc;
585 end = end + inc;
586 self.tree[root].item.body = ty;
587 self.tree[root].item.end = self.tree[end].item.end;
588 self.tree[root].child = Some(start);
589 self.tree[root].next = None;
590 start = root;
591 }
592
593 // set next for top most emph level
594 prev_ix = el.start + el.count - match_count;
595 prev = Some(prev_ix);
596 cur = self.tree[cur_ix + match_count - 1].next;
597 self.tree[prev_ix].next = cur;
598
599 if el.count > match_count {
600 self.inline_stack.push(InlineEl {
601 start: el.start,
602 count: el.count - match_count,
603 c: el.c,
604 both,
605 })
606 }
607 count -= match_count;
608 if count > 0 {
609 cur_ix = cur.unwrap();
610 } else {
611 break;
612 }
613 }
614 }
615 if count > 0 {
616 if can_open {
617 self.inline_stack.push(InlineEl {
618 start: cur_ix,
619 count,
620 c,
621 both,
622 });
623 } else {
624 for i in 0..count {
625 self.tree[cur_ix + i].item.body = ItemBody::Text;
626 }
627 }
628 prev_ix = cur_ix + count - 1;
629 prev = Some(prev_ix);
630 cur = self.tree[prev_ix].next;
631 }
632 }
633 ItemBody::MaybeSmartQuote(c, can_open, can_close) => {
634 self.tree[cur_ix].item.body = match c {
635 b'\'' => {
636 if let (Some(open_ix), true) = (single_quote_open, can_close) {
637 self.tree[open_ix].item.body = ItemBody::SynthesizeChar('‘');
638 single_quote_open = None;
639 } else if can_open {
640 single_quote_open = Some(cur_ix);
641 }
642 ItemBody::SynthesizeChar('’')
643 }
644 _ /* double quote */ => {
645 if can_close && double_quote_open {
646 double_quote_open = false;
647 ItemBody::SynthesizeChar('”')
648 } else {
649 if can_open && !double_quote_open {
650 double_quote_open = true;
651 }
652 ItemBody::SynthesizeChar('“')
653 }
654 }
655 };
656 prev = cur;
657 cur = self.tree[cur_ix].next;
658 }
659 _ => {
660 prev = cur;
661 cur = self.tree[cur_ix].next;
662 }
663 }
664 }
665 self.inline_stack.pop_all(&mut self.tree);
666 }
667
668 /// Returns next byte index, url and title.
669 fn scan_inline_link(
670 &self,
671 underlying: &'input str,
672 mut ix: usize,
673 node: Option<TreeIndex>,
674 ) -> Option<(usize, CowStr<'input>, CowStr<'input>)> {
675 if scan_ch(&underlying.as_bytes()[ix..], b'(') == 0 {
676 return None;
677 }
678 ix += 1;
679 ix += scan_while(&underlying.as_bytes()[ix..], is_ascii_whitespace);
680
681 let (dest_length, dest) = scan_link_dest(underlying, ix, LINK_MAX_NESTED_PARENS)?;
682 let dest = unescape(dest);
683 ix += dest_length;
684
685 ix += scan_while(&underlying.as_bytes()[ix..], is_ascii_whitespace);
686
687 let title = if let Some((bytes_scanned, t)) = self.scan_link_title(underlying, ix, node) {
688 ix += bytes_scanned;
689 ix += scan_while(&underlying.as_bytes()[ix..], is_ascii_whitespace);
690 t
691 } else {
692 "".into()
693 };
694 if scan_ch(&underlying.as_bytes()[ix..], b')') == 0 {
695 return None;
696 }
697 ix += 1;
698
699 Some((ix, dest, title))
700 }
701
702 // returns (bytes scanned, title cow)
703 fn scan_link_title(
704 &self,
705 text: &'input str,
706 start_ix: usize,
707 node: Option<TreeIndex>,
708 ) -> Option<(usize, CowStr<'input>)> {
709 let bytes = text.as_bytes();
710 let open = match bytes.get(start_ix) {
711 Some(b @ b'\'') | Some(b @ b'\"') | Some(b @ b'(') => *b,
712 _ => return None,
713 };
714 let close = if open == b'(' { b')' } else { open };
715
716 let mut title = String::new();
717 let mut mark = start_ix + 1;
718 let mut i = start_ix + 1;
719
720 while i < bytes.len() {
721 let c = bytes[i];
722
723 if c == close {
724 let cow = if mark == 1 {
725 (i - start_ix + 1, text[mark..i].into())
726 } else {
727 title.push_str(&text[mark..i]);
728 (i - start_ix + 1, title.into())
729 };
730
731 return Some(cow);
732 }
733 if c == open {
734 return None;
735 }
736
737 if c == b'\n' || c == b'\r' {
738 if let Some(node_ix) = scan_nodes_to_ix(&self.tree, node, i + 1) {
739 if self.tree[node_ix].item.start > i {
740 title.push_str(&text[mark..i]);
741 title.push('\n');
742 i = self.tree[node_ix].item.start;
743 mark = i;
744 continue;
745 }
746 }
747 }
748 if c == b'&' {
749 if let (n, Some(value)) = scan_entity(&bytes[i..]) {
750 title.push_str(&text[mark..i]);
751 title.push_str(&value);
752 i += n;
753 mark = i;
754 continue;
755 }
756 }
757 if c == b'\\' && i + 1 < bytes.len() && is_ascii_punctuation(bytes[i + 1]) {
758 title.push_str(&text[mark..i]);
759 i += 1;
760 mark = i;
761 }
762
763 i += 1;
764 }
765
766 None
767 }
768
769 /// Make a code span.
770 ///
771 /// Both `open` and `close` are matching MaybeCode items.
772 fn make_code_span(&mut self, open: TreeIndex, close: TreeIndex, preceding_backslash: bool) {
773 let first_ix = self.tree[open].next.unwrap();
774 let bytes = self.text.as_bytes();
775 let mut span_start = self.tree[open].item.end;
776 let mut span_end = self.tree[close].item.start;
777 let mut buf: Option<String> = None;
778
779 // detect all-space sequences, since they are kept as-is as of commonmark 0.29
780 if !bytes[span_start..span_end].iter().all(|&b| b == b' ') {
781 let opening = matches!(bytes[span_start], b' ' | b'\r' | b'\n');
782 let closing = matches!(bytes[span_end - 1], b' ' | b'\r' | b'\n');
783 let drop_enclosing_whitespace = opening && closing;
784
785 if drop_enclosing_whitespace {
786 span_start += 1;
787 if span_start < span_end {
788 span_end -= 1;
789 }
790 }
791
792 let mut ix = first_ix;
793
794 while ix != close {
795 let next_ix = self.tree[ix].next.unwrap();
796 if let ItemBody::HardBreak | ItemBody::SoftBreak = self.tree[ix].item.body {
797 if drop_enclosing_whitespace {
798 // check whether break should be ignored
799 if ix == first_ix {
800 ix = next_ix;
801 span_start = min(span_end, self.tree[ix].item.start);
802 continue;
803 } else if next_ix == close && ix > first_ix {
804 break;
805 }
806 }
807
808 let end = bytes[self.tree[ix].item.start..]
809 .iter()
810 .position(|&b| b == b'\r' || b == b'\n')
811 .unwrap()
812 + self.tree[ix].item.start;
813 if let Some(ref mut buf) = buf {
814 buf.push_str(&self.text[self.tree[ix].item.start..end]);
815 buf.push(' ');
816 } else {
817 let mut new_buf = String::with_capacity(span_end - span_start);
818 new_buf.push_str(&self.text[span_start..end]);
819 new_buf.push(' ');
820 buf = Some(new_buf);
821 }
822 } else if let Some(ref mut buf) = buf {
823 let end = if next_ix == close {
824 span_end
825 } else {
826 self.tree[ix].item.end
827 };
828 buf.push_str(&self.text[self.tree[ix].item.start..end]);
829 }
830 ix = next_ix;
831 }
832 }
833
834 let cow = if let Some(buf) = buf {
835 buf.into()
836 } else {
837 self.text[span_start..span_end].into()
838 };
839 if preceding_backslash {
840 self.tree[open].item.body = ItemBody::Text;
841 self.tree[open].item.end = self.tree[open].item.start + 1;
842 self.tree[open].next = Some(close);
843 self.tree[close].item.body = ItemBody::Code(self.allocs.allocate_cow(cow));
844 self.tree[close].item.start = self.tree[open].item.start + 1;
845 } else {
846 self.tree[open].item.body = ItemBody::Code(self.allocs.allocate_cow(cow));
847 self.tree[open].item.end = self.tree[close].item.end;
848 self.tree[open].next = self.tree[close].next;
849 }
850 }
851
852 /// On success, returns a buffer containing the inline html and byte offset.
853 /// When no bytes were skipped, the buffer will be empty and the html can be
854 /// represented as a subslice of the input string.
855 fn scan_inline_html(&mut self, bytes: &[u8], ix: usize) -> Option<(Vec<u8>, usize)> {
856 let c = *bytes.get(ix)?;
857 if c == b'!' {
858 Some((
859 vec![],
860 scan_inline_html_comment(bytes, ix + 1, &mut self.html_scan_guard)?,
861 ))
862 } else if c == b'?' {
863 Some((
864 vec![],
865 scan_inline_html_processing(bytes, ix + 1, &mut self.html_scan_guard)?,
866 ))
867 } else {
868 let (span, i) = scan_html_block_inner(
869 // Subtract 1 to include the < character
870 &bytes[(ix - 1)..],
871 Some(&|bytes| {
872 let mut line_start = LineStart::new(bytes);
873 let _ = scan_containers(&self.tree, &mut line_start);
874 line_start.bytes_scanned()
875 }),
876 )?;
877 Some((span, i + ix - 1))
878 }
879 }
880
881 /// Consumes the event iterator and produces an iterator that produces
882 /// `(Event, Range)` pairs, where the `Range` value maps to the corresponding
883 /// range in the markdown source.
884 pub fn into_offset_iter(self) -> OffsetIter<'input, 'callback> {
885 OffsetIter { inner: self }
886 }
887}
888
889/// Returns number of containers scanned.
890pub(crate) fn scan_containers(tree: &Tree<Item>, line_start: &mut LineStart) -> usize {
891 let mut i: usize = 0;
892 for &node_ix: TreeIndex in tree.walk_spine() {
893 match tree[node_ix].item.body {
894 ItemBody::BlockQuote => {
895 // `scan_blockquote_marker` saves & restores internally
896 if !line_start.scan_blockquote_marker() {
897 break;
898 }
899 }
900 ItemBody::ListItem(indent: usize) => {
901 let save: LineStart<'_> = line_start.clone();
902 if !line_start.scan_space(n_space:indent) && !line_start.is_at_eol() {
903 *line_start = save;
904 break;
905 }
906 }
907 _ => (),
908 }
909 i += 1;
910 }
911 i
912}
913
914impl<'a> Tree<Item> {
915 pub(crate) fn append_text(&mut self, start: usize, end: usize) {
916 if end > start {
917 if let Some(ix: TreeIndex) = self.cur() {
918 if ItemBody::Text == self[ix].item.body && self[ix].item.end == start {
919 self[ix].item.end = end;
920 return;
921 }
922 }
923 self.append(Item {
924 start,
925 end,
926 body: ItemBody::Text,
927 });
928 }
929 }
930}
931
932#[derive(Copy, Clone, Debug)]
933struct InlineEl {
934 start: TreeIndex, // offset of tree node
935 count: usize,
936 c: u8, // b'*' or b'_'
937 both: bool, // can both open and close
938}
939
940#[derive(Debug, Clone, Default)]
941struct InlineStack {
942 stack: Vec<InlineEl>,
943 // Lower bounds for matching indices in the stack. For example
944 // a strikethrough delimiter will never match with any element
945 // in the stack with index smaller than
946 // `lower_bounds[InlineStack::TILDES]`.
947 lower_bounds: [usize; 7],
948}
949
950impl InlineStack {
951 /// These are indices into the lower bounds array.
952 /// Not both refers to the property that the delimiter can not both
953 /// be opener as a closer.
954 const UNDERSCORE_NOT_BOTH: usize = 0;
955 const ASTERISK_NOT_BOTH: usize = 1;
956 const ASTERISK_BASE: usize = 2;
957 const TILDES: usize = 5;
958 const UNDERSCORE_BOTH: usize = 6;
959
960 fn pop_all(&mut self, tree: &mut Tree<Item>) {
961 for el in self.stack.drain(..) {
962 for i in 0..el.count {
963 tree[el.start + i].item.body = ItemBody::Text;
964 }
965 }
966 self.lower_bounds = [0; 7];
967 }
968
969 fn get_lowerbound(&self, c: u8, count: usize, both: bool) -> usize {
970 if c == b'_' {
971 if both {
972 self.lower_bounds[InlineStack::UNDERSCORE_BOTH]
973 } else {
974 self.lower_bounds[InlineStack::UNDERSCORE_NOT_BOTH]
975 }
976 } else if c == b'*' {
977 let mod3_lower = self.lower_bounds[InlineStack::ASTERISK_BASE + count % 3];
978 if both {
979 mod3_lower
980 } else {
981 min(
982 mod3_lower,
983 self.lower_bounds[InlineStack::ASTERISK_NOT_BOTH],
984 )
985 }
986 } else {
987 self.lower_bounds[InlineStack::TILDES]
988 }
989 }
990
991 fn set_lowerbound(&mut self, c: u8, count: usize, both: bool, new_bound: usize) {
992 if c == b'_' {
993 if both {
994 self.lower_bounds[InlineStack::UNDERSCORE_BOTH] = new_bound;
995 } else {
996 self.lower_bounds[InlineStack::UNDERSCORE_NOT_BOTH] = new_bound;
997 }
998 } else if c == b'*' {
999 self.lower_bounds[InlineStack::ASTERISK_BASE + count % 3] = new_bound;
1000 if !both {
1001 self.lower_bounds[InlineStack::ASTERISK_NOT_BOTH] = new_bound;
1002 }
1003 } else {
1004 self.lower_bounds[InlineStack::TILDES] = new_bound;
1005 }
1006 }
1007
1008 fn find_match(
1009 &mut self,
1010 tree: &mut Tree<Item>,
1011 c: u8,
1012 count: usize,
1013 both: bool,
1014 ) -> Option<InlineEl> {
1015 let lowerbound = min(self.stack.len(), self.get_lowerbound(c, count, both));
1016 let res = self.stack[lowerbound..]
1017 .iter()
1018 .cloned()
1019 .enumerate()
1020 .rfind(|(_, el)| {
1021 el.c == c && (!both && !el.both || (count + el.count) % 3 != 0 || count % 3 == 0)
1022 });
1023
1024 if let Some((matching_ix, matching_el)) = res {
1025 let matching_ix = matching_ix + lowerbound;
1026 for el in &self.stack[(matching_ix + 1)..] {
1027 for i in 0..el.count {
1028 tree[el.start + i].item.body = ItemBody::Text;
1029 }
1030 }
1031 self.stack.truncate(matching_ix);
1032 Some(matching_el)
1033 } else {
1034 self.set_lowerbound(c, count, both, self.stack.len());
1035 None
1036 }
1037 }
1038
1039 fn push(&mut self, el: InlineEl) {
1040 self.stack.push(el)
1041 }
1042}
1043
1044#[derive(Debug, Clone)]
1045enum RefScan<'a> {
1046 // label, source ix of label end
1047 LinkLabel(CowStr<'a>, usize),
1048 // contains next node index
1049 Collapsed(Option<TreeIndex>),
1050 Failed,
1051}
1052
1053/// Skips forward within a block to a node which spans (ends inclusive) the given
1054/// index into the source.
1055fn scan_nodes_to_ix(
1056 tree: &Tree<Item>,
1057 mut node: Option<TreeIndex>,
1058 ix: usize,
1059) -> Option<TreeIndex> {
1060 while let Some(node_ix: TreeIndex) = node {
1061 if tree[node_ix].item.end <= ix {
1062 node = tree[node_ix].next;
1063 } else {
1064 break;
1065 }
1066 }
1067 node
1068}
1069
1070/// Scans an inline link label, which cannot be interrupted.
1071/// Returns number of bytes (including brackets) and label on success.
1072fn scan_link_label<'text, 'tree>(
1073 tree: &'tree Tree<Item>,
1074 text: &'text str,
1075 allow_footnote_refs: bool,
1076) -> Option<(usize, ReferenceLabel<'text>)> {
1077 let bytes: &&[u8] = &text.as_bytes();
1078 if bytes.len() < 2 || bytes[0] != b'[' {
1079 return None;
1080 }
1081 let linebreak_handler: impl Fn(&[u8]) -> Option<…> = |bytes: &[u8]| {
1082 let mut line_start: LineStart<'_> = LineStart::new(bytes);
1083 let _ = scan_containers(tree, &mut line_start);
1084 Some(line_start.bytes_scanned())
1085 };
1086 let pair: (usize, ReferenceLabel<'_>) = if allow_footnote_refs && b'^' == bytes[1] {
1087 let (byte_index: usize, cow: CowStr<'_>) = scan_link_label_rest(&text[2..], &linebreak_handler)?;
1088 (byte_index + 2, ReferenceLabel::Footnote(cow))
1089 } else {
1090 let (byte_index: usize, cow: CowStr<'_>) = scan_link_label_rest(&text[1..], &linebreak_handler)?;
1091 (byte_index + 1, ReferenceLabel::Link(cow))
1092 };
1093 Some(pair)
1094}
1095
1096fn scan_reference<'a, 'b>(
1097 tree: &'a Tree<Item>,
1098 text: &'b str,
1099 cur: Option<TreeIndex>,
1100 allow_footnote_refs: bool,
1101) -> RefScan<'b> {
1102 let cur_ix: TreeIndex = match cur {
1103 None => return RefScan::Failed,
1104 Some(cur_ix: TreeIndex) => cur_ix,
1105 };
1106 let start: usize = tree[cur_ix].item.start;
1107 let tail: &[u8] = &text.as_bytes()[start..];
1108
1109 if tail.starts_with(needle:b"[]") {
1110 // TODO: this unwrap is sus and should be looked at closer
1111 let closing_node: TreeIndex = tree[cur_ix].next.unwrap();
1112 RefScan::Collapsed(tree[closing_node].next)
1113 } else if let Some((ix: usize, ReferenceLabel::Link(label: CowStr<'_>))) =
1114 scan_link_label(tree, &text[start..], allow_footnote_refs)
1115 {
1116 RefScan::LinkLabel(label, start + ix)
1117 } else {
1118 RefScan::Failed
1119 }
1120}
1121
1122#[derive(Clone, Default)]
1123struct LinkStack {
1124 inner: Vec<LinkStackEl>,
1125 disabled_ix: usize,
1126}
1127
1128impl LinkStack {
1129 fn push(&mut self, el: LinkStackEl) {
1130 self.inner.push(el);
1131 }
1132
1133 fn pop(&mut self) -> Option<LinkStackEl> {
1134 let el = self.inner.pop();
1135 self.disabled_ix = std::cmp::min(self.disabled_ix, self.inner.len());
1136 el
1137 }
1138
1139 fn clear(&mut self) {
1140 self.inner.clear();
1141 self.disabled_ix = 0;
1142 }
1143
1144 fn disable_all_links(&mut self) {
1145 for el in &mut self.inner[self.disabled_ix..] {
1146 if el.ty == LinkStackTy::Link {
1147 el.ty = LinkStackTy::Disabled;
1148 }
1149 }
1150 self.disabled_ix = self.inner.len();
1151 }
1152}
1153
1154#[derive(Clone, Debug)]
1155struct LinkStackEl {
1156 node: TreeIndex,
1157 ty: LinkStackTy,
1158}
1159
1160#[derive(PartialEq, Clone, Debug)]
1161enum LinkStackTy {
1162 Link,
1163 Image,
1164 Disabled,
1165}
1166
1167/// Contains the destination URL, title and source span of a reference definition.
1168#[derive(Clone, Debug)]
1169pub struct LinkDef<'a> {
1170 pub dest: CowStr<'a>,
1171 pub title: Option<CowStr<'a>>,
1172 pub span: Range<usize>,
1173}
1174
1175/// Tracks tree indices of code span delimiters of each length. It should prevent
1176/// quadratic scanning behaviours by providing (amortized) constant time lookups.
1177struct CodeDelims {
1178 inner: HashMap<usize, VecDeque<TreeIndex>>,
1179 seen_first: bool,
1180}
1181
1182impl CodeDelims {
1183 fn new() -> Self {
1184 Self {
1185 inner: Default::default(),
1186 seen_first: false,
1187 }
1188 }
1189
1190 fn insert(&mut self, count: usize, ix: TreeIndex) {
1191 if self.seen_first {
1192 self.inner
1193 .entry(count)
1194 .or_insert_with(Default::default)
1195 .push_back(ix);
1196 } else {
1197 // Skip the first insert, since that delimiter will always
1198 // be an opener and not a closer.
1199 self.seen_first = true;
1200 }
1201 }
1202
1203 fn is_populated(&self) -> bool {
1204 !self.inner.is_empty()
1205 }
1206
1207 fn find(&mut self, open_ix: TreeIndex, count: usize) -> Option<TreeIndex> {
1208 while let Some(ix) = self.inner.get_mut(&count)?.pop_front() {
1209 if ix > open_ix {
1210 return Some(ix);
1211 }
1212 }
1213 None
1214 }
1215
1216 fn clear(&mut self) {
1217 self.inner.clear();
1218 self.seen_first = false;
1219 }
1220}
1221
1222#[derive(Copy, Clone, PartialEq, Eq, Debug)]
1223pub(crate) struct LinkIndex(usize);
1224
1225#[derive(Copy, Clone, PartialEq, Eq, Debug)]
1226pub(crate) struct CowIndex(usize);
1227
1228#[derive(Copy, Clone, PartialEq, Eq, Debug)]
1229pub(crate) struct AlignmentIndex(usize);
1230
1231#[derive(Copy, Clone, PartialEq, Eq, Debug)]
1232pub(crate) struct HeadingIndex(NonZeroUsize);
1233
1234#[derive(Clone)]
1235pub(crate) struct Allocations<'a> {
1236 pub refdefs: RefDefs<'a>,
1237 links: Vec<(LinkType, CowStr<'a>, CowStr<'a>)>,
1238 cows: Vec<CowStr<'a>>,
1239 alignments: Vec<Vec<Alignment>>,
1240 headings: Vec<HeadingAttributes<'a>>,
1241}
1242
1243/// Used by the heading attributes extension.
1244#[derive(Clone)]
1245pub(crate) struct HeadingAttributes<'a> {
1246 pub id: Option<&'a str>,
1247 pub classes: Vec<&'a str>,
1248}
1249
1250/// Keeps track of the reference definitions defined in the document.
1251#[derive(Clone, Default, Debug)]
1252pub struct RefDefs<'input>(pub(crate) HashMap<LinkLabel<'input>, LinkDef<'input>>);
1253
1254impl<'input, 'b, 's> RefDefs<'input>
1255where
1256 's: 'b,
1257{
1258 /// Performs a lookup on reference label using unicode case folding.
1259 pub fn get(&'s self, key: &'b str) -> Option<&'b LinkDef<'input>> {
1260 self.0.get(&UniCase::new(key.into()))
1261 }
1262
1263 /// Provides an iterator over all the document's reference definitions.
1264 pub fn iter(&'s self) -> impl Iterator<Item = (&'s str, &'s LinkDef<'input>)> {
1265 self.0.iter().map(|(k: &UniCase>, v: &LinkDef<'_>)| (k.as_ref(), v))
1266 }
1267}
1268
1269impl<'a> Allocations<'a> {
1270 pub fn new() -> Self {
1271 Self {
1272 refdefs: RefDefs::default(),
1273 links: Vec::with_capacity(128),
1274 cows: Vec::new(),
1275 alignments: Vec::new(),
1276 headings: Vec::new(),
1277 }
1278 }
1279
1280 pub fn allocate_cow(&mut self, cow: CowStr<'a>) -> CowIndex {
1281 let ix = self.cows.len();
1282 self.cows.push(cow);
1283 CowIndex(ix)
1284 }
1285
1286 pub fn allocate_link(&mut self, ty: LinkType, url: CowStr<'a>, title: CowStr<'a>) -> LinkIndex {
1287 let ix = self.links.len();
1288 self.links.push((ty, url, title));
1289 LinkIndex(ix)
1290 }
1291
1292 pub fn allocate_alignment(&mut self, alignment: Vec<Alignment>) -> AlignmentIndex {
1293 let ix = self.alignments.len();
1294 self.alignments.push(alignment);
1295 AlignmentIndex(ix)
1296 }
1297
1298 pub fn allocate_heading(&mut self, attrs: HeadingAttributes<'a>) -> HeadingIndex {
1299 let ix = self.headings.len();
1300 self.headings.push(attrs);
1301 // This won't panic. `self.headings.len()` can't be `usize::MAX` since
1302 // such a long Vec cannot fit in memory.
1303 let ix_nonzero = NonZeroUsize::new(ix.wrapping_add(1)).expect("too many headings");
1304 HeadingIndex(ix_nonzero)
1305 }
1306}
1307
1308impl<'a> Index<CowIndex> for Allocations<'a> {
1309 type Output = CowStr<'a>;
1310
1311 fn index(&self, ix: CowIndex) -> &Self::Output {
1312 self.cows.index(ix.0)
1313 }
1314}
1315
1316impl<'a> Index<LinkIndex> for Allocations<'a> {
1317 type Output = (LinkType, CowStr<'a>, CowStr<'a>);
1318
1319 fn index(&self, ix: LinkIndex) -> &Self::Output {
1320 self.links.index(ix.0)
1321 }
1322}
1323
1324impl<'a> Index<AlignmentIndex> for Allocations<'a> {
1325 type Output = Vec<Alignment>;
1326
1327 fn index(&self, ix: AlignmentIndex) -> &Self::Output {
1328 self.alignments.index(ix.0)
1329 }
1330}
1331
1332impl<'a> Index<HeadingIndex> for Allocations<'a> {
1333 type Output = HeadingAttributes<'a>;
1334
1335 fn index(&self, ix: HeadingIndex) -> &Self::Output {
1336 self.headings.index(ix.0.get() - 1)
1337 }
1338}
1339
1340/// A struct containing information on the reachability of certain inline HTML
1341/// elements. In particular, for cdata elements (`<![CDATA[`), processing
1342/// elements (`<?`) and declarations (`<!DECLARATION`). The respectives usizes
1343/// represent the indices before which a scan will always fail and can hence
1344/// be skipped.
1345#[derive(Clone, Default)]
1346pub(crate) struct HtmlScanGuard {
1347 pub cdata: usize,
1348 pub processing: usize,
1349 pub declaration: usize,
1350}
1351
1352pub type BrokenLinkCallback<'input, 'borrow> =
1353 Option<&'borrow mut dyn FnMut(BrokenLink<'input>) -> Option<(CowStr<'input>, CowStr<'input>)>>;
1354
1355/// Markdown event and source range iterator.
1356///
1357/// Generates tuples where the first element is the markdown event and the second
1358/// is a the corresponding range in the source string.
1359///
1360/// Constructed from a `Parser` using its
1361/// [`into_offset_iter`](struct.Parser.html#method.into_offset_iter) method.
1362#[derive(Debug)]
1363pub struct OffsetIter<'a, 'b> {
1364 inner: Parser<'a, 'b>,
1365}
1366
1367impl<'a, 'b> OffsetIter<'a, 'b> {
1368 /// Returns a reference to the internal reference definition tracker.
1369 pub fn reference_definitions(&self) -> &RefDefs {
1370 self.inner.reference_definitions()
1371 }
1372}
1373
1374impl<'a, 'b> Iterator for OffsetIter<'a, 'b> {
1375 type Item = (Event<'a>, Range<usize>);
1376
1377 fn next(&mut self) -> Option<Self::Item> {
1378 match self.inner.tree.cur() {
1379 None => {
1380 let ix = self.inner.tree.pop()?;
1381 let tag = item_to_tag(&self.inner.tree[ix].item, &self.inner.allocs);
1382 self.inner.tree.next_sibling(ix);
1383 let span = self.inner.tree[ix].item.start..self.inner.tree[ix].item.end;
1384 debug_assert!(span.start <= span.end);
1385 Some((Event::End(tag), span))
1386 }
1387 Some(cur_ix) => {
1388 if self.inner.tree[cur_ix].item.body.is_inline() {
1389 self.inner.handle_inline();
1390 }
1391
1392 let node = self.inner.tree[cur_ix];
1393 let item = node.item;
1394 let event = item_to_event(item, self.inner.text, &self.inner.allocs);
1395 if let Event::Start(..) = event {
1396 self.inner.tree.push();
1397 } else {
1398 self.inner.tree.next_sibling(cur_ix);
1399 }
1400 debug_assert!(item.start <= item.end);
1401 Some((event, item.start..item.end))
1402 }
1403 }
1404 }
1405}
1406
1407fn item_to_tag<'a>(item: &Item, allocs: &Allocations<'a>) -> Tag<'a> {
1408 match item.body {
1409 ItemBody::Paragraph => Tag::Paragraph,
1410 ItemBody::Emphasis => Tag::Emphasis,
1411 ItemBody::Strong => Tag::Strong,
1412 ItemBody::Strikethrough => Tag::Strikethrough,
1413 ItemBody::Link(link_ix) => {
1414 let &(ref link_type, ref url, ref title) = allocs.index(link_ix);
1415 Tag::Link(*link_type, url.clone(), title.clone())
1416 }
1417 ItemBody::Image(link_ix) => {
1418 let &(ref link_type, ref url, ref title) = allocs.index(link_ix);
1419 Tag::Image(*link_type, url.clone(), title.clone())
1420 }
1421 ItemBody::Heading(level, Some(heading_ix)) => {
1422 let HeadingAttributes { id, classes } = allocs.index(heading_ix);
1423 Tag::Heading(level, *id, classes.clone())
1424 }
1425 ItemBody::Heading(level, None) => Tag::Heading(level, None, Vec::new()),
1426 ItemBody::FencedCodeBlock(cow_ix) => {
1427 Tag::CodeBlock(CodeBlockKind::Fenced(allocs[cow_ix].clone()))
1428 }
1429 ItemBody::IndentCodeBlock => Tag::CodeBlock(CodeBlockKind::Indented),
1430 ItemBody::BlockQuote => Tag::BlockQuote,
1431 ItemBody::List(_, c, listitem_start) => {
1432 if c == b'.' || c == b')' {
1433 Tag::List(Some(listitem_start))
1434 } else {
1435 Tag::List(None)
1436 }
1437 }
1438 ItemBody::ListItem(_) => Tag::Item,
1439 ItemBody::TableHead => Tag::TableHead,
1440 ItemBody::TableCell => Tag::TableCell,
1441 ItemBody::TableRow => Tag::TableRow,
1442 ItemBody::Table(alignment_ix) => Tag::Table(allocs[alignment_ix].clone()),
1443 ItemBody::FootnoteDefinition(cow_ix) => Tag::FootnoteDefinition(allocs[cow_ix].clone()),
1444 _ => panic!("unexpected item body {:?}", item.body),
1445 }
1446}
1447
1448fn item_to_event<'a>(item: Item, text: &'a str, allocs: &Allocations<'a>) -> Event<'a> {
1449 let tag = match item.body {
1450 ItemBody::Text => return Event::Text(text[item.start..item.end].into()),
1451 ItemBody::Code(cow_ix) => return Event::Code(allocs[cow_ix].clone()),
1452 ItemBody::SynthesizeText(cow_ix) => return Event::Text(allocs[cow_ix].clone()),
1453 ItemBody::SynthesizeChar(c) => return Event::Text(c.into()),
1454 ItemBody::Html => return Event::Html(text[item.start..item.end].into()),
1455 ItemBody::OwnedHtml(cow_ix) => return Event::Html(allocs[cow_ix].clone()),
1456 ItemBody::SoftBreak => return Event::SoftBreak,
1457 ItemBody::HardBreak => return Event::HardBreak,
1458 ItemBody::FootnoteReference(cow_ix) => {
1459 return Event::FootnoteReference(allocs[cow_ix].clone())
1460 }
1461 ItemBody::TaskListMarker(checked) => return Event::TaskListMarker(checked),
1462 ItemBody::Rule => return Event::Rule,
1463
1464 ItemBody::Paragraph => Tag::Paragraph,
1465 ItemBody::Emphasis => Tag::Emphasis,
1466 ItemBody::Strong => Tag::Strong,
1467 ItemBody::Strikethrough => Tag::Strikethrough,
1468 ItemBody::Link(link_ix) => {
1469 let &(ref link_type, ref url, ref title) = allocs.index(link_ix);
1470 Tag::Link(*link_type, url.clone(), title.clone())
1471 }
1472 ItemBody::Image(link_ix) => {
1473 let &(ref link_type, ref url, ref title) = allocs.index(link_ix);
1474 Tag::Image(*link_type, url.clone(), title.clone())
1475 }
1476 ItemBody::Heading(level, Some(heading_ix)) => {
1477 let HeadingAttributes { id, classes } = allocs.index(heading_ix);
1478 Tag::Heading(level, *id, classes.clone())
1479 }
1480 ItemBody::Heading(level, None) => Tag::Heading(level, None, Vec::new()),
1481 ItemBody::FencedCodeBlock(cow_ix) => {
1482 Tag::CodeBlock(CodeBlockKind::Fenced(allocs[cow_ix].clone()))
1483 }
1484 ItemBody::IndentCodeBlock => Tag::CodeBlock(CodeBlockKind::Indented),
1485 ItemBody::BlockQuote => Tag::BlockQuote,
1486 ItemBody::List(_, c, listitem_start) => {
1487 if c == b'.' || c == b')' {
1488 Tag::List(Some(listitem_start))
1489 } else {
1490 Tag::List(None)
1491 }
1492 }
1493 ItemBody::ListItem(_) => Tag::Item,
1494 ItemBody::TableHead => Tag::TableHead,
1495 ItemBody::TableCell => Tag::TableCell,
1496 ItemBody::TableRow => Tag::TableRow,
1497 ItemBody::Table(alignment_ix) => Tag::Table(allocs[alignment_ix].clone()),
1498 ItemBody::FootnoteDefinition(cow_ix) => Tag::FootnoteDefinition(allocs[cow_ix].clone()),
1499 _ => panic!("unexpected item body {:?}", item.body),
1500 };
1501
1502 Event::Start(tag)
1503}
1504
1505impl<'a, 'b> Iterator for Parser<'a, 'b> {
1506 type Item = Event<'a>;
1507
1508 fn next(&mut self) -> Option<Event<'a>> {
1509 match self.tree.cur() {
1510 None => {
1511 let ix = self.tree.pop()?;
1512 let tag = item_to_tag(&self.tree[ix].item, &self.allocs);
1513 self.tree.next_sibling(ix);
1514 Some(Event::End(tag))
1515 }
1516 Some(cur_ix) => {
1517 if self.tree[cur_ix].item.body.is_inline() {
1518 self.handle_inline();
1519 }
1520
1521 let node = self.tree[cur_ix];
1522 let item = node.item;
1523 let event = item_to_event(item, self.text, &self.allocs);
1524 if let Event::Start(..) = event {
1525 self.tree.push();
1526 } else {
1527 self.tree.next_sibling(cur_ix);
1528 }
1529 Some(event)
1530 }
1531 }
1532 }
1533}
1534
1535impl FusedIterator for Parser<'_, '_> {}
1536
1537#[cfg(test)]
1538mod test {
1539 use super::*;
1540 use crate::tree::Node;
1541
1542 // TODO: move these tests to tests/html.rs?
1543
1544 fn parser_with_extensions(text: &str) -> Parser<'_, 'static> {
1545 let mut opts = Options::empty();
1546 opts.insert(Options::ENABLE_TABLES);
1547 opts.insert(Options::ENABLE_FOOTNOTES);
1548 opts.insert(Options::ENABLE_STRIKETHROUGH);
1549 opts.insert(Options::ENABLE_TASKLISTS);
1550
1551 Parser::new_ext(text, opts)
1552 }
1553
1554 #[test]
1555 #[cfg(target_pointer_width = "64")]
1556 fn node_size() {
1557 let node_size = std::mem::size_of::<Node<Item>>();
1558 assert_eq!(48, node_size);
1559 }
1560
1561 #[test]
1562 #[cfg(target_pointer_width = "64")]
1563 fn body_size() {
1564 let body_size = std::mem::size_of::<ItemBody>();
1565 assert_eq!(16, body_size);
1566 }
1567
1568 #[test]
1569 fn single_open_fish_bracket() {
1570 // dont crash
1571 assert_eq!(3, Parser::new("<").count());
1572 }
1573
1574 #[test]
1575 fn lone_hashtag() {
1576 // dont crash
1577 assert_eq!(2, Parser::new("#").count());
1578 }
1579
1580 #[test]
1581 fn lots_of_backslashes() {
1582 // dont crash
1583 Parser::new("\\\\\r\r").count();
1584 Parser::new("\\\r\r\\.\\\\\r\r\\.\\").count();
1585 }
1586
1587 #[test]
1588 fn issue_320() {
1589 // dont crash
1590 parser_with_extensions(":\r\t> |\r:\r\t> |\r").count();
1591 }
1592
1593 #[test]
1594 fn issue_319() {
1595 // dont crash
1596 parser_with_extensions("|\r-]([^|\r-]([^").count();
1597 parser_with_extensions("|\r\r=][^|\r\r=][^car").count();
1598 }
1599
1600 #[test]
1601 fn issue_303() {
1602 // dont crash
1603 parser_with_extensions("[^\r\ra]").count();
1604 parser_with_extensions("\r\r]Z[^\x00\r\r]Z[^\x00").count();
1605 }
1606
1607 #[test]
1608 fn issue_313() {
1609 // dont crash
1610 parser_with_extensions("*]0[^\r\r*]0[^").count();
1611 parser_with_extensions("[^\r> `][^\r> `][^\r> `][").count();
1612 }
1613
1614 #[test]
1615 fn issue_311() {
1616 // dont crash
1617 parser_with_extensions("\\\u{0d}-\u{09}\\\u{0d}-\u{09}").count();
1618 }
1619
1620 #[test]
1621 fn issue_283() {
1622 let input = std::str::from_utf8(b"\xf0\x9b\xb2\x9f<td:^\xf0\x9b\xb2\x9f").unwrap();
1623 // dont crash
1624 parser_with_extensions(input).count();
1625 }
1626
1627 #[test]
1628 fn issue_289() {
1629 // dont crash
1630 parser_with_extensions("> - \\\n> - ").count();
1631 parser_with_extensions("- \n\n").count();
1632 }
1633
1634 #[test]
1635 fn issue_306() {
1636 // dont crash
1637 parser_with_extensions("*\r_<__*\r_<__*\r_<__*\r_<__").count();
1638 }
1639
1640 #[test]
1641 fn issue_305() {
1642 // dont crash
1643 parser_with_extensions("_6**6*_*").count();
1644 }
1645
1646 #[test]
1647 fn another_emphasis_panic() {
1648 parser_with_extensions("*__#_#__*").count();
1649 }
1650
1651 #[test]
1652 fn offset_iter() {
1653 let event_offsets: Vec<_> = Parser::new("*hello* world")
1654 .into_offset_iter()
1655 .map(|(_ev, range)| range)
1656 .collect();
1657 let expected_offsets = vec![(0..13), (0..7), (1..6), (0..7), (7..13), (0..13)];
1658 assert_eq!(expected_offsets, event_offsets);
1659 }
1660
1661 #[test]
1662 fn reference_link_offsets() {
1663 let range =
1664 Parser::new("# H1\n[testing][Some reference]\n\n[Some reference]: https://github.com")
1665 .into_offset_iter()
1666 .filter_map(|(ev, range)| match ev {
1667 Event::Start(Tag::Link(LinkType::Reference, ..), ..) => Some(range),
1668 _ => None,
1669 })
1670 .next()
1671 .unwrap();
1672 assert_eq!(5..30, range);
1673 }
1674
1675 #[test]
1676 fn footnote_offsets() {
1677 let range = parser_with_extensions("Testing this[^1] out.\n\n[^1]: Footnote.")
1678 .into_offset_iter()
1679 .filter_map(|(ev, range)| match ev {
1680 Event::FootnoteReference(..) => Some(range),
1681 _ => None,
1682 })
1683 .next()
1684 .unwrap();
1685 assert_eq!(12..16, range);
1686 }
1687
1688 #[test]
1689 fn table_offset() {
1690 let markdown = "a\n\nTesting|This|Outtt\n--|:--:|--:\nSome Data|Other data|asdf";
1691 let event_offset = parser_with_extensions(markdown)
1692 .into_offset_iter()
1693 .map(|(_ev, range)| range)
1694 .nth(3)
1695 .unwrap();
1696 let expected_offset = 3..59;
1697 assert_eq!(expected_offset, event_offset);
1698 }
1699
1700 #[test]
1701 fn table_cell_span() {
1702 let markdown = "a|b|c\n--|--|--\na| |c";
1703 let event_offset = parser_with_extensions(markdown)
1704 .into_offset_iter()
1705 .filter_map(|(ev, span)| match ev {
1706 Event::Start(Tag::TableCell) => Some(span),
1707 _ => None,
1708 })
1709 .nth(4)
1710 .unwrap();
1711 let expected_offset_start = "a|b|c\n--|--|--\na|".len();
1712 assert_eq!(
1713 expected_offset_start..(expected_offset_start + 2),
1714 event_offset
1715 );
1716 }
1717
1718 #[test]
1719 fn offset_iter_issue_378() {
1720 let event_offsets: Vec<_> = Parser::new("a [b](c) d")
1721 .into_offset_iter()
1722 .map(|(_ev, range)| range)
1723 .collect();
1724 let expected_offsets = vec![(0..10), (0..2), (2..8), (3..4), (2..8), (8..10), (0..10)];
1725 assert_eq!(expected_offsets, event_offsets);
1726 }
1727
1728 #[test]
1729 fn offset_iter_issue_404() {
1730 let event_offsets: Vec<_> = Parser::new("###\n")
1731 .into_offset_iter()
1732 .map(|(_ev, range)| range)
1733 .collect();
1734 let expected_offsets = vec![(0..4), (0..4)];
1735 assert_eq!(expected_offsets, event_offsets);
1736 }
1737
1738 // FIXME: add this one regression suite
1739 #[test]
1740 fn link_def_at_eof() {
1741 let test_str = "[My site][world]\n\n[world]: https://vincentprouillet.com";
1742 let expected = "<p><a href=\"https://vincentprouillet.com\">My site</a></p>\n";
1743
1744 let mut buf = String::new();
1745 crate::html::push_html(&mut buf, Parser::new(test_str));
1746 assert_eq!(expected, buf);
1747 }
1748
1749 #[test]
1750 fn no_footnote_refs_without_option() {
1751 let test_str = "a [^a]\n\n[^a]: yolo";
1752 let expected = "<p>a <a href=\"yolo\">^a</a></p>\n";
1753
1754 let mut buf = String::new();
1755 crate::html::push_html(&mut buf, Parser::new(test_str));
1756 assert_eq!(expected, buf);
1757 }
1758
1759 #[test]
1760 fn ref_def_at_eof() {
1761 let test_str = "[test]:\\";
1762 let expected = "";
1763
1764 let mut buf = String::new();
1765 crate::html::push_html(&mut buf, Parser::new(test_str));
1766 assert_eq!(expected, buf);
1767 }
1768
1769 #[test]
1770 fn ref_def_cr_lf() {
1771 let test_str = "[a]: /u\r\n\n[a]";
1772 let expected = "<p><a href=\"/u\">a</a></p>\n";
1773
1774 let mut buf = String::new();
1775 crate::html::push_html(&mut buf, Parser::new(test_str));
1776 assert_eq!(expected, buf);
1777 }
1778
1779 #[test]
1780 fn no_dest_refdef() {
1781 let test_str = "[a]:";
1782 let expected = "<p>[a]:</p>\n";
1783
1784 let mut buf = String::new();
1785 crate::html::push_html(&mut buf, Parser::new(test_str));
1786 assert_eq!(expected, buf);
1787 }
1788
1789 #[test]
1790 fn broken_links_called_only_once() {
1791 for &(markdown, expected) in &[
1792 ("See also [`g()`][crate::g].", 1),
1793 ("See also [`g()`][crate::g][].", 1),
1794 ("[brokenlink1] some other node [brokenlink2]", 2),
1795 ] {
1796 let mut times_called = 0;
1797 let callback = &mut |_broken_link: BrokenLink| {
1798 times_called += 1;
1799 None
1800 };
1801 let parser =
1802 Parser::new_with_broken_link_callback(markdown, Options::empty(), Some(callback));
1803 for _ in parser {}
1804 assert_eq!(times_called, expected);
1805 }
1806 }
1807
1808 #[test]
1809 fn simple_broken_link_callback() {
1810 let test_str = "This is a link w/o def: [hello][world]";
1811 let mut callback = |broken_link: BrokenLink| {
1812 assert_eq!("world", broken_link.reference.as_ref());
1813 assert_eq!(&test_str[broken_link.span], "[hello][world]");
1814 let url = "YOLO".into();
1815 let title = "SWAG".to_owned().into();
1816 Some((url, title))
1817 };
1818 let parser =
1819 Parser::new_with_broken_link_callback(test_str, Options::empty(), Some(&mut callback));
1820 let mut link_tag_count = 0;
1821 for (typ, url, title) in parser.filter_map(|event| match event {
1822 Event::Start(tag) | Event::End(tag) => match tag {
1823 Tag::Link(typ, url, title) => Some((typ, url, title)),
1824 _ => None,
1825 },
1826 _ => None,
1827 }) {
1828 link_tag_count += 1;
1829 assert_eq!(typ, LinkType::ReferenceUnknown);
1830 assert_eq!(url.as_ref(), "YOLO");
1831 assert_eq!(title.as_ref(), "SWAG");
1832 }
1833 assert!(link_tag_count > 0);
1834 }
1835
1836 #[test]
1837 fn code_block_kind_check_fenced() {
1838 let parser = Parser::new("hello\n```test\ntadam\n```");
1839 let mut found = 0;
1840 for (ev, _range) in parser.into_offset_iter() {
1841 match ev {
1842 Event::Start(Tag::CodeBlock(CodeBlockKind::Fenced(syntax))) => {
1843 assert_eq!(syntax.as_ref(), "test");
1844 found += 1;
1845 }
1846 _ => {}
1847 }
1848 }
1849 assert_eq!(found, 1);
1850 }
1851
1852 #[test]
1853 fn code_block_kind_check_indented() {
1854 let parser = Parser::new("hello\n\n ```test\n tadam\nhello");
1855 let mut found = 0;
1856 for (ev, _range) in parser.into_offset_iter() {
1857 match ev {
1858 Event::Start(Tag::CodeBlock(CodeBlockKind::Indented)) => {
1859 found += 1;
1860 }
1861 _ => {}
1862 }
1863 }
1864 assert_eq!(found, 1);
1865 }
1866
1867 #[test]
1868 fn ref_defs() {
1869 let input = r###"[a B c]: http://example.com
1870[another]: https://google.com
1871
1872text
1873
1874[final ONE]: http://wikipedia.org
1875"###;
1876 let mut parser = Parser::new(input);
1877
1878 assert!(parser.reference_definitions().get("a b c").is_some());
1879 assert!(parser.reference_definitions().get("nope").is_none());
1880
1881 if let Some(_event) = parser.next() {
1882 // testing keys with shorter lifetimes than parser and its input
1883 let s = "final one".to_owned();
1884 let link_def = parser.reference_definitions().get(&s).unwrap();
1885 let span = &input[link_def.span.clone()];
1886 assert_eq!(span, "[final ONE]: http://wikipedia.org");
1887 }
1888 }
1889
1890 #[test]
1891 fn common_lifetime_patterns_allowed<'b>() {
1892 let temporary_str = String::from("xyz");
1893
1894 // NOTE: this is a limitation of Rust, it doesn't allow putting lifetime parameters on the closure itself.
1895 // Hack it by attaching the lifetime to the test function instead.
1896 // TODO: why is the `'b` lifetime required at all? Changing it to `'_` breaks things :(
1897 let mut closure = |link: BrokenLink<'b>| Some(("#".into(), link.reference.into()));
1898
1899 fn function<'a>(link: BrokenLink<'a>) -> Option<(CowStr<'a>, CowStr<'a>)> {
1900 Some(("#".into(), link.reference))
1901 }
1902
1903 for _ in Parser::new_with_broken_link_callback(
1904 "static lifetime",
1905 Options::empty(),
1906 Some(&mut closure),
1907 ) {}
1908 /* This fails to compile. Because the closure can't say `for <'a> fn(BrokenLink<'a>) ->
1909 * CowStr<'a>` and has to use the enclosing `'b` lifetime parameter, `temporary_str` lives
1910 * shorter than `'b`. I think this is unlikely to occur in real life, and if it does, the
1911 * fix is simple: move it out to a function that allows annotating the lifetimes.
1912 */
1913 //for _ in Parser::new_with_broken_link_callback(&temporary_str, Options::empty(), Some(&mut callback)) {
1914 //}
1915
1916 for _ in Parser::new_with_broken_link_callback(
1917 "static lifetime",
1918 Options::empty(),
1919 Some(&mut function),
1920 ) {}
1921 for _ in Parser::new_with_broken_link_callback(
1922 &temporary_str,
1923 Options::empty(),
1924 Some(&mut function),
1925 ) {}
1926 }
1927}
1928