1// Copyright 2017 Google Inc. All rights reserved.
2//
3// Permission is hereby granted, free of charge, to any person obtaining a copy
4// of this software and associated documentation files (the "Software"), to deal
5// in the Software without restriction, including without limitation the rights
6// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7// copies of the Software, and to permit persons to whom the Software is
8// furnished to do so, subject to the following conditions:
9//
10// The above copyright notice and this permission notice shall be included in
11// all copies or substantial portions of the Software.
12//
13// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19// THE SOFTWARE.
20
21//! Tree-based two pass parser.
22
23use std::cmp::{max, min};
24use std::collections::{HashMap, VecDeque};
25use std::iter::FusedIterator;
26use std::num::NonZeroUsize;
27use std::ops::{Index, Range};
28
29use unicase::UniCase;
30
31use crate::firstpass::run_first_pass;
32use crate::linklabel::{scan_link_label_rest, FootnoteLabel, LinkLabel, ReferenceLabel};
33use crate::scanners::*;
34use crate::strings::CowStr;
35use crate::tree::{Tree, TreeIndex};
36use crate::{
37 Alignment, BlockQuoteKind, CodeBlockKind, Event, HeadingLevel, LinkType, MetadataBlockKind,
38 Options, Tag, TagEnd,
39};
40
41// Allowing arbitrary depth nested parentheses inside link destinations
42// can create denial of service vulnerabilities if we're not careful.
43// The simplest countermeasure is to limit their depth, which is
44// explicitly allowed by the spec as long as the limit is at least 3:
45// https://spec.commonmark.org/0.29/#link-destination
46pub(crate) const LINK_MAX_NESTED_PARENS: usize = 32;
47
48#[derive(Debug, Default, Clone, Copy)]
49pub(crate) struct Item {
50 pub start: usize,
51 pub end: usize,
52 pub body: ItemBody,
53}
54
55#[derive(Debug, PartialEq, Clone, Copy, Default)]
56pub(crate) enum ItemBody {
57 Paragraph,
58 Text {
59 backslash_escaped: bool,
60 },
61 SoftBreak,
62 // true = is backlash
63 HardBreak(bool),
64
65 // These are possible inline items, need to be resolved in second pass.
66
67 // repeats, can_open, can_close
68 MaybeEmphasis(usize, bool, bool),
69 // can_open, can_close, brace context
70 MaybeMath(bool, bool, u8),
71 // quote byte, can_open, can_close
72 MaybeSmartQuote(u8, bool, bool),
73 MaybeCode(usize, bool), // number of backticks, preceded by backslash
74 MaybeHtml,
75 MaybeLinkOpen,
76 // bool indicates whether or not the preceding section could be a reference
77 MaybeLinkClose(bool),
78 MaybeImage,
79
80 // These are inline items after resolution.
81 Emphasis,
82 Strong,
83 Strikethrough,
84 Math(CowIndex, bool), // true for display math
85 Code(CowIndex),
86 Link(LinkIndex),
87 Image(LinkIndex),
88 FootnoteReference(CowIndex),
89 TaskListMarker(bool), // true for checked
90
91 Rule,
92 Heading(HeadingLevel, Option<HeadingIndex>), // heading level
93 FencedCodeBlock(CowIndex),
94 IndentCodeBlock,
95 HtmlBlock,
96 InlineHtml,
97 Html,
98 OwnedHtml(CowIndex),
99 BlockQuote(Option<BlockQuoteKind>),
100 List(bool, u8, u64), // is_tight, list character, list start index
101 ListItem(usize), // indent level
102 SynthesizeText(CowIndex),
103 SynthesizeChar(char),
104 FootnoteDefinition(CowIndex),
105 MetadataBlock(MetadataBlockKind),
106
107 // Tables
108 Table(AlignmentIndex),
109 TableHead,
110 TableRow,
111 TableCell,
112
113 // Dummy node at the top of the tree - should not be used otherwise!
114 #[default]
115 Root,
116}
117
118impl ItemBody {
119 fn is_inline(&self) -> bool {
120 matches!(
121 *self,
122 ItemBody::MaybeEmphasis(..)
123 | ItemBody::MaybeMath(..)
124 | ItemBody::MaybeSmartQuote(..)
125 | ItemBody::MaybeHtml
126 | ItemBody::MaybeCode(..)
127 | ItemBody::MaybeLinkOpen
128 | ItemBody::MaybeLinkClose(..)
129 | ItemBody::MaybeImage
130 )
131 }
132 fn is_block(&self) -> bool {
133 matches!(
134 *self,
135 ItemBody::Paragraph
136 | ItemBody::BlockQuote(..)
137 | ItemBody::List(..)
138 | ItemBody::ListItem(..)
139 | ItemBody::HtmlBlock
140 | ItemBody::Table(..)
141 | ItemBody::TableHead
142 | ItemBody::TableRow
143 | ItemBody::TableCell
144 | ItemBody::Heading(..)
145 | ItemBody::Rule
146 )
147 }
148}
149
150#[derive(Debug)]
151pub struct BrokenLink<'a> {
152 pub span: std::ops::Range<usize>,
153 pub link_type: LinkType,
154 pub reference: CowStr<'a>,
155}
156
157/// Markdown event iterator.
158pub struct Parser<'input, F = DefaultBrokenLinkCallback> {
159 text: &'input str,
160 options: Options,
161 tree: Tree<Item>,
162 allocs: Allocations<'input>,
163 broken_link_callback: Option<F>,
164 html_scan_guard: HtmlScanGuard,
165
166 // https://github.com/pulldown-cmark/pulldown-cmark/issues/844
167 // Consider this example:
168 //
169 // [x]: xxx...
170 // [x]
171 // [x]
172 // [x]
173 //
174 // Which expands to this HTML:
175 //
176 // <a href="xxx...">x</a>
177 // <a href="xxx...">x</a>
178 // <a href="xxx...">x</a>
179 //
180 // This is quadratic growth, because it's filling in the area of a square.
181 // To prevent this, track how much it's expanded and limit it.
182 link_ref_expansion_limit: usize,
183
184 // used by inline passes. store them here for reuse
185 inline_stack: InlineStack,
186 link_stack: LinkStack,
187}
188
189impl<'input, F> std::fmt::Debug for Parser<'input, F> {
190 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
191 // Only print the fields that have public types.
192 f&mut DebugStruct<'_, '_>.debug_struct("Parser")
193 .field("text", &self.text)
194 .field("options", &self.options)
195 .field(
196 name:"broken_link_callback",
197 &self.broken_link_callback.as_ref().map(|_| ..),
198 )
199 .finish()
200 }
201}
202
203impl<'a> BrokenLink<'a> {
204 /// Moves the link into version with a static lifetime.
205 ///
206 /// The `reference` member is cloned to a Boxed or Inline version.
207 pub fn into_static(self) -> BrokenLink<'static> {
208 BrokenLink {
209 span: self.span.clone(),
210 link_type: self.link_type,
211 reference: self.reference.into_string().into(),
212 }
213 }
214}
215
216impl<'input> Parser<'input, DefaultBrokenLinkCallback> {
217 /// Creates a new event iterator for a markdown string without any options enabled.
218 pub fn new(text: &'input str) -> Self {
219 Self::new_ext(text, Options::empty())
220 }
221
222 /// Creates a new event iterator for a markdown string with given options.
223 pub fn new_ext(text: &'input str, options: Options) -> Self {
224 Self::new_with_broken_link_callback(text, options, broken_link_callback:None)
225 }
226}
227
228impl<'input, F: BrokenLinkCallback<'input>> Parser<'input, F> {
229 /// In case the parser encounters any potential links that have a broken
230 /// reference (e.g `[foo]` when there is no `[foo]: ` entry at the bottom)
231 /// the provided callback will be called with the reference name,
232 /// and the returned pair will be used as the link URL and title if it is not
233 /// `None`.
234 pub fn new_with_broken_link_callback(
235 text: &'input str,
236 options: Options,
237 broken_link_callback: Option<F>,
238 ) -> Self {
239 let (mut tree, allocs) = run_first_pass(text, options);
240 tree.reset();
241 let inline_stack = Default::default();
242 let link_stack = Default::default();
243 let html_scan_guard = Default::default();
244 Parser {
245 text,
246 options,
247 tree,
248 allocs,
249 broken_link_callback,
250 inline_stack,
251 link_stack,
252 html_scan_guard,
253 // always allow 100KiB
254 link_ref_expansion_limit: text.len().max(100_000),
255 }
256 }
257
258 /// Returns a reference to the internal `RefDefs` object, which provides access
259 /// to the internal map of reference definitions.
260 pub fn reference_definitions(&self) -> &RefDefs<'_> {
261 &self.allocs.refdefs
262 }
263
264 /// Use a link label to fetch a type, url, and title.
265 ///
266 /// This function enforces the [`link_ref_expansion_limit`].
267 /// If it returns Some, it also consumes some of the fuel.
268 /// If we're out of fuel, it immediately returns None.
269 ///
270 /// The URL and title are found in the [`RefDefs`] map.
271 /// If they're not there, and a callback was provided by the user,
272 /// the [`broken_link_callback`] will be invoked and given the opportunity
273 /// to provide a fallback.
274 ///
275 /// The link type (that's "link" or "image") depends on the usage site, and
276 /// is provided by the caller of this function.
277 /// This function returns a new one because, if it has to invoke a callback
278 /// to find the information, the link type is [mapped to an unknown type].
279 ///
280 /// [mapped to an unknown type]: crate::LinkType::to_unknown
281 /// [`link_ref_expansion_limit`]: Self::link_ref_expansion_limit
282 /// [`broken_link_callback`]: Self::broken_link_callback
283 fn fetch_link_type_url_title(
284 &mut self,
285 link_label: CowStr<'input>,
286 span: Range<usize>,
287 link_type: LinkType,
288 ) -> Option<(LinkType, CowStr<'input>, CowStr<'input>)> {
289 if self.link_ref_expansion_limit == 0 {
290 return None;
291 }
292
293 let (link_type, url, title) = self
294 .allocs
295 .refdefs
296 .get(link_label.as_ref())
297 .map(|matching_def| {
298 // found a matching definition!
299 let title = matching_def
300 .title
301 .as_ref()
302 .cloned()
303 .unwrap_or_else(|| "".into());
304 let url = matching_def.dest.clone();
305 (link_type, url, title)
306 })
307 .or_else(|| {
308 match self.broken_link_callback.as_mut() {
309 Some(callback) => {
310 // Construct a BrokenLink struct, which will be passed to the callback
311 let broken_link = BrokenLink {
312 span,
313 link_type,
314 reference: link_label,
315 };
316
317 callback
318 .handle_broken_link(broken_link)
319 .map(|(url, title)| (link_type.to_unknown(), url, title))
320 }
321 None => None,
322 }
323 })?;
324
325 // Limit expansion from link references.
326 // This isn't a problem for footnotes, because multiple references to the same one
327 // reuse the same node, but links/images get their HREF/SRC copied.
328 self.link_ref_expansion_limit = self
329 .link_ref_expansion_limit
330 .saturating_sub(url.len() + title.len());
331
332 Some((link_type, url, title))
333 }
334
335 /// Handle inline markup.
336 ///
337 /// When the parser encounters any item indicating potential inline markup, all
338 /// inline markup passes are run on the remainder of the chain.
339 ///
340 /// Note: there's some potential for optimization here, but that's future work.
341 fn handle_inline(&mut self) {
342 self.handle_inline_pass1();
343 self.handle_emphasis_and_hard_break();
344 }
345
346 /// Handle inline HTML, code spans, and links.
347 ///
348 /// This function handles both inline HTML and code spans, because they have
349 /// the same precedence. It also handles links, even though they have lower
350 /// precedence, because the URL of links must not be processed.
351 fn handle_inline_pass1(&mut self) {
352 let mut code_delims = CodeDelims::new();
353 let mut math_delims = MathDelims::new();
354 let mut cur = self.tree.cur();
355 let mut prev = None;
356
357 let block_end = self.tree[self.tree.peek_up().unwrap()].item.end;
358 let block_text = &self.text[..block_end];
359
360 while let Some(mut cur_ix) = cur {
361 match self.tree[cur_ix].item.body {
362 ItemBody::MaybeHtml => {
363 let next = self.tree[cur_ix].next;
364 let autolink = if let Some(next_ix) = next {
365 scan_autolink(block_text, self.tree[next_ix].item.start)
366 } else {
367 None
368 };
369
370 if let Some((ix, uri, link_type)) = autolink {
371 let node = scan_nodes_to_ix(&self.tree, next, ix);
372 let text_node = self.tree.create_node(Item {
373 start: self.tree[cur_ix].item.start + 1,
374 end: ix - 1,
375 body: ItemBody::Text {
376 backslash_escaped: false,
377 },
378 });
379 let link_ix =
380 self.allocs
381 .allocate_link(link_type, uri, "".into(), "".into());
382 self.tree[cur_ix].item.body = ItemBody::Link(link_ix);
383 self.tree[cur_ix].item.end = ix;
384 self.tree[cur_ix].next = node;
385 self.tree[cur_ix].child = Some(text_node);
386 prev = cur;
387 cur = node;
388 if let Some(node_ix) = cur {
389 self.tree[node_ix].item.start = max(self.tree[node_ix].item.start, ix);
390 }
391 continue;
392 } else {
393 let inline_html = next.and_then(|next_ix| {
394 self.scan_inline_html(
395 block_text.as_bytes(),
396 self.tree[next_ix].item.start,
397 )
398 });
399 if let Some((span, ix)) = inline_html {
400 let node = scan_nodes_to_ix(&self.tree, next, ix);
401 self.tree[cur_ix].item.body = if !span.is_empty() {
402 let converted_string =
403 String::from_utf8(span).expect("invalid utf8");
404 ItemBody::OwnedHtml(
405 self.allocs.allocate_cow(converted_string.into()),
406 )
407 } else {
408 ItemBody::InlineHtml
409 };
410 self.tree[cur_ix].item.end = ix;
411 self.tree[cur_ix].next = node;
412 prev = cur;
413 cur = node;
414 if let Some(node_ix) = cur {
415 self.tree[node_ix].item.start =
416 max(self.tree[node_ix].item.start, ix);
417 }
418 continue;
419 }
420 }
421 self.tree[cur_ix].item.body = ItemBody::Text {
422 backslash_escaped: false,
423 };
424 }
425 ItemBody::MaybeMath(can_open, _can_close, brace_context) => {
426 if !can_open {
427 self.tree[cur_ix].item.body = ItemBody::Text {
428 backslash_escaped: false,
429 };
430 prev = cur;
431 cur = self.tree[cur_ix].next;
432 continue;
433 }
434 let is_display = self.tree[cur_ix].next.map_or(false, |next_ix| {
435 matches!(
436 self.tree[next_ix].item.body,
437 ItemBody::MaybeMath(_can_open, _can_close, _brace_context)
438 )
439 });
440 let result = if math_delims.is_populated() {
441 // we have previously scanned all math environment delimiters,
442 // so we can reuse that work
443 math_delims.find(&self.tree, cur_ix, is_display, brace_context)
444 } else {
445 // we haven't previously scanned all math delimiters,
446 // so walk the AST
447 let mut scan = self.tree[cur_ix].next;
448 if is_display {
449 // a display delimiter, `$$`, is actually two delimiters
450 // skip the second one
451 scan = self.tree[scan.unwrap()].next;
452 }
453 let mut invalid = false;
454 while let Some(scan_ix) = scan {
455 if let ItemBody::MaybeMath(_can_open, can_close, delim_brace_context) =
456 self.tree[scan_ix].item.body
457 {
458 let delim_is_display =
459 self.tree[scan_ix].next.map_or(false, |next_ix| {
460 matches!(
461 self.tree[next_ix].item.body,
462 ItemBody::MaybeMath(
463 _can_open,
464 _can_close,
465 _brace_context
466 )
467 )
468 });
469 if !invalid && delim_brace_context == brace_context {
470 if (!is_display && can_close)
471 || (is_display && delim_is_display)
472 {
473 // This will skip ahead past everything we
474 // just inserted. Needed for correctness to
475 // ensure that a new scan is done after this item.
476 math_delims.clear();
477 break;
478 } else {
479 // Math cannot contain $, so the current item
480 // is invalid. Keep scanning to fill math_delims.
481 invalid = true;
482 }
483 }
484 math_delims.insert(
485 delim_is_display,
486 delim_brace_context,
487 scan_ix,
488 can_close,
489 );
490 }
491 if self.tree[scan_ix].item.body.is_block() {
492 // If this is a tight list, blocks and inlines might be
493 // siblings. Inlines can't cross the boundary like that.
494 scan = None;
495 break;
496 }
497 scan = self.tree[scan_ix].next;
498 }
499 scan
500 };
501
502 if let Some(scan_ix) = result {
503 self.make_math_span(cur_ix, scan_ix);
504 } else {
505 self.tree[cur_ix].item.body = ItemBody::Text {
506 backslash_escaped: false,
507 };
508 }
509 }
510 ItemBody::MaybeCode(mut search_count, preceded_by_backslash) => {
511 if preceded_by_backslash {
512 search_count -= 1;
513 if search_count == 0 {
514 self.tree[cur_ix].item.body = ItemBody::Text {
515 backslash_escaped: false,
516 };
517 prev = cur;
518 cur = self.tree[cur_ix].next;
519 continue;
520 }
521 }
522
523 if code_delims.is_populated() {
524 // we have previously scanned all codeblock delimiters,
525 // so we can reuse that work
526 if let Some(scan_ix) = code_delims.find(cur_ix, search_count) {
527 self.make_code_span(cur_ix, scan_ix, preceded_by_backslash);
528 } else {
529 self.tree[cur_ix].item.body = ItemBody::Text {
530 backslash_escaped: false,
531 };
532 }
533 } else {
534 // we haven't previously scanned all codeblock delimiters,
535 // so walk the AST
536 let mut scan = if search_count > 0 {
537 self.tree[cur_ix].next
538 } else {
539 None
540 };
541 while let Some(scan_ix) = scan {
542 if let ItemBody::MaybeCode(delim_count, _) =
543 self.tree[scan_ix].item.body
544 {
545 if search_count == delim_count {
546 self.make_code_span(cur_ix, scan_ix, preceded_by_backslash);
547 code_delims.clear();
548 break;
549 } else {
550 code_delims.insert(delim_count, scan_ix);
551 }
552 }
553 if self.tree[scan_ix].item.body.is_block() {
554 // If this is a tight list, blocks and inlines might be
555 // siblings. Inlines can't cross the boundary like that.
556 scan = None;
557 break;
558 }
559 scan = self.tree[scan_ix].next;
560 }
561 if scan.is_none() {
562 self.tree[cur_ix].item.body = ItemBody::Text {
563 backslash_escaped: false,
564 };
565 }
566 }
567 }
568 ItemBody::MaybeLinkOpen => {
569 self.tree[cur_ix].item.body = ItemBody::Text {
570 backslash_escaped: false,
571 };
572 self.link_stack.push(LinkStackEl {
573 node: cur_ix,
574 ty: LinkStackTy::Link,
575 });
576 }
577 ItemBody::MaybeImage => {
578 self.tree[cur_ix].item.body = ItemBody::Text {
579 backslash_escaped: false,
580 };
581 self.link_stack.push(LinkStackEl {
582 node: cur_ix,
583 ty: LinkStackTy::Image,
584 });
585 }
586 ItemBody::MaybeLinkClose(could_be_ref) => {
587 self.tree[cur_ix].item.body = ItemBody::Text {
588 backslash_escaped: false,
589 };
590 if let Some(tos) = self.link_stack.pop() {
591 if tos.ty == LinkStackTy::Disabled {
592 continue;
593 }
594 let next = self.tree[cur_ix].next;
595 if let Some((next_ix, url, title)) =
596 self.scan_inline_link(block_text, self.tree[cur_ix].item.end, next)
597 {
598 let next_node = scan_nodes_to_ix(&self.tree, next, next_ix);
599 if let Some(prev_ix) = prev {
600 self.tree[prev_ix].next = None;
601 }
602 cur = Some(tos.node);
603 cur_ix = tos.node;
604 let link_ix =
605 self.allocs
606 .allocate_link(LinkType::Inline, url, title, "".into());
607 self.tree[cur_ix].item.body = if tos.ty == LinkStackTy::Image {
608 ItemBody::Image(link_ix)
609 } else {
610 ItemBody::Link(link_ix)
611 };
612 self.tree[cur_ix].child = self.tree[cur_ix].next;
613 self.tree[cur_ix].next = next_node;
614 self.tree[cur_ix].item.end = next_ix;
615 if let Some(next_node_ix) = next_node {
616 self.tree[next_node_ix].item.start =
617 max(self.tree[next_node_ix].item.start, next_ix);
618 }
619
620 if tos.ty == LinkStackTy::Link {
621 self.link_stack.disable_all_links();
622 }
623 } else {
624 // ok, so its not an inline link. maybe it is a reference
625 // to a defined link?
626 let scan_result = scan_reference(
627 &self.tree,
628 block_text,
629 next,
630 self.options.contains(Options::ENABLE_FOOTNOTES),
631 self.options.has_gfm_footnotes(),
632 );
633 let (node_after_link, link_type) = match scan_result {
634 // [label][reference]
635 RefScan::LinkLabel(_, end_ix) => {
636 // Toggle reference viability of the last closing bracket,
637 // so that we can skip it on future iterations in case
638 // it fails in this one. In particular, we won't call
639 // the broken link callback twice on one reference.
640 let reference_close_node = if let Some(node) =
641 scan_nodes_to_ix(&self.tree, next, end_ix - 1)
642 {
643 node
644 } else {
645 continue;
646 };
647 self.tree[reference_close_node].item.body =
648 ItemBody::MaybeLinkClose(false);
649 let next_node = self.tree[reference_close_node].next;
650
651 (next_node, LinkType::Reference)
652 }
653 // [reference][]
654 RefScan::Collapsed(next_node) => {
655 // This reference has already been tried, and it's not
656 // valid. Skip it.
657 if !could_be_ref {
658 continue;
659 }
660 (next_node, LinkType::Collapsed)
661 }
662 // [shortcut]
663 //
664 // [shortcut]: /blah
665 RefScan::Failed | RefScan::UnexpectedFootnote => {
666 if !could_be_ref {
667 continue;
668 }
669 (next, LinkType::Shortcut)
670 }
671 };
672
673 // FIXME: references and labels are mixed in the naming of variables
674 // below. Disambiguate!
675
676 // (label, source_ix end)
677 let label: Option<(ReferenceLabel<'input>, usize)> = match scan_result {
678 RefScan::LinkLabel(l, end_ix) => {
679 Some((ReferenceLabel::Link(l), end_ix))
680 }
681 RefScan::Collapsed(..)
682 | RefScan::Failed
683 | RefScan::UnexpectedFootnote => {
684 // No label? maybe it is a shortcut reference
685 let label_start = self.tree[tos.node].item.end - 1;
686 let label_end = self.tree[cur_ix].item.end;
687 scan_link_label(
688 &self.tree,
689 &self.text[label_start..label_end],
690 self.options.contains(Options::ENABLE_FOOTNOTES),
691 self.options.has_gfm_footnotes(),
692 )
693 .map(|(ix, label)| (label, label_start + ix))
694 .filter(|(_, end)| *end == label_end)
695 }
696 };
697
698 let id = match &label {
699 Some(
700 (ReferenceLabel::Link(l), _) | (ReferenceLabel::Footnote(l), _),
701 ) => l.clone(),
702 None => "".into(),
703 };
704
705 // see if it's a footnote reference
706 if let Some((ReferenceLabel::Footnote(l), end)) = label {
707 let footref = self.allocs.allocate_cow(l);
708 if let Some(def) = self
709 .allocs
710 .footdefs
711 .get_mut(self.allocs.cows[footref.0].to_owned())
712 {
713 def.use_count += 1;
714 }
715 if !self.options.has_gfm_footnotes()
716 || self.allocs.footdefs.contains(&self.allocs.cows[footref.0])
717 {
718 // If this came from a MaybeImage, then the `!` prefix
719 // isn't part of the footnote reference.
720 let footnote_ix = if tos.ty == LinkStackTy::Image {
721 self.tree[tos.node].next = Some(cur_ix);
722 self.tree[tos.node].child = None;
723 self.tree[tos.node].item.body =
724 ItemBody::SynthesizeChar('!');
725 self.tree[cur_ix].item.start =
726 self.tree[tos.node].item.start + 1;
727 self.tree[tos.node].item.end =
728 self.tree[tos.node].item.start + 1;
729 cur_ix
730 } else {
731 tos.node
732 };
733 // use `next` instead of `node_after_link` because
734 // node_after_link is calculated for a [collapsed][] link,
735 // which footnotes don't support.
736 self.tree[footnote_ix].next = next;
737 self.tree[footnote_ix].child = None;
738 self.tree[footnote_ix].item.body =
739 ItemBody::FootnoteReference(footref);
740 self.tree[footnote_ix].item.end = end;
741 prev = Some(footnote_ix);
742 cur = next;
743 self.link_stack.clear();
744 continue;
745 }
746 } else if let Some((ReferenceLabel::Link(link_label), end)) = label {
747 if let Some((def_link_type, url, title)) = self
748 .fetch_link_type_url_title(
749 link_label,
750 (self.tree[tos.node].item.start)..end,
751 link_type,
752 )
753 {
754 let link_ix =
755 self.allocs.allocate_link(def_link_type, url, title, id);
756 self.tree[tos.node].item.body = if tos.ty == LinkStackTy::Image
757 {
758 ItemBody::Image(link_ix)
759 } else {
760 ItemBody::Link(link_ix)
761 };
762 let label_node = self.tree[tos.node].next;
763
764 // lets do some tree surgery to add the link to the tree
765 // 1st: skip the label node and close node
766 self.tree[tos.node].next = node_after_link;
767
768 // then, if it exists, add the label node as a child to the link node
769 if label_node != cur {
770 self.tree[tos.node].child = label_node;
771
772 // finally: disconnect list of children
773 if let Some(prev_ix) = prev {
774 self.tree[prev_ix].next = None;
775 }
776 }
777
778 self.tree[tos.node].item.end = end;
779
780 // set up cur so next node will be node_after_link
781 cur = Some(tos.node);
782 cur_ix = tos.node;
783
784 if tos.ty == LinkStackTy::Link {
785 self.link_stack.disable_all_links();
786 }
787 }
788 }
789 }
790 }
791 }
792 _ => {
793 // If this is a tight list, blocks and inlines might be
794 // siblings. Inlines can't cross the boundary like that.
795 if let Some(cur_ix) = cur {
796 if self.tree[cur_ix].item.body.is_block() {
797 self.link_stack.clear();
798 }
799 }
800 }
801 }
802 prev = cur;
803 cur = self.tree[cur_ix].next;
804 }
805 self.link_stack.clear();
806 }
807
808 fn handle_emphasis_and_hard_break(&mut self) {
809 let mut prev = None;
810 let mut prev_ix: TreeIndex;
811 let mut cur = self.tree.cur();
812
813 let mut single_quote_open: Option<TreeIndex> = None;
814 let mut double_quote_open: bool = false;
815
816 while let Some(mut cur_ix) = cur {
817 match self.tree[cur_ix].item.body {
818 ItemBody::MaybeEmphasis(mut count, can_open, can_close) => {
819 let run_length = count;
820 let c = self.text.as_bytes()[self.tree[cur_ix].item.start];
821 let both = can_open && can_close;
822 if can_close {
823 while let Some(el) =
824 self.inline_stack
825 .find_match(&mut self.tree, c, run_length, both)
826 {
827 // have a match!
828 if let Some(prev_ix) = prev {
829 self.tree[prev_ix].next = None;
830 }
831 let match_count = min(count, el.count);
832 // start, end are tree node indices
833 let mut end = cur_ix - 1;
834 let mut start = el.start + el.count;
835
836 // work from the inside out
837 while start > el.start + el.count - match_count {
838 let inc = if start > el.start + el.count - match_count + 1 {
839 2
840 } else {
841 1
842 };
843 let ty = if c == b'~' {
844 ItemBody::Strikethrough
845 } else if inc == 2 {
846 ItemBody::Strong
847 } else {
848 ItemBody::Emphasis
849 };
850
851 let root = start - inc;
852 end = end + inc;
853 self.tree[root].item.body = ty;
854 self.tree[root].item.end = self.tree[end].item.end;
855 self.tree[root].child = Some(start);
856 self.tree[root].next = None;
857 start = root;
858 }
859
860 // set next for top most emph level
861 prev_ix = el.start + el.count - match_count;
862 prev = Some(prev_ix);
863 cur = self.tree[cur_ix + match_count - 1].next;
864 self.tree[prev_ix].next = cur;
865
866 if el.count > match_count {
867 self.inline_stack.push(InlineEl {
868 start: el.start,
869 count: el.count - match_count,
870 run_length: el.run_length,
871 c: el.c,
872 both: el.both,
873 })
874 }
875 count -= match_count;
876 if count > 0 {
877 cur_ix = cur.unwrap();
878 } else {
879 break;
880 }
881 }
882 }
883 if count > 0 {
884 if can_open {
885 self.inline_stack.push(InlineEl {
886 start: cur_ix,
887 run_length,
888 count,
889 c,
890 both,
891 });
892 } else {
893 for i in 0..count {
894 self.tree[cur_ix + i].item.body = ItemBody::Text {
895 backslash_escaped: false,
896 };
897 }
898 }
899 prev_ix = cur_ix + count - 1;
900 prev = Some(prev_ix);
901 cur = self.tree[prev_ix].next;
902 }
903 }
904 ItemBody::MaybeSmartQuote(c, can_open, can_close) => {
905 self.tree[cur_ix].item.body = match c {
906 b'\'' => {
907 if let (Some(open_ix), true) = (single_quote_open, can_close) {
908 self.tree[open_ix].item.body = ItemBody::SynthesizeChar('‘');
909 single_quote_open = None;
910 } else if can_open {
911 single_quote_open = Some(cur_ix);
912 }
913 ItemBody::SynthesizeChar('’')
914 }
915 _ /* double quote */ => {
916 if can_close && double_quote_open {
917 double_quote_open = false;
918 ItemBody::SynthesizeChar('”')
919 } else {
920 if can_open && !double_quote_open {
921 double_quote_open = true;
922 }
923 ItemBody::SynthesizeChar('“')
924 }
925 }
926 };
927 prev = cur;
928 cur = self.tree[cur_ix].next;
929 }
930 ItemBody::HardBreak(true) => {
931 if self.tree[cur_ix].next.is_none() {
932 self.tree[cur_ix].item.body = ItemBody::SynthesizeChar('\\');
933 }
934 prev = cur;
935 cur = self.tree[cur_ix].next;
936 }
937 _ => {
938 prev = cur;
939 // If this is a tight list, blocks and inlines might be
940 // siblings. Inlines can't cross the boundary like that.
941 if let Some(cur_ix) = cur {
942 if self.tree[cur_ix].item.body.is_block() {
943 self.inline_stack.pop_all(&mut self.tree);
944 }
945 }
946 cur = self.tree[cur_ix].next;
947 }
948 }
949 }
950 self.inline_stack.pop_all(&mut self.tree);
951 }
952
953 /// Returns next byte index, url and title.
954 fn scan_inline_link(
955 &self,
956 underlying: &'input str,
957 mut ix: usize,
958 node: Option<TreeIndex>,
959 ) -> Option<(usize, CowStr<'input>, CowStr<'input>)> {
960 if scan_ch(&underlying.as_bytes()[ix..], b'(') == 0 {
961 return None;
962 }
963 ix += 1;
964
965 let scan_separator = |ix: &mut usize| {
966 *ix += scan_while(&underlying.as_bytes()[*ix..], is_ascii_whitespace_no_nl);
967 if let Some(bl) = scan_eol(&underlying.as_bytes()[*ix..]) {
968 *ix += bl;
969 let mut line_start = LineStart::new(&underlying.as_bytes()[*ix..]);
970 let _ = scan_containers(
971 &self.tree,
972 &mut line_start,
973 self.options.has_gfm_footnotes(),
974 );
975 *ix += line_start.bytes_scanned();
976 }
977 *ix += scan_while(&underlying.as_bytes()[*ix..], is_ascii_whitespace_no_nl);
978 };
979
980 scan_separator(&mut ix);
981
982 let (dest_length, dest) = scan_link_dest(underlying, ix, LINK_MAX_NESTED_PARENS)?;
983 let dest = unescape(dest, self.tree.is_in_table());
984 ix += dest_length;
985
986 scan_separator(&mut ix);
987
988 let title = if let Some((bytes_scanned, t)) = self.scan_link_title(underlying, ix, node) {
989 ix += bytes_scanned;
990 scan_separator(&mut ix);
991 t
992 } else {
993 "".into()
994 };
995 if scan_ch(&underlying.as_bytes()[ix..], b')') == 0 {
996 return None;
997 }
998 ix += 1;
999
1000 Some((ix, dest, title))
1001 }
1002
1003 // returns (bytes scanned, title cow)
1004 fn scan_link_title(
1005 &self,
1006 text: &'input str,
1007 start_ix: usize,
1008 node: Option<TreeIndex>,
1009 ) -> Option<(usize, CowStr<'input>)> {
1010 let bytes = text.as_bytes();
1011 let open = match bytes.get(start_ix) {
1012 Some(b @ b'\'') | Some(b @ b'\"') | Some(b @ b'(') => *b,
1013 _ => return None,
1014 };
1015 let close = if open == b'(' { b')' } else { open };
1016
1017 let mut title = String::new();
1018 let mut mark = start_ix + 1;
1019 let mut i = start_ix + 1;
1020
1021 while i < bytes.len() {
1022 let c = bytes[i];
1023
1024 if c == close {
1025 let cow = if mark == 1 {
1026 (i - start_ix + 1, text[mark..i].into())
1027 } else {
1028 title.push_str(&text[mark..i]);
1029 (i - start_ix + 1, title.into())
1030 };
1031
1032 return Some(cow);
1033 }
1034 if c == open {
1035 return None;
1036 }
1037
1038 if c == b'\n' || c == b'\r' {
1039 if let Some(node_ix) = scan_nodes_to_ix(&self.tree, node, i + 1) {
1040 if self.tree[node_ix].item.start > i {
1041 title.push_str(&text[mark..i]);
1042 title.push('\n');
1043 i = self.tree[node_ix].item.start;
1044 mark = i;
1045 continue;
1046 }
1047 }
1048 }
1049 if c == b'&' {
1050 if let (n, Some(value)) = scan_entity(&bytes[i..]) {
1051 title.push_str(&text[mark..i]);
1052 title.push_str(&value);
1053 i += n;
1054 mark = i;
1055 continue;
1056 }
1057 }
1058 if self.tree.is_in_table()
1059 && c == b'\\'
1060 && i + 2 < bytes.len()
1061 && bytes[i + 1] == b'\\'
1062 && bytes[i + 2] == b'|'
1063 {
1064 // this runs if there are an even number of pipes in a table
1065 // if it's odd, then it gets parsed as normal
1066 title.push_str(&text[mark..i]);
1067 i += 2;
1068 mark = i;
1069 }
1070 if c == b'\\' && i + 1 < bytes.len() && is_ascii_punctuation(bytes[i + 1]) {
1071 title.push_str(&text[mark..i]);
1072 i += 1;
1073 mark = i;
1074 }
1075
1076 i += 1;
1077 }
1078
1079 None
1080 }
1081
1082 fn make_math_span(&mut self, open: TreeIndex, mut close: TreeIndex) {
1083 let start_is_display = self.tree[open].next.filter(|&next_ix| {
1084 next_ix != close
1085 && matches!(
1086 self.tree[next_ix].item.body,
1087 ItemBody::MaybeMath(_can_open, _can_close, _brace_context)
1088 )
1089 });
1090 let end_is_display = self.tree[close].next.filter(|&next_ix| {
1091 matches!(
1092 self.tree[next_ix].item.body,
1093 ItemBody::MaybeMath(_can_open, _can_close, _brace_context)
1094 )
1095 });
1096 let is_display = start_is_display.is_some() && end_is_display.is_some();
1097 if is_display {
1098 // This unwrap() can't panic, because if the next variable were None, end_is_display would be None
1099 close = self.tree[close].next.unwrap();
1100 self.tree[open].next = Some(close);
1101 self.tree[open].item.end += 1;
1102 self.tree[close].item.start -= 1;
1103 } else {
1104 if self.tree[open].item.end == self.tree[close].item.start {
1105 // inline math spans cannot be empty
1106 self.tree[open].item.body = ItemBody::Text {
1107 backslash_escaped: false,
1108 };
1109 return;
1110 }
1111 self.tree[open].next = Some(close);
1112 }
1113 let span_start = self.tree[open].item.end;
1114 let span_end = self.tree[close].item.start;
1115
1116 let bytes = self.text.as_bytes();
1117 let mut buf: Option<String> = None;
1118
1119 let mut start_ix = span_start;
1120 let mut ix = span_start;
1121 while ix < span_end {
1122 let c = bytes[ix];
1123 if c == b'\r' || c == b'\n' {
1124 ix += 1;
1125 let buf = buf.get_or_insert_with(|| String::with_capacity(ix - span_start));
1126 buf.push_str(&self.text[start_ix..ix]);
1127 let mut line_start = LineStart::new(&bytes[ix..]);
1128 let _ = scan_containers(
1129 &self.tree,
1130 &mut line_start,
1131 self.options.has_gfm_footnotes(),
1132 );
1133 ix += line_start.bytes_scanned();
1134 start_ix = ix;
1135 } else if c == b'\\' && bytes.get(ix + 1) == Some(&b'|') && self.tree.is_in_table() {
1136 let buf = buf.get_or_insert_with(|| String::with_capacity(ix + 1 - span_start));
1137 buf.push_str(&self.text[start_ix..ix]);
1138 buf.push('|');
1139 ix += 2;
1140 start_ix = ix;
1141 } else {
1142 ix += 1;
1143 }
1144 }
1145
1146 let cow = if let Some(mut buf) = buf {
1147 buf.push_str(&self.text[start_ix..span_end]);
1148 buf.into()
1149 } else {
1150 self.text[span_start..span_end].into()
1151 };
1152
1153 self.tree[open].item.body = ItemBody::Math(self.allocs.allocate_cow(cow), is_display);
1154 self.tree[open].item.end = self.tree[close].item.end;
1155 self.tree[open].next = self.tree[close].next;
1156 }
1157
1158 /// Make a code span.
1159 ///
1160 /// Both `open` and `close` are matching MaybeCode items.
1161 fn make_code_span(&mut self, open: TreeIndex, close: TreeIndex, preceding_backslash: bool) {
1162 let bytes = self.text.as_bytes();
1163 let span_start = self.tree[open].item.end;
1164 let span_end = self.tree[close].item.start;
1165 let mut buf: Option<String> = None;
1166
1167 let mut start_ix = span_start;
1168 let mut ix = span_start;
1169 while ix < span_end {
1170 let c = bytes[ix];
1171 if c == b'\r' || c == b'\n' {
1172 let buf = buf.get_or_insert_with(|| String::with_capacity(ix + 1 - span_start));
1173 buf.push_str(&self.text[start_ix..ix]);
1174 buf.push(' ');
1175 ix += 1;
1176 let mut line_start = LineStart::new(&bytes[ix..]);
1177 let _ = scan_containers(
1178 &self.tree,
1179 &mut line_start,
1180 self.options.has_gfm_footnotes(),
1181 );
1182 ix += line_start.bytes_scanned();
1183 start_ix = ix;
1184 } else if c == b'\\' && bytes.get(ix + 1) == Some(&b'|') && self.tree.is_in_table() {
1185 let buf = buf.get_or_insert_with(|| String::with_capacity(ix + 1 - span_start));
1186 buf.push_str(&self.text[start_ix..ix]);
1187 buf.push('|');
1188 ix += 2;
1189 start_ix = ix;
1190 } else {
1191 ix += 1;
1192 }
1193 }
1194
1195 let (opening, closing, all_spaces) = {
1196 let s = if let Some(buf) = &mut buf {
1197 buf.push_str(&self.text[start_ix..span_end]);
1198 &buf[..]
1199 } else {
1200 &self.text[span_start..span_end]
1201 };
1202 (
1203 s.as_bytes().first() == Some(&b' '),
1204 s.as_bytes().last() == Some(&b' '),
1205 s.bytes().all(|b| b == b' '),
1206 )
1207 };
1208
1209 let cow: CowStr<'input> = if !all_spaces && opening && closing {
1210 if let Some(mut buf) = buf {
1211 buf.remove(0);
1212 buf.pop();
1213 buf.into()
1214 } else {
1215 let lo = span_start + 1;
1216 let hi = (span_end - 1).max(lo);
1217 self.text[lo..hi].into()
1218 }
1219 } else if let Some(buf) = buf {
1220 buf.into()
1221 } else {
1222 self.text[span_start..span_end].into()
1223 };
1224
1225 if preceding_backslash {
1226 self.tree[open].item.body = ItemBody::Text {
1227 backslash_escaped: true,
1228 };
1229 self.tree[open].item.end = self.tree[open].item.start + 1;
1230 self.tree[open].next = Some(close);
1231 self.tree[close].item.body = ItemBody::Code(self.allocs.allocate_cow(cow));
1232 self.tree[close].item.start = self.tree[open].item.start + 1;
1233 } else {
1234 self.tree[open].item.body = ItemBody::Code(self.allocs.allocate_cow(cow));
1235 self.tree[open].item.end = self.tree[close].item.end;
1236 self.tree[open].next = self.tree[close].next;
1237 }
1238 }
1239
1240 /// On success, returns a buffer containing the inline html and byte offset.
1241 /// When no bytes were skipped, the buffer will be empty and the html can be
1242 /// represented as a subslice of the input string.
1243 fn scan_inline_html(&mut self, bytes: &[u8], ix: usize) -> Option<(Vec<u8>, usize)> {
1244 let c = *bytes.get(ix)?;
1245 if c == b'!' {
1246 Some((
1247 vec![],
1248 scan_inline_html_comment(bytes, ix + 1, &mut self.html_scan_guard)?,
1249 ))
1250 } else if c == b'?' {
1251 Some((
1252 vec![],
1253 scan_inline_html_processing(bytes, ix + 1, &mut self.html_scan_guard)?,
1254 ))
1255 } else {
1256 let (span, i) = scan_html_block_inner(
1257 // Subtract 1 to include the < character
1258 &bytes[(ix - 1)..],
1259 Some(&|bytes| {
1260 let mut line_start = LineStart::new(bytes);
1261 let _ = scan_containers(
1262 &self.tree,
1263 &mut line_start,
1264 self.options.has_gfm_footnotes(),
1265 );
1266 line_start.bytes_scanned()
1267 }),
1268 )?;
1269 Some((span, i + ix - 1))
1270 }
1271 }
1272
1273 /// Consumes the event iterator and produces an iterator that produces
1274 /// `(Event, Range)` pairs, where the `Range` value maps to the corresponding
1275 /// range in the markdown source.
1276 pub fn into_offset_iter(self) -> OffsetIter<'input, F> {
1277 OffsetIter { inner: self }
1278 }
1279}
1280
1281/// Returns number of containers scanned.
1282pub(crate) fn scan_containers(
1283 tree: &Tree<Item>,
1284 line_start: &mut LineStart<'_>,
1285 gfm_footnotes: bool,
1286) -> usize {
1287 let mut i = 0;
1288 for &node_ix in tree.walk_spine() {
1289 match tree[node_ix].item.body {
1290 ItemBody::BlockQuote(..) => {
1291 // `scan_blockquote_marker` saves & restores internally
1292 if !line_start.scan_blockquote_marker() {
1293 break;
1294 }
1295 }
1296 ItemBody::ListItem(indent) => {
1297 let save = line_start.clone();
1298 if !line_start.scan_space(indent) && !line_start.is_at_eol() {
1299 *line_start = save;
1300 break;
1301 }
1302 }
1303 ItemBody::FootnoteDefinition(..) if gfm_footnotes => {
1304 let save = line_start.clone();
1305 if !line_start.scan_space(4) && !line_start.is_at_eol() {
1306 *line_start = save;
1307 break;
1308 }
1309 }
1310 _ => (),
1311 }
1312 i += 1;
1313 }
1314 i
1315}
1316
1317impl Tree<Item> {
1318 pub(crate) fn append_text(&mut self, start: usize, end: usize, backslash_escaped: bool) {
1319 if end > start {
1320 if let Some(ix) = self.cur() {
1321 if matches!(self[ix].item.body, ItemBody::Text { .. }) && self[ix].item.end == start
1322 {
1323 self[ix].item.end = end;
1324 return;
1325 }
1326 }
1327 self.append(Item {
1328 start,
1329 end,
1330 body: ItemBody::Text { backslash_escaped },
1331 });
1332 }
1333 }
1334 /// Returns true if the current node is inside a table.
1335 ///
1336 /// If `cur` is an ItemBody::Table, it would return false,
1337 /// but since the `TableRow` and `TableHead` and `TableCell`
1338 /// are children of the table, anything doing inline parsing
1339 /// doesn't need to care about that.
1340 pub(crate) fn is_in_table(&self) -> bool {
1341 fn might_be_in_table(item: &Item) -> bool {
1342 item.body.is_inline()
1343 || matches!(item.body, |ItemBody::TableHead| ItemBody::TableRow
1344 | ItemBody::TableCell)
1345 }
1346 for &ix in self.walk_spine().rev() {
1347 if matches!(self[ix].item.body, ItemBody::Table(_)) {
1348 return true;
1349 }
1350 if !might_be_in_table(&self[ix].item) {
1351 return false;
1352 }
1353 }
1354 false
1355 }
1356}
1357
1358#[derive(Copy, Clone, Debug)]
1359struct InlineEl {
1360 /// offset of tree node
1361 start: TreeIndex,
1362 /// number of delimiters available for matching
1363 count: usize,
1364 /// length of the run that these delimiters came from
1365 run_length: usize,
1366 /// b'*', b'_', or b'~'
1367 c: u8,
1368 /// can both open and close
1369 both: bool,
1370}
1371
1372#[derive(Debug, Clone, Default)]
1373struct InlineStack {
1374 stack: Vec<InlineEl>,
1375 // Lower bounds for matching indices in the stack. For example
1376 // a strikethrough delimiter will never match with any element
1377 // in the stack with index smaller than
1378 // `lower_bounds[InlineStack::TILDES]`.
1379 lower_bounds: [usize; 9],
1380}
1381
1382impl InlineStack {
1383 /// These are indices into the lower bounds array.
1384 /// Not both refers to the property that the delimiter can not both
1385 /// be opener as a closer.
1386 const UNDERSCORE_NOT_BOTH: usize = 0;
1387 const ASTERISK_NOT_BOTH: usize = 1;
1388 const ASTERISK_BASE: usize = 2;
1389 const TILDES: usize = 5;
1390 const UNDERSCORE_BASE: usize = 6;
1391
1392 fn pop_all(&mut self, tree: &mut Tree<Item>) {
1393 for el in self.stack.drain(..) {
1394 for i in 0..el.count {
1395 tree[el.start + i].item.body = ItemBody::Text {
1396 backslash_escaped: false,
1397 };
1398 }
1399 }
1400 self.lower_bounds = [0; 9];
1401 }
1402
1403 fn get_lowerbound(&self, c: u8, count: usize, both: bool) -> usize {
1404 if c == b'_' {
1405 let mod3_lower = self.lower_bounds[InlineStack::UNDERSCORE_BASE + count % 3];
1406 if both {
1407 mod3_lower
1408 } else {
1409 min(
1410 mod3_lower,
1411 self.lower_bounds[InlineStack::UNDERSCORE_NOT_BOTH],
1412 )
1413 }
1414 } else if c == b'*' {
1415 let mod3_lower = self.lower_bounds[InlineStack::ASTERISK_BASE + count % 3];
1416 if both {
1417 mod3_lower
1418 } else {
1419 min(
1420 mod3_lower,
1421 self.lower_bounds[InlineStack::ASTERISK_NOT_BOTH],
1422 )
1423 }
1424 } else {
1425 self.lower_bounds[InlineStack::TILDES]
1426 }
1427 }
1428
1429 fn set_lowerbound(&mut self, c: u8, count: usize, both: bool, new_bound: usize) {
1430 if c == b'_' {
1431 if both {
1432 self.lower_bounds[InlineStack::UNDERSCORE_BASE + count % 3] = new_bound;
1433 } else {
1434 self.lower_bounds[InlineStack::UNDERSCORE_NOT_BOTH] = new_bound;
1435 }
1436 } else if c == b'*' {
1437 self.lower_bounds[InlineStack::ASTERISK_BASE + count % 3] = new_bound;
1438 if !both {
1439 self.lower_bounds[InlineStack::ASTERISK_NOT_BOTH] = new_bound;
1440 }
1441 } else {
1442 self.lower_bounds[InlineStack::TILDES] = new_bound;
1443 }
1444 }
1445
1446 fn truncate(&mut self, new_bound: usize) {
1447 self.stack.truncate(new_bound);
1448 for lower_bound in &mut self.lower_bounds {
1449 if *lower_bound > new_bound {
1450 *lower_bound = new_bound;
1451 }
1452 }
1453 }
1454
1455 fn find_match(
1456 &mut self,
1457 tree: &mut Tree<Item>,
1458 c: u8,
1459 run_length: usize,
1460 both: bool,
1461 ) -> Option<InlineEl> {
1462 let lowerbound = min(self.stack.len(), self.get_lowerbound(c, run_length, both));
1463 let res = self.stack[lowerbound..]
1464 .iter()
1465 .cloned()
1466 .enumerate()
1467 .rfind(|(_, el)| {
1468 if c == b'~' && run_length != el.run_length {
1469 return false;
1470 }
1471 el.c == c
1472 && (!both && !el.both
1473 || (run_length + el.run_length) % 3 != 0
1474 || run_length % 3 == 0)
1475 });
1476
1477 if let Some((matching_ix, matching_el)) = res {
1478 let matching_ix = matching_ix + lowerbound;
1479 for el in &self.stack[(matching_ix + 1)..] {
1480 for i in 0..el.count {
1481 tree[el.start + i].item.body = ItemBody::Text {
1482 backslash_escaped: false,
1483 };
1484 }
1485 }
1486 self.truncate(matching_ix);
1487 Some(matching_el)
1488 } else {
1489 self.set_lowerbound(c, run_length, both, self.stack.len());
1490 None
1491 }
1492 }
1493
1494 fn trim_lower_bound(&mut self, ix: usize) {
1495 self.lower_bounds[ix] = self.lower_bounds[ix].min(self.stack.len());
1496 }
1497
1498 fn push(&mut self, el: InlineEl) {
1499 if el.c == b'~' {
1500 self.trim_lower_bound(InlineStack::TILDES);
1501 }
1502 self.stack.push(el)
1503 }
1504}
1505
1506#[derive(Debug, Clone)]
1507enum RefScan<'a> {
1508 // label, source ix of label end
1509 LinkLabel(CowStr<'a>, usize),
1510 // contains next node index
1511 Collapsed(Option<TreeIndex>),
1512 UnexpectedFootnote,
1513 Failed,
1514}
1515
1516/// Skips forward within a block to a node which spans (ends inclusive) the given
1517/// index into the source.
1518fn scan_nodes_to_ix(
1519 tree: &Tree<Item>,
1520 mut node: Option<TreeIndex>,
1521 ix: usize,
1522) -> Option<TreeIndex> {
1523 while let Some(node_ix: TreeIndex) = node {
1524 if tree[node_ix].item.end <= ix {
1525 node = tree[node_ix].next;
1526 } else {
1527 break;
1528 }
1529 }
1530 node
1531}
1532
1533/// Scans an inline link label, which cannot be interrupted.
1534/// Returns number of bytes (including brackets) and label on success.
1535fn scan_link_label<'text>(
1536 tree: &Tree<Item>,
1537 text: &'text str,
1538 allow_footnote_refs: bool,
1539 gfm_footnotes: bool,
1540) -> Option<(usize, ReferenceLabel<'text>)> {
1541 let bytes = text.as_bytes();
1542 if bytes.len() < 2 || bytes[0] != b'[' {
1543 return None;
1544 }
1545 let linebreak_handler = |bytes: &[u8]| {
1546 let mut line_start = LineStart::new(bytes);
1547 let _ = scan_containers(tree, &mut line_start, gfm_footnotes);
1548 Some(line_start.bytes_scanned())
1549 };
1550 if allow_footnote_refs && b'^' == bytes[1] && bytes.get(2) != Some(&b']') {
1551 let linebreak_handler: &dyn Fn(&[u8]) -> Option<usize> = if gfm_footnotes {
1552 &|_| None
1553 } else {
1554 &linebreak_handler
1555 };
1556 if let Some((byte_index, cow)) =
1557 scan_link_label_rest(&text[2..], linebreak_handler, tree.is_in_table())
1558 {
1559 return Some((byte_index + 2, ReferenceLabel::Footnote(cow)));
1560 }
1561 }
1562 let (byte_index, cow) =
1563 scan_link_label_rest(&text[1..], &linebreak_handler, tree.is_in_table())?;
1564 Some((byte_index + 1, ReferenceLabel::Link(cow)))
1565}
1566
1567fn scan_reference<'b>(
1568 tree: &Tree<Item>,
1569 text: &'b str,
1570 cur: Option<TreeIndex>,
1571 allow_footnote_refs: bool,
1572 gfm_footnotes: bool,
1573) -> RefScan<'b> {
1574 let cur_ix: TreeIndex = match cur {
1575 None => return RefScan::Failed,
1576 Some(cur_ix: TreeIndex) => cur_ix,
1577 };
1578 let start: usize = tree[cur_ix].item.start;
1579 let tail: &[u8] = &text.as_bytes()[start..];
1580
1581 if tail.starts_with(needle:b"[]") {
1582 // TODO: this unwrap is sus and should be looked at closer
1583 let closing_node: TreeIndex = tree[cur_ix].next.unwrap();
1584 RefScan::Collapsed(tree[closing_node].next)
1585 } else {
1586 let label: Option<(usize, ReferenceLabel<'_>)> = scan_link_label(tree, &text[start..], allow_footnote_refs, gfm_footnotes);
1587 match label {
1588 Some((ix: usize, ReferenceLabel::Link(label: CowStr<'_>))) => RefScan::LinkLabel(label, start + ix),
1589 Some((_ix: usize, ReferenceLabel::Footnote(_label: CowStr<'_>))) => RefScan::UnexpectedFootnote,
1590 None => RefScan::Failed,
1591 }
1592 }
1593}
1594
1595#[derive(Clone, Default)]
1596struct LinkStack {
1597 inner: Vec<LinkStackEl>,
1598 disabled_ix: usize,
1599}
1600
1601impl LinkStack {
1602 fn push(&mut self, el: LinkStackEl) {
1603 self.inner.push(el);
1604 }
1605
1606 fn pop(&mut self) -> Option<LinkStackEl> {
1607 let el = self.inner.pop();
1608 self.disabled_ix = std::cmp::min(self.disabled_ix, self.inner.len());
1609 el
1610 }
1611
1612 fn clear(&mut self) {
1613 self.inner.clear();
1614 self.disabled_ix = 0;
1615 }
1616
1617 fn disable_all_links(&mut self) {
1618 for el in &mut self.inner[self.disabled_ix..] {
1619 if el.ty == LinkStackTy::Link {
1620 el.ty = LinkStackTy::Disabled;
1621 }
1622 }
1623 self.disabled_ix = self.inner.len();
1624 }
1625}
1626
1627#[derive(Clone, Debug)]
1628struct LinkStackEl {
1629 node: TreeIndex,
1630 ty: LinkStackTy,
1631}
1632
1633#[derive(PartialEq, Clone, Debug)]
1634enum LinkStackTy {
1635 Link,
1636 Image,
1637 Disabled,
1638}
1639
1640/// Contains the destination URL, title and source span of a reference definition.
1641#[derive(Clone, Debug)]
1642pub struct LinkDef<'a> {
1643 pub dest: CowStr<'a>,
1644 pub title: Option<CowStr<'a>>,
1645 pub span: Range<usize>,
1646}
1647
1648/// Contains the destination URL, title and source span of a reference definition.
1649#[derive(Clone, Debug)]
1650pub struct FootnoteDef {
1651 pub use_count: usize,
1652}
1653
1654/// Tracks tree indices of code span delimiters of each length. It should prevent
1655/// quadratic scanning behaviours by providing (amortized) constant time lookups.
1656struct CodeDelims {
1657 inner: HashMap<usize, VecDeque<TreeIndex>>,
1658 seen_first: bool,
1659}
1660
1661impl CodeDelims {
1662 fn new() -> Self {
1663 Self {
1664 inner: Default::default(),
1665 seen_first: false,
1666 }
1667 }
1668
1669 fn insert(&mut self, count: usize, ix: TreeIndex) {
1670 if self.seen_first {
1671 self.inner.entry(count).or_default().push_back(ix);
1672 } else {
1673 // Skip the first insert, since that delimiter will always
1674 // be an opener and not a closer.
1675 self.seen_first = true;
1676 }
1677 }
1678
1679 fn is_populated(&self) -> bool {
1680 !self.inner.is_empty()
1681 }
1682
1683 fn find(&mut self, open_ix: TreeIndex, count: usize) -> Option<TreeIndex> {
1684 while let Some(ix) = self.inner.get_mut(&count)?.pop_front() {
1685 if ix > open_ix {
1686 return Some(ix);
1687 }
1688 }
1689 None
1690 }
1691
1692 fn clear(&mut self) {
1693 self.inner.clear();
1694 self.seen_first = false;
1695 }
1696}
1697
1698/// Tracks brace contexts and delimiter length for math delimiters.
1699/// Provides amortized constant-time lookups.
1700struct MathDelims {
1701 inner: HashMap<u8, VecDeque<(TreeIndex, bool, bool)>>,
1702}
1703
1704impl MathDelims {
1705 fn new() -> Self {
1706 Self {
1707 inner: Default::default(),
1708 }
1709 }
1710
1711 fn insert(
1712 &mut self,
1713 delim_is_display: bool,
1714 brace_context: u8,
1715 ix: TreeIndex,
1716 can_close: bool,
1717 ) {
1718 self.inner
1719 .entry(brace_context)
1720 .or_default()
1721 .push_back((ix, can_close, delim_is_display));
1722 }
1723
1724 fn is_populated(&self) -> bool {
1725 !self.inner.is_empty()
1726 }
1727
1728 fn find(
1729 &mut self,
1730 tree: &Tree<Item>,
1731 open_ix: TreeIndex,
1732 is_display: bool,
1733 brace_context: u8,
1734 ) -> Option<TreeIndex> {
1735 while let Some((ix, can_close, delim_is_display)) =
1736 self.inner.get_mut(&brace_context)?.pop_front()
1737 {
1738 if ix <= open_ix || (is_display && tree[open_ix].next == Some(ix)) {
1739 continue;
1740 }
1741 let can_close = can_close && tree[open_ix].item.end != tree[ix].item.start;
1742 if (!is_display && can_close) || (is_display && delim_is_display) {
1743 return Some(ix);
1744 }
1745 // if we can't use it, leave it in the queue as a tombstone for the next
1746 // thing that tries to match it
1747 self.inner
1748 .get_mut(&brace_context)?
1749 .push_front((ix, can_close, delim_is_display));
1750 break;
1751 }
1752 None
1753 }
1754
1755 fn clear(&mut self) {
1756 self.inner.clear();
1757 }
1758}
1759
1760#[derive(Copy, Clone, PartialEq, Eq, Debug)]
1761pub(crate) struct LinkIndex(usize);
1762
1763#[derive(Copy, Clone, PartialEq, Eq, Debug)]
1764pub(crate) struct CowIndex(usize);
1765
1766#[derive(Copy, Clone, PartialEq, Eq, Debug)]
1767pub(crate) struct AlignmentIndex(usize);
1768
1769#[derive(Copy, Clone, PartialEq, Eq, Debug)]
1770pub(crate) struct HeadingIndex(NonZeroUsize);
1771
1772#[derive(Clone)]
1773pub(crate) struct Allocations<'a> {
1774 pub refdefs: RefDefs<'a>,
1775 pub footdefs: FootnoteDefs<'a>,
1776 links: Vec<(LinkType, CowStr<'a>, CowStr<'a>, CowStr<'a>)>,
1777 cows: Vec<CowStr<'a>>,
1778 alignments: Vec<Vec<Alignment>>,
1779 headings: Vec<HeadingAttributes<'a>>,
1780}
1781
1782/// Used by the heading attributes extension.
1783#[derive(Clone)]
1784pub(crate) struct HeadingAttributes<'a> {
1785 pub id: Option<CowStr<'a>>,
1786 pub classes: Vec<CowStr<'a>>,
1787 pub attrs: Vec<(CowStr<'a>, Option<CowStr<'a>>)>,
1788}
1789
1790/// Keeps track of the reference definitions defined in the document.
1791#[derive(Clone, Default, Debug)]
1792pub struct RefDefs<'input>(pub(crate) HashMap<LinkLabel<'input>, LinkDef<'input>>);
1793
1794/// Keeps track of the footnote definitions defined in the document.
1795#[derive(Clone, Default, Debug)]
1796pub struct FootnoteDefs<'input>(pub(crate) HashMap<FootnoteLabel<'input>, FootnoteDef>);
1797
1798impl<'input, 'b, 's> RefDefs<'input>
1799where
1800 's: 'b,
1801{
1802 /// Performs a lookup on reference label using unicode case folding.
1803 pub fn get(&'s self, key: &'b str) -> Option<&'b LinkDef<'input>> {
1804 self.0.get(&UniCase::new(key.into()))
1805 }
1806
1807 /// Provides an iterator over all the document's reference definitions.
1808 pub fn iter(&'s self) -> impl Iterator<Item = (&'s str, &'s LinkDef<'input>)> {
1809 self.0.iter().map(|(k: &UniCase>, v: &LinkDef<'_>)| (k.as_ref(), v))
1810 }
1811}
1812
1813impl<'input, 'b, 's> FootnoteDefs<'input>
1814where
1815 's: 'b,
1816{
1817 /// Performs a lookup on reference label using unicode case folding.
1818 pub fn contains(&'s self, key: &'b str) -> bool {
1819 self.0.contains_key(&UniCase::new(key.into()))
1820 }
1821 /// Performs a lookup on reference label using unicode case folding.
1822 pub fn get_mut(&'s mut self, key: CowStr<'input>) -> Option<&'s mut FootnoteDef> {
1823 self.0.get_mut(&UniCase::new(key))
1824 }
1825}
1826
1827impl<'a> Allocations<'a> {
1828 pub fn new() -> Self {
1829 Self {
1830 refdefs: RefDefs::default(),
1831 footdefs: FootnoteDefs::default(),
1832 links: Vec::with_capacity(128),
1833 cows: Vec::new(),
1834 alignments: Vec::new(),
1835 headings: Vec::new(),
1836 }
1837 }
1838
1839 pub fn allocate_cow(&mut self, cow: CowStr<'a>) -> CowIndex {
1840 let ix = self.cows.len();
1841 self.cows.push(cow);
1842 CowIndex(ix)
1843 }
1844
1845 pub fn allocate_link(
1846 &mut self,
1847 ty: LinkType,
1848 url: CowStr<'a>,
1849 title: CowStr<'a>,
1850 id: CowStr<'a>,
1851 ) -> LinkIndex {
1852 let ix = self.links.len();
1853 self.links.push((ty, url, title, id));
1854 LinkIndex(ix)
1855 }
1856
1857 pub fn allocate_alignment(&mut self, alignment: Vec<Alignment>) -> AlignmentIndex {
1858 let ix = self.alignments.len();
1859 self.alignments.push(alignment);
1860 AlignmentIndex(ix)
1861 }
1862
1863 pub fn allocate_heading(&mut self, attrs: HeadingAttributes<'a>) -> HeadingIndex {
1864 let ix = self.headings.len();
1865 self.headings.push(attrs);
1866 // This won't panic. `self.headings.len()` can't be `usize::MAX` since
1867 // such a long Vec cannot fit in memory.
1868 let ix_nonzero = NonZeroUsize::new(ix.wrapping_add(1)).expect("too many headings");
1869 HeadingIndex(ix_nonzero)
1870 }
1871
1872 pub fn take_cow(&mut self, ix: CowIndex) -> CowStr<'a> {
1873 std::mem::replace(&mut self.cows[ix.0], "".into())
1874 }
1875
1876 pub fn take_link(&mut self, ix: LinkIndex) -> (LinkType, CowStr<'a>, CowStr<'a>, CowStr<'a>) {
1877 let default_link = (LinkType::ShortcutUnknown, "".into(), "".into(), "".into());
1878 std::mem::replace(&mut self.links[ix.0], default_link)
1879 }
1880
1881 pub fn take_alignment(&mut self, ix: AlignmentIndex) -> Vec<Alignment> {
1882 std::mem::take(&mut self.alignments[ix.0])
1883 }
1884}
1885
1886impl<'a> Index<CowIndex> for Allocations<'a> {
1887 type Output = CowStr<'a>;
1888
1889 fn index(&self, ix: CowIndex) -> &Self::Output {
1890 self.cows.index(ix.0)
1891 }
1892}
1893
1894impl<'a> Index<LinkIndex> for Allocations<'a> {
1895 type Output = (LinkType, CowStr<'a>, CowStr<'a>, CowStr<'a>);
1896
1897 fn index(&self, ix: LinkIndex) -> &Self::Output {
1898 self.links.index(ix.0)
1899 }
1900}
1901
1902impl<'a> Index<AlignmentIndex> for Allocations<'a> {
1903 type Output = Vec<Alignment>;
1904
1905 fn index(&self, ix: AlignmentIndex) -> &Self::Output {
1906 self.alignments.index(ix.0)
1907 }
1908}
1909
1910impl<'a> Index<HeadingIndex> for Allocations<'a> {
1911 type Output = HeadingAttributes<'a>;
1912
1913 fn index(&self, ix: HeadingIndex) -> &Self::Output {
1914 self.headings.index(ix.0.get() - 1)
1915 }
1916}
1917
1918/// A struct containing information on the reachability of certain inline HTML
1919/// elements. In particular, for cdata elements (`<![CDATA[`), processing
1920/// elements (`<?`) and declarations (`<!DECLARATION`). The respectives usizes
1921/// represent the indices before which a scan will always fail and can hence
1922/// be skipped.
1923#[derive(Clone, Default)]
1924pub(crate) struct HtmlScanGuard {
1925 pub cdata: usize,
1926 pub processing: usize,
1927 pub declaration: usize,
1928 pub comment: usize,
1929}
1930
1931/// Trait for broken link callbacks.
1932///
1933/// See [Parser::new_with_broken_link_callback].
1934/// Automatically implemented for closures with the appropriate signature.
1935pub trait BrokenLinkCallback<'input> {
1936 fn handle_broken_link(
1937 &mut self,
1938 link: BrokenLink<'input>,
1939 ) -> Option<(CowStr<'input>, CowStr<'input>)>;
1940}
1941
1942impl<'input, T> BrokenLinkCallback<'input> for T
1943where
1944 T: FnMut(BrokenLink<'input>) -> Option<(CowStr<'input>, CowStr<'input>)>,
1945{
1946 fn handle_broken_link(
1947 &mut self,
1948 link: BrokenLink<'input>,
1949 ) -> Option<(CowStr<'input>, CowStr<'input>)> {
1950 self(link)
1951 }
1952}
1953
1954impl<'input> BrokenLinkCallback<'input> for Box<dyn BrokenLinkCallback<'input>> {
1955 fn handle_broken_link(
1956 &mut self,
1957 link: BrokenLink<'input>,
1958 ) -> Option<(CowStr<'input>, CowStr<'input>)> {
1959 (**self).handle_broken_link(link)
1960 }
1961}
1962
1963/// Broken link callback that does nothing.
1964#[derive(Debug)]
1965pub struct DefaultBrokenLinkCallback;
1966
1967impl<'input> BrokenLinkCallback<'input> for DefaultBrokenLinkCallback {
1968 fn handle_broken_link(
1969 &mut self,
1970 _link: BrokenLink<'input>,
1971 ) -> Option<(CowStr<'input>, CowStr<'input>)> {
1972 None
1973 }
1974}
1975
1976/// Markdown event and source range iterator.
1977///
1978/// Generates tuples where the first element is the markdown event and the second
1979/// is a the corresponding range in the source string.
1980///
1981/// Constructed from a `Parser` using its
1982/// [`into_offset_iter`](struct.Parser.html#method.into_offset_iter) method.
1983#[derive(Debug)]
1984pub struct OffsetIter<'a, F = DefaultBrokenLinkCallback> {
1985 inner: Parser<'a, F>,
1986}
1987
1988impl<'a, F: BrokenLinkCallback<'a>> OffsetIter<'a, F> {
1989 /// Returns a reference to the internal reference definition tracker.
1990 pub fn reference_definitions(&self) -> &RefDefs<'_> {
1991 self.inner.reference_definitions()
1992 }
1993}
1994
1995impl<'a, F: BrokenLinkCallback<'a>> Iterator for OffsetIter<'a, F> {
1996 type Item = (Event<'a>, Range<usize>);
1997
1998 fn next(&mut self) -> Option<Self::Item> {
1999 match self.inner.tree.cur() {
2000 None => {
2001 let ix = self.inner.tree.pop()?;
2002 let tag_end = body_to_tag_end(&self.inner.tree[ix].item.body);
2003 self.inner.tree.next_sibling(ix);
2004 let span = self.inner.tree[ix].item.start..self.inner.tree[ix].item.end;
2005 debug_assert!(span.start <= span.end);
2006 Some((Event::End(tag_end), span))
2007 }
2008 Some(cur_ix) => {
2009 if self.inner.tree[cur_ix].item.body.is_inline() {
2010 self.inner.handle_inline();
2011 }
2012
2013 let node = self.inner.tree[cur_ix];
2014 let item = node.item;
2015 let event = item_to_event(item, self.inner.text, &mut self.inner.allocs);
2016 if let Event::Start(..) = event {
2017 self.inner.tree.push();
2018 } else {
2019 self.inner.tree.next_sibling(cur_ix);
2020 }
2021 debug_assert!(item.start <= item.end);
2022 Some((event, item.start..item.end))
2023 }
2024 }
2025 }
2026}
2027
2028fn body_to_tag_end(body: &ItemBody) -> TagEnd {
2029 match *body {
2030 ItemBody::Paragraph => TagEnd::Paragraph,
2031 ItemBody::Emphasis => TagEnd::Emphasis,
2032 ItemBody::Strong => TagEnd::Strong,
2033 ItemBody::Strikethrough => TagEnd::Strikethrough,
2034 ItemBody::Link(..) => TagEnd::Link,
2035 ItemBody::Image(..) => TagEnd::Image,
2036 ItemBody::Heading(level, _) => TagEnd::Heading(level),
2037 ItemBody::IndentCodeBlock | ItemBody::FencedCodeBlock(..) => TagEnd::CodeBlock,
2038 ItemBody::BlockQuote(..) => TagEnd::BlockQuote,
2039 ItemBody::HtmlBlock => TagEnd::HtmlBlock,
2040 ItemBody::List(_, c, _) => {
2041 let is_ordered = c == b'.' || c == b')';
2042 TagEnd::List(is_ordered)
2043 }
2044 ItemBody::ListItem(_) => TagEnd::Item,
2045 ItemBody::TableHead => TagEnd::TableHead,
2046 ItemBody::TableCell => TagEnd::TableCell,
2047 ItemBody::TableRow => TagEnd::TableRow,
2048 ItemBody::Table(..) => TagEnd::Table,
2049 ItemBody::FootnoteDefinition(..) => TagEnd::FootnoteDefinition,
2050 ItemBody::MetadataBlock(kind) => TagEnd::MetadataBlock(kind),
2051 _ => panic!("unexpected item body {:?}", body),
2052 }
2053}
2054
2055fn item_to_event<'a>(item: Item, text: &'a str, allocs: &mut Allocations<'a>) -> Event<'a> {
2056 let tag = match item.body {
2057 ItemBody::Text { .. } => return Event::Text(text[item.start..item.end].into()),
2058 ItemBody::Code(cow_ix) => return Event::Code(allocs.take_cow(cow_ix)),
2059 ItemBody::SynthesizeText(cow_ix) => return Event::Text(allocs.take_cow(cow_ix)),
2060 ItemBody::SynthesizeChar(c) => return Event::Text(c.into()),
2061 ItemBody::HtmlBlock => Tag::HtmlBlock,
2062 ItemBody::Html => return Event::Html(text[item.start..item.end].into()),
2063 ItemBody::InlineHtml => return Event::InlineHtml(text[item.start..item.end].into()),
2064 ItemBody::OwnedHtml(cow_ix) => return Event::Html(allocs.take_cow(cow_ix)),
2065 ItemBody::SoftBreak => return Event::SoftBreak,
2066 ItemBody::HardBreak(_) => return Event::HardBreak,
2067 ItemBody::FootnoteReference(cow_ix) => {
2068 return Event::FootnoteReference(allocs.take_cow(cow_ix))
2069 }
2070 ItemBody::TaskListMarker(checked) => return Event::TaskListMarker(checked),
2071 ItemBody::Rule => return Event::Rule,
2072 ItemBody::Paragraph => Tag::Paragraph,
2073 ItemBody::Emphasis => Tag::Emphasis,
2074 ItemBody::Strong => Tag::Strong,
2075 ItemBody::Strikethrough => Tag::Strikethrough,
2076 ItemBody::Link(link_ix) => {
2077 let (link_type, dest_url, title, id) = allocs.take_link(link_ix);
2078 Tag::Link {
2079 link_type,
2080 dest_url,
2081 title,
2082 id,
2083 }
2084 }
2085 ItemBody::Image(link_ix) => {
2086 let (link_type, dest_url, title, id) = allocs.take_link(link_ix);
2087 Tag::Image {
2088 link_type,
2089 dest_url,
2090 title,
2091 id,
2092 }
2093 }
2094 ItemBody::Heading(level, Some(heading_ix)) => {
2095 let HeadingAttributes { id, classes, attrs } = allocs.index(heading_ix);
2096 Tag::Heading {
2097 level,
2098 id: id.clone(),
2099 classes: classes.clone(),
2100 attrs: attrs.clone(),
2101 }
2102 }
2103 ItemBody::Heading(level, None) => Tag::Heading {
2104 level,
2105 id: None,
2106 classes: Vec::new(),
2107 attrs: Vec::new(),
2108 },
2109 ItemBody::FencedCodeBlock(cow_ix) => {
2110 Tag::CodeBlock(CodeBlockKind::Fenced(allocs.take_cow(cow_ix)))
2111 }
2112 ItemBody::IndentCodeBlock => Tag::CodeBlock(CodeBlockKind::Indented),
2113 ItemBody::BlockQuote(kind) => Tag::BlockQuote(kind),
2114 ItemBody::List(_, c, listitem_start) => {
2115 if c == b'.' || c == b')' {
2116 Tag::List(Some(listitem_start))
2117 } else {
2118 Tag::List(None)
2119 }
2120 }
2121 ItemBody::ListItem(_) => Tag::Item,
2122 ItemBody::TableHead => Tag::TableHead,
2123 ItemBody::TableCell => Tag::TableCell,
2124 ItemBody::TableRow => Tag::TableRow,
2125 ItemBody::Table(alignment_ix) => Tag::Table(allocs.take_alignment(alignment_ix)),
2126 ItemBody::FootnoteDefinition(cow_ix) => Tag::FootnoteDefinition(allocs.take_cow(cow_ix)),
2127 ItemBody::MetadataBlock(kind) => Tag::MetadataBlock(kind),
2128 ItemBody::Math(cow_ix, is_display) => {
2129 return if is_display {
2130 Event::DisplayMath(allocs.take_cow(cow_ix))
2131 } else {
2132 Event::InlineMath(allocs.take_cow(cow_ix))
2133 }
2134 }
2135 _ => panic!("unexpected item body {:?}", item.body),
2136 };
2137
2138 Event::Start(tag)
2139}
2140
2141impl<'a, F: BrokenLinkCallback<'a>> Iterator for Parser<'a, F> {
2142 type Item = Event<'a>;
2143
2144 fn next(&mut self) -> Option<Event<'a>> {
2145 match self.tree.cur() {
2146 None => {
2147 let ix = self.tree.pop()?;
2148 let tag_end = body_to_tag_end(&self.tree[ix].item.body);
2149 self.tree.next_sibling(ix);
2150 Some(Event::End(tag_end))
2151 }
2152 Some(cur_ix) => {
2153 if self.tree[cur_ix].item.body.is_inline() {
2154 self.handle_inline();
2155 }
2156
2157 let node = self.tree[cur_ix];
2158 let item = node.item;
2159 let event = item_to_event(item, self.text, &mut self.allocs);
2160 if let Event::Start(ref _tag) = event {
2161 self.tree.push();
2162 } else {
2163 self.tree.next_sibling(cur_ix);
2164 }
2165 Some(event)
2166 }
2167 }
2168 }
2169}
2170
2171impl<'a, F: BrokenLinkCallback<'a>> FusedIterator for Parser<'a, F> {}
2172
2173#[cfg(test)]
2174mod test {
2175 use super::*;
2176 use crate::tree::Node;
2177
2178 // TODO: move these tests to tests/html.rs?
2179
2180 fn parser_with_extensions(text: &str) -> Parser<'_> {
2181 let mut opts = Options::empty();
2182 opts.insert(Options::ENABLE_TABLES);
2183 opts.insert(Options::ENABLE_FOOTNOTES);
2184 opts.insert(Options::ENABLE_STRIKETHROUGH);
2185 opts.insert(Options::ENABLE_TASKLISTS);
2186
2187 Parser::new_ext(text, opts)
2188 }
2189
2190 #[test]
2191 #[cfg(target_pointer_width = "64")]
2192 fn node_size() {
2193 let node_size = std::mem::size_of::<Node<Item>>();
2194 assert_eq!(48, node_size);
2195 }
2196
2197 #[test]
2198 #[cfg(target_pointer_width = "64")]
2199 fn body_size() {
2200 let body_size = std::mem::size_of::<ItemBody>();
2201 assert_eq!(16, body_size);
2202 }
2203
2204 #[test]
2205 fn single_open_fish_bracket() {
2206 // dont crash
2207 assert_eq!(3, Parser::new("<").count());
2208 }
2209
2210 #[test]
2211 fn lone_hashtag() {
2212 // dont crash
2213 assert_eq!(2, Parser::new("#").count());
2214 }
2215
2216 #[test]
2217 fn lots_of_backslashes() {
2218 // dont crash
2219 Parser::new("\\\\\r\r").count();
2220 Parser::new("\\\r\r\\.\\\\\r\r\\.\\").count();
2221 }
2222
2223 #[test]
2224 fn issue_320() {
2225 // dont crash
2226 parser_with_extensions(":\r\t> |\r:\r\t> |\r").count();
2227 }
2228
2229 #[test]
2230 fn issue_319() {
2231 // dont crash
2232 parser_with_extensions("|\r-]([^|\r-]([^").count();
2233 parser_with_extensions("|\r\r=][^|\r\r=][^car").count();
2234 }
2235
2236 #[test]
2237 fn issue_303() {
2238 // dont crash
2239 parser_with_extensions("[^\r\ra]").count();
2240 parser_with_extensions("\r\r]Z[^\x00\r\r]Z[^\x00").count();
2241 }
2242
2243 #[test]
2244 fn issue_313() {
2245 // dont crash
2246 parser_with_extensions("*]0[^\r\r*]0[^").count();
2247 parser_with_extensions("[^\r> `][^\r> `][^\r> `][").count();
2248 }
2249
2250 #[test]
2251 fn issue_311() {
2252 // dont crash
2253 parser_with_extensions("\\\u{0d}-\u{09}\\\u{0d}-\u{09}").count();
2254 }
2255
2256 #[test]
2257 fn issue_283() {
2258 let input = std::str::from_utf8(b"\xf0\x9b\xb2\x9f<td:^\xf0\x9b\xb2\x9f").unwrap();
2259 // dont crash
2260 parser_with_extensions(input).count();
2261 }
2262
2263 #[test]
2264 fn issue_289() {
2265 // dont crash
2266 parser_with_extensions("> - \\\n> - ").count();
2267 parser_with_extensions("- \n\n").count();
2268 }
2269
2270 #[test]
2271 fn issue_306() {
2272 // dont crash
2273 parser_with_extensions("*\r_<__*\r_<__*\r_<__*\r_<__").count();
2274 }
2275
2276 #[test]
2277 fn issue_305() {
2278 // dont crash
2279 parser_with_extensions("_6**6*_*").count();
2280 }
2281
2282 #[test]
2283 fn another_emphasis_panic() {
2284 parser_with_extensions("*__#_#__*").count();
2285 }
2286
2287 #[test]
2288 fn offset_iter() {
2289 let event_offsets: Vec<_> = Parser::new("*hello* world")
2290 .into_offset_iter()
2291 .map(|(_ev, range)| range)
2292 .collect();
2293 let expected_offsets = vec![(0..13), (0..7), (1..6), (0..7), (7..13), (0..13)];
2294 assert_eq!(expected_offsets, event_offsets);
2295 }
2296
2297 #[test]
2298 fn reference_link_offsets() {
2299 let range =
2300 Parser::new("# H1\n[testing][Some reference]\n\n[Some reference]: https://github.com")
2301 .into_offset_iter()
2302 .filter_map(|(ev, range)| match ev {
2303 Event::Start(
2304 Tag::Link {
2305 link_type: LinkType::Reference,
2306 ..
2307 },
2308 ..,
2309 ) => Some(range),
2310 _ => None,
2311 })
2312 .next()
2313 .unwrap();
2314 assert_eq!(5..30, range);
2315 }
2316
2317 #[test]
2318 fn footnote_offsets() {
2319 let range = parser_with_extensions("Testing this[^1] out.\n\n[^1]: Footnote.")
2320 .into_offset_iter()
2321 .filter_map(|(ev, range)| match ev {
2322 Event::FootnoteReference(..) => Some(range),
2323 _ => None,
2324 })
2325 .next()
2326 .unwrap();
2327 assert_eq!(12..16, range);
2328 }
2329
2330 #[test]
2331 fn footnote_offsets_exclamation() {
2332 let mut immediately_before_footnote = None;
2333 let range = parser_with_extensions("Testing this![^1] out.\n\n[^1]: Footnote.")
2334 .into_offset_iter()
2335 .filter_map(|(ev, range)| match ev {
2336 Event::FootnoteReference(..) => Some(range),
2337 _ => {
2338 immediately_before_footnote = Some((ev, range));
2339 None
2340 }
2341 })
2342 .next()
2343 .unwrap();
2344 assert_eq!(13..17, range);
2345 if let (Event::Text(exclamation), range_exclamation) =
2346 immediately_before_footnote.as_ref().unwrap()
2347 {
2348 assert_eq!("!", &exclamation[..]);
2349 assert_eq!(&(12..13), range_exclamation);
2350 } else {
2351 panic!("what came first, then? {immediately_before_footnote:?}");
2352 }
2353 }
2354
2355 #[test]
2356 fn table_offset() {
2357 let markdown = "a\n\nTesting|This|Outtt\n--|:--:|--:\nSome Data|Other data|asdf";
2358 let event_offset = parser_with_extensions(markdown)
2359 .into_offset_iter()
2360 .map(|(_ev, range)| range)
2361 .nth(3)
2362 .unwrap();
2363 let expected_offset = 3..59;
2364 assert_eq!(expected_offset, event_offset);
2365 }
2366
2367 #[test]
2368 fn table_cell_span() {
2369 let markdown = "a|b|c\n--|--|--\na| |c";
2370 let event_offset = parser_with_extensions(markdown)
2371 .into_offset_iter()
2372 .filter_map(|(ev, span)| match ev {
2373 Event::Start(Tag::TableCell) => Some(span),
2374 _ => None,
2375 })
2376 .nth(4)
2377 .unwrap();
2378 let expected_offset_start = "a|b|c\n--|--|--\na|".len();
2379 assert_eq!(
2380 expected_offset_start..(expected_offset_start + 2),
2381 event_offset
2382 );
2383 }
2384
2385 #[test]
2386 fn offset_iter_issue_378() {
2387 let event_offsets: Vec<_> = Parser::new("a [b](c) d")
2388 .into_offset_iter()
2389 .map(|(_ev, range)| range)
2390 .collect();
2391 let expected_offsets = vec![(0..10), (0..2), (2..8), (3..4), (2..8), (8..10), (0..10)];
2392 assert_eq!(expected_offsets, event_offsets);
2393 }
2394
2395 #[test]
2396 fn offset_iter_issue_404() {
2397 let event_offsets: Vec<_> = Parser::new("###\n")
2398 .into_offset_iter()
2399 .map(|(_ev, range)| range)
2400 .collect();
2401 let expected_offsets = vec![(0..4), (0..4)];
2402 assert_eq!(expected_offsets, event_offsets);
2403 }
2404
2405 // FIXME: add this one regression suite
2406 #[cfg(feature = "html")]
2407 #[test]
2408 fn link_def_at_eof() {
2409 let test_str = "[My site][world]\n\n[world]: https://vincentprouillet.com";
2410 let expected = "<p><a href=\"https://vincentprouillet.com\">My site</a></p>\n";
2411
2412 let mut buf = String::new();
2413 crate::html::push_html(&mut buf, Parser::new(test_str));
2414 assert_eq!(expected, buf);
2415 }
2416
2417 #[cfg(feature = "html")]
2418 #[test]
2419 fn no_footnote_refs_without_option() {
2420 let test_str = "a [^a]\n\n[^a]: yolo";
2421 let expected = "<p>a <a href=\"yolo\">^a</a></p>\n";
2422
2423 let mut buf = String::new();
2424 crate::html::push_html(&mut buf, Parser::new(test_str));
2425 assert_eq!(expected, buf);
2426 }
2427
2428 #[cfg(feature = "html")]
2429 #[test]
2430 fn ref_def_at_eof() {
2431 let test_str = "[test]:\\";
2432 let expected = "";
2433
2434 let mut buf = String::new();
2435 crate::html::push_html(&mut buf, Parser::new(test_str));
2436 assert_eq!(expected, buf);
2437 }
2438
2439 #[cfg(feature = "html")]
2440 #[test]
2441 fn ref_def_cr_lf() {
2442 let test_str = "[a]: /u\r\n\n[a]";
2443 let expected = "<p><a href=\"/u\">a</a></p>\n";
2444
2445 let mut buf = String::new();
2446 crate::html::push_html(&mut buf, Parser::new(test_str));
2447 assert_eq!(expected, buf);
2448 }
2449
2450 #[cfg(feature = "html")]
2451 #[test]
2452 fn no_dest_refdef() {
2453 let test_str = "[a]:";
2454 let expected = "<p>[a]:</p>\n";
2455
2456 let mut buf = String::new();
2457 crate::html::push_html(&mut buf, Parser::new(test_str));
2458 assert_eq!(expected, buf);
2459 }
2460
2461 #[test]
2462 fn broken_links_called_only_once() {
2463 for &(markdown, expected) in &[
2464 ("See also [`g()`][crate::g].", 1),
2465 ("See also [`g()`][crate::g][].", 1),
2466 ("[brokenlink1] some other node [brokenlink2]", 2),
2467 ] {
2468 let mut times_called = 0;
2469 let callback = &mut |_broken_link: BrokenLink| {
2470 times_called += 1;
2471 None
2472 };
2473 let parser =
2474 Parser::new_with_broken_link_callback(markdown, Options::empty(), Some(callback));
2475 for _ in parser {}
2476 assert_eq!(times_called, expected);
2477 }
2478 }
2479
2480 #[test]
2481 fn simple_broken_link_callback() {
2482 let test_str = "This is a link w/o def: [hello][world]";
2483 let mut callback = |broken_link: BrokenLink| {
2484 assert_eq!("world", broken_link.reference.as_ref());
2485 assert_eq!(&test_str[broken_link.span], "[hello][world]");
2486 let url = "YOLO".into();
2487 let title = "SWAG".to_owned().into();
2488 Some((url, title))
2489 };
2490 let parser =
2491 Parser::new_with_broken_link_callback(test_str, Options::empty(), Some(&mut callback));
2492 let mut link_tag_count = 0;
2493 for (typ, url, title, id) in parser.filter_map(|event| match event {
2494 Event::Start(tag) => match tag {
2495 Tag::Link {
2496 link_type,
2497 dest_url,
2498 title,
2499 id,
2500 } => Some((link_type, dest_url, title, id)),
2501 _ => None,
2502 },
2503 _ => None,
2504 }) {
2505 link_tag_count += 1;
2506 assert_eq!(typ, LinkType::ReferenceUnknown);
2507 assert_eq!(url.as_ref(), "YOLO");
2508 assert_eq!(title.as_ref(), "SWAG");
2509 assert_eq!(id.as_ref(), "world");
2510 }
2511 assert!(link_tag_count > 0);
2512 }
2513
2514 #[test]
2515 fn code_block_kind_check_fenced() {
2516 let parser = Parser::new("hello\n```test\ntadam\n```");
2517 let mut found = 0;
2518 for (ev, _range) in parser.into_offset_iter() {
2519 match ev {
2520 Event::Start(Tag::CodeBlock(CodeBlockKind::Fenced(syntax))) => {
2521 assert_eq!(syntax.as_ref(), "test");
2522 found += 1;
2523 }
2524 _ => {}
2525 }
2526 }
2527 assert_eq!(found, 1);
2528 }
2529
2530 #[test]
2531 fn code_block_kind_check_indented() {
2532 let parser = Parser::new("hello\n\n ```test\n tadam\nhello");
2533 let mut found = 0;
2534 for (ev, _range) in parser.into_offset_iter() {
2535 match ev {
2536 Event::Start(Tag::CodeBlock(CodeBlockKind::Indented)) => {
2537 found += 1;
2538 }
2539 _ => {}
2540 }
2541 }
2542 assert_eq!(found, 1);
2543 }
2544
2545 #[test]
2546 fn ref_defs() {
2547 let input = r###"[a B c]: http://example.com
2548[another]: https://google.com
2549
2550text
2551
2552[final ONE]: http://wikipedia.org
2553"###;
2554 let mut parser = Parser::new(input);
2555
2556 assert!(parser.reference_definitions().get("a b c").is_some());
2557 assert!(parser.reference_definitions().get("nope").is_none());
2558
2559 if let Some(_event) = parser.next() {
2560 // testing keys with shorter lifetimes than parser and its input
2561 let s = "final one".to_owned();
2562 let link_def = parser.reference_definitions().get(&s).unwrap();
2563 let span = &input[link_def.span.clone()];
2564 assert_eq!(span, "[final ONE]: http://wikipedia.org");
2565 }
2566 }
2567
2568 #[test]
2569 fn common_lifetime_patterns_allowed<'b>() {
2570 let temporary_str = String::from("xyz");
2571
2572 // NOTE: this is a limitation of Rust, it doesn't allow putting lifetime parameters on the closure itself.
2573 // Hack it by attaching the lifetime to the test function instead.
2574 // TODO: why is the `'b` lifetime required at all? Changing it to `'_` breaks things :(
2575 let mut closure = |link: BrokenLink<'b>| Some(("#".into(), link.reference));
2576
2577 fn function(link: BrokenLink<'_>) -> Option<(CowStr<'_>, CowStr<'_>)> {
2578 Some(("#".into(), link.reference))
2579 }
2580
2581 for _ in Parser::new_with_broken_link_callback(
2582 "static lifetime",
2583 Options::empty(),
2584 Some(&mut closure),
2585 ) {}
2586 /* This fails to compile. Because the closure can't say `for <'a> fn(BrokenLink<'a>) ->
2587 * CowStr<'a>` and has to use the enclosing `'b` lifetime parameter, `temporary_str` lives
2588 * shorter than `'b`. I think this is unlikely to occur in real life, and if it does, the
2589 * fix is simple: move it out to a function that allows annotating the lifetimes.
2590 */
2591 //for _ in Parser::new_with_broken_link_callback(&temporary_str, Options::empty(), Some(&mut callback)) {
2592 //}
2593
2594 for _ in Parser::new_with_broken_link_callback(
2595 "static lifetime",
2596 Options::empty(),
2597 Some(&mut function),
2598 ) {}
2599 for _ in Parser::new_with_broken_link_callback(
2600 &temporary_str,
2601 Options::empty(),
2602 Some(&mut function),
2603 ) {}
2604 }
2605}
2606