1// Copyright 2015 Google Inc. All rights reserved.
2//
3// Permission is hereby granted, free of charge, to any person obtaining a copy
4// of this software and associated documentation files (the "Software"), to deal
5// in the Software without restriction, including without limitation the rights
6// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7// copies of the Software, and to permit persons to whom the Software is
8// furnished to do so, subject to the following conditions:
9//
10// The above copyright notice and this permission notice shall be included in
11// all copies or substantial portions of the Software.
12//
13// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19// THE SOFTWARE.
20
//! Pull parser for [CommonMark](https://commonmark.org). This crate provides a [Parser](struct.Parser.html) struct
//! which is an iterator over [Event](enum.Event.html)s. This iterator can be used
//! directly, or to output HTML using the [HTML module](html/index.html).
//!
//! By default, only CommonMark features are enabled. To use extensions like tables,
//! footnotes or task lists, enable them by setting the corresponding flags in the
//! [Options](struct.Options.html) struct.
//!
//! # Example
//! ```rust
//! use pulldown_cmark::{Parser, Options};
//!
//! let markdown_input = "Hello world, this is a ~~complicated~~ *very simple* example.";
//!
//! // Set up options and parser. Strikethroughs are not part of the CommonMark standard
//! // and we therefore must enable them explicitly.
//! let mut options = Options::empty();
//! options.insert(Options::ENABLE_STRIKETHROUGH);
//! let parser = Parser::new_ext(markdown_input, options);
//!
//! # #[cfg(feature = "html")] {
//! // Write to String buffer.
//! let mut html_output = String::new();
//! pulldown_cmark::html::push_html(&mut html_output, parser);
//!
//! // Check that the output is what we expected.
//! let expected_html = "<p>Hello world, this is a <del>complicated</del> <em>very simple</em> example.</p>\n";
//! assert_eq!(expected_html, &html_output);
//! # }
//! ```
//!
//! Note that consecutive text events can happen due to the manner in which the
//! parser evaluates the source. A utility `TextMergeStream` exists to make
//! iterating over the events more comfortable:
//!
//! ```rust
//! use pulldown_cmark::{Event, Parser, TextMergeStream};
//!
//! let markdown_input = "Hello world, this is a ~~complicated~~ *very simple* example.";
//!
//! let iterator = TextMergeStream::new(Parser::new(markdown_input));
//!
//! for event in iterator {
//!     match event {
//!         Event::Text(text) => println!("{}", text),
//!         _ => {}
//!     }
//! }
//! ```
//!
71
72// When compiled for the rustc compiler itself we want to make sure that this is
73// an unstable crate.
74#![cfg_attr(rustbuild, feature(staged_api, rustc_private))]
75#![cfg_attr(rustbuild, unstable(feature = "rustc_private", issue = "27812"))]
76// Forbid unsafe code unless the SIMD feature is enabled.
77#![cfg_attr(not(feature = "simd"), forbid(unsafe_code))]
78#![warn(missing_debug_implementations)]
79
80#[cfg(feature = "serde")]
81use serde::{Deserialize, Serialize};
82
83#[cfg(feature = "html")]
84pub mod html;
85
86pub mod utils;
87
88mod entities;
89mod firstpass;
90mod linklabel;
91mod parse;
92mod puncttable;
93mod scanners;
94mod strings;
95mod tree;
96
97use std::fmt::Display;
98
99pub use crate::parse::{
100 BrokenLink, BrokenLinkCallback, DefaultBrokenLinkCallback, OffsetIter, Parser, RefDefs,
101};
102pub use crate::strings::{CowStr, InlineStr};
103pub use crate::utils::*;
104
105/// Codeblock kind.
106#[derive(Clone, Debug, PartialEq)]
107#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
108pub enum CodeBlockKind<'a> {
109 Indented,
110 /// The value contained in the tag describes the language of the code, which may be empty.
111 #[cfg_attr(feature = "serde", serde(borrow))]
112 Fenced(CowStr<'a>),
113}
114
115impl<'a> CodeBlockKind<'a> {
116 pub fn is_indented(&self) -> bool {
117 matches!(*self, CodeBlockKind::Indented)
118 }
119
120 pub fn is_fenced(&self) -> bool {
121 matches!(*self, CodeBlockKind::Fenced(_))
122 }
123}
124
/// BlockQuote kind (Note, Tip, Important, Warning, Caution).
///
/// The derived ordering follows the declaration order of the variants.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub enum BlockQuoteKind {
    /// A `[!NOTE]` block quote.
    Note,
    /// A `[!TIP]` block quote.
    Tip,
    /// A `[!IMPORTANT]` block quote.
    Important,
    /// A `[!WARNING]` block quote.
    Warning,
    /// A `[!CAUTION]` block quote.
    Caution,
}
135
/// Metadata block kind.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub enum MetadataBlockKind {
    /// A YAML-style metadata block.
    YamlStyle,
    /// A metadata block delimited by pluses.
    PlusesStyle,
}
143
144/// Tags for elements that can contain other elements.
145#[derive(Clone, Debug, PartialEq)]
146#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
147pub enum Tag<'a> {
148 /// A paragraph of text and other inline elements.
149 Paragraph,
150
151 /// A heading, with optional identifier, classes and custom attributes.
152 /// The identifier is prefixed with `#` and the last one in the attributes
153 /// list is chosen, classes are prefixed with `.` and custom attributes
154 /// have no prefix and can optionally have a value (`myattr` or `myattr=myvalue`).
155 Heading {
156 level: HeadingLevel,
157 id: Option<CowStr<'a>>,
158 classes: Vec<CowStr<'a>>,
159 /// The first item of the tuple is the attr and second one the value.
160 attrs: Vec<(CowStr<'a>, Option<CowStr<'a>>)>,
161 },
162
163 BlockQuote(Option<BlockQuoteKind>),
164 /// A code block.
165 CodeBlock(CodeBlockKind<'a>),
166
167 /// A HTML block.
168 HtmlBlock,
169
170 /// A list. If the list is ordered the field indicates the number of the first item.
171 /// Contains only list items.
172 List(Option<u64>), // TODO: add delim and tight for ast (not needed for html)
173 /// A list item.
174 Item,
175 /// A footnote definition. The value contained is the footnote's label by which it can
176 /// be referred to.
177 #[cfg_attr(feature = "serde", serde(borrow))]
178 FootnoteDefinition(CowStr<'a>),
179
180 /// A table. Contains a vector describing the text-alignment for each of its columns.
181 Table(Vec<Alignment>),
182 /// A table header. Contains only `TableCell`s. Note that the table body starts immediately
183 /// after the closure of the `TableHead` tag. There is no `TableBody` tag.
184 TableHead,
185 /// A table row. Is used both for header rows as body rows. Contains only `TableCell`s.
186 TableRow,
187 TableCell,
188
189 // span-level tags
190 Emphasis,
191 Strong,
192 Strikethrough,
193
194 /// A link.
195 Link {
196 link_type: LinkType,
197 dest_url: CowStr<'a>,
198 title: CowStr<'a>,
199 /// Identifier of reference links, e.g. `world` in the link `[hello][world]`.
200 id: CowStr<'a>,
201 },
202
203 /// An image. The first field is the link type, the second the destination URL and the third is a title,
204 /// the fourth is the link identifier.
205 Image {
206 link_type: LinkType,
207 dest_url: CowStr<'a>,
208 title: CowStr<'a>,
209 /// Identifier of reference links, e.g. `world` in the link `[hello][world]`.
210 id: CowStr<'a>,
211 },
212
213 /// A metadata block.
214 MetadataBlock(MetadataBlockKind),
215}
216
217impl<'a> Tag<'a> {
218 pub fn to_end(&self) -> TagEnd {
219 match self {
220 Tag::Paragraph => TagEnd::Paragraph,
221 Tag::Heading { level: &HeadingLevel, .. } => TagEnd::Heading(*level),
222 Tag::BlockQuote(_) => TagEnd::BlockQuote,
223 Tag::CodeBlock(_) => TagEnd::CodeBlock,
224 Tag::HtmlBlock => TagEnd::HtmlBlock,
225 Tag::List(number: &Option) => TagEnd::List(number.is_some()),
226 Tag::Item => TagEnd::Item,
227 Tag::FootnoteDefinition(_) => TagEnd::FootnoteDefinition,
228 Tag::Table(_) => TagEnd::Table,
229 Tag::TableHead => TagEnd::TableHead,
230 Tag::TableRow => TagEnd::TableRow,
231 Tag::TableCell => TagEnd::TableCell,
232 Tag::Emphasis => TagEnd::Emphasis,
233 Tag::Strong => TagEnd::Strong,
234 Tag::Strikethrough => TagEnd::Strikethrough,
235 Tag::Link { .. } => TagEnd::Link,
236 Tag::Image { .. } => TagEnd::Image,
237 Tag::MetadataBlock(kind: &MetadataBlockKind) => TagEnd::MetadataBlock(*kind),
238 }
239 }
240}
241
242/// The end of a `Tag`.
243#[derive(Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Debug)]
244#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
245pub enum TagEnd {
246 Paragraph,
247 Heading(HeadingLevel),
248
249 BlockQuote,
250 CodeBlock,
251
252 HtmlBlock,
253
254 /// A list, `true` for ordered lists.
255 List(bool),
256 Item,
257 FootnoteDefinition,
258
259 Table,
260 TableHead,
261 TableRow,
262 TableCell,
263
264 Emphasis,
265 Strong,
266 Strikethrough,
267
268 Link,
269 Image,
270
271 MetadataBlock(MetadataBlockKind),
272}
273
274/// Make sure `TagEnd` is no more than two bytes in size.
275/// This is why it's used instead of just using `Tag`.
276#[cfg(target_pointer_width = "64")]
277const _STATIC_ASSERT_TAG_END_SIZE: [(); 2] = [(); std::mem::size_of::<TagEnd>()];
278
279impl<'a> From<Tag<'a>> for TagEnd {
280 fn from(value: Tag) -> Self {
281 value.to_end()
282 }
283}
284
/// The level of a heading, from `H1` to `H6`.
///
/// The discriminants are the numeric levels, so `HeadingLevel::H3 as usize == 3`.
#[derive(Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Debug)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub enum HeadingLevel {
    H1 = 1,
    H2,
    H3,
    H4,
    H5,
    H6,
}

/// Formats the level as its lowercase name, `h1` through `h6`.
impl Display for HeadingLevel {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let name = match self {
            Self::H1 => "h1",
            Self::H2 => "h2",
            Self::H3 => "h3",
            Self::H4 => "h4",
            Self::H5 => "h5",
            Self::H6 => "h6",
        };
        f.write_str(name)
    }
}

/// Returned when trying to convert a `usize` into a `HeadingLevel` but it fails
/// because the usize isn't a valid heading level. The rejected value is stored
/// in the error.
#[derive(Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Debug)]
pub struct InvalidHeadingLevel(usize);

impl TryFrom<usize> for HeadingLevel {
    type Error = InvalidHeadingLevel;

    /// Maps `1..=6` to `H1..=H6`; any other value yields
    /// `Err(InvalidHeadingLevel(value))`.
    fn try_from(value: usize) -> Result<Self, Self::Error> {
        match value {
            1 => Ok(Self::H1),
            2 => Ok(Self::H2),
            3 => Ok(Self::H3),
            4 => Ok(Self::H4),
            5 => Ok(Self::H5),
            6 => Ok(Self::H6),
            _ => Err(InvalidHeadingLevel(value)),
        }
    }
}
329
/// Type specifier for inline links. See [the Tag::Link](enum.Tag.html#variant.Link) for more information.
#[derive(Clone, Debug, PartialEq, Copy)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub enum LinkType {
    /// Inline link like `[foo](bar)`
    Inline,
    /// Reference link like `[foo][bar]`
    Reference,
    /// Reference without destination in the document, but resolved by the broken_link_callback
    ReferenceUnknown,
    /// Collapsed link like `[foo][]`
    Collapsed,
    /// Collapsed link without destination in the document, but resolved by the broken_link_callback
    CollapsedUnknown,
    /// Shortcut link like `[foo]`
    Shortcut,
    /// Shortcut without destination in the document, but resolved by the broken_link_callback
    ShortcutUnknown,
    /// Autolink like `<http://foo.bar/baz>`
    Autolink,
    /// Email address in autolink like `<john@example.org>`
    Email,
}

impl LinkType {
    /// Map the link type to an equivalent _Unknown link type.
    ///
    /// # Panics
    ///
    /// Panics for every variant other than `Reference`, `Collapsed` and
    /// `Shortcut`, since only those have an `*Unknown` counterpart.
    fn to_unknown(self) -> Self {
        match self {
            LinkType::Reference => LinkType::ReferenceUnknown,
            LinkType::Collapsed => LinkType::CollapsedUnknown,
            LinkType::Shortcut => LinkType::ShortcutUnknown,
            // Name the offending variant so a violated invariant is easy to diagnose.
            other => unreachable!("{:?} has no unknown counterpart", other),
        }
    }
}
365
366/// Markdown events that are generated in a preorder traversal of the document
367/// tree, with additional `End` events whenever all of an inner node's children
368/// have been visited.
369#[derive(Clone, Debug, PartialEq)]
370#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
371pub enum Event<'a> {
372 /// Start of a tagged element. Events that are yielded after this event
373 /// and before its corresponding `End` event are inside this element.
374 /// Start and end events are guaranteed to be balanced.
375 #[cfg_attr(feature = "serde", serde(borrow))]
376 Start(Tag<'a>),
377 /// End of a tagged element.
378 End(TagEnd),
379 /// A text node.
380 #[cfg_attr(feature = "serde", serde(borrow))]
381 Text(CowStr<'a>),
382 /// An inline code node.
383 #[cfg_attr(feature = "serde", serde(borrow))]
384 Code(CowStr<'a>),
385 /// An inline math environment node.
386 #[cfg_attr(feature = "serde", serde(borrow))]
387 InlineMath(CowStr<'a>),
388 /// A display math environment node.
389 #[cfg_attr(feature = "serde", serde(borrow))]
390 DisplayMath(CowStr<'a>),
391 /// An HTML node.
392 #[cfg_attr(feature = "serde", serde(borrow))]
393 Html(CowStr<'a>),
394 /// An inline HTML node.
395 #[cfg_attr(feature = "serde", serde(borrow))]
396 InlineHtml(CowStr<'a>),
397 /// A reference to a footnote with given label, which may or may not be defined
398 /// by an event with a `Tag::FootnoteDefinition` tag. Definitions and references to them may
399 /// occur in any order.
400 #[cfg_attr(feature = "serde", serde(borrow))]
401 FootnoteReference(CowStr<'a>),
402 /// A soft line break.
403 SoftBreak,
404 /// A hard line break.
405 HardBreak,
406 /// A horizontal ruler.
407 Rule,
408 /// A task list marker, rendered as a checkbox in HTML. Contains a true when it is checked.
409 TaskListMarker(bool),
410}
411
/// Table column text alignment.
#[derive(Copy, Clone, Debug, PartialEq)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub enum Alignment {
    /// Default text alignment.
    None,
    /// Left-aligned column.
    Left,
    /// Centered column.
    Center,
    /// Right-aligned column.
    Right,
}
423
424bitflags::bitflags! {
425 /// Option struct containing flags for enabling extra features
426 /// that are not part of the CommonMark spec.
427 #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
428 pub struct Options: u32 {
429 const ENABLE_TABLES = 1 << 1;
430 /// GitHub-compatible footnote syntax.
431 ///
432 /// Footnotes are referenced with the syntax `[^IDENT]`,
433 /// and defined with an identifier followed by a colon at top level.
434 ///
435 /// ---
436 ///
437 /// ```markdown
438 /// Footnote referenced [^1].
439 ///
440 /// [^1]: footnote defined
441 /// ```
442 ///
443 /// Footnote referenced [^1].
444 ///
445 /// [^1]: footnote defined
446 const ENABLE_FOOTNOTES = 1 << 2;
447 const ENABLE_STRIKETHROUGH = 1 << 3;
448 const ENABLE_TASKLISTS = 1 << 4;
449 const ENABLE_SMART_PUNCTUATION = 1 << 5;
450 /// Extension to allow headings to have ID and classes.
451 ///
452 /// `# text { #id .class1 .class2 myattr other_attr=myvalue }`
453 /// is interpreted as a level 1 heading
454 /// with the content `text`, ID `id`, classes `class1` and `class2` and
455 /// custom attributes `myattr` (without value) and
456 /// `other_attr` with value `myvalue`.
457 /// Note that ID, classes, and custom attributes should be space-separated.
458 const ENABLE_HEADING_ATTRIBUTES = 1 << 6;
459 /// Metadata blocks in YAML style, i.e.:
460 /// - starting with a `---` line
461 /// - ending with a `---` or `...` line
462 const ENABLE_YAML_STYLE_METADATA_BLOCKS = 1 << 7;
463 /// Metadata blocks delimited by:
464 /// - `+++` line at start
465 /// - `+++` line at end
466 const ENABLE_PLUSES_DELIMITED_METADATA_BLOCKS = 1 << 8;
467 /// Older footnote syntax. This flag implies `ENABLE_FOOTNOTES`, changing it to use an
468 /// older syntax instead of the new, default, GitHub-compatible syntax.
469 ///
470 /// New syntax is different from the old syntax regarding
471 /// indentation, nesting, and footnote references with no definition:
472 ///
473 /// ```markdown
474 /// [^1]: In new syntax, this is two footnote definitions.
475 /// [^2]: In old syntax, this is a single footnote definition with two lines.
476 ///
477 /// [^3]:
478 ///
479 /// In new syntax, this is a footnote with two paragraphs.
480 ///
481 /// In old syntax, this is a footnote followed by a code block.
482 ///
483 /// In new syntax, this undefined footnote definition renders as
484 /// literal text [^4]. In old syntax, it creates a dangling link.
485 /// ```
486 const ENABLE_OLD_FOOTNOTES = (1 << 9) | (1 << 2);
487 /// With this feature enabled, two events `Event::InlineMath` and `Event::DisplayMath`
488 /// are emitted that conventionally contain TeX formulas.
489 const ENABLE_MATH = 1 << 10;
490 /// Misc GitHub Flavored Markdown features not supported in CommonMark.
491 /// The following features are currently behind this tag:
492 /// - Blockquote tags ([!NOTE], [!TIP], [!IMPORTANT], [!WARNING], [!CAUTION]).
493 const ENABLE_GFM = 1 << 11;
494 }
495}
496
497impl Options {
498 pub(crate) fn has_gfm_footnotes(&self) -> bool {
499 self.contains(Options::ENABLE_FOOTNOTES) && !self.contains(Options::ENABLE_OLD_FOOTNOTES)
500 }
501}
502