1 | // Copyright 2015 Google Inc. All rights reserved. |
2 | // |
3 | // Permission is hereby granted, free of charge, to any person obtaining a copy |
4 | // of this software and associated documentation files (the "Software"), to deal |
5 | // in the Software without restriction, including without limitation the rights |
6 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
7 | // copies of the Software, and to permit persons to whom the Software is |
8 | // furnished to do so, subject to the following conditions: |
9 | // |
10 | // The above copyright notice and this permission notice shall be included in |
11 | // all copies or substantial portions of the Software. |
12 | // |
13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
18 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN |
19 | // THE SOFTWARE. |
20 | |
21 | //! Pull parser for [CommonMark](https://commonmark.org). This crate provides a [Parser](struct.Parser.html) struct |
22 | //! which is an iterator over [Event](enum.Event.html)s. This iterator can be used |
23 | //! directly, or to output HTML using the [HTML module](html/index.html). |
24 | //! |
25 | //! By default, only CommonMark features are enabled. To use extensions like tables, |
26 | //! footnotes or task lists, enable them by setting the corresponding flags in the |
27 | //! [Options](struct.Options.html) struct. |
28 | //! |
29 | //! # Example |
30 | //! ```rust |
31 | //! use pulldown_cmark::{Parser, Options}; |
32 | //! |
33 | //! let markdown_input = "Hello world, this is a ~~complicated~~ *very simple* example." ; |
34 | //! |
35 | //! // Set up options and parser. Strikethroughs are not part of the CommonMark standard |
36 | //! // and we therefore must enable it explicitly. |
37 | //! let mut options = Options::empty(); |
38 | //! options.insert(Options::ENABLE_STRIKETHROUGH); |
39 | //! let parser = Parser::new_ext(markdown_input, options); |
40 | //! |
41 | //! # #[cfg (feature = "html" )] { |
42 | //! // Write to String buffer. |
43 | //! let mut html_output = String::new(); |
44 | //! pulldown_cmark::html::push_html(&mut html_output, parser); |
45 | //! |
46 | //! // Check that the output is what we expected. |
//! let expected_html = "<p>Hello world, this is a <del>complicated</del> <em>very simple</em> example.</p>\n";
48 | //! assert_eq!(expected_html, &html_output); |
49 | //! # } |
50 | //! ``` |
51 | //! |
52 | //! Note that consecutive text events can happen due to the manner in which the |
53 | //! parser evaluates the source. A utility `TextMergeStream` exists to improve |
54 | //! the comfort of iterating the events: |
55 | //! |
56 | //! ```rust |
57 | //! use pulldown_cmark::{Event, Parser, TextMergeStream}; |
58 | //! |
59 | //! let markdown_input = "Hello world, this is a ~~complicated~~ *very simple* example." ; |
60 | //! |
61 | //! let iterator = TextMergeStream::new(Parser::new(markdown_input)); |
62 | //! |
63 | //! for event in iterator { |
64 | //! match event { |
65 | //! Event::Text(text) => println!("{}" , text), |
66 | //! _ => {} |
67 | //! } |
68 | //! } |
69 | //! ``` |
70 | //! |
71 | |
72 | // When compiled for the rustc compiler itself we want to make sure that this is |
73 | // an unstable crate. |
74 | #![cfg_attr (rustbuild, feature(staged_api, rustc_private))] |
75 | #![cfg_attr (rustbuild, unstable(feature = "rustc_private" , issue = "27812" ))] |
76 | // Forbid unsafe code unless the SIMD feature is enabled. |
77 | #![cfg_attr (not(feature = "simd" ), forbid(unsafe_code))] |
78 | #![warn (missing_debug_implementations)] |
79 | |
80 | #[cfg (feature = "serde" )] |
81 | use serde::{Deserialize, Serialize}; |
82 | |
83 | #[cfg (feature = "html" )] |
84 | pub mod html; |
85 | |
86 | pub mod utils; |
87 | |
88 | mod entities; |
89 | mod firstpass; |
90 | mod linklabel; |
91 | mod parse; |
92 | mod puncttable; |
93 | mod scanners; |
94 | mod strings; |
95 | mod tree; |
96 | |
97 | use std::fmt::Display; |
98 | |
99 | pub use crate::parse::{ |
100 | BrokenLink, BrokenLinkCallback, DefaultBrokenLinkCallback, OffsetIter, Parser, RefDefs, |
101 | }; |
102 | pub use crate::strings::{CowStr, InlineStr}; |
103 | pub use crate::utils::*; |
104 | |
105 | /// Codeblock kind. |
106 | #[derive (Clone, Debug, PartialEq)] |
107 | #[cfg_attr (feature = "serde" , derive(Serialize, Deserialize))] |
108 | pub enum CodeBlockKind<'a> { |
109 | Indented, |
110 | /// The value contained in the tag describes the language of the code, which may be empty. |
111 | #[cfg_attr (feature = "serde" , serde(borrow))] |
112 | Fenced(CowStr<'a>), |
113 | } |
114 | |
115 | impl<'a> CodeBlockKind<'a> { |
116 | pub fn is_indented(&self) -> bool { |
117 | matches!(*self, CodeBlockKind::Indented) |
118 | } |
119 | |
120 | pub fn is_fenced(&self) -> bool { |
121 | matches!(*self, CodeBlockKind::Fenced(_)) |
122 | } |
123 | } |
124 | |
/// The kind of a tagged block quote: one of Note, Tip, Important, Warning
/// or Caution.
#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub enum BlockQuoteKind {
    /// The `[!NOTE]` kind.
    Note,
    /// The `[!TIP]` kind.
    Tip,
    /// The `[!IMPORTANT]` kind.
    Important,
    /// The `[!WARNING]` kind.
    Warning,
    /// The `[!CAUTION]` kind.
    Caution,
}
135 | |
/// The kind of a metadata block.
#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub enum MetadataBlockKind {
    /// YAML-style metadata block
    /// (see `Options::ENABLE_YAML_STYLE_METADATA_BLOCKS`).
    YamlStyle,
    /// Pluses-delimited metadata block
    /// (see `Options::ENABLE_PLUSES_DELIMITED_METADATA_BLOCKS`).
    PlusesStyle,
}
143 | |
144 | /// Tags for elements that can contain other elements. |
145 | #[derive (Clone, Debug, PartialEq)] |
146 | #[cfg_attr (feature = "serde" , derive(Serialize, Deserialize))] |
147 | pub enum Tag<'a> { |
148 | /// A paragraph of text and other inline elements. |
149 | Paragraph, |
150 | |
151 | /// A heading, with optional identifier, classes and custom attributes. |
152 | /// The identifier is prefixed with `#` and the last one in the attributes |
153 | /// list is chosen, classes are prefixed with `.` and custom attributes |
154 | /// have no prefix and can optionally have a value (`myattr` or `myattr=myvalue`). |
155 | Heading { |
156 | level: HeadingLevel, |
157 | id: Option<CowStr<'a>>, |
158 | classes: Vec<CowStr<'a>>, |
159 | /// The first item of the tuple is the attr and second one the value. |
160 | attrs: Vec<(CowStr<'a>, Option<CowStr<'a>>)>, |
161 | }, |
162 | |
163 | BlockQuote(Option<BlockQuoteKind>), |
164 | /// A code block. |
165 | CodeBlock(CodeBlockKind<'a>), |
166 | |
167 | /// A HTML block. |
168 | HtmlBlock, |
169 | |
170 | /// A list. If the list is ordered the field indicates the number of the first item. |
171 | /// Contains only list items. |
172 | List(Option<u64>), // TODO: add delim and tight for ast (not needed for html) |
173 | /// A list item. |
174 | Item, |
175 | /// A footnote definition. The value contained is the footnote's label by which it can |
176 | /// be referred to. |
177 | #[cfg_attr (feature = "serde" , serde(borrow))] |
178 | FootnoteDefinition(CowStr<'a>), |
179 | |
180 | /// A table. Contains a vector describing the text-alignment for each of its columns. |
181 | Table(Vec<Alignment>), |
182 | /// A table header. Contains only `TableCell`s. Note that the table body starts immediately |
183 | /// after the closure of the `TableHead` tag. There is no `TableBody` tag. |
184 | TableHead, |
185 | /// A table row. Is used both for header rows as body rows. Contains only `TableCell`s. |
186 | TableRow, |
187 | TableCell, |
188 | |
189 | // span-level tags |
190 | Emphasis, |
191 | Strong, |
192 | Strikethrough, |
193 | |
194 | /// A link. |
195 | Link { |
196 | link_type: LinkType, |
197 | dest_url: CowStr<'a>, |
198 | title: CowStr<'a>, |
199 | /// Identifier of reference links, e.g. `world` in the link `[hello][world]`. |
200 | id: CowStr<'a>, |
201 | }, |
202 | |
203 | /// An image. The first field is the link type, the second the destination URL and the third is a title, |
204 | /// the fourth is the link identifier. |
205 | Image { |
206 | link_type: LinkType, |
207 | dest_url: CowStr<'a>, |
208 | title: CowStr<'a>, |
209 | /// Identifier of reference links, e.g. `world` in the link `[hello][world]`. |
210 | id: CowStr<'a>, |
211 | }, |
212 | |
213 | /// A metadata block. |
214 | MetadataBlock(MetadataBlockKind), |
215 | } |
216 | |
217 | impl<'a> Tag<'a> { |
218 | pub fn to_end(&self) -> TagEnd { |
219 | match self { |
220 | Tag::Paragraph => TagEnd::Paragraph, |
221 | Tag::Heading { level: &HeadingLevel, .. } => TagEnd::Heading(*level), |
222 | Tag::BlockQuote(_) => TagEnd::BlockQuote, |
223 | Tag::CodeBlock(_) => TagEnd::CodeBlock, |
224 | Tag::HtmlBlock => TagEnd::HtmlBlock, |
225 | Tag::List(number: &Option) => TagEnd::List(number.is_some()), |
226 | Tag::Item => TagEnd::Item, |
227 | Tag::FootnoteDefinition(_) => TagEnd::FootnoteDefinition, |
228 | Tag::Table(_) => TagEnd::Table, |
229 | Tag::TableHead => TagEnd::TableHead, |
230 | Tag::TableRow => TagEnd::TableRow, |
231 | Tag::TableCell => TagEnd::TableCell, |
232 | Tag::Emphasis => TagEnd::Emphasis, |
233 | Tag::Strong => TagEnd::Strong, |
234 | Tag::Strikethrough => TagEnd::Strikethrough, |
235 | Tag::Link { .. } => TagEnd::Link, |
236 | Tag::Image { .. } => TagEnd::Image, |
237 | Tag::MetadataBlock(kind: &MetadataBlockKind) => TagEnd::MetadataBlock(*kind), |
238 | } |
239 | } |
240 | } |
241 | |
242 | /// The end of a `Tag`. |
243 | #[derive (Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Debug)] |
244 | #[cfg_attr (feature = "serde" , derive(Serialize, Deserialize))] |
245 | pub enum TagEnd { |
246 | Paragraph, |
247 | Heading(HeadingLevel), |
248 | |
249 | BlockQuote, |
250 | CodeBlock, |
251 | |
252 | HtmlBlock, |
253 | |
254 | /// A list, `true` for ordered lists. |
255 | List(bool), |
256 | Item, |
257 | FootnoteDefinition, |
258 | |
259 | Table, |
260 | TableHead, |
261 | TableRow, |
262 | TableCell, |
263 | |
264 | Emphasis, |
265 | Strong, |
266 | Strikethrough, |
267 | |
268 | Link, |
269 | Image, |
270 | |
271 | MetadataBlock(MetadataBlockKind), |
272 | } |
273 | |
274 | /// Make sure `TagEnd` is no more than two bytes in size. |
275 | /// This is why it's used instead of just using `Tag`. |
276 | #[cfg (target_pointer_width = "64" )] |
277 | const _STATIC_ASSERT_TAG_END_SIZE: [(); 2] = [(); std::mem::size_of::<TagEnd>()]; |
278 | |
279 | impl<'a> From<Tag<'a>> for TagEnd { |
280 | fn from(value: Tag) -> Self { |
281 | value.to_end() |
282 | } |
283 | } |
284 | |
/// A heading level from `H1` to `H6`.
///
/// The discriminants are the numeric levels: `H1` is 1 and each following
/// variant is one higher, up to `H6` at 6.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub enum HeadingLevel {
    /// Level 1.
    H1 = 1,
    /// Level 2.
    H2,
    /// Level 3.
    H3,
    /// Level 4.
    H4,
    /// Level 5.
    H5,
    /// Level 6.
    H6,
}
295 | |
296 | impl Display for HeadingLevel { |
297 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { |
298 | match self { |
299 | Self::H1 => write!(f, "h1" ), |
300 | Self::H2 => write!(f, "h2" ), |
301 | Self::H3 => write!(f, "h3" ), |
302 | Self::H4 => write!(f, "h4" ), |
303 | Self::H5 => write!(f, "h5" ), |
304 | Self::H6 => write!(f, "h6" ), |
305 | } |
306 | } |
307 | } |
308 | |
/// Error returned when converting a `usize` into a `HeadingLevel` fails
/// because the value isn't a valid heading level. Wraps the rejected value.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct InvalidHeadingLevel(usize);
313 | |
314 | impl TryFrom<usize> for HeadingLevel { |
315 | type Error = InvalidHeadingLevel; |
316 | |
317 | fn try_from(value: usize) -> Result<Self, Self::Error> { |
318 | match value { |
319 | 1 => Ok(Self::H1), |
320 | 2 => Ok(Self::H2), |
321 | 3 => Ok(Self::H3), |
322 | 4 => Ok(Self::H4), |
323 | 5 => Ok(Self::H5), |
324 | 6 => Ok(Self::H6), |
325 | _ => Err(InvalidHeadingLevel(value)), |
326 | } |
327 | } |
328 | } |
329 | |
/// Type specifier for inline links. See [the Tag::Link](enum.Tag.html#variant.Link) for more information.
#[derive(Copy, Clone, Debug, PartialEq)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub enum LinkType {
    /// Inline link like `[foo](bar)`
    Inline,
    /// Reference link like `[foo][bar]`
    Reference,
    /// Reference link whose destination is not in the document, but which
    /// was resolved by the broken_link_callback
    ReferenceUnknown,
    /// Collapsed link like `[foo][]`
    Collapsed,
    /// Collapsed link whose destination is not in the document, but which
    /// was resolved by the broken_link_callback
    CollapsedUnknown,
    /// Shortcut link like `[foo]`
    Shortcut,
    /// Shortcut link whose destination is not in the document, but which
    /// was resolved by the broken_link_callback
    ShortcutUnknown,
    /// Autolink like `<http://foo.bar/baz>`
    Autolink,
    /// Email address in autolink like `<john@example.org>`
    Email,
}
353 | |
354 | impl LinkType { |
355 | /// Map the link type to an equivalent _Unknown link type. |
356 | fn to_unknown(self) -> Self { |
357 | match self { |
358 | LinkType::Reference => LinkType::ReferenceUnknown, |
359 | LinkType::Collapsed => LinkType::CollapsedUnknown, |
360 | LinkType::Shortcut => LinkType::ShortcutUnknown, |
361 | _ => unreachable!(), |
362 | } |
363 | } |
364 | } |
365 | |
366 | /// Markdown events that are generated in a preorder traversal of the document |
367 | /// tree, with additional `End` events whenever all of an inner node's children |
368 | /// have been visited. |
369 | #[derive (Clone, Debug, PartialEq)] |
370 | #[cfg_attr (feature = "serde" , derive(Serialize, Deserialize))] |
371 | pub enum Event<'a> { |
372 | /// Start of a tagged element. Events that are yielded after this event |
373 | /// and before its corresponding `End` event are inside this element. |
374 | /// Start and end events are guaranteed to be balanced. |
375 | #[cfg_attr (feature = "serde" , serde(borrow))] |
376 | Start(Tag<'a>), |
377 | /// End of a tagged element. |
378 | End(TagEnd), |
379 | /// A text node. |
380 | #[cfg_attr (feature = "serde" , serde(borrow))] |
381 | Text(CowStr<'a>), |
382 | /// An inline code node. |
383 | #[cfg_attr (feature = "serde" , serde(borrow))] |
384 | Code(CowStr<'a>), |
385 | /// An inline math environment node. |
386 | #[cfg_attr (feature = "serde" , serde(borrow))] |
387 | InlineMath(CowStr<'a>), |
388 | /// A display math environment node. |
389 | #[cfg_attr (feature = "serde" , serde(borrow))] |
390 | DisplayMath(CowStr<'a>), |
391 | /// An HTML node. |
392 | #[cfg_attr (feature = "serde" , serde(borrow))] |
393 | Html(CowStr<'a>), |
394 | /// An inline HTML node. |
395 | #[cfg_attr (feature = "serde" , serde(borrow))] |
396 | InlineHtml(CowStr<'a>), |
397 | /// A reference to a footnote with given label, which may or may not be defined |
398 | /// by an event with a `Tag::FootnoteDefinition` tag. Definitions and references to them may |
399 | /// occur in any order. |
400 | #[cfg_attr (feature = "serde" , serde(borrow))] |
401 | FootnoteReference(CowStr<'a>), |
402 | /// A soft line break. |
403 | SoftBreak, |
404 | /// A hard line break. |
405 | HardBreak, |
406 | /// A horizontal ruler. |
407 | Rule, |
408 | /// A task list marker, rendered as a checkbox in HTML. Contains a true when it is checked. |
409 | TaskListMarker(bool), |
410 | } |
411 | |
/// Table column text alignment.
#[derive(Clone, Copy, Debug, PartialEq)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub enum Alignment {
    /// Default text alignment.
    None,
    /// Left alignment.
    Left,
    /// Center alignment.
    Center,
    /// Right alignment.
    Right,
}
423 | |
424 | bitflags::bitflags! { |
425 | /// Option struct containing flags for enabling extra features |
426 | /// that are not part of the CommonMark spec. |
427 | #[derive (Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] |
428 | pub struct Options: u32 { |
429 | const ENABLE_TABLES = 1 << 1; |
430 | /// GitHub-compatible footnote syntax. |
431 | /// |
432 | /// Footnotes are referenced with the syntax `[^IDENT]`, |
433 | /// and defined with an identifier followed by a colon at top level. |
434 | /// |
435 | /// --- |
436 | /// |
437 | /// ```markdown |
438 | /// Footnote referenced [^1]. |
439 | /// |
440 | /// [^1]: footnote defined |
441 | /// ``` |
442 | /// |
443 | /// Footnote referenced [^1]. |
444 | /// |
445 | /// [^1]: footnote defined |
446 | const ENABLE_FOOTNOTES = 1 << 2; |
447 | const ENABLE_STRIKETHROUGH = 1 << 3; |
448 | const ENABLE_TASKLISTS = 1 << 4; |
449 | const ENABLE_SMART_PUNCTUATION = 1 << 5; |
450 | /// Extension to allow headings to have ID and classes. |
451 | /// |
452 | /// `# text { #id .class1 .class2 myattr other_attr=myvalue }` |
453 | /// is interpreted as a level 1 heading |
454 | /// with the content `text`, ID `id`, classes `class1` and `class2` and |
455 | /// custom attributes `myattr` (without value) and |
456 | /// `other_attr` with value `myvalue`. |
457 | /// Note that ID, classes, and custom attributes should be space-separated. |
458 | const ENABLE_HEADING_ATTRIBUTES = 1 << 6; |
459 | /// Metadata blocks in YAML style, i.e.: |
460 | /// - starting with a `---` line |
461 | /// - ending with a `---` or `...` line |
462 | const ENABLE_YAML_STYLE_METADATA_BLOCKS = 1 << 7; |
463 | /// Metadata blocks delimited by: |
464 | /// - `+++` line at start |
465 | /// - `+++` line at end |
466 | const ENABLE_PLUSES_DELIMITED_METADATA_BLOCKS = 1 << 8; |
467 | /// Older footnote syntax. This flag implies `ENABLE_FOOTNOTES`, changing it to use an |
468 | /// older syntax instead of the new, default, GitHub-compatible syntax. |
469 | /// |
470 | /// New syntax is different from the old syntax regarding |
471 | /// indentation, nesting, and footnote references with no definition: |
472 | /// |
473 | /// ```markdown |
474 | /// [^1]: In new syntax, this is two footnote definitions. |
475 | /// [^2]: In old syntax, this is a single footnote definition with two lines. |
476 | /// |
477 | /// [^3]: |
478 | /// |
479 | /// In new syntax, this is a footnote with two paragraphs. |
480 | /// |
481 | /// In old syntax, this is a footnote followed by a code block. |
482 | /// |
483 | /// In new syntax, this undefined footnote definition renders as |
484 | /// literal text [^4]. In old syntax, it creates a dangling link. |
485 | /// ``` |
486 | const ENABLE_OLD_FOOTNOTES = (1 << 9) | (1 << 2); |
487 | /// With this feature enabled, two events `Event::InlineMath` and `Event::DisplayMath` |
488 | /// are emitted that conventionally contain TeX formulas. |
489 | const ENABLE_MATH = 1 << 10; |
490 | /// Misc GitHub Flavored Markdown features not supported in CommonMark. |
491 | /// The following features are currently behind this tag: |
492 | /// - Blockquote tags ([!NOTE], [!TIP], [!IMPORTANT], [!WARNING], [!CAUTION]). |
493 | const ENABLE_GFM = 1 << 11; |
494 | } |
495 | } |
496 | |
497 | impl Options { |
498 | pub(crate) fn has_gfm_footnotes(&self) -> bool { |
499 | self.contains(Options::ENABLE_FOOTNOTES) && !self.contains(Options::ENABLE_OLD_FOOTNOTES) |
500 | } |
501 | } |
502 | |