| 1 | //! The first pass resolves all block structure, generating an AST. Within a block, items |
| 2 | //! are in a linear chain with potential inline markup identified. |
| 3 | |
| 4 | use std::cmp::max; |
| 5 | use std::ops::Range; |
| 6 | |
| 7 | use crate::parse::{ |
| 8 | scan_containers, Allocations, FootnoteDef, HeadingAttributes, Item, ItemBody, LinkDef, |
| 9 | LINK_MAX_NESTED_PARENS, |
| 10 | }; |
| 11 | use crate::strings::CowStr; |
| 12 | use crate::tree::{Tree, TreeIndex}; |
| 13 | use crate::Options; |
| 14 | use crate::{ |
| 15 | linklabel::{scan_link_label_rest, LinkLabel}, |
| 16 | HeadingLevel, |
| 17 | }; |
| 18 | use crate::{scanners::*, MetadataBlockKind}; |
| 19 | |
| 20 | use unicase::UniCase; |
| 21 | |
| 22 | /// Runs the first pass, which resolves the block structure of the document, |
| 23 | /// and returns the resulting tree. |
| 24 | pub(crate) fn run_first_pass(text: &str, options: Options) -> (Tree<Item>, Allocations<'_>) { |
| 25 | // This is a very naive heuristic for the number of nodes |
| 26 | // we'll need. |
| 27 | let start_capacity: usize = max(v1:128, v2:text.len() / 32); |
| 28 | let lookup_table: &[bool; 256] = &create_lut(&options); |
| 29 | let first_pass: FirstPass<'_, '_> = FirstPass { |
| 30 | text, |
| 31 | tree: Tree::with_capacity(cap:start_capacity), |
| 32 | begin_list_item: None, |
| 33 | last_line_blank: false, |
| 34 | allocs: Allocations::new(), |
| 35 | options, |
| 36 | lookup_table, |
| 37 | next_paragraph_task: None, |
| 38 | brace_context_next: 0, |
| 39 | brace_context_stack: Vec::new(), |
| 40 | }; |
| 41 | first_pass.run() |
| 42 | } |
| 43 | |
| 44 | // Each level of brace nesting adds another entry to a hash table. |
| 45 | // To limit the amount of memory consumed, do not create a new brace |
| 46 | // context beyond some amount deep. |
| 47 | // |
| 48 | // There are actually two limits at play here: this one, |
| 49 | // and the one where the maximum amount of distinct contexts passes |
| 50 | // the 255 item limit imposed by using `u8`. When over 255 distinct |
| 51 | // contexts are created, it wraps around, while this one instead makes it |
| 52 | // saturate, which is a better behavior. |
| 53 | const MATH_BRACE_CONTEXT_MAX_NESTING: usize = 25; |
| 54 | |
| 55 | /// State for the first parsing pass. |
| 56 | struct FirstPass<'a, 'b> { |
| 57 | text: &'a str, |
| 58 | tree: Tree<Item>, |
| 59 | begin_list_item: Option<usize>, |
| 60 | last_line_blank: bool, |
| 61 | allocs: Allocations<'a>, |
| 62 | options: Options, |
| 63 | lookup_table: &'b LookupTable, |
| 64 | /// This is `Some(item)` when the next paragraph |
| 65 | /// starts with a task list marker. |
| 66 | next_paragraph_task: Option<Item>, |
| 67 | /// Math environment brace nesting. |
| 68 | brace_context_stack: Vec<u8>, |
| 69 | brace_context_next: usize, |
| 70 | } |
| 71 | |
| 72 | impl<'a, 'b> FirstPass<'a, 'b> { |
| 73 | fn run(mut self) -> (Tree<Item>, Allocations<'a>) { |
| 74 | let mut ix = 0; |
| 75 | while ix < self.text.len() { |
| 76 | ix = self.parse_block(ix); |
| 77 | } |
| 78 | for _ in 0..self.tree.spine_len() { |
| 79 | self.pop(ix); |
| 80 | } |
| 81 | (self.tree, self.allocs) |
| 82 | } |
| 83 | |
| 84 | /// Returns offset after block. |
| 85 | fn parse_block(&mut self, mut start_ix: usize) -> usize { |
| 86 | let bytes = self.text.as_bytes(); |
| 87 | let mut line_start = LineStart::new(&bytes[start_ix..]); |
| 88 | |
| 89 | // math spans and their braces are tracked only within a single block |
| 90 | self.brace_context_stack.clear(); |
| 91 | self.brace_context_next = 0; |
| 92 | |
| 93 | let i = scan_containers( |
| 94 | &self.tree, |
| 95 | &mut line_start, |
| 96 | self.options.has_gfm_footnotes(), |
| 97 | ); |
| 98 | for _ in i..self.tree.spine_len() { |
| 99 | self.pop(start_ix); |
| 100 | } |
| 101 | |
| 102 | if self.options.contains(Options::ENABLE_OLD_FOOTNOTES) { |
| 103 | // finish footnote if it's still open and was preceded by blank line |
| 104 | if let Some(node_ix) = self.tree.peek_up() { |
| 105 | if let ItemBody::FootnoteDefinition(..) = self.tree[node_ix].item.body { |
| 106 | if self.last_line_blank { |
| 107 | self.pop(start_ix); |
| 108 | } |
| 109 | } |
| 110 | } |
| 111 | |
| 112 | // Footnote definitions of the form |
| 113 | // [^bar]: |
| 114 | // * anything really |
| 115 | let container_start = start_ix + line_start.bytes_scanned(); |
| 116 | if let Some(bytecount) = self.parse_footnote(container_start) { |
| 117 | start_ix = container_start + bytecount; |
| 118 | start_ix += scan_blank_line(&bytes[start_ix..]).unwrap_or(0); |
| 119 | line_start = LineStart::new(&bytes[start_ix..]); |
| 120 | } |
| 121 | } |
| 122 | |
| 123 | // Process new containers |
| 124 | loop { |
| 125 | if self.options.has_gfm_footnotes() |
| 126 | || self.options.contains(Options::ENABLE_OLD_FOOTNOTES) |
| 127 | { |
| 128 | // Footnote definitions of the form |
| 129 | // [^bar]: |
| 130 | // * anything really |
| 131 | let save = line_start.clone(); |
| 132 | let indent = line_start.scan_space_upto(4); |
| 133 | if indent < 4 { |
| 134 | let container_start = start_ix + line_start.bytes_scanned(); |
| 135 | if let Some(bytecount) = self.parse_footnote(container_start) { |
| 136 | start_ix = container_start + bytecount; |
| 137 | line_start = LineStart::new(&bytes[start_ix..]); |
| 138 | continue; |
| 139 | } else { |
| 140 | line_start = save; |
| 141 | } |
| 142 | } else { |
| 143 | line_start = save; |
| 144 | } |
| 145 | } |
| 146 | let container_start = start_ix + line_start.bytes_scanned(); |
| 147 | if let Some((ch, index, indent)) = line_start.scan_list_marker() { |
| 148 | let after_marker_index = start_ix + line_start.bytes_scanned(); |
| 149 | self.continue_list(container_start, ch, index); |
| 150 | self.tree.append(Item { |
| 151 | start: container_start, |
| 152 | end: after_marker_index, // will get updated later if item not empty |
| 153 | body: ItemBody::ListItem(indent), |
| 154 | }); |
| 155 | self.tree.push(); |
| 156 | if let Some(n) = scan_blank_line(&bytes[after_marker_index..]) { |
| 157 | self.begin_list_item = Some(after_marker_index + n); |
| 158 | return after_marker_index + n; |
| 159 | } |
| 160 | if self.options.contains(Options::ENABLE_TASKLISTS) { |
| 161 | let task_list_marker = |
| 162 | line_start.scan_task_list_marker().map(|is_checked| Item { |
| 163 | start: after_marker_index, |
| 164 | end: start_ix + line_start.bytes_scanned(), |
| 165 | body: ItemBody::TaskListMarker(is_checked), |
| 166 | }); |
| 167 | if let Some(task_list_marker) = task_list_marker { |
| 168 | if let Some(n) = scan_blank_line(&bytes[task_list_marker.end..]) { |
| 169 | self.tree.append(task_list_marker); |
| 170 | self.begin_list_item = Some(task_list_marker.end + n); |
| 171 | return task_list_marker.end + n; |
| 172 | } else { |
| 173 | self.next_paragraph_task = Some(task_list_marker); |
| 174 | } |
| 175 | } |
| 176 | } |
| 177 | } else if line_start.scan_blockquote_marker() { |
| 178 | let kind = if self.options.contains(Options::ENABLE_GFM) { |
| 179 | line_start.scan_blockquote_tag() |
| 180 | } else { |
| 181 | None |
| 182 | }; |
| 183 | self.finish_list(start_ix); |
| 184 | self.tree.append(Item { |
| 185 | start: container_start, |
| 186 | end: 0, // will get set later |
| 187 | body: ItemBody::BlockQuote(kind), |
| 188 | }); |
| 189 | self.tree.push(); |
| 190 | if kind.is_some() { |
| 191 | // blockquote tag leaves us at the end of the line |
| 192 | // we need to scan through all the container syntax for the next line |
| 193 | // and break out if we can't re-scan all of them |
| 194 | let ix = start_ix + line_start.bytes_scanned(); |
| 195 | let mut lazy_line_start = LineStart::new(&bytes[ix..]); |
| 196 | let current_container = scan_containers( |
| 197 | &self.tree, |
| 198 | &mut lazy_line_start, |
| 199 | self.options.has_gfm_footnotes(), |
| 200 | ) == self.tree.spine_len(); |
| 201 | if !lazy_line_start.scan_space(4) |
| 202 | && self.scan_paragraph_interrupt( |
| 203 | &bytes[ix + lazy_line_start.bytes_scanned()..], |
| 204 | current_container, |
| 205 | ) |
| 206 | { |
| 207 | return ix; |
| 208 | } else { |
| 209 | // blockquote tags act as if they were nested in a paragraph |
| 210 | // so you can lazily continue the imaginary paragraph off of them |
| 211 | line_start = lazy_line_start; |
| 212 | line_start.scan_all_space(); |
| 213 | start_ix = ix; |
| 214 | break; |
| 215 | } |
| 216 | } |
| 217 | } else { |
| 218 | break; |
| 219 | } |
| 220 | } |
| 221 | |
| 222 | let ix = start_ix + line_start.bytes_scanned(); |
| 223 | |
| 224 | if let Some(n) = scan_blank_line(&bytes[ix..]) { |
| 225 | if let Some(node_ix) = self.tree.peek_up() { |
| 226 | match &mut self.tree[node_ix].item.body { |
| 227 | ItemBody::BlockQuote(..) => (), |
| 228 | ItemBody::ListItem(indent) if self.begin_list_item.is_some() => { |
| 229 | self.last_line_blank = true; |
| 230 | // This is a blank list item. |
| 231 | // While the list itself can be continued no matter how many blank lines |
| 232 | // there are between this one and the next one, it cannot nest anything |
| 233 | // else, so its indentation should not be subtracted from the line start. |
| 234 | *indent = 0; |
| 235 | } |
| 236 | _ => { |
| 237 | self.last_line_blank = true; |
| 238 | } |
| 239 | } |
| 240 | } |
| 241 | return ix + n; |
| 242 | } |
| 243 | |
| 244 | self.finish_list(start_ix); |
| 245 | |
| 246 | // Save `remaining_space` here to avoid needing to backtrack `line_start` for HTML blocks |
| 247 | let remaining_space = line_start.remaining_space(); |
| 248 | |
| 249 | let indent = line_start.scan_space_upto(4); |
| 250 | if indent == 4 { |
| 251 | let ix = start_ix + line_start.bytes_scanned(); |
| 252 | let remaining_space = line_start.remaining_space(); |
| 253 | return self.parse_indented_code_block(ix, remaining_space); |
| 254 | } |
| 255 | |
| 256 | let ix = start_ix + line_start.bytes_scanned(); |
| 257 | |
| 258 | // metadata blocks cannot be indented |
| 259 | if indent == 0 { |
| 260 | if let Some((_n, metadata_block_ch)) = scan_metadata_block( |
| 261 | &bytes[ix..], |
| 262 | self.options |
| 263 | .contains(Options::ENABLE_YAML_STYLE_METADATA_BLOCKS), |
| 264 | self.options |
| 265 | .contains(Options::ENABLE_PLUSES_DELIMITED_METADATA_BLOCKS), |
| 266 | ) { |
| 267 | return self.parse_metadata_block(ix, metadata_block_ch); |
| 268 | } |
| 269 | } |
| 270 | |
| 271 | // HTML Blocks |
| 272 | if bytes[ix] == b'<' { |
| 273 | // Types 1-5 are all detected by one function and all end with the same |
| 274 | // pattern |
| 275 | if let Some(html_end_tag) = get_html_end_tag(&bytes[(ix + 1)..]) { |
| 276 | return self.parse_html_block_type_1_to_5( |
| 277 | ix, |
| 278 | html_end_tag, |
| 279 | remaining_space, |
| 280 | indent, |
| 281 | ); |
| 282 | } |
| 283 | |
| 284 | // Detect type 6 |
| 285 | if starts_html_block_type_6(&bytes[(ix + 1)..]) { |
| 286 | return self.parse_html_block_type_6_or_7(ix, remaining_space, indent); |
| 287 | } |
| 288 | |
| 289 | // Detect type 7 |
| 290 | if let Some(_html_bytes) = scan_html_type_7(&bytes[ix..]) { |
| 291 | return self.parse_html_block_type_6_or_7(ix, remaining_space, indent); |
| 292 | } |
| 293 | } |
| 294 | |
| 295 | if let Ok(n) = scan_hrule(&bytes[ix..]) { |
| 296 | return self.parse_hrule(n, ix); |
| 297 | } |
| 298 | |
| 299 | if let Some(atx_size) = scan_atx_heading(&bytes[ix..]) { |
| 300 | return self.parse_atx_heading(ix, atx_size); |
| 301 | } |
| 302 | |
| 303 | if let Some((n, fence_ch)) = scan_code_fence(&bytes[ix..]) { |
| 304 | return self.parse_fenced_code_block(ix, indent, fence_ch, n); |
| 305 | } |
| 306 | |
| 307 | // parse refdef |
| 308 | while let Some((bytecount, label, link_def)) = |
| 309 | self.parse_refdef_total(start_ix + line_start.bytes_scanned()) |
| 310 | { |
| 311 | self.allocs.refdefs.0.entry(label).or_insert(link_def); |
| 312 | let container_start = start_ix + line_start.bytes_scanned(); |
| 313 | let mut ix = container_start + bytecount; |
| 314 | // Refdefs act as if they were contained within a paragraph, for purposes of lazy |
| 315 | // continuations. For example: |
| 316 | // |
| 317 | // ``` |
| 318 | // > [foo]: http://example.com |
| 319 | // bar |
| 320 | // ``` |
| 321 | // |
| 322 | // is equivalent to |
| 323 | // |
| 324 | // ``` |
| 325 | // > bar |
| 326 | // |
| 327 | // [foo]: http://example.com |
| 328 | // ``` |
| 329 | if let Some(nl) = scan_blank_line(&bytes[ix..]) { |
| 330 | ix += nl; |
| 331 | let mut lazy_line_start = LineStart::new(&bytes[ix..]); |
| 332 | let current_container = scan_containers( |
| 333 | &self.tree, |
| 334 | &mut lazy_line_start, |
| 335 | self.options.has_gfm_footnotes(), |
| 336 | ) == self.tree.spine_len(); |
| 337 | if !lazy_line_start.scan_space(4) |
| 338 | && self.scan_paragraph_interrupt( |
| 339 | &bytes[ix + lazy_line_start.bytes_scanned()..], |
| 340 | current_container, |
| 341 | ) |
| 342 | { |
| 343 | return ix; |
| 344 | } else { |
| 345 | line_start = lazy_line_start; |
| 346 | line_start.scan_all_space(); |
| 347 | start_ix = ix; |
| 348 | } |
| 349 | } else { |
| 350 | return ix; |
| 351 | } |
| 352 | } |
| 353 | |
| 354 | let ix = start_ix + line_start.bytes_scanned(); |
| 355 | |
| 356 | self.parse_paragraph(ix) |
| 357 | } |
| 358 | |
| 359 | /// Returns the offset of the first line after the table. |
| 360 | /// Assumptions: current focus is a table element and the table header |
| 361 | /// matches the separator line (same number of columns). |
| 362 | fn parse_table( |
| 363 | &mut self, |
| 364 | table_cols: usize, |
| 365 | head_start: usize, |
| 366 | body_start: usize, |
| 367 | ) -> Option<usize> { |
| 368 | // filled empty cells are limited to protect against quadratic growth |
| 369 | // https://github.com/raphlinus/pulldown-cmark/issues/832 |
| 370 | let mut missing_empty_cells = 0; |
| 371 | // parse header. this shouldn't fail because we made sure the table header is ok |
| 372 | let (_sep_start, thead_ix) = |
| 373 | self.parse_table_row_inner(head_start, table_cols, &mut missing_empty_cells)?; |
| 374 | self.tree[thead_ix].item.body = ItemBody::TableHead; |
| 375 | |
| 376 | // parse body |
| 377 | let mut ix = body_start; |
| 378 | while let Some((next_ix, _row_ix)) = |
| 379 | self.parse_table_row(ix, table_cols, &mut missing_empty_cells) |
| 380 | { |
| 381 | ix = next_ix; |
| 382 | } |
| 383 | |
| 384 | self.pop(ix); |
| 385 | Some(ix) |
| 386 | } |
| 387 | |
| 388 | /// Call this when containers are taken care of. |
| 389 | /// Returns bytes scanned, row_ix |
| 390 | fn parse_table_row_inner( |
| 391 | &mut self, |
| 392 | mut ix: usize, |
| 393 | row_cells: usize, |
| 394 | missing_empty_cells: &mut usize, |
| 395 | ) -> Option<(usize, TreeIndex)> { |
| 396 | // Limit to prevent a malicious input from causing a denial of service. |
| 397 | const MAX_AUTOCOMPLETED_CELLS: usize = 1 << 18; // = 0x40000 |
| 398 | |
| 399 | let bytes = self.text.as_bytes(); |
| 400 | let mut cells = 0; |
| 401 | let mut final_cell_ix = None; |
| 402 | |
| 403 | let old_cur = self.tree.cur(); |
| 404 | let row_ix = self.tree.append(Item { |
| 405 | start: ix, |
| 406 | end: 0, // set at end of this function |
| 407 | body: ItemBody::TableRow, |
| 408 | }); |
| 409 | self.tree.push(); |
| 410 | |
| 411 | loop { |
| 412 | ix += scan_ch(&bytes[ix..], b'|' ); |
| 413 | let start_ix = ix; |
| 414 | ix += scan_whitespace_no_nl(&bytes[ix..]); |
| 415 | |
| 416 | if let Some(eol_bytes) = scan_eol(&bytes[ix..]) { |
| 417 | ix += eol_bytes; |
| 418 | break; |
| 419 | } |
| 420 | |
| 421 | let cell_ix = self.tree.append(Item { |
| 422 | start: start_ix, |
| 423 | end: ix, |
| 424 | body: ItemBody::TableCell, |
| 425 | }); |
| 426 | self.tree.push(); |
| 427 | let (next_ix, _brk) = self.parse_line(ix, None, TableParseMode::Active); |
| 428 | |
| 429 | self.tree[cell_ix].item.end = next_ix; |
| 430 | self.tree.pop(); |
| 431 | |
| 432 | ix = next_ix; |
| 433 | cells += 1; |
| 434 | |
| 435 | if cells == row_cells { |
| 436 | final_cell_ix = Some(cell_ix); |
| 437 | } |
| 438 | } |
| 439 | |
| 440 | if let (Some(cur), 0) = (old_cur, cells) { |
| 441 | self.pop(ix); |
| 442 | self.tree[cur].next = None; |
| 443 | return None; |
| 444 | } |
| 445 | |
| 446 | // fill empty cells if needed |
| 447 | // note: this is where GFM and commonmark-extra diverge. we follow |
| 448 | // GFM here |
| 449 | for _ in cells..row_cells { |
| 450 | if *missing_empty_cells >= MAX_AUTOCOMPLETED_CELLS { |
| 451 | return None; |
| 452 | } |
| 453 | *missing_empty_cells += 1; |
| 454 | self.tree.append(Item { |
| 455 | start: ix, |
| 456 | end: ix, |
| 457 | body: ItemBody::TableCell, |
| 458 | }); |
| 459 | } |
| 460 | |
| 461 | // drop excess cells |
| 462 | if let Some(cell_ix) = final_cell_ix { |
| 463 | self.tree[cell_ix].next = None; |
| 464 | } |
| 465 | |
| 466 | self.pop(ix); |
| 467 | |
| 468 | Some((ix, row_ix)) |
| 469 | } |
| 470 | |
| 471 | /// Returns first offset after the row and the tree index of the row. |
| 472 | fn parse_table_row( |
| 473 | &mut self, |
| 474 | mut ix: usize, |
| 475 | row_cells: usize, |
| 476 | missing_empty_cells: &mut usize, |
| 477 | ) -> Option<(usize, TreeIndex)> { |
| 478 | let bytes = self.text.as_bytes(); |
| 479 | let mut line_start = LineStart::new(&bytes[ix..]); |
| 480 | let current_container = scan_containers( |
| 481 | &self.tree, |
| 482 | &mut line_start, |
| 483 | self.options.has_gfm_footnotes(), |
| 484 | ) == self.tree.spine_len(); |
| 485 | if !current_container { |
| 486 | return None; |
| 487 | } |
| 488 | line_start.scan_all_space(); |
| 489 | ix += line_start.bytes_scanned(); |
| 490 | if scan_paragraph_interrupt_no_table( |
| 491 | &bytes[ix..], |
| 492 | current_container, |
| 493 | self.options.contains(Options::ENABLE_FOOTNOTES), |
| 494 | &self.tree, |
| 495 | ) { |
| 496 | return None; |
| 497 | } |
| 498 | |
| 499 | let (ix, row_ix) = self.parse_table_row_inner(ix, row_cells, missing_empty_cells)?; |
| 500 | Some((ix, row_ix)) |
| 501 | } |
| 502 | |
| 503 | /// Returns offset of line start after paragraph. |
| 504 | fn parse_paragraph(&mut self, start_ix: usize) -> usize { |
| 505 | let node_ix = self.tree.append(Item { |
| 506 | start: start_ix, |
| 507 | end: 0, // will get set later |
| 508 | body: ItemBody::Paragraph, |
| 509 | }); |
| 510 | self.tree.push(); |
| 511 | |
| 512 | if let Some(item) = self.next_paragraph_task { |
| 513 | self.tree.append(item); |
| 514 | self.next_paragraph_task = None; |
| 515 | } |
| 516 | |
| 517 | let bytes = self.text.as_bytes(); |
| 518 | let mut ix = start_ix; |
| 519 | loop { |
| 520 | let scan_mode = if self.options.contains(Options::ENABLE_TABLES) && ix == start_ix { |
| 521 | TableParseMode::Scan |
| 522 | } else { |
| 523 | TableParseMode::Disabled |
| 524 | }; |
| 525 | let (next_ix, brk) = self.parse_line(ix, None, scan_mode); |
| 526 | |
| 527 | // break out when we find a table |
| 528 | if let Some(Item { |
| 529 | body: ItemBody::Table(alignment_ix), |
| 530 | .. |
| 531 | }) = brk |
| 532 | { |
| 533 | let table_cols = self.allocs[alignment_ix].len(); |
| 534 | self.tree[node_ix].item.body = ItemBody::Table(alignment_ix); |
| 535 | // this clears out any stuff we may have appended - but there may |
| 536 | // be a cleaner way |
| 537 | self.tree[node_ix].child = None; |
| 538 | self.tree.pop(); |
| 539 | self.tree.push(); |
| 540 | if let Some(ix) = self.parse_table(table_cols, ix, next_ix) { |
| 541 | return ix; |
| 542 | } |
| 543 | } |
| 544 | |
| 545 | ix = next_ix; |
| 546 | let mut line_start = LineStart::new(&bytes[ix..]); |
| 547 | let current_container = scan_containers( |
| 548 | &self.tree, |
| 549 | &mut line_start, |
| 550 | self.options.has_gfm_footnotes(), |
| 551 | ) == self.tree.spine_len(); |
| 552 | let trailing_backslash_pos = match brk { |
| 553 | Some(Item { |
| 554 | start, |
| 555 | body: ItemBody::HardBreak(true), |
| 556 | .. |
| 557 | }) if bytes[start] == b' \\' => Some(start), |
| 558 | _ => None, |
| 559 | }; |
| 560 | if !line_start.scan_space(4) { |
| 561 | let ix_new = ix + line_start.bytes_scanned(); |
| 562 | if current_container { |
| 563 | if let Some(ix_setext) = |
| 564 | self.parse_setext_heading(ix_new, node_ix, trailing_backslash_pos.is_some()) |
| 565 | { |
| 566 | if let Some(pos) = trailing_backslash_pos { |
| 567 | self.tree.append_text(pos, pos + 1, false); |
| 568 | } |
| 569 | ix = ix_setext; |
| 570 | break; |
| 571 | } |
| 572 | } |
| 573 | // first check for non-empty lists, then for other interrupts |
| 574 | let suffix = &bytes[ix_new..]; |
| 575 | if self.scan_paragraph_interrupt(suffix, current_container) { |
| 576 | if let Some(pos) = trailing_backslash_pos { |
| 577 | self.tree.append_text(pos, pos + 1, false); |
| 578 | } |
| 579 | break; |
| 580 | } |
| 581 | } |
| 582 | line_start.scan_all_space(); |
| 583 | if line_start.is_at_eol() { |
| 584 | if let Some(pos) = trailing_backslash_pos { |
| 585 | self.tree.append_text(pos, pos + 1, false); |
| 586 | } |
| 587 | break; |
| 588 | } |
| 589 | ix = next_ix + line_start.bytes_scanned(); |
| 590 | if let Some(item) = brk { |
| 591 | self.tree.append(item); |
| 592 | } |
| 593 | } |
| 594 | |
| 595 | self.pop(ix); |
| 596 | ix |
| 597 | } |
| 598 | |
| 599 | /// Returns end ix of setext_heading on success. |
| 600 | fn parse_setext_heading( |
| 601 | &mut self, |
| 602 | ix: usize, |
| 603 | node_ix: TreeIndex, |
| 604 | has_trailing_content: bool, |
| 605 | ) -> Option<usize> { |
| 606 | let bytes = self.text.as_bytes(); |
| 607 | let (n, level) = scan_setext_heading(&bytes[ix..])?; |
| 608 | let mut attrs = None; |
| 609 | |
| 610 | if let Some(cur_ix) = self.tree.cur() { |
| 611 | let parent_ix = self.tree.peek_up().unwrap(); |
| 612 | let header_start = self.tree[parent_ix].item.start; |
| 613 | // Note that `self.tree[parent_ix].item.end` might be zero at this point. |
| 614 | // Use the end position of the current node (i.e. the last known child |
| 615 | // of the parent) instead. |
| 616 | let header_end = self.tree[cur_ix].item.end; |
| 617 | |
| 618 | // extract the trailing attribute block |
| 619 | let (content_end, attrs_) = |
| 620 | self.extract_and_parse_heading_attribute_block(header_start, header_end); |
| 621 | attrs = attrs_; |
| 622 | |
| 623 | // strip trailing whitespace |
| 624 | let new_end = if has_trailing_content { |
| 625 | content_end |
| 626 | } else { |
| 627 | let trailing_ws = |
| 628 | scan_rev_while(&bytes[header_start..content_end], is_ascii_whitespace_no_nl); |
| 629 | content_end - trailing_ws |
| 630 | }; |
| 631 | |
| 632 | if attrs.is_some() { |
| 633 | // remove trailing block attributes |
| 634 | self.tree.truncate_siblings(new_end); |
| 635 | } |
| 636 | |
| 637 | if let Some(cur_ix) = self.tree.cur() { |
| 638 | self.tree[cur_ix].item.end = new_end; |
| 639 | } |
| 640 | } |
| 641 | |
| 642 | self.tree[node_ix].item.body = ItemBody::Heading( |
| 643 | level, |
| 644 | attrs.map(|attrs| self.allocs.allocate_heading(attrs)), |
| 645 | ); |
| 646 | |
| 647 | Some(ix + n) |
| 648 | } |
| 649 | |
| 650 | /// Parse a line of input, appending text and items to tree. |
| 651 | /// |
| 652 | /// Returns: index after line and an item representing the break. |
| 653 | fn parse_line( |
| 654 | &mut self, |
| 655 | start: usize, |
| 656 | end: Option<usize>, |
| 657 | mode: TableParseMode, |
| 658 | ) -> (usize, Option<Item>) { |
| 659 | let bytes = self.text.as_bytes(); |
| 660 | let bytes = match end { |
| 661 | Some(end) => &bytes[..end], |
| 662 | None => bytes, |
| 663 | }; |
| 664 | let bytes_len = bytes.len(); |
| 665 | let mut pipes = 0; |
| 666 | let mut last_pipe_ix = start; |
| 667 | let mut begin_text = start; |
| 668 | let mut backslash_escaped = false; |
| 669 | |
| 670 | let (final_ix, brk) = iterate_special_bytes(self.lookup_table, bytes, start, |ix, byte| { |
| 671 | match byte { |
| 672 | b' \n' | b' \r' => { |
| 673 | if let TableParseMode::Active = mode { |
| 674 | return LoopInstruction::BreakAtWith(ix, None); |
| 675 | } |
| 676 | |
| 677 | let mut i = ix; |
| 678 | let eol_bytes = scan_eol(&bytes[ix..]).unwrap(); |
| 679 | |
| 680 | let end_ix = ix + eol_bytes; |
| 681 | let trailing_backslashes = scan_rev_while(&bytes[..ix], |b| b == b' \\' ); |
| 682 | if trailing_backslashes % 2 == 1 && end_ix < bytes_len { |
| 683 | i -= 1; |
| 684 | self.tree.append_text(begin_text, i, backslash_escaped); |
| 685 | backslash_escaped = false; |
| 686 | return LoopInstruction::BreakAtWith( |
| 687 | end_ix, |
| 688 | Some(Item { |
| 689 | start: i, |
| 690 | end: end_ix, |
| 691 | body: ItemBody::HardBreak(true), |
| 692 | }), |
| 693 | ); |
| 694 | } |
| 695 | |
| 696 | if mode == TableParseMode::Scan && pipes > 0 { |
| 697 | // check if we may be parsing a table |
| 698 | let next_line_ix = ix + eol_bytes; |
| 699 | let mut line_start = LineStart::new(&bytes[next_line_ix..]); |
| 700 | if scan_containers( |
| 701 | &self.tree, |
| 702 | &mut line_start, |
| 703 | self.options.has_gfm_footnotes(), |
| 704 | ) == self.tree.spine_len() |
| 705 | { |
| 706 | let table_head_ix = next_line_ix + line_start.bytes_scanned(); |
| 707 | let (table_head_bytes, alignment) = |
| 708 | scan_table_head(&bytes[table_head_ix..]); |
| 709 | |
| 710 | if table_head_bytes > 0 { |
| 711 | // computing header count from number of pipes |
| 712 | let header_count = |
| 713 | count_header_cols(bytes, pipes, start, last_pipe_ix); |
| 714 | |
| 715 | // make sure they match the number of columns we find in separator line |
| 716 | if alignment.len() == header_count { |
| 717 | let alignment_ix = self.allocs.allocate_alignment(alignment); |
| 718 | let end_ix = table_head_ix + table_head_bytes; |
| 719 | return LoopInstruction::BreakAtWith( |
| 720 | end_ix, |
| 721 | Some(Item { |
| 722 | start: i, |
| 723 | end: end_ix, // must update later |
| 724 | body: ItemBody::Table(alignment_ix), |
| 725 | }), |
| 726 | ); |
| 727 | } |
| 728 | } |
| 729 | } |
| 730 | } |
| 731 | |
| 732 | let trailing_whitespace = |
| 733 | scan_rev_while(&bytes[..ix], is_ascii_whitespace_no_nl); |
| 734 | if trailing_whitespace >= 2 { |
| 735 | i -= trailing_whitespace; |
| 736 | self.tree.append_text(begin_text, i, backslash_escaped); |
| 737 | backslash_escaped = false; |
| 738 | return LoopInstruction::BreakAtWith( |
| 739 | end_ix, |
| 740 | Some(Item { |
| 741 | start: i, |
| 742 | end: end_ix, |
| 743 | body: ItemBody::HardBreak(false), |
| 744 | }), |
| 745 | ); |
| 746 | } |
| 747 | |
| 748 | self.tree |
| 749 | .append_text(begin_text, ix - trailing_whitespace, backslash_escaped); |
| 750 | backslash_escaped = false; |
| 751 | |
| 752 | LoopInstruction::BreakAtWith( |
| 753 | end_ix, |
| 754 | Some(Item { |
| 755 | start: i, |
| 756 | end: end_ix, |
| 757 | body: ItemBody::SoftBreak, |
| 758 | }), |
| 759 | ) |
| 760 | } |
| 761 | b' \\' => { |
| 762 | if ix + 1 < bytes_len && is_ascii_punctuation(bytes[ix + 1]) { |
| 763 | self.tree.append_text(begin_text, ix, backslash_escaped); |
| 764 | if bytes[ix + 1] == b'`' { |
| 765 | let count = 1 + scan_ch_repeat(&bytes[(ix + 2)..], b'`' ); |
| 766 | self.tree.append(Item { |
| 767 | start: ix + 1, |
| 768 | end: ix + count + 1, |
| 769 | body: ItemBody::MaybeCode(count, true), |
| 770 | }); |
| 771 | begin_text = ix + 1 + count; |
| 772 | backslash_escaped = false; |
| 773 | LoopInstruction::ContinueAndSkip(count) |
| 774 | } else if bytes[ix + 1] == b'|' && TableParseMode::Active == mode { |
| 775 | // Yeah, it's super weird that backslash escaped pipes in tables aren't "real" |
| 776 | // backslash escapes. |
| 777 | // |
| 778 | // This tree structure is intended for the benefit of inline analysis, and it |
| 779 | // is supposed to operate as-if backslash escaped pipes were stripped out in a |
| 780 | // separate pass. |
| 781 | begin_text = ix + 1; |
| 782 | backslash_escaped = false; |
| 783 | LoopInstruction::ContinueAndSkip(1) |
| 784 | } else if ix + 2 < bytes_len |
| 785 | && bytes[ix + 1] == b' \\' |
| 786 | && bytes[ix + 2] == b'|' |
| 787 | && TableParseMode::Active == mode |
| 788 | { |
| 789 | // To parse `\\|`, discard the backslashes and parse the `|` that follows it. |
| 790 | begin_text = ix + 2; |
| 791 | backslash_escaped = true; |
| 792 | LoopInstruction::ContinueAndSkip(2) |
| 793 | } else { |
| 794 | begin_text = ix + 1; |
| 795 | backslash_escaped = true; |
| 796 | LoopInstruction::ContinueAndSkip(1) |
| 797 | } |
| 798 | } else { |
| 799 | LoopInstruction::ContinueAndSkip(0) |
| 800 | } |
| 801 | } |
| 802 | c @ b'*' | c @ b'_' | c @ b'~' => { |
| 803 | let string_suffix = &self.text[ix..]; |
| 804 | let count = 1 + scan_ch_repeat(&string_suffix.as_bytes()[1..], c); |
| 805 | let can_open = delim_run_can_open( |
| 806 | &self.text[start..], |
| 807 | string_suffix, |
| 808 | count, |
| 809 | ix - start, |
| 810 | mode, |
| 811 | ); |
| 812 | let can_close = delim_run_can_close( |
| 813 | &self.text[start..], |
| 814 | string_suffix, |
| 815 | count, |
| 816 | ix - start, |
| 817 | mode, |
| 818 | ); |
| 819 | let is_valid_seq = c != b'~' || count <= 2; |
| 820 | |
| 821 | if (can_open || can_close) && is_valid_seq { |
| 822 | self.tree.append_text(begin_text, ix, backslash_escaped); |
| 823 | backslash_escaped = false; |
| 824 | for i in 0..count { |
| 825 | self.tree.append(Item { |
| 826 | start: ix + i, |
| 827 | end: ix + i + 1, |
| 828 | body: ItemBody::MaybeEmphasis(count - i, can_open, can_close), |
| 829 | }); |
| 830 | } |
| 831 | begin_text = ix + count; |
| 832 | } |
| 833 | LoopInstruction::ContinueAndSkip(count - 1) |
| 834 | } |
| 835 | b'$' => { |
| 836 | let string_suffix = &self.text[ix..]; |
| 837 | let can_open = !string_suffix[1..] |
| 838 | .as_bytes() |
| 839 | .first() |
| 840 | .copied() |
| 841 | .map_or(true, is_ascii_whitespace); |
| 842 | let can_close = ix > start |
| 843 | && !self.text[..ix] |
| 844 | .as_bytes() |
| 845 | .last() |
| 846 | .copied() |
| 847 | .map_or(true, is_ascii_whitespace); |
| 848 | |
| 849 | // 0xFFFF_FFFF... represents the root brace context. Using None would require |
| 850 | // storing Option<u8>, which is bigger than u8. |
| 851 | // |
| 852 | // These shouldn't conflict unless you have 255 levels of nesting, which is |
| 853 | // past the intended limit anyway. |
| 854 | // |
| 855 | // Unbalanced braces will cause the root to be changed, which is why it gets |
| 856 | // stored here. |
| 857 | let brace_context = |
| 858 | if self.brace_context_stack.len() > MATH_BRACE_CONTEXT_MAX_NESTING { |
| 859 | self.brace_context_next as u8 |
| 860 | } else { |
| 861 | self.brace_context_stack.last().copied().unwrap_or_else(|| { |
| 862 | self.brace_context_stack.push(!0); |
| 863 | !0 |
| 864 | }) |
| 865 | }; |
| 866 | |
| 867 | self.tree.append_text(begin_text, ix, backslash_escaped); |
| 868 | self.tree.append(Item { |
| 869 | start: ix, |
| 870 | end: ix + 1, |
| 871 | body: ItemBody::MaybeMath(can_open, can_close, brace_context), |
| 872 | }); |
| 873 | begin_text = ix + 1; |
| 874 | LoopInstruction::ContinueAndSkip(0) |
| 875 | } |
| 876 | b'{' => { |
| 877 | if self.brace_context_stack.len() == MATH_BRACE_CONTEXT_MAX_NESTING { |
| 878 | self.brace_context_stack.push(self.brace_context_next as u8); |
| 879 | self.brace_context_next = MATH_BRACE_CONTEXT_MAX_NESTING; |
| 880 | } else if self.brace_context_stack.len() > MATH_BRACE_CONTEXT_MAX_NESTING { |
| 881 | // When we reach the limit of nesting, switch from actually matching |
| 882 | // braces to just counting them. |
| 883 | self.brace_context_next += 1; |
| 884 | } else if !self.brace_context_stack.is_empty() { |
| 885 | // Store nothing if no math environment has been reached yet. |
| 886 | self.brace_context_stack.push(self.brace_context_next as u8); |
| 887 | self.brace_context_next += 1; |
| 888 | } |
| 889 | LoopInstruction::ContinueAndSkip(0) |
| 890 | } |
| 891 | b'}' => { |
| 892 | if let &mut [ref mut top_level_context] = &mut self.brace_context_stack[..] { |
| 893 | // Unbalanced Braces |
| 894 | // |
| 895 | // The initial, root top-level brace context is -1, but this is changed whenever an unbalanced |
| 896 | // close brace is encountered: |
| 897 | // |
| 898 | // This is not a math environment: $}$ |
| 899 | // ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^|^ |
| 900 | // -1 |-2 |
| 901 | // |
| 902 | // To ensure this can't get parsed as math, each side of the unbalanced |
| 903 | // brace is an irreversibly separate brace context. As long as the math |
| 904 | // environment itself contains balanced braces, they should share a top level context. |
| 905 | // |
| 906 | // Math environment contains 2+2: $}$2+2$ |
| 907 | // ^^^ this is a math environment |
| 908 | *top_level_context = top_level_context.wrapping_sub(1); |
| 909 | } else if self.brace_context_stack.len() > MATH_BRACE_CONTEXT_MAX_NESTING { |
| 910 | // When we exceed 25 levels of nesting, switch from accurately balancing braces |
| 911 | // to just counting them. When we dip back below the limit, switch back. |
| 912 | if self.brace_context_next <= MATH_BRACE_CONTEXT_MAX_NESTING { |
| 913 | self.brace_context_stack.pop(); |
| 914 | } else { |
| 915 | self.brace_context_next -= 1; |
| 916 | } |
| 917 | } else { |
| 918 | self.brace_context_stack.pop(); |
| 919 | } |
| 920 | LoopInstruction::ContinueAndSkip(0) |
| 921 | } |
| 922 | b'`' => { |
| 923 | self.tree.append_text(begin_text, ix, backslash_escaped); |
| 924 | backslash_escaped = false; |
| 925 | let count = 1 + scan_ch_repeat(&bytes[(ix + 1)..], b'`' ); |
| 926 | self.tree.append(Item { |
| 927 | start: ix, |
| 928 | end: ix + count, |
| 929 | body: ItemBody::MaybeCode(count, false), |
| 930 | }); |
| 931 | begin_text = ix + count; |
| 932 | LoopInstruction::ContinueAndSkip(count - 1) |
| 933 | } |
| 934 | b'<' if bytes.get(ix + 1) != Some(&b' \\' ) => { |
| 935 | // Note: could detect some non-HTML cases and early escape here, but not |
| 936 | // clear that's a win. |
| 937 | self.tree.append_text(begin_text, ix, backslash_escaped); |
| 938 | backslash_escaped = false; |
| 939 | self.tree.append(Item { |
| 940 | start: ix, |
| 941 | end: ix + 1, |
| 942 | body: ItemBody::MaybeHtml, |
| 943 | }); |
| 944 | begin_text = ix + 1; |
| 945 | LoopInstruction::ContinueAndSkip(0) |
| 946 | } |
| 947 | b'!' => { |
| 948 | if ix + 1 < bytes_len && bytes[ix + 1] == b'[' { |
| 949 | self.tree.append_text(begin_text, ix, backslash_escaped); |
| 950 | backslash_escaped = false; |
| 951 | self.tree.append(Item { |
| 952 | start: ix, |
| 953 | end: ix + 2, |
| 954 | body: ItemBody::MaybeImage, |
| 955 | }); |
| 956 | begin_text = ix + 2; |
| 957 | LoopInstruction::ContinueAndSkip(1) |
| 958 | } else { |
| 959 | LoopInstruction::ContinueAndSkip(0) |
| 960 | } |
| 961 | } |
| 962 | b'[' => { |
| 963 | self.tree.append_text(begin_text, ix, backslash_escaped); |
| 964 | backslash_escaped = false; |
| 965 | self.tree.append(Item { |
| 966 | start: ix, |
| 967 | end: ix + 1, |
| 968 | body: ItemBody::MaybeLinkOpen, |
| 969 | }); |
| 970 | begin_text = ix + 1; |
| 971 | LoopInstruction::ContinueAndSkip(0) |
| 972 | } |
| 973 | b']' => { |
| 974 | self.tree.append_text(begin_text, ix, backslash_escaped); |
| 975 | backslash_escaped = false; |
| 976 | self.tree.append(Item { |
| 977 | start: ix, |
| 978 | end: ix + 1, |
| 979 | body: ItemBody::MaybeLinkClose(true), |
| 980 | }); |
| 981 | begin_text = ix + 1; |
| 982 | LoopInstruction::ContinueAndSkip(0) |
| 983 | } |
| 984 | b'&' => match scan_entity(&bytes[ix..]) { |
| 985 | (n, Some(value)) => { |
| 986 | self.tree.append_text(begin_text, ix, backslash_escaped); |
| 987 | backslash_escaped = false; |
| 988 | self.tree.append(Item { |
| 989 | start: ix, |
| 990 | end: ix + n, |
| 991 | body: ItemBody::SynthesizeText(self.allocs.allocate_cow(value)), |
| 992 | }); |
| 993 | begin_text = ix + n; |
| 994 | LoopInstruction::ContinueAndSkip(n - 1) |
| 995 | } |
| 996 | _ => LoopInstruction::ContinueAndSkip(0), |
| 997 | }, |
| 998 | b'|' => { |
| 999 | if ix != 0 && bytes[ix - 1] == b' \\' { |
| 1000 | LoopInstruction::ContinueAndSkip(0) |
| 1001 | } else if let TableParseMode::Active = mode { |
| 1002 | LoopInstruction::BreakAtWith(ix, None) |
| 1003 | } else { |
| 1004 | last_pipe_ix = ix; |
| 1005 | pipes += 1; |
| 1006 | LoopInstruction::ContinueAndSkip(0) |
| 1007 | } |
| 1008 | } |
| 1009 | b'.' => { |
| 1010 | if ix + 2 < bytes.len() && bytes[ix + 1] == b'.' && bytes[ix + 2] == b'.' { |
| 1011 | self.tree.append_text(begin_text, ix, backslash_escaped); |
| 1012 | backslash_escaped = false; |
| 1013 | self.tree.append(Item { |
| 1014 | start: ix, |
| 1015 | end: ix + 3, |
| 1016 | body: ItemBody::SynthesizeChar('…' ), |
| 1017 | }); |
| 1018 | begin_text = ix + 3; |
| 1019 | LoopInstruction::ContinueAndSkip(2) |
| 1020 | } else { |
| 1021 | LoopInstruction::ContinueAndSkip(0) |
| 1022 | } |
| 1023 | } |
| 1024 | b'-' => { |
| 1025 | let count = 1 + scan_ch_repeat(&bytes[(ix + 1)..], b'-' ); |
| 1026 | if count == 1 { |
| 1027 | LoopInstruction::ContinueAndSkip(0) |
| 1028 | } else { |
| 1029 | let itembody = if count == 2 { |
| 1030 | ItemBody::SynthesizeChar('–' ) |
| 1031 | } else if count == 3 { |
| 1032 | ItemBody::SynthesizeChar('—' ) |
| 1033 | } else { |
| 1034 | let (ems, ens) = match count % 6 { |
| 1035 | 0 | 3 => (count / 3, 0), |
| 1036 | 2 | 4 => (0, count / 2), |
| 1037 | 1 => (count / 3 - 1, 2), |
| 1038 | _ => (count / 3, 1), |
| 1039 | }; |
| 1040 | // – and — are 3 bytes each in utf8 |
| 1041 | let mut buf = String::with_capacity(3 * (ems + ens)); |
| 1042 | for _ in 0..ems { |
| 1043 | buf.push('—' ); |
| 1044 | } |
| 1045 | for _ in 0..ens { |
| 1046 | buf.push('–' ); |
| 1047 | } |
| 1048 | ItemBody::SynthesizeText(self.allocs.allocate_cow(buf.into())) |
| 1049 | }; |
| 1050 | |
| 1051 | self.tree.append_text(begin_text, ix, backslash_escaped); |
| 1052 | backslash_escaped = false; |
| 1053 | self.tree.append(Item { |
| 1054 | start: ix, |
| 1055 | end: ix + count, |
| 1056 | body: itembody, |
| 1057 | }); |
| 1058 | begin_text = ix + count; |
| 1059 | LoopInstruction::ContinueAndSkip(count - 1) |
| 1060 | } |
| 1061 | } |
| 1062 | c @ b' \'' | c @ b'"' => { |
| 1063 | let string_suffix = &self.text[ix..]; |
| 1064 | let can_open = |
| 1065 | delim_run_can_open(&self.text[start..], string_suffix, 1, ix - start, mode); |
| 1066 | let can_close = delim_run_can_close( |
| 1067 | &self.text[start..], |
| 1068 | string_suffix, |
| 1069 | 1, |
| 1070 | ix - start, |
| 1071 | mode, |
| 1072 | ); |
| 1073 | |
| 1074 | self.tree.append_text(begin_text, ix, backslash_escaped); |
| 1075 | backslash_escaped = false; |
| 1076 | self.tree.append(Item { |
| 1077 | start: ix, |
| 1078 | end: ix + 1, |
| 1079 | body: ItemBody::MaybeSmartQuote(c, can_open, can_close), |
| 1080 | }); |
| 1081 | begin_text = ix + 1; |
| 1082 | |
| 1083 | LoopInstruction::ContinueAndSkip(0) |
| 1084 | } |
| 1085 | _ => LoopInstruction::ContinueAndSkip(0), |
| 1086 | } |
| 1087 | }); |
| 1088 | |
| 1089 | if brk.is_none() { |
| 1090 | let trailing_whitespace = |
| 1091 | scan_rev_while(&bytes[begin_text..final_ix], is_ascii_whitespace_no_nl); |
| 1092 | // need to close text at eof |
| 1093 | self.tree.append_text( |
| 1094 | begin_text, |
| 1095 | final_ix - trailing_whitespace, |
| 1096 | backslash_escaped, |
| 1097 | ); |
| 1098 | } |
| 1099 | (final_ix, brk) |
| 1100 | } |
| 1101 | |
| 1102 | /// When start_ix is at the beginning of an HTML block of type 1 to 5, |
| 1103 | /// this will find the end of the block, adding the block itself to the |
| 1104 | /// tree and also keeping track of the lines of HTML within the block. |
| 1105 | /// |
| 1106 | /// The html_end_tag is the tag that must be found on a line to end the block. |
| 1107 | fn parse_html_block_type_1_to_5( |
| 1108 | &mut self, |
| 1109 | start_ix: usize, |
| 1110 | html_end_tag: &str, |
| 1111 | mut remaining_space: usize, |
| 1112 | mut indent: usize, |
| 1113 | ) -> usize { |
| 1114 | self.tree.append(Item { |
| 1115 | start: start_ix, |
| 1116 | end: 0, // set later |
| 1117 | body: ItemBody::HtmlBlock, |
| 1118 | }); |
| 1119 | self.tree.push(); |
| 1120 | |
| 1121 | let bytes = self.text.as_bytes(); |
| 1122 | let mut ix = start_ix; |
| 1123 | let end_ix; |
| 1124 | loop { |
| 1125 | let line_start_ix = ix; |
| 1126 | ix += scan_nextline(&bytes[ix..]); |
| 1127 | self.append_html_line(remaining_space.max(indent), line_start_ix, ix); |
| 1128 | |
| 1129 | let mut line_start = LineStart::new(&bytes[ix..]); |
| 1130 | let n_containers = scan_containers( |
| 1131 | &self.tree, |
| 1132 | &mut line_start, |
| 1133 | self.options.has_gfm_footnotes(), |
| 1134 | ); |
| 1135 | if n_containers < self.tree.spine_len() { |
| 1136 | end_ix = ix; |
| 1137 | break; |
| 1138 | } |
| 1139 | |
| 1140 | if self.text[line_start_ix..ix].contains(html_end_tag) { |
| 1141 | end_ix = ix; |
| 1142 | break; |
| 1143 | } |
| 1144 | |
| 1145 | let next_line_ix = ix + line_start.bytes_scanned(); |
| 1146 | if next_line_ix == self.text.len() { |
| 1147 | end_ix = next_line_ix; |
| 1148 | break; |
| 1149 | } |
| 1150 | ix = next_line_ix; |
| 1151 | remaining_space = line_start.remaining_space(); |
| 1152 | indent = 0; |
| 1153 | } |
| 1154 | self.pop(end_ix); |
| 1155 | ix |
| 1156 | } |
| 1157 | |
| 1158 | /// When start_ix is at the beginning of an HTML block of type 6 or 7, |
| 1159 | /// this will consume lines until there is a blank line and keep track of |
| 1160 | /// the HTML within the block. |
| 1161 | fn parse_html_block_type_6_or_7( |
| 1162 | &mut self, |
| 1163 | start_ix: usize, |
| 1164 | mut remaining_space: usize, |
| 1165 | mut indent: usize, |
| 1166 | ) -> usize { |
| 1167 | self.tree.append(Item { |
| 1168 | start: start_ix, |
| 1169 | end: 0, // set later |
| 1170 | body: ItemBody::HtmlBlock, |
| 1171 | }); |
| 1172 | self.tree.push(); |
| 1173 | |
| 1174 | let bytes = self.text.as_bytes(); |
| 1175 | let mut ix = start_ix; |
| 1176 | let end_ix; |
| 1177 | loop { |
| 1178 | let line_start_ix = ix; |
| 1179 | ix += scan_nextline(&bytes[ix..]); |
| 1180 | self.append_html_line(remaining_space.max(indent), line_start_ix, ix); |
| 1181 | |
| 1182 | let mut line_start = LineStart::new(&bytes[ix..]); |
| 1183 | let n_containers = scan_containers( |
| 1184 | &self.tree, |
| 1185 | &mut line_start, |
| 1186 | self.options.has_gfm_footnotes(), |
| 1187 | ); |
| 1188 | if n_containers < self.tree.spine_len() || line_start.is_at_eol() { |
| 1189 | end_ix = ix; |
| 1190 | break; |
| 1191 | } |
| 1192 | |
| 1193 | let next_line_ix = ix + line_start.bytes_scanned(); |
| 1194 | if next_line_ix == self.text.len() || scan_blank_line(&bytes[next_line_ix..]).is_some() |
| 1195 | { |
| 1196 | end_ix = next_line_ix; |
| 1197 | break; |
| 1198 | } |
| 1199 | ix = next_line_ix; |
| 1200 | remaining_space = line_start.remaining_space(); |
| 1201 | indent = 0; |
| 1202 | } |
| 1203 | self.pop(end_ix); |
| 1204 | ix |
| 1205 | } |
| 1206 | |
| 1207 | fn parse_indented_code_block(&mut self, start_ix: usize, mut remaining_space: usize) -> usize { |
| 1208 | self.tree.append(Item { |
| 1209 | start: start_ix, |
| 1210 | end: 0, // will get set later |
| 1211 | body: ItemBody::IndentCodeBlock, |
| 1212 | }); |
| 1213 | self.tree.push(); |
| 1214 | let bytes = self.text.as_bytes(); |
| 1215 | let mut last_nonblank_child = None; |
| 1216 | let mut last_nonblank_ix = 0; |
| 1217 | let mut end_ix = 0; |
| 1218 | self.last_line_blank = false; |
| 1219 | |
| 1220 | let mut ix = start_ix; |
| 1221 | loop { |
| 1222 | let line_start_ix = ix; |
| 1223 | ix += scan_nextline(&bytes[ix..]); |
| 1224 | self.append_code_text(remaining_space, line_start_ix, ix); |
| 1225 | // TODO(spec clarification): should we synthesize newline at EOF? |
| 1226 | |
| 1227 | if !self.last_line_blank { |
| 1228 | last_nonblank_child = self.tree.cur(); |
| 1229 | last_nonblank_ix = ix; |
| 1230 | end_ix = ix; |
| 1231 | } |
| 1232 | |
| 1233 | let mut line_start = LineStart::new(&bytes[ix..]); |
| 1234 | let n_containers = scan_containers( |
| 1235 | &self.tree, |
| 1236 | &mut line_start, |
| 1237 | self.options.has_gfm_footnotes(), |
| 1238 | ); |
| 1239 | if n_containers < self.tree.spine_len() |
| 1240 | || !(line_start.scan_space(4) || line_start.is_at_eol()) |
| 1241 | { |
| 1242 | break; |
| 1243 | } |
| 1244 | let next_line_ix = ix + line_start.bytes_scanned(); |
| 1245 | if next_line_ix == self.text.len() { |
| 1246 | break; |
| 1247 | } |
| 1248 | ix = next_line_ix; |
| 1249 | remaining_space = line_start.remaining_space(); |
| 1250 | self.last_line_blank = scan_blank_line(&bytes[ix..]).is_some(); |
| 1251 | } |
| 1252 | |
| 1253 | // Trim trailing blank lines. |
| 1254 | if let Some(child) = last_nonblank_child { |
| 1255 | self.tree[child].next = None; |
| 1256 | self.tree[child].item.end = last_nonblank_ix; |
| 1257 | } |
| 1258 | self.pop(end_ix); |
| 1259 | ix |
| 1260 | } |
| 1261 | |
| 1262 | fn parse_fenced_code_block( |
| 1263 | &mut self, |
| 1264 | start_ix: usize, |
| 1265 | indent: usize, |
| 1266 | fence_ch: u8, |
| 1267 | n_fence_char: usize, |
| 1268 | ) -> usize { |
| 1269 | let bytes = self.text.as_bytes(); |
| 1270 | let mut info_start = start_ix + n_fence_char; |
| 1271 | info_start += scan_whitespace_no_nl(&bytes[info_start..]); |
| 1272 | // TODO: info strings are typically very short. wouldn't it be faster |
| 1273 | // to just do a forward scan here? |
| 1274 | let mut ix = info_start + scan_nextline(&bytes[info_start..]); |
| 1275 | let info_end = ix - scan_rev_while(&bytes[info_start..ix], is_ascii_whitespace); |
| 1276 | let info_string = unescape(&self.text[info_start..info_end], self.tree.is_in_table()); |
| 1277 | self.tree.append(Item { |
| 1278 | start: start_ix, |
| 1279 | end: 0, // will get set later |
| 1280 | body: ItemBody::FencedCodeBlock(self.allocs.allocate_cow(info_string)), |
| 1281 | }); |
| 1282 | self.tree.push(); |
| 1283 | loop { |
| 1284 | let mut line_start = LineStart::new(&bytes[ix..]); |
| 1285 | let n_containers = scan_containers( |
| 1286 | &self.tree, |
| 1287 | &mut line_start, |
| 1288 | self.options.has_gfm_footnotes(), |
| 1289 | ); |
| 1290 | if n_containers < self.tree.spine_len() { |
| 1291 | // this line will get parsed again as not being part of the code |
| 1292 | // if it's blank, it should be parsed as a blank line |
| 1293 | self.pop(ix); |
| 1294 | return ix; |
| 1295 | } |
| 1296 | line_start.scan_space(indent); |
| 1297 | let mut close_line_start = line_start.clone(); |
| 1298 | if !close_line_start.scan_space(4 - indent) { |
| 1299 | let close_ix = ix + close_line_start.bytes_scanned(); |
| 1300 | if let Some(n) = scan_closing_code_fence(&bytes[close_ix..], fence_ch, n_fence_char) |
| 1301 | { |
| 1302 | ix = close_ix + n; |
| 1303 | self.pop(ix); |
| 1304 | // try to read trailing whitespace or it will register as a completely blank line |
| 1305 | return ix + scan_blank_line(&bytes[ix..]).unwrap_or(0); |
| 1306 | } |
| 1307 | } |
| 1308 | let remaining_space = line_start.remaining_space(); |
| 1309 | ix += line_start.bytes_scanned(); |
| 1310 | let next_ix = ix + scan_nextline(&bytes[ix..]); |
| 1311 | self.append_code_text(remaining_space, ix, next_ix); |
| 1312 | ix = next_ix; |
| 1313 | } |
| 1314 | } |
| 1315 | |
| 1316 | fn parse_metadata_block(&mut self, start_ix: usize, metadata_block_ch: u8) -> usize { |
| 1317 | let bytes = self.text.as_bytes(); |
| 1318 | let metadata_block_kind = match metadata_block_ch { |
| 1319 | b'-' => MetadataBlockKind::YamlStyle, |
| 1320 | b'+' => MetadataBlockKind::PlusesStyle, |
| 1321 | _ => panic!("Erroneous metadata block character when parsing metadata block" ), |
| 1322 | }; |
| 1323 | // 3 delimiter characters |
| 1324 | let mut ix = start_ix + 3 + scan_nextline(&bytes[start_ix + 3..]); |
| 1325 | self.tree.append(Item { |
| 1326 | start: start_ix, |
| 1327 | end: 0, // will get set later |
| 1328 | body: ItemBody::MetadataBlock(metadata_block_kind), |
| 1329 | }); |
| 1330 | self.tree.push(); |
| 1331 | loop { |
| 1332 | let mut line_start = LineStart::new(&bytes[ix..]); |
| 1333 | let n_containers = scan_containers( |
| 1334 | &self.tree, |
| 1335 | &mut line_start, |
| 1336 | self.options.has_gfm_footnotes(), |
| 1337 | ); |
| 1338 | if n_containers < self.tree.spine_len() { |
| 1339 | break; |
| 1340 | } |
| 1341 | if let (_, 0) = calc_indent(&bytes[ix..], 4) { |
| 1342 | if let Some(n) = scan_closing_metadata_block(&bytes[ix..], metadata_block_ch) { |
| 1343 | ix += n; |
| 1344 | break; |
| 1345 | } |
| 1346 | } |
| 1347 | let remaining_space = line_start.remaining_space(); |
| 1348 | ix += line_start.bytes_scanned(); |
| 1349 | let next_ix = ix + scan_nextline(&bytes[ix..]); |
| 1350 | self.append_code_text(remaining_space, ix, next_ix); |
| 1351 | ix = next_ix; |
| 1352 | } |
| 1353 | |
| 1354 | self.pop(ix); |
| 1355 | |
| 1356 | // try to read trailing whitespace or it will register as a completely blank line |
| 1357 | ix + scan_blank_line(&bytes[ix..]).unwrap_or(0) |
| 1358 | } |
| 1359 | |
| 1360 | fn append_code_text(&mut self, remaining_space: usize, start: usize, end: usize) { |
| 1361 | if remaining_space > 0 { |
| 1362 | let cow_ix = self.allocs.allocate_cow(" " [..remaining_space].into()); |
| 1363 | self.tree.append(Item { |
| 1364 | start, |
| 1365 | end: start, |
| 1366 | body: ItemBody::SynthesizeText(cow_ix), |
| 1367 | }); |
| 1368 | } |
| 1369 | if self.text.as_bytes()[end - 2] == b' \r' { |
| 1370 | // Normalize CRLF to LF |
| 1371 | self.tree.append_text(start, end - 2, false); |
| 1372 | self.tree.append_text(end - 1, end, false); |
| 1373 | } else { |
| 1374 | self.tree.append_text(start, end, false); |
| 1375 | } |
| 1376 | } |
| 1377 | |
| 1378 | /// Appends a line of HTML to the tree. |
| 1379 | fn append_html_line(&mut self, remaining_space: usize, start: usize, end: usize) { |
| 1380 | if remaining_space > 0 { |
| 1381 | let cow_ix = self.allocs.allocate_cow(" " [..remaining_space].into()); |
| 1382 | self.tree.append(Item { |
| 1383 | start, |
| 1384 | end: start, |
| 1385 | body: ItemBody::SynthesizeText(cow_ix), |
| 1386 | }); |
| 1387 | } |
| 1388 | if self.text.as_bytes()[end - 2] == b' \r' { |
| 1389 | // Normalize CRLF to LF |
| 1390 | self.tree.append(Item { |
| 1391 | start, |
| 1392 | end: end - 2, |
| 1393 | body: ItemBody::Html, |
| 1394 | }); |
| 1395 | self.tree.append(Item { |
| 1396 | start: end - 1, |
| 1397 | end, |
| 1398 | body: ItemBody::Html, |
| 1399 | }); |
| 1400 | } else { |
| 1401 | self.tree.append(Item { |
| 1402 | start, |
| 1403 | end, |
| 1404 | body: ItemBody::Html, |
| 1405 | }); |
| 1406 | } |
| 1407 | } |
| 1408 | |
| 1409 | /// Pop a container, setting its end. |
| 1410 | fn pop(&mut self, ix: usize) { |
| 1411 | let cur_ix = self.tree.pop().unwrap(); |
| 1412 | self.tree[cur_ix].item.end = ix; |
| 1413 | if let ItemBody::List(true, _, _) = self.tree[cur_ix].item.body { |
| 1414 | surgerize_tight_list(&mut self.tree, cur_ix); |
| 1415 | self.begin_list_item = None; |
| 1416 | } |
| 1417 | } |
| 1418 | |
| 1419 | /// Close a list if it's open. Also set loose if last line was blank |
| 1420 | /// and end current list if it's a lone, empty item |
| 1421 | fn finish_list(&mut self, ix: usize) { |
| 1422 | self.finish_empty_list_item(); |
| 1423 | if let Some(node_ix) = self.tree.peek_up() { |
| 1424 | if let ItemBody::List(_, _, _) = self.tree[node_ix].item.body { |
| 1425 | self.pop(ix); |
| 1426 | } |
| 1427 | } |
| 1428 | if self.last_line_blank { |
| 1429 | if let Some(node_ix) = self.tree.peek_grandparent() { |
| 1430 | if let ItemBody::List(ref mut is_tight, _, _) = self.tree[node_ix].item.body { |
| 1431 | *is_tight = false; |
| 1432 | } |
| 1433 | } |
| 1434 | self.last_line_blank = false; |
| 1435 | } |
| 1436 | } |
| 1437 | |
| 1438 | fn finish_empty_list_item(&mut self) { |
| 1439 | if let Some(begin_list_item) = self.begin_list_item { |
| 1440 | if self.last_line_blank { |
| 1441 | // A list item can begin with at most one blank line. |
| 1442 | if let Some(node_ix) = self.tree.peek_up() { |
| 1443 | if let ItemBody::ListItem(_) = self.tree[node_ix].item.body { |
| 1444 | self.pop(begin_list_item); |
| 1445 | } |
| 1446 | } |
| 1447 | } |
| 1448 | } |
| 1449 | self.begin_list_item = None; |
| 1450 | } |
| 1451 | |
| 1452 | /// Continue an existing list or start a new one if there's not an open |
| 1453 | /// list that matches. |
| 1454 | fn continue_list(&mut self, start: usize, ch: u8, index: u64) { |
| 1455 | self.finish_empty_list_item(); |
| 1456 | if let Some(node_ix) = self.tree.peek_up() { |
| 1457 | if let ItemBody::List(ref mut is_tight, existing_ch, _) = self.tree[node_ix].item.body { |
| 1458 | if existing_ch == ch { |
| 1459 | if self.last_line_blank { |
| 1460 | *is_tight = false; |
| 1461 | self.last_line_blank = false; |
| 1462 | } |
| 1463 | return; |
| 1464 | } |
| 1465 | } |
| 1466 | // TODO: this is not the best choice for end; maybe get end from last list item. |
| 1467 | self.finish_list(start); |
| 1468 | } |
| 1469 | self.tree.append(Item { |
| 1470 | start, |
| 1471 | end: 0, // will get set later |
| 1472 | body: ItemBody::List(true, ch, index), |
| 1473 | }); |
| 1474 | self.tree.push(); |
| 1475 | self.last_line_blank = false; |
| 1476 | } |
| 1477 | |
| 1478 | /// Parse a thematic break. |
| 1479 | /// |
| 1480 | /// Returns index of start of next line. |
| 1481 | fn parse_hrule(&mut self, hrule_size: usize, ix: usize) -> usize { |
| 1482 | self.tree.append(Item { |
| 1483 | start: ix, |
| 1484 | end: ix + hrule_size, |
| 1485 | body: ItemBody::Rule, |
| 1486 | }); |
| 1487 | ix + hrule_size |
| 1488 | } |
| 1489 | |
| 1490 | /// Parse an ATX heading. |
| 1491 | /// |
| 1492 | /// Returns index of start of next line. |
| 1493 | fn parse_atx_heading(&mut self, start: usize, atx_level: HeadingLevel) -> usize { |
| 1494 | let mut ix = start; |
| 1495 | let heading_ix = self.tree.append(Item { |
| 1496 | start, |
| 1497 | end: 0, // set later |
| 1498 | body: ItemBody::default(), // set later |
| 1499 | }); |
| 1500 | ix += atx_level as usize; |
| 1501 | // next char is space or eol (guaranteed by scan_atx_heading) |
| 1502 | let bytes = self.text.as_bytes(); |
| 1503 | if let Some(eol_bytes) = scan_eol(&bytes[ix..]) { |
| 1504 | self.tree[heading_ix].item.end = ix + eol_bytes; |
| 1505 | self.tree[heading_ix].item.body = ItemBody::Heading(atx_level, None); |
| 1506 | return ix + eol_bytes; |
| 1507 | } |
| 1508 | // skip leading spaces |
| 1509 | let skip_spaces = scan_whitespace_no_nl(&bytes[ix..]); |
| 1510 | ix += skip_spaces; |
| 1511 | |
| 1512 | // now handle the header text |
| 1513 | let header_start = ix; |
| 1514 | let header_node_idx = self.tree.push(); // so that we can set the endpoint later |
| 1515 | |
| 1516 | // trim the trailing attribute block before parsing the entire line, if necessary |
| 1517 | let (end, content_end, attrs) = if self.options.contains(Options::ENABLE_HEADING_ATTRIBUTES) |
| 1518 | { |
| 1519 | // the start of the next line is the end of the header since the |
| 1520 | // header cannot have line breaks |
| 1521 | let header_end = header_start + scan_nextline(&bytes[header_start..]); |
| 1522 | let (content_end, attrs) = |
| 1523 | self.extract_and_parse_heading_attribute_block(header_start, header_end); |
| 1524 | self.parse_line(ix, Some(content_end), TableParseMode::Disabled); |
| 1525 | (header_end, content_end, attrs) |
| 1526 | } else { |
| 1527 | let (line_ix, line_brk) = self.parse_line(ix, None, TableParseMode::Disabled); |
| 1528 | ix = line_ix; |
| 1529 | // Backslash at end is actually hard line break |
| 1530 | if let Some(Item { |
| 1531 | start, |
| 1532 | end, |
| 1533 | body: ItemBody::HardBreak(true), |
| 1534 | }) = line_brk |
| 1535 | { |
| 1536 | self.tree.append_text(start, end, false); |
| 1537 | } |
| 1538 | (ix, ix, None) |
| 1539 | }; |
| 1540 | self.tree[header_node_idx].item.end = end; |
| 1541 | |
| 1542 | // remove trailing matter from header text |
| 1543 | let mut empty_text_node = false; |
| 1544 | if let Some(cur_ix) = self.tree.cur() { |
| 1545 | // remove closing of the ATX heading |
| 1546 | let header_text = &bytes[header_start..content_end]; |
| 1547 | let mut limit = header_text |
| 1548 | .iter() |
| 1549 | .rposition(|&b| !(b == b' \n' || b == b' \r' || b == b' ' )) |
| 1550 | .map_or(0, |i| i + 1); |
| 1551 | let closer = header_text[..limit] |
| 1552 | .iter() |
| 1553 | .rposition(|&b| b != b'#' ) |
| 1554 | .map_or(0, |i| i + 1); |
| 1555 | if closer == 0 { |
| 1556 | limit = closer; |
| 1557 | } else { |
| 1558 | let spaces = scan_rev_while(&header_text[..closer], |b| b == b' ' ); |
| 1559 | if spaces > 0 { |
| 1560 | limit = closer - spaces; |
| 1561 | } |
| 1562 | } |
| 1563 | // if text is only spaces, then remove them |
| 1564 | self.tree[cur_ix].item.end = limit + header_start; |
| 1565 | |
| 1566 | // limit = 0 when text is empty after removing spaces |
| 1567 | if limit == 0 { |
| 1568 | empty_text_node = true; |
| 1569 | } |
| 1570 | } |
| 1571 | |
| 1572 | if empty_text_node { |
| 1573 | self.tree.remove_node(); |
| 1574 | } else { |
| 1575 | self.tree.pop(); |
| 1576 | } |
| 1577 | self.tree[heading_ix].item.body = ItemBody::Heading( |
| 1578 | atx_level, |
| 1579 | attrs.map(|attrs| self.allocs.allocate_heading(attrs)), |
| 1580 | ); |
| 1581 | |
| 1582 | end |
| 1583 | } |
| 1584 | |
| 1585 | /// Returns the number of bytes scanned on success. |
| 1586 | fn parse_footnote(&mut self, start: usize) -> Option<usize> { |
| 1587 | let bytes = &self.text.as_bytes()[start..]; |
| 1588 | if !bytes.starts_with(b"[^" ) { |
| 1589 | return None; |
| 1590 | } |
| 1591 | let (mut i, label) = if self.options.has_gfm_footnotes() { |
| 1592 | // GitHub doesn't allow footnote definition labels to contain line breaks. |
| 1593 | // It actually does allow this for link definitions under certain circumstances, |
| 1594 | // but for this it's simpler to avoid it. |
| 1595 | scan_link_label_rest(&self.text[start + 2..], &|_| None, self.tree.is_in_table())? |
| 1596 | } else { |
| 1597 | self.parse_refdef_label(start + 2)? |
| 1598 | }; |
| 1599 | if self.options.has_gfm_footnotes() && label.bytes().any(|b| b == b' \r' || b == b' \n' ) { |
| 1600 | // GitHub doesn't allow footnote definition labels to contain line breaks, |
| 1601 | // even if they're escaped. |
| 1602 | return None; |
| 1603 | } |
| 1604 | i += 2; |
| 1605 | if scan_ch(&bytes[i..], b':' ) == 0 { |
| 1606 | return None; |
| 1607 | } |
| 1608 | i += 1; |
| 1609 | self.finish_list(start); |
| 1610 | if let Some(node_ix) = self.tree.peek_up() { |
| 1611 | if let ItemBody::FootnoteDefinition(..) = self.tree[node_ix].item.body { |
| 1612 | // finish previous footnote if it's still open |
| 1613 | self.pop(start); |
| 1614 | } |
| 1615 | } |
| 1616 | if self.options.has_gfm_footnotes() { |
| 1617 | i += scan_whitespace_no_nl(&bytes[i..]); |
| 1618 | } |
| 1619 | self.allocs.footdefs.0.insert( |
| 1620 | UniCase::new(label.clone()), |
| 1621 | FootnoteDef { use_count: 0 }, |
| 1622 | ); |
| 1623 | self.tree.append(Item { |
| 1624 | start, |
| 1625 | end: 0, // will get set later |
| 1626 | // TODO: check whether the label here is strictly necessary |
| 1627 | body: ItemBody::FootnoteDefinition(self.allocs.allocate_cow(label)), |
| 1628 | }); |
| 1629 | self.tree.push(); |
| 1630 | Some(i) |
| 1631 | } |
| 1632 | |
| 1633 | /// Tries to parse a reference label, which can be interrupted by new blocks. |
| 1634 | /// On success, returns the number of bytes of the label and the label itself. |
| 1635 | fn parse_refdef_label(&self, start: usize) -> Option<(usize, CowStr<'a>)> { |
| 1636 | scan_link_label_rest( |
| 1637 | &self.text[start..], |
| 1638 | &|bytes| { |
| 1639 | let mut line_start = LineStart::new(bytes); |
| 1640 | let current_container = scan_containers( |
| 1641 | &self.tree, |
| 1642 | &mut line_start, |
| 1643 | self.options.has_gfm_footnotes(), |
| 1644 | ) == self.tree.spine_len(); |
| 1645 | if line_start.scan_space(4) { |
| 1646 | return Some(line_start.bytes_scanned()); |
| 1647 | } |
| 1648 | let bytes_scanned = line_start.bytes_scanned(); |
| 1649 | let suffix = &bytes[bytes_scanned..]; |
| 1650 | if self.scan_paragraph_interrupt(suffix, current_container) |
| 1651 | || (current_container && scan_setext_heading(suffix).is_some()) |
| 1652 | { |
| 1653 | None |
| 1654 | } else { |
| 1655 | Some(bytes_scanned) |
| 1656 | } |
| 1657 | }, |
| 1658 | self.tree.is_in_table(), |
| 1659 | ) |
| 1660 | } |
| 1661 | |
| 1662 | /// Returns number of bytes scanned, label and definition on success. |
| 1663 | fn parse_refdef_total(&mut self, start: usize) -> Option<(usize, LinkLabel<'a>, LinkDef<'a>)> { |
| 1664 | let bytes = &self.text.as_bytes()[start..]; |
| 1665 | if scan_ch(bytes, b'[' ) == 0 { |
| 1666 | return None; |
| 1667 | } |
| 1668 | let (mut i, label) = self.parse_refdef_label(start + 1)?; |
| 1669 | i += 1; |
| 1670 | if scan_ch(&bytes[i..], b':' ) == 0 { |
| 1671 | return None; |
| 1672 | } |
| 1673 | i += 1; |
| 1674 | let (bytecount, link_def) = self.scan_refdef(start, start + i)?; |
| 1675 | Some((bytecount + i, UniCase::new(label), link_def)) |
| 1676 | } |
| 1677 | |
| 1678 | /// Returns number of bytes and number of newlines |
| 1679 | fn scan_refdef_space(&self, bytes: &[u8], mut i: usize) -> Option<(usize, usize)> { |
| 1680 | let mut newlines = 0; |
| 1681 | loop { |
| 1682 | let whitespaces = scan_whitespace_no_nl(&bytes[i..]); |
| 1683 | i += whitespaces; |
| 1684 | if let Some(eol_bytes) = scan_eol(&bytes[i..]) { |
| 1685 | i += eol_bytes; |
| 1686 | newlines += 1; |
| 1687 | if newlines > 1 { |
| 1688 | return None; |
| 1689 | } |
| 1690 | } else { |
| 1691 | break; |
| 1692 | } |
| 1693 | let mut line_start = LineStart::new(&bytes[i..]); |
| 1694 | let current_container = scan_containers( |
| 1695 | &self.tree, |
| 1696 | &mut line_start, |
| 1697 | self.options.has_gfm_footnotes(), |
| 1698 | ) == self.tree.spine_len(); |
| 1699 | if !line_start.scan_space(4) { |
| 1700 | let suffix = &bytes[i + line_start.bytes_scanned()..]; |
| 1701 | if self.scan_paragraph_interrupt(suffix, current_container) |
| 1702 | || scan_setext_heading(suffix).is_some() |
| 1703 | { |
| 1704 | return None; |
| 1705 | } |
| 1706 | } |
| 1707 | i += line_start.bytes_scanned(); |
| 1708 | } |
| 1709 | Some((i, newlines)) |
| 1710 | } |
| 1711 | |
| 1712 | // returns (bytelength, title_str) |
| 1713 | fn scan_refdef_title<'t>(&self, text: &'t str) -> Option<(usize, CowStr<'t>)> { |
| 1714 | let bytes = text.as_bytes(); |
| 1715 | let closing_delim = match bytes.first()? { |
| 1716 | b' \'' => b' \'' , |
| 1717 | b'"' => b'"' , |
| 1718 | b'(' => b')' , |
| 1719 | _ => return None, |
| 1720 | }; |
| 1721 | let mut bytecount = 1; |
| 1722 | let mut linestart = 1; |
| 1723 | |
| 1724 | let mut linebuf = None; |
| 1725 | |
| 1726 | while let Some(&c) = bytes.get(bytecount) { |
| 1727 | match c { |
| 1728 | b'(' if closing_delim == b')' => { |
| 1729 | // https://spec.commonmark.org/0.30/#link-title |
| 1730 | // a sequence of zero or more characters between matching parentheses ((...)), |
| 1731 | // including a ( or ) character only if it is backslash-escaped. |
| 1732 | return None; |
| 1733 | } |
| 1734 | b' \n' | b' \r' => { |
| 1735 | // push text to line buffer |
| 1736 | // this is used to strip the block formatting: |
| 1737 | // |
| 1738 | // > [first]: http://example.com " |
| 1739 | // > second" |
| 1740 | // |
| 1741 | // should get turned into `<a href="example.com" title=" second">first</a>` |
| 1742 | let linebuf = if let Some(linebuf) = &mut linebuf { |
| 1743 | linebuf |
| 1744 | } else { |
| 1745 | linebuf = Some(String::new()); |
| 1746 | linebuf.as_mut().unwrap() |
| 1747 | }; |
| 1748 | linebuf.push_str(&text[linestart..bytecount]); |
| 1749 | linebuf.push(' \n' ); // normalize line breaks |
| 1750 | // skip line break |
| 1751 | bytecount += 1; |
| 1752 | if c == b' \r' && bytes.get(bytecount) == Some(&b' \n' ) { |
| 1753 | bytecount += 1; |
| 1754 | } |
| 1755 | let mut line_start = LineStart::new(&bytes[bytecount..]); |
| 1756 | let current_container = scan_containers( |
| 1757 | &self.tree, |
| 1758 | &mut line_start, |
| 1759 | self.options.has_gfm_footnotes(), |
| 1760 | ) == self.tree.spine_len(); |
| 1761 | if !line_start.scan_space(4) { |
| 1762 | let suffix = &bytes[bytecount + line_start.bytes_scanned()..]; |
| 1763 | if self.scan_paragraph_interrupt(suffix, current_container) |
| 1764 | || scan_setext_heading(suffix).is_some() |
| 1765 | { |
| 1766 | return None; |
| 1767 | } |
| 1768 | } |
| 1769 | line_start.scan_all_space(); |
| 1770 | bytecount += line_start.bytes_scanned(); |
| 1771 | linestart = bytecount; |
| 1772 | if scan_blank_line(&bytes[bytecount..]).is_some() { |
| 1773 | // blank line - not allowed |
| 1774 | return None; |
| 1775 | } |
| 1776 | } |
| 1777 | b' \\' => { |
| 1778 | bytecount += 1; |
| 1779 | if let Some(c) = bytes.get(bytecount) { |
| 1780 | if c != &b' \r' && c != &b' \n' { |
| 1781 | bytecount += 1; |
| 1782 | } |
| 1783 | } |
| 1784 | } |
| 1785 | c if c == closing_delim => { |
| 1786 | let cow = if let Some(mut linebuf) = linebuf { |
| 1787 | linebuf.push_str(&text[linestart..bytecount]); |
| 1788 | CowStr::from(linebuf) |
| 1789 | } else { |
| 1790 | CowStr::from(&text[linestart..bytecount]) |
| 1791 | }; |
| 1792 | return Some((bytecount + 1, cow)); |
| 1793 | } |
| 1794 | _ => { |
| 1795 | bytecount += 1; |
| 1796 | } |
| 1797 | } |
| 1798 | } |
| 1799 | None |
| 1800 | } |
| 1801 | |
| 1802 | /// Returns # of bytes and definition. |
| 1803 | /// Assumes the label of the reference including colon has already been scanned. |
| 1804 | fn scan_refdef(&self, span_start: usize, start: usize) -> Option<(usize, LinkDef<'a>)> { |
| 1805 | let bytes = self.text.as_bytes(); |
| 1806 | |
| 1807 | // whitespace between label and url (including up to one newline) |
| 1808 | let (mut i, _newlines) = self.scan_refdef_space(bytes, start)?; |
| 1809 | |
| 1810 | // scan link dest |
| 1811 | let (dest_length, dest) = scan_link_dest(self.text, i, LINK_MAX_NESTED_PARENS)?; |
| 1812 | if dest_length == 0 { |
| 1813 | return None; |
| 1814 | } |
| 1815 | let dest = unescape(dest, self.tree.is_in_table()); |
| 1816 | i += dest_length; |
| 1817 | |
| 1818 | // no title |
| 1819 | let mut backup = ( |
| 1820 | i - start, |
| 1821 | LinkDef { |
| 1822 | dest, |
| 1823 | title: None, |
| 1824 | span: span_start..i, |
| 1825 | }, |
| 1826 | ); |
| 1827 | |
| 1828 | // scan whitespace between dest and label |
| 1829 | let (mut i, newlines) = |
| 1830 | if let Some((new_i, mut newlines)) = self.scan_refdef_space(bytes, i) { |
| 1831 | if i == self.text.len() { |
| 1832 | newlines += 1; |
| 1833 | } |
| 1834 | if new_i == i && newlines == 0 { |
| 1835 | return None; |
| 1836 | } |
| 1837 | if newlines > 1 { |
| 1838 | return Some(backup); |
| 1839 | }; |
| 1840 | (new_i, newlines) |
| 1841 | } else { |
| 1842 | return Some(backup); |
| 1843 | }; |
| 1844 | |
| 1845 | // scan title |
| 1846 | // if this fails but newline == 1, return also a refdef without title |
| 1847 | if let Some((title_length, title)) = self.scan_refdef_title(&self.text[i..]) { |
| 1848 | i += title_length; |
| 1849 | if scan_blank_line(&bytes[i..]).is_some() { |
| 1850 | backup.0 = i - start; |
| 1851 | backup.1.span = span_start..i; |
| 1852 | backup.1.title = Some(unescape(title, self.tree.is_in_table())); |
| 1853 | return Some(backup); |
| 1854 | } |
| 1855 | } |
| 1856 | if newlines > 0 { |
| 1857 | Some(backup) |
| 1858 | } else { |
| 1859 | None |
| 1860 | } |
| 1861 | } |
| 1862 | |
| 1863 | /// Checks whether we should break a paragraph on the given input. |
| 1864 | fn scan_paragraph_interrupt(&self, bytes: &[u8], current_container: bool) -> bool { |
| 1865 | let has_footnote = self.options.contains(Options::ENABLE_FOOTNOTES); |
| 1866 | if scan_paragraph_interrupt_no_table(bytes, current_container, has_footnote, &self.tree) { |
| 1867 | return true; |
| 1868 | } |
| 1869 | // pulldown-cmark allows heavy tables, that have a `|` on the header row, |
| 1870 | // to interrupt paragraphs. |
| 1871 | // |
| 1872 | // ```markdown |
| 1873 | // This is a table |
| 1874 | // | a | b | c | |
| 1875 | // |---|---|---| |
| 1876 | // | d | e | f | |
| 1877 | // |
| 1878 | // This is not a table |
| 1879 | // a | b | c |
| 1880 | // ---|---|--- |
| 1881 | // d | e | f |
| 1882 | // ``` |
| 1883 | if !self.options.contains(Options::ENABLE_TABLES) || !bytes.starts_with(b"|" ) { |
| 1884 | return false; |
| 1885 | } |
| 1886 | |
| 1887 | // Checking if something's a valid table or not requires looking at two lines. |
| 1888 | // First line, count unescaped pipes. |
| 1889 | let mut pipes = 0; |
| 1890 | let mut next_line_ix = 0; |
| 1891 | let mut bsesc = false; |
| 1892 | let mut last_pipe_ix = 0; |
| 1893 | for (i, &byte) in bytes.iter().enumerate() { |
| 1894 | match byte { |
| 1895 | b' \\' => { |
| 1896 | bsesc = true; |
| 1897 | continue; |
| 1898 | } |
| 1899 | b'|' if !bsesc => { |
| 1900 | pipes += 1; |
| 1901 | last_pipe_ix = i; |
| 1902 | } |
| 1903 | b' \r' | b' \n' => { |
| 1904 | next_line_ix = i + scan_eol(&bytes[i..]).unwrap(); |
| 1905 | break; |
| 1906 | } |
| 1907 | _ => {} |
| 1908 | } |
| 1909 | bsesc = false; |
| 1910 | } |
| 1911 | |
| 1912 | // scan_eol can't return 0, so this can't be zero |
| 1913 | if next_line_ix == 0 { |
| 1914 | return false; |
| 1915 | } |
| 1916 | |
| 1917 | // Scan the table head. The part that looks like: |
| 1918 | // |
| 1919 | // |---|---|---| |
| 1920 | // |
| 1921 | // Also scan any containing items, since it's on its own line, and |
| 1922 | // might be nested inside a block quote or something |
| 1923 | // |
| 1924 | // > Table: First |
| 1925 | // > | first col | second col | |
| 1926 | // > |-----------|------------| |
| 1927 | // ^ |
| 1928 | // | need to skip over the `>` when checking for the table |
| 1929 | let mut line_start = LineStart::new(&bytes[next_line_ix..]); |
| 1930 | if scan_containers( |
| 1931 | &self.tree, |
| 1932 | &mut line_start, |
| 1933 | self.options.has_gfm_footnotes(), |
| 1934 | ) != self.tree.spine_len() |
| 1935 | { |
| 1936 | return false; |
| 1937 | } |
| 1938 | let table_head_ix = next_line_ix + line_start.bytes_scanned(); |
| 1939 | let (table_head_bytes, alignment) = scan_table_head(&bytes[table_head_ix..]); |
| 1940 | |
| 1941 | if table_head_bytes == 0 { |
| 1942 | return false; |
| 1943 | } |
| 1944 | |
| 1945 | // computing header count from number of pipes |
| 1946 | let header_count = count_header_cols(bytes, pipes, 0, last_pipe_ix); |
| 1947 | |
| 1948 | // make sure they match the number of columns we find in separator line |
| 1949 | alignment.len() == header_count |
| 1950 | } |
| 1951 | |
| 1952 | /// Extracts and parses a heading attribute block if exists. |
| 1953 | /// |
| 1954 | /// Returns `(end_offset_of_heading_content, (id, classes))`. |
| 1955 | /// |
| 1956 | /// If `header_end` is less than or equal to `header_start`, the given |
| 1957 | /// input is considered as empty. |
| 1958 | fn extract_and_parse_heading_attribute_block( |
| 1959 | &mut self, |
| 1960 | header_start: usize, |
| 1961 | header_end: usize, |
| 1962 | ) -> (usize, Option<HeadingAttributes<'a>>) { |
| 1963 | if !self.options.contains(Options::ENABLE_HEADING_ATTRIBUTES) { |
| 1964 | return (header_end, None); |
| 1965 | } |
| 1966 | |
| 1967 | // extract the trailing attribute block |
| 1968 | let header_bytes = &self.text.as_bytes()[header_start..header_end]; |
| 1969 | let (content_len, attr_block_range_rel) = |
| 1970 | extract_attribute_block_content_from_header_text(header_bytes); |
| 1971 | let content_end = header_start + content_len; |
| 1972 | let attrs = attr_block_range_rel.and_then(|r| { |
| 1973 | parse_inside_attribute_block( |
| 1974 | &self.text[(header_start + r.start)..(header_start + r.end)], |
| 1975 | ) |
| 1976 | }); |
| 1977 | (content_end, attrs) |
| 1978 | } |
| 1979 | } |
| 1980 | |
| 1981 | /// Scanning modes for `Parser`'s `parse_line` method. |
| 1982 | #[derive (PartialEq, Eq, Copy, Clone)] |
| 1983 | enum TableParseMode { |
| 1984 | /// Inside a paragraph, scanning for table headers. |
| 1985 | Scan, |
| 1986 | /// Inside a table. |
| 1987 | Active, |
| 1988 | /// Inside a paragraph, not scanning for table headers. |
| 1989 | Disabled, |
| 1990 | } |
| 1991 | |
| 1992 | /// Computes the number of header columns in a table line by computing the number of dividing pipes |
| 1993 | /// that aren't followed or preceded by whitespace. |
| 1994 | fn count_header_cols( |
| 1995 | bytes: &[u8], |
| 1996 | mut pipes: usize, |
| 1997 | mut start: usize, |
| 1998 | last_pipe_ix: usize, |
| 1999 | ) -> usize { |
| 2000 | // was first pipe preceded by whitespace? if so, subtract one |
| 2001 | start += scan_whitespace_no_nl(&bytes[start..]); |
| 2002 | if bytes[start] == b'|' { |
| 2003 | pipes -= 1; |
| 2004 | } |
| 2005 | |
| 2006 | // was last pipe followed by whitespace? if so, sub one |
| 2007 | if scan_blank_line(&bytes[(last_pipe_ix + 1)..]).is_some() { |
| 2008 | pipes |
| 2009 | } else { |
| 2010 | pipes + 1 |
| 2011 | } |
| 2012 | } |
| 2013 | |
| 2014 | /// Checks whether we should break a paragraph on the given input. |
| 2015 | /// |
| 2016 | /// Use `FirstPass::scan_paragraph_interrupt` in any context that allows |
| 2017 | /// tables to interrupt the paragraph. |
| 2018 | fn scan_paragraph_interrupt_no_table( |
| 2019 | bytes: &[u8], |
| 2020 | current_container: bool, |
| 2021 | has_footnote: bool, |
| 2022 | tree: &Tree<Item>, |
| 2023 | ) -> bool { |
| 2024 | scan_eol(bytes).is_some() |
| 2025 | || scan_hrule(bytes).is_ok() |
| 2026 | || scan_atx_heading(bytes).is_some() |
| 2027 | || scan_code_fence(bytes).is_some() |
| 2028 | || scan_blockquote_start(bytes).is_some() |
| 2029 | || scan_listitem(bytes).map_or(false, |(ix, delim, index, _)| { |
| 2030 | ! current_container || |
| 2031 | tree.is_in_table() || |
| 2032 | // we don't allow interruption by either empty lists or |
| 2033 | // numbered lists starting at an index other than 1 |
| 2034 | (delim == b'*' || delim == b'-' || delim == b'+' || index == 1) |
| 2035 | && (scan_blank_line(&bytes[ix..]).is_none()) |
| 2036 | }) |
| 2037 | || bytes.starts_with(b"<" ) |
| 2038 | && (get_html_end_tag(&bytes[1..]).is_some() || starts_html_block_type_6(&bytes[1..])) |
| 2039 | || (has_footnote |
| 2040 | && bytes.starts_with(b"[^" ) |
| 2041 | && scan_link_label_rest( |
| 2042 | std::str::from_utf8(&bytes[2..]).unwrap(), |
| 2043 | &|_| None, |
| 2044 | tree.is_in_table(), |
| 2045 | ) |
| 2046 | .map_or(false, |(len, _)| bytes.get(2 + len) == Some(&b':' ))) |
| 2047 | } |
| 2048 | |
| 2049 | /// Assumes `text_bytes` is preceded by `<`. |
| 2050 | fn get_html_end_tag(text_bytes: &[u8]) -> Option<&'static str> { |
| 2051 | static BEGIN_TAGS: &[&[u8]; 4] = &[b"pre" , b"style" , b"script" , b"textarea" ]; |
| 2052 | static ST_BEGIN_TAGS: &[&[u8]; 3] = &[b"!--" , b"?" , b"![CDATA[" ]; |
| 2053 | |
| 2054 | for (beg_tag, end_tag) in BEGIN_TAGS |
| 2055 | .iter() |
| 2056 | .zip(["</pre>" , "</style>" , "</script>" , "</textarea>" ].iter()) |
| 2057 | { |
| 2058 | let tag_len = beg_tag.len(); |
| 2059 | |
| 2060 | if text_bytes.len() < tag_len { |
| 2061 | // begin tags are increasing in size |
| 2062 | break; |
| 2063 | } |
| 2064 | |
| 2065 | if !text_bytes[..tag_len].eq_ignore_ascii_case(beg_tag) { |
| 2066 | continue; |
| 2067 | } |
| 2068 | |
| 2069 | // Must either be the end of the line... |
| 2070 | if text_bytes.len() == tag_len { |
| 2071 | return Some(end_tag); |
| 2072 | } |
| 2073 | |
| 2074 | // ...or be followed by whitespace, newline, or '>'. |
| 2075 | let s = text_bytes[tag_len]; |
| 2076 | if is_ascii_whitespace(s) || s == b'>' { |
| 2077 | return Some(end_tag); |
| 2078 | } |
| 2079 | } |
| 2080 | |
| 2081 | for (beg_tag, end_tag) in ST_BEGIN_TAGS.iter().zip(["-->" , "?>" , "]]>" ].iter()) { |
| 2082 | if text_bytes.starts_with(beg_tag) { |
| 2083 | return Some(end_tag); |
| 2084 | } |
| 2085 | } |
| 2086 | |
| 2087 | if text_bytes.len() > 1 && text_bytes[0] == b'!' && text_bytes[1].is_ascii_alphabetic() { |
| 2088 | Some(">" ) |
| 2089 | } else { |
| 2090 | None |
| 2091 | } |
| 2092 | } |
| 2093 | |
| 2094 | // https://english.stackexchange.com/a/285573 |
| 2095 | fn surgerize_tight_list(tree: &mut Tree<Item>, list_ix: TreeIndex) { |
| 2096 | let mut list_item = tree[list_ix].child; |
| 2097 | while let Some(listitem_ix) = list_item { |
| 2098 | // first child is special, controls how we repoint list_item.child |
| 2099 | let list_item_firstborn = tree[listitem_ix].child; |
| 2100 | |
| 2101 | // Check that list item has children - this is not necessarily the case! |
| 2102 | if let Some(firstborn_ix) = list_item_firstborn { |
| 2103 | if let ItemBody::Paragraph = tree[firstborn_ix].item.body { |
| 2104 | tree[listitem_ix].child = tree[firstborn_ix].child; |
| 2105 | } |
| 2106 | |
| 2107 | let mut list_item_child = Some(firstborn_ix); |
| 2108 | let mut node_to_repoint = None; |
| 2109 | while let Some(child_ix) = list_item_child { |
| 2110 | // surgerize paragraphs |
| 2111 | let repoint_ix = if let ItemBody::Paragraph = tree[child_ix].item.body { |
| 2112 | if let Some(child_firstborn) = tree[child_ix].child { |
| 2113 | if let Some(repoint_ix) = node_to_repoint { |
| 2114 | tree[repoint_ix].next = Some(child_firstborn); |
| 2115 | } |
| 2116 | let mut child_lastborn = child_firstborn; |
| 2117 | while let Some(lastborn_next_ix) = tree[child_lastborn].next { |
| 2118 | child_lastborn = lastborn_next_ix; |
| 2119 | } |
| 2120 | child_lastborn |
| 2121 | } else { |
| 2122 | child_ix |
| 2123 | } |
| 2124 | } else { |
| 2125 | child_ix |
| 2126 | }; |
| 2127 | |
| 2128 | node_to_repoint = Some(repoint_ix); |
| 2129 | tree[repoint_ix].next = tree[child_ix].next; |
| 2130 | list_item_child = tree[child_ix].next; |
| 2131 | } |
| 2132 | } |
| 2133 | |
| 2134 | list_item = tree[listitem_ix].next; |
| 2135 | } |
| 2136 | } |
| 2137 | |
| 2138 | /// Determines whether the delimiter run starting at given index is |
| 2139 | /// left-flanking, as defined by the commonmark spec (and isn't intraword |
| 2140 | /// for _ delims). |
| 2141 | /// suffix is &s[ix..], which is passed in as an optimization, since taking |
| 2142 | /// a string subslice is O(n). |
| 2143 | fn delim_run_can_open( |
| 2144 | s: &str, |
| 2145 | suffix: &str, |
| 2146 | run_len: usize, |
| 2147 | ix: usize, |
| 2148 | mode: TableParseMode, |
| 2149 | ) -> bool { |
| 2150 | let next_char = if let Some(c) = suffix.chars().nth(run_len) { |
| 2151 | c |
| 2152 | } else { |
| 2153 | return false; |
| 2154 | }; |
| 2155 | if next_char.is_whitespace() { |
| 2156 | return false; |
| 2157 | } |
| 2158 | if ix == 0 { |
| 2159 | return true; |
| 2160 | } |
| 2161 | if mode == TableParseMode::Active { |
| 2162 | if s[..ix].ends_with('|' ) && !s[..ix].ends_with(r"\|" ) { |
| 2163 | return true; |
| 2164 | } |
| 2165 | if next_char == '|' { |
| 2166 | return false; |
| 2167 | } |
| 2168 | } |
| 2169 | let delim = suffix.chars().next().unwrap(); |
| 2170 | // `*` and `~~` can be intraword, `_` and `~` cannot |
| 2171 | if delim == '*' && !is_punctuation(next_char) { |
| 2172 | return true; |
| 2173 | } |
| 2174 | if delim == '~' && run_len > 1 { |
| 2175 | return true; |
| 2176 | } |
| 2177 | let prev_char = s[..ix].chars().last().unwrap(); |
| 2178 | if delim == '~' && prev_char == '~' && !is_punctuation(next_char) { |
| 2179 | return true; |
| 2180 | } |
| 2181 | |
| 2182 | prev_char.is_whitespace() |
| 2183 | || is_punctuation(prev_char) && (delim != ' \'' || ![']' , ')' ].contains(&prev_char)) |
| 2184 | } |
| 2185 | |
| 2186 | /// Determines whether the delimiter run starting at given index is |
| 2187 | /// right-flanking, as defined by the commonmark spec (and isn't intraword |
| 2188 | /// for _ delims) |
| 2189 | fn delim_run_can_close( |
| 2190 | s: &str, |
| 2191 | suffix: &str, |
| 2192 | run_len: usize, |
| 2193 | ix: usize, |
| 2194 | mode: TableParseMode, |
| 2195 | ) -> bool { |
| 2196 | if ix == 0 { |
| 2197 | return false; |
| 2198 | } |
| 2199 | let prev_char = s[..ix].chars().last().unwrap(); |
| 2200 | if prev_char.is_whitespace() { |
| 2201 | return false; |
| 2202 | } |
| 2203 | let next_char = if let Some(c) = suffix.chars().nth(run_len) { |
| 2204 | c |
| 2205 | } else { |
| 2206 | return true; |
| 2207 | }; |
| 2208 | if mode == TableParseMode::Active { |
| 2209 | if s[..ix].ends_with('|' ) && !s[..ix].ends_with(r"\|" ) { |
| 2210 | return false; |
| 2211 | } |
| 2212 | if next_char == '|' { |
| 2213 | return true; |
| 2214 | } |
| 2215 | } |
| 2216 | let delim = suffix.chars().next().unwrap(); |
| 2217 | // `*` and `~~` can be intraword, `_` and `~` cannot |
| 2218 | if (delim == '*' || (delim == '~' && run_len > 1)) && !is_punctuation(prev_char) { |
| 2219 | return true; |
| 2220 | } |
| 2221 | if delim == '~' && prev_char == '~' { |
| 2222 | return true; |
| 2223 | } |
| 2224 | |
| 2225 | next_char.is_whitespace() || is_punctuation(next_char) |
| 2226 | } |
| 2227 | |
| 2228 | fn create_lut(options: &Options) -> LookupTable { |
| 2229 | #[cfg (all(target_arch = "x86_64" , feature = "simd" ))] |
| 2230 | { |
| 2231 | LookupTable { |
| 2232 | simd: simd::compute_lookup(options), |
| 2233 | scalar: special_bytes(options), |
| 2234 | } |
| 2235 | } |
| 2236 | #[cfg (not(all(target_arch = "x86_64" , feature = "simd" )))] |
| 2237 | { |
| 2238 | special_bytes(options) |
| 2239 | } |
| 2240 | } |
| 2241 | |
| 2242 | fn special_bytes(options: &Options) -> [bool; 256] { |
| 2243 | let mut bytes = [false; 256]; |
| 2244 | let standard_bytes = [ |
| 2245 | b' \n' , b' \r' , b'*' , b'_' , b'&' , b' \\' , b'[' , b']' , b'<' , b'!' , b'`' , |
| 2246 | ]; |
| 2247 | |
| 2248 | for &byte in &standard_bytes { |
| 2249 | bytes[byte as usize] = true; |
| 2250 | } |
| 2251 | if options.contains(Options::ENABLE_TABLES) { |
| 2252 | bytes[b'|' as usize] = true; |
| 2253 | } |
| 2254 | if options.contains(Options::ENABLE_STRIKETHROUGH) { |
| 2255 | bytes[b'~' as usize] = true; |
| 2256 | } |
| 2257 | if options.contains(Options::ENABLE_MATH) { |
| 2258 | bytes[b'$' as usize] = true; |
| 2259 | bytes[b'{' as usize] = true; |
| 2260 | bytes[b'}' as usize] = true; |
| 2261 | } |
| 2262 | if options.contains(Options::ENABLE_SMART_PUNCTUATION) { |
| 2263 | for &byte in &[b'.' , b'-' , b'"' , b' \'' ] { |
| 2264 | bytes[byte as usize] = true; |
| 2265 | } |
| 2266 | } |
| 2267 | |
| 2268 | bytes |
| 2269 | } |
| 2270 | |
| 2271 | enum LoopInstruction<T> { |
| 2272 | /// Continue looking for more special bytes, but skip next few bytes. |
| 2273 | ContinueAndSkip(usize), |
| 2274 | /// Break looping immediately, returning with the given index and value. |
| 2275 | BreakAtWith(usize, T), |
| 2276 | } |
| 2277 | |
| 2278 | #[cfg (all(target_arch = "x86_64" , feature = "simd" ))] |
| 2279 | struct LookupTable { |
| 2280 | simd: [u8; 16], |
| 2281 | scalar: [bool; 256], |
| 2282 | } |
| 2283 | |
| 2284 | #[cfg (not(all(target_arch = "x86_64" , feature = "simd" )))] |
| 2285 | type LookupTable = [bool; 256]; |
| 2286 | |
| 2287 | /// This function walks the byte slices from the given index and |
| 2288 | /// calls the callback function on all bytes (and their indices) that are in the following set: |
| 2289 | /// `` ` ``, `\`, `&`, `*`, `_`, `~`, `!`, `<`, `[`, `]`, `|`, `\r`, `\n` |
| 2290 | /// It is guaranteed not call the callback on other bytes. |
| 2291 | /// Whenever `callback(ix, byte)` returns a `ContinueAndSkip(n)` value, the callback |
| 2292 | /// will not be called with an index that is less than `ix + n + 1`. |
| 2293 | /// When the callback returns a `BreakAtWith(end_ix, opt+val)`, no more callbacks will be |
| 2294 | /// called and the function returns immediately with the return value `(end_ix, opt_val)`. |
| 2295 | /// If `BreakAtWith(..)` is never returned, this function will return the first |
| 2296 | /// index that is outside the byteslice bound and a `None` value. |
| 2297 | fn iterate_special_bytes<F, T>( |
| 2298 | lut: &LookupTable, |
| 2299 | bytes: &[u8], |
| 2300 | ix: usize, |
| 2301 | callback: F, |
| 2302 | ) -> (usize, Option<T>) |
| 2303 | where |
| 2304 | F: FnMut(usize, u8) -> LoopInstruction<Option<T>>, |
| 2305 | { |
| 2306 | #[cfg (all(target_arch = "x86_64" , feature = "simd" ))] |
| 2307 | { |
| 2308 | simd::iterate_special_bytes(lut, bytes, ix, callback) |
| 2309 | } |
| 2310 | #[cfg (not(all(target_arch = "x86_64" , feature = "simd" )))] |
| 2311 | { |
| 2312 | scalar_iterate_special_bytes(lut, bytes, ix, callback) |
| 2313 | } |
| 2314 | } |
| 2315 | |
| 2316 | fn scalar_iterate_special_bytes<F, T>( |
| 2317 | lut: &[bool; 256], |
| 2318 | bytes: &[u8], |
| 2319 | mut ix: usize, |
| 2320 | mut callback: F, |
| 2321 | ) -> (usize, Option<T>) |
| 2322 | where |
| 2323 | F: FnMut(usize, u8) -> LoopInstruction<Option<T>>, |
| 2324 | { |
| 2325 | while ix < bytes.len() { |
| 2326 | let b: u8 = bytes[ix]; |
| 2327 | if lut[b as usize] { |
| 2328 | match callback(ix, b) { |
| 2329 | LoopInstruction::ContinueAndSkip(skip: usize) => { |
| 2330 | ix += skip; |
| 2331 | } |
| 2332 | LoopInstruction::BreakAtWith(ix: usize, val: Option) => { |
| 2333 | return (ix, val); |
| 2334 | } |
| 2335 | } |
| 2336 | } |
| 2337 | ix += 1; |
| 2338 | } |
| 2339 | |
| 2340 | (ix, None) |
| 2341 | } |
| 2342 | |
| 2343 | /// Split the usual heading content range and the content inside the trailing attribute block. |
| 2344 | /// |
| 2345 | /// Returns `(leading_content_len, Option<trailing_attr_block_range>)`. |
| 2346 | /// |
| 2347 | /// Note that `trailing_attr_block_range` will be empty range when the block |
| 2348 | /// is `{}`, since the range is content inside the wrapping `{` and `}`. |
| 2349 | /// |
| 2350 | /// The closing `}` of an attribute block can have trailing whitespaces. |
| 2351 | /// They are automatically trimmed when the attribute block is being searched. |
| 2352 | /// |
| 2353 | /// However, this method does not trim the trailing whitespaces of heading content. |
| 2354 | /// It is callers' responsibility to trim them if necessary. |
| 2355 | fn extract_attribute_block_content_from_header_text( |
| 2356 | heading: &[u8], |
| 2357 | ) -> (usize, Option<Range<usize>>) { |
| 2358 | let heading_len = heading.len(); |
| 2359 | let mut ix = heading_len; |
| 2360 | ix -= scan_rev_while(heading, |b| { |
| 2361 | b == b' \n' || b == b' \r' || b == b' ' || b == b' \t' |
| 2362 | }); |
| 2363 | if ix == 0 { |
| 2364 | return (heading_len, None); |
| 2365 | } |
| 2366 | |
| 2367 | let attr_block_close = ix - 1; |
| 2368 | if heading.get(attr_block_close) != Some(&b'}' ) { |
| 2369 | // The last character is not `}`. No attribute blocks found. |
| 2370 | return (heading_len, None); |
| 2371 | } |
| 2372 | // move cursor before the closing right brace (`}`) |
| 2373 | ix -= 1; |
| 2374 | |
| 2375 | ix -= scan_rev_while(&heading[..ix], |b| { |
| 2376 | // Characters to be excluded: |
| 2377 | // * `{` and `}`: special characters to open and close an attribute block. |
| 2378 | // * `\\`: a special character to escape many characters and disable some syntaxes. |
| 2379 | // + Handling of this escape character differs among markdown processors. |
| 2380 | // + Escaped characters will be separate text node from neighbors, so |
| 2381 | // it is not easy to handle unescaped string and trim the trailing block. |
| 2382 | // * `<` and `>`: special characters to start and end HTML tag. |
| 2383 | // + No known processors converts `{#<i>foo</i>}` into |
| 2384 | // `id="<i>foo</>"` as of this writing, so hopefully |
| 2385 | // this restriction won't cause compatibility issues. |
| 2386 | // * `\n` and `\r`: a newline character. |
| 2387 | // + Setext heading can have multiple lines. However it is hard to support |
| 2388 | // attribute blocks that have newline inside, since the parsing proceeds line by |
| 2389 | // line and lines will be separate nodes even they are logically a single text. |
| 2390 | !matches!(b, b'{' | b'}' | b'<' | b'>' | b' \\' | b' \n' | b' \r' ) |
| 2391 | }); |
| 2392 | if ix == 0 { |
| 2393 | // `{` is not found. No attribute blocks available. |
| 2394 | return (heading_len, None); |
| 2395 | } |
| 2396 | let attr_block_open = ix - 1; |
| 2397 | if heading[attr_block_open] != b'{' { |
| 2398 | // `{` is not found. No attribute blocks available. |
| 2399 | return (heading_len, None); |
| 2400 | } |
| 2401 | |
| 2402 | (attr_block_open, Some(ix..attr_block_close)) |
| 2403 | } |
| 2404 | |
| 2405 | /// Parses an attribute block content, such as `.class1 #id .class2`. |
| 2406 | /// |
| 2407 | /// Returns `(id, classes)`. |
| 2408 | /// |
| 2409 | /// It is callers' responsibility to find opening and closing characters of the attribute |
| 2410 | /// block. Usually [`extract_attribute_block_content_from_header_text`] function does it for you. |
| 2411 | /// |
| 2412 | /// Note that this parsing requires explicit whitespace separators between |
| 2413 | /// attributes. This is intentional design with the reasons below: |
| 2414 | /// |
| 2415 | /// * to keep conversion simple and easy to understand for any possible input, |
| 2416 | /// * to avoid adding less obvious conversion rule that can reduce compatibility |
| 2417 | /// with other implementations more, and |
| 2418 | /// * to follow the major design of implementations with the support for the |
| 2419 | /// attribute blocks extension (as of this writing). |
| 2420 | /// |
| 2421 | /// See also: [`Options::ENABLE_HEADING_ATTRIBUTES`]. |
| 2422 | /// |
| 2423 | /// [`Options::ENABLE_HEADING_ATTRIBUTES`]: `crate::Options::ENABLE_HEADING_ATTRIBUTES` |
| 2424 | fn parse_inside_attribute_block(inside_attr_block: &str) -> Option<HeadingAttributes<'_>> { |
| 2425 | let mut id = None; |
| 2426 | let mut classes = Vec::new(); |
| 2427 | let mut attrs = Vec::new(); |
| 2428 | |
| 2429 | for attr in inside_attr_block.split_ascii_whitespace() { |
| 2430 | // iterator returned by `str::split_ascii_whitespace` never emits empty |
| 2431 | // strings, so taking first byte won't panic. |
| 2432 | if attr.len() > 1 { |
| 2433 | let first_byte = attr.as_bytes()[0]; |
| 2434 | if first_byte == b'#' { |
| 2435 | id = Some(attr[1..].into()); |
| 2436 | } else if first_byte == b'.' { |
| 2437 | classes.push(attr[1..].into()); |
| 2438 | } else { |
| 2439 | let split = attr.split_once('=' ); |
| 2440 | if let Some((key, value)) = split { |
| 2441 | attrs.push((key.into(), Some(value.into()))); |
| 2442 | } else { |
| 2443 | attrs.push((attr.into(), None)); |
| 2444 | } |
| 2445 | } |
| 2446 | } |
| 2447 | } |
| 2448 | |
| 2449 | Some(HeadingAttributes { id, classes, attrs }) |
| 2450 | } |
| 2451 | |
| 2452 | #[cfg (all(target_arch = "x86_64" , feature = "simd" ))] |
| 2453 | mod simd { |
| 2454 | //! SIMD byte scanning logic. |
| 2455 | //! |
| 2456 | //! This module provides functions that allow walking through byteslices, calling |
| 2457 | //! provided callback functions on special bytes and their indices using SIMD. |
| 2458 | //! The byteset is defined in `compute_lookup`. |
| 2459 | //! |
| 2460 | //! The idea is to load in a chunk of 16 bytes and perform a lookup into a set of |
| 2461 | //! bytes on all the bytes in this chunk simultaneously. We produce a 16 bit bitmask |
| 2462 | //! from this and call the callback on every index corresponding to a 1 in this mask |
| 2463 | //! before moving on to the next chunk. This allows us to move quickly when there |
| 2464 | //! are no or few matches. |
| 2465 | //! |
| 2466 | //! The table lookup is inspired by this [great overview]. However, since all of the |
| 2467 | //! bytes we're interested in are ASCII, we don't quite need the full generality of |
| 2468 | //! the universal algorithm and are hence able to skip a few instructions. |
| 2469 | //! |
| 2470 | //! [great overview]: http://0x80.pl/articles/simd-byte-lookup.html |
| 2471 | |
| 2472 | use super::{LookupTable, LoopInstruction}; |
| 2473 | use crate::Options; |
| 2474 | use core::arch::x86_64::*; |
| 2475 | |
| 2476 | const VECTOR_SIZE: usize = std::mem::size_of::<__m128i>(); |
| 2477 | |
| 2478 | /// Generates a lookup table containing the bitmaps for our |
| 2479 | /// special marker bytes. This is effectively a 128 element 2d bitvector, |
| 2480 | /// that can be indexed by a four bit row index (the lower nibble) |
| 2481 | /// and a three bit column index (upper nibble). |
| 2482 | pub(super) fn compute_lookup(options: &Options) -> [u8; 16] { |
| 2483 | let mut lookup = [0u8; 16]; |
| 2484 | let standard_bytes = [ |
| 2485 | b' \n' , b' \r' , b'*' , b'_' , b'&' , b' \\' , b'[' , b']' , b'<' , b'!' , b'`' , |
| 2486 | ]; |
| 2487 | |
| 2488 | for &byte in &standard_bytes { |
| 2489 | add_lookup_byte(&mut lookup, byte); |
| 2490 | } |
| 2491 | if options.contains(Options::ENABLE_TABLES) { |
| 2492 | add_lookup_byte(&mut lookup, b'|' ); |
| 2493 | } |
| 2494 | if options.contains(Options::ENABLE_STRIKETHROUGH) { |
| 2495 | add_lookup_byte(&mut lookup, b'~' ); |
| 2496 | } |
| 2497 | if options.contains(Options::ENABLE_MATH) { |
| 2498 | add_lookup_byte(&mut lookup, b'$' ); |
| 2499 | add_lookup_byte(&mut lookup, b'{' ); |
| 2500 | add_lookup_byte(&mut lookup, b'}' ); |
| 2501 | } |
| 2502 | if options.contains(Options::ENABLE_SMART_PUNCTUATION) { |
| 2503 | for &byte in &[b'.' , b'-' , b'"' , b' \'' ] { |
| 2504 | add_lookup_byte(&mut lookup, byte); |
| 2505 | } |
| 2506 | } |
| 2507 | |
| 2508 | lookup |
| 2509 | } |
| 2510 | |
| 2511 | fn add_lookup_byte(lookup: &mut [u8; 16], byte: u8) { |
| 2512 | lookup[(byte & 0x0f) as usize] |= 1 << (byte >> 4); |
| 2513 | } |
| 2514 | |
| 2515 | /// Computes a bit mask for the given byteslice starting from the given index, |
| 2516 | /// where the 16 least significant bits indicate (by value of 1) whether or not |
| 2517 | /// there is a special character at that byte position. The least significant bit |
| 2518 | /// corresponds to `bytes[ix]` and the most significant bit corresponds to |
| 2519 | /// `bytes[ix + 15]`. |
| 2520 | /// It is only safe to call this function when `bytes.len() >= ix + VECTOR_SIZE`. |
| 2521 | #[target_feature (enable = "ssse3" )] |
| 2522 | #[inline ] |
| 2523 | unsafe fn compute_mask(lut: &[u8; 16], bytes: &[u8], ix: usize) -> i32 { |
| 2524 | debug_assert!(bytes.len() >= ix + VECTOR_SIZE); |
| 2525 | |
| 2526 | let bitmap = _mm_loadu_si128(lut.as_ptr() as *const __m128i); |
| 2527 | // Small lookup table to compute single bit bitshifts |
| 2528 | // for 16 bytes at once. |
| 2529 | let bitmask_lookup = |
| 2530 | _mm_setr_epi8(1, 2, 4, 8, 16, 32, 64, -128, -1, -1, -1, -1, -1, -1, -1, -1); |
| 2531 | |
| 2532 | // Load input from memory. |
| 2533 | let raw_ptr = bytes.as_ptr().add(ix) as *const __m128i; |
| 2534 | let input = _mm_loadu_si128(raw_ptr); |
| 2535 | // Compute the bitmap using the bottom nibble as an index |
| 2536 | // into the lookup table. Note that non-ascii bytes will have |
| 2537 | // their most significant bit set and will map to lookup[0]. |
| 2538 | let bitset = _mm_shuffle_epi8(bitmap, input); |
| 2539 | // Compute the high nibbles of the input using a 16-bit rightshift of four |
| 2540 | // and a mask to prevent most-significant bit issues. |
| 2541 | let higher_nibbles = _mm_and_si128(_mm_srli_epi16(input, 4), _mm_set1_epi8(0x0f)); |
| 2542 | // Create a bitmask for the bitmap by perform a left shift of the value |
| 2543 | // of the higher nibble. Bytes with their most significant set are mapped |
| 2544 | // to -1 (all ones). |
| 2545 | let bitmask = _mm_shuffle_epi8(bitmask_lookup, higher_nibbles); |
| 2546 | // Test the bit of the bitmap by AND'ing the bitmap and the mask together. |
| 2547 | let tmp = _mm_and_si128(bitset, bitmask); |
| 2548 | // Check whether the result was not null. NEQ is not a SIMD intrinsic, |
| 2549 | // but comparing to the bitmask is logically equivalent. This also prevents us |
| 2550 | // from matching any non-ASCII bytes since none of the bitmaps were all ones |
| 2551 | // (-1). |
| 2552 | let result = _mm_cmpeq_epi8(tmp, bitmask); |
| 2553 | |
| 2554 | // Return the resulting bitmask. |
| 2555 | _mm_movemask_epi8(result) |
| 2556 | } |
| 2557 | |
| 2558 | /// Calls callback on byte indices and their value. |
| 2559 | /// Breaks when callback returns LoopInstruction::BreakAtWith(ix, val). And skips the |
| 2560 | /// number of bytes in callback return value otherwise. |
| 2561 | /// Returns the final index and a possible break value. |
| 2562 | pub(super) fn iterate_special_bytes<F, T>( |
| 2563 | lut: &LookupTable, |
| 2564 | bytes: &[u8], |
| 2565 | ix: usize, |
| 2566 | callback: F, |
| 2567 | ) -> (usize, Option<T>) |
| 2568 | where |
| 2569 | F: FnMut(usize, u8) -> LoopInstruction<Option<T>>, |
| 2570 | { |
| 2571 | if is_x86_feature_detected!("ssse3" ) && bytes.len() >= VECTOR_SIZE { |
| 2572 | unsafe { simd_iterate_special_bytes(&lut.simd, bytes, ix, callback) } |
| 2573 | } else { |
| 2574 | super::scalar_iterate_special_bytes(&lut.scalar, bytes, ix, callback) |
| 2575 | } |
| 2576 | } |
| 2577 | |
| 2578 | /// Calls the callback function for every 1 in the given bitmask with |
| 2579 | /// the index `offset + ix`, where `ix` is the position of the 1 in the mask. |
| 2580 | /// Returns `Ok(ix)` to continue from index `ix`, `Err((end_ix, opt_val)` to break with |
| 2581 | /// final index `end_ix` and optional value `opt_val`. |
| 2582 | unsafe fn process_mask<F, T>( |
| 2583 | mut mask: i32, |
| 2584 | bytes: &[u8], |
| 2585 | mut offset: usize, |
| 2586 | callback: &mut F, |
| 2587 | ) -> Result<usize, (usize, Option<T>)> |
| 2588 | where |
| 2589 | F: FnMut(usize, u8) -> LoopInstruction<Option<T>>, |
| 2590 | { |
| 2591 | while mask != 0 { |
| 2592 | let mask_ix = mask.trailing_zeros() as usize; |
| 2593 | offset += mask_ix; |
| 2594 | match callback(offset, *bytes.get_unchecked(offset)) { |
| 2595 | LoopInstruction::ContinueAndSkip(skip) => { |
| 2596 | offset += skip + 1; |
| 2597 | mask = mask.wrapping_shr((skip + 1 + mask_ix) as u32); |
| 2598 | } |
| 2599 | LoopInstruction::BreakAtWith(ix, val) => return Err((ix, val)), |
| 2600 | } |
| 2601 | } |
| 2602 | Ok(offset) |
| 2603 | } |
| 2604 | |
| 2605 | #[target_feature (enable = "ssse3" )] |
| 2606 | /// Important: only call this function when `bytes.len() >= 16`. Doing |
| 2607 | /// so otherwise may exhibit undefined behaviour. |
| 2608 | unsafe fn simd_iterate_special_bytes<F, T>( |
| 2609 | lut: &[u8; 16], |
| 2610 | bytes: &[u8], |
| 2611 | mut ix: usize, |
| 2612 | mut callback: F, |
| 2613 | ) -> (usize, Option<T>) |
| 2614 | where |
| 2615 | F: FnMut(usize, u8) -> LoopInstruction<Option<T>>, |
| 2616 | { |
| 2617 | debug_assert!(bytes.len() >= VECTOR_SIZE); |
| 2618 | let upperbound = bytes.len() - VECTOR_SIZE; |
| 2619 | |
| 2620 | while ix < upperbound { |
| 2621 | let mask = compute_mask(lut, bytes, ix); |
| 2622 | let block_start = ix; |
| 2623 | ix = match process_mask(mask, bytes, ix, &mut callback) { |
| 2624 | Ok(ix) => std::cmp::max(ix, VECTOR_SIZE + block_start), |
| 2625 | Err((end_ix, val)) => return (end_ix, val), |
| 2626 | }; |
| 2627 | } |
| 2628 | |
| 2629 | if bytes.len() > ix { |
| 2630 | // shift off the bytes at start we have already scanned |
| 2631 | let mask = compute_mask(lut, bytes, upperbound) >> ix - upperbound; |
| 2632 | if let Err((end_ix, val)) = process_mask(mask, bytes, ix, &mut callback) { |
| 2633 | return (end_ix, val); |
| 2634 | } |
| 2635 | } |
| 2636 | |
| 2637 | (bytes.len(), None) |
| 2638 | } |
| 2639 | |
| 2640 | #[cfg (test)] |
| 2641 | mod simd_test { |
| 2642 | use super::super::create_lut; |
| 2643 | use super::{iterate_special_bytes, LoopInstruction}; |
| 2644 | use crate::Options; |
| 2645 | |
| 2646 | fn check_expected_indices(bytes: &[u8], expected: &[usize], skip: usize) { |
| 2647 | let mut opts = Options::empty(); |
| 2648 | opts.insert(Options::ENABLE_MATH); |
| 2649 | opts.insert(Options::ENABLE_TABLES); |
| 2650 | opts.insert(Options::ENABLE_FOOTNOTES); |
| 2651 | opts.insert(Options::ENABLE_STRIKETHROUGH); |
| 2652 | opts.insert(Options::ENABLE_TASKLISTS); |
| 2653 | |
| 2654 | let lut = create_lut(&opts); |
| 2655 | let mut indices = vec![]; |
| 2656 | |
| 2657 | iterate_special_bytes::<_, i32>(&lut, bytes, 0, |ix, _byte_ty| { |
| 2658 | indices.push(ix); |
| 2659 | LoopInstruction::ContinueAndSkip(skip) |
| 2660 | }); |
| 2661 | |
| 2662 | assert_eq!(&indices[..], expected); |
| 2663 | } |
| 2664 | |
| 2665 | #[test ] |
| 2666 | fn simple_no_match() { |
| 2667 | check_expected_indices("abcdef0123456789" .as_bytes(), &[], 0); |
| 2668 | } |
| 2669 | |
| 2670 | #[test ] |
| 2671 | fn simple_match() { |
| 2672 | check_expected_indices("*bcd&f0123456789" .as_bytes(), &[0, 4], 0); |
| 2673 | } |
| 2674 | |
| 2675 | #[test ] |
| 2676 | fn single_open_fish() { |
| 2677 | check_expected_indices("<" .as_bytes(), &[0], 0); |
| 2678 | } |
| 2679 | |
| 2680 | #[test ] |
| 2681 | fn long_match() { |
| 2682 | check_expected_indices("0123456789abcde~*bcd&f0" .as_bytes(), &[15, 16, 20], 0); |
| 2683 | } |
| 2684 | |
| 2685 | #[test ] |
| 2686 | fn border_skip() { |
| 2687 | check_expected_indices("0123456789abcde~~~~d&f0" .as_bytes(), &[15, 20], 3); |
| 2688 | } |
| 2689 | |
| 2690 | #[test ] |
| 2691 | fn exhaustive_search() { |
| 2692 | let chars = [ |
| 2693 | b' \n' , b' \r' , b'*' , b'_' , b'~' , b'|' , b'&' , b' \\' , b'[' , b']' , b'<' , b'!' , b'`' , |
| 2694 | b'$' , b'{' , b'}' , |
| 2695 | ]; |
| 2696 | |
| 2697 | for &c in &chars { |
| 2698 | for i in 0u8..=255 { |
| 2699 | if !chars.contains(&i) { |
| 2700 | // full match |
| 2701 | let mut buf = [i; 18]; |
| 2702 | buf[3] = c; |
| 2703 | buf[6] = c; |
| 2704 | |
| 2705 | check_expected_indices(&buf[..], &[3, 6], 0); |
| 2706 | } |
| 2707 | } |
| 2708 | } |
| 2709 | } |
| 2710 | } |
| 2711 | } |
| 2712 | |