| 1 | //! The first pass resolves all block structure, generating an AST. Within a block, items |
| 2 | //! are in a linear chain with potential inline markup identified. |
| 3 | |
| 4 | use std::cmp::max; |
| 5 | use std::ops::Range; |
| 6 | |
| 7 | use crate::parse::{scan_containers, Allocations, HeadingAttributes, Item, ItemBody, LinkDef}; |
| 8 | use crate::scanners::*; |
| 9 | use crate::strings::CowStr; |
| 10 | use crate::tree::{Tree, TreeIndex}; |
| 11 | use crate::Options; |
| 12 | use crate::{ |
| 13 | linklabel::{scan_link_label_rest, LinkLabel}, |
| 14 | HeadingLevel, |
| 15 | }; |
| 16 | |
| 17 | use unicase::UniCase; |
| 18 | |
| 19 | /// Runs the first pass, which resolves the block structure of the document, |
| 20 | /// and returns the resulting tree. |
| 21 | pub(crate) fn run_first_pass(text: &str, options: Options) -> (Tree<Item>, Allocations) { |
| 22 | // This is a very naive heuristic for the number of nodes |
| 23 | // we'll need. |
| 24 | let start_capacity: usize = max(v1:128, v2:text.len() / 32); |
| 25 | let lookup_table: &[bool; 256] = &create_lut(&options); |
| 26 | let first_pass: FirstPass<'_, '_> = FirstPass { |
| 27 | text, |
| 28 | tree: Tree::with_capacity(cap:start_capacity), |
| 29 | begin_list_item: false, |
| 30 | last_line_blank: false, |
| 31 | allocs: Allocations::new(), |
| 32 | options, |
| 33 | lookup_table, |
| 34 | }; |
| 35 | first_pass.run() |
| 36 | } |
| 37 | |
| 38 | /// State for the first parsing pass. |
| 39 | struct FirstPass<'a, 'b> { |
| 40 | text: &'a str, |
| 41 | tree: Tree<Item>, |
| 42 | begin_list_item: bool, |
| 43 | last_line_blank: bool, |
| 44 | allocs: Allocations<'a>, |
| 45 | options: Options, |
| 46 | lookup_table: &'b LookupTable, |
| 47 | } |
| 48 | |
| 49 | impl<'a, 'b> FirstPass<'a, 'b> { |
| 50 | fn run(mut self) -> (Tree<Item>, Allocations<'a>) { |
| 51 | let mut ix = 0; |
| 52 | while ix < self.text.len() { |
| 53 | ix = self.parse_block(ix); |
| 54 | } |
| 55 | for _ in 0..self.tree.spine_len() { |
| 56 | self.pop(ix); |
| 57 | } |
| 58 | (self.tree, self.allocs) |
| 59 | } |
| 60 | |
| 61 | /// Returns offset after block. |
| 62 | fn parse_block(&mut self, mut start_ix: usize) -> usize { |
| 63 | let bytes = self.text.as_bytes(); |
| 64 | let mut line_start = LineStart::new(&bytes[start_ix..]); |
| 65 | |
| 66 | let i = scan_containers(&self.tree, &mut line_start); |
| 67 | for _ in i..self.tree.spine_len() { |
| 68 | self.pop(start_ix); |
| 69 | } |
| 70 | |
| 71 | if self.options.contains(Options::ENABLE_FOOTNOTES) { |
| 72 | // finish footnote if it's still open and was preceded by blank line |
| 73 | if let Some(node_ix) = self.tree.peek_up() { |
| 74 | if let ItemBody::FootnoteDefinition(..) = self.tree[node_ix].item.body { |
| 75 | if self.last_line_blank { |
| 76 | self.pop(start_ix); |
| 77 | } |
| 78 | } |
| 79 | } |
| 80 | |
| 81 | // Footnote definitions of the form |
| 82 | // [^bar]: |
| 83 | // * anything really |
| 84 | let container_start = start_ix + line_start.bytes_scanned(); |
| 85 | if let Some(bytecount) = self.parse_footnote(container_start) { |
| 86 | start_ix = container_start + bytecount; |
| 87 | start_ix += scan_blank_line(&bytes[start_ix..]).unwrap_or(0); |
| 88 | line_start = LineStart::new(&bytes[start_ix..]); |
| 89 | } |
| 90 | } |
| 91 | |
| 92 | // Process new containers |
| 93 | loop { |
| 94 | let container_start = start_ix + line_start.bytes_scanned(); |
| 95 | if let Some((ch, index, indent)) = line_start.scan_list_marker() { |
| 96 | let after_marker_index = start_ix + line_start.bytes_scanned(); |
| 97 | self.continue_list(container_start, ch, index); |
| 98 | self.tree.append(Item { |
| 99 | start: container_start, |
| 100 | end: after_marker_index, // will get updated later if item not empty |
| 101 | body: ItemBody::ListItem(indent), |
| 102 | }); |
| 103 | self.tree.push(); |
| 104 | if let Some(n) = scan_blank_line(&bytes[after_marker_index..]) { |
| 105 | self.begin_list_item = true; |
| 106 | return after_marker_index + n; |
| 107 | } |
| 108 | if self.options.contains(Options::ENABLE_TASKLISTS) { |
| 109 | if let Some(is_checked) = line_start.scan_task_list_marker() { |
| 110 | self.tree.append(Item { |
| 111 | start: after_marker_index, |
| 112 | end: start_ix + line_start.bytes_scanned(), |
| 113 | body: ItemBody::TaskListMarker(is_checked), |
| 114 | }); |
| 115 | } |
| 116 | } |
| 117 | } else if line_start.scan_blockquote_marker() { |
| 118 | self.finish_list(start_ix); |
| 119 | self.tree.append(Item { |
| 120 | start: container_start, |
| 121 | end: 0, // will get set later |
| 122 | body: ItemBody::BlockQuote, |
| 123 | }); |
| 124 | self.tree.push(); |
| 125 | } else { |
| 126 | break; |
| 127 | } |
| 128 | } |
| 129 | |
| 130 | let ix = start_ix + line_start.bytes_scanned(); |
| 131 | |
| 132 | if let Some(n) = scan_blank_line(&bytes[ix..]) { |
| 133 | if let Some(node_ix) = self.tree.peek_up() { |
| 134 | match self.tree[node_ix].item.body { |
| 135 | ItemBody::BlockQuote => (), |
| 136 | _ => { |
| 137 | if self.begin_list_item { |
| 138 | // A list item can begin with at most one blank line. |
| 139 | self.pop(start_ix); |
| 140 | } |
| 141 | self.last_line_blank = true; |
| 142 | } |
| 143 | } |
| 144 | } |
| 145 | return ix + n; |
| 146 | } |
| 147 | |
| 148 | self.begin_list_item = false; |
| 149 | self.finish_list(start_ix); |
| 150 | |
| 151 | // Save `remaining_space` here to avoid needing to backtrack `line_start` for HTML blocks |
| 152 | let remaining_space = line_start.remaining_space(); |
| 153 | |
| 154 | let indent = line_start.scan_space_upto(4); |
| 155 | if indent == 4 { |
| 156 | let ix = start_ix + line_start.bytes_scanned(); |
| 157 | let remaining_space = line_start.remaining_space(); |
| 158 | return self.parse_indented_code_block(ix, remaining_space); |
| 159 | } |
| 160 | |
| 161 | let ix = start_ix + line_start.bytes_scanned(); |
| 162 | |
| 163 | // HTML Blocks |
| 164 | if bytes[ix] == b'<' { |
| 165 | // Types 1-5 are all detected by one function and all end with the same |
| 166 | // pattern |
| 167 | if let Some(html_end_tag) = get_html_end_tag(&bytes[(ix + 1)..]) { |
| 168 | return self.parse_html_block_type_1_to_5(ix, html_end_tag, remaining_space); |
| 169 | } |
| 170 | |
| 171 | // Detect type 6 |
| 172 | if starts_html_block_type_6(&bytes[(ix + 1)..]) { |
| 173 | return self.parse_html_block_type_6_or_7(ix, remaining_space); |
| 174 | } |
| 175 | |
| 176 | // Detect type 7 |
| 177 | if let Some(_html_bytes) = scan_html_type_7(&bytes[ix..]) { |
| 178 | return self.parse_html_block_type_6_or_7(ix, remaining_space); |
| 179 | } |
| 180 | } |
| 181 | |
| 182 | if let Ok(n) = scan_hrule(&bytes[ix..]) { |
| 183 | return self.parse_hrule(n, ix); |
| 184 | } |
| 185 | |
| 186 | if let Some(atx_size) = scan_atx_heading(&bytes[ix..]) { |
| 187 | return self.parse_atx_heading(ix, atx_size); |
| 188 | } |
| 189 | |
| 190 | // parse refdef |
| 191 | if let Some((bytecount, label, link_def)) = self.parse_refdef_total(ix) { |
| 192 | self.allocs.refdefs.0.entry(label).or_insert(link_def); |
| 193 | let ix = ix + bytecount; |
| 194 | // try to read trailing whitespace or it will register as a completely blank line |
| 195 | // TODO: shouldn't we do this for all block level items? |
| 196 | return ix + scan_blank_line(&bytes[ix..]).unwrap_or(0); |
| 197 | } |
| 198 | |
| 199 | if let Some((n, fence_ch)) = scan_code_fence(&bytes[ix..]) { |
| 200 | return self.parse_fenced_code_block(ix, indent, fence_ch, n); |
| 201 | } |
| 202 | self.parse_paragraph(ix) |
| 203 | } |
| 204 | |
| 205 | /// Returns the offset of the first line after the table. |
| 206 | /// Assumptions: current focus is a table element and the table header |
| 207 | /// matches the separator line (same number of columns). |
| 208 | fn parse_table(&mut self, table_cols: usize, head_start: usize, body_start: usize) -> usize { |
| 209 | // parse header. this shouldn't fail because we made sure the table header is ok |
| 210 | let (_sep_start, thead_ix) = self.parse_table_row_inner(head_start, table_cols); |
| 211 | self.tree[thead_ix].item.body = ItemBody::TableHead; |
| 212 | |
| 213 | // parse body |
| 214 | let mut ix = body_start; |
| 215 | while let Some((next_ix, _row_ix)) = self.parse_table_row(ix, table_cols) { |
| 216 | ix = next_ix; |
| 217 | } |
| 218 | |
| 219 | self.pop(ix); |
| 220 | ix |
| 221 | } |
| 222 | |
| 223 | /// Call this when containers are taken care of. |
| 224 | /// Returns bytes scanned, row_ix |
| 225 | fn parse_table_row_inner(&mut self, mut ix: usize, row_cells: usize) -> (usize, TreeIndex) { |
| 226 | let bytes = self.text.as_bytes(); |
| 227 | let mut cells = 0; |
| 228 | let mut final_cell_ix = None; |
| 229 | |
| 230 | let row_ix = self.tree.append(Item { |
| 231 | start: ix, |
| 232 | end: 0, // set at end of this function |
| 233 | body: ItemBody::TableRow, |
| 234 | }); |
| 235 | self.tree.push(); |
| 236 | |
| 237 | loop { |
| 238 | ix += scan_ch(&bytes[ix..], b'|' ); |
| 239 | let start_ix = ix; |
| 240 | ix += scan_whitespace_no_nl(&bytes[ix..]); |
| 241 | |
| 242 | if let Some(eol_bytes) = scan_eol(&bytes[ix..]) { |
| 243 | ix += eol_bytes; |
| 244 | break; |
| 245 | } |
| 246 | |
| 247 | let cell_ix = self.tree.append(Item { |
| 248 | start: start_ix, |
| 249 | end: ix, |
| 250 | body: ItemBody::TableCell, |
| 251 | }); |
| 252 | self.tree.push(); |
| 253 | let (next_ix, _brk) = self.parse_line(ix, None, TableParseMode::Active); |
| 254 | |
| 255 | if let Some(cur_ix) = self.tree.cur() { |
| 256 | let trailing_whitespace = scan_rev_while(&bytes[..next_ix], is_ascii_whitespace); |
| 257 | self.tree[cur_ix].item.end -= trailing_whitespace; |
| 258 | } |
| 259 | |
| 260 | self.tree[cell_ix].item.end = next_ix; |
| 261 | self.tree.pop(); |
| 262 | |
| 263 | ix = next_ix; |
| 264 | cells += 1; |
| 265 | |
| 266 | if cells == row_cells { |
| 267 | final_cell_ix = Some(cell_ix); |
| 268 | } |
| 269 | } |
| 270 | |
| 271 | // fill empty cells if needed |
| 272 | // note: this is where GFM and commonmark-extra diverge. we follow |
| 273 | // GFM here |
| 274 | for _ in cells..row_cells { |
| 275 | self.tree.append(Item { |
| 276 | start: ix, |
| 277 | end: ix, |
| 278 | body: ItemBody::TableCell, |
| 279 | }); |
| 280 | } |
| 281 | |
| 282 | // drop excess cells |
| 283 | if let Some(cell_ix) = final_cell_ix { |
| 284 | self.tree[cell_ix].next = None; |
| 285 | } |
| 286 | |
| 287 | self.pop(ix); |
| 288 | |
| 289 | (ix, row_ix) |
| 290 | } |
| 291 | |
| 292 | /// Returns first offset after the row and the tree index of the row. |
| 293 | fn parse_table_row(&mut self, mut ix: usize, row_cells: usize) -> Option<(usize, TreeIndex)> { |
| 294 | let bytes = self.text.as_bytes(); |
| 295 | let mut line_start = LineStart::new(&bytes[ix..]); |
| 296 | let current_container = |
| 297 | scan_containers(&self.tree, &mut line_start) == self.tree.spine_len(); |
| 298 | if !current_container { |
| 299 | return None; |
| 300 | } |
| 301 | line_start.scan_all_space(); |
| 302 | ix += line_start.bytes_scanned(); |
| 303 | if scan_paragraph_interrupt(&bytes[ix..], current_container) { |
| 304 | return None; |
| 305 | } |
| 306 | |
| 307 | let (ix, row_ix) = self.parse_table_row_inner(ix, row_cells); |
| 308 | Some((ix, row_ix)) |
| 309 | } |
| 310 | |
| 311 | /// Returns offset of line start after paragraph. |
| 312 | fn parse_paragraph(&mut self, start_ix: usize) -> usize { |
| 313 | let node_ix = self.tree.append(Item { |
| 314 | start: start_ix, |
| 315 | end: 0, // will get set later |
| 316 | body: ItemBody::Paragraph, |
| 317 | }); |
| 318 | self.tree.push(); |
| 319 | let bytes = self.text.as_bytes(); |
| 320 | |
| 321 | let mut ix = start_ix; |
| 322 | loop { |
| 323 | if self.options.contains(Options::ENABLE_FOOTNOTES) && self.get_footnote(ix).is_some() { |
| 324 | self.tree[node_ix].item.end = if ix > start_ix { ix - 1 } else { start_ix }; |
| 325 | self.tree.pop(); |
| 326 | if let Some(node_ix) = self.tree.peek_up() { |
| 327 | if let ItemBody::FootnoteDefinition(..) = self.tree[node_ix].item.body { |
| 328 | self.pop(ix); |
| 329 | } |
| 330 | } |
| 331 | return ix; |
| 332 | } |
| 333 | |
| 334 | let scan_mode = if self.options.contains(Options::ENABLE_TABLES) && ix == start_ix { |
| 335 | TableParseMode::Scan |
| 336 | } else { |
| 337 | TableParseMode::Disabled |
| 338 | }; |
| 339 | let (next_ix, brk) = self.parse_line(ix, None, scan_mode); |
| 340 | |
| 341 | // break out when we find a table |
| 342 | if let Some(Item { |
| 343 | body: ItemBody::Table(alignment_ix), |
| 344 | .. |
| 345 | }) = brk |
| 346 | { |
| 347 | let table_cols = self.allocs[alignment_ix].len(); |
| 348 | self.tree[node_ix].item.body = ItemBody::Table(alignment_ix); |
| 349 | // this clears out any stuff we may have appended - but there may |
| 350 | // be a cleaner way |
| 351 | self.tree[node_ix].child = None; |
| 352 | self.tree.pop(); |
| 353 | self.tree.push(); |
| 354 | return self.parse_table(table_cols, ix, next_ix); |
| 355 | } |
| 356 | |
| 357 | ix = next_ix; |
| 358 | let mut line_start = LineStart::new(&bytes[ix..]); |
| 359 | let current_container = |
| 360 | scan_containers(&self.tree, &mut line_start) == self.tree.spine_len(); |
| 361 | if !line_start.scan_space(4) { |
| 362 | let ix_new = ix + line_start.bytes_scanned(); |
| 363 | if current_container { |
| 364 | let trailing_backslash_pos = match brk { |
| 365 | Some(Item { |
| 366 | start, |
| 367 | body: ItemBody::HardBreak, |
| 368 | .. |
| 369 | }) if bytes[start] == b' \\' => Some(start), |
| 370 | _ => None, |
| 371 | }; |
| 372 | if let Some(ix_setext) = |
| 373 | self.parse_setext_heading(ix_new, node_ix, trailing_backslash_pos.is_some()) |
| 374 | { |
| 375 | if let Some(pos) = trailing_backslash_pos { |
| 376 | self.tree.append_text(pos, pos + 1); |
| 377 | } |
| 378 | ix = ix_setext; |
| 379 | break; |
| 380 | } |
| 381 | } |
| 382 | // first check for non-empty lists, then for other interrupts |
| 383 | let suffix = &bytes[ix_new..]; |
| 384 | if scan_paragraph_interrupt(suffix, current_container) { |
| 385 | break; |
| 386 | } |
| 387 | } |
| 388 | line_start.scan_all_space(); |
| 389 | if line_start.is_at_eol() { |
| 390 | break; |
| 391 | } |
| 392 | ix = next_ix + line_start.bytes_scanned(); |
| 393 | if let Some(item) = brk { |
| 394 | self.tree.append(item); |
| 395 | } |
| 396 | } |
| 397 | |
| 398 | self.pop(ix); |
| 399 | ix |
| 400 | } |
| 401 | |
| 402 | /// Returns end ix of setext_heading on success. |
| 403 | fn parse_setext_heading( |
| 404 | &mut self, |
| 405 | ix: usize, |
| 406 | node_ix: TreeIndex, |
| 407 | has_trailing_content: bool, |
| 408 | ) -> Option<usize> { |
| 409 | let bytes = self.text.as_bytes(); |
| 410 | let (n, level) = scan_setext_heading(&bytes[ix..])?; |
| 411 | let mut attrs = None; |
| 412 | |
| 413 | if let Some(cur_ix) = self.tree.cur() { |
| 414 | let parent_ix = self.tree.peek_up().unwrap(); |
| 415 | let header_start = self.tree[parent_ix].item.start; |
| 416 | // Note that `self.tree[parent_ix].item.end` might be zero at this point. |
| 417 | // Use the end position of the current node (i.e. the last known child |
| 418 | // of the parent) instead. |
| 419 | let header_end = self.tree[cur_ix].item.end; |
| 420 | |
| 421 | // extract the trailing attribute block |
| 422 | let (content_end, attrs_) = |
| 423 | self.extract_and_parse_heading_attribute_block(header_start, header_end); |
| 424 | attrs = attrs_; |
| 425 | |
| 426 | // strip trailing whitespace |
| 427 | let new_end = if has_trailing_content { |
| 428 | content_end |
| 429 | } else { |
| 430 | let trailing_ws = |
| 431 | scan_rev_while(&bytes[header_start..content_end], is_ascii_whitespace_no_nl); |
| 432 | content_end - trailing_ws |
| 433 | }; |
| 434 | |
| 435 | if attrs.is_some() { |
| 436 | // remove trailing block attributes |
| 437 | self.tree.truncate_siblings(self.text.as_bytes(), new_end); |
| 438 | } |
| 439 | |
| 440 | if let Some(cur_ix) = self.tree.cur() { |
| 441 | self.tree[cur_ix].item.end = new_end; |
| 442 | } |
| 443 | } |
| 444 | |
| 445 | self.tree[node_ix].item.body = ItemBody::Heading( |
| 446 | level, |
| 447 | attrs.map(|attrs| self.allocs.allocate_heading(attrs)), |
| 448 | ); |
| 449 | |
| 450 | Some(ix + n) |
| 451 | } |
| 452 | |
| 453 | /// Parse a line of input, appending text and items to tree. |
| 454 | /// |
| 455 | /// Returns: index after line and an item representing the break. |
| 456 | fn parse_line( |
| 457 | &mut self, |
| 458 | start: usize, |
| 459 | end: Option<usize>, |
| 460 | mode: TableParseMode, |
| 461 | ) -> (usize, Option<Item>) { |
| 462 | let bytes = self.text.as_bytes(); |
| 463 | let bytes = match end { |
| 464 | Some(end) => &bytes[..end], |
| 465 | None => bytes, |
| 466 | }; |
| 467 | let bytes_len = bytes.len(); |
| 468 | let mut pipes = 0; |
| 469 | let mut last_pipe_ix = start; |
| 470 | let mut begin_text = start; |
| 471 | |
| 472 | let (final_ix, brk) = iterate_special_bytes(self.lookup_table, bytes, start, |ix, byte| { |
| 473 | match byte { |
| 474 | b' \n' | b' \r' => { |
| 475 | if let TableParseMode::Active = mode { |
| 476 | return LoopInstruction::BreakAtWith(ix, None); |
| 477 | } |
| 478 | |
| 479 | let mut i = ix; |
| 480 | let eol_bytes = scan_eol(&bytes[ix..]).unwrap(); |
| 481 | if mode == TableParseMode::Scan && pipes > 0 { |
| 482 | // check if we may be parsing a table |
| 483 | let next_line_ix = ix + eol_bytes; |
| 484 | let mut line_start = LineStart::new(&bytes[next_line_ix..]); |
| 485 | if scan_containers(&self.tree, &mut line_start) == self.tree.spine_len() { |
| 486 | let table_head_ix = next_line_ix + line_start.bytes_scanned(); |
| 487 | let (table_head_bytes, alignment) = |
| 488 | scan_table_head(&bytes[table_head_ix..]); |
| 489 | |
| 490 | if table_head_bytes > 0 { |
| 491 | // computing header count from number of pipes |
| 492 | let header_count = |
| 493 | count_header_cols(bytes, pipes, start, last_pipe_ix); |
| 494 | |
| 495 | // make sure they match the number of columns we find in separator line |
| 496 | if alignment.len() == header_count { |
| 497 | let alignment_ix = self.allocs.allocate_alignment(alignment); |
| 498 | let end_ix = table_head_ix + table_head_bytes; |
| 499 | return LoopInstruction::BreakAtWith( |
| 500 | end_ix, |
| 501 | Some(Item { |
| 502 | start: i, |
| 503 | end: end_ix, // must update later |
| 504 | body: ItemBody::Table(alignment_ix), |
| 505 | }), |
| 506 | ); |
| 507 | } |
| 508 | } |
| 509 | } |
| 510 | } |
| 511 | |
| 512 | let end_ix = ix + eol_bytes; |
| 513 | let trailing_backslashes = scan_rev_while(&bytes[..ix], |b| b == b' \\' ); |
| 514 | if trailing_backslashes % 2 == 1 && end_ix < bytes_len { |
| 515 | i -= 1; |
| 516 | self.tree.append_text(begin_text, i); |
| 517 | return LoopInstruction::BreakAtWith( |
| 518 | end_ix, |
| 519 | Some(Item { |
| 520 | start: i, |
| 521 | end: end_ix, |
| 522 | body: ItemBody::HardBreak, |
| 523 | }), |
| 524 | ); |
| 525 | } |
| 526 | let trailing_whitespace = |
| 527 | scan_rev_while(&bytes[..ix], is_ascii_whitespace_no_nl); |
| 528 | if trailing_whitespace >= 2 { |
| 529 | i -= trailing_whitespace; |
| 530 | self.tree.append_text(begin_text, i); |
| 531 | return LoopInstruction::BreakAtWith( |
| 532 | end_ix, |
| 533 | Some(Item { |
| 534 | start: i, |
| 535 | end: end_ix, |
| 536 | body: ItemBody::HardBreak, |
| 537 | }), |
| 538 | ); |
| 539 | } |
| 540 | |
| 541 | self.tree.append_text(begin_text, ix); |
| 542 | LoopInstruction::BreakAtWith( |
| 543 | end_ix, |
| 544 | Some(Item { |
| 545 | start: i, |
| 546 | end: end_ix, |
| 547 | body: ItemBody::SoftBreak, |
| 548 | }), |
| 549 | ) |
| 550 | } |
| 551 | b' \\' => { |
| 552 | if ix + 1 < bytes_len && is_ascii_punctuation(bytes[ix + 1]) { |
| 553 | self.tree.append_text(begin_text, ix); |
| 554 | if bytes[ix + 1] == b'`' { |
| 555 | let count = 1 + scan_ch_repeat(&bytes[(ix + 2)..], b'`' ); |
| 556 | self.tree.append(Item { |
| 557 | start: ix + 1, |
| 558 | end: ix + count + 1, |
| 559 | body: ItemBody::MaybeCode(count, true), |
| 560 | }); |
| 561 | begin_text = ix + 1 + count; |
| 562 | LoopInstruction::ContinueAndSkip(count) |
| 563 | } else { |
| 564 | begin_text = ix + 1; |
| 565 | LoopInstruction::ContinueAndSkip(1) |
| 566 | } |
| 567 | } else { |
| 568 | LoopInstruction::ContinueAndSkip(0) |
| 569 | } |
| 570 | } |
| 571 | c @ b'*' | c @ b'_' | c @ b'~' => { |
| 572 | let string_suffix = &self.text[ix..]; |
| 573 | let count = 1 + scan_ch_repeat(&string_suffix.as_bytes()[1..], c); |
| 574 | let can_open = delim_run_can_open(self.text, string_suffix, count, ix); |
| 575 | let can_close = delim_run_can_close(self.text, string_suffix, count, ix); |
| 576 | let is_valid_seq = c != b'~' || count <= 2; |
| 577 | |
| 578 | if (can_open || can_close) && is_valid_seq { |
| 579 | self.tree.append_text(begin_text, ix); |
| 580 | for i in 0..count { |
| 581 | self.tree.append(Item { |
| 582 | start: ix + i, |
| 583 | end: ix + i + 1, |
| 584 | body: ItemBody::MaybeEmphasis(count - i, can_open, can_close), |
| 585 | }); |
| 586 | } |
| 587 | begin_text = ix + count; |
| 588 | } |
| 589 | LoopInstruction::ContinueAndSkip(count - 1) |
| 590 | } |
| 591 | b'`' => { |
| 592 | self.tree.append_text(begin_text, ix); |
| 593 | let count = 1 + scan_ch_repeat(&bytes[(ix + 1)..], b'`' ); |
| 594 | self.tree.append(Item { |
| 595 | start: ix, |
| 596 | end: ix + count, |
| 597 | body: ItemBody::MaybeCode(count, false), |
| 598 | }); |
| 599 | begin_text = ix + count; |
| 600 | LoopInstruction::ContinueAndSkip(count - 1) |
| 601 | } |
| 602 | b'<' => { |
| 603 | // Note: could detect some non-HTML cases and early escape here, but not |
| 604 | // clear that's a win. |
| 605 | self.tree.append_text(begin_text, ix); |
| 606 | self.tree.append(Item { |
| 607 | start: ix, |
| 608 | end: ix + 1, |
| 609 | body: ItemBody::MaybeHtml, |
| 610 | }); |
| 611 | begin_text = ix + 1; |
| 612 | LoopInstruction::ContinueAndSkip(0) |
| 613 | } |
| 614 | b'!' => { |
| 615 | if ix + 1 < bytes_len && bytes[ix + 1] == b'[' { |
| 616 | self.tree.append_text(begin_text, ix); |
| 617 | self.tree.append(Item { |
| 618 | start: ix, |
| 619 | end: ix + 2, |
| 620 | body: ItemBody::MaybeImage, |
| 621 | }); |
| 622 | begin_text = ix + 2; |
| 623 | LoopInstruction::ContinueAndSkip(1) |
| 624 | } else { |
| 625 | LoopInstruction::ContinueAndSkip(0) |
| 626 | } |
| 627 | } |
| 628 | b'[' => { |
| 629 | self.tree.append_text(begin_text, ix); |
| 630 | self.tree.append(Item { |
| 631 | start: ix, |
| 632 | end: ix + 1, |
| 633 | body: ItemBody::MaybeLinkOpen, |
| 634 | }); |
| 635 | begin_text = ix + 1; |
| 636 | LoopInstruction::ContinueAndSkip(0) |
| 637 | } |
| 638 | b']' => { |
| 639 | self.tree.append_text(begin_text, ix); |
| 640 | self.tree.append(Item { |
| 641 | start: ix, |
| 642 | end: ix + 1, |
| 643 | body: ItemBody::MaybeLinkClose(true), |
| 644 | }); |
| 645 | begin_text = ix + 1; |
| 646 | LoopInstruction::ContinueAndSkip(0) |
| 647 | } |
| 648 | b'&' => match scan_entity(&bytes[ix..]) { |
| 649 | (n, Some(value)) => { |
| 650 | self.tree.append_text(begin_text, ix); |
| 651 | self.tree.append(Item { |
| 652 | start: ix, |
| 653 | end: ix + n, |
| 654 | body: ItemBody::SynthesizeText(self.allocs.allocate_cow(value)), |
| 655 | }); |
| 656 | begin_text = ix + n; |
| 657 | LoopInstruction::ContinueAndSkip(n - 1) |
| 658 | } |
| 659 | _ => LoopInstruction::ContinueAndSkip(0), |
| 660 | }, |
| 661 | b'|' => { |
| 662 | if let TableParseMode::Active = mode { |
| 663 | LoopInstruction::BreakAtWith(ix, None) |
| 664 | } else { |
| 665 | last_pipe_ix = ix; |
| 666 | pipes += 1; |
| 667 | LoopInstruction::ContinueAndSkip(0) |
| 668 | } |
| 669 | } |
| 670 | b'.' => { |
| 671 | if ix + 2 < bytes.len() && bytes[ix + 1] == b'.' && bytes[ix + 2] == b'.' { |
| 672 | self.tree.append_text(begin_text, ix); |
| 673 | self.tree.append(Item { |
| 674 | start: ix, |
| 675 | end: ix + 3, |
| 676 | body: ItemBody::SynthesizeChar('…' ), |
| 677 | }); |
| 678 | begin_text = ix + 3; |
| 679 | LoopInstruction::ContinueAndSkip(2) |
| 680 | } else { |
| 681 | LoopInstruction::ContinueAndSkip(0) |
| 682 | } |
| 683 | } |
| 684 | b'-' => { |
| 685 | let count = 1 + scan_ch_repeat(&bytes[(ix + 1)..], b'-' ); |
| 686 | if count == 1 { |
| 687 | LoopInstruction::ContinueAndSkip(0) |
| 688 | } else { |
| 689 | let itembody = if count == 2 { |
| 690 | ItemBody::SynthesizeChar('–' ) |
| 691 | } else if count == 3 { |
| 692 | ItemBody::SynthesizeChar('—' ) |
| 693 | } else { |
| 694 | let (ems, ens) = match count % 6 { |
| 695 | 0 | 3 => (count / 3, 0), |
| 696 | 2 | 4 => (0, count / 2), |
| 697 | 1 => (count / 3 - 1, 2), |
| 698 | _ => (count / 3, 1), |
| 699 | }; |
| 700 | // – and — are 3 bytes each in utf8 |
| 701 | let mut buf = String::with_capacity(3 * (ems + ens)); |
| 702 | for _ in 0..ems { |
| 703 | buf.push('—' ); |
| 704 | } |
| 705 | for _ in 0..ens { |
| 706 | buf.push('–' ); |
| 707 | } |
| 708 | ItemBody::SynthesizeText(self.allocs.allocate_cow(buf.into())) |
| 709 | }; |
| 710 | |
| 711 | self.tree.append_text(begin_text, ix); |
| 712 | self.tree.append(Item { |
| 713 | start: ix, |
| 714 | end: ix + count, |
| 715 | body: itembody, |
| 716 | }); |
| 717 | begin_text = ix + count; |
| 718 | LoopInstruction::ContinueAndSkip(count - 1) |
| 719 | } |
| 720 | } |
| 721 | c @ b' \'' | c @ b'"' => { |
| 722 | let string_suffix = &self.text[ix..]; |
| 723 | let can_open = delim_run_can_open(self.text, string_suffix, 1, ix); |
| 724 | let can_close = delim_run_can_close(self.text, string_suffix, 1, ix); |
| 725 | |
| 726 | self.tree.append_text(begin_text, ix); |
| 727 | self.tree.append(Item { |
| 728 | start: ix, |
| 729 | end: ix + 1, |
| 730 | body: ItemBody::MaybeSmartQuote(c, can_open, can_close), |
| 731 | }); |
| 732 | begin_text = ix + 1; |
| 733 | |
| 734 | LoopInstruction::ContinueAndSkip(0) |
| 735 | } |
| 736 | _ => LoopInstruction::ContinueAndSkip(0), |
| 737 | } |
| 738 | }); |
| 739 | |
| 740 | if brk.is_none() { |
| 741 | // need to close text at eof |
| 742 | self.tree.append_text(begin_text, final_ix); |
| 743 | } |
| 744 | (final_ix, brk) |
| 745 | } |
| 746 | |
| 747 | /// When start_ix is at the beginning of an HTML block of type 1 to 5, |
| 748 | /// this will find the end of the block, adding the block itself to the |
| 749 | /// tree and also keeping track of the lines of HTML within the block. |
| 750 | /// |
| 751 | /// The html_end_tag is the tag that must be found on a line to end the block. |
| 752 | fn parse_html_block_type_1_to_5( |
| 753 | &mut self, |
| 754 | start_ix: usize, |
| 755 | html_end_tag: &str, |
| 756 | mut remaining_space: usize, |
| 757 | ) -> usize { |
| 758 | let bytes = self.text.as_bytes(); |
| 759 | let mut ix = start_ix; |
| 760 | loop { |
| 761 | let line_start_ix = ix; |
| 762 | ix += scan_nextline(&bytes[ix..]); |
| 763 | self.append_html_line(remaining_space, line_start_ix, ix); |
| 764 | |
| 765 | let mut line_start = LineStart::new(&bytes[ix..]); |
| 766 | let n_containers = scan_containers(&self.tree, &mut line_start); |
| 767 | if n_containers < self.tree.spine_len() { |
| 768 | break; |
| 769 | } |
| 770 | |
| 771 | if (&self.text[line_start_ix..ix]).contains(html_end_tag) { |
| 772 | break; |
| 773 | } |
| 774 | |
| 775 | let next_line_ix = ix + line_start.bytes_scanned(); |
| 776 | if next_line_ix == self.text.len() { |
| 777 | break; |
| 778 | } |
| 779 | ix = next_line_ix; |
| 780 | remaining_space = line_start.remaining_space(); |
| 781 | } |
| 782 | ix |
| 783 | } |
| 784 | |
| 785 | /// When start_ix is at the beginning of an HTML block of type 6 or 7, |
| 786 | /// this will consume lines until there is a blank line and keep track of |
| 787 | /// the HTML within the block. |
| 788 | fn parse_html_block_type_6_or_7( |
| 789 | &mut self, |
| 790 | start_ix: usize, |
| 791 | mut remaining_space: usize, |
| 792 | ) -> usize { |
| 793 | let bytes = self.text.as_bytes(); |
| 794 | let mut ix = start_ix; |
| 795 | loop { |
| 796 | let line_start_ix = ix; |
| 797 | ix += scan_nextline(&bytes[ix..]); |
| 798 | self.append_html_line(remaining_space, line_start_ix, ix); |
| 799 | |
| 800 | let mut line_start = LineStart::new(&bytes[ix..]); |
| 801 | let n_containers = scan_containers(&self.tree, &mut line_start); |
| 802 | if n_containers < self.tree.spine_len() || line_start.is_at_eol() { |
| 803 | break; |
| 804 | } |
| 805 | |
| 806 | let next_line_ix = ix + line_start.bytes_scanned(); |
| 807 | if next_line_ix == self.text.len() || scan_blank_line(&bytes[next_line_ix..]).is_some() |
| 808 | { |
| 809 | break; |
| 810 | } |
| 811 | ix = next_line_ix; |
| 812 | remaining_space = line_start.remaining_space(); |
| 813 | } |
| 814 | ix |
| 815 | } |
| 816 | |
| 817 | fn parse_indented_code_block(&mut self, start_ix: usize, mut remaining_space: usize) -> usize { |
| 818 | self.tree.append(Item { |
| 819 | start: start_ix, |
| 820 | end: 0, // will get set later |
| 821 | body: ItemBody::IndentCodeBlock, |
| 822 | }); |
| 823 | self.tree.push(); |
| 824 | let bytes = self.text.as_bytes(); |
| 825 | let mut last_nonblank_child = None; |
| 826 | let mut last_nonblank_ix = 0; |
| 827 | let mut end_ix = 0; |
| 828 | let mut last_line_blank = false; |
| 829 | |
| 830 | let mut ix = start_ix; |
| 831 | loop { |
| 832 | let line_start_ix = ix; |
| 833 | ix += scan_nextline(&bytes[ix..]); |
| 834 | self.append_code_text(remaining_space, line_start_ix, ix); |
| 835 | // TODO(spec clarification): should we synthesize newline at EOF? |
| 836 | |
| 837 | if !last_line_blank { |
| 838 | last_nonblank_child = self.tree.cur(); |
| 839 | last_nonblank_ix = ix; |
| 840 | end_ix = ix; |
| 841 | } |
| 842 | |
| 843 | let mut line_start = LineStart::new(&bytes[ix..]); |
| 844 | let n_containers = scan_containers(&self.tree, &mut line_start); |
| 845 | if n_containers < self.tree.spine_len() |
| 846 | || !(line_start.scan_space(4) || line_start.is_at_eol()) |
| 847 | { |
| 848 | break; |
| 849 | } |
| 850 | let next_line_ix = ix + line_start.bytes_scanned(); |
| 851 | if next_line_ix == self.text.len() { |
| 852 | break; |
| 853 | } |
| 854 | ix = next_line_ix; |
| 855 | remaining_space = line_start.remaining_space(); |
| 856 | last_line_blank = scan_blank_line(&bytes[ix..]).is_some(); |
| 857 | } |
| 858 | |
| 859 | // Trim trailing blank lines. |
| 860 | if let Some(child) = last_nonblank_child { |
| 861 | self.tree[child].next = None; |
| 862 | self.tree[child].item.end = last_nonblank_ix; |
| 863 | } |
| 864 | self.pop(end_ix); |
| 865 | ix |
| 866 | } |
| 867 | |
| 868 | fn parse_fenced_code_block( |
| 869 | &mut self, |
| 870 | start_ix: usize, |
| 871 | indent: usize, |
| 872 | fence_ch: u8, |
| 873 | n_fence_char: usize, |
| 874 | ) -> usize { |
| 875 | let bytes = self.text.as_bytes(); |
| 876 | let mut info_start = start_ix + n_fence_char; |
| 877 | info_start += scan_whitespace_no_nl(&bytes[info_start..]); |
| 878 | // TODO: info strings are typically very short. wouldn't it be faster |
| 879 | // to just do a forward scan here? |
| 880 | let mut ix = info_start + scan_nextline(&bytes[info_start..]); |
| 881 | let info_end = ix - scan_rev_while(&bytes[info_start..ix], is_ascii_whitespace); |
| 882 | let info_string = unescape(&self.text[info_start..info_end]); |
| 883 | self.tree.append(Item { |
| 884 | start: start_ix, |
| 885 | end: 0, // will get set later |
| 886 | body: ItemBody::FencedCodeBlock(self.allocs.allocate_cow(info_string)), |
| 887 | }); |
| 888 | self.tree.push(); |
| 889 | loop { |
| 890 | let mut line_start = LineStart::new(&bytes[ix..]); |
| 891 | let n_containers = scan_containers(&self.tree, &mut line_start); |
| 892 | if n_containers < self.tree.spine_len() { |
| 893 | break; |
| 894 | } |
| 895 | line_start.scan_space(indent); |
| 896 | let mut close_line_start = line_start.clone(); |
| 897 | if !close_line_start.scan_space(4) { |
| 898 | let close_ix = ix + close_line_start.bytes_scanned(); |
| 899 | if let Some(n) = scan_closing_code_fence(&bytes[close_ix..], fence_ch, n_fence_char) |
| 900 | { |
| 901 | ix = close_ix + n; |
| 902 | break; |
| 903 | } |
| 904 | } |
| 905 | let remaining_space = line_start.remaining_space(); |
| 906 | ix += line_start.bytes_scanned(); |
| 907 | let next_ix = ix + scan_nextline(&bytes[ix..]); |
| 908 | self.append_code_text(remaining_space, ix, next_ix); |
| 909 | ix = next_ix; |
| 910 | } |
| 911 | |
| 912 | self.pop(ix); |
| 913 | |
| 914 | // try to read trailing whitespace or it will register as a completely blank line |
| 915 | ix + scan_blank_line(&bytes[ix..]).unwrap_or(0) |
| 916 | } |
| 917 | |
| 918 | fn append_code_text(&mut self, remaining_space: usize, start: usize, end: usize) { |
| 919 | if remaining_space > 0 { |
| 920 | let cow_ix = self.allocs.allocate_cow(" " [..remaining_space].into()); |
| 921 | self.tree.append(Item { |
| 922 | start, |
| 923 | end: start, |
| 924 | body: ItemBody::SynthesizeText(cow_ix), |
| 925 | }); |
| 926 | } |
| 927 | if self.text.as_bytes()[end - 2] == b' \r' { |
| 928 | // Normalize CRLF to LF |
| 929 | self.tree.append_text(start, end - 2); |
| 930 | self.tree.append_text(end - 1, end); |
| 931 | } else { |
| 932 | self.tree.append_text(start, end); |
| 933 | } |
| 934 | } |
| 935 | |
| 936 | /// Appends a line of HTML to the tree. |
| 937 | fn append_html_line(&mut self, remaining_space: usize, start: usize, end: usize) { |
| 938 | if remaining_space > 0 { |
| 939 | let cow_ix = self.allocs.allocate_cow(" " [..remaining_space].into()); |
| 940 | self.tree.append(Item { |
| 941 | start, |
| 942 | end: start, |
| 943 | // TODO: maybe this should synthesize to html rather than text? |
| 944 | body: ItemBody::SynthesizeText(cow_ix), |
| 945 | }); |
| 946 | } |
| 947 | if self.text.as_bytes()[end - 2] == b' \r' { |
| 948 | // Normalize CRLF to LF |
| 949 | self.tree.append(Item { |
| 950 | start, |
| 951 | end: end - 2, |
| 952 | body: ItemBody::Html, |
| 953 | }); |
| 954 | self.tree.append(Item { |
| 955 | start: end - 1, |
| 956 | end, |
| 957 | body: ItemBody::Html, |
| 958 | }); |
| 959 | } else { |
| 960 | self.tree.append(Item { |
| 961 | start, |
| 962 | end, |
| 963 | body: ItemBody::Html, |
| 964 | }); |
| 965 | } |
| 966 | } |
| 967 | |
| 968 | /// Pop a container, setting its end. |
| 969 | fn pop(&mut self, ix: usize) { |
| 970 | let cur_ix = self.tree.pop().unwrap(); |
| 971 | self.tree[cur_ix].item.end = ix; |
| 972 | if let ItemBody::List(true, _, _) = self.tree[cur_ix].item.body { |
| 973 | surgerize_tight_list(&mut self.tree, cur_ix); |
| 974 | } |
| 975 | } |
| 976 | |
| 977 | /// Close a list if it's open. Also set loose if last line was blank |
| 978 | fn finish_list(&mut self, ix: usize) { |
| 979 | if let Some(node_ix) = self.tree.peek_up() { |
| 980 | if let ItemBody::List(_, _, _) = self.tree[node_ix].item.body { |
| 981 | self.pop(ix); |
| 982 | } |
| 983 | } |
| 984 | if self.last_line_blank { |
| 985 | if let Some(node_ix) = self.tree.peek_grandparent() { |
| 986 | if let ItemBody::List(ref mut is_tight, _, _) = self.tree[node_ix].item.body { |
| 987 | *is_tight = false; |
| 988 | } |
| 989 | } |
| 990 | self.last_line_blank = false; |
| 991 | } |
| 992 | } |
| 993 | |
| 994 | /// Continue an existing list or start a new one if there's not an open |
| 995 | /// list that matches. |
| 996 | fn continue_list(&mut self, start: usize, ch: u8, index: u64) { |
| 997 | if let Some(node_ix) = self.tree.peek_up() { |
| 998 | if let ItemBody::List(ref mut is_tight, existing_ch, _) = self.tree[node_ix].item.body { |
| 999 | if existing_ch == ch { |
| 1000 | if self.last_line_blank { |
| 1001 | *is_tight = false; |
| 1002 | self.last_line_blank = false; |
| 1003 | } |
| 1004 | return; |
| 1005 | } |
| 1006 | } |
| 1007 | // TODO: this is not the best choice for end; maybe get end from last list item. |
| 1008 | self.finish_list(start); |
| 1009 | } |
| 1010 | self.tree.append(Item { |
| 1011 | start, |
| 1012 | end: 0, // will get set later |
| 1013 | body: ItemBody::List(true, ch, index), |
| 1014 | }); |
| 1015 | self.tree.push(); |
| 1016 | self.last_line_blank = false; |
| 1017 | } |
| 1018 | |
| 1019 | /// Parse a thematic break. |
| 1020 | /// |
| 1021 | /// Returns index of start of next line. |
| 1022 | fn parse_hrule(&mut self, hrule_size: usize, ix: usize) -> usize { |
| 1023 | self.tree.append(Item { |
| 1024 | start: ix, |
| 1025 | end: ix + hrule_size, |
| 1026 | body: ItemBody::Rule, |
| 1027 | }); |
| 1028 | ix + hrule_size |
| 1029 | } |
| 1030 | |
| 1031 | /// Parse an ATX heading. |
| 1032 | /// |
| 1033 | /// Returns index of start of next line. |
| 1034 | fn parse_atx_heading(&mut self, start: usize, atx_level: HeadingLevel) -> usize { |
| 1035 | let mut ix = start; |
| 1036 | let heading_ix = self.tree.append(Item { |
| 1037 | start, |
| 1038 | end: 0, // set later |
| 1039 | body: ItemBody::default(), // set later |
| 1040 | }); |
| 1041 | ix += atx_level as usize; |
| 1042 | // next char is space or eol (guaranteed by scan_atx_heading) |
| 1043 | let bytes = self.text.as_bytes(); |
| 1044 | if let Some(eol_bytes) = scan_eol(&bytes[ix..]) { |
| 1045 | self.tree[heading_ix].item.end = ix + eol_bytes; |
| 1046 | self.tree[heading_ix].item.body = ItemBody::Heading(atx_level, None); |
| 1047 | return ix + eol_bytes; |
| 1048 | } |
| 1049 | // skip leading spaces |
| 1050 | let skip_spaces = scan_whitespace_no_nl(&bytes[ix..]); |
| 1051 | ix += skip_spaces; |
| 1052 | |
| 1053 | // now handle the header text |
| 1054 | let header_start = ix; |
| 1055 | let header_node_idx = self.tree.push(); // so that we can set the endpoint later |
| 1056 | |
| 1057 | // trim the trailing attribute block before parsing the entire line, if necessary |
| 1058 | let (end, content_end, attrs) = if self.options.contains(Options::ENABLE_HEADING_ATTRIBUTES) |
| 1059 | { |
| 1060 | // the start of the next line is the end of the header since the |
| 1061 | // header cannot have line breaks |
| 1062 | let header_end = header_start + scan_nextline(&bytes[header_start..]); |
| 1063 | let (content_end, attrs) = |
| 1064 | self.extract_and_parse_heading_attribute_block(header_start, header_end); |
| 1065 | self.parse_line(ix, Some(content_end), TableParseMode::Disabled); |
| 1066 | (header_end, content_end, attrs) |
| 1067 | } else { |
| 1068 | ix = self.parse_line(ix, None, TableParseMode::Disabled).0; |
| 1069 | (ix, ix, None) |
| 1070 | }; |
| 1071 | self.tree[header_node_idx].item.end = end; |
| 1072 | |
| 1073 | // remove trailing matter from header text |
| 1074 | if let Some(cur_ix) = self.tree.cur() { |
| 1075 | // remove closing of the ATX heading |
| 1076 | let header_text = &bytes[header_start..content_end]; |
| 1077 | let mut limit = header_text |
| 1078 | .iter() |
| 1079 | .rposition(|&b| !(b == b' \n' || b == b' \r' || b == b' ' )) |
| 1080 | .map_or(0, |i| i + 1); |
| 1081 | let closer = header_text[..limit] |
| 1082 | .iter() |
| 1083 | .rposition(|&b| b != b'#' ) |
| 1084 | .map_or(0, |i| i + 1); |
| 1085 | if closer == 0 { |
| 1086 | limit = closer; |
| 1087 | } else { |
| 1088 | let spaces = scan_rev_while(&header_text[..closer], |b| b == b' ' ); |
| 1089 | if spaces > 0 { |
| 1090 | limit = closer - spaces; |
| 1091 | } |
| 1092 | } |
| 1093 | self.tree[cur_ix].item.end = limit + header_start; |
| 1094 | } |
| 1095 | |
| 1096 | self.tree.pop(); |
| 1097 | self.tree[heading_ix].item.body = ItemBody::Heading( |
| 1098 | atx_level, |
| 1099 | attrs.map(|attrs| self.allocs.allocate_heading(attrs)), |
| 1100 | ); |
| 1101 | end |
| 1102 | } |
| 1103 | |
| 1104 | fn get_footnote(&mut self, start: usize) -> Option<(usize, CowStr<'a>)> { |
| 1105 | let bytes = &self.text.as_bytes()[start..]; |
| 1106 | if !bytes.starts_with(b"[^" ) { |
| 1107 | return None; |
| 1108 | } |
| 1109 | let (mut i, label) = self.parse_refdef_label(start + 2)?; |
| 1110 | i += 2; |
| 1111 | if scan_ch(&bytes[i..], b':' ) == 0 { |
| 1112 | return None; |
| 1113 | } |
| 1114 | i += 1; |
| 1115 | Some((i, label)) |
| 1116 | } |
| 1117 | |
| 1118 | /// Returns the number of bytes scanned on success. |
| 1119 | fn parse_footnote(&mut self, start: usize) -> Option<usize> { |
| 1120 | let (i, label) = self.get_footnote(start)?; |
| 1121 | self.finish_list(start); |
| 1122 | self.tree.append(Item { |
| 1123 | start, |
| 1124 | end: 0, // will get set later |
| 1125 | // TODO: check whether the label here is strictly necessary |
| 1126 | body: ItemBody::FootnoteDefinition(self.allocs.allocate_cow(label)), |
| 1127 | }); |
| 1128 | self.tree.push(); |
| 1129 | Some(i) |
| 1130 | } |
| 1131 | |
| 1132 | /// Tries to parse a reference label, which can be interrupted by new blocks. |
| 1133 | /// On success, returns the number of bytes of the label and the label itself. |
| 1134 | fn parse_refdef_label(&self, start: usize) -> Option<(usize, CowStr<'a>)> { |
| 1135 | scan_link_label_rest(&self.text[start..], &|bytes| { |
| 1136 | let mut line_start = LineStart::new(bytes); |
| 1137 | let current_container = |
| 1138 | scan_containers(&self.tree, &mut line_start) == self.tree.spine_len(); |
| 1139 | let bytes_scanned = line_start.bytes_scanned(); |
| 1140 | let suffix = &bytes[bytes_scanned..]; |
| 1141 | if scan_paragraph_interrupt(suffix, current_container) { |
| 1142 | None |
| 1143 | } else { |
| 1144 | Some(bytes_scanned) |
| 1145 | } |
| 1146 | }) |
| 1147 | } |
| 1148 | |
| 1149 | /// Returns number of bytes scanned, label and definition on success. |
| 1150 | fn parse_refdef_total(&mut self, start: usize) -> Option<(usize, LinkLabel<'a>, LinkDef<'a>)> { |
| 1151 | let bytes = &self.text.as_bytes()[start..]; |
| 1152 | if scan_ch(bytes, b'[' ) == 0 { |
| 1153 | return None; |
| 1154 | } |
| 1155 | let (mut i, label) = self.parse_refdef_label(start + 1)?; |
| 1156 | i += 1; |
| 1157 | if scan_ch(&bytes[i..], b':' ) == 0 { |
| 1158 | return None; |
| 1159 | } |
| 1160 | i += 1; |
| 1161 | let (bytecount, link_def) = self.scan_refdef(start, start + i)?; |
| 1162 | Some((bytecount + i, UniCase::new(label), link_def)) |
| 1163 | } |
| 1164 | |
| 1165 | /// Returns number of bytes and number of newlines |
| 1166 | fn scan_refdef_space(&self, bytes: &[u8], mut i: usize) -> Option<(usize, usize)> { |
| 1167 | let mut newlines = 0; |
| 1168 | loop { |
| 1169 | let whitespaces = scan_whitespace_no_nl(&bytes[i..]); |
| 1170 | i += whitespaces; |
| 1171 | if let Some(eol_bytes) = scan_eol(&bytes[i..]) { |
| 1172 | i += eol_bytes; |
| 1173 | newlines += 1; |
| 1174 | if newlines > 1 { |
| 1175 | return None; |
| 1176 | } |
| 1177 | } else { |
| 1178 | break; |
| 1179 | } |
| 1180 | let mut line_start = LineStart::new(&bytes[i..]); |
| 1181 | if self.tree.spine_len() != scan_containers(&self.tree, &mut line_start) { |
| 1182 | return None; |
| 1183 | } |
| 1184 | i += line_start.bytes_scanned(); |
| 1185 | } |
| 1186 | Some((i, newlines)) |
| 1187 | } |
| 1188 | |
| 1189 | /// Returns # of bytes and definition. |
| 1190 | /// Assumes the label of the reference including colon has already been scanned. |
| 1191 | fn scan_refdef(&self, span_start: usize, start: usize) -> Option<(usize, LinkDef<'a>)> { |
| 1192 | let bytes = self.text.as_bytes(); |
| 1193 | |
| 1194 | // whitespace between label and url (including up to one newline) |
| 1195 | let (mut i, _newlines) = self.scan_refdef_space(bytes, start)?; |
| 1196 | |
| 1197 | // scan link dest |
| 1198 | let (dest_length, dest) = scan_link_dest(self.text, i, 1)?; |
| 1199 | if dest_length == 0 { |
| 1200 | return None; |
| 1201 | } |
| 1202 | let dest = unescape(dest); |
| 1203 | i += dest_length; |
| 1204 | |
| 1205 | // no title |
| 1206 | let mut backup = ( |
| 1207 | i - start, |
| 1208 | LinkDef { |
| 1209 | dest, |
| 1210 | title: None, |
| 1211 | span: span_start..i, |
| 1212 | }, |
| 1213 | ); |
| 1214 | |
| 1215 | // scan whitespace between dest and label |
| 1216 | let (mut i, newlines) = |
| 1217 | if let Some((new_i, mut newlines)) = self.scan_refdef_space(bytes, i) { |
| 1218 | if i == self.text.len() { |
| 1219 | newlines += 1; |
| 1220 | } |
| 1221 | if new_i == i && newlines == 0 { |
| 1222 | return None; |
| 1223 | } |
| 1224 | if newlines > 1 { |
| 1225 | return Some(backup); |
| 1226 | }; |
| 1227 | (new_i, newlines) |
| 1228 | } else { |
| 1229 | return Some(backup); |
| 1230 | }; |
| 1231 | |
| 1232 | // scan title |
| 1233 | // if this fails but newline == 1, return also a refdef without title |
| 1234 | if let Some((title_length, title)) = scan_refdef_title(&self.text[i..]) { |
| 1235 | i += title_length; |
| 1236 | backup.1.span = span_start..i; |
| 1237 | backup.1.title = Some(unescape(title)); |
| 1238 | } else if newlines > 0 { |
| 1239 | return Some(backup); |
| 1240 | } else { |
| 1241 | return None; |
| 1242 | }; |
| 1243 | |
| 1244 | // scan EOL |
| 1245 | if let Some(bytes) = scan_blank_line(&bytes[i..]) { |
| 1246 | backup.0 = i + bytes - start; |
| 1247 | Some(backup) |
| 1248 | } else if newlines > 0 { |
| 1249 | Some(backup) |
| 1250 | } else { |
| 1251 | None |
| 1252 | } |
| 1253 | } |
| 1254 | |
| 1255 | /// Extracts and parses a heading attribute block if exists. |
| 1256 | /// |
| 1257 | /// Returns `(end_offset_of_heading_content, (id, classes))`. |
| 1258 | /// |
| 1259 | /// If `header_end` is less than or equal to `header_start`, the given |
| 1260 | /// input is considered as empty. |
| 1261 | fn extract_and_parse_heading_attribute_block( |
| 1262 | &mut self, |
| 1263 | header_start: usize, |
| 1264 | header_end: usize, |
| 1265 | ) -> (usize, Option<HeadingAttributes<'a>>) { |
| 1266 | if !self.options.contains(Options::ENABLE_HEADING_ATTRIBUTES) { |
| 1267 | return (header_end, None); |
| 1268 | } |
| 1269 | |
| 1270 | // extract the trailing attribute block |
| 1271 | let header_bytes = &self.text.as_bytes()[header_start..header_end]; |
| 1272 | let (content_len, attr_block_range_rel) = |
| 1273 | extract_attribute_block_content_from_header_text(header_bytes); |
| 1274 | let content_end = header_start + content_len; |
| 1275 | let attrs = attr_block_range_rel.and_then(|r| { |
| 1276 | parse_inside_attribute_block( |
| 1277 | &self.text[(header_start + r.start)..(header_start + r.end)], |
| 1278 | ) |
| 1279 | }); |
| 1280 | (content_end, attrs) |
| 1281 | } |
| 1282 | } |
| 1283 | |
| 1284 | /// Scanning modes for `Parser`'s `parse_line` method. |
| 1285 | #[derive (PartialEq, Eq, Copy, Clone)] |
| 1286 | enum TableParseMode { |
| 1287 | /// Inside a paragraph, scanning for table headers. |
| 1288 | Scan, |
| 1289 | /// Inside a table. |
| 1290 | Active, |
| 1291 | /// Inside a paragraph, not scanning for table headers. |
| 1292 | Disabled, |
| 1293 | } |
| 1294 | |
| 1295 | /// Computes the number of header columns in a table line by computing the number of dividing pipes |
| 1296 | /// that aren't followed or preceded by whitespace. |
| 1297 | fn count_header_cols( |
| 1298 | bytes: &[u8], |
| 1299 | mut pipes: usize, |
| 1300 | mut start: usize, |
| 1301 | last_pipe_ix: usize, |
| 1302 | ) -> usize { |
| 1303 | // was first pipe preceded by whitespace? if so, subtract one |
| 1304 | start += scan_whitespace_no_nl(&bytes[start..]); |
| 1305 | if bytes[start] == b'|' { |
| 1306 | pipes -= 1; |
| 1307 | } |
| 1308 | |
| 1309 | // was last pipe followed by whitespace? if so, sub one |
| 1310 | if scan_blank_line(&bytes[(last_pipe_ix + 1)..]).is_some() { |
| 1311 | pipes |
| 1312 | } else { |
| 1313 | pipes + 1 |
| 1314 | } |
| 1315 | } |
| 1316 | |
| 1317 | /// Checks whether we should break a paragraph on the given input. |
| 1318 | fn scan_paragraph_interrupt(bytes: &[u8], current_container: bool) -> bool { |
| 1319 | scan_eol(bytes).is_some() |
| 1320 | || scan_hrule(bytes).is_ok() |
| 1321 | || scan_atx_heading(data:bytes).is_some() |
| 1322 | || scan_code_fence(data:bytes).is_some() |
| 1323 | || scan_blockquote_start(data:bytes).is_some() |
| 1324 | || scan_listitem(bytes).map_or(default:false, |(ix: usize, delim: u8, index: usize, _)| { |
| 1325 | ! current_container || |
| 1326 | // we don't allow interruption by either empty lists or |
| 1327 | // numbered lists starting at an index other than 1 |
| 1328 | (delim == b'*' || delim == b'-' || delim == b'+' || index == 1) |
| 1329 | && !scan_empty_list(&bytes[ix..]) |
| 1330 | }) |
| 1331 | || bytes.starts_with(needle:b"<" ) |
| 1332 | && (get_html_end_tag(&bytes[1..]).is_some() || starts_html_block_type_6(&bytes[1..])) |
| 1333 | } |
| 1334 | |
| 1335 | /// Assumes `text_bytes` is preceded by `<`. |
| 1336 | fn get_html_end_tag(text_bytes: &[u8]) -> Option<&'static str> { |
| 1337 | static BEGIN_TAGS: &[&[u8]; 4] = &[b"pre" , b"style" , b"script" , b"textarea" ]; |
| 1338 | static ST_BEGIN_TAGS: &[&[u8]; 3] = &[b"!--" , b"?" , b"![CDATA[" ]; |
| 1339 | |
| 1340 | for (beg_tag, end_tag) in BEGIN_TAGS |
| 1341 | .iter() |
| 1342 | .zip(["</pre>" , "</style>" , "</script>" , "</textarea>" ].iter()) |
| 1343 | { |
| 1344 | let tag_len = beg_tag.len(); |
| 1345 | |
| 1346 | if text_bytes.len() < tag_len { |
| 1347 | // begin tags are increasing in size |
| 1348 | break; |
| 1349 | } |
| 1350 | |
| 1351 | if !text_bytes[..tag_len].eq_ignore_ascii_case(beg_tag) { |
| 1352 | continue; |
| 1353 | } |
| 1354 | |
| 1355 | // Must either be the end of the line... |
| 1356 | if text_bytes.len() == tag_len { |
| 1357 | return Some(end_tag); |
| 1358 | } |
| 1359 | |
| 1360 | // ...or be followed by whitespace, newline, or '>'. |
| 1361 | let s = text_bytes[tag_len]; |
| 1362 | if is_ascii_whitespace(s) || s == b'>' { |
| 1363 | return Some(end_tag); |
| 1364 | } |
| 1365 | } |
| 1366 | |
| 1367 | for (beg_tag, end_tag) in ST_BEGIN_TAGS.iter().zip(["-->" , "?>" , "]]>" ].iter()) { |
| 1368 | if text_bytes.starts_with(beg_tag) { |
| 1369 | return Some(end_tag); |
| 1370 | } |
| 1371 | } |
| 1372 | |
| 1373 | if text_bytes.len() > 1 |
| 1374 | && text_bytes[0] == b'!' |
| 1375 | && text_bytes[1] >= b'A' |
| 1376 | && text_bytes[1] <= b'Z' |
| 1377 | { |
| 1378 | Some(">" ) |
| 1379 | } else { |
| 1380 | None |
| 1381 | } |
| 1382 | } |
| 1383 | |
| 1384 | // https://english.stackexchange.com/a/285573 |
| 1385 | fn surgerize_tight_list(tree: &mut Tree<Item>, list_ix: TreeIndex) { |
| 1386 | let mut list_item = tree[list_ix].child; |
| 1387 | while let Some(listitem_ix) = list_item { |
| 1388 | // first child is special, controls how we repoint list_item.child |
| 1389 | let list_item_firstborn = tree[listitem_ix].child; |
| 1390 | |
| 1391 | // Check that list item has children - this is not necessarily the case! |
| 1392 | if let Some(firstborn_ix) = list_item_firstborn { |
| 1393 | if let ItemBody::Paragraph = tree[firstborn_ix].item.body { |
| 1394 | tree[listitem_ix].child = tree[firstborn_ix].child; |
| 1395 | } |
| 1396 | |
| 1397 | let mut list_item_child = Some(firstborn_ix); |
| 1398 | let mut node_to_repoint = None; |
| 1399 | while let Some(child_ix) = list_item_child { |
| 1400 | // surgerize paragraphs |
| 1401 | let repoint_ix = if let ItemBody::Paragraph = tree[child_ix].item.body { |
| 1402 | if let Some(child_firstborn) = tree[child_ix].child { |
| 1403 | if let Some(repoint_ix) = node_to_repoint { |
| 1404 | tree[repoint_ix].next = Some(child_firstborn); |
| 1405 | } |
| 1406 | let mut child_lastborn = child_firstborn; |
| 1407 | while let Some(lastborn_next_ix) = tree[child_lastborn].next { |
| 1408 | child_lastborn = lastborn_next_ix; |
| 1409 | } |
| 1410 | child_lastborn |
| 1411 | } else { |
| 1412 | child_ix |
| 1413 | } |
| 1414 | } else { |
| 1415 | child_ix |
| 1416 | }; |
| 1417 | |
| 1418 | node_to_repoint = Some(repoint_ix); |
| 1419 | tree[repoint_ix].next = tree[child_ix].next; |
| 1420 | list_item_child = tree[child_ix].next; |
| 1421 | } |
| 1422 | } |
| 1423 | |
| 1424 | list_item = tree[listitem_ix].next; |
| 1425 | } |
| 1426 | } |
| 1427 | |
| 1428 | /// Determines whether the delimiter run starting at given index is |
| 1429 | /// left-flanking, as defined by the commonmark spec (and isn't intraword |
| 1430 | /// for _ delims). |
| 1431 | /// suffix is &s[ix..], which is passed in as an optimization, since taking |
| 1432 | /// a string subslice is O(n). |
| 1433 | fn delim_run_can_open(s: &str, suffix: &str, run_len: usize, ix: usize) -> bool { |
| 1434 | let next_char: char = if let Some(c: char) = suffix.chars().nth(run_len) { |
| 1435 | c |
| 1436 | } else { |
| 1437 | return false; |
| 1438 | }; |
| 1439 | if next_char.is_whitespace() { |
| 1440 | return false; |
| 1441 | } |
| 1442 | if ix == 0 { |
| 1443 | return true; |
| 1444 | } |
| 1445 | let delim: char = suffix.chars().next().unwrap(); |
| 1446 | if delim == '*' && !is_punctuation(next_char) { |
| 1447 | return true; |
| 1448 | } |
| 1449 | |
| 1450 | let prev_char: char = s[..ix].chars().last().unwrap(); |
| 1451 | |
| 1452 | prev_char.is_whitespace() |
| 1453 | || is_punctuation(prev_char) && (delim != ' \'' || ![']' , ')' ].contains(&prev_char)) |
| 1454 | } |
| 1455 | |
| 1456 | /// Determines whether the delimiter run starting at given index is |
| 1457 | /// left-flanking, as defined by the commonmark spec (and isn't intraword |
| 1458 | /// for _ delims) |
| 1459 | fn delim_run_can_close(s: &str, suffix: &str, run_len: usize, ix: usize) -> bool { |
| 1460 | if ix == 0 { |
| 1461 | return false; |
| 1462 | } |
| 1463 | let prev_char: char = s[..ix].chars().last().unwrap(); |
| 1464 | if prev_char.is_whitespace() { |
| 1465 | return false; |
| 1466 | } |
| 1467 | let next_char: char = if let Some(c: char) = suffix.chars().nth(run_len) { |
| 1468 | c |
| 1469 | } else { |
| 1470 | return true; |
| 1471 | }; |
| 1472 | let delim: char = suffix.chars().next().unwrap(); |
| 1473 | if delim == '*' && !is_punctuation(prev_char) { |
| 1474 | return true; |
| 1475 | } |
| 1476 | |
| 1477 | next_char.is_whitespace() || is_punctuation(next_char) |
| 1478 | } |
| 1479 | |
| 1480 | fn create_lut(options: &Options) -> LookupTable { |
| 1481 | #[cfg (all(target_arch = "x86_64" , feature = "simd" ))] |
| 1482 | { |
| 1483 | LookupTable { |
| 1484 | simd: simd::compute_lookup(options), |
| 1485 | scalar: special_bytes(options), |
| 1486 | } |
| 1487 | } |
| 1488 | #[cfg (not(all(target_arch = "x86_64" , feature = "simd" )))] |
| 1489 | { |
| 1490 | special_bytes(options) |
| 1491 | } |
| 1492 | } |
| 1493 | |
| 1494 | fn special_bytes(options: &Options) -> [bool; 256] { |
| 1495 | let mut bytes: [bool; 256] = [false; 256]; |
| 1496 | let standard_bytes: [u8; 11] = [ |
| 1497 | b' \n' , b' \r' , b'*' , b'_' , b'&' , b' \\' , b'[' , b']' , b'<' , b'!' , b'`' , |
| 1498 | ]; |
| 1499 | |
| 1500 | for &byte: u8 in &standard_bytes { |
| 1501 | bytes[byte as usize] = true; |
| 1502 | } |
| 1503 | if options.contains(Options::ENABLE_TABLES) { |
| 1504 | bytes[b'|' as usize] = true; |
| 1505 | } |
| 1506 | if options.contains(Options::ENABLE_STRIKETHROUGH) { |
| 1507 | bytes[b'~' as usize] = true; |
| 1508 | } |
| 1509 | if options.contains(Options::ENABLE_SMART_PUNCTUATION) { |
| 1510 | for &byte: u8 in &[b'.' , b'-' , b'"' , b' \'' ] { |
| 1511 | bytes[byte as usize] = true; |
| 1512 | } |
| 1513 | } |
| 1514 | |
| 1515 | bytes |
| 1516 | } |
| 1517 | |
| 1518 | enum LoopInstruction<T> { |
| 1519 | /// Continue looking for more special bytes, but skip next few bytes. |
| 1520 | ContinueAndSkip(usize), |
| 1521 | /// Break looping immediately, returning with the given index and value. |
| 1522 | BreakAtWith(usize, T), |
| 1523 | } |
| 1524 | |
| 1525 | #[cfg (all(target_arch = "x86_64" , feature = "simd" ))] |
| 1526 | struct LookupTable { |
| 1527 | simd: [u8; 16], |
| 1528 | scalar: [bool; 256], |
| 1529 | } |
| 1530 | |
| 1531 | #[cfg (not(all(target_arch = "x86_64" , feature = "simd" )))] |
| 1532 | type LookupTable = [bool; 256]; |
| 1533 | |
| 1534 | /// This function walks the byte slices from the given index and |
| 1535 | /// calls the callback function on all bytes (and their indices) that are in the following set: |
| 1536 | /// `` ` ``, `\`, `&`, `*`, `_`, `~`, `!`, `<`, `[`, `]`, `|`, `\r`, `\n` |
| 1537 | /// It is guaranteed not call the callback on other bytes. |
| 1538 | /// Whenever `callback(ix, byte)` returns a `ContinueAndSkip(n)` value, the callback |
| 1539 | /// will not be called with an index that is less than `ix + n + 1`. |
| 1540 | /// When the callback returns a `BreakAtWith(end_ix, opt+val)`, no more callbacks will be |
| 1541 | /// called and the function returns immediately with the return value `(end_ix, opt_val)`. |
| 1542 | /// If `BreakAtWith(..)` is never returned, this function will return the first |
| 1543 | /// index that is outside the byteslice bound and a `None` value. |
| 1544 | fn iterate_special_bytes<F, T>( |
| 1545 | lut: &LookupTable, |
| 1546 | bytes: &[u8], |
| 1547 | ix: usize, |
| 1548 | callback: F, |
| 1549 | ) -> (usize, Option<T>) |
| 1550 | where |
| 1551 | F: FnMut(usize, u8) -> LoopInstruction<Option<T>>, |
| 1552 | { |
| 1553 | #[cfg (all(target_arch = "x86_64" , feature = "simd" ))] |
| 1554 | { |
| 1555 | simd::iterate_special_bytes(lut, bytes, ix, callback) |
| 1556 | } |
| 1557 | #[cfg (not(all(target_arch = "x86_64" , feature = "simd" )))] |
| 1558 | { |
| 1559 | scalar_iterate_special_bytes(lut, bytes, ix, callback) |
| 1560 | } |
| 1561 | } |
| 1562 | |
| 1563 | fn scalar_iterate_special_bytes<F, T>( |
| 1564 | lut: &[bool; 256], |
| 1565 | bytes: &[u8], |
| 1566 | mut ix: usize, |
| 1567 | mut callback: F, |
| 1568 | ) -> (usize, Option<T>) |
| 1569 | where |
| 1570 | F: FnMut(usize, u8) -> LoopInstruction<Option<T>>, |
| 1571 | { |
| 1572 | while ix < bytes.len() { |
| 1573 | let b: u8 = bytes[ix]; |
| 1574 | if lut[b as usize] { |
| 1575 | match callback(ix, b) { |
| 1576 | LoopInstruction::ContinueAndSkip(skip: usize) => { |
| 1577 | ix += skip; |
| 1578 | } |
| 1579 | LoopInstruction::BreakAtWith(ix: usize, val: Option) => { |
| 1580 | return (ix, val); |
| 1581 | } |
| 1582 | } |
| 1583 | } |
| 1584 | ix += 1; |
| 1585 | } |
| 1586 | |
| 1587 | (ix, None) |
| 1588 | } |
| 1589 | |
| 1590 | /// Split the usual heading content range and the content inside the trailing attribute block. |
| 1591 | /// |
| 1592 | /// Returns `(leading_content_len, Option<trailing_attr_block_range>)`. |
| 1593 | /// |
| 1594 | /// Note that `trailing_attr_block_range` will be empty range when the block |
| 1595 | /// is `{}`, since the range is content inside the wrapping `{` and `}`. |
| 1596 | /// |
| 1597 | /// The closing `}` of an attribute block can have trailing whitespaces. |
| 1598 | /// They are automatically trimmed when the attribute block is being searched. |
| 1599 | /// |
| 1600 | /// However, this method does not trim the trailing whitespaces of heading content. |
| 1601 | /// It is callers' responsibility to trim them if necessary. |
| 1602 | fn extract_attribute_block_content_from_header_text( |
| 1603 | heading: &[u8], |
| 1604 | ) -> (usize, Option<Range<usize>>) { |
| 1605 | let heading_len = heading.len(); |
| 1606 | let mut ix = heading_len; |
| 1607 | ix -= scan_rev_while(heading, |b| { |
| 1608 | b == b' \n' || b == b' \r' || b == b' ' || b == b' \t' |
| 1609 | }); |
| 1610 | if ix == 0 { |
| 1611 | return (heading_len, None); |
| 1612 | } |
| 1613 | |
| 1614 | let attr_block_close = ix - 1; |
| 1615 | if heading.get(attr_block_close) != Some(&b'}' ) { |
| 1616 | // The last character is not `}`. No attribute blocks found. |
| 1617 | return (heading_len, None); |
| 1618 | } |
| 1619 | // move cursor before the closing right brace (`}`) |
| 1620 | ix -= 1; |
| 1621 | |
| 1622 | ix -= scan_rev_while(&heading[..ix], |b| { |
| 1623 | // Characters to be excluded: |
| 1624 | // * `{` and `}`: special characters to open and close an attribute block. |
| 1625 | // * `\\`: a special character to escape many characters and disable some syntaxes. |
| 1626 | // + Handling of this escape character differs among markdown processors. |
| 1627 | // + Escaped characters will be separate text node from neighbors, so |
| 1628 | // it is not easy to handle unescaped string and trim the trailing block. |
| 1629 | // * `<` and `>`: special characters to start and end HTML tag. |
| 1630 | // + No known processors converts `{#<i>foo</i>}` into |
| 1631 | // `id="<i>foo</>"` as of this writing, so hopefully |
| 1632 | // this restriction won't cause compatibility issues. |
| 1633 | // * `\n` and `\r`: a newline character. |
| 1634 | // + Setext heading can have multiple lines. However it is hard to support |
| 1635 | // attribute blocks that have newline inside, since the parsing proceeds line by |
| 1636 | // line and lines will be separate nodes even they are logically a single text. |
| 1637 | !matches!(b, b'{' | b'}' | b'<' | b'>' | b' \\' | b' \n' | b' \r' ) |
| 1638 | }); |
| 1639 | if ix == 0 { |
| 1640 | // `{` is not found. No attribute blocks available. |
| 1641 | return (heading_len, None); |
| 1642 | } |
| 1643 | let attr_block_open = ix - 1; |
| 1644 | if heading[attr_block_open] != b'{' { |
| 1645 | // `{` is not found. No attribute blocks available. |
| 1646 | return (heading_len, None); |
| 1647 | } |
| 1648 | |
| 1649 | (attr_block_open, Some(ix..attr_block_close)) |
| 1650 | } |
| 1651 | |
| 1652 | /// Parses an attribute block content, such as `.class1 #id .class2`. |
| 1653 | /// |
| 1654 | /// Returns `(id, classes)`. |
| 1655 | /// |
| 1656 | /// It is callers' responsibility to find opening and closing characters of the attribute |
| 1657 | /// block. Usually [`extract_attribute_block_content_from_header_text`] function does it for you. |
| 1658 | /// |
| 1659 | /// Note that this parsing requires explicit whitespace separators between |
| 1660 | /// attributes. This is intentional design with the reasons below: |
| 1661 | /// |
| 1662 | /// * to keep conversion simple and easy to understand for any possible input, |
| 1663 | /// * to avoid adding less obvious conversion rule that can reduce compatibility |
| 1664 | /// with other implementations more, and |
| 1665 | /// * to follow the major design of implementations with the support for the |
| 1666 | /// attribute blocks extension (as of this writing). |
| 1667 | /// |
| 1668 | /// See also: [`Options::ENABLE_HEADING_ATTRIBUTES`]. |
| 1669 | /// |
| 1670 | /// [`Options::ENABLE_HEADING_ATTRIBUTES`]: `crate::Options::ENABLE_HEADING_ATTRIBUTES` |
| 1671 | fn parse_inside_attribute_block(inside_attr_block: &str) -> Option<HeadingAttributes> { |
| 1672 | let mut id: Option<&str> = None; |
| 1673 | let mut classes: Vec<&str> = Vec::new(); |
| 1674 | |
| 1675 | for attr: &str in inside_attr_block.split_ascii_whitespace() { |
| 1676 | // iterator returned by `str::split_ascii_whitespace` never emits empty |
| 1677 | // strings, so taking first byte won't panic. |
| 1678 | if attr.len() > 1 { |
| 1679 | let first_byte: u8 = attr.as_bytes()[0]; |
| 1680 | if first_byte == b'#' { |
| 1681 | id = Some(&attr[1..]); |
| 1682 | } else if first_byte == b'.' { |
| 1683 | classes.push(&attr[1..]); |
| 1684 | } |
| 1685 | } |
| 1686 | } |
| 1687 | |
| 1688 | Some(HeadingAttributes { id, classes }) |
| 1689 | } |
| 1690 | |
| 1691 | #[cfg (all(target_arch = "x86_64" , feature = "simd" ))] |
| 1692 | mod simd { |
| 1693 | //! SIMD byte scanning logic. |
| 1694 | //! |
| 1695 | //! This module provides functions that allow walking through byteslices, calling |
| 1696 | //! provided callback functions on special bytes and their indices using SIMD. |
| 1697 | //! The byteset is defined in `compute_lookup`. |
| 1698 | //! |
| 1699 | //! The idea is to load in a chunk of 16 bytes and perform a lookup into a set of |
| 1700 | //! bytes on all the bytes in this chunk simultaneously. We produce a 16 bit bitmask |
| 1701 | //! from this and call the callback on every index corresponding to a 1 in this mask |
| 1702 | //! before moving on to the next chunk. This allows us to move quickly when there |
| 1703 | //! are no or few matches. |
| 1704 | //! |
| 1705 | //! The table lookup is inspired by this [great overview]. However, since all of the |
| 1706 | //! bytes we're interested in are ASCII, we don't quite need the full generality of |
| 1707 | //! the universal algorithm and are hence able to skip a few instructions. |
| 1708 | //! |
| 1709 | //! [great overview]: http://0x80.pl/articles/simd-byte-lookup.html |
| 1710 | |
| 1711 | use super::{LookupTable, LoopInstruction}; |
| 1712 | use crate::Options; |
| 1713 | use core::arch::x86_64::*; |
| 1714 | |
| 1715 | const VECTOR_SIZE: usize = std::mem::size_of::<__m128i>(); |
| 1716 | |
| 1717 | /// Generates a lookup table containing the bitmaps for our |
| 1718 | /// special marker bytes. This is effectively a 128 element 2d bitvector, |
| 1719 | /// that can be indexed by a four bit row index (the lower nibble) |
| 1720 | /// and a three bit column index (upper nibble). |
| 1721 | pub(super) fn compute_lookup(options: &Options) -> [u8; 16] { |
| 1722 | let mut lookup = [0u8; 16]; |
| 1723 | let standard_bytes = [ |
| 1724 | b' \n' , b' \r' , b'*' , b'_' , b'&' , b' \\' , b'[' , b']' , b'<' , b'!' , b'`' , |
| 1725 | ]; |
| 1726 | |
| 1727 | for &byte in &standard_bytes { |
| 1728 | add_lookup_byte(&mut lookup, byte); |
| 1729 | } |
| 1730 | if options.contains(Options::ENABLE_TABLES) { |
| 1731 | add_lookup_byte(&mut lookup, b'|' ); |
| 1732 | } |
| 1733 | if options.contains(Options::ENABLE_STRIKETHROUGH) { |
| 1734 | add_lookup_byte(&mut lookup, b'~' ); |
| 1735 | } |
| 1736 | if options.contains(Options::ENABLE_SMART_PUNCTUATION) { |
| 1737 | for &byte in &[b'.' , b'-' , b'"' , b' \'' ] { |
| 1738 | add_lookup_byte(&mut lookup, byte); |
| 1739 | } |
| 1740 | } |
| 1741 | |
| 1742 | lookup |
| 1743 | } |
| 1744 | |
| 1745 | fn add_lookup_byte(lookup: &mut [u8; 16], byte: u8) { |
| 1746 | lookup[(byte & 0x0f) as usize] |= 1 << (byte >> 4); |
| 1747 | } |
| 1748 | |
| 1749 | /// Computes a bit mask for the given byteslice starting from the given index, |
| 1750 | /// where the 16 least significant bits indicate (by value of 1) whether or not |
| 1751 | /// there is a special character at that byte position. The least significant bit |
| 1752 | /// corresponds to `bytes[ix]` and the most significant bit corresponds to |
| 1753 | /// `bytes[ix + 15]`. |
| 1754 | /// It is only safe to call this function when `bytes.len() >= ix + VECTOR_SIZE`. |
| 1755 | #[target_feature (enable = "ssse3" )] |
| 1756 | #[inline ] |
| 1757 | unsafe fn compute_mask(lut: &[u8; 16], bytes: &[u8], ix: usize) -> i32 { |
| 1758 | debug_assert!(bytes.len() >= ix + VECTOR_SIZE); |
| 1759 | |
| 1760 | let bitmap = _mm_loadu_si128(lut.as_ptr() as *const __m128i); |
| 1761 | // Small lookup table to compute single bit bitshifts |
| 1762 | // for 16 bytes at once. |
| 1763 | let bitmask_lookup = |
| 1764 | _mm_setr_epi8(1, 2, 4, 8, 16, 32, 64, -128, -1, -1, -1, -1, -1, -1, -1, -1); |
| 1765 | |
| 1766 | // Load input from memory. |
| 1767 | let raw_ptr = bytes.as_ptr().add(ix) as *const __m128i; |
| 1768 | let input = _mm_loadu_si128(raw_ptr); |
| 1769 | // Compute the bitmap using the bottom nibble as an index |
| 1770 | // into the lookup table. Note that non-ascii bytes will have |
| 1771 | // their most significant bit set and will map to lookup[0]. |
| 1772 | let bitset = _mm_shuffle_epi8(bitmap, input); |
| 1773 | // Compute the high nibbles of the input using a 16-bit rightshift of four |
| 1774 | // and a mask to prevent most-significant bit issues. |
| 1775 | let higher_nibbles = _mm_and_si128(_mm_srli_epi16(input, 4), _mm_set1_epi8(0x0f)); |
| 1776 | // Create a bitmask for the bitmap by perform a left shift of the value |
| 1777 | // of the higher nibble. Bytes with their most significant set are mapped |
| 1778 | // to -1 (all ones). |
| 1779 | let bitmask = _mm_shuffle_epi8(bitmask_lookup, higher_nibbles); |
| 1780 | // Test the bit of the bitmap by AND'ing the bitmap and the mask together. |
| 1781 | let tmp = _mm_and_si128(bitset, bitmask); |
| 1782 | // Check whether the result was not null. NEQ is not a SIMD intrinsic, |
| 1783 | // but comparing to the bitmask is logically equivalent. This also prevents us |
| 1784 | // from matching any non-ASCII bytes since none of the bitmaps were all ones |
| 1785 | // (-1). |
| 1786 | let result = _mm_cmpeq_epi8(tmp, bitmask); |
| 1787 | |
| 1788 | // Return the resulting bitmask. |
| 1789 | _mm_movemask_epi8(result) |
| 1790 | } |
| 1791 | |
| 1792 | /// Calls callback on byte indices and their value. |
| 1793 | /// Breaks when callback returns LoopInstruction::BreakAtWith(ix, val). And skips the |
| 1794 | /// number of bytes in callback return value otherwise. |
| 1795 | /// Returns the final index and a possible break value. |
| 1796 | pub(super) fn iterate_special_bytes<F, T>( |
| 1797 | lut: &LookupTable, |
| 1798 | bytes: &[u8], |
| 1799 | ix: usize, |
| 1800 | callback: F, |
| 1801 | ) -> (usize, Option<T>) |
| 1802 | where |
| 1803 | F: FnMut(usize, u8) -> LoopInstruction<Option<T>>, |
| 1804 | { |
| 1805 | if is_x86_feature_detected!("ssse3" ) && bytes.len() >= VECTOR_SIZE { |
| 1806 | unsafe { simd_iterate_special_bytes(&lut.simd, bytes, ix, callback) } |
| 1807 | } else { |
| 1808 | super::scalar_iterate_special_bytes(&lut.scalar, bytes, ix, callback) |
| 1809 | } |
| 1810 | } |
| 1811 | |
| 1812 | /// Calls the callback function for every 1 in the given bitmask with |
| 1813 | /// the index `offset + ix`, where `ix` is the position of the 1 in the mask. |
| 1814 | /// Returns `Ok(ix)` to continue from index `ix`, `Err((end_ix, opt_val)` to break with |
| 1815 | /// final index `end_ix` and optional value `opt_val`. |
| 1816 | unsafe fn process_mask<F, T>( |
| 1817 | mut mask: i32, |
| 1818 | bytes: &[u8], |
| 1819 | mut offset: usize, |
| 1820 | callback: &mut F, |
| 1821 | ) -> Result<usize, (usize, Option<T>)> |
| 1822 | where |
| 1823 | F: FnMut(usize, u8) -> LoopInstruction<Option<T>>, |
| 1824 | { |
| 1825 | while mask != 0 { |
| 1826 | let mask_ix = mask.trailing_zeros() as usize; |
| 1827 | offset += mask_ix; |
| 1828 | match callback(offset, *bytes.get_unchecked(offset)) { |
| 1829 | LoopInstruction::ContinueAndSkip(skip) => { |
| 1830 | offset += skip + 1; |
| 1831 | mask >>= skip + 1 + mask_ix; |
| 1832 | } |
| 1833 | LoopInstruction::BreakAtWith(ix, val) => return Err((ix, val)), |
| 1834 | } |
| 1835 | } |
| 1836 | Ok(offset) |
| 1837 | } |
| 1838 | |
| 1839 | #[target_feature (enable = "ssse3" )] |
| 1840 | /// Important: only call this function when `bytes.len() >= 16`. Doing |
| 1841 | /// so otherwise may exhibit undefined behaviour. |
| 1842 | unsafe fn simd_iterate_special_bytes<F, T>( |
| 1843 | lut: &[u8; 16], |
| 1844 | bytes: &[u8], |
| 1845 | mut ix: usize, |
| 1846 | mut callback: F, |
| 1847 | ) -> (usize, Option<T>) |
| 1848 | where |
| 1849 | F: FnMut(usize, u8) -> LoopInstruction<Option<T>>, |
| 1850 | { |
| 1851 | debug_assert!(bytes.len() >= VECTOR_SIZE); |
| 1852 | let upperbound = bytes.len() - VECTOR_SIZE; |
| 1853 | |
| 1854 | while ix < upperbound { |
| 1855 | let mask = compute_mask(lut, bytes, ix); |
| 1856 | let block_start = ix; |
| 1857 | ix = match process_mask(mask, bytes, ix, &mut callback) { |
| 1858 | Ok(ix) => std::cmp::max(ix, VECTOR_SIZE + block_start), |
| 1859 | Err((end_ix, val)) => return (end_ix, val), |
| 1860 | }; |
| 1861 | } |
| 1862 | |
| 1863 | if bytes.len() > ix { |
| 1864 | // shift off the bytes at start we have already scanned |
| 1865 | let mask = compute_mask(lut, bytes, upperbound) >> ix - upperbound; |
| 1866 | if let Err((end_ix, val)) = process_mask(mask, bytes, ix, &mut callback) { |
| 1867 | return (end_ix, val); |
| 1868 | } |
| 1869 | } |
| 1870 | |
| 1871 | (bytes.len(), None) |
| 1872 | } |
| 1873 | |
| 1874 | #[cfg (test)] |
| 1875 | mod simd_test { |
| 1876 | use super::super::create_lut; |
| 1877 | use super::{iterate_special_bytes, LoopInstruction}; |
| 1878 | use crate::Options; |
| 1879 | |
| 1880 | fn check_expected_indices(bytes: &[u8], expected: &[usize], skip: usize) { |
| 1881 | let mut opts = Options::empty(); |
| 1882 | opts.insert(Options::ENABLE_TABLES); |
| 1883 | opts.insert(Options::ENABLE_FOOTNOTES); |
| 1884 | opts.insert(Options::ENABLE_STRIKETHROUGH); |
| 1885 | opts.insert(Options::ENABLE_TASKLISTS); |
| 1886 | |
| 1887 | let lut = create_lut(&opts); |
| 1888 | let mut indices = vec![]; |
| 1889 | |
| 1890 | iterate_special_bytes::<_, i32>(&lut, bytes, 0, |ix, _byte_ty| { |
| 1891 | indices.push(ix); |
| 1892 | LoopInstruction::ContinueAndSkip(skip) |
| 1893 | }); |
| 1894 | |
| 1895 | assert_eq!(&indices[..], expected); |
| 1896 | } |
| 1897 | |
| 1898 | #[test ] |
| 1899 | fn simple_no_match() { |
| 1900 | check_expected_indices("abcdef0123456789" .as_bytes(), &[], 0); |
| 1901 | } |
| 1902 | |
| 1903 | #[test ] |
| 1904 | fn simple_match() { |
| 1905 | check_expected_indices("*bcd&f0123456789" .as_bytes(), &[0, 4], 0); |
| 1906 | } |
| 1907 | |
| 1908 | #[test ] |
| 1909 | fn single_open_fish() { |
| 1910 | check_expected_indices("<" .as_bytes(), &[0], 0); |
| 1911 | } |
| 1912 | |
| 1913 | #[test ] |
| 1914 | fn long_match() { |
| 1915 | check_expected_indices("0123456789abcde~*bcd&f0" .as_bytes(), &[15, 16, 20], 0); |
| 1916 | } |
| 1917 | |
| 1918 | #[test ] |
| 1919 | fn border_skip() { |
| 1920 | check_expected_indices("0123456789abcde~~~~d&f0" .as_bytes(), &[15, 20], 3); |
| 1921 | } |
| 1922 | |
| 1923 | #[test ] |
| 1924 | fn exhaustive_search() { |
| 1925 | let chars = [ |
| 1926 | b' \n' , b' \r' , b'*' , b'_' , b'~' , b'|' , b'&' , b' \\' , b'[' , b']' , b'<' , b'!' , b'`' , |
| 1927 | ]; |
| 1928 | |
| 1929 | for &c in &chars { |
| 1930 | for i in 0u8..=255 { |
| 1931 | if !chars.contains(&i) { |
| 1932 | // full match |
| 1933 | let mut buf = [i; 18]; |
| 1934 | buf[3] = c; |
| 1935 | buf[6] = c; |
| 1936 | |
| 1937 | check_expected_indices(&buf[..], &[3, 6], 0); |
| 1938 | } |
| 1939 | } |
| 1940 | } |
| 1941 | } |
| 1942 | } |
| 1943 | } |
| 1944 | |