1 | //! The first pass resolves all block structure, generating an AST. Within a block, items |
2 | //! are in a linear chain with potential inline markup identified. |
3 | |
4 | use std::cmp::max; |
5 | use std::ops::Range; |
6 | |
7 | use crate::parse::{scan_containers, Allocations, HeadingAttributes, Item, ItemBody, LinkDef}; |
8 | use crate::scanners::*; |
9 | use crate::strings::CowStr; |
10 | use crate::tree::{Tree, TreeIndex}; |
11 | use crate::Options; |
12 | use crate::{ |
13 | linklabel::{scan_link_label_rest, LinkLabel}, |
14 | HeadingLevel, |
15 | }; |
16 | |
17 | use unicase::UniCase; |
18 | |
19 | /// Runs the first pass, which resolves the block structure of the document, |
20 | /// and returns the resulting tree. |
21 | pub(crate) fn run_first_pass(text: &str, options: Options) -> (Tree<Item>, Allocations) { |
22 | // This is a very naive heuristic for the number of nodes |
23 | // we'll need. |
24 | let start_capacity: usize = max(v1:128, v2:text.len() / 32); |
25 | let lookup_table: &[bool; 256] = &create_lut(&options); |
26 | let first_pass: FirstPass<'_, '_> = FirstPass { |
27 | text, |
28 | tree: Tree::with_capacity(cap:start_capacity), |
29 | begin_list_item: false, |
30 | last_line_blank: false, |
31 | allocs: Allocations::new(), |
32 | options, |
33 | lookup_table, |
34 | }; |
35 | first_pass.run() |
36 | } |
37 | |
38 | /// State for the first parsing pass. |
39 | struct FirstPass<'a, 'b> { |
40 | text: &'a str, |
41 | tree: Tree<Item>, |
42 | begin_list_item: bool, |
43 | last_line_blank: bool, |
44 | allocs: Allocations<'a>, |
45 | options: Options, |
46 | lookup_table: &'b LookupTable, |
47 | } |
48 | |
49 | impl<'a, 'b> FirstPass<'a, 'b> { |
50 | fn run(mut self) -> (Tree<Item>, Allocations<'a>) { |
51 | let mut ix = 0; |
52 | while ix < self.text.len() { |
53 | ix = self.parse_block(ix); |
54 | } |
55 | for _ in 0..self.tree.spine_len() { |
56 | self.pop(ix); |
57 | } |
58 | (self.tree, self.allocs) |
59 | } |
60 | |
61 | /// Returns offset after block. |
62 | fn parse_block(&mut self, mut start_ix: usize) -> usize { |
63 | let bytes = self.text.as_bytes(); |
64 | let mut line_start = LineStart::new(&bytes[start_ix..]); |
65 | |
66 | let i = scan_containers(&self.tree, &mut line_start); |
67 | for _ in i..self.tree.spine_len() { |
68 | self.pop(start_ix); |
69 | } |
70 | |
71 | if self.options.contains(Options::ENABLE_FOOTNOTES) { |
72 | // finish footnote if it's still open and was preceded by blank line |
73 | if let Some(node_ix) = self.tree.peek_up() { |
74 | if let ItemBody::FootnoteDefinition(..) = self.tree[node_ix].item.body { |
75 | if self.last_line_blank { |
76 | self.pop(start_ix); |
77 | } |
78 | } |
79 | } |
80 | |
81 | // Footnote definitions of the form |
82 | // [^bar]: |
83 | // * anything really |
84 | let container_start = start_ix + line_start.bytes_scanned(); |
85 | if let Some(bytecount) = self.parse_footnote(container_start) { |
86 | start_ix = container_start + bytecount; |
87 | start_ix += scan_blank_line(&bytes[start_ix..]).unwrap_or(0); |
88 | line_start = LineStart::new(&bytes[start_ix..]); |
89 | } |
90 | } |
91 | |
92 | // Process new containers |
93 | loop { |
94 | let container_start = start_ix + line_start.bytes_scanned(); |
95 | if let Some((ch, index, indent)) = line_start.scan_list_marker() { |
96 | let after_marker_index = start_ix + line_start.bytes_scanned(); |
97 | self.continue_list(container_start, ch, index); |
98 | self.tree.append(Item { |
99 | start: container_start, |
100 | end: after_marker_index, // will get updated later if item not empty |
101 | body: ItemBody::ListItem(indent), |
102 | }); |
103 | self.tree.push(); |
104 | if let Some(n) = scan_blank_line(&bytes[after_marker_index..]) { |
105 | self.begin_list_item = true; |
106 | return after_marker_index + n; |
107 | } |
108 | if self.options.contains(Options::ENABLE_TASKLISTS) { |
109 | if let Some(is_checked) = line_start.scan_task_list_marker() { |
110 | self.tree.append(Item { |
111 | start: after_marker_index, |
112 | end: start_ix + line_start.bytes_scanned(), |
113 | body: ItemBody::TaskListMarker(is_checked), |
114 | }); |
115 | } |
116 | } |
117 | } else if line_start.scan_blockquote_marker() { |
118 | self.finish_list(start_ix); |
119 | self.tree.append(Item { |
120 | start: container_start, |
121 | end: 0, // will get set later |
122 | body: ItemBody::BlockQuote, |
123 | }); |
124 | self.tree.push(); |
125 | } else { |
126 | break; |
127 | } |
128 | } |
129 | |
130 | let ix = start_ix + line_start.bytes_scanned(); |
131 | |
132 | if let Some(n) = scan_blank_line(&bytes[ix..]) { |
133 | if let Some(node_ix) = self.tree.peek_up() { |
134 | match self.tree[node_ix].item.body { |
135 | ItemBody::BlockQuote => (), |
136 | _ => { |
137 | if self.begin_list_item { |
138 | // A list item can begin with at most one blank line. |
139 | self.pop(start_ix); |
140 | } |
141 | self.last_line_blank = true; |
142 | } |
143 | } |
144 | } |
145 | return ix + n; |
146 | } |
147 | |
148 | self.begin_list_item = false; |
149 | self.finish_list(start_ix); |
150 | |
151 | // Save `remaining_space` here to avoid needing to backtrack `line_start` for HTML blocks |
152 | let remaining_space = line_start.remaining_space(); |
153 | |
154 | let indent = line_start.scan_space_upto(4); |
155 | if indent == 4 { |
156 | let ix = start_ix + line_start.bytes_scanned(); |
157 | let remaining_space = line_start.remaining_space(); |
158 | return self.parse_indented_code_block(ix, remaining_space); |
159 | } |
160 | |
161 | let ix = start_ix + line_start.bytes_scanned(); |
162 | |
163 | // HTML Blocks |
164 | if bytes[ix] == b'<' { |
165 | // Types 1-5 are all detected by one function and all end with the same |
166 | // pattern |
167 | if let Some(html_end_tag) = get_html_end_tag(&bytes[(ix + 1)..]) { |
168 | return self.parse_html_block_type_1_to_5(ix, html_end_tag, remaining_space); |
169 | } |
170 | |
171 | // Detect type 6 |
172 | if starts_html_block_type_6(&bytes[(ix + 1)..]) { |
173 | return self.parse_html_block_type_6_or_7(ix, remaining_space); |
174 | } |
175 | |
176 | // Detect type 7 |
177 | if let Some(_html_bytes) = scan_html_type_7(&bytes[ix..]) { |
178 | return self.parse_html_block_type_6_or_7(ix, remaining_space); |
179 | } |
180 | } |
181 | |
182 | if let Ok(n) = scan_hrule(&bytes[ix..]) { |
183 | return self.parse_hrule(n, ix); |
184 | } |
185 | |
186 | if let Some(atx_size) = scan_atx_heading(&bytes[ix..]) { |
187 | return self.parse_atx_heading(ix, atx_size); |
188 | } |
189 | |
190 | // parse refdef |
191 | if let Some((bytecount, label, link_def)) = self.parse_refdef_total(ix) { |
192 | self.allocs.refdefs.0.entry(label).or_insert(link_def); |
193 | let ix = ix + bytecount; |
194 | // try to read trailing whitespace or it will register as a completely blank line |
195 | // TODO: shouldn't we do this for all block level items? |
196 | return ix + scan_blank_line(&bytes[ix..]).unwrap_or(0); |
197 | } |
198 | |
199 | if let Some((n, fence_ch)) = scan_code_fence(&bytes[ix..]) { |
200 | return self.parse_fenced_code_block(ix, indent, fence_ch, n); |
201 | } |
202 | self.parse_paragraph(ix) |
203 | } |
204 | |
205 | /// Returns the offset of the first line after the table. |
206 | /// Assumptions: current focus is a table element and the table header |
207 | /// matches the separator line (same number of columns). |
208 | fn parse_table(&mut self, table_cols: usize, head_start: usize, body_start: usize) -> usize { |
209 | // parse header. this shouldn't fail because we made sure the table header is ok |
210 | let (_sep_start, thead_ix) = self.parse_table_row_inner(head_start, table_cols); |
211 | self.tree[thead_ix].item.body = ItemBody::TableHead; |
212 | |
213 | // parse body |
214 | let mut ix = body_start; |
215 | while let Some((next_ix, _row_ix)) = self.parse_table_row(ix, table_cols) { |
216 | ix = next_ix; |
217 | } |
218 | |
219 | self.pop(ix); |
220 | ix |
221 | } |
222 | |
223 | /// Call this when containers are taken care of. |
224 | /// Returns bytes scanned, row_ix |
225 | fn parse_table_row_inner(&mut self, mut ix: usize, row_cells: usize) -> (usize, TreeIndex) { |
226 | let bytes = self.text.as_bytes(); |
227 | let mut cells = 0; |
228 | let mut final_cell_ix = None; |
229 | |
230 | let row_ix = self.tree.append(Item { |
231 | start: ix, |
232 | end: 0, // set at end of this function |
233 | body: ItemBody::TableRow, |
234 | }); |
235 | self.tree.push(); |
236 | |
237 | loop { |
238 | ix += scan_ch(&bytes[ix..], b'|' ); |
239 | let start_ix = ix; |
240 | ix += scan_whitespace_no_nl(&bytes[ix..]); |
241 | |
242 | if let Some(eol_bytes) = scan_eol(&bytes[ix..]) { |
243 | ix += eol_bytes; |
244 | break; |
245 | } |
246 | |
247 | let cell_ix = self.tree.append(Item { |
248 | start: start_ix, |
249 | end: ix, |
250 | body: ItemBody::TableCell, |
251 | }); |
252 | self.tree.push(); |
253 | let (next_ix, _brk) = self.parse_line(ix, None, TableParseMode::Active); |
254 | |
255 | if let Some(cur_ix) = self.tree.cur() { |
256 | let trailing_whitespace = scan_rev_while(&bytes[..next_ix], is_ascii_whitespace); |
257 | self.tree[cur_ix].item.end -= trailing_whitespace; |
258 | } |
259 | |
260 | self.tree[cell_ix].item.end = next_ix; |
261 | self.tree.pop(); |
262 | |
263 | ix = next_ix; |
264 | cells += 1; |
265 | |
266 | if cells == row_cells { |
267 | final_cell_ix = Some(cell_ix); |
268 | } |
269 | } |
270 | |
271 | // fill empty cells if needed |
272 | // note: this is where GFM and commonmark-extra diverge. we follow |
273 | // GFM here |
274 | for _ in cells..row_cells { |
275 | self.tree.append(Item { |
276 | start: ix, |
277 | end: ix, |
278 | body: ItemBody::TableCell, |
279 | }); |
280 | } |
281 | |
282 | // drop excess cells |
283 | if let Some(cell_ix) = final_cell_ix { |
284 | self.tree[cell_ix].next = None; |
285 | } |
286 | |
287 | self.pop(ix); |
288 | |
289 | (ix, row_ix) |
290 | } |
291 | |
292 | /// Returns first offset after the row and the tree index of the row. |
293 | fn parse_table_row(&mut self, mut ix: usize, row_cells: usize) -> Option<(usize, TreeIndex)> { |
294 | let bytes = self.text.as_bytes(); |
295 | let mut line_start = LineStart::new(&bytes[ix..]); |
296 | let current_container = |
297 | scan_containers(&self.tree, &mut line_start) == self.tree.spine_len(); |
298 | if !current_container { |
299 | return None; |
300 | } |
301 | line_start.scan_all_space(); |
302 | ix += line_start.bytes_scanned(); |
303 | if scan_paragraph_interrupt(&bytes[ix..], current_container) { |
304 | return None; |
305 | } |
306 | |
307 | let (ix, row_ix) = self.parse_table_row_inner(ix, row_cells); |
308 | Some((ix, row_ix)) |
309 | } |
310 | |
311 | /// Returns offset of line start after paragraph. |
312 | fn parse_paragraph(&mut self, start_ix: usize) -> usize { |
313 | let node_ix = self.tree.append(Item { |
314 | start: start_ix, |
315 | end: 0, // will get set later |
316 | body: ItemBody::Paragraph, |
317 | }); |
318 | self.tree.push(); |
319 | let bytes = self.text.as_bytes(); |
320 | |
321 | let mut ix = start_ix; |
322 | loop { |
323 | if self.options.contains(Options::ENABLE_FOOTNOTES) && self.get_footnote(ix).is_some() { |
324 | self.tree[node_ix].item.end = if ix > start_ix { ix - 1 } else { start_ix }; |
325 | self.tree.pop(); |
326 | if let Some(node_ix) = self.tree.peek_up() { |
327 | if let ItemBody::FootnoteDefinition(..) = self.tree[node_ix].item.body { |
328 | self.pop(ix); |
329 | } |
330 | } |
331 | return ix; |
332 | } |
333 | |
334 | let scan_mode = if self.options.contains(Options::ENABLE_TABLES) && ix == start_ix { |
335 | TableParseMode::Scan |
336 | } else { |
337 | TableParseMode::Disabled |
338 | }; |
339 | let (next_ix, brk) = self.parse_line(ix, None, scan_mode); |
340 | |
341 | // break out when we find a table |
342 | if let Some(Item { |
343 | body: ItemBody::Table(alignment_ix), |
344 | .. |
345 | }) = brk |
346 | { |
347 | let table_cols = self.allocs[alignment_ix].len(); |
348 | self.tree[node_ix].item.body = ItemBody::Table(alignment_ix); |
349 | // this clears out any stuff we may have appended - but there may |
350 | // be a cleaner way |
351 | self.tree[node_ix].child = None; |
352 | self.tree.pop(); |
353 | self.tree.push(); |
354 | return self.parse_table(table_cols, ix, next_ix); |
355 | } |
356 | |
357 | ix = next_ix; |
358 | let mut line_start = LineStart::new(&bytes[ix..]); |
359 | let current_container = |
360 | scan_containers(&self.tree, &mut line_start) == self.tree.spine_len(); |
361 | if !line_start.scan_space(4) { |
362 | let ix_new = ix + line_start.bytes_scanned(); |
363 | if current_container { |
364 | let trailing_backslash_pos = match brk { |
365 | Some(Item { |
366 | start, |
367 | body: ItemBody::HardBreak, |
368 | .. |
369 | }) if bytes[start] == b' \\' => Some(start), |
370 | _ => None, |
371 | }; |
372 | if let Some(ix_setext) = |
373 | self.parse_setext_heading(ix_new, node_ix, trailing_backslash_pos.is_some()) |
374 | { |
375 | if let Some(pos) = trailing_backslash_pos { |
376 | self.tree.append_text(pos, pos + 1); |
377 | } |
378 | ix = ix_setext; |
379 | break; |
380 | } |
381 | } |
382 | // first check for non-empty lists, then for other interrupts |
383 | let suffix = &bytes[ix_new..]; |
384 | if scan_paragraph_interrupt(suffix, current_container) { |
385 | break; |
386 | } |
387 | } |
388 | line_start.scan_all_space(); |
389 | if line_start.is_at_eol() { |
390 | break; |
391 | } |
392 | ix = next_ix + line_start.bytes_scanned(); |
393 | if let Some(item) = brk { |
394 | self.tree.append(item); |
395 | } |
396 | } |
397 | |
398 | self.pop(ix); |
399 | ix |
400 | } |
401 | |
402 | /// Returns end ix of setext_heading on success. |
403 | fn parse_setext_heading( |
404 | &mut self, |
405 | ix: usize, |
406 | node_ix: TreeIndex, |
407 | has_trailing_content: bool, |
408 | ) -> Option<usize> { |
409 | let bytes = self.text.as_bytes(); |
410 | let (n, level) = scan_setext_heading(&bytes[ix..])?; |
411 | let mut attrs = None; |
412 | |
413 | if let Some(cur_ix) = self.tree.cur() { |
414 | let parent_ix = self.tree.peek_up().unwrap(); |
415 | let header_start = self.tree[parent_ix].item.start; |
416 | // Note that `self.tree[parent_ix].item.end` might be zero at this point. |
417 | // Use the end position of the current node (i.e. the last known child |
418 | // of the parent) instead. |
419 | let header_end = self.tree[cur_ix].item.end; |
420 | |
421 | // extract the trailing attribute block |
422 | let (content_end, attrs_) = |
423 | self.extract_and_parse_heading_attribute_block(header_start, header_end); |
424 | attrs = attrs_; |
425 | |
426 | // strip trailing whitespace |
427 | let new_end = if has_trailing_content { |
428 | content_end |
429 | } else { |
430 | let trailing_ws = |
431 | scan_rev_while(&bytes[header_start..content_end], is_ascii_whitespace_no_nl); |
432 | content_end - trailing_ws |
433 | }; |
434 | |
435 | if attrs.is_some() { |
436 | // remove trailing block attributes |
437 | self.tree.truncate_siblings(self.text.as_bytes(), new_end); |
438 | } |
439 | |
440 | if let Some(cur_ix) = self.tree.cur() { |
441 | self.tree[cur_ix].item.end = new_end; |
442 | } |
443 | } |
444 | |
445 | self.tree[node_ix].item.body = ItemBody::Heading( |
446 | level, |
447 | attrs.map(|attrs| self.allocs.allocate_heading(attrs)), |
448 | ); |
449 | |
450 | Some(ix + n) |
451 | } |
452 | |
453 | /// Parse a line of input, appending text and items to tree. |
454 | /// |
455 | /// Returns: index after line and an item representing the break. |
456 | fn parse_line( |
457 | &mut self, |
458 | start: usize, |
459 | end: Option<usize>, |
460 | mode: TableParseMode, |
461 | ) -> (usize, Option<Item>) { |
462 | let bytes = self.text.as_bytes(); |
463 | let bytes = match end { |
464 | Some(end) => &bytes[..end], |
465 | None => bytes, |
466 | }; |
467 | let bytes_len = bytes.len(); |
468 | let mut pipes = 0; |
469 | let mut last_pipe_ix = start; |
470 | let mut begin_text = start; |
471 | |
472 | let (final_ix, brk) = iterate_special_bytes(self.lookup_table, bytes, start, |ix, byte| { |
473 | match byte { |
474 | b' \n' | b' \r' => { |
475 | if let TableParseMode::Active = mode { |
476 | return LoopInstruction::BreakAtWith(ix, None); |
477 | } |
478 | |
479 | let mut i = ix; |
480 | let eol_bytes = scan_eol(&bytes[ix..]).unwrap(); |
481 | if mode == TableParseMode::Scan && pipes > 0 { |
482 | // check if we may be parsing a table |
483 | let next_line_ix = ix + eol_bytes; |
484 | let mut line_start = LineStart::new(&bytes[next_line_ix..]); |
485 | if scan_containers(&self.tree, &mut line_start) == self.tree.spine_len() { |
486 | let table_head_ix = next_line_ix + line_start.bytes_scanned(); |
487 | let (table_head_bytes, alignment) = |
488 | scan_table_head(&bytes[table_head_ix..]); |
489 | |
490 | if table_head_bytes > 0 { |
491 | // computing header count from number of pipes |
492 | let header_count = |
493 | count_header_cols(bytes, pipes, start, last_pipe_ix); |
494 | |
495 | // make sure they match the number of columns we find in separator line |
496 | if alignment.len() == header_count { |
497 | let alignment_ix = self.allocs.allocate_alignment(alignment); |
498 | let end_ix = table_head_ix + table_head_bytes; |
499 | return LoopInstruction::BreakAtWith( |
500 | end_ix, |
501 | Some(Item { |
502 | start: i, |
503 | end: end_ix, // must update later |
504 | body: ItemBody::Table(alignment_ix), |
505 | }), |
506 | ); |
507 | } |
508 | } |
509 | } |
510 | } |
511 | |
512 | let end_ix = ix + eol_bytes; |
513 | let trailing_backslashes = scan_rev_while(&bytes[..ix], |b| b == b' \\' ); |
514 | if trailing_backslashes % 2 == 1 && end_ix < bytes_len { |
515 | i -= 1; |
516 | self.tree.append_text(begin_text, i); |
517 | return LoopInstruction::BreakAtWith( |
518 | end_ix, |
519 | Some(Item { |
520 | start: i, |
521 | end: end_ix, |
522 | body: ItemBody::HardBreak, |
523 | }), |
524 | ); |
525 | } |
526 | let trailing_whitespace = |
527 | scan_rev_while(&bytes[..ix], is_ascii_whitespace_no_nl); |
528 | if trailing_whitespace >= 2 { |
529 | i -= trailing_whitespace; |
530 | self.tree.append_text(begin_text, i); |
531 | return LoopInstruction::BreakAtWith( |
532 | end_ix, |
533 | Some(Item { |
534 | start: i, |
535 | end: end_ix, |
536 | body: ItemBody::HardBreak, |
537 | }), |
538 | ); |
539 | } |
540 | |
541 | self.tree.append_text(begin_text, ix); |
542 | LoopInstruction::BreakAtWith( |
543 | end_ix, |
544 | Some(Item { |
545 | start: i, |
546 | end: end_ix, |
547 | body: ItemBody::SoftBreak, |
548 | }), |
549 | ) |
550 | } |
551 | b' \\' => { |
552 | if ix + 1 < bytes_len && is_ascii_punctuation(bytes[ix + 1]) { |
553 | self.tree.append_text(begin_text, ix); |
554 | if bytes[ix + 1] == b'`' { |
555 | let count = 1 + scan_ch_repeat(&bytes[(ix + 2)..], b'`' ); |
556 | self.tree.append(Item { |
557 | start: ix + 1, |
558 | end: ix + count + 1, |
559 | body: ItemBody::MaybeCode(count, true), |
560 | }); |
561 | begin_text = ix + 1 + count; |
562 | LoopInstruction::ContinueAndSkip(count) |
563 | } else { |
564 | begin_text = ix + 1; |
565 | LoopInstruction::ContinueAndSkip(1) |
566 | } |
567 | } else { |
568 | LoopInstruction::ContinueAndSkip(0) |
569 | } |
570 | } |
571 | c @ b'*' | c @ b'_' | c @ b'~' => { |
572 | let string_suffix = &self.text[ix..]; |
573 | let count = 1 + scan_ch_repeat(&string_suffix.as_bytes()[1..], c); |
574 | let can_open = delim_run_can_open(self.text, string_suffix, count, ix); |
575 | let can_close = delim_run_can_close(self.text, string_suffix, count, ix); |
576 | let is_valid_seq = c != b'~' || count <= 2; |
577 | |
578 | if (can_open || can_close) && is_valid_seq { |
579 | self.tree.append_text(begin_text, ix); |
580 | for i in 0..count { |
581 | self.tree.append(Item { |
582 | start: ix + i, |
583 | end: ix + i + 1, |
584 | body: ItemBody::MaybeEmphasis(count - i, can_open, can_close), |
585 | }); |
586 | } |
587 | begin_text = ix + count; |
588 | } |
589 | LoopInstruction::ContinueAndSkip(count - 1) |
590 | } |
591 | b'`' => { |
592 | self.tree.append_text(begin_text, ix); |
593 | let count = 1 + scan_ch_repeat(&bytes[(ix + 1)..], b'`' ); |
594 | self.tree.append(Item { |
595 | start: ix, |
596 | end: ix + count, |
597 | body: ItemBody::MaybeCode(count, false), |
598 | }); |
599 | begin_text = ix + count; |
600 | LoopInstruction::ContinueAndSkip(count - 1) |
601 | } |
602 | b'<' => { |
603 | // Note: could detect some non-HTML cases and early escape here, but not |
604 | // clear that's a win. |
605 | self.tree.append_text(begin_text, ix); |
606 | self.tree.append(Item { |
607 | start: ix, |
608 | end: ix + 1, |
609 | body: ItemBody::MaybeHtml, |
610 | }); |
611 | begin_text = ix + 1; |
612 | LoopInstruction::ContinueAndSkip(0) |
613 | } |
614 | b'!' => { |
615 | if ix + 1 < bytes_len && bytes[ix + 1] == b'[' { |
616 | self.tree.append_text(begin_text, ix); |
617 | self.tree.append(Item { |
618 | start: ix, |
619 | end: ix + 2, |
620 | body: ItemBody::MaybeImage, |
621 | }); |
622 | begin_text = ix + 2; |
623 | LoopInstruction::ContinueAndSkip(1) |
624 | } else { |
625 | LoopInstruction::ContinueAndSkip(0) |
626 | } |
627 | } |
628 | b'[' => { |
629 | self.tree.append_text(begin_text, ix); |
630 | self.tree.append(Item { |
631 | start: ix, |
632 | end: ix + 1, |
633 | body: ItemBody::MaybeLinkOpen, |
634 | }); |
635 | begin_text = ix + 1; |
636 | LoopInstruction::ContinueAndSkip(0) |
637 | } |
638 | b']' => { |
639 | self.tree.append_text(begin_text, ix); |
640 | self.tree.append(Item { |
641 | start: ix, |
642 | end: ix + 1, |
643 | body: ItemBody::MaybeLinkClose(true), |
644 | }); |
645 | begin_text = ix + 1; |
646 | LoopInstruction::ContinueAndSkip(0) |
647 | } |
648 | b'&' => match scan_entity(&bytes[ix..]) { |
649 | (n, Some(value)) => { |
650 | self.tree.append_text(begin_text, ix); |
651 | self.tree.append(Item { |
652 | start: ix, |
653 | end: ix + n, |
654 | body: ItemBody::SynthesizeText(self.allocs.allocate_cow(value)), |
655 | }); |
656 | begin_text = ix + n; |
657 | LoopInstruction::ContinueAndSkip(n - 1) |
658 | } |
659 | _ => LoopInstruction::ContinueAndSkip(0), |
660 | }, |
661 | b'|' => { |
662 | if let TableParseMode::Active = mode { |
663 | LoopInstruction::BreakAtWith(ix, None) |
664 | } else { |
665 | last_pipe_ix = ix; |
666 | pipes += 1; |
667 | LoopInstruction::ContinueAndSkip(0) |
668 | } |
669 | } |
670 | b'.' => { |
671 | if ix + 2 < bytes.len() && bytes[ix + 1] == b'.' && bytes[ix + 2] == b'.' { |
672 | self.tree.append_text(begin_text, ix); |
673 | self.tree.append(Item { |
674 | start: ix, |
675 | end: ix + 3, |
676 | body: ItemBody::SynthesizeChar('…' ), |
677 | }); |
678 | begin_text = ix + 3; |
679 | LoopInstruction::ContinueAndSkip(2) |
680 | } else { |
681 | LoopInstruction::ContinueAndSkip(0) |
682 | } |
683 | } |
684 | b'-' => { |
685 | let count = 1 + scan_ch_repeat(&bytes[(ix + 1)..], b'-' ); |
686 | if count == 1 { |
687 | LoopInstruction::ContinueAndSkip(0) |
688 | } else { |
689 | let itembody = if count == 2 { |
690 | ItemBody::SynthesizeChar('–' ) |
691 | } else if count == 3 { |
692 | ItemBody::SynthesizeChar('—' ) |
693 | } else { |
694 | let (ems, ens) = match count % 6 { |
695 | 0 | 3 => (count / 3, 0), |
696 | 2 | 4 => (0, count / 2), |
697 | 1 => (count / 3 - 1, 2), |
698 | _ => (count / 3, 1), |
699 | }; |
700 | // – and — are 3 bytes each in utf8 |
701 | let mut buf = String::with_capacity(3 * (ems + ens)); |
702 | for _ in 0..ems { |
703 | buf.push('—' ); |
704 | } |
705 | for _ in 0..ens { |
706 | buf.push('–' ); |
707 | } |
708 | ItemBody::SynthesizeText(self.allocs.allocate_cow(buf.into())) |
709 | }; |
710 | |
711 | self.tree.append_text(begin_text, ix); |
712 | self.tree.append(Item { |
713 | start: ix, |
714 | end: ix + count, |
715 | body: itembody, |
716 | }); |
717 | begin_text = ix + count; |
718 | LoopInstruction::ContinueAndSkip(count - 1) |
719 | } |
720 | } |
721 | c @ b' \'' | c @ b'"' => { |
722 | let string_suffix = &self.text[ix..]; |
723 | let can_open = delim_run_can_open(self.text, string_suffix, 1, ix); |
724 | let can_close = delim_run_can_close(self.text, string_suffix, 1, ix); |
725 | |
726 | self.tree.append_text(begin_text, ix); |
727 | self.tree.append(Item { |
728 | start: ix, |
729 | end: ix + 1, |
730 | body: ItemBody::MaybeSmartQuote(c, can_open, can_close), |
731 | }); |
732 | begin_text = ix + 1; |
733 | |
734 | LoopInstruction::ContinueAndSkip(0) |
735 | } |
736 | _ => LoopInstruction::ContinueAndSkip(0), |
737 | } |
738 | }); |
739 | |
740 | if brk.is_none() { |
741 | // need to close text at eof |
742 | self.tree.append_text(begin_text, final_ix); |
743 | } |
744 | (final_ix, brk) |
745 | } |
746 | |
747 | /// When start_ix is at the beginning of an HTML block of type 1 to 5, |
748 | /// this will find the end of the block, adding the block itself to the |
749 | /// tree and also keeping track of the lines of HTML within the block. |
750 | /// |
751 | /// The html_end_tag is the tag that must be found on a line to end the block. |
752 | fn parse_html_block_type_1_to_5( |
753 | &mut self, |
754 | start_ix: usize, |
755 | html_end_tag: &str, |
756 | mut remaining_space: usize, |
757 | ) -> usize { |
758 | let bytes = self.text.as_bytes(); |
759 | let mut ix = start_ix; |
760 | loop { |
761 | let line_start_ix = ix; |
762 | ix += scan_nextline(&bytes[ix..]); |
763 | self.append_html_line(remaining_space, line_start_ix, ix); |
764 | |
765 | let mut line_start = LineStart::new(&bytes[ix..]); |
766 | let n_containers = scan_containers(&self.tree, &mut line_start); |
767 | if n_containers < self.tree.spine_len() { |
768 | break; |
769 | } |
770 | |
771 | if (&self.text[line_start_ix..ix]).contains(html_end_tag) { |
772 | break; |
773 | } |
774 | |
775 | let next_line_ix = ix + line_start.bytes_scanned(); |
776 | if next_line_ix == self.text.len() { |
777 | break; |
778 | } |
779 | ix = next_line_ix; |
780 | remaining_space = line_start.remaining_space(); |
781 | } |
782 | ix |
783 | } |
784 | |
785 | /// When start_ix is at the beginning of an HTML block of type 6 or 7, |
786 | /// this will consume lines until there is a blank line and keep track of |
787 | /// the HTML within the block. |
788 | fn parse_html_block_type_6_or_7( |
789 | &mut self, |
790 | start_ix: usize, |
791 | mut remaining_space: usize, |
792 | ) -> usize { |
793 | let bytes = self.text.as_bytes(); |
794 | let mut ix = start_ix; |
795 | loop { |
796 | let line_start_ix = ix; |
797 | ix += scan_nextline(&bytes[ix..]); |
798 | self.append_html_line(remaining_space, line_start_ix, ix); |
799 | |
800 | let mut line_start = LineStart::new(&bytes[ix..]); |
801 | let n_containers = scan_containers(&self.tree, &mut line_start); |
802 | if n_containers < self.tree.spine_len() || line_start.is_at_eol() { |
803 | break; |
804 | } |
805 | |
806 | let next_line_ix = ix + line_start.bytes_scanned(); |
807 | if next_line_ix == self.text.len() || scan_blank_line(&bytes[next_line_ix..]).is_some() |
808 | { |
809 | break; |
810 | } |
811 | ix = next_line_ix; |
812 | remaining_space = line_start.remaining_space(); |
813 | } |
814 | ix |
815 | } |
816 | |
817 | fn parse_indented_code_block(&mut self, start_ix: usize, mut remaining_space: usize) -> usize { |
818 | self.tree.append(Item { |
819 | start: start_ix, |
820 | end: 0, // will get set later |
821 | body: ItemBody::IndentCodeBlock, |
822 | }); |
823 | self.tree.push(); |
824 | let bytes = self.text.as_bytes(); |
825 | let mut last_nonblank_child = None; |
826 | let mut last_nonblank_ix = 0; |
827 | let mut end_ix = 0; |
828 | let mut last_line_blank = false; |
829 | |
830 | let mut ix = start_ix; |
831 | loop { |
832 | let line_start_ix = ix; |
833 | ix += scan_nextline(&bytes[ix..]); |
834 | self.append_code_text(remaining_space, line_start_ix, ix); |
835 | // TODO(spec clarification): should we synthesize newline at EOF? |
836 | |
837 | if !last_line_blank { |
838 | last_nonblank_child = self.tree.cur(); |
839 | last_nonblank_ix = ix; |
840 | end_ix = ix; |
841 | } |
842 | |
843 | let mut line_start = LineStart::new(&bytes[ix..]); |
844 | let n_containers = scan_containers(&self.tree, &mut line_start); |
845 | if n_containers < self.tree.spine_len() |
846 | || !(line_start.scan_space(4) || line_start.is_at_eol()) |
847 | { |
848 | break; |
849 | } |
850 | let next_line_ix = ix + line_start.bytes_scanned(); |
851 | if next_line_ix == self.text.len() { |
852 | break; |
853 | } |
854 | ix = next_line_ix; |
855 | remaining_space = line_start.remaining_space(); |
856 | last_line_blank = scan_blank_line(&bytes[ix..]).is_some(); |
857 | } |
858 | |
859 | // Trim trailing blank lines. |
860 | if let Some(child) = last_nonblank_child { |
861 | self.tree[child].next = None; |
862 | self.tree[child].item.end = last_nonblank_ix; |
863 | } |
864 | self.pop(end_ix); |
865 | ix |
866 | } |
867 | |
868 | fn parse_fenced_code_block( |
869 | &mut self, |
870 | start_ix: usize, |
871 | indent: usize, |
872 | fence_ch: u8, |
873 | n_fence_char: usize, |
874 | ) -> usize { |
875 | let bytes = self.text.as_bytes(); |
876 | let mut info_start = start_ix + n_fence_char; |
877 | info_start += scan_whitespace_no_nl(&bytes[info_start..]); |
878 | // TODO: info strings are typically very short. wouldn't it be faster |
879 | // to just do a forward scan here? |
880 | let mut ix = info_start + scan_nextline(&bytes[info_start..]); |
881 | let info_end = ix - scan_rev_while(&bytes[info_start..ix], is_ascii_whitespace); |
882 | let info_string = unescape(&self.text[info_start..info_end]); |
883 | self.tree.append(Item { |
884 | start: start_ix, |
885 | end: 0, // will get set later |
886 | body: ItemBody::FencedCodeBlock(self.allocs.allocate_cow(info_string)), |
887 | }); |
888 | self.tree.push(); |
889 | loop { |
890 | let mut line_start = LineStart::new(&bytes[ix..]); |
891 | let n_containers = scan_containers(&self.tree, &mut line_start); |
892 | if n_containers < self.tree.spine_len() { |
893 | break; |
894 | } |
895 | line_start.scan_space(indent); |
896 | let mut close_line_start = line_start.clone(); |
897 | if !close_line_start.scan_space(4) { |
898 | let close_ix = ix + close_line_start.bytes_scanned(); |
899 | if let Some(n) = scan_closing_code_fence(&bytes[close_ix..], fence_ch, n_fence_char) |
900 | { |
901 | ix = close_ix + n; |
902 | break; |
903 | } |
904 | } |
905 | let remaining_space = line_start.remaining_space(); |
906 | ix += line_start.bytes_scanned(); |
907 | let next_ix = ix + scan_nextline(&bytes[ix..]); |
908 | self.append_code_text(remaining_space, ix, next_ix); |
909 | ix = next_ix; |
910 | } |
911 | |
912 | self.pop(ix); |
913 | |
914 | // try to read trailing whitespace or it will register as a completely blank line |
915 | ix + scan_blank_line(&bytes[ix..]).unwrap_or(0) |
916 | } |
917 | |
918 | fn append_code_text(&mut self, remaining_space: usize, start: usize, end: usize) { |
919 | if remaining_space > 0 { |
920 | let cow_ix = self.allocs.allocate_cow(" " [..remaining_space].into()); |
921 | self.tree.append(Item { |
922 | start, |
923 | end: start, |
924 | body: ItemBody::SynthesizeText(cow_ix), |
925 | }); |
926 | } |
927 | if self.text.as_bytes()[end - 2] == b' \r' { |
928 | // Normalize CRLF to LF |
929 | self.tree.append_text(start, end - 2); |
930 | self.tree.append_text(end - 1, end); |
931 | } else { |
932 | self.tree.append_text(start, end); |
933 | } |
934 | } |
935 | |
936 | /// Appends a line of HTML to the tree. |
937 | fn append_html_line(&mut self, remaining_space: usize, start: usize, end: usize) { |
938 | if remaining_space > 0 { |
939 | let cow_ix = self.allocs.allocate_cow(" " [..remaining_space].into()); |
940 | self.tree.append(Item { |
941 | start, |
942 | end: start, |
943 | // TODO: maybe this should synthesize to html rather than text? |
944 | body: ItemBody::SynthesizeText(cow_ix), |
945 | }); |
946 | } |
947 | if self.text.as_bytes()[end - 2] == b' \r' { |
948 | // Normalize CRLF to LF |
949 | self.tree.append(Item { |
950 | start, |
951 | end: end - 2, |
952 | body: ItemBody::Html, |
953 | }); |
954 | self.tree.append(Item { |
955 | start: end - 1, |
956 | end, |
957 | body: ItemBody::Html, |
958 | }); |
959 | } else { |
960 | self.tree.append(Item { |
961 | start, |
962 | end, |
963 | body: ItemBody::Html, |
964 | }); |
965 | } |
966 | } |
967 | |
968 | /// Pop a container, setting its end. |
969 | fn pop(&mut self, ix: usize) { |
970 | let cur_ix = self.tree.pop().unwrap(); |
971 | self.tree[cur_ix].item.end = ix; |
972 | if let ItemBody::List(true, _, _) = self.tree[cur_ix].item.body { |
973 | surgerize_tight_list(&mut self.tree, cur_ix); |
974 | } |
975 | } |
976 | |
977 | /// Close a list if it's open. Also set loose if last line was blank |
978 | fn finish_list(&mut self, ix: usize) { |
979 | if let Some(node_ix) = self.tree.peek_up() { |
980 | if let ItemBody::List(_, _, _) = self.tree[node_ix].item.body { |
981 | self.pop(ix); |
982 | } |
983 | } |
984 | if self.last_line_blank { |
985 | if let Some(node_ix) = self.tree.peek_grandparent() { |
986 | if let ItemBody::List(ref mut is_tight, _, _) = self.tree[node_ix].item.body { |
987 | *is_tight = false; |
988 | } |
989 | } |
990 | self.last_line_blank = false; |
991 | } |
992 | } |
993 | |
994 | /// Continue an existing list or start a new one if there's not an open |
995 | /// list that matches. |
996 | fn continue_list(&mut self, start: usize, ch: u8, index: u64) { |
997 | if let Some(node_ix) = self.tree.peek_up() { |
998 | if let ItemBody::List(ref mut is_tight, existing_ch, _) = self.tree[node_ix].item.body { |
999 | if existing_ch == ch { |
1000 | if self.last_line_blank { |
1001 | *is_tight = false; |
1002 | self.last_line_blank = false; |
1003 | } |
1004 | return; |
1005 | } |
1006 | } |
1007 | // TODO: this is not the best choice for end; maybe get end from last list item. |
1008 | self.finish_list(start); |
1009 | } |
1010 | self.tree.append(Item { |
1011 | start, |
1012 | end: 0, // will get set later |
1013 | body: ItemBody::List(true, ch, index), |
1014 | }); |
1015 | self.tree.push(); |
1016 | self.last_line_blank = false; |
1017 | } |
1018 | |
1019 | /// Parse a thematic break. |
1020 | /// |
1021 | /// Returns index of start of next line. |
1022 | fn parse_hrule(&mut self, hrule_size: usize, ix: usize) -> usize { |
1023 | self.tree.append(Item { |
1024 | start: ix, |
1025 | end: ix + hrule_size, |
1026 | body: ItemBody::Rule, |
1027 | }); |
1028 | ix + hrule_size |
1029 | } |
1030 | |
1031 | /// Parse an ATX heading. |
1032 | /// |
1033 | /// Returns index of start of next line. |
1034 | fn parse_atx_heading(&mut self, start: usize, atx_level: HeadingLevel) -> usize { |
1035 | let mut ix = start; |
1036 | let heading_ix = self.tree.append(Item { |
1037 | start, |
1038 | end: 0, // set later |
1039 | body: ItemBody::default(), // set later |
1040 | }); |
1041 | ix += atx_level as usize; |
1042 | // next char is space or eol (guaranteed by scan_atx_heading) |
1043 | let bytes = self.text.as_bytes(); |
1044 | if let Some(eol_bytes) = scan_eol(&bytes[ix..]) { |
1045 | self.tree[heading_ix].item.end = ix + eol_bytes; |
1046 | self.tree[heading_ix].item.body = ItemBody::Heading(atx_level, None); |
1047 | return ix + eol_bytes; |
1048 | } |
1049 | // skip leading spaces |
1050 | let skip_spaces = scan_whitespace_no_nl(&bytes[ix..]); |
1051 | ix += skip_spaces; |
1052 | |
1053 | // now handle the header text |
1054 | let header_start = ix; |
1055 | let header_node_idx = self.tree.push(); // so that we can set the endpoint later |
1056 | |
1057 | // trim the trailing attribute block before parsing the entire line, if necessary |
1058 | let (end, content_end, attrs) = if self.options.contains(Options::ENABLE_HEADING_ATTRIBUTES) |
1059 | { |
1060 | // the start of the next line is the end of the header since the |
1061 | // header cannot have line breaks |
1062 | let header_end = header_start + scan_nextline(&bytes[header_start..]); |
1063 | let (content_end, attrs) = |
1064 | self.extract_and_parse_heading_attribute_block(header_start, header_end); |
1065 | self.parse_line(ix, Some(content_end), TableParseMode::Disabled); |
1066 | (header_end, content_end, attrs) |
1067 | } else { |
1068 | ix = self.parse_line(ix, None, TableParseMode::Disabled).0; |
1069 | (ix, ix, None) |
1070 | }; |
1071 | self.tree[header_node_idx].item.end = end; |
1072 | |
1073 | // remove trailing matter from header text |
1074 | if let Some(cur_ix) = self.tree.cur() { |
1075 | // remove closing of the ATX heading |
1076 | let header_text = &bytes[header_start..content_end]; |
1077 | let mut limit = header_text |
1078 | .iter() |
1079 | .rposition(|&b| !(b == b' \n' || b == b' \r' || b == b' ' )) |
1080 | .map_or(0, |i| i + 1); |
1081 | let closer = header_text[..limit] |
1082 | .iter() |
1083 | .rposition(|&b| b != b'#' ) |
1084 | .map_or(0, |i| i + 1); |
1085 | if closer == 0 { |
1086 | limit = closer; |
1087 | } else { |
1088 | let spaces = scan_rev_while(&header_text[..closer], |b| b == b' ' ); |
1089 | if spaces > 0 { |
1090 | limit = closer - spaces; |
1091 | } |
1092 | } |
1093 | self.tree[cur_ix].item.end = limit + header_start; |
1094 | } |
1095 | |
1096 | self.tree.pop(); |
1097 | self.tree[heading_ix].item.body = ItemBody::Heading( |
1098 | atx_level, |
1099 | attrs.map(|attrs| self.allocs.allocate_heading(attrs)), |
1100 | ); |
1101 | end |
1102 | } |
1103 | |
1104 | fn get_footnote(&mut self, start: usize) -> Option<(usize, CowStr<'a>)> { |
1105 | let bytes = &self.text.as_bytes()[start..]; |
1106 | if !bytes.starts_with(b"[^" ) { |
1107 | return None; |
1108 | } |
1109 | let (mut i, label) = self.parse_refdef_label(start + 2)?; |
1110 | i += 2; |
1111 | if scan_ch(&bytes[i..], b':' ) == 0 { |
1112 | return None; |
1113 | } |
1114 | i += 1; |
1115 | Some((i, label)) |
1116 | } |
1117 | |
1118 | /// Returns the number of bytes scanned on success. |
1119 | fn parse_footnote(&mut self, start: usize) -> Option<usize> { |
1120 | let (i, label) = self.get_footnote(start)?; |
1121 | self.finish_list(start); |
1122 | self.tree.append(Item { |
1123 | start, |
1124 | end: 0, // will get set later |
1125 | // TODO: check whether the label here is strictly necessary |
1126 | body: ItemBody::FootnoteDefinition(self.allocs.allocate_cow(label)), |
1127 | }); |
1128 | self.tree.push(); |
1129 | Some(i) |
1130 | } |
1131 | |
1132 | /// Tries to parse a reference label, which can be interrupted by new blocks. |
1133 | /// On success, returns the number of bytes of the label and the label itself. |
1134 | fn parse_refdef_label(&self, start: usize) -> Option<(usize, CowStr<'a>)> { |
1135 | scan_link_label_rest(&self.text[start..], &|bytes| { |
1136 | let mut line_start = LineStart::new(bytes); |
1137 | let current_container = |
1138 | scan_containers(&self.tree, &mut line_start) == self.tree.spine_len(); |
1139 | let bytes_scanned = line_start.bytes_scanned(); |
1140 | let suffix = &bytes[bytes_scanned..]; |
1141 | if scan_paragraph_interrupt(suffix, current_container) { |
1142 | None |
1143 | } else { |
1144 | Some(bytes_scanned) |
1145 | } |
1146 | }) |
1147 | } |
1148 | |
1149 | /// Returns number of bytes scanned, label and definition on success. |
1150 | fn parse_refdef_total(&mut self, start: usize) -> Option<(usize, LinkLabel<'a>, LinkDef<'a>)> { |
1151 | let bytes = &self.text.as_bytes()[start..]; |
1152 | if scan_ch(bytes, b'[' ) == 0 { |
1153 | return None; |
1154 | } |
1155 | let (mut i, label) = self.parse_refdef_label(start + 1)?; |
1156 | i += 1; |
1157 | if scan_ch(&bytes[i..], b':' ) == 0 { |
1158 | return None; |
1159 | } |
1160 | i += 1; |
1161 | let (bytecount, link_def) = self.scan_refdef(start, start + i)?; |
1162 | Some((bytecount + i, UniCase::new(label), link_def)) |
1163 | } |
1164 | |
1165 | /// Returns number of bytes and number of newlines |
1166 | fn scan_refdef_space(&self, bytes: &[u8], mut i: usize) -> Option<(usize, usize)> { |
1167 | let mut newlines = 0; |
1168 | loop { |
1169 | let whitespaces = scan_whitespace_no_nl(&bytes[i..]); |
1170 | i += whitespaces; |
1171 | if let Some(eol_bytes) = scan_eol(&bytes[i..]) { |
1172 | i += eol_bytes; |
1173 | newlines += 1; |
1174 | if newlines > 1 { |
1175 | return None; |
1176 | } |
1177 | } else { |
1178 | break; |
1179 | } |
1180 | let mut line_start = LineStart::new(&bytes[i..]); |
1181 | if self.tree.spine_len() != scan_containers(&self.tree, &mut line_start) { |
1182 | return None; |
1183 | } |
1184 | i += line_start.bytes_scanned(); |
1185 | } |
1186 | Some((i, newlines)) |
1187 | } |
1188 | |
1189 | /// Returns # of bytes and definition. |
1190 | /// Assumes the label of the reference including colon has already been scanned. |
1191 | fn scan_refdef(&self, span_start: usize, start: usize) -> Option<(usize, LinkDef<'a>)> { |
1192 | let bytes = self.text.as_bytes(); |
1193 | |
1194 | // whitespace between label and url (including up to one newline) |
1195 | let (mut i, _newlines) = self.scan_refdef_space(bytes, start)?; |
1196 | |
1197 | // scan link dest |
1198 | let (dest_length, dest) = scan_link_dest(self.text, i, 1)?; |
1199 | if dest_length == 0 { |
1200 | return None; |
1201 | } |
1202 | let dest = unescape(dest); |
1203 | i += dest_length; |
1204 | |
1205 | // no title |
1206 | let mut backup = ( |
1207 | i - start, |
1208 | LinkDef { |
1209 | dest, |
1210 | title: None, |
1211 | span: span_start..i, |
1212 | }, |
1213 | ); |
1214 | |
1215 | // scan whitespace between dest and label |
1216 | let (mut i, newlines) = |
1217 | if let Some((new_i, mut newlines)) = self.scan_refdef_space(bytes, i) { |
1218 | if i == self.text.len() { |
1219 | newlines += 1; |
1220 | } |
1221 | if new_i == i && newlines == 0 { |
1222 | return None; |
1223 | } |
1224 | if newlines > 1 { |
1225 | return Some(backup); |
1226 | }; |
1227 | (new_i, newlines) |
1228 | } else { |
1229 | return Some(backup); |
1230 | }; |
1231 | |
1232 | // scan title |
1233 | // if this fails but newline == 1, return also a refdef without title |
1234 | if let Some((title_length, title)) = scan_refdef_title(&self.text[i..]) { |
1235 | i += title_length; |
1236 | backup.1.span = span_start..i; |
1237 | backup.1.title = Some(unescape(title)); |
1238 | } else if newlines > 0 { |
1239 | return Some(backup); |
1240 | } else { |
1241 | return None; |
1242 | }; |
1243 | |
1244 | // scan EOL |
1245 | if let Some(bytes) = scan_blank_line(&bytes[i..]) { |
1246 | backup.0 = i + bytes - start; |
1247 | Some(backup) |
1248 | } else if newlines > 0 { |
1249 | Some(backup) |
1250 | } else { |
1251 | None |
1252 | } |
1253 | } |
1254 | |
1255 | /// Extracts and parses a heading attribute block if exists. |
1256 | /// |
1257 | /// Returns `(end_offset_of_heading_content, (id, classes))`. |
1258 | /// |
1259 | /// If `header_end` is less than or equal to `header_start`, the given |
1260 | /// input is considered as empty. |
1261 | fn extract_and_parse_heading_attribute_block( |
1262 | &mut self, |
1263 | header_start: usize, |
1264 | header_end: usize, |
1265 | ) -> (usize, Option<HeadingAttributes<'a>>) { |
1266 | if !self.options.contains(Options::ENABLE_HEADING_ATTRIBUTES) { |
1267 | return (header_end, None); |
1268 | } |
1269 | |
1270 | // extract the trailing attribute block |
1271 | let header_bytes = &self.text.as_bytes()[header_start..header_end]; |
1272 | let (content_len, attr_block_range_rel) = |
1273 | extract_attribute_block_content_from_header_text(header_bytes); |
1274 | let content_end = header_start + content_len; |
1275 | let attrs = attr_block_range_rel.and_then(|r| { |
1276 | parse_inside_attribute_block( |
1277 | &self.text[(header_start + r.start)..(header_start + r.end)], |
1278 | ) |
1279 | }); |
1280 | (content_end, attrs) |
1281 | } |
1282 | } |
1283 | |
1284 | /// Scanning modes for `Parser`'s `parse_line` method. |
1285 | #[derive (PartialEq, Eq, Copy, Clone)] |
1286 | enum TableParseMode { |
1287 | /// Inside a paragraph, scanning for table headers. |
1288 | Scan, |
1289 | /// Inside a table. |
1290 | Active, |
1291 | /// Inside a paragraph, not scanning for table headers. |
1292 | Disabled, |
1293 | } |
1294 | |
1295 | /// Computes the number of header columns in a table line by computing the number of dividing pipes |
1296 | /// that aren't followed or preceded by whitespace. |
1297 | fn count_header_cols( |
1298 | bytes: &[u8], |
1299 | mut pipes: usize, |
1300 | mut start: usize, |
1301 | last_pipe_ix: usize, |
1302 | ) -> usize { |
1303 | // was first pipe preceded by whitespace? if so, subtract one |
1304 | start += scan_whitespace_no_nl(&bytes[start..]); |
1305 | if bytes[start] == b'|' { |
1306 | pipes -= 1; |
1307 | } |
1308 | |
1309 | // was last pipe followed by whitespace? if so, sub one |
1310 | if scan_blank_line(&bytes[(last_pipe_ix + 1)..]).is_some() { |
1311 | pipes |
1312 | } else { |
1313 | pipes + 1 |
1314 | } |
1315 | } |
1316 | |
1317 | /// Checks whether we should break a paragraph on the given input. |
1318 | fn scan_paragraph_interrupt(bytes: &[u8], current_container: bool) -> bool { |
1319 | scan_eol(bytes).is_some() |
1320 | || scan_hrule(bytes).is_ok() |
1321 | || scan_atx_heading(data:bytes).is_some() |
1322 | || scan_code_fence(data:bytes).is_some() |
1323 | || scan_blockquote_start(data:bytes).is_some() |
1324 | || scan_listitem(bytes).map_or(default:false, |(ix: usize, delim: u8, index: usize, _)| { |
1325 | ! current_container || |
1326 | // we don't allow interruption by either empty lists or |
1327 | // numbered lists starting at an index other than 1 |
1328 | (delim == b'*' || delim == b'-' || delim == b'+' || index == 1) |
1329 | && !scan_empty_list(&bytes[ix..]) |
1330 | }) |
1331 | || bytes.starts_with(needle:b"<" ) |
1332 | && (get_html_end_tag(&bytes[1..]).is_some() || starts_html_block_type_6(&bytes[1..])) |
1333 | } |
1334 | |
1335 | /// Assumes `text_bytes` is preceded by `<`. |
1336 | fn get_html_end_tag(text_bytes: &[u8]) -> Option<&'static str> { |
1337 | static BEGIN_TAGS: &[&[u8]; 4] = &[b"pre" , b"style" , b"script" , b"textarea" ]; |
1338 | static ST_BEGIN_TAGS: &[&[u8]; 3] = &[b"!--" , b"?" , b"![CDATA[" ]; |
1339 | |
1340 | for (beg_tag, end_tag) in BEGIN_TAGS |
1341 | .iter() |
1342 | .zip(["</pre>" , "</style>" , "</script>" , "</textarea>" ].iter()) |
1343 | { |
1344 | let tag_len = beg_tag.len(); |
1345 | |
1346 | if text_bytes.len() < tag_len { |
1347 | // begin tags are increasing in size |
1348 | break; |
1349 | } |
1350 | |
1351 | if !text_bytes[..tag_len].eq_ignore_ascii_case(beg_tag) { |
1352 | continue; |
1353 | } |
1354 | |
1355 | // Must either be the end of the line... |
1356 | if text_bytes.len() == tag_len { |
1357 | return Some(end_tag); |
1358 | } |
1359 | |
1360 | // ...or be followed by whitespace, newline, or '>'. |
1361 | let s = text_bytes[tag_len]; |
1362 | if is_ascii_whitespace(s) || s == b'>' { |
1363 | return Some(end_tag); |
1364 | } |
1365 | } |
1366 | |
1367 | for (beg_tag, end_tag) in ST_BEGIN_TAGS.iter().zip(["-->" , "?>" , "]]>" ].iter()) { |
1368 | if text_bytes.starts_with(beg_tag) { |
1369 | return Some(end_tag); |
1370 | } |
1371 | } |
1372 | |
1373 | if text_bytes.len() > 1 |
1374 | && text_bytes[0] == b'!' |
1375 | && text_bytes[1] >= b'A' |
1376 | && text_bytes[1] <= b'Z' |
1377 | { |
1378 | Some(">" ) |
1379 | } else { |
1380 | None |
1381 | } |
1382 | } |
1383 | |
1384 | // https://english.stackexchange.com/a/285573 |
1385 | fn surgerize_tight_list(tree: &mut Tree<Item>, list_ix: TreeIndex) { |
1386 | let mut list_item = tree[list_ix].child; |
1387 | while let Some(listitem_ix) = list_item { |
1388 | // first child is special, controls how we repoint list_item.child |
1389 | let list_item_firstborn = tree[listitem_ix].child; |
1390 | |
1391 | // Check that list item has children - this is not necessarily the case! |
1392 | if let Some(firstborn_ix) = list_item_firstborn { |
1393 | if let ItemBody::Paragraph = tree[firstborn_ix].item.body { |
1394 | tree[listitem_ix].child = tree[firstborn_ix].child; |
1395 | } |
1396 | |
1397 | let mut list_item_child = Some(firstborn_ix); |
1398 | let mut node_to_repoint = None; |
1399 | while let Some(child_ix) = list_item_child { |
1400 | // surgerize paragraphs |
1401 | let repoint_ix = if let ItemBody::Paragraph = tree[child_ix].item.body { |
1402 | if let Some(child_firstborn) = tree[child_ix].child { |
1403 | if let Some(repoint_ix) = node_to_repoint { |
1404 | tree[repoint_ix].next = Some(child_firstborn); |
1405 | } |
1406 | let mut child_lastborn = child_firstborn; |
1407 | while let Some(lastborn_next_ix) = tree[child_lastborn].next { |
1408 | child_lastborn = lastborn_next_ix; |
1409 | } |
1410 | child_lastborn |
1411 | } else { |
1412 | child_ix |
1413 | } |
1414 | } else { |
1415 | child_ix |
1416 | }; |
1417 | |
1418 | node_to_repoint = Some(repoint_ix); |
1419 | tree[repoint_ix].next = tree[child_ix].next; |
1420 | list_item_child = tree[child_ix].next; |
1421 | } |
1422 | } |
1423 | |
1424 | list_item = tree[listitem_ix].next; |
1425 | } |
1426 | } |
1427 | |
1428 | /// Determines whether the delimiter run starting at given index is |
1429 | /// left-flanking, as defined by the commonmark spec (and isn't intraword |
1430 | /// for _ delims). |
1431 | /// suffix is &s[ix..], which is passed in as an optimization, since taking |
1432 | /// a string subslice is O(n). |
1433 | fn delim_run_can_open(s: &str, suffix: &str, run_len: usize, ix: usize) -> bool { |
1434 | let next_char: char = if let Some(c: char) = suffix.chars().nth(run_len) { |
1435 | c |
1436 | } else { |
1437 | return false; |
1438 | }; |
1439 | if next_char.is_whitespace() { |
1440 | return false; |
1441 | } |
1442 | if ix == 0 { |
1443 | return true; |
1444 | } |
1445 | let delim: char = suffix.chars().next().unwrap(); |
1446 | if delim == '*' && !is_punctuation(next_char) { |
1447 | return true; |
1448 | } |
1449 | |
1450 | let prev_char: char = s[..ix].chars().last().unwrap(); |
1451 | |
1452 | prev_char.is_whitespace() |
1453 | || is_punctuation(prev_char) && (delim != ' \'' || ![']' , ')' ].contains(&prev_char)) |
1454 | } |
1455 | |
1456 | /// Determines whether the delimiter run starting at given index is |
1457 | /// left-flanking, as defined by the commonmark spec (and isn't intraword |
1458 | /// for _ delims) |
1459 | fn delim_run_can_close(s: &str, suffix: &str, run_len: usize, ix: usize) -> bool { |
1460 | if ix == 0 { |
1461 | return false; |
1462 | } |
1463 | let prev_char: char = s[..ix].chars().last().unwrap(); |
1464 | if prev_char.is_whitespace() { |
1465 | return false; |
1466 | } |
1467 | let next_char: char = if let Some(c: char) = suffix.chars().nth(run_len) { |
1468 | c |
1469 | } else { |
1470 | return true; |
1471 | }; |
1472 | let delim: char = suffix.chars().next().unwrap(); |
1473 | if delim == '*' && !is_punctuation(prev_char) { |
1474 | return true; |
1475 | } |
1476 | |
1477 | next_char.is_whitespace() || is_punctuation(next_char) |
1478 | } |
1479 | |
1480 | fn create_lut(options: &Options) -> LookupTable { |
1481 | #[cfg (all(target_arch = "x86_64" , feature = "simd" ))] |
1482 | { |
1483 | LookupTable { |
1484 | simd: simd::compute_lookup(options), |
1485 | scalar: special_bytes(options), |
1486 | } |
1487 | } |
1488 | #[cfg (not(all(target_arch = "x86_64" , feature = "simd" )))] |
1489 | { |
1490 | special_bytes(options) |
1491 | } |
1492 | } |
1493 | |
1494 | fn special_bytes(options: &Options) -> [bool; 256] { |
1495 | let mut bytes: [bool; 256] = [false; 256]; |
1496 | let standard_bytes: [u8; 11] = [ |
1497 | b' \n' , b' \r' , b'*' , b'_' , b'&' , b' \\' , b'[' , b']' , b'<' , b'!' , b'`' , |
1498 | ]; |
1499 | |
1500 | for &byte: u8 in &standard_bytes { |
1501 | bytes[byte as usize] = true; |
1502 | } |
1503 | if options.contains(Options::ENABLE_TABLES) { |
1504 | bytes[b'|' as usize] = true; |
1505 | } |
1506 | if options.contains(Options::ENABLE_STRIKETHROUGH) { |
1507 | bytes[b'~' as usize] = true; |
1508 | } |
1509 | if options.contains(Options::ENABLE_SMART_PUNCTUATION) { |
1510 | for &byte: u8 in &[b'.' , b'-' , b'"' , b' \'' ] { |
1511 | bytes[byte as usize] = true; |
1512 | } |
1513 | } |
1514 | |
1515 | bytes |
1516 | } |
1517 | |
1518 | enum LoopInstruction<T> { |
1519 | /// Continue looking for more special bytes, but skip next few bytes. |
1520 | ContinueAndSkip(usize), |
1521 | /// Break looping immediately, returning with the given index and value. |
1522 | BreakAtWith(usize, T), |
1523 | } |
1524 | |
1525 | #[cfg (all(target_arch = "x86_64" , feature = "simd" ))] |
1526 | struct LookupTable { |
1527 | simd: [u8; 16], |
1528 | scalar: [bool; 256], |
1529 | } |
1530 | |
1531 | #[cfg (not(all(target_arch = "x86_64" , feature = "simd" )))] |
1532 | type LookupTable = [bool; 256]; |
1533 | |
1534 | /// This function walks the byte slices from the given index and |
1535 | /// calls the callback function on all bytes (and their indices) that are in the following set: |
1536 | /// `` ` ``, `\`, `&`, `*`, `_`, `~`, `!`, `<`, `[`, `]`, `|`, `\r`, `\n` |
1537 | /// It is guaranteed not call the callback on other bytes. |
1538 | /// Whenever `callback(ix, byte)` returns a `ContinueAndSkip(n)` value, the callback |
1539 | /// will not be called with an index that is less than `ix + n + 1`. |
1540 | /// When the callback returns a `BreakAtWith(end_ix, opt+val)`, no more callbacks will be |
1541 | /// called and the function returns immediately with the return value `(end_ix, opt_val)`. |
1542 | /// If `BreakAtWith(..)` is never returned, this function will return the first |
1543 | /// index that is outside the byteslice bound and a `None` value. |
1544 | fn iterate_special_bytes<F, T>( |
1545 | lut: &LookupTable, |
1546 | bytes: &[u8], |
1547 | ix: usize, |
1548 | callback: F, |
1549 | ) -> (usize, Option<T>) |
1550 | where |
1551 | F: FnMut(usize, u8) -> LoopInstruction<Option<T>>, |
1552 | { |
1553 | #[cfg (all(target_arch = "x86_64" , feature = "simd" ))] |
1554 | { |
1555 | simd::iterate_special_bytes(lut, bytes, ix, callback) |
1556 | } |
1557 | #[cfg (not(all(target_arch = "x86_64" , feature = "simd" )))] |
1558 | { |
1559 | scalar_iterate_special_bytes(lut, bytes, ix, callback) |
1560 | } |
1561 | } |
1562 | |
1563 | fn scalar_iterate_special_bytes<F, T>( |
1564 | lut: &[bool; 256], |
1565 | bytes: &[u8], |
1566 | mut ix: usize, |
1567 | mut callback: F, |
1568 | ) -> (usize, Option<T>) |
1569 | where |
1570 | F: FnMut(usize, u8) -> LoopInstruction<Option<T>>, |
1571 | { |
1572 | while ix < bytes.len() { |
1573 | let b: u8 = bytes[ix]; |
1574 | if lut[b as usize] { |
1575 | match callback(ix, b) { |
1576 | LoopInstruction::ContinueAndSkip(skip: usize) => { |
1577 | ix += skip; |
1578 | } |
1579 | LoopInstruction::BreakAtWith(ix: usize, val: Option) => { |
1580 | return (ix, val); |
1581 | } |
1582 | } |
1583 | } |
1584 | ix += 1; |
1585 | } |
1586 | |
1587 | (ix, None) |
1588 | } |
1589 | |
1590 | /// Split the usual heading content range and the content inside the trailing attribute block. |
1591 | /// |
1592 | /// Returns `(leading_content_len, Option<trailing_attr_block_range>)`. |
1593 | /// |
1594 | /// Note that `trailing_attr_block_range` will be empty range when the block |
1595 | /// is `{}`, since the range is content inside the wrapping `{` and `}`. |
1596 | /// |
1597 | /// The closing `}` of an attribute block can have trailing whitespaces. |
1598 | /// They are automatically trimmed when the attribute block is being searched. |
1599 | /// |
1600 | /// However, this method does not trim the trailing whitespaces of heading content. |
1601 | /// It is callers' responsibility to trim them if necessary. |
1602 | fn extract_attribute_block_content_from_header_text( |
1603 | heading: &[u8], |
1604 | ) -> (usize, Option<Range<usize>>) { |
1605 | let heading_len = heading.len(); |
1606 | let mut ix = heading_len; |
1607 | ix -= scan_rev_while(heading, |b| { |
1608 | b == b' \n' || b == b' \r' || b == b' ' || b == b' \t' |
1609 | }); |
1610 | if ix == 0 { |
1611 | return (heading_len, None); |
1612 | } |
1613 | |
1614 | let attr_block_close = ix - 1; |
1615 | if heading.get(attr_block_close) != Some(&b'}' ) { |
1616 | // The last character is not `}`. No attribute blocks found. |
1617 | return (heading_len, None); |
1618 | } |
1619 | // move cursor before the closing right brace (`}`) |
1620 | ix -= 1; |
1621 | |
1622 | ix -= scan_rev_while(&heading[..ix], |b| { |
1623 | // Characters to be excluded: |
1624 | // * `{` and `}`: special characters to open and close an attribute block. |
1625 | // * `\\`: a special character to escape many characters and disable some syntaxes. |
1626 | // + Handling of this escape character differs among markdown processors. |
1627 | // + Escaped characters will be separate text node from neighbors, so |
1628 | // it is not easy to handle unescaped string and trim the trailing block. |
1629 | // * `<` and `>`: special characters to start and end HTML tag. |
1630 | // + No known processors converts `{#<i>foo</i>}` into |
1631 | // `id="<i>foo</>"` as of this writing, so hopefully |
1632 | // this restriction won't cause compatibility issues. |
1633 | // * `\n` and `\r`: a newline character. |
1634 | // + Setext heading can have multiple lines. However it is hard to support |
1635 | // attribute blocks that have newline inside, since the parsing proceeds line by |
1636 | // line and lines will be separate nodes even they are logically a single text. |
1637 | !matches!(b, b'{' | b'}' | b'<' | b'>' | b' \\' | b' \n' | b' \r' ) |
1638 | }); |
1639 | if ix == 0 { |
1640 | // `{` is not found. No attribute blocks available. |
1641 | return (heading_len, None); |
1642 | } |
1643 | let attr_block_open = ix - 1; |
1644 | if heading[attr_block_open] != b'{' { |
1645 | // `{` is not found. No attribute blocks available. |
1646 | return (heading_len, None); |
1647 | } |
1648 | |
1649 | (attr_block_open, Some(ix..attr_block_close)) |
1650 | } |
1651 | |
1652 | /// Parses an attribute block content, such as `.class1 #id .class2`. |
1653 | /// |
1654 | /// Returns `(id, classes)`. |
1655 | /// |
1656 | /// It is callers' responsibility to find opening and closing characters of the attribute |
1657 | /// block. Usually [`extract_attribute_block_content_from_header_text`] function does it for you. |
1658 | /// |
1659 | /// Note that this parsing requires explicit whitespace separators between |
1660 | /// attributes. This is intentional design with the reasons below: |
1661 | /// |
1662 | /// * to keep conversion simple and easy to understand for any possible input, |
1663 | /// * to avoid adding less obvious conversion rule that can reduce compatibility |
1664 | /// with other implementations more, and |
1665 | /// * to follow the major design of implementations with the support for the |
1666 | /// attribute blocks extension (as of this writing). |
1667 | /// |
1668 | /// See also: [`Options::ENABLE_HEADING_ATTRIBUTES`]. |
1669 | /// |
1670 | /// [`Options::ENABLE_HEADING_ATTRIBUTES`]: `crate::Options::ENABLE_HEADING_ATTRIBUTES` |
1671 | fn parse_inside_attribute_block(inside_attr_block: &str) -> Option<HeadingAttributes> { |
1672 | let mut id: Option<&str> = None; |
1673 | let mut classes: Vec<&str> = Vec::new(); |
1674 | |
1675 | for attr: &str in inside_attr_block.split_ascii_whitespace() { |
1676 | // iterator returned by `str::split_ascii_whitespace` never emits empty |
1677 | // strings, so taking first byte won't panic. |
1678 | if attr.len() > 1 { |
1679 | let first_byte: u8 = attr.as_bytes()[0]; |
1680 | if first_byte == b'#' { |
1681 | id = Some(&attr[1..]); |
1682 | } else if first_byte == b'.' { |
1683 | classes.push(&attr[1..]); |
1684 | } |
1685 | } |
1686 | } |
1687 | |
1688 | Some(HeadingAttributes { id, classes }) |
1689 | } |
1690 | |
1691 | #[cfg (all(target_arch = "x86_64" , feature = "simd" ))] |
1692 | mod simd { |
1693 | //! SIMD byte scanning logic. |
1694 | //! |
1695 | //! This module provides functions that allow walking through byteslices, calling |
1696 | //! provided callback functions on special bytes and their indices using SIMD. |
1697 | //! The byteset is defined in `compute_lookup`. |
1698 | //! |
1699 | //! The idea is to load in a chunk of 16 bytes and perform a lookup into a set of |
1700 | //! bytes on all the bytes in this chunk simultaneously. We produce a 16 bit bitmask |
1701 | //! from this and call the callback on every index corresponding to a 1 in this mask |
1702 | //! before moving on to the next chunk. This allows us to move quickly when there |
1703 | //! are no or few matches. |
1704 | //! |
1705 | //! The table lookup is inspired by this [great overview]. However, since all of the |
1706 | //! bytes we're interested in are ASCII, we don't quite need the full generality of |
1707 | //! the universal algorithm and are hence able to skip a few instructions. |
1708 | //! |
1709 | //! [great overview]: http://0x80.pl/articles/simd-byte-lookup.html |
1710 | |
1711 | use super::{LookupTable, LoopInstruction}; |
1712 | use crate::Options; |
1713 | use core::arch::x86_64::*; |
1714 | |
1715 | const VECTOR_SIZE: usize = std::mem::size_of::<__m128i>(); |
1716 | |
1717 | /// Generates a lookup table containing the bitmaps for our |
1718 | /// special marker bytes. This is effectively a 128 element 2d bitvector, |
1719 | /// that can be indexed by a four bit row index (the lower nibble) |
1720 | /// and a three bit column index (upper nibble). |
1721 | pub(super) fn compute_lookup(options: &Options) -> [u8; 16] { |
1722 | let mut lookup = [0u8; 16]; |
1723 | let standard_bytes = [ |
1724 | b' \n' , b' \r' , b'*' , b'_' , b'&' , b' \\' , b'[' , b']' , b'<' , b'!' , b'`' , |
1725 | ]; |
1726 | |
1727 | for &byte in &standard_bytes { |
1728 | add_lookup_byte(&mut lookup, byte); |
1729 | } |
1730 | if options.contains(Options::ENABLE_TABLES) { |
1731 | add_lookup_byte(&mut lookup, b'|' ); |
1732 | } |
1733 | if options.contains(Options::ENABLE_STRIKETHROUGH) { |
1734 | add_lookup_byte(&mut lookup, b'~' ); |
1735 | } |
1736 | if options.contains(Options::ENABLE_SMART_PUNCTUATION) { |
1737 | for &byte in &[b'.' , b'-' , b'"' , b' \'' ] { |
1738 | add_lookup_byte(&mut lookup, byte); |
1739 | } |
1740 | } |
1741 | |
1742 | lookup |
1743 | } |
1744 | |
1745 | fn add_lookup_byte(lookup: &mut [u8; 16], byte: u8) { |
1746 | lookup[(byte & 0x0f) as usize] |= 1 << (byte >> 4); |
1747 | } |
1748 | |
1749 | /// Computes a bit mask for the given byteslice starting from the given index, |
1750 | /// where the 16 least significant bits indicate (by value of 1) whether or not |
1751 | /// there is a special character at that byte position. The least significant bit |
1752 | /// corresponds to `bytes[ix]` and the most significant bit corresponds to |
1753 | /// `bytes[ix + 15]`. |
1754 | /// It is only safe to call this function when `bytes.len() >= ix + VECTOR_SIZE`. |
1755 | #[target_feature (enable = "ssse3" )] |
1756 | #[inline ] |
1757 | unsafe fn compute_mask(lut: &[u8; 16], bytes: &[u8], ix: usize) -> i32 { |
1758 | debug_assert!(bytes.len() >= ix + VECTOR_SIZE); |
1759 | |
1760 | let bitmap = _mm_loadu_si128(lut.as_ptr() as *const __m128i); |
1761 | // Small lookup table to compute single bit bitshifts |
1762 | // for 16 bytes at once. |
1763 | let bitmask_lookup = |
1764 | _mm_setr_epi8(1, 2, 4, 8, 16, 32, 64, -128, -1, -1, -1, -1, -1, -1, -1, -1); |
1765 | |
1766 | // Load input from memory. |
1767 | let raw_ptr = bytes.as_ptr().add(ix) as *const __m128i; |
1768 | let input = _mm_loadu_si128(raw_ptr); |
1769 | // Compute the bitmap using the bottom nibble as an index |
1770 | // into the lookup table. Note that non-ascii bytes will have |
1771 | // their most significant bit set and will map to lookup[0]. |
1772 | let bitset = _mm_shuffle_epi8(bitmap, input); |
1773 | // Compute the high nibbles of the input using a 16-bit rightshift of four |
1774 | // and a mask to prevent most-significant bit issues. |
1775 | let higher_nibbles = _mm_and_si128(_mm_srli_epi16(input, 4), _mm_set1_epi8(0x0f)); |
1776 | // Create a bitmask for the bitmap by perform a left shift of the value |
1777 | // of the higher nibble. Bytes with their most significant set are mapped |
1778 | // to -1 (all ones). |
1779 | let bitmask = _mm_shuffle_epi8(bitmask_lookup, higher_nibbles); |
1780 | // Test the bit of the bitmap by AND'ing the bitmap and the mask together. |
1781 | let tmp = _mm_and_si128(bitset, bitmask); |
1782 | // Check whether the result was not null. NEQ is not a SIMD intrinsic, |
1783 | // but comparing to the bitmask is logically equivalent. This also prevents us |
1784 | // from matching any non-ASCII bytes since none of the bitmaps were all ones |
1785 | // (-1). |
1786 | let result = _mm_cmpeq_epi8(tmp, bitmask); |
1787 | |
1788 | // Return the resulting bitmask. |
1789 | _mm_movemask_epi8(result) |
1790 | } |
1791 | |
1792 | /// Calls callback on byte indices and their value. |
1793 | /// Breaks when callback returns LoopInstruction::BreakAtWith(ix, val). And skips the |
1794 | /// number of bytes in callback return value otherwise. |
1795 | /// Returns the final index and a possible break value. |
1796 | pub(super) fn iterate_special_bytes<F, T>( |
1797 | lut: &LookupTable, |
1798 | bytes: &[u8], |
1799 | ix: usize, |
1800 | callback: F, |
1801 | ) -> (usize, Option<T>) |
1802 | where |
1803 | F: FnMut(usize, u8) -> LoopInstruction<Option<T>>, |
1804 | { |
1805 | if is_x86_feature_detected!("ssse3" ) && bytes.len() >= VECTOR_SIZE { |
1806 | unsafe { simd_iterate_special_bytes(&lut.simd, bytes, ix, callback) } |
1807 | } else { |
1808 | super::scalar_iterate_special_bytes(&lut.scalar, bytes, ix, callback) |
1809 | } |
1810 | } |
1811 | |
1812 | /// Calls the callback function for every 1 in the given bitmask with |
1813 | /// the index `offset + ix`, where `ix` is the position of the 1 in the mask. |
1814 | /// Returns `Ok(ix)` to continue from index `ix`, `Err((end_ix, opt_val)` to break with |
1815 | /// final index `end_ix` and optional value `opt_val`. |
1816 | unsafe fn process_mask<F, T>( |
1817 | mut mask: i32, |
1818 | bytes: &[u8], |
1819 | mut offset: usize, |
1820 | callback: &mut F, |
1821 | ) -> Result<usize, (usize, Option<T>)> |
1822 | where |
1823 | F: FnMut(usize, u8) -> LoopInstruction<Option<T>>, |
1824 | { |
1825 | while mask != 0 { |
1826 | let mask_ix = mask.trailing_zeros() as usize; |
1827 | offset += mask_ix; |
1828 | match callback(offset, *bytes.get_unchecked(offset)) { |
1829 | LoopInstruction::ContinueAndSkip(skip) => { |
1830 | offset += skip + 1; |
1831 | mask >>= skip + 1 + mask_ix; |
1832 | } |
1833 | LoopInstruction::BreakAtWith(ix, val) => return Err((ix, val)), |
1834 | } |
1835 | } |
1836 | Ok(offset) |
1837 | } |
1838 | |
1839 | #[target_feature (enable = "ssse3" )] |
1840 | /// Important: only call this function when `bytes.len() >= 16`. Doing |
1841 | /// so otherwise may exhibit undefined behaviour. |
1842 | unsafe fn simd_iterate_special_bytes<F, T>( |
1843 | lut: &[u8; 16], |
1844 | bytes: &[u8], |
1845 | mut ix: usize, |
1846 | mut callback: F, |
1847 | ) -> (usize, Option<T>) |
1848 | where |
1849 | F: FnMut(usize, u8) -> LoopInstruction<Option<T>>, |
1850 | { |
1851 | debug_assert!(bytes.len() >= VECTOR_SIZE); |
1852 | let upperbound = bytes.len() - VECTOR_SIZE; |
1853 | |
1854 | while ix < upperbound { |
1855 | let mask = compute_mask(lut, bytes, ix); |
1856 | let block_start = ix; |
1857 | ix = match process_mask(mask, bytes, ix, &mut callback) { |
1858 | Ok(ix) => std::cmp::max(ix, VECTOR_SIZE + block_start), |
1859 | Err((end_ix, val)) => return (end_ix, val), |
1860 | }; |
1861 | } |
1862 | |
1863 | if bytes.len() > ix { |
1864 | // shift off the bytes at start we have already scanned |
1865 | let mask = compute_mask(lut, bytes, upperbound) >> ix - upperbound; |
1866 | if let Err((end_ix, val)) = process_mask(mask, bytes, ix, &mut callback) { |
1867 | return (end_ix, val); |
1868 | } |
1869 | } |
1870 | |
1871 | (bytes.len(), None) |
1872 | } |
1873 | |
1874 | #[cfg (test)] |
1875 | mod simd_test { |
1876 | use super::super::create_lut; |
1877 | use super::{iterate_special_bytes, LoopInstruction}; |
1878 | use crate::Options; |
1879 | |
1880 | fn check_expected_indices(bytes: &[u8], expected: &[usize], skip: usize) { |
1881 | let mut opts = Options::empty(); |
1882 | opts.insert(Options::ENABLE_TABLES); |
1883 | opts.insert(Options::ENABLE_FOOTNOTES); |
1884 | opts.insert(Options::ENABLE_STRIKETHROUGH); |
1885 | opts.insert(Options::ENABLE_TASKLISTS); |
1886 | |
1887 | let lut = create_lut(&opts); |
1888 | let mut indices = vec![]; |
1889 | |
1890 | iterate_special_bytes::<_, i32>(&lut, bytes, 0, |ix, _byte_ty| { |
1891 | indices.push(ix); |
1892 | LoopInstruction::ContinueAndSkip(skip) |
1893 | }); |
1894 | |
1895 | assert_eq!(&indices[..], expected); |
1896 | } |
1897 | |
1898 | #[test ] |
1899 | fn simple_no_match() { |
1900 | check_expected_indices("abcdef0123456789" .as_bytes(), &[], 0); |
1901 | } |
1902 | |
1903 | #[test ] |
1904 | fn simple_match() { |
1905 | check_expected_indices("*bcd&f0123456789" .as_bytes(), &[0, 4], 0); |
1906 | } |
1907 | |
1908 | #[test ] |
1909 | fn single_open_fish() { |
1910 | check_expected_indices("<" .as_bytes(), &[0], 0); |
1911 | } |
1912 | |
1913 | #[test ] |
1914 | fn long_match() { |
1915 | check_expected_indices("0123456789abcde~*bcd&f0" .as_bytes(), &[15, 16, 20], 0); |
1916 | } |
1917 | |
1918 | #[test ] |
1919 | fn border_skip() { |
1920 | check_expected_indices("0123456789abcde~~~~d&f0" .as_bytes(), &[15, 20], 3); |
1921 | } |
1922 | |
1923 | #[test ] |
1924 | fn exhaustive_search() { |
1925 | let chars = [ |
1926 | b' \n' , b' \r' , b'*' , b'_' , b'~' , b'|' , b'&' , b' \\' , b'[' , b']' , b'<' , b'!' , b'`' , |
1927 | ]; |
1928 | |
1929 | for &c in &chars { |
1930 | for i in 0u8..=255 { |
1931 | if !chars.contains(&i) { |
1932 | // full match |
1933 | let mut buf = [i; 18]; |
1934 | buf[3] = c; |
1935 | buf[6] = c; |
1936 | |
1937 | check_expected_indices(&buf[..], &[3, 6], 0); |
1938 | } |
1939 | } |
1940 | } |
1941 | } |
1942 | } |
1943 | } |
1944 | |