firstpass.rs source code [crates/pulldown-cmark/src/firstpass.rs]

1	//! The first pass resolves all block structure, generating an AST. Within a block, items
2	//! are in a linear chain with potential inline markup identified.
3
4	use std::cmp::max;
5	use std::ops::Range;
6
7	use crate::parse::{scan_containers, Allocations, HeadingAttributes, Item, ItemBody, LinkDef};
8	use crate::scanners::*;
9	use crate::strings::CowStr;
10	use crate::tree::{Tree, TreeIndex};
11	use crate::Options;
12	use crate::{
13	linklabel::{scan_link_label_rest, LinkLabel},
14	HeadingLevel,
15	};
16
17	use unicase::UniCase;
18
19	/// Runs the first pass, which resolves the block structure of the document,
20	/// and returns the resulting tree.
21	pub(crate) fn run_first_pass(text: &str, options: Options) -> (Tree<Item>, Allocations) {
22	// This is a very naive heuristic for the number of nodes
23	// we'll need.
24	let start_capacity: usize = max(v1:`128`, v2:text.len() / `32`);
25	let lookup_table: &[bool; 256] = &create_lut(&options);
26	let first_pass: FirstPass<'_, '_> = FirstPass {
27	text,
28	tree: Tree::with_capacity(cap:start_capacity),
29	begin_list_item: `false`,
30	last_line_blank: `false`,
31	allocs: Allocations::new(),
32	options,
33	lookup_table,
34	};
35	first_pass.run()
36	}
37
38	/// State for the first parsing pass.
39	struct FirstPass<'a, 'b> {
40	text: &'a str,
41	tree: Tree<Item>,
42	begin_list_item: bool,
43	last_line_blank: bool,
44	allocs: Allocations<'a>,
45	options: Options,
46	lookup_table: &'b LookupTable,
47	}
48
49	impl<'a, 'b> FirstPass<'a, 'b> {
50	fn run(mut self) -> (Tree<Item>, Allocations<'a>) {
51	let mut ix = `0`;
52	while ix < self.text.len() {
53	ix = self.parse_block(ix);
54	}
55	for _ in `0`..self.tree.spine_len() {
56	self.pop(ix);
57	}
58	(self.tree, self.allocs)
59	}
60
61	/// Returns offset after block.
62	fn parse_block(&mut self, mut start_ix: usize) -> usize {
63	let bytes = self.text.as_bytes();
64	let mut line_start = LineStart::new(&bytes[start_ix..]);
65
66	let i = scan_containers(&self.tree, &mut line_start);
67	for _ in i..self.tree.spine_len() {
68	self.pop(start_ix);
69	}
70
71	if self.options.contains(Options::ENABLE_FOOTNOTES) {
72	// finish footnote if it's still open and was preceded by blank line
73	if let Some(node_ix) = self.tree.peek_up() {
74	if let ItemBody::FootnoteDefinition(..) = self.tree[node_ix].item.body {
75	if self.last_line_blank {
76	self.pop(start_ix);
77	}
78	}
79	}
80
81	// Footnote definitions of the form
82	// [^bar]:
83	// anything really*
84	let container_start = start_ix + line_start.bytes_scanned();
85	if let Some(bytecount) = self.parse_footnote(container_start) {
86	start_ix = container_start + bytecount;
87	start_ix += scan_blank_line(&bytes[start_ix..]).unwrap_or(`0`);
88	line_start = LineStart::new(&bytes[start_ix..]);
89	}
90	}
91
92	// Process new containers
93	loop {
94	let container_start = start_ix + line_start.bytes_scanned();
95	if let Some((ch, index, indent)) = line_start.scan_list_marker() {
96	let after_marker_index = start_ix + line_start.bytes_scanned();
97	self.continue_list(container_start, ch, index);
98	self.tree.append(Item {
99	start: container_start,
100	end: after_marker_index, // will get updated later if item not empty
101	body: ItemBody::ListItem(indent),
102	});
103	self.tree.push();
104	if let Some(n) = scan_blank_line(&bytes[after_marker_index..]) {
105	self.begin_list_item = `true`;
106	return after_marker_index + n;
107	}
108	if self.options.contains(Options::ENABLE_TASKLISTS) {
109	if let Some(is_checked) = line_start.scan_task_list_marker() {
110	self.tree.append(Item {
111	start: after_marker_index,
112	end: start_ix + line_start.bytes_scanned(),
113	body: ItemBody::TaskListMarker(is_checked),
114	});
115	}
116	}
117	} else if line_start.scan_blockquote_marker() {
118	self.finish_list(start_ix);
119	self.tree.append(Item {
120	start: container_start,
121	end: `0`, // will get set later
122	body: ItemBody::BlockQuote,
123	});
124	self.tree.push();
125	} else {
126	break;
127	}
128	}
129
130	let ix = start_ix + line_start.bytes_scanned();
131
132	if let Some(n) = scan_blank_line(&bytes[ix..]) {
133	if let Some(node_ix) = self.tree.peek_up() {
134	match self.tree[node_ix].item.body {
135	ItemBody::BlockQuote => (),
136	_ => {
137	if self.begin_list_item {
138	// A list item can begin with at most one blank line.
139	self.pop(start_ix);
140	}
141	self.last_line_blank = `true`;
142	}
143	}
144	}
145	return ix + n;
146	}
147
148	self.begin_list_item = `false`;
149	self.finish_list(start_ix);
150
151	// Save `remaining_space` here to avoid needing to backtrack `line_start` for HTML blocks
152	let remaining_space = line_start.remaining_space();
153
154	let indent = line_start.scan_space_upto(`4`);
155	if indent == `4` {
156	let ix = start_ix + line_start.bytes_scanned();
157	let remaining_space = line_start.remaining_space();
158	return self.parse_indented_code_block(ix, remaining_space);
159	}
160
161	let ix = start_ix + line_start.bytes_scanned();
162
163	// HTML Blocks
164	if bytes[ix] == b'<' {
165	// Types 1-5 are all detected by one function and all end with the same
166	// pattern
167	if let Some(html_end_tag) = get_html_end_tag(&bytes[(ix + `1`)..]) {
168	return self.parse_html_block_type_1_to_5(ix, html_end_tag, remaining_space);
169	}
170
171	// Detect type 6
172	if starts_html_block_type_6(&bytes[(ix + `1`)..]) {
173	return self.parse_html_block_type_6_or_7(ix, remaining_space);
174	}
175
176	// Detect type 7
177	if let Some(_html_bytes) = scan_html_type_7(&bytes[ix..]) {
178	return self.parse_html_block_type_6_or_7(ix, remaining_space);
179	}
180	}
181
182	if let Ok(n) = scan_hrule(&bytes[ix..]) {
183	return self.parse_hrule(n, ix);
184	}
185
186	if let Some(atx_size) = scan_atx_heading(&bytes[ix..]) {
187	return self.parse_atx_heading(ix, atx_size);
188	}
189
190	// parse refdef
191	if let Some((bytecount, label, link_def)) = self.parse_refdef_total(ix) {
192	self.allocs.refdefs.0.entry(label).or_insert(link_def);
193	let ix = ix + bytecount;
194	// try to read trailing whitespace or it will register as a completely blank line
195	// TODO: shouldn't we do this for all block level items?
196	return ix + scan_blank_line(&bytes[ix..]).unwrap_or(`0`);
197	}
198
199	if let Some((n, fence_ch)) = scan_code_fence(&bytes[ix..]) {
200	return self.parse_fenced_code_block(ix, indent, fence_ch, n);
201	}
202	self.parse_paragraph(ix)
203	}
204
205	/// Returns the offset of the first line after the table.
206	/// Assumptions: current focus is a table element and the table header
207	/// matches the separator line (same number of columns).
208	fn parse_table(&mut self, table_cols: usize, head_start: usize, body_start: usize) -> usize {
209	// parse header. this shouldn't fail because we made sure the table header is ok
210	let (_sep_start, thead_ix) = self.parse_table_row_inner(head_start, table_cols);
211	self.tree[thead_ix].item.body = ItemBody::TableHead;
212
213	// parse body
214	let mut ix = body_start;
215	while let Some((next_ix, _row_ix)) = self.parse_table_row(ix, table_cols) {
216	ix = next_ix;
217	}
218
219	self.pop(ix);
220	ix
221	}
222
223	/// Call this when containers are taken care of.
224	/// Returns bytes scanned, row_ix
225	fn parse_table_row_inner(&mut self, mut ix: usize, row_cells: usize) -> (usize, TreeIndex) {
226	let bytes = self.text.as_bytes();
227	let mut cells = `0`;
228	let mut final_cell_ix = None;
229
230	let row_ix = self.tree.append(Item {
231	start: ix,
232	end: `0`, // set at end of this function
233	body: ItemBody::TableRow,
234	});
235	self.tree.push();
236
237	loop {
238	ix += scan_ch(&bytes[ix..], b'\|');
239	let start_ix = ix;
240	ix += scan_whitespace_no_nl(&bytes[ix..]);
241
242	if let Some(eol_bytes) = scan_eol(&bytes[ix..]) {
243	ix += eol_bytes;
244	break;
245	}
246
247	let cell_ix = self.tree.append(Item {
248	start: start_ix,
249	end: ix,
250	body: ItemBody::TableCell,
251	});
252	self.tree.push();
253	let (next_ix, _brk) = self.parse_line(ix, None, TableParseMode::Active);
254
255	if let Some(cur_ix) = self.tree.cur() {
256	let trailing_whitespace = scan_rev_while(&bytes[..next_ix], is_ascii_whitespace);
257	self.tree[cur_ix].item.end -= trailing_whitespace;
258	}
259
260	self.tree[cell_ix].item.end = next_ix;
261	self.tree.pop();
262
263	ix = next_ix;
264	cells += `1`;
265
266	if cells == row_cells {
267	final_cell_ix = Some(cell_ix);
268	}
269	}
270
271	// fill empty cells if needed
272	// note: this is where GFM and commonmark-extra diverge. we follow
273	// GFM here
274	for _ in cells..row_cells {
275	self.tree.append(Item {
276	start: ix,
277	end: ix,
278	body: ItemBody::TableCell,
279	});
280	}
281
282	// drop excess cells
283	if let Some(cell_ix) = final_cell_ix {
284	self.tree[cell_ix].next = None;
285	}
286
287	self.pop(ix);
288
289	(ix, row_ix)
290	}
291
292	/// Returns first offset after the row and the tree index of the row.
293	fn parse_table_row(&mut self, mut ix: usize, row_cells: usize) -> Option<(usize, TreeIndex)> {
294	let bytes = self.text.as_bytes();
295	let mut line_start = LineStart::new(&bytes[ix..]);
296	let current_container =
297	scan_containers(&self.tree, &mut line_start) == self.tree.spine_len();
298	if !current_container {
299	return None;
300	}
301	line_start.scan_all_space();
302	ix += line_start.bytes_scanned();
303	if scan_paragraph_interrupt(&bytes[ix..], current_container) {
304	return None;
305	}
306
307	let (ix, row_ix) = self.parse_table_row_inner(ix, row_cells);
308	Some((ix, row_ix))
309	}
310
311	/// Returns offset of line start after paragraph.
312	fn parse_paragraph(&mut self, start_ix: usize) -> usize {
313	let node_ix = self.tree.append(Item {
314	start: start_ix,
315	end: `0`, // will get set later
316	body: ItemBody::Paragraph,
317	});
318	self.tree.push();
319	let bytes = self.text.as_bytes();
320
321	let mut ix = start_ix;
322	loop {
323	if self.options.contains(Options::ENABLE_FOOTNOTES) && self.get_footnote(ix).is_some() {
324	self.tree[node_ix].item.end = if ix > start_ix { ix - `1` } else { start_ix };
325	self.tree.pop();
326	if let Some(node_ix) = self.tree.peek_up() {
327	if let ItemBody::FootnoteDefinition(..) = self.tree[node_ix].item.body {
328	self.pop(ix);
329	}
330	}
331	return ix;
332	}
333
334	let scan_mode = if self.options.contains(Options::ENABLE_TABLES) && ix == start_ix {
335	TableParseMode::Scan
336	} else {
337	TableParseMode::Disabled
338	};
339	let (next_ix, brk) = self.parse_line(ix, None, scan_mode);
340
341	// break out when we find a table
342	if let Some(Item {
343	body: ItemBody::Table(alignment_ix),
344	..
345	}) = brk
346	{
347	let table_cols = self.allocs[alignment_ix].len();
348	self.tree[node_ix].item.body = ItemBody::Table(alignment_ix);
349	// this clears out any stuff we may have appended - but there may
350	// be a cleaner way
351	self.tree[node_ix].child = None;
352	self.tree.pop();
353	self.tree.push();
354	return self.parse_table(table_cols, ix, next_ix);
355	}
356
357	ix = next_ix;
358	let mut line_start = LineStart::new(&bytes[ix..]);
359	let current_container =
360	scan_containers(&self.tree, &mut line_start) == self.tree.spine_len();
361	if !line_start.scan_space(`4`) {
362	let ix_new = ix + line_start.bytes_scanned();
363	if current_container {
364	let trailing_backslash_pos = match brk {
365	Some(Item {
366	start,
367	body: ItemBody::HardBreak,
368	..
369	}) if bytes[start] == b'`\\`' => Some(start),
370	_ => None,
371	};
372	if let Some(ix_setext) =
373	self.parse_setext_heading(ix_new, node_ix, trailing_backslash_pos.is_some())
374	{
375	if let Some(pos) = trailing_backslash_pos {
376	self.tree.append_text(pos, pos + `1`);
377	}
378	ix = ix_setext;
379	break;
380	}
381	}
382	// first check for non-empty lists, then for other interrupts
383	let suffix = &bytes[ix_new..];
384	if scan_paragraph_interrupt(suffix, current_container) {
385	break;
386	}
387	}
388	line_start.scan_all_space();
389	if line_start.is_at_eol() {
390	break;
391	}
392	ix = next_ix + line_start.bytes_scanned();
393	if let Some(item) = brk {
394	self.tree.append(item);
395	}
396	}
397
398	self.pop(ix);
399	ix
400	}
401
402	/// Returns end ix of setext_heading on success.
403	fn parse_setext_heading(
404	&mut self,
405	ix: usize,
406	node_ix: TreeIndex,
407	has_trailing_content: bool,
408	) -> Option<usize> {
409	let bytes = self.text.as_bytes();
410	let (n, level) = scan_setext_heading(&bytes[ix..])?;
411	let mut attrs = None;
412
413	if let Some(cur_ix) = self.tree.cur() {
414	let parent_ix = self.tree.peek_up().unwrap();
415	let header_start = self.tree[parent_ix].item.start;
416	// Note that `self.tree[parent_ix].item.end` might be zero at this point.
417	// Use the end position of the current node (i.e. the last known child
418	// of the parent) instead.
419	let header_end = self.tree[cur_ix].item.end;
420
421	// extract the trailing attribute block
422	let (content_end, attrs_) =
423	self.extract_and_parse_heading_attribute_block(header_start, header_end);
424	attrs = attrs_;
425
426	// strip trailing whitespace
427	let new_end = if has_trailing_content {
428	content_end
429	} else {
430	let trailing_ws =
431	scan_rev_while(&bytes[header_start..content_end], is_ascii_whitespace_no_nl);
432	content_end - trailing_ws
433	};
434
435	if attrs.is_some() {
436	// remove trailing block attributes
437	self.tree.truncate_siblings(self.text.as_bytes(), new_end);
438	}
439
440	if let Some(cur_ix) = self.tree.cur() {
441	self.tree[cur_ix].item.end = new_end;
442	}
443	}
444
445	self.tree[node_ix].item.body = ItemBody::Heading(
446	level,
447	attrs.map(\|attrs\| self.allocs.allocate_heading(attrs)),
448	);
449
450	Some(ix + n)
451	}
452
453	/// Parse a line of input, appending text and items to tree.
454	///
455	/// Returns: index after line and an item representing the break.
456	fn parse_line(
457	&mut self,
458	start: usize,
459	end: Option<usize>,
460	mode: TableParseMode,
461	) -> (usize, Option<Item>) {
462	let bytes = self.text.as_bytes();
463	let bytes = match end {
464	Some(end) => &bytes[..end],
465	None => bytes,
466	};
467	let bytes_len = bytes.len();
468	let mut pipes = `0`;
469	let mut last_pipe_ix = start;
470	let mut begin_text = start;
471
472	let (final_ix, brk) = iterate_special_bytes(self.lookup_table, bytes, start, \|ix, byte\| {
473	match byte {
474	b'`\n`' \| b'`\r`' => {
475	if let TableParseMode::Active = mode {
476	return LoopInstruction::BreakAtWith(ix, None);
477	}
478
479	let mut i = ix;
480	let eol_bytes = scan_eol(&bytes[ix..]).unwrap();
481	if mode == TableParseMode::Scan && pipes > `0` {
482	// check if we may be parsing a table
483	let next_line_ix = ix + eol_bytes;
484	let mut line_start = LineStart::new(&bytes[next_line_ix..]);
485	if scan_containers(&self.tree, &mut line_start) == self.tree.spine_len() {
486	let table_head_ix = next_line_ix + line_start.bytes_scanned();
487	let (table_head_bytes, alignment) =
488	scan_table_head(&bytes[table_head_ix..]);
489
490	if table_head_bytes > `0` {
491	// computing header count from number of pipes
492	let header_count =
493	count_header_cols(bytes, pipes, start, last_pipe_ix);
494
495	// make sure they match the number of columns we find in separator line
496	if alignment.len() == header_count {
497	let alignment_ix = self.allocs.allocate_alignment(alignment);
498	let end_ix = table_head_ix + table_head_bytes;
499	return LoopInstruction::BreakAtWith(
500	end_ix,
501	Some(Item {
502	start: i,
503	end: end_ix, // must update later
504	body: ItemBody::Table(alignment_ix),
505	}),
506	);
507	}
508	}
509	}
510	}
511
512	let end_ix = ix + eol_bytes;
513	let trailing_backslashes = scan_rev_while(&bytes[..ix], \|b\| b == b'`\\`');
514	if trailing_backslashes % `2` == `1` && end_ix < bytes_len {
515	i -= `1`;
516	self.tree.append_text(begin_text, i);
517	return LoopInstruction::BreakAtWith(
518	end_ix,
519	Some(Item {
520	start: i,
521	end: end_ix,
522	body: ItemBody::HardBreak,
523	}),
524	);
525	}
526	let trailing_whitespace =
527	scan_rev_while(&bytes[..ix], is_ascii_whitespace_no_nl);
528	if trailing_whitespace >= `2` {
529	i -= trailing_whitespace;
530	self.tree.append_text(begin_text, i);
531	return LoopInstruction::BreakAtWith(
532	end_ix,
533	Some(Item {
534	start: i,
535	end: end_ix,
536	body: ItemBody::HardBreak,
537	}),
538	);
539	}
540
541	self.tree.append_text(begin_text, ix);
542	LoopInstruction::BreakAtWith(
543	end_ix,
544	Some(Item {
545	start: i,
546	end: end_ix,
547	body: ItemBody::SoftBreak,
548	}),
549	)
550	}
551	b'`\\`' => {
552	if ix + `1` < bytes_len && is_ascii_punctuation(bytes[ix + `1`]) {
553	self.tree.append_text(begin_text, ix);
554	if bytes[ix + `1`] == b'`' {
555	let count = `1` + scan_ch_repeat(&bytes[(ix + `2`)..], b'`');
556	self.tree.append(Item {
557	start: ix + `1`,
558	end: ix + count + `1`,
559	body: ItemBody::MaybeCode(count, `true`),
560	});
561	begin_text = ix + `1` + count;
562	LoopInstruction::ContinueAndSkip(count)
563	} else {
564	begin_text = ix + `1`;
565	LoopInstruction::ContinueAndSkip(`1`)
566	}
567	} else {
568	LoopInstruction::ContinueAndSkip(`0`)
569	}
570	}
571	c @ b'*' \| c @ b'_' \| c @ b'~' => {
572	let string_suffix = &self.text[ix..];
573	let count = `1` + scan_ch_repeat(&string_suffix.as_bytes()[`1`..], c);
574	let can_open = delim_run_can_open(self.text, string_suffix, count, ix);
575	let can_close = delim_run_can_close(self.text, string_suffix, count, ix);
576	let is_valid_seq = c != b'~' \|\| count <= `2`;
577
578	if (can_open \|\| can_close) && is_valid_seq {
579	self.tree.append_text(begin_text, ix);
580	for i in `0`..count {
581	self.tree.append(Item {
582	start: ix + i,
583	end: ix + i + `1`,
584	body: ItemBody::MaybeEmphasis(count - i, can_open, can_close),
585	});
586	}
587	begin_text = ix + count;
588	}
589	LoopInstruction::ContinueAndSkip(count - `1`)
590	}
591	b'`' => {
592	self.tree.append_text(begin_text, ix);
593	let count = `1` + scan_ch_repeat(&bytes[(ix + `1`)..], b'`');
594	self.tree.append(Item {
595	start: ix,
596	end: ix + count,
597	body: ItemBody::MaybeCode(count, `false`),
598	});
599	begin_text = ix + count;
600	LoopInstruction::ContinueAndSkip(count - `1`)
601	}
602	b'<' => {
603	// Note: could detect some non-HTML cases and early escape here, but not
604	// clear that's a win.
605	self.tree.append_text(begin_text, ix);
606	self.tree.append(Item {
607	start: ix,
608	end: ix + `1`,
609	body: ItemBody::MaybeHtml,
610	});
611	begin_text = ix + `1`;
612	LoopInstruction::ContinueAndSkip(`0`)
613	}
614	b'!' => {
615	if ix + `1` < bytes_len && bytes[ix + `1`] == b'[' {
616	self.tree.append_text(begin_text, ix);
617	self.tree.append(Item {
618	start: ix,
619	end: ix + `2`,
620	body: ItemBody::MaybeImage,
621	});
622	begin_text = ix + `2`;
623	LoopInstruction::ContinueAndSkip(`1`)
624	} else {
625	LoopInstruction::ContinueAndSkip(`0`)
626	}
627	}
628	b'[' => {
629	self.tree.append_text(begin_text, ix);
630	self.tree.append(Item {
631	start: ix,
632	end: ix + `1`,
633	body: ItemBody::MaybeLinkOpen,
634	});
635	begin_text = ix + `1`;
636	LoopInstruction::ContinueAndSkip(`0`)
637	}
638	b']' => {
639	self.tree.append_text(begin_text, ix);
640	self.tree.append(Item {
641	start: ix,
642	end: ix + `1`,
643	body: ItemBody::MaybeLinkClose(`true`),
644	});
645	begin_text = ix + `1`;
646	LoopInstruction::ContinueAndSkip(`0`)
647	}
648	b'&' => match scan_entity(&bytes[ix..]) {
649	(n, Some(value)) => {
650	self.tree.append_text(begin_text, ix);
651	self.tree.append(Item {
652	start: ix,
653	end: ix + n,
654	body: ItemBody::SynthesizeText(self.allocs.allocate_cow(value)),
655	});
656	begin_text = ix + n;
657	LoopInstruction::ContinueAndSkip(n - `1`)
658	}
659	_ => LoopInstruction::ContinueAndSkip(`0`),
660	},
661	b'\|' => {
662	if let TableParseMode::Active = mode {
663	LoopInstruction::BreakAtWith(ix, None)
664	} else {
665	last_pipe_ix = ix;
666	pipes += `1`;
667	LoopInstruction::ContinueAndSkip(`0`)
668	}
669	}
670	b'.' => {
671	if ix + `2` < bytes.len() && bytes[ix + `1`] == b'.' && bytes[ix + `2`] == b'.' {
672	self.tree.append_text(begin_text, ix);
673	self.tree.append(Item {
674	start: ix,
675	end: ix + `3`,
676	body: ItemBody::SynthesizeChar('…'),
677	});
678	begin_text = ix + `3`;
679	LoopInstruction::ContinueAndSkip(`2`)
680	} else {
681	LoopInstruction::ContinueAndSkip(`0`)
682	}
683	}
684	b'-' => {
685	let count = `1` + scan_ch_repeat(&bytes[(ix + `1`)..], b'-');
686	if count == `1` {
687	LoopInstruction::ContinueAndSkip(`0`)
688	} else {
689	let itembody = if count == `2` {
690	ItemBody::SynthesizeChar('–')
691	} else if count == `3` {
692	ItemBody::SynthesizeChar('—')
693	} else {
694	let (ems, ens) = match count % `6` {
695	`0` \| `3` => (count / `3`, `0`),
696	`2` \| `4` => (`0`, count / `2`),
697	`1` => (count / `3` - `1`, `2`),
698	_ => (count / `3`, `1`),
699	};
700	// – and — are 3 bytes each in utf8
701	let mut buf = String::with_capacity(`3` * (ems + ens));
702	for _ in `0`..ems {
703	buf.push('—');
704	}
705	for _ in `0`..ens {
706	buf.push('–');
707	}
708	ItemBody::SynthesizeText(self.allocs.allocate_cow(buf.into()))
709	};
710
711	self.tree.append_text(begin_text, ix);
712	self.tree.append(Item {
713	start: ix,
714	end: ix + count,
715	body: itembody,
716	});
717	begin_text = ix + count;
718	LoopInstruction::ContinueAndSkip(count - `1`)
719	}
720	}
721	c @ b'`\'`' \| c @ b'"' => {
722	let string_suffix = &self.text[ix..];
723	let can_open = delim_run_can_open(self.text, string_suffix, `1`, ix);
724	let can_close = delim_run_can_close(self.text, string_suffix, `1`, ix);
725
726	self.tree.append_text(begin_text, ix);
727	self.tree.append(Item {
728	start: ix,
729	end: ix + `1`,
730	body: ItemBody::MaybeSmartQuote(c, can_open, can_close),
731	});
732	begin_text = ix + `1`;
733
734	LoopInstruction::ContinueAndSkip(`0`)
735	}
736	_ => LoopInstruction::ContinueAndSkip(`0`),
737	}
738	});
739
740	if brk.is_none() {
741	// need to close text at eof
742	self.tree.append_text(begin_text, final_ix);
743	}
744	(final_ix, brk)
745	}
746
747	/// When start_ix is at the beginning of an HTML block of type 1 to 5,
748	/// this will find the end of the block, adding the block itself to the
749	/// tree and also keeping track of the lines of HTML within the block.
750	///
751	/// The html_end_tag is the tag that must be found on a line to end the block.
752	fn parse_html_block_type_1_to_5(
753	&mut self,
754	start_ix: usize,
755	html_end_tag: &str,
756	mut remaining_space: usize,
757	) -> usize {
758	let bytes = self.text.as_bytes();
759	let mut ix = start_ix;
760	loop {
761	let line_start_ix = ix;
762	ix += scan_nextline(&bytes[ix..]);
763	self.append_html_line(remaining_space, line_start_ix, ix);
764
765	let mut line_start = LineStart::new(&bytes[ix..]);
766	let n_containers = scan_containers(&self.tree, &mut line_start);
767	if n_containers < self.tree.spine_len() {
768	break;
769	}
770
771	if (&self.text[line_start_ix..ix]).contains(html_end_tag) {
772	break;
773	}
774
775	let next_line_ix = ix + line_start.bytes_scanned();
776	if next_line_ix == self.text.len() {
777	break;
778	}
779	ix = next_line_ix;
780	remaining_space = line_start.remaining_space();
781	}
782	ix
783	}
784
785	/// When start_ix is at the beginning of an HTML block of type 6 or 7,
786	/// this will consume lines until there is a blank line and keep track of
787	/// the HTML within the block.
788	fn parse_html_block_type_6_or_7(
789	&mut self,
790	start_ix: usize,
791	mut remaining_space: usize,
792	) -> usize {
793	let bytes = self.text.as_bytes();
794	let mut ix = start_ix;
795	loop {
796	let line_start_ix = ix;
797	ix += scan_nextline(&bytes[ix..]);
798	self.append_html_line(remaining_space, line_start_ix, ix);
799
800	let mut line_start = LineStart::new(&bytes[ix..]);
801	let n_containers = scan_containers(&self.tree, &mut line_start);
802	if n_containers < self.tree.spine_len() \|\| line_start.is_at_eol() {
803	break;
804	}
805
806	let next_line_ix = ix + line_start.bytes_scanned();
807	if next_line_ix == self.text.len() \|\| scan_blank_line(&bytes[next_line_ix..]).is_some()
808	{
809	break;
810	}
811	ix = next_line_ix;
812	remaining_space = line_start.remaining_space();
813	}
814	ix
815	}
816
817	fn parse_indented_code_block(&mut self, start_ix: usize, mut remaining_space: usize) -> usize {
818	self.tree.append(Item {
819	start: start_ix,
820	end: `0`, // will get set later
821	body: ItemBody::IndentCodeBlock,
822	});
823	self.tree.push();
824	let bytes = self.text.as_bytes();
825	let mut last_nonblank_child = None;
826	let mut last_nonblank_ix = `0`;
827	let mut end_ix = `0`;
828	let mut last_line_blank = `false`;
829
830	let mut ix = start_ix;
831	loop {
832	let line_start_ix = ix;
833	ix += scan_nextline(&bytes[ix..]);
834	self.append_code_text(remaining_space, line_start_ix, ix);
835	// TODO(spec clarification): should we synthesize newline at EOF?
836
837	if !last_line_blank {
838	last_nonblank_child = self.tree.cur();
839	last_nonblank_ix = ix;
840	end_ix = ix;
841	}
842
843	let mut line_start = LineStart::new(&bytes[ix..]);
844	let n_containers = scan_containers(&self.tree, &mut line_start);
845	if n_containers < self.tree.spine_len()
846	\|\| !(line_start.scan_space(`4`) \|\| line_start.is_at_eol())
847	{
848	break;
849	}
850	let next_line_ix = ix + line_start.bytes_scanned();
851	if next_line_ix == self.text.len() {
852	break;
853	}
854	ix = next_line_ix;
855	remaining_space = line_start.remaining_space();
856	last_line_blank = scan_blank_line(&bytes[ix..]).is_some();
857	}
858
859	// Trim trailing blank lines.
860	if let Some(child) = last_nonblank_child {
861	self.tree[child].next = None;
862	self.tree[child].item.end = last_nonblank_ix;
863	}
864	self.pop(end_ix);
865	ix
866	}
867
868	fn parse_fenced_code_block(
869	&mut self,
870	start_ix: usize,
871	indent: usize,
872	fence_ch: u8,
873	n_fence_char: usize,
874	) -> usize {
875	let bytes = self.text.as_bytes();
876	let mut info_start = start_ix + n_fence_char;
877	info_start += scan_whitespace_no_nl(&bytes[info_start..]);
878	// TODO: info strings are typically very short. wouldn't it be faster
879	// to just do a forward scan here?
880	let mut ix = info_start + scan_nextline(&bytes[info_start..]);
881	let info_end = ix - scan_rev_while(&bytes[info_start..ix], is_ascii_whitespace);
882	let info_string = unescape(&self.text[info_start..info_end]);
883	self.tree.append(Item {
884	start: start_ix,
885	end: `0`, // will get set later
886	body: ItemBody::FencedCodeBlock(self.allocs.allocate_cow(info_string)),
887	});
888	self.tree.push();
889	loop {
890	let mut line_start = LineStart::new(&bytes[ix..]);
891	let n_containers = scan_containers(&self.tree, &mut line_start);
892	if n_containers < self.tree.spine_len() {
893	break;
894	}
895	line_start.scan_space(indent);
896	let mut close_line_start = line_start.clone();
897	if !close_line_start.scan_space(`4`) {
898	let close_ix = ix + close_line_start.bytes_scanned();
899	if let Some(n) = scan_closing_code_fence(&bytes[close_ix..], fence_ch, n_fence_char)
900	{
901	ix = close_ix + n;
902	break;
903	}
904	}
905	let remaining_space = line_start.remaining_space();
906	ix += line_start.bytes_scanned();
907	let next_ix = ix + scan_nextline(&bytes[ix..]);
908	self.append_code_text(remaining_space, ix, next_ix);
909	ix = next_ix;
910	}
911
912	self.pop(ix);
913
914	// try to read trailing whitespace or it will register as a completely blank line
915	ix + scan_blank_line(&bytes[ix..]).unwrap_or(`0`)
916	}
917
918	fn append_code_text(&mut self, remaining_space: usize, start: usize, end: usize) {
919	if remaining_space > `0` {
920	let cow_ix = self.allocs.allocate_cow(" "[..remaining_space].into());
921	self.tree.append(Item {
922	start,
923	end: start,
924	body: ItemBody::SynthesizeText(cow_ix),
925	});
926	}
927	if self.text.as_bytes()[end - `2`] == b'`\r`' {
928	// Normalize CRLF to LF
929	self.tree.append_text(start, end - `2`);
930	self.tree.append_text(end - `1`, end);
931	} else {
932	self.tree.append_text(start, end);
933	}
934	}
935
936	/// Appends a line of HTML to the tree.
937	fn append_html_line(&mut self, remaining_space: usize, start: usize, end: usize) {
938	if remaining_space > `0` {
939	let cow_ix = self.allocs.allocate_cow(" "[..remaining_space].into());
940	self.tree.append(Item {
941	start,
942	end: start,
943	// TODO: maybe this should synthesize to html rather than text?
944	body: ItemBody::SynthesizeText(cow_ix),
945	});
946	}
947	if self.text.as_bytes()[end - `2`] == b'`\r`' {
948	// Normalize CRLF to LF
949	self.tree.append(Item {
950	start,
951	end: end - `2`,
952	body: ItemBody::Html,
953	});
954	self.tree.append(Item {
955	start: end - `1`,
956	end,
957	body: ItemBody::Html,
958	});
959	} else {
960	self.tree.append(Item {
961	start,
962	end,
963	body: ItemBody::Html,
964	});
965	}
966	}
967
968	/// Pop a container, setting its end.
969	fn pop(&mut self, ix: usize) {
970	let cur_ix = self.tree.pop().unwrap();
971	self.tree[cur_ix].item.end = ix;
972	if let ItemBody::List(`true`, _, _) = self.tree[cur_ix].item.body {
973	surgerize_tight_list(&mut self.tree, cur_ix);
974	}
975	}
976
977	/// Close a list if it's open. Also set loose if last line was blank
978	fn finish_list(&mut self, ix: usize) {
979	if let Some(node_ix) = self.tree.peek_up() {
980	if let ItemBody::List(_, _, _) = self.tree[node_ix].item.body {
981	self.pop(ix);
982	}
983	}
984	if self.last_line_blank {
985	if let Some(node_ix) = self.tree.peek_grandparent() {
986	if let ItemBody::List(ref mut is_tight, _, _) = self.tree[node_ix].item.body {
987	*is_tight = `false`;
988	}
989	}
990	self.last_line_blank = `false`;
991	}
992	}
993
994	/// Continue an existing list or start a new one if there's not an open
995	/// list that matches.
996	fn continue_list(&mut self, start: usize, ch: u8, index: u64) {
997	if let Some(node_ix) = self.tree.peek_up() {
998	if let ItemBody::List(ref mut is_tight, existing_ch, _) = self.tree[node_ix].item.body {
999	if existing_ch == ch {
1000	if self.last_line_blank {
1001	*is_tight = `false`;
1002	self.last_line_blank = `false`;
1003	}
1004	return;
1005	}
1006	}
1007	// TODO: this is not the best choice for end; maybe get end from last list item.
1008	self.finish_list(start);
1009	}
1010	self.tree.append(Item {
1011	start,
1012	end: `0`, // will get set later
1013	body: ItemBody::List(`true`, ch, index),
1014	});
1015	self.tree.push();
1016	self.last_line_blank = `false`;
1017	}
1018
1019	/// Parse a thematic break.
1020	///
1021	/// Returns index of start of next line.
1022	fn parse_hrule(&mut self, hrule_size: usize, ix: usize) -> usize {
1023	self.tree.append(Item {
1024	start: ix,
1025	end: ix + hrule_size,
1026	body: ItemBody::Rule,
1027	});
1028	ix + hrule_size
1029	}
1030
1031	/// Parse an ATX heading.
1032	///
1033	/// Returns index of start of next line.
1034	fn parse_atx_heading(&mut self, start: usize, atx_level: HeadingLevel) -> usize {
1035	let mut ix = start;
1036	let heading_ix = self.tree.append(Item {
1037	start,
1038	end: `0`, // set later
1039	body: ItemBody::default(), // set later
1040	});
1041	ix += atx_level as usize;
1042	// next char is space or eol (guaranteed by scan_atx_heading)
1043	let bytes = self.text.as_bytes();
1044	if let Some(eol_bytes) = scan_eol(&bytes[ix..]) {
1045	self.tree[heading_ix].item.end = ix + eol_bytes;
1046	self.tree[heading_ix].item.body = ItemBody::Heading(atx_level, None);
1047	return ix + eol_bytes;
1048	}
1049	// skip leading spaces
1050	let skip_spaces = scan_whitespace_no_nl(&bytes[ix..]);
1051	ix += skip_spaces;
1052
1053	// now handle the header text
1054	let header_start = ix;
1055	let header_node_idx = self.tree.push(); // so that we can set the endpoint later
1056
1057	// trim the trailing attribute block before parsing the entire line, if necessary
1058	let (end, content_end, attrs) = if self.options.contains(Options::ENABLE_HEADING_ATTRIBUTES)
1059	{
1060	// the start of the next line is the end of the header since the
1061	// header cannot have line breaks
1062	let header_end = header_start + scan_nextline(&bytes[header_start..]);
1063	let (content_end, attrs) =
1064	self.extract_and_parse_heading_attribute_block(header_start, header_end);
1065	self.parse_line(ix, Some(content_end), TableParseMode::Disabled);
1066	(header_end, content_end, attrs)
1067	} else {
1068	ix = self.parse_line(ix, None, TableParseMode::Disabled).0;
1069	(ix, ix, None)
1070	};
1071	self.tree[header_node_idx].item.end = end;
1072
1073	// remove trailing matter from header text
1074	if let Some(cur_ix) = self.tree.cur() {
1075	// remove closing of the ATX heading
1076	let header_text = &bytes[header_start..content_end];
1077	let mut limit = header_text
1078	.iter()
1079	.rposition(\|&b\| !(b == b'`\n`' \|\| b == b'`\r`' \|\| b == b' '))
1080	.map_or(`0`, \|i\| i + `1`);
1081	let closer = header_text[..limit]
1082	.iter()
1083	.rposition(\|&b\| b != b'#')
1084	.map_or(`0`, \|i\| i + `1`);
1085	if closer == `0` {
1086	limit = closer;
1087	} else {
1088	let spaces = scan_rev_while(&header_text[..closer], \|b\| b == b' ');
1089	if spaces > `0` {
1090	limit = closer - spaces;
1091	}
1092	}
1093	self.tree[cur_ix].item.end = limit + header_start;
1094	}
1095
1096	self.tree.pop();
1097	self.tree[heading_ix].item.body = ItemBody::Heading(
1098	atx_level,
1099	attrs.map(\|attrs\| self.allocs.allocate_heading(attrs)),
1100	);
1101	end
1102	}
1103
1104	fn get_footnote(&mut self, start: usize) -> Option<(usize, CowStr<'a>)> {
1105	let bytes = &self.text.as_bytes()[start..];
1106	if !bytes.starts_with(b"[^") {
1107	return None;
1108	}
1109	let (mut i, label) = self.parse_refdef_label(start + `2`)?;
1110	i += `2`;
1111	if scan_ch(&bytes[i..], b':') == `0` {
1112	return None;
1113	}
1114	i += `1`;
1115	Some((i, label))
1116	}
1117
1118	/// Returns the number of bytes scanned on success.
1119	fn parse_footnote(&mut self, start: usize) -> Option<usize> {
1120	let (i, label) = self.get_footnote(start)?;
1121	self.finish_list(start);
1122	self.tree.append(Item {
1123	start,
1124	end: `0`, // will get set later
1125	// TODO: check whether the label here is strictly necessary
1126	body: ItemBody::FootnoteDefinition(self.allocs.allocate_cow(label)),
1127	});
1128	self.tree.push();
1129	Some(i)
1130	}
1131
1132	/// Tries to parse a reference label, which can be interrupted by new blocks.
1133	/// On success, returns the number of bytes of the label and the label itself.
1134	fn parse_refdef_label(&self, start: usize) -> Option<(usize, CowStr<'a>)> {
1135	scan_link_label_rest(&self.text[start..], &\|bytes\| {
1136	let mut line_start = LineStart::new(bytes);
1137	let current_container =
1138	scan_containers(&self.tree, &mut line_start) == self.tree.spine_len();
1139	let bytes_scanned = line_start.bytes_scanned();
1140	let suffix = &bytes[bytes_scanned..];
1141	if scan_paragraph_interrupt(suffix, current_container) {
1142	None
1143	} else {
1144	Some(bytes_scanned)
1145	}
1146	})
1147	}
1148
1149	/// Returns number of bytes scanned, label and definition on success.
1150	fn parse_refdef_total(&mut self, start: usize) -> Option<(usize, LinkLabel<'a>, LinkDef<'a>)> {
1151	let bytes = &self.text.as_bytes()[start..];
1152	if scan_ch(bytes, b'[') == `0` {
1153	return None;
1154	}
1155	let (mut i, label) = self.parse_refdef_label(start + `1`)?;
1156	i += `1`;
1157	if scan_ch(&bytes[i..], b':') == `0` {
1158	return None;
1159	}
1160	i += `1`;
1161	let (bytecount, link_def) = self.scan_refdef(start, start + i)?;
1162	Some((bytecount + i, UniCase::new(label), link_def))
1163	}
1164
1165	/// Returns number of bytes and number of newlines
1166	fn scan_refdef_space(&self, bytes: &[u8], mut i: usize) -> Option<(usize, usize)> {
1167	let mut newlines = `0`;
1168	loop {
1169	let whitespaces = scan_whitespace_no_nl(&bytes[i..]);
1170	i += whitespaces;
1171	if let Some(eol_bytes) = scan_eol(&bytes[i..]) {
1172	i += eol_bytes;
1173	newlines += `1`;
1174	if newlines > `1` {
1175	return None;
1176	}
1177	} else {
1178	break;
1179	}
1180	let mut line_start = LineStart::new(&bytes[i..]);
1181	if self.tree.spine_len() != scan_containers(&self.tree, &mut line_start) {
1182	return None;
1183	}
1184	i += line_start.bytes_scanned();
1185	}
1186	Some((i, newlines))
1187	}
1188
1189	/// Returns # of bytes and definition.
1190	/// Assumes the label of the reference including colon has already been scanned.
1191	fn scan_refdef(&self, span_start: usize, start: usize) -> Option<(usize, LinkDef<'a>)> {
1192	let bytes = self.text.as_bytes();
1193
1194	// whitespace between label and url (including up to one newline)
1195	let (mut i, _newlines) = self.scan_refdef_space(bytes, start)?;
1196
1197	// scan link dest
1198	let (dest_length, dest) = scan_link_dest(self.text, i, `1`)?;
1199	if dest_length == `0` {
1200	return None;
1201	}
1202	let dest = unescape(dest);
1203	i += dest_length;
1204
1205	// no title
1206	let mut backup = (
1207	i - start,
1208	LinkDef {
1209	dest,
1210	title: None,
1211	span: span_start..i,
1212	},
1213	);
1214
1215	// scan whitespace between dest and label
1216	let (mut i, newlines) =
1217	if let Some((new_i, mut newlines)) = self.scan_refdef_space(bytes, i) {
1218	if i == self.text.len() {
1219	newlines += `1`;
1220	}
1221	if new_i == i && newlines == `0` {
1222	return None;
1223	}
1224	if newlines > `1` {
1225	return Some(backup);
1226	};
1227	(new_i, newlines)
1228	} else {
1229	return Some(backup);
1230	};
1231
1232	// scan title
1233	// if this fails but newline == 1, return also a refdef without title
1234	if let Some((title_length, title)) = scan_refdef_title(&self.text[i..]) {
1235	i += title_length;
1236	backup.1.span = span_start..i;
1237	backup.1.title = Some(unescape(title));
1238	} else if newlines > `0` {
1239	return Some(backup);
1240	} else {
1241	return None;
1242	};
1243
1244	// scan EOL
1245	if let Some(bytes) = scan_blank_line(&bytes[i..]) {
1246	backup.0 = i + bytes - start;
1247	Some(backup)
1248	} else if newlines > `0` {
1249	Some(backup)
1250	} else {
1251	None
1252	}
1253	}
1254
1255	/// Extracts and parses a heading attribute block if exists.
1256	///
1257	/// Returns `(end_offset_of_heading_content, (id, classes))`.
1258	///
1259	/// If `header_end` is less than or equal to `header_start`, the given
1260	/// input is considered as empty.
1261	fn extract_and_parse_heading_attribute_block(
1262	&mut self,
1263	header_start: usize,
1264	header_end: usize,
1265	) -> (usize, Option<HeadingAttributes<'a>>) {
1266	if !self.options.contains(Options::ENABLE_HEADING_ATTRIBUTES) {
1267	return (header_end, None);
1268	}
1269
1270	// extract the trailing attribute block
1271	let header_bytes = &self.text.as_bytes()[header_start..header_end];
1272	let (content_len, attr_block_range_rel) =
1273	extract_attribute_block_content_from_header_text(header_bytes);
1274	let content_end = header_start + content_len;
1275	let attrs = attr_block_range_rel.and_then(\|r\| {
1276	parse_inside_attribute_block(
1277	&self.text[(header_start + r.start)..(header_start + r.end)],
1278	)
1279	});
1280	(content_end, attrs)
1281	}
1282	}
1283
1284	/// Scanning modes for `Parser`'s `parse_line` method.
1285	#[derive(PartialEq, Eq, Copy, Clone)]
1286	enum TableParseMode {
1287	/// Inside a paragraph, scanning for table headers.
1288	Scan,
1289	/// Inside a table.
1290	Active,
1291	/// Inside a paragraph, not scanning for table headers.
1292	Disabled,
1293	}
1294
1295	/// Computes the number of header columns in a table line by computing the number of dividing pipes
1296	/// that aren't followed or preceded by whitespace.
1297	fn count_header_cols(
1298	bytes: &[u8],
1299	mut pipes: usize,
1300	mut start: usize,
1301	last_pipe_ix: usize,
1302	) -> usize {
1303	// was first pipe preceded by whitespace? if so, subtract one
1304	start += scan_whitespace_no_nl(&bytes[start..]);
1305	if bytes[start] == b'\|' {
1306	pipes -= `1`;
1307	}
1308
1309	// was last pipe followed by whitespace? if so, sub one
1310	if scan_blank_line(&bytes[(last_pipe_ix + `1`)..]).is_some() {
1311	pipes
1312	} else {
1313	pipes + `1`
1314	}
1315	}
1316
1317	/// Checks whether we should break a paragraph on the given input.
1318	fn scan_paragraph_interrupt(bytes: &[u8], current_container: bool) -> bool {
1319	scan_eol(bytes).is_some()
1320	\|\| scan_hrule(bytes).is_ok()
1321	\|\| scan_atx_heading(data:bytes).is_some()
1322	\|\| scan_code_fence(data:bytes).is_some()
1323	\|\| scan_blockquote_start(data:bytes).is_some()
1324	\|\| scan_listitem(bytes).map_or(default:`false`, \|(ix: usize, delim: u8, index: usize, _)\| {
1325	! current_container \|\|
1326	// we don't allow interruption by either empty lists or
1327	// numbered lists starting at an index other than 1
1328	(delim == b'*' \|\| delim == b'-' \|\| delim == b'+' \|\| index == `1`)
1329	&& !scan_empty_list(&bytes[ix..])
1330	})
1331	\|\| bytes.starts_with(needle:b"<")
1332	&& (get_html_end_tag(&bytes[`1`..]).is_some() \|\| starts_html_block_type_6(&bytes[`1`..]))
1333	}
1334
1335	/// Assumes `text_bytes` is preceded by `<`.
1336	fn get_html_end_tag(text_bytes: &[u8]) -> Option<&'static str> {
1337	static BEGIN_TAGS: &[&[u8]; `4`] = &[b"pre", b"style", b"script", b"textarea"];
1338	static ST_BEGIN_TAGS: &[&[u8]; `3`] = &[b"!--", b"?", b"![CDATA["];
1339
1340	for (beg_tag, end_tag) in BEGIN_TAGS
1341	.iter()
1342	.zip(["</pre>", "</style>", "</script>", "</textarea>"].iter())
1343	{
1344	let tag_len = beg_tag.len();
1345
1346	if text_bytes.len() < tag_len {
1347	// begin tags are increasing in size
1348	break;
1349	}
1350
1351	if !text_bytes[..tag_len].eq_ignore_ascii_case(beg_tag) {
1352	continue;
1353	}
1354
1355	// Must either be the end of the line...
1356	if text_bytes.len() == tag_len {
1357	return Some(end_tag);
1358	}
1359
1360	// ...or be followed by whitespace, newline, or '>'.
1361	let s = text_bytes[tag_len];
1362	if is_ascii_whitespace(s) \|\| s == b'>' {
1363	return Some(end_tag);
1364	}
1365	}
1366
1367	for (beg_tag, end_tag) in ST_BEGIN_TAGS.iter().zip(["-->", "?>", "]]>"].iter()) {
1368	if text_bytes.starts_with(beg_tag) {
1369	return Some(end_tag);
1370	}
1371	}
1372
1373	if text_bytes.len() > `1`
1374	&& text_bytes[`0`] == b'!'
1375	&& text_bytes[`1`] >= b'A'
1376	&& text_bytes[`1`] <= b'Z'
1377	{
1378	Some(">")
1379	} else {
1380	None
1381	}
1382	}
1383
1384	// https://english.stackexchange.com/a/285573
1385	fn surgerize_tight_list(tree: &mut Tree<Item>, list_ix: TreeIndex) {
1386	let mut list_item = tree[list_ix].child;
1387	while let Some(listitem_ix) = list_item {
1388	// first child is special, controls how we repoint list_item.child
1389	let list_item_firstborn = tree[listitem_ix].child;
1390
1391	// Check that list item has children - this is not necessarily the case!
1392	if let Some(firstborn_ix) = list_item_firstborn {
1393	if let ItemBody::Paragraph = tree[firstborn_ix].item.body {
1394	tree[listitem_ix].child = tree[firstborn_ix].child;
1395	}
1396
1397	let mut list_item_child = Some(firstborn_ix);
1398	let mut node_to_repoint = None;
1399	while let Some(child_ix) = list_item_child {
1400	// surgerize paragraphs
1401	let repoint_ix = if let ItemBody::Paragraph = tree[child_ix].item.body {
1402	if let Some(child_firstborn) = tree[child_ix].child {
1403	if let Some(repoint_ix) = node_to_repoint {
1404	tree[repoint_ix].next = Some(child_firstborn);
1405	}
1406	let mut child_lastborn = child_firstborn;
1407	while let Some(lastborn_next_ix) = tree[child_lastborn].next {
1408	child_lastborn = lastborn_next_ix;
1409	}
1410	child_lastborn
1411	} else {
1412	child_ix
1413	}
1414	} else {
1415	child_ix
1416	};
1417
1418	node_to_repoint = Some(repoint_ix);
1419	tree[repoint_ix].next = tree[child_ix].next;
1420	list_item_child = tree[child_ix].next;
1421	}
1422	}
1423
1424	list_item = tree[listitem_ix].next;
1425	}
1426	}
1427
1428	/// Determines whether the delimiter run starting at given index is
1429	/// left-flanking, as defined by the commonmark spec (and isn't intraword
1430	/// for _ delims).
1431	/// suffix is &s[ix..], which is passed in as an optimization, since taking
1432	/// a string subslice is O(n).
1433	fn delim_run_can_open(s: &str, suffix: &str, run_len: usize, ix: usize) -> bool {
1434	let next_char: char = if let Some(c: char) = suffix.chars().nth(run_len) {
1435	c
1436	} else {
1437	return `false`;
1438	};
1439	if next_char.is_whitespace() {
1440	return `false`;
1441	}
1442	if ix == `0` {
1443	return `true`;
1444	}
1445	let delim: char = suffix.chars().next().unwrap();
1446	if delim == '*' && !is_punctuation(next_char) {
1447	return `true`;
1448	}
1449
1450	let prev_char: char = s[..ix].chars().last().unwrap();
1451
1452	prev_char.is_whitespace()
1453	\|\| is_punctuation(prev_char) && (delim != '`\'`' \|\| ![']', ')'].contains(&prev_char))
1454	}
1455
1456	/// Determines whether the delimiter run starting at given index is
1457	/// left-flanking, as defined by the commonmark spec (and isn't intraword
1458	/// for _ delims)
1459	fn delim_run_can_close(s: &str, suffix: &str, run_len: usize, ix: usize) -> bool {
1460	if ix == `0` {
1461	return `false`;
1462	}
1463	let prev_char: char = s[..ix].chars().last().unwrap();
1464	if prev_char.is_whitespace() {
1465	return `false`;
1466	}
1467	let next_char: char = if let Some(c: char) = suffix.chars().nth(run_len) {
1468	c
1469	} else {
1470	return `true`;
1471	};
1472	let delim: char = suffix.chars().next().unwrap();
1473	if delim == '*' && !is_punctuation(prev_char) {
1474	return `true`;
1475	}
1476
1477	next_char.is_whitespace() \|\| is_punctuation(next_char)
1478	}
1479
1480	fn create_lut(options: &Options) -> LookupTable {
1481	#[cfg(all(target_arch = "x86_64", feature = "simd"))]
1482	{
1483	LookupTable {
1484	simd: simd::compute_lookup(options),
1485	scalar: special_bytes(options),
1486	}
1487	}
1488	#[cfg(not(all(target_arch = "x86_64", feature = "simd")))]
1489	{
1490	special_bytes(options)
1491	}
1492	}
1493
1494	fn special_bytes(options: &Options) -> [bool; `256`] {
1495	let mut bytes: [bool; 256] = [`false`; `256`];
1496	let standard_bytes: [u8; 11] = [
1497	b'`\n`', b'`\r`', b'*', b'_', b'&', b'`\\`', b'[', b']', b'<', b'!', b'`',
1498	];
1499
1500	for &byte: u8 in &standard_bytes {
1501	bytes[byte as usize] = `true`;
1502	}
1503	if options.contains(Options::ENABLE_TABLES) {
1504	bytes[b'\|' as usize] = `true`;
1505	}
1506	if options.contains(Options::ENABLE_STRIKETHROUGH) {
1507	bytes[b'~' as usize] = `true`;
1508	}
1509	if options.contains(Options::ENABLE_SMART_PUNCTUATION) {
1510	for &byte: u8 in &[b'.', b'-', b'"', b'`\'`'] {
1511	bytes[byte as usize] = `true`;
1512	}
1513	}
1514
1515	bytes
1516	}
1517
1518	enum LoopInstruction<T> {
1519	/// Continue looking for more special bytes, but skip next few bytes.
1520	ContinueAndSkip(usize),
1521	/// Break looping immediately, returning with the given index and value.
1522	BreakAtWith(usize, T),
1523	}
1524
1525	#[cfg(all(target_arch = "x86_64", feature = "simd"))]
1526	struct LookupTable {
1527	simd: [u8; `16`],
1528	scalar: [bool; `256`],
1529	}
1530
1531	#[cfg(not(all(target_arch = "x86_64", feature = "simd")))]
1532	type LookupTable = [bool; `256`];
1533
1534	/// This function walks the byte slices from the given index and
1535	/// calls the callback function on all bytes (and their indices) that are in the following set:
1536	/// `` ` ``, `\`, `&`, ``, `_`, `~`, `!`, `<`, `[`, `]`, `\|`, `\r`, `\n`*
1537	/// It is guaranteed not call the callback on other bytes.
1538	/// Whenever `callback(ix, byte)` returns a `ContinueAndSkip(n)` value, the callback
1539	/// will not be called with an index that is less than `ix + n + 1`.
1540	/// When the callback returns a `BreakAtWith(end_ix, opt+val)`, no more callbacks will be
1541	/// called and the function returns immediately with the return value `(end_ix, opt_val)`.
1542	/// If `BreakAtWith(..)` is never returned, this function will return the first
1543	/// index that is outside the byteslice bound and a `None` value.
1544	fn iterate_special_bytes<F, T>(
1545	lut: &LookupTable,
1546	bytes: &[u8],
1547	ix: usize,
1548	callback: F,
1549	) -> (usize, Option<T>)
1550	where
1551	F: FnMut(usize, u8) -> LoopInstruction<Option<T>>,
1552	{
1553	#[cfg(all(target_arch = "x86_64", feature = "simd"))]
1554	{
1555	simd::iterate_special_bytes(lut, bytes, ix, callback)
1556	}
1557	#[cfg(not(all(target_arch = "x86_64", feature = "simd")))]
1558	{
1559	scalar_iterate_special_bytes(lut, bytes, ix, callback)
1560	}
1561	}
1562
1563	fn scalar_iterate_special_bytes<F, T>(
1564	lut: &[bool; `256`],
1565	bytes: &[u8],
1566	mut ix: usize,
1567	mut callback: F,
1568	) -> (usize, Option<T>)
1569	where
1570	F: FnMut(usize, u8) -> LoopInstruction<Option<T>>,
1571	{
1572	while ix < bytes.len() {
1573	let b: u8 = bytes[ix];
1574	if lut[b as usize] {
1575	match callback(ix, b) {
1576	LoopInstruction::ContinueAndSkip(skip: usize) => {
1577	ix += skip;
1578	}
1579	LoopInstruction::BreakAtWith(ix: usize, val: Option) => {
1580	return (ix, val);
1581	}
1582	}
1583	}
1584	ix += `1`;
1585	}
1586
1587	(ix, None)
1588	}
1589
1590	/// Split the usual heading content range and the content inside the trailing attribute block.
1591	///
1592	/// Returns `(leading_content_len, Option<trailing_attr_block_range>)`.
1593	///
1594	/// Note that `trailing_attr_block_range` will be empty range when the block
1595	/// is `{}`, since the range is content inside the wrapping `{` and `}`.
1596	///
1597	/// The closing `}` of an attribute block can have trailing whitespaces.
1598	/// They are automatically trimmed when the attribute block is being searched.
1599	///
1600	/// However, this method does not trim the trailing whitespaces of heading content.
1601	/// It is callers' responsibility to trim them if necessary.
1602	fn extract_attribute_block_content_from_header_text(
1603	heading: &[u8],
1604	) -> (usize, Option<Range<usize>>) {
1605	let heading_len = heading.len();
1606	let mut ix = heading_len;
1607	ix -= scan_rev_while(heading, \|b\| {
1608	b == b'`\n`' \|\| b == b'`\r`' \|\| b == b' ' \|\| b == b'`\t`'
1609	});
1610	if ix == `0` {
1611	return (heading_len, None);
1612	}
1613
1614	let attr_block_close = ix - `1`;
1615	if heading.get(attr_block_close) != Some(&b'}') {
1616	// The last character is not `}`. No attribute blocks found.
1617	return (heading_len, None);
1618	}
1619	// move cursor before the closing right brace (`}`)
1620	ix -= `1`;
1621
1622	ix -= scan_rev_while(&heading[..ix], \|b\| {
1623	// Characters to be excluded:
1624	// `{` and `}`: special characters to open and close an attribute block.*
1625	// `\\`: a special character to escape many characters and disable some syntaxes.*
1626	// + Handling of this escape character differs among markdown processors.
1627	// + Escaped characters will be separate text node from neighbors, so
1628	// it is not easy to handle unescaped string and trim the trailing block.
1629	// `<` and `>`: special characters to start and end HTML tag.*
1630	// + No known processors converts `{#<i>foo</i>}` into
1631	// `id="<i>foo</>"` as of this writing, so hopefully
1632	// this restriction won't cause compatibility issues.
1633	// `\n` and `\r`: a newline character.*
1634	// + Setext heading can have multiple lines. However it is hard to support
1635	// attribute blocks that have newline inside, since the parsing proceeds line by
1636	// line and lines will be separate nodes even they are logically a single text.
1637	!matches!(b, b'{' \| b'}' \| b'<' \| b'>' \| b'`\\`' \| b'`\n`' \| b'`\r`')
1638	});
1639	if ix == `0` {
1640	// `{` is not found. No attribute blocks available.
1641	return (heading_len, None);
1642	}
1643	let attr_block_open = ix - `1`;
1644	if heading[attr_block_open] != b'{' {
1645	// `{` is not found. No attribute blocks available.
1646	return (heading_len, None);
1647	}
1648
1649	(attr_block_open, Some(ix..attr_block_close))
1650	}
1651
1652	/// Parses an attribute block content, such as `.class1 #id .class2`.
1653	///
1654	/// Returns `(id, classes)`.
1655	///
1656	/// It is callers' responsibility to find opening and closing characters of the attribute
1657	/// block. Usually [`extract_attribute_block_content_from_header_text`] function does it for you.
1658	///
1659	/// Note that this parsing requires explicit whitespace separators between
1660	/// attributes. This is intentional design with the reasons below:
1661	///
1662	/// to keep conversion simple and easy to understand for any possible input,*
1663	/// to avoid adding less obvious conversion rule that can reduce compatibility*
1664	/// with other implementations more, and
1665	/// to follow the major design of implementations with the support for the*
1666	/// attribute blocks extension (as of this writing).
1667	///
1668	/// See also: [`Options::ENABLE_HEADING_ATTRIBUTES`].
1669	///
1670	/// [`Options::ENABLE_HEADING_ATTRIBUTES`]: `crate::Options::ENABLE_HEADING_ATTRIBUTES`
1671	fn parse_inside_attribute_block(inside_attr_block: &str) -> Option<HeadingAttributes> {
1672	let mut id: Option<&str> = None;
1673	let mut classes: Vec<&str> = Vec::new();
1674
1675	for attr: &str in inside_attr_block.split_ascii_whitespace() {
1676	// iterator returned by `str::split_ascii_whitespace` never emits empty
1677	// strings, so taking first byte won't panic.
1678	if attr.len() > `1` {
1679	let first_byte: u8 = attr.as_bytes()[`0`];
1680	if first_byte == b'#' {
1681	id = Some(&attr[`1`..]);
1682	} else if first_byte == b'.' {
1683	classes.push(&attr[`1`..]);
1684	}
1685	}
1686	}
1687
1688	Some(HeadingAttributes { id, classes })
1689	}
1690
1691	#[cfg(all(target_arch = "x86_64", feature = "simd"))]
1692	mod simd {
1693	//! SIMD byte scanning logic.
1694	//!
1695	//! This module provides functions that allow walking through byteslices, calling
1696	//! provided callback functions on special bytes and their indices using SIMD.
1697	//! The byteset is defined in `compute_lookup`.
1698	//!
1699	//! The idea is to load in a chunk of 16 bytes and perform a lookup into a set of
1700	//! bytes on all the bytes in this chunk simultaneously. We produce a 16 bit bitmask
1701	//! from this and call the callback on every index corresponding to a 1 in this mask
1702	//! before moving on to the next chunk. This allows us to move quickly when there
1703	//! are no or few matches.
1704	//!
1705	//! The table lookup is inspired by this [great overview]. However, since all of the
1706	//! bytes we're interested in are ASCII, we don't quite need the full generality of
1707	//! the universal algorithm and are hence able to skip a few instructions.
1708	//!
1709	//! [great overview]: http://0x80.pl/articles/simd-byte-lookup.html
1710
1711	use super::{LookupTable, LoopInstruction};
1712	use crate::Options;
1713	use core::arch::x86_64::*;
1714
1715	const VECTOR_SIZE: usize = std::mem::size_of::<__m128i>();
1716
1717	/// Generates a lookup table containing the bitmaps for our
1718	/// special marker bytes. This is effectively a 128 element 2d bitvector,
1719	/// that can be indexed by a four bit row index (the lower nibble)
1720	/// and a three bit column index (upper nibble).
1721	pub(super) fn compute_lookup(options: &Options) -> [u8; `16`] {
1722	let mut lookup = [`0u8`; `16`];
1723	let standard_bytes = [
1724	b'`\n`', b'`\r`', b'*', b'_', b'&', b'`\\`', b'[', b']', b'<', b'!', b'`',
1725	];
1726
1727	for &byte in &standard_bytes {
1728	add_lookup_byte(&mut lookup, byte);
1729	}
1730	if options.contains(Options::ENABLE_TABLES) {
1731	add_lookup_byte(&mut lookup, b'\|');
1732	}
1733	if options.contains(Options::ENABLE_STRIKETHROUGH) {
1734	add_lookup_byte(&mut lookup, b'~');
1735	}
1736	if options.contains(Options::ENABLE_SMART_PUNCTUATION) {
1737	for &byte in &[b'.', b'-', b'"', b'`\'`'] {
1738	add_lookup_byte(&mut lookup, byte);
1739	}
1740	}
1741
1742	lookup
1743	}
1744
1745	fn add_lookup_byte(lookup: &mut [u8; `16`], byte: u8) {
1746	lookup[(byte & `0x0f`) as usize] \|= `1` << (byte >> `4`);
1747	}
1748
1749	/// Computes a bit mask for the given byteslice starting from the given index,
1750	/// where the 16 least significant bits indicate (by value of 1) whether or not
1751	/// there is a special character at that byte position. The least significant bit
1752	/// corresponds to `bytes[ix]` and the most significant bit corresponds to
1753	/// `bytes[ix + 15]`.
1754	/// It is only safe to call this function when `bytes.len() >= ix + VECTOR_SIZE`.
1755	#[target_feature(enable = "ssse3")]
1756	#[inline]
1757	unsafe fn compute_mask(lut: &[u8; `16`], bytes: &[u8], ix: usize) -> i32 {
1758	debug_assert!(bytes.len() >= ix + VECTOR_SIZE);
1759
1760	let bitmap = _mm_loadu_si128(lut.as_ptr() as *const __m128i);
1761	// Small lookup table to compute single bit bitshifts
1762	// for 16 bytes at once.
1763	let bitmask_lookup =
1764	_mm_setr_epi8(`1`, `2`, `4`, `8`, `16`, `32`, `64`, `-128`, `-1`, `-1`, `-1`, `-1`, `-1`, `-1`, `-1`, `-1`);
1765
1766	// Load input from memory.
1767	let raw_ptr = bytes.as_ptr().add(ix) as *const __m128i;
1768	let input = _mm_loadu_si128(raw_ptr);
1769	// Compute the bitmap using the bottom nibble as an index
1770	// into the lookup table. Note that non-ascii bytes will have
1771	// their most significant bit set and will map to lookup[0].
1772	let bitset = _mm_shuffle_epi8(bitmap, input);
1773	// Compute the high nibbles of the input using a 16-bit rightshift of four
1774	// and a mask to prevent most-significant bit issues.
1775	let higher_nibbles = _mm_and_si128(_mm_srli_epi16(input, `4`), _mm_set1_epi8(`0x0f`));
1776	// Create a bitmask for the bitmap by perform a left shift of the value
1777	// of the higher nibble. Bytes with their most significant set are mapped
1778	// to -1 (all ones).
1779	let bitmask = _mm_shuffle_epi8(bitmask_lookup, higher_nibbles);
1780	// Test the bit of the bitmap by AND'ing the bitmap and the mask together.
1781	let tmp = _mm_and_si128(bitset, bitmask);
1782	// Check whether the result was not null. NEQ is not a SIMD intrinsic,
1783	// but comparing to the bitmask is logically equivalent. This also prevents us
1784	// from matching any non-ASCII bytes since none of the bitmaps were all ones
1785	// (-1).
1786	let result = _mm_cmpeq_epi8(tmp, bitmask);
1787
1788	// Return the resulting bitmask.
1789	_mm_movemask_epi8(result)
1790	}
1791
1792	/// Calls callback on byte indices and their value.
1793	/// Breaks when callback returns LoopInstruction::BreakAtWith(ix, val). And skips the
1794	/// number of bytes in callback return value otherwise.
1795	/// Returns the final index and a possible break value.
1796	pub(super) fn iterate_special_bytes<F, T>(
1797	lut: &LookupTable,
1798	bytes: &[u8],
1799	ix: usize,
1800	callback: F,
1801	) -> (usize, Option<T>)
1802	where
1803	F: FnMut(usize, u8) -> LoopInstruction<Option<T>>,
1804	{
1805	if is_x86_feature_detected!("ssse3") && bytes.len() >= VECTOR_SIZE {
1806	unsafe { simd_iterate_special_bytes(&lut.simd, bytes, ix, callback) }
1807	} else {
1808	super::scalar_iterate_special_bytes(&lut.scalar, bytes, ix, callback)
1809	}
1810	}
1811
1812	/// Calls the callback function for every 1 in the given bitmask with
1813	/// the index `offset + ix`, where `ix` is the position of the 1 in the mask.
1814	/// Returns `Ok(ix)` to continue from index `ix`, `Err((end_ix, opt_val)` to break with
1815	/// final index `end_ix` and optional value `opt_val`.
1816	unsafe fn process_mask<F, T>(
1817	mut mask: i32,
1818	bytes: &[u8],
1819	mut offset: usize,
1820	callback: &mut F,
1821	) -> Result<usize, (usize, Option<T>)>
1822	where
1823	F: FnMut(usize, u8) -> LoopInstruction<Option<T>>,
1824	{
1825	while mask != `0` {
1826	let mask_ix = mask.trailing_zeros() as usize;
1827	offset += mask_ix;
1828	match callback(offset, *bytes.get_unchecked(offset)) {
1829	LoopInstruction::ContinueAndSkip(skip) => {
1830	offset += skip + `1`;
1831	mask >>= skip + `1` + mask_ix;
1832	}
1833	LoopInstruction::BreakAtWith(ix, val) => return Err((ix, val)),
1834	}
1835	}
1836	Ok(offset)
1837	}
1838
1839	#[target_feature(enable = "ssse3")]
1840	/// Important: only call this function when `bytes.len() >= 16`. Doing
1841	/// so otherwise may exhibit undefined behaviour.
1842	unsafe fn simd_iterate_special_bytes<F, T>(
1843	lut: &[u8; `16`],
1844	bytes: &[u8],
1845	mut ix: usize,
1846	mut callback: F,
1847	) -> (usize, Option<T>)
1848	where
1849	F: FnMut(usize, u8) -> LoopInstruction<Option<T>>,
1850	{
1851	debug_assert!(bytes.len() >= VECTOR_SIZE);
1852	let upperbound = bytes.len() - VECTOR_SIZE;
1853
1854	while ix < upperbound {
1855	let mask = compute_mask(lut, bytes, ix);
1856	let block_start = ix;
1857	ix = match process_mask(mask, bytes, ix, &mut callback) {
1858	Ok(ix) => std::cmp::max(ix, VECTOR_SIZE + block_start),
1859	Err((end_ix, val)) => return (end_ix, val),
1860	};
1861	}
1862
1863	if bytes.len() > ix {
1864	// shift off the bytes at start we have already scanned
1865	let mask = compute_mask(lut, bytes, upperbound) >> ix - upperbound;
1866	if let Err((end_ix, val)) = process_mask(mask, bytes, ix, &mut callback) {
1867	return (end_ix, val);
1868	}
1869	}
1870
1871	(bytes.len(), None)
1872	}
1873
1874	#[cfg(test)]
1875	mod simd_test {
1876	use super::super::create_lut;
1877	use super::{iterate_special_bytes, LoopInstruction};
1878	use crate::Options;
1879
1880	fn check_expected_indices(bytes: &[u8], expected: &[usize], skip: usize) {
1881	let mut opts = Options::empty();
1882	opts.insert(Options::ENABLE_TABLES);
1883	opts.insert(Options::ENABLE_FOOTNOTES);
1884	opts.insert(Options::ENABLE_STRIKETHROUGH);
1885	opts.insert(Options::ENABLE_TASKLISTS);
1886
1887	let lut = create_lut(&opts);
1888	let mut indices = vec![];
1889
1890	iterate_special_bytes::<_, i32>(&lut, bytes, `0`, \|ix, _byte_ty\| {
1891	indices.push(ix);
1892	LoopInstruction::ContinueAndSkip(skip)
1893	});
1894
1895	assert_eq!(&indices[..], expected);
1896	}
1897
1898	#[test]
1899	fn simple_no_match() {
1900	check_expected_indices("abcdef0123456789".as_bytes(), &[], `0`);
1901	}
1902
1903	#[test]
1904	fn simple_match() {
1905	check_expected_indices("*bcd&f0123456789".as_bytes(), &[`0`, `4`], `0`);
1906	}
1907
1908	#[test]
1909	fn single_open_fish() {
1910	check_expected_indices("<".as_bytes(), &[`0`], `0`);
1911	}
1912
1913	#[test]
1914	fn long_match() {
1915	check_expected_indices("0123456789abcde~*bcd&f0".as_bytes(), &[`15`, `16`, `20`], `0`);
1916	}
1917
1918	#[test]
1919	fn border_skip() {
1920	check_expected_indices("0123456789abcde~~~~d&f0".as_bytes(), &[`15`, `20`], `3`);
1921	}
1922
1923	#[test]
1924	fn exhaustive_search() {
1925	let chars = [
1926	b'`\n`', b'`\r`', b'*', b'_', b'~', b'\|', b'&', b'`\\`', b'[', b']', b'<', b'!', b'`',
1927	];
1928
1929	for &c in &chars {
1930	for i in `0u8`..=`255` {
1931	if !chars.contains(&i) {
1932	// full match
1933	let mut buf = [i; `18`];
1934	buf[`3`] = c;
1935	buf[`6`] = c;
1936
1937	check_expected_indices(&buf[..], &[`3`, `6`], `0`);
1938	}
1939	}
1940	}
1941	}
1942	}
1943	}
1944