scanners.rs source code [crates/pulldown-cmark/src/scanners.rs]

1	// Copyright 2015 Google Inc. All rights reserved.
2	//
3	// Permission is hereby granted, free of charge, to any person obtaining a copy
4	// of this software and associated documentation files (the "Software"), to deal
5	// in the Software without restriction, including without limitation the rights
6	// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7	// copies of the Software, and to permit persons to whom the Software is
8	// furnished to do so, subject to the following conditions:
9	//
10	// The above copyright notice and this permission notice shall be included in
11	// all copies or substantial portions of the Software.
12	//
13	// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14	// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15	// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16	// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17	// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18	// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19	// THE SOFTWARE.
20
21	//! Scanners for fragments of CommonMark syntax
22
23	use std::convert::TryInto;
24	use std::{char, convert::TryFrom};
25
26	use crate::parse::HtmlScanGuard;
27	pub(crate) use crate::puncttable::{is_ascii_punctuation, is_punctuation};
28	use crate::strings::CowStr;
29	use crate::{entities, HeadingLevel};
30	use crate::{Alignment, LinkType};
31
32	use memchr::memchr;
33
// Tag names looked up by `is_html_tag`.
// The entries must stay in ascending byte order because the lookup is a
// binary search; all entries are lower-case ASCII.
#[rustfmt::skip]
const HTML_TAGS: [&str; 62] = [
    "address", "article", "aside",
    "base", "basefont", "blockquote", "body",
    "caption", "center", "col", "colgroup",
    "dd", "details", "dialog", "dir", "div", "dl", "dt",
    "fieldset", "figcaption", "figure", "footer", "form", "frame", "frameset",
    "h1", "h2", "h3", "h4", "h5", "h6", "head", "header", "hr", "html",
    "iframe",
    "legend", "li", "link",
    "main", "menu", "menuitem",
    "nav", "noframes",
    "ol", "optgroup", "option",
    "p", "param",
    "section", "source", "summary",
    "table", "tbody", "td", "tfoot", "th", "thead", "title", "tr", "track",
    "ul",
];
99
/// Cursor over the beginning of a line, tracking indentation (with tab
/// expansion) and container markers.
#[derive(Clone)]
pub(crate) struct LineStart<'a> {
    /// The line (and whatever follows it) being scanned.
    bytes: &'a [u8],
    /// Byte index just past the most recently consumed tab; tab widths are
    /// measured relative to this position.
    tab_start: usize,
    /// Byte index of the next unread byte.
    ix: usize,
    /// Columns of an already consumed tab that have not been used up yet.
    spaces_remaining: usize,
    /// No thematic break can occur before this offset. Caching it prevents
    /// scanning the same prefix over and over.
    min_hrule_offset: usize,
}
112
113	impl<'a> LineStart<'a> {
114	pub(crate) fn new(bytes: &[u8]) -> LineStart {
115	LineStart {
116	bytes,
117	tab_start: `0`,
118	ix: `0`,
119	spaces_remaining: `0`,
120	min_hrule_offset: `0`,
121	}
122	}
123
124	/// Try to scan a number of spaces.
125	///
126	/// Returns true if all spaces were consumed.
127	///
128	/// Note: consumes some spaces even if not successful.
129	pub(crate) fn scan_space(&mut self, n_space: usize) -> bool {
130	self.scan_space_inner(n_space) == `0`
131	}
132
133	/// Scan a number of spaces up to a maximum.
134	///
135	/// Returns number of spaces scanned.
136	pub(crate) fn scan_space_upto(&mut self, n_space: usize) -> usize {
137	n_space - self.scan_space_inner(n_space)
138	}
139
140	/// Returns unused remainder of spaces.
141	fn scan_space_inner(&mut self, mut n_space: usize) -> usize {
142	let n_from_remaining = self.spaces_remaining.min(n_space);
143	self.spaces_remaining -= n_from_remaining;
144	n_space -= n_from_remaining;
145	while n_space > `0` && self.ix < self.bytes.len() {
146	match self.bytes[self.ix] {
147	b' ' => {
148	self.ix += `1`;
149	n_space -= `1`;
150	}
151	b'`\t`' => {
152	let spaces = `4` - (self.ix - self.tab_start) % `4`;
153	self.ix += `1`;
154	self.tab_start = self.ix;
155	let n = spaces.min(n_space);
156	n_space -= n;
157	self.spaces_remaining = spaces - n;
158	}
159	_ => break,
160	}
161	}
162	n_space
163	}
164
165	/// Scan all available ASCII whitespace (not including eol).
166	pub(crate) fn scan_all_space(&mut self) {
167	self.spaces_remaining = `0`;
168	self.ix += self.bytes[self.ix..]
169	.iter()
170	.take_while(\|&&b\| b == b' ' \|\| b == b'`\t`')
171	.count();
172	}
173
174	/// Determine whether we're at end of line (includes end of file).
175	pub(crate) fn is_at_eol(&self) -> bool {
176	self.bytes
177	.get(self.ix)
178	.map(\|&c\| c == b'`\r`' \|\| c == b'`\n`')
179	.unwrap_or(`true`)
180	}
181
182	fn scan_ch(&mut self, c: u8) -> bool {
183	if self.ix < self.bytes.len() && self.bytes[self.ix] == c {
184	self.ix += `1`;
185	`true`
186	} else {
187	`false`
188	}
189	}
190
191	pub(crate) fn scan_blockquote_marker(&mut self) -> bool {
192	let save = self.clone();
193	let _ = self.scan_space(`3`);
194	if self.scan_ch(b'>') {
195	let _ = self.scan_space(`1`);
196	`true`
197	} else {
198	*self = save;
199	`false`
200	}
201	}
202
203	/// Scan a list marker.
204	///
205	/// Return value is the character, the start index, and the indent in spaces.
206	/// For ordered list markers, the character will be one of b'.' or b')'. For
207	/// bullet list markers, it will be one of b'-', b'+', or b''.*
208	pub(crate) fn scan_list_marker(&mut self) -> Option<(u8, u64, usize)> {
209	let save = self.clone();
210	let indent = self.scan_space_upto(`4`);
211	if indent < `4` && self.ix < self.bytes.len() {
212	let c = self.bytes[self.ix];
213	if c == b'-' \|\| c == b'+' \|\| c == b'*' {
214	if self.ix >= self.min_hrule_offset {
215	// there could be an hrule here
216	if let Err(min_offset) = scan_hrule(&self.bytes[self.ix..]) {
217	self.min_hrule_offset = min_offset;
218	} else {
219	*self = save;
220	return None;
221	}
222	}
223	self.ix += `1`;
224	if self.scan_space(`1`) \|\| self.is_at_eol() {
225	return self.finish_list_marker(c, `0`, indent + `2`);
226	}
227	} else if c >= b'0' && c <= b'9' {
228	let start_ix = self.ix;
229	let mut ix = self.ix + `1`;
230	let mut val = u64::from(c - b'0');
231	while ix < self.bytes.len() && ix - start_ix < `10` {
232	let c = self.bytes[ix];
233	ix += `1`;
234	if c >= b'0' && c <= b'9' {
235	val = val * `10` + u64::from(c - b'0');
236	} else if c == b')' \|\| c == b'.' {
237	self.ix = ix;
238	if self.scan_space(`1`) \|\| self.is_at_eol() {
239	return self.finish_list_marker(c, val, indent + self.ix - start_ix);
240	} else {
241	break;
242	}
243	} else {
244	break;
245	}
246	}
247	}
248	}
249	*self = save;
250	None
251	}
252
253	fn finish_list_marker(
254	&mut self,
255	c: u8,
256	start: u64,
257	mut indent: usize,
258	) -> Option<(u8, u64, usize)> {
259	let save = self.clone();
260
261	// skip the rest of the line if it's blank
262	if scan_blank_line(&self.bytes[self.ix..]).is_some() {
263	return Some((c, start, indent));
264	}
265
266	let post_indent = self.scan_space_upto(`4`);
267	if post_indent < `4` {
268	indent += post_indent;
269	} else {
270	*self = save;
271	}
272	Some((c, start, indent))
273	}
274
275	/// Returns Some(is_checked) when a task list marker was found. Resets itself
276	/// to original state otherwise.
277	pub(crate) fn scan_task_list_marker(&mut self) -> Option<bool> {
278	let save = self.clone();
279	self.scan_space_upto(`3`);
280
281	if !self.scan_ch(b'[') {
282	*self = save;
283	return None;
284	}
285	let is_checked = match self.bytes.get(self.ix) {
286	Some(&c) if is_ascii_whitespace_no_nl(c) => {
287	self.ix += `1`;
288	`false`
289	}
290	Some(b'x') \| Some(b'X') => {
291	self.ix += `1`;
292	`true`
293	}
294	_ => {
295	*self = save;
296	return None;
297	}
298	};
299	if !self.scan_ch(b']') {
300	*self = save;
301	return None;
302	}
303	if !self
304	.bytes
305	.get(self.ix)
306	.map(\|&b\| is_ascii_whitespace_no_nl(b))
307	.unwrap_or(`false`)
308	{
309	*self = save;
310	return None;
311	}
312	Some(is_checked)
313	}
314
315	pub(crate) fn bytes_scanned(&self) -> usize {
316	self.ix
317	}
318
319	pub(crate) fn remaining_space(&self) -> usize {
320	self.spaces_remaining
321	}
322	}
323
/// True for tab, line feed, vertical tab, form feed, carriage return
/// (0x09 through 0x0d) and the space character.
pub(crate) fn is_ascii_whitespace(c: u8) -> bool {
    matches!(c, 0x09..=0x0d | b' ')
}
327
/// True for tab, vertical tab, form feed and space; false for line feed
/// and carriage return.
pub(crate) fn is_ascii_whitespace_no_nl(c: u8) -> bool {
    matches!(c, b'\t' | 0x0b | 0x0c | b' ')
}
331
/// True for ASCII letters `a`-`z` and `A`-`Z`.
fn is_ascii_alpha(c: u8) -> bool {
    c.is_ascii_alphabetic()
}
335
/// True for ASCII digits and ASCII letters.
fn is_ascii_alphanumeric(c: u8) -> bool {
    c.is_ascii_alphanumeric()
}
339
/// True for ASCII letters, ASCII digits and the dash character.
fn is_ascii_letterdigitdash(c: u8) -> bool {
    matches!(c, b'-' | b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z')
}
343
/// True for the ASCII digits `0`-`9`.
fn is_digit(c: u8) -> bool {
    c.is_ascii_digit()
}
347
/// Returns whether `c` may appear in an unquoted HTML attribute value.
///
/// Quotes, space, `=`, `<`, `>`, backtick, line feed and carriage return
/// are rejected; every other byte is accepted.
fn is_valid_unquoted_attr_value_char(c: u8) -> bool {
    match c {
        b'\'' | b'"' | b' ' | b'=' | b'>' | b'<' | b'`' | b'\n' | b'\r' => false,
        _ => true,
    }
}
354
// Scan a single character: returns 1 when `data` starts with `c`,
// otherwise 0 (including when `data` is empty).
pub(crate) fn scan_ch(data: &[u8], c: u8) -> usize {
    match data.first() {
        Some(&first) if first == c => 1,
        _ => 0,
    }
}
363
/// Returns the length of the longest prefix of `data` whose bytes all
/// satisfy `f`. `f` is not called past the first byte it rejects.
pub(crate) fn scan_while<F>(data: &[u8], mut f: F) -> usize
where
    F: FnMut(u8) -> bool,
{
    data.iter()
        .position(|&byte| !f(byte))
        .unwrap_or(data.len())
}
370
/// Returns the length of the longest suffix of `data` whose bytes all
/// satisfy `f`, testing bytes from the end towards the start.
pub(crate) fn scan_rev_while<F>(data: &[u8], mut f: F) -> usize
where
    F: FnMut(u8) -> bool,
{
    data.iter()
        .rev()
        .position(|&byte| !f(byte))
        .unwrap_or(data.len())
}
377
/// Counts how many consecutive copies of `c` start `data`.
pub(crate) fn scan_ch_repeat(data: &[u8], c: u8) -> usize {
    data.iter().take_while(|&&byte| byte == c).count()
}
381
// Counts leading tab, vertical tab, form feed and space bytes.
// Note: this scans ASCII whitespace only, for Unicode whitespace use
// a different function.
pub(crate) fn scan_whitespace_no_nl(data: &[u8]) -> usize {
    data.iter()
        .take_while(|&&byte| matches!(byte, b'\t' | 0x0b | 0x0c | b' '))
        .count()
}
387
/// Counts the leading bytes of `data` that are allowed in an unquoted
/// HTML attribute value, i.e. everything up to the first quote, space,
/// `=`, `<`, `>`, backtick, line feed or carriage return.
fn scan_attr_value_chars(data: &[u8]) -> usize {
    data.iter()
        .position(|&byte| {
            matches!(
                byte,
                b'\'' | b'"' | b' ' | b'=' | b'>' | b'<' | b'`' | b'\n' | b'\r'
            )
        })
        .unwrap_or(data.len())
}
391
/// Scans a line ending at the start of `bytes`.
///
/// Returns the number of bytes in the line ending: 0 at end of input,
/// 1 for `\n` or a lone `\r`, 2 for `\r\n`. Returns `None` when `bytes`
/// starts with any other byte.
pub(crate) fn scan_eol(bytes: &[u8]) -> Option<usize> {
    match bytes {
        [] => Some(0),
        [b'\n', ..] => Some(1),
        [b'\r', b'\n', ..] => Some(2),
        [b'\r', ..] => Some(1),
        _ => None,
    }
}
402
403	pub(crate) fn scan_blank_line(bytes: &[u8]) -> Option<usize> {
404	let i: usize = scan_whitespace_no_nl(data:bytes);
405	scan_eol(&bytes[i..]).map(\|n: usize\| i + n)
406	}
407
408	pub(crate) fn scan_nextline(bytes: &[u8]) -> usize {
409	memchr(b'`\n`', bytes).map_or(default:bytes.len(), \|x: usize\| x + `1`)
410	}
411
412	// return: end byte for closing code fence, or None
413	// if the line is not a closing code fence
414	pub(crate) fn scan_closing_code_fence(
415	bytes: &[u8],
416	fence_char: u8,
417	n_fence_char: usize,
418	) -> Option<usize> {
419	if bytes.is_empty() {
420	return Some(`0`);
421	}
422	let mut i: usize = `0`;
423	let num_fence_chars_found: usize = scan_ch_repeat(&bytes[i..], c:fence_char);
424	if num_fence_chars_found < n_fence_char {
425	return None;
426	}
427	i += num_fence_chars_found;
428	let num_trailing_spaces: usize = scan_ch_repeat(&bytes[i..], c:b' ');
429	i += num_trailing_spaces;
430	scan_eol(&bytes[i..]).map(\|_\| i)
431	}
432
// Measures leading indentation of `text`, stopping before `max` columns
// would be exceeded. Tabs advance to the next multiple of four columns.
//
// Returned pair is (offset, number of spaces). Note that `offset` is the
// index of the last byte that was fully processed before the scan
// stopped (0 when there is none), not a count of consumed bytes: the
// space that brings the column count to exactly `max` is counted in the
// columns but does not advance `offset`.
fn calc_indent(text: &[u8], max: usize) -> (usize, usize) {
    let mut columns = 0;
    let mut last_ix = 0;
    let mut ix = 0;

    while ix < text.len() {
        match text[ix] {
            b' ' => {
                columns += 1;
                if columns == max {
                    break;
                }
            }
            b'\t' => {
                let next_stop = columns + 4 - (columns & 3);
                if next_stop > max {
                    break;
                }
                columns = next_stop;
            }
            _ => break,
        }
        last_ix = ix;
        ix += 1;
    }

    (last_ix, columns)
}
460
461	/// Scan hrule opening sequence.
462	///
463	/// Returns Ok(x) when it finds an hrule, where x is the
464	/// size of line containing the hrule, including the trailing newline.
465	///
466	/// Returns Err(x) when it does not find an hrule and x is
467	/// the offset in data before no hrule can appear.
468	pub(crate) fn scan_hrule(bytes: &[u8]) -> Result<usize, usize> {
469	if bytes.len() < `3` {
470	return Err(`0`);
471	}
472	let c = bytes[`0`];
473	if !(c == b'*' \|\| c == b'-' \|\| c == b'_') {
474	return Err(`0`);
475	}
476	let mut n = `0`;
477	let mut i = `0`;
478
479	while i < bytes.len() {
480	match bytes[i] {
481	b'`\n`' \| b'`\r`' => {
482	i += scan_eol(&bytes[i..]).unwrap_or(`0`);
483	break;
484	}
485	c2 if c2 == c => {
486	n += `1`;
487	}
488	b' ' \| b'`\t`' => (),
489	_ => return Err(i),
490	}
491	i += `1`;
492	}
493	if n >= `3` {
494	Ok(i)
495	} else {
496	Err(i)
497	}
498	}
499
500	/// Scan an ATX heading opening sequence.
501	///
502	/// Returns number of bytes in prefix and level.
503	pub(crate) fn scan_atx_heading(data: &[u8]) -> Option<HeadingLevel> {
504	let level: usize = scan_ch_repeat(data, c:b'#');
505	if data.get(level).copied().map_or(default:`true`, f:is_ascii_whitespace) {
506	HeadingLevel::try_from(level).ok()
507	} else {
508	None
509	}
510	}
511
512	/// Scan a setext heading underline.
513	///
514	/// Returns number of bytes in line (including trailing newline) and level.
515	pub(crate) fn scan_setext_heading(data: &[u8]) -> Option<(usize, HeadingLevel)> {
516	let c: u8 = *data.get(index:`0`)?;
517	let level: HeadingLevel = if c == b'=' {
518	HeadingLevel::H1
519	} else if c == b'-' {
520	HeadingLevel::H2
521	} else {
522	return None;
523	};
524	let mut i: usize = `1` + scan_ch_repeat(&data[`1`..], c);
525	i += scan_blank_line(&data[i..])?;
526	Some((i, level))
527	}
528
529	// returns number of bytes in line (including trailing
530	// newline) and column alignments
531	pub(crate) fn scan_table_head(data: &[u8]) -> (usize, Vec<Alignment>) {
532	let (mut i, spaces) = calc_indent(data, `4`);
533	if spaces > `3` \|\| i == data.len() {
534	return (`0`, vec![]);
535	}
536	let mut cols = vec![];
537	let mut active_col = Alignment::None;
538	let mut start_col = `true`;
539	if data[i] == b'\|' {
540	i += `1`;
541	}
542	for c in &data[i..] {
543	if let Some(n) = scan_eol(&data[i..]) {
544	i += n;
545	break;
546	}
547	match *c {
548	b' ' => (),
549	b':' => {
550	active_col = match (start_col, active_col) {
551	(`true`, Alignment::None) => Alignment::Left,
552	(`false`, Alignment::Left) => Alignment::Center,
553	(`false`, Alignment::None) => Alignment::Right,
554	_ => active_col,
555	};
556	start_col = `false`;
557	}
558	b'-' => {
559	start_col = `false`;
560	}
561	b'\|' => {
562	start_col = `true`;
563	cols.push(active_col);
564	active_col = Alignment::None;
565	}
566	_ => {
567	cols = vec![];
568	start_col = `true`;
569	break;
570	}
571	}
572	i += `1`;
573	}
574
575	if !start_col {
576	cols.push(active_col);
577	}
578
579	(i, cols)
580	}
581
582	/// Scan code fence.
583	///
584	/// Returns number of bytes scanned and the char that is repeated to make the code fence.
585	pub(crate) fn scan_code_fence(data: &[u8]) -> Option<(usize, u8)> {
586	let c: u8 = *data.get(index:`0`)?;
587	if !(c == b'`' \|\| c == b'~') {
588	return None;
589	}
590	let i: usize = `1` + scan_ch_repeat(&data[`1`..], c);
591	if i >= `3` {
592	if c == b'`' {
593	let suffix: &[u8] = &data[i..];
594	let next_line: usize = i + scan_nextline(bytes:suffix);
595	// FIXME: make sure this is correct
596	if suffix[..(next_line - i)].iter().any(\|&b: u8\| b == b'`') {
597	return None;
598	}
599	}
600	Some((i, c))
601	} else {
602	None
603	}
604	}
605
/// Returns `Some(2)` when `data` begins with `>` followed by a space,
/// `None` otherwise.
pub(crate) fn scan_blockquote_start(data: &[u8]) -> Option<usize> {
    match data {
        [b'>', b' ', ..] => Some(2),
        _ => None,
    }
}
613
614	/// This already assumes the list item has been scanned.
615	pub(crate) fn scan_empty_list(data: &[u8]) -> bool {
616	let mut ix: usize = `0`;
617	for _ in `0`..`2` {
618	if let Some(bytes: usize) = scan_blank_line(&data[ix..]) {
619	ix += bytes;
620	} else {
621	return `false`;
622	}
623	}
624	`true`
625	}
626
627	// return number of bytes scanned, delimiter, start index, and indent
628	pub(crate) fn scan_listitem(bytes: &[u8]) -> Option<(usize, u8, usize, usize)> {
629	let mut c = *bytes.get(`0`)?;
630	let (w, start) = match c {
631	b'-' \| b'+' \| b'*' => (`1`, `0`),
632	b'0'..=b'9' => {
633	let (length, start) = parse_decimal(bytes);
634	c = *bytes.get(length)?;
635	if !(c == b'.' \|\| c == b')') {
636	return None;
637	}
638	(length + `1`, start)
639	}
640	_ => {
641	return None;
642	}
643	};
644	// TODO: replace calc_indent with scan_leading_whitespace, for tab correctness
645	let (mut postn, mut postindent) = calc_indent(&bytes[w..], `5`);
646	if postindent == `0` {
647	scan_eol(&bytes[w..])?;
648	postindent += `1`;
649	} else if postindent > `4` {
650	postn = `1`;
651	postindent = `1`;
652	}
653	if scan_blank_line(&bytes[w..]).is_some() {
654	postn = `0`;
655	postindent = `1`;
656	}
657	Some((w + postn, c, start, w + postindent))
658	}
659
// Parses the leading run of ASCII decimal digits.
// Returns (number of bytes, parsed decimal). Parsing stops at the first
// non-digit, or just before the digit that would overflow `usize`.
fn parse_decimal(bytes: &[u8]) -> (usize, usize) {
    let mut consumed = 0;
    let mut value = 0usize;
    for &byte in bytes {
        if !byte.is_ascii_digit() {
            break;
        }
        let digit = usize::from(byte - b'0');
        match value.checked_mul(10).and_then(|v| v.checked_add(digit)) {
            Some(next) => {
                value = next;
                consumed += 1;
            }
            // stop early on overflow
            None => break,
        }
    }
    (consumed, value)
}
679
// Parses the leading run of ASCII hexadecimal digits (either case).
// Returns (number of bytes, parsed hex). Parsing stops at the first
// non-hex byte, or just before the digit that would overflow `usize`.
fn parse_hex(bytes: &[u8]) -> (usize, usize) {
    let mut consumed = 0;
    let mut value = 0usize;
    for &byte in bytes {
        let digit = match byte {
            b'0'..=b'9' => usize::from(byte - b'0'),
            b'a'..=b'f' => usize::from(byte - b'a' + 10),
            b'A'..=b'F' => usize::from(byte - b'A' + 10),
            _ => break,
        };
        match value.checked_mul(16).and_then(|v| v.checked_add(digit)) {
            Some(next) => {
                value = next;
                consumed += 1;
            }
            // stop early on overflow
            None => break,
        }
    }
    (consumed, value)
}
707
/// Converts a numeric code point into a `char`.
///
/// Code point 0 is mapped to U+FFFD. Returns `None` when the value does
/// not fit in a `u32` or is not a valid Unicode scalar value.
fn char_from_codepoint(input: usize) -> Option<char> {
    let raw = u32::try_from(input).ok()?;
    let codepoint = if raw == 0 { 0xFFFD } else { raw };
    char::from_u32(codepoint)
}
715
716	// doesn't bother to check data[0] == '&'
717	pub(crate) fn scan_entity(bytes: &[u8]) -> (usize, Option<CowStr<'static>>) {
718	let mut end = `1`;
719	if scan_ch(&bytes[end..], b'#') == `1` {
720	end += `1`;
721	let (bytecount, codepoint) = if end < bytes.len() && bytes[end] \| `0x20` == b'x' {
722	end += `1`;
723	parse_hex(&bytes[end..])
724	} else {
725	parse_decimal(&bytes[end..])
726	};
727	end += bytecount;
728	return if bytecount == `0` \|\| scan_ch(&bytes[end..], b';') == `0` {
729	(`0`, None)
730	} else if let Some(c) = char_from_codepoint(codepoint) {
731	(end + `1`, Some(c.into()))
732	} else {
733	(`0`, None)
734	};
735	}
736	end += scan_while(&bytes[end..], is_ascii_alphanumeric);
737	if scan_ch(&bytes[end..], b';') == `1` {
738	if let Some(value) = entities::get_entity(&bytes[`1`..end]) {
739	return (end + `1`, Some(value.into()));
740	}
741	}
742	(`0`, None)
743	}
744
// FIXME: we can most likely re-use other scanners
//
// Scans a link reference definition title delimited by `'…'`, `"…"` or
// `(…)` at the start of `text`.
//
// returns (bytelength, title_str), where bytelength includes both
// delimiters and title_str is the raw text between them (backslash
// escapes are kept). Returns None when `text` does not start with an
// opening delimiter, the title is unterminated, or the title contains a
// blank line.
pub(crate) fn scan_refdef_title(text: &str) -> Option<(usize, &str)> {
    let mut chars = text.chars().peekable();
    let closing_delim = match chars.next()? {
        '\'' => '\'',
        '"' => '"',
        '(' => ')',
        _ => return None,
    };
    let mut bytecount = 1;

    while let Some(c) = chars.next() {
        match c {
            '\n' => {
                bytecount += 1;
                // Skip leading whitespace of the continuation line. The
                // comparison is done on `char`s: casting to `u8` would
                // truncate and misclassify characters such as U+0120,
                // whose low byte happens to be 0x20.
                while matches!(*chars.peek()?, '\t' | '\x0b' | '\x0c' | ' ') {
                    bytecount += chars.next()?.len_utf8();
                }
                if *chars.peek()? == '\n' {
                    // blank line - not allowed
                    return None;
                }
            }
            '\\' => {
                // The escaped character can never close the title.
                let next_char = chars.next()?;
                bytecount += 1 + next_char.len_utf8();
            }
            c if c == closing_delim => {
                return Some((bytecount + 1, &text[1..bytecount]));
            }
            c => {
                bytecount += c.len_utf8();
            }
        }
    }
    None
}
785
786	// note: dest returned is raw, still needs to be unescaped
787	// TODO: check that nested parens are really not allowed for refdefs
788	// TODO(performance): this func should probably its own unescaping
789	pub(crate) fn scan_link_dest(
790	data: &str,
791	start_ix: usize,
792	max_next: usize,
793	) -> Option<(usize, &str)> {
794	let bytes = &data.as_bytes()[start_ix..];
795	let mut i = scan_ch(bytes, b'<');
796
797	if i != `0` {
798	// pointy links
799	while i < bytes.len() {
800	match bytes[i] {
801	b'`\n`' \| b'`\r`' \| b'<' => return None,
802	b'>' => return Some((i + `1`, &data[(start_ix + `1`)..(start_ix + i)])),
803	b'`\\`' if i + `1` < bytes.len() && is_ascii_punctuation(bytes[i + `1`]) => {
804	i += `1`;
805	}
806	_ => {}
807	}
808	i += `1`;
809	}
810	None
811	} else {
812	// non-pointy links
813	let mut nest = `0`;
814	while i < bytes.len() {
815	match bytes[i] {
816	`0x0`..=`0x20` => {
817	break;
818	}
819	b'(' => {
820	if nest > max_next {
821	return None;
822	}
823	nest += `1`;
824	}
825	b')' => {
826	if nest == `0` {
827	break;
828	}
829	nest -= `1`;
830	}
831	b'`\\`' if i + `1` < bytes.len() && is_ascii_punctuation(bytes[i + `1`]) => {
832	i += `1`;
833	}
834	_ => {}
835	}
836	i += `1`;
837	}
838	Some((i, &data[start_ix..(start_ix + i)]))
839	}
840	}
841
/// Scans an HTML attribute name at the start of `data`.
///
/// The name starts with an ASCII letter, `_` or `:` and continues with
/// ASCII letters, digits, `_`, `.`, `:` or `-`. Returns bytes scanned, or
/// `None` when `data` is empty or starts with another byte.
fn scan_attribute_name(data: &[u8]) -> Option<usize> {
    let (&first, rest) = data.split_first()?;
    if !(first.is_ascii_alphabetic() || first == b'_' || first == b':') {
        return None;
    }
    let tail_len = rest
        .iter()
        .take_while(|&&byte| {
            byte.is_ascii_alphanumeric() || matches!(byte, b'_' | b'.' | b':' | b'-')
        })
        .count();
    Some(1 + tail_len)
}
855
856	/// Returns the index immediately following the attribute on success.
857	/// The argument `buffer_ix` refers to the index into `data` from which we
858	/// should copy into `buffer` when we find bytes to skip.
859	fn scan_attribute(
860	data: &[u8],
861	mut ix: usize,
862	newline_handler: Option<&dyn Fn(&[u8]) -> usize>,
863	buffer: &mut Vec<u8>,
864	buffer_ix: &mut usize,
865	) -> Option<usize> {
866	ix += scan_attribute_name(&data[ix..])?;
867	let n_whitespace: usize =
868	scan_whitespace_with_newline_handler(data, i:ix, newline_handler, buffer, buffer_ix)? - ix;
869	ix += n_whitespace;
870	if scan_ch(&data[ix..], c:b'=') == `1` {
871	ix += `1`;
872	ix = scan_whitespace_with_newline_handler(data, i:ix, newline_handler, buffer, buffer_ix)?;
873	ix = scan_attribute_value(data, i:ix, newline_handler, buffer, buffer_ix)?;
874	} else if n_whitespace > `0` {
875	// Leave whitespace for next attribute.
876	ix -= `1`;
877	}
878	Some(ix)
879	}
880
881	/// Scans whitespace and possibly newlines according to the
882	/// behavior defined by the newline handler. When bytes are skipped,
883	/// all preceding non-skipped bytes are pushed to the buffer.
884	fn scan_whitespace_with_newline_handler(
885	data: &[u8],
886	mut i: usize,
887	newline_handler: Option<&dyn Fn(&[u8]) -> usize>,
888	buffer: &mut Vec<u8>,
889	buffer_ix: &mut usize,
890	) -> Option<usize> {
891	while i < data.len() {
892	if !is_ascii_whitespace(data[i]) {
893	return Some(i);
894	}
895	if let Some(eol_bytes: usize) = scan_eol(&data[i..]) {
896	let handler: &dyn Fn(&[u8]) -> usize = newline_handler?;
897	i += eol_bytes;
898	let skipped_bytes: usize = handler(&data[i..]);
899
900	if skipped_bytes > `0` {
901	buffer.extend(&data[*buffer_ix..i]);
902	*buffer_ix = i + skipped_bytes;
903	}
904
905	i += skipped_bytes;
906	} else {
907	i += `1`;
908	}
909	}
910
911	Some(i)
912	}
913
914	/// Returns the index immediately following the attribute value on success.
915	fn scan_attribute_value(
916	data: &[u8],
917	mut i: usize,
918	newline_handler: Option<&dyn Fn(&[u8]) -> usize>,
919	buffer: &mut Vec<u8>,
920	buffer_ix: &mut usize,
921	) -> Option<usize> {
922	match *data.get(i)? {
923	b @ b'"' \| b @ b'`\'`' => {
924	i += `1`;
925	while i < data.len() {
926	if data[i] == b {
927	return Some(i + `1`);
928	}
929	if let Some(eol_bytes) = scan_eol(&data[i..]) {
930	let handler = newline_handler?;
931	i += eol_bytes;
932	let skipped_bytes = handler(&data[i..]);
933
934	if skipped_bytes > `0` {
935	buffer.extend(&data[*buffer_ix..i]);
936	*buffer_ix = i + skipped_bytes;
937	}
938	i += skipped_bytes;
939	} else {
940	i += `1`;
941	}
942	}
943	return None;
944	}
945	b' ' \| b'=' \| b'>' \| b'<' \| b'`' \| b'`\n`' \| b'`\r`' => {
946	return None;
947	}
948	_ => {
949	// unquoted attribute value
950	i += scan_attr_value_chars(&data[i..]);
951	}
952	}
953
954	Some(i)
955	}
956
957	// Remove backslash escapes and resolve entities
958	pub(crate) fn unescape(input: &str) -> CowStr<'_> {
959	let mut result = String::new();
960	let mut mark = `0`;
961	let mut i = `0`;
962	let bytes = input.as_bytes();
963	while i < bytes.len() {
964	match bytes[i] {
965	b'`\\`' if i + `1` < bytes.len() && is_ascii_punctuation(bytes[i + `1`]) => {
966	result.push_str(&input[mark..i]);
967	mark = i + `1`;
968	i += `2`;
969	}
970	b'&' => match scan_entity(&bytes[i..]) {
971	(n, Some(value)) => {
972	result.push_str(&input[mark..i]);
973	result.push_str(&value);
974	i += n;
975	mark = i;
976	}
977	_ => i += `1`,
978	},
979	b'`\r`' => {
980	result.push_str(&input[mark..i]);
981	i += `1`;
982	mark = i;
983	}
984	_ => i += `1`,
985	}
986	}
987	if mark == `0` {
988	input.into()
989	} else {
990	result.push_str(&input[mark..]);
991	result.into()
992	}
993	}
994
995	/// Assumes `data` is preceded by `<`.
996	pub(crate) fn starts_html_block_type_6(data: &[u8]) -> bool {
997	let i: usize = scan_ch(data, c:b'/');
998	let tail: &[u8] = &data[i..];
999	let n: usize = scan_while(data:tail, f:is_ascii_alphanumeric);
1000	if !is_html_tag(&tail[..n]) {
1001	return `false`;
1002	}
1003	// Starting condition says the next byte must be either a space, a tab,
1004	// the end of the line, the string >, or the string />
1005	let tail: &[u8] = &tail[n..];
1006	tail.is_empty()
1007	\|\| tail[`0`] == b' '
1008	\|\| tail[`0`] == b'`\t`'
1009	\|\| tail[`0`] == b'`\r`'
1010	\|\| tail[`0`] == b'`\n`'
1011	\|\| tail[`0`] == b'>'
1012	\|\| tail.len() >= `2` && &tail[..`2`] == b"/>"
1013	}
1014
1015	fn is_html_tag(tag: &[u8]) -> bool {
1016	HTML_TAGS
1017	.binary_search_by(\|probe: &&str\| {
1018	let probe_bytes_iter: Iter<'_, u8> = probe.as_bytes().iter();
1019	let tag_bytes_iter: Iter<'_, u8> = tag.iter();
1020
1021	probe_bytes_iterOption
1022	.zip(tag_bytes_iter)
1023	.find_map(\|(&a: u8, &b: u8)\| {
1024	// We can compare case insensitively because the probes are
1025	// all lower case alpha strings.
1026	match a.cmp(&(b \| `0x20`)) {
1027	std::cmp::Ordering::Equal => None,
1028	inequality: Ordering => Some(inequality),
1029	}
1030	})
1031	.unwrap_or_else(\|\| probe.len().cmp(&tag.len()))
1032	})
1033	.is_ok()
1034	}
1035
1036	/// Assumes that `data` starts with `<`.
1037	/// Returns the index into data directly after the html tag on success.
1038	pub(crate) fn scan_html_type_7(data: &[u8]) -> Option<usize> {
1039	// Block type html does not allow for newlines, so we
1040	// do not pass a newline handler.
1041	let (_span: Vec, i: usize) = scan_html_block_inner(data, newline_handler:None)?;
1042	scan_blank_line(&data[i..])?;
1043	Some(i)
1044	}
1045
1046	/// Assumes that `data` starts with `<`.
1047	/// Returns the number of bytes scanned and the html in case of
1048	/// success.
1049	/// When some bytes were skipped, because the html was split over
1050	/// multiple leafs (e.g. over multiple lines in a blockquote),
1051	/// the html is returned as a vector of bytes.
1052	/// If no bytes were skipped, the buffer will be empty.
1053	pub(crate) fn scan_html_block_inner(
1054	data: &[u8],
1055	newline_handler: Option<&dyn Fn(&[u8]) -> usize>,
1056	) -> Option<(Vec<u8>, usize)> {
1057	let mut buffer = Vec::new();
1058	let mut last_buf_index = `0`;
1059
1060	let close_tag_bytes = scan_ch(&data[`1`..], b'/');
1061	let l = scan_while(&data[(`1` + close_tag_bytes)..], is_ascii_alpha);
1062	if l == `0` {
1063	return None;
1064	}
1065	let mut i = `1` + close_tag_bytes + l;
1066	i += scan_while(&data[i..], is_ascii_letterdigitdash);
1067
1068	if close_tag_bytes == `0` {
1069	loop {
1070	let old_i = i;
1071	loop {
1072	i += scan_whitespace_no_nl(&data[i..]);
1073	if let Some(eol_bytes) = scan_eol(&data[i..]) {
1074	if eol_bytes == `0` {
1075	return None;
1076	}
1077	let handler = newline_handler?;
1078	i += eol_bytes;
1079	let skipped_bytes = handler(&data[i..]);
1080
1081	let data_len = data.len() - i;
1082
1083	debug_assert!(
1084	skipped_bytes <= data_len,
1085	"Handler tried to skip too many bytes, fed {}, skipped {}",
1086	data_len,
1087	skipped_bytes
1088	);
1089
1090	if skipped_bytes > `0` {
1091	buffer.extend(&data[last_buf_index..i]);
1092	i += skipped_bytes;
1093	last_buf_index = i;
1094	}
1095	} else {
1096	break;
1097	}
1098	}
1099	if let Some(b'/') \| Some(b'>') = data.get(i) {
1100	break;
1101	}
1102	if old_i == i {
1103	// No whitespace, which is mandatory.
1104	return None;
1105	}
1106	i = scan_attribute(data, i, newline_handler, &mut buffer, &mut last_buf_index)?;
1107	}
1108	}
1109
1110	i += scan_whitespace_no_nl(&data[i..]);
1111
1112	if close_tag_bytes == `0` {
1113	i += scan_ch(&data[i..], b'/');
1114	}
1115
1116	if scan_ch(&data[i..], b'>') == `0` {
1117	None
1118	} else {
1119	i += `1`;
1120	if !buffer.is_empty() {
1121	buffer.extend(&data[last_buf_index..i]);
1122	}
1123	Some((buffer, i))
1124	}
1125	}
1126
1127	/// Returns (next_byte_offset, uri, type)
1128	pub(crate) fn scan_autolink(text: &str, start_ix: usize) -> Option<(usize, CowStr<'_>, LinkType)> {
1129	scan_uriOption<(usize, CowStr<'_>, …)>(text, start_ix)
1130	.map(\|(bytes: usize, uri: CowStr<'_>)\| (bytes, uri, LinkType::Autolink))
1131	.or_else(\|\| scan_email(text, start_ix).map(\|(bytes: usize, uri: CowStr<'_>)\| (bytes, uri, LinkType::Email)))
1132	}
1133
1134	/// Returns (next_byte_offset, uri)
1135	fn scan_uri(text: &str, start_ix: usize) -> Option<(usize, CowStr<'_>)> {
1136	let bytes = &text.as_bytes()[start_ix..];
1137
1138	// scheme's first byte must be an ascii letter
1139	if bytes.is_empty() \|\| !is_ascii_alpha(bytes[`0`]) {
1140	return None;
1141	}
1142
1143	let mut i = `1`;
1144
1145	while i < bytes.len() {
1146	let c = bytes[i];
1147	i += `1`;
1148	match c {
1149	c if is_ascii_alphanumeric(c) => (),
1150	b'.' \| b'-' \| b'+' => (),
1151	b':' => break,
1152	_ => return None,
1153	}
1154	}
1155
1156	// scheme length must be between 2 and 32 characters long. scheme
1157	// must be followed by colon
1158	if i < `3` \|\| i > `33` {
1159	return None;
1160	}
1161
1162	while i < bytes.len() {
1163	match bytes[i] {
1164	b'>' => return Some((start_ix + i + `1`, text[start_ix..(start_ix + i)].into())),
1165	b'`\0`'..=b' ' \| b'<' => return None,
1166	_ => (),
1167	}
1168	i += `1`;
1169	}
1170
1171	None
1172	}
1173
1174	/// Returns (next_byte_offset, email)
1175	fn scan_email(text: &str, start_ix: usize) -> Option<(usize, CowStr<'_>)> {
1176	// using a regex library would be convenient, but doing it by hand is not too bad
1177	let bytes = &text.as_bytes()[start_ix..];
1178	let mut i = `0`;
1179
1180	while i < bytes.len() {
1181	let c = bytes[i];
1182	i += `1`;
1183	match c {
1184	c if is_ascii_alphanumeric(c) => (),
1185	b'.' \| b'!' \| b'#' \| b'$' \| b'%' \| b'&' \| b'`\'`' \| b'*' \| b'+' \| b'/' \| b'=' \| b'?'
1186	\| b'^' \| b'_' \| b'`' \| b'{' \| b'\|' \| b'}' \| b'~' \| b'-' => (),
1187	b'@' => break,
1188	_ => return None,
1189	}
1190	}
1191
1192	loop {
1193	let label_start_ix = i;
1194	let mut fresh_label = `true`;
1195
1196	while i < bytes.len() {
1197	match bytes[i] {
1198	c if is_ascii_alphanumeric(c) => (),
1199	b'-' if fresh_label => {
1200	return None;
1201	}
1202	b'-' => (),
1203	_ => break,
1204	}
1205	fresh_label = `false`;
1206	i += `1`;
1207	}
1208
1209	if i == label_start_ix \|\| i - label_start_ix > `63` \|\| bytes[i - `1`] == b'-' {
1210	return None;
1211	}
1212
1213	if scan_ch(&bytes[i..], b'.') == `0` {
1214	break;
1215	}
1216	i += `1`;
1217	}
1218
1219	if scan_ch(&bytes[i..], b'>') == `0` {
1220	return None;
1221	}
1222
1223	Some((start_ix + i + `1`, text[start_ix..(start_ix + i)].into()))
1224	}
1225
1226	/// Scan comment, declaration, or CDATA section, with initial "<!" already consumed.
1227	/// Returns byte offset on match.
1228	pub(crate) fn scan_inline_html_comment(
1229	bytes: &[u8],
1230	mut ix: usize,
1231	scan_guard: &mut HtmlScanGuard,
1232	) -> Option<usize> {
1233	let c = *bytes.get(ix)?;
1234	ix += `1`;
1235	match c {
1236	b'-' => {
1237	let dashes = scan_ch_repeat(&bytes[ix..], b'-');
1238	if dashes < `1` {
1239	return None;
1240	}
1241	// Saw "<!--", scan comment.
1242	ix += dashes;
1243	if scan_ch(&bytes[ix..], b'>') == `1` {
1244	return None;
1245	}
1246
1247	while let Some(x) = memchr(b'-', &bytes[ix..]) {
1248	ix += x + `1`;
1249	if scan_ch(&bytes[ix..], b'-') == `1` {
1250	ix += `1`;
1251	return if scan_ch(&bytes[ix..], b'>') == `1` {
1252	Some(ix + `1`)
1253	} else {
1254	None
1255	};
1256	}
1257	}
1258	None
1259	}
1260	b'[' if bytes[ix..].starts_with(b"CDATA[") && ix > scan_guard.cdata => {
1261	ix += b"CDATA[".len();
1262	ix = memchr(b']', &bytes[ix..]).map_or(bytes.len(), \|x\| ix + x);
1263	let close_brackets = scan_ch_repeat(&bytes[ix..], b']');
1264	ix += close_brackets;
1265
1266	if close_brackets == `0` \|\| scan_ch(&bytes[ix..], b'>') == `0` {
1267	scan_guard.cdata = ix;
1268	None
1269	} else {
1270	Some(ix + `1`)
1271	}
1272	}
1273	b'A'..=b'Z' if ix > scan_guard.declaration => {
1274	// Scan declaration.
1275	ix += scan_while(&bytes[ix..], \|c\| c >= b'A' && c <= b'Z');
1276	let whitespace = scan_while(&bytes[ix..], is_ascii_whitespace);
1277	if whitespace == `0` {
1278	return None;
1279	}
1280	ix += whitespace;
1281	ix = memchr(b'>', &bytes[ix..]).map_or(bytes.len(), \|x\| ix + x);
1282	if scan_ch(&bytes[ix..], b'>') == `0` {
1283	scan_guard.declaration = ix;
1284	None
1285	} else {
1286	Some(ix + `1`)
1287	}
1288	}
1289	_ => None,
1290	}
1291	}
1292
1293	/// Scan processing directive, with initial "<?" already consumed.
1294	/// Returns the next byte offset on success.
1295	pub(crate) fn scan_inline_html_processing(
1296	bytes: &[u8],
1297	mut ix: usize,
1298	scan_guard: &mut HtmlScanGuard,
1299	) -> Option<usize> {
1300	if ix <= scan_guard.processing {
1301	return None;
1302	}
1303	while let Some(offset: usize) = memchr(needle:b'?', &bytes[ix..]) {
1304	ix += offset + `1`;
1305	if scan_ch(&bytes[ix..], c:b'>') == `1` {
1306	return Some(ix + `1`);
1307	}
1308	}
1309	scan_guard.processing = ix;
1310	None
1311	}
1312
#[cfg(test)]
mod test {
    use super::*;

    /// A digit run far too long to be a number, followed by a byte that is
    /// not a list delimiter, is not a list item.
    #[test]
    fn overflow_list() {
        let input = b"4444444444444444444444444444444444444444444444444444444444!";
        assert!(scan_listitem(input).is_none());
    }

    /// Same as above for a number that only overflows when the last digit
    /// is added rather than on the multiplication by ten.
    #[test]
    fn overflow_by_addition() {
        let input = b"1844674407370955161615!";
        assert!(scan_listitem(input).is_none());
    }
}
1328