1// Copyright 2015 Google Inc. All rights reserved.
2//
3// Permission is hereby granted, free of charge, to any person obtaining a copy
4// of this software and associated documentation files (the "Software"), to deal
5// in the Software without restriction, including without limitation the rights
6// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7// copies of the Software, and to permit persons to whom the Software is
8// furnished to do so, subject to the following conditions:
9//
10// The above copyright notice and this permission notice shall be included in
11// all copies or substantial portions of the Software.
12//
13// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19// THE SOFTWARE.
20
21//! Scanners for fragments of CommonMark syntax
22
23use std::convert::TryInto;
24use std::{char, convert::TryFrom};
25
26use crate::parse::HtmlScanGuard;
27pub(crate) use crate::puncttable::{is_ascii_punctuation, is_punctuation};
28use crate::strings::CowStr;
29use crate::{entities, HeadingLevel};
30use crate::{Alignment, LinkType};
31
32use memchr::memchr;
33
/// Lower-case HTML element names looked up by `is_html_tag`.
///
/// Keep the entries in ascending byte order: the lookup is a binary search.
const HTML_TAGS: [&str; 62] = [
    // a - c
    "address", "article", "aside",
    "base", "basefont", "blockquote", "body",
    "caption", "center", "col", "colgroup",
    // d - f
    "dd", "details", "dialog", "dir", "div", "dl", "dt",
    "fieldset", "figcaption", "figure", "footer", "form", "frame", "frameset",
    // h - i
    "h1", "h2", "h3", "h4", "h5", "h6", "head", "header", "hr", "html",
    "iframe",
    // l - p
    "legend", "li", "link",
    "main", "menu", "menuitem",
    "nav", "noframes",
    "ol", "optgroup", "option",
    "p", "param",
    // s - u
    "section", "source", "summary",
    "table", "tbody", "td", "tfoot", "th", "thead", "title", "tr", "track",
    "ul",
];
99
/// Cursor over the beginning of a line, used to consume indentation and
/// container markers.
#[derive(Clone)]
pub(crate) struct LineStart<'a> {
    /// The bytes being scanned.
    bytes: &'a [u8],
    /// Byte offset from which the width of the next tab is measured; it is
    /// moved to just past each tab that gets consumed.
    tab_start: usize,
    /// Offset of the next unread byte in `bytes`.
    ix: usize,
    /// Columns of an already-consumed tab that have not been used up yet.
    spaces_remaining: usize,
    /// No thematic break can occur before this offset; remembering it avoids
    /// rescanning the same prefix over and over.
    min_hrule_offset: usize,
}
112
113impl<'a> LineStart<'a> {
114 pub(crate) fn new(bytes: &[u8]) -> LineStart {
115 LineStart {
116 bytes,
117 tab_start: 0,
118 ix: 0,
119 spaces_remaining: 0,
120 min_hrule_offset: 0,
121 }
122 }
123
124 /// Try to scan a number of spaces.
125 ///
126 /// Returns true if all spaces were consumed.
127 ///
128 /// Note: consumes some spaces even if not successful.
129 pub(crate) fn scan_space(&mut self, n_space: usize) -> bool {
130 self.scan_space_inner(n_space) == 0
131 }
132
133 /// Scan a number of spaces up to a maximum.
134 ///
135 /// Returns number of spaces scanned.
136 pub(crate) fn scan_space_upto(&mut self, n_space: usize) -> usize {
137 n_space - self.scan_space_inner(n_space)
138 }
139
140 /// Returns unused remainder of spaces.
141 fn scan_space_inner(&mut self, mut n_space: usize) -> usize {
142 let n_from_remaining = self.spaces_remaining.min(n_space);
143 self.spaces_remaining -= n_from_remaining;
144 n_space -= n_from_remaining;
145 while n_space > 0 && self.ix < self.bytes.len() {
146 match self.bytes[self.ix] {
147 b' ' => {
148 self.ix += 1;
149 n_space -= 1;
150 }
151 b'\t' => {
152 let spaces = 4 - (self.ix - self.tab_start) % 4;
153 self.ix += 1;
154 self.tab_start = self.ix;
155 let n = spaces.min(n_space);
156 n_space -= n;
157 self.spaces_remaining = spaces - n;
158 }
159 _ => break,
160 }
161 }
162 n_space
163 }
164
165 /// Scan all available ASCII whitespace (not including eol).
166 pub(crate) fn scan_all_space(&mut self) {
167 self.spaces_remaining = 0;
168 self.ix += self.bytes[self.ix..]
169 .iter()
170 .take_while(|&&b| b == b' ' || b == b'\t')
171 .count();
172 }
173
174 /// Determine whether we're at end of line (includes end of file).
175 pub(crate) fn is_at_eol(&self) -> bool {
176 self.bytes
177 .get(self.ix)
178 .map(|&c| c == b'\r' || c == b'\n')
179 .unwrap_or(true)
180 }
181
182 fn scan_ch(&mut self, c: u8) -> bool {
183 if self.ix < self.bytes.len() && self.bytes[self.ix] == c {
184 self.ix += 1;
185 true
186 } else {
187 false
188 }
189 }
190
191 pub(crate) fn scan_blockquote_marker(&mut self) -> bool {
192 let save = self.clone();
193 let _ = self.scan_space(3);
194 if self.scan_ch(b'>') {
195 let _ = self.scan_space(1);
196 true
197 } else {
198 *self = save;
199 false
200 }
201 }
202
203 /// Scan a list marker.
204 ///
205 /// Return value is the character, the start index, and the indent in spaces.
206 /// For ordered list markers, the character will be one of b'.' or b')'. For
207 /// bullet list markers, it will be one of b'-', b'+', or b'*'.
208 pub(crate) fn scan_list_marker(&mut self) -> Option<(u8, u64, usize)> {
209 let save = self.clone();
210 let indent = self.scan_space_upto(4);
211 if indent < 4 && self.ix < self.bytes.len() {
212 let c = self.bytes[self.ix];
213 if c == b'-' || c == b'+' || c == b'*' {
214 if self.ix >= self.min_hrule_offset {
215 // there could be an hrule here
216 if let Err(min_offset) = scan_hrule(&self.bytes[self.ix..]) {
217 self.min_hrule_offset = min_offset;
218 } else {
219 *self = save;
220 return None;
221 }
222 }
223 self.ix += 1;
224 if self.scan_space(1) || self.is_at_eol() {
225 return self.finish_list_marker(c, 0, indent + 2);
226 }
227 } else if c >= b'0' && c <= b'9' {
228 let start_ix = self.ix;
229 let mut ix = self.ix + 1;
230 let mut val = u64::from(c - b'0');
231 while ix < self.bytes.len() && ix - start_ix < 10 {
232 let c = self.bytes[ix];
233 ix += 1;
234 if c >= b'0' && c <= b'9' {
235 val = val * 10 + u64::from(c - b'0');
236 } else if c == b')' || c == b'.' {
237 self.ix = ix;
238 if self.scan_space(1) || self.is_at_eol() {
239 return self.finish_list_marker(c, val, indent + self.ix - start_ix);
240 } else {
241 break;
242 }
243 } else {
244 break;
245 }
246 }
247 }
248 }
249 *self = save;
250 None
251 }
252
253 fn finish_list_marker(
254 &mut self,
255 c: u8,
256 start: u64,
257 mut indent: usize,
258 ) -> Option<(u8, u64, usize)> {
259 let save = self.clone();
260
261 // skip the rest of the line if it's blank
262 if scan_blank_line(&self.bytes[self.ix..]).is_some() {
263 return Some((c, start, indent));
264 }
265
266 let post_indent = self.scan_space_upto(4);
267 if post_indent < 4 {
268 indent += post_indent;
269 } else {
270 *self = save;
271 }
272 Some((c, start, indent))
273 }
274
275 /// Returns Some(is_checked) when a task list marker was found. Resets itself
276 /// to original state otherwise.
277 pub(crate) fn scan_task_list_marker(&mut self) -> Option<bool> {
278 let save = self.clone();
279 self.scan_space_upto(3);
280
281 if !self.scan_ch(b'[') {
282 *self = save;
283 return None;
284 }
285 let is_checked = match self.bytes.get(self.ix) {
286 Some(&c) if is_ascii_whitespace_no_nl(c) => {
287 self.ix += 1;
288 false
289 }
290 Some(b'x') | Some(b'X') => {
291 self.ix += 1;
292 true
293 }
294 _ => {
295 *self = save;
296 return None;
297 }
298 };
299 if !self.scan_ch(b']') {
300 *self = save;
301 return None;
302 }
303 if !self
304 .bytes
305 .get(self.ix)
306 .map(|&b| is_ascii_whitespace_no_nl(b))
307 .unwrap_or(false)
308 {
309 *self = save;
310 return None;
311 }
312 Some(is_checked)
313 }
314
315 pub(crate) fn bytes_scanned(&self) -> usize {
316 self.ix
317 }
318
319 pub(crate) fn remaining_space(&self) -> usize {
320 self.spaces_remaining
321 }
322}
323
/// True for the bytes 0x09 through 0x0d (tab, LF, VT, FF, CR) and for space.
pub(crate) fn is_ascii_whitespace(c: u8) -> bool {
    matches!(c, 0x09..=0x0d | b' ')
}
327
/// True for tab, vertical tab (0x0b), form feed (0x0c) and space; false for
/// line feed and carriage return.
pub(crate) fn is_ascii_whitespace_no_nl(c: u8) -> bool {
    matches!(c, b'\t' | 0x0b | 0x0c | b' ')
}
331
/// True for the ASCII letters `a`-`z` and `A`-`Z`.
fn is_ascii_alpha(c: u8) -> bool {
    c.is_ascii_alphabetic()
}
335
/// True for ASCII digits and ASCII letters.
fn is_ascii_alphanumeric(c: u8) -> bool {
    c.is_ascii_alphanumeric()
}
339
/// True for `-`, ASCII digits and ASCII letters.
fn is_ascii_letterdigitdash(c: u8) -> bool {
    matches!(c, b'-' | b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z')
}
343
/// True for the ASCII digits `0`-`9`.
fn is_digit(c: u8) -> bool {
    c.is_ascii_digit()
}
347
/// Reports whether `c` may appear in an unquoted attribute value: anything
/// except quotes, space, `=`, `<`, `>`, backtick, line feed and carriage
/// return.
fn is_valid_unquoted_attr_value_char(c: u8) -> bool {
    match c {
        b'\'' | b'"' | b' ' | b'=' | b'>' | b'<' | b'`' | b'\n' | b'\r' => false,
        _ => true,
    }
}
354
// Scan a single character: returns 1 when `data` starts with `c`, otherwise
// (including for empty input) 0.
pub(crate) fn scan_ch(data: &[u8], c: u8) -> usize {
    usize::from(data.first() == Some(&c))
}
363
/// Returns the number of leading bytes of `data` for which `f` returns true.
///
/// `f` is called on each byte in order and scanning stops at the first byte
/// it rejects.
pub(crate) fn scan_while<F>(data: &[u8], mut f: F) -> usize
where
    F: FnMut(u8) -> bool,
{
    data.iter().take_while(|&&c| f(c)).count()
}
370
/// Returns the number of trailing bytes of `data` for which `f` returns true.
///
/// `f` is called on each byte from the end backwards and scanning stops at
/// the first byte it rejects.
pub(crate) fn scan_rev_while<F>(data: &[u8], mut f: F) -> usize
where
    F: FnMut(u8) -> bool,
{
    data.iter().rev().take_while(|&&c| f(c)).count()
}
377
378pub(crate) fn scan_ch_repeat(data: &[u8], c: u8) -> usize {
379 scan_while(data, |x: u8| x == c)
380}
381
382// Note: this scans ASCII whitespace only, for Unicode whitespace use
383// a different function.
384pub(crate) fn scan_whitespace_no_nl(data: &[u8]) -> usize {
385 scan_while(data, f:is_ascii_whitespace_no_nl)
386}
387
388fn scan_attr_value_chars(data: &[u8]) -> usize {
389 scan_while(data, f:is_valid_unquoted_attr_value_char)
390}
391
/// Scans a line ending at the start of `bytes`.
///
/// Returns the length of the line ending: `Some(0)` for empty input (end of
/// file counts as end of line), `Some(1)` for `\n` or a lone `\r`, `Some(2)`
/// for `\r\n`, and `None` when `bytes` starts with any other byte.
pub(crate) fn scan_eol(bytes: &[u8]) -> Option<usize> {
    match bytes {
        [] => Some(0),
        [b'\n', ..] => Some(1),
        [b'\r', b'\n', ..] => Some(2),
        [b'\r', ..] => Some(1),
        _ => None,
    }
}
402
403pub(crate) fn scan_blank_line(bytes: &[u8]) -> Option<usize> {
404 let i: usize = scan_whitespace_no_nl(data:bytes);
405 scan_eol(&bytes[i..]).map(|n: usize| i + n)
406}
407
408pub(crate) fn scan_nextline(bytes: &[u8]) -> usize {
409 memchr(b'\n', bytes).map_or(default:bytes.len(), |x: usize| x + 1)
410}
411
412// return: end byte for closing code fence, or None
413// if the line is not a closing code fence
414pub(crate) fn scan_closing_code_fence(
415 bytes: &[u8],
416 fence_char: u8,
417 n_fence_char: usize,
418) -> Option<usize> {
419 if bytes.is_empty() {
420 return Some(0);
421 }
422 let mut i: usize = 0;
423 let num_fence_chars_found: usize = scan_ch_repeat(&bytes[i..], c:fence_char);
424 if num_fence_chars_found < n_fence_char {
425 return None;
426 }
427 i += num_fence_chars_found;
428 let num_trailing_spaces: usize = scan_ch_repeat(&bytes[i..], c:b' ');
429 i += num_trailing_spaces;
430 scan_eol(&bytes[i..]).map(|_| i)
431}
432
// Measures leading indentation in `text`, giving up once `max` columns are
// reached.
//
// returned pair is (byte offset, number of spaces)
//
// NOTE(review): the first component is the index of the last whitespace byte
// that was accepted before scanning stopped (0 when there was none), not a
// count of bytes, and the space that brings the total to `max` is counted in
// the second component without advancing the first. Callers may depend on
// this; confirm before changing it.
fn calc_indent(text: &[u8], max: usize) -> (usize, usize) {
    let mut columns = 0;
    let mut last_accepted = 0;
    let mut i = 0;

    while i < text.len() {
        let b = text[i];
        if b == b' ' {
            columns += 1;
            if columns == max {
                break;
            }
        } else if b == b'\t' {
            // A tab advances to the next multiple of four columns.
            let next_stop = columns + 4 - (columns & 3);
            if next_stop > max {
                break;
            }
            columns = next_stop;
        } else {
            break;
        }
        last_accepted = i;
        i += 1;
    }

    (last_accepted, columns)
}
460
461/// Scan hrule opening sequence.
462///
463/// Returns Ok(x) when it finds an hrule, where x is the
464/// size of line containing the hrule, including the trailing newline.
465///
466/// Returns Err(x) when it does not find an hrule and x is
467/// the offset in data before no hrule can appear.
468pub(crate) fn scan_hrule(bytes: &[u8]) -> Result<usize, usize> {
469 if bytes.len() < 3 {
470 return Err(0);
471 }
472 let c = bytes[0];
473 if !(c == b'*' || c == b'-' || c == b'_') {
474 return Err(0);
475 }
476 let mut n = 0;
477 let mut i = 0;
478
479 while i < bytes.len() {
480 match bytes[i] {
481 b'\n' | b'\r' => {
482 i += scan_eol(&bytes[i..]).unwrap_or(0);
483 break;
484 }
485 c2 if c2 == c => {
486 n += 1;
487 }
488 b' ' | b'\t' => (),
489 _ => return Err(i),
490 }
491 i += 1;
492 }
493 if n >= 3 {
494 Ok(i)
495 } else {
496 Err(i)
497 }
498}
499
500/// Scan an ATX heading opening sequence.
501///
502/// Returns number of bytes in prefix and level.
503pub(crate) fn scan_atx_heading(data: &[u8]) -> Option<HeadingLevel> {
504 let level: usize = scan_ch_repeat(data, c:b'#');
505 if data.get(level).copied().map_or(default:true, f:is_ascii_whitespace) {
506 HeadingLevel::try_from(level).ok()
507 } else {
508 None
509 }
510}
511
512/// Scan a setext heading underline.
513///
514/// Returns number of bytes in line (including trailing newline) and level.
515pub(crate) fn scan_setext_heading(data: &[u8]) -> Option<(usize, HeadingLevel)> {
516 let c: u8 = *data.get(index:0)?;
517 let level: HeadingLevel = if c == b'=' {
518 HeadingLevel::H1
519 } else if c == b'-' {
520 HeadingLevel::H2
521 } else {
522 return None;
523 };
524 let mut i: usize = 1 + scan_ch_repeat(&data[1..], c);
525 i += scan_blank_line(&data[i..])?;
526 Some((i, level))
527}
528
529// returns number of bytes in line (including trailing
530// newline) and column alignments
531pub(crate) fn scan_table_head(data: &[u8]) -> (usize, Vec<Alignment>) {
532 let (mut i, spaces) = calc_indent(data, 4);
533 if spaces > 3 || i == data.len() {
534 return (0, vec![]);
535 }
536 let mut cols = vec![];
537 let mut active_col = Alignment::None;
538 let mut start_col = true;
539 if data[i] == b'|' {
540 i += 1;
541 }
542 for c in &data[i..] {
543 if let Some(n) = scan_eol(&data[i..]) {
544 i += n;
545 break;
546 }
547 match *c {
548 b' ' => (),
549 b':' => {
550 active_col = match (start_col, active_col) {
551 (true, Alignment::None) => Alignment::Left,
552 (false, Alignment::Left) => Alignment::Center,
553 (false, Alignment::None) => Alignment::Right,
554 _ => active_col,
555 };
556 start_col = false;
557 }
558 b'-' => {
559 start_col = false;
560 }
561 b'|' => {
562 start_col = true;
563 cols.push(active_col);
564 active_col = Alignment::None;
565 }
566 _ => {
567 cols = vec![];
568 start_col = true;
569 break;
570 }
571 }
572 i += 1;
573 }
574
575 if !start_col {
576 cols.push(active_col);
577 }
578
579 (i, cols)
580}
581
582/// Scan code fence.
583///
584/// Returns number of bytes scanned and the char that is repeated to make the code fence.
585pub(crate) fn scan_code_fence(data: &[u8]) -> Option<(usize, u8)> {
586 let c: u8 = *data.get(index:0)?;
587 if !(c == b'`' || c == b'~') {
588 return None;
589 }
590 let i: usize = 1 + scan_ch_repeat(&data[1..], c);
591 if i >= 3 {
592 if c == b'`' {
593 let suffix: &[u8] = &data[i..];
594 let next_line: usize = i + scan_nextline(bytes:suffix);
595 // FIXME: make sure this is correct
596 if suffix[..(next_line - i)].iter().any(|&b: u8| b == b'`') {
597 return None;
598 }
599 }
600 Some((i, c))
601 } else {
602 None
603 }
604}
605
/// Returns `Some(2)` when `data` starts with `"> "` (a `>` followed by a
/// space), and `None` otherwise.
pub(crate) fn scan_blockquote_start(data: &[u8]) -> Option<usize> {
    if data.starts_with(b"> ") {
        Some(2)
    } else {
        None
    }
}
613
614/// This already assumes the list item has been scanned.
615pub(crate) fn scan_empty_list(data: &[u8]) -> bool {
616 let mut ix: usize = 0;
617 for _ in 0..2 {
618 if let Some(bytes: usize) = scan_blank_line(&data[ix..]) {
619 ix += bytes;
620 } else {
621 return false;
622 }
623 }
624 true
625}
626
627// return number of bytes scanned, delimiter, start index, and indent
628pub(crate) fn scan_listitem(bytes: &[u8]) -> Option<(usize, u8, usize, usize)> {
629 let mut c = *bytes.get(0)?;
630 let (w, start) = match c {
631 b'-' | b'+' | b'*' => (1, 0),
632 b'0'..=b'9' => {
633 let (length, start) = parse_decimal(bytes);
634 c = *bytes.get(length)?;
635 if !(c == b'.' || c == b')') {
636 return None;
637 }
638 (length + 1, start)
639 }
640 _ => {
641 return None;
642 }
643 };
644 // TODO: replace calc_indent with scan_leading_whitespace, for tab correctness
645 let (mut postn, mut postindent) = calc_indent(&bytes[w..], 5);
646 if postindent == 0 {
647 scan_eol(&bytes[w..])?;
648 postindent += 1;
649 } else if postindent > 4 {
650 postn = 1;
651 postindent = 1;
652 }
653 if scan_blank_line(&bytes[w..]).is_some() {
654 postn = 0;
655 postindent = 1;
656 }
657 Some((w + postn, c, start, w + postindent))
658}
659
// Parses the leading run of ASCII decimal digits in `bytes`.
//
// returns (number of bytes, parsed decimal)
//
// Parsing stops early, before the offending digit, if the value would no
// longer fit in a `usize`.
fn parse_decimal(bytes: &[u8]) -> (usize, usize) {
    let mut count = 0;
    let mut value = 0usize;
    for &b in bytes {
        if !b.is_ascii_digit() {
            break;
        }
        let digit = usize::from(b - b'0');
        match value.checked_mul(10).and_then(|v| v.checked_add(digit)) {
            Some(next) => {
                value = next;
                count += 1;
            }
            // stop early on overflow
            None => break,
        }
    }
    (count, value)
}
679
// Parses the leading run of hexadecimal digits (either case) in `bytes`.
//
// returns (number of bytes, parsed hex)
//
// Parsing stops early, before the offending digit, if the value would no
// longer fit in a `usize`.
fn parse_hex(bytes: &[u8]) -> (usize, usize) {
    let mut count = 0;
    let mut value = 0usize;
    for &b in bytes {
        let digit = match b {
            b'0'..=b'9' => b - b'0',
            // Setting bit 0x20 folds upper case onto lower case.
            b'a'..=b'f' | b'A'..=b'F' => (b | 0x20) - b'a' + 10,
            _ => break,
        };
        let widened = value
            .checked_mul(16)
            .and_then(|v| v.checked_add(usize::from(digit)));
        match widened {
            Some(next) => {
                value = next;
                count += 1;
            }
            // stop early on overflow
            None => break,
        }
    }
    (count, value)
}
707
/// Converts a numeric code point to a `char`.
///
/// Zero is mapped to U+FFFD. Returns `None` when `input` does not fit in a
/// `u32` or `char::from_u32` rejects it.
fn char_from_codepoint(input: usize) -> Option<char> {
    let codepoint: u32 = input.try_into().ok()?;
    char::from_u32(if codepoint == 0 { 0xFFFD } else { codepoint })
}
715
716// doesn't bother to check data[0] == '&'
717pub(crate) fn scan_entity(bytes: &[u8]) -> (usize, Option<CowStr<'static>>) {
718 let mut end = 1;
719 if scan_ch(&bytes[end..], b'#') == 1 {
720 end += 1;
721 let (bytecount, codepoint) = if end < bytes.len() && bytes[end] | 0x20 == b'x' {
722 end += 1;
723 parse_hex(&bytes[end..])
724 } else {
725 parse_decimal(&bytes[end..])
726 };
727 end += bytecount;
728 return if bytecount == 0 || scan_ch(&bytes[end..], b';') == 0 {
729 (0, None)
730 } else if let Some(c) = char_from_codepoint(codepoint) {
731 (end + 1, Some(c.into()))
732 } else {
733 (0, None)
734 };
735 }
736 end += scan_while(&bytes[end..], is_ascii_alphanumeric);
737 if scan_ch(&bytes[end..], b';') == 1 {
738 if let Some(value) = entities::get_entity(&bytes[1..end]) {
739 return (end + 1, Some(value.into()));
740 }
741 }
742 (0, None)
743}
744
// FIXME: we can most likely re-use other scanners
//
// Scans a link reference definition title delimited by '...', "..." or (...).
// A backslash escapes the character after it, and the title may not contain
// a blank line.
//
// returns (bytelength, title_str), where bytelength includes both delimiters
// and title_str is the raw text between them
pub(crate) fn scan_refdef_title(text: &str) -> Option<(usize, &str)> {
    let mut chars = text.chars().peekable();
    let closing_delim = match chars.next()? {
        '\'' => '\'',
        '"' => '"',
        '(' => ')',
        _ => return None,
    };
    let mut bytecount = 1;

    while let Some(c) = chars.next() {
        match c {
            '\n' => {
                bytecount += 1;
                // Skip the whitespace that starts the next line. The test is
                // made on the `char` itself: narrowing it to `u8` would make
                // unrelated characters such as U+0120 look like a space.
                while matches!(*chars.peek()?, '\t' | '\x0b' | '\x0c' | ' ') {
                    bytecount += chars.next()?.len_utf8();
                }
                if *chars.peek()? == '\n' {
                    // blank line - not allowed
                    return None;
                }
            }
            '\\' => {
                let next_char = chars.next()?;
                bytecount += 1 + next_char.len_utf8();
            }
            c if c == closing_delim => {
                return Some((bytecount + 1, &text[1..bytecount]));
            }
            c => {
                bytecount += c.len_utf8();
            }
        }
    }
    None
}
785
786// note: dest returned is raw, still needs to be unescaped
787// TODO: check that nested parens are really not allowed for refdefs
788// TODO(performance): this func should probably its own unescaping
789pub(crate) fn scan_link_dest(
790 data: &str,
791 start_ix: usize,
792 max_next: usize,
793) -> Option<(usize, &str)> {
794 let bytes = &data.as_bytes()[start_ix..];
795 let mut i = scan_ch(bytes, b'<');
796
797 if i != 0 {
798 // pointy links
799 while i < bytes.len() {
800 match bytes[i] {
801 b'\n' | b'\r' | b'<' => return None,
802 b'>' => return Some((i + 1, &data[(start_ix + 1)..(start_ix + i)])),
803 b'\\' if i + 1 < bytes.len() && is_ascii_punctuation(bytes[i + 1]) => {
804 i += 1;
805 }
806 _ => {}
807 }
808 i += 1;
809 }
810 None
811 } else {
812 // non-pointy links
813 let mut nest = 0;
814 while i < bytes.len() {
815 match bytes[i] {
816 0x0..=0x20 => {
817 break;
818 }
819 b'(' => {
820 if nest > max_next {
821 return None;
822 }
823 nest += 1;
824 }
825 b')' => {
826 if nest == 0 {
827 break;
828 }
829 nest -= 1;
830 }
831 b'\\' if i + 1 < bytes.len() && is_ascii_punctuation(bytes[i + 1]) => {
832 i += 1;
833 }
834 _ => {}
835 }
836 i += 1;
837 }
838 Some((i, &data[start_ix..(start_ix + i)]))
839 }
840}
841
842/// Returns bytes scanned
843fn scan_attribute_name(data: &[u8]) -> Option<usize> {
844 let (&c: u8, tail: &[u8]) = data.split_first()?;
845 if is_ascii_alpha(c) || c == b'_' || c == b':' {
846 Some(
847 1 + scan_while(data:tail, |c: u8| {
848 is_ascii_alphanumeric(c) || c == b'_' || c == b'.' || c == b':' || c == b'-'
849 }),
850 )
851 } else {
852 None
853 }
854}
855
856/// Returns the index immediately following the attribute on success.
857/// The argument `buffer_ix` refers to the index into `data` from which we
858/// should copy into `buffer` when we find bytes to skip.
859fn scan_attribute(
860 data: &[u8],
861 mut ix: usize,
862 newline_handler: Option<&dyn Fn(&[u8]) -> usize>,
863 buffer: &mut Vec<u8>,
864 buffer_ix: &mut usize,
865) -> Option<usize> {
866 ix += scan_attribute_name(&data[ix..])?;
867 let n_whitespace: usize =
868 scan_whitespace_with_newline_handler(data, i:ix, newline_handler, buffer, buffer_ix)? - ix;
869 ix += n_whitespace;
870 if scan_ch(&data[ix..], c:b'=') == 1 {
871 ix += 1;
872 ix = scan_whitespace_with_newline_handler(data, i:ix, newline_handler, buffer, buffer_ix)?;
873 ix = scan_attribute_value(data, i:ix, newline_handler, buffer, buffer_ix)?;
874 } else if n_whitespace > 0 {
875 // Leave whitespace for next attribute.
876 ix -= 1;
877 }
878 Some(ix)
879}
880
881/// Scans whitespace and possibly newlines according to the
882/// behavior defined by the newline handler. When bytes are skipped,
883/// all preceding non-skipped bytes are pushed to the buffer.
884fn scan_whitespace_with_newline_handler(
885 data: &[u8],
886 mut i: usize,
887 newline_handler: Option<&dyn Fn(&[u8]) -> usize>,
888 buffer: &mut Vec<u8>,
889 buffer_ix: &mut usize,
890) -> Option<usize> {
891 while i < data.len() {
892 if !is_ascii_whitespace(data[i]) {
893 return Some(i);
894 }
895 if let Some(eol_bytes: usize) = scan_eol(&data[i..]) {
896 let handler: &dyn Fn(&[u8]) -> usize = newline_handler?;
897 i += eol_bytes;
898 let skipped_bytes: usize = handler(&data[i..]);
899
900 if skipped_bytes > 0 {
901 buffer.extend(&data[*buffer_ix..i]);
902 *buffer_ix = i + skipped_bytes;
903 }
904
905 i += skipped_bytes;
906 } else {
907 i += 1;
908 }
909 }
910
911 Some(i)
912}
913
914/// Returns the index immediately following the attribute value on success.
915fn scan_attribute_value(
916 data: &[u8],
917 mut i: usize,
918 newline_handler: Option<&dyn Fn(&[u8]) -> usize>,
919 buffer: &mut Vec<u8>,
920 buffer_ix: &mut usize,
921) -> Option<usize> {
922 match *data.get(i)? {
923 b @ b'"' | b @ b'\'' => {
924 i += 1;
925 while i < data.len() {
926 if data[i] == b {
927 return Some(i + 1);
928 }
929 if let Some(eol_bytes) = scan_eol(&data[i..]) {
930 let handler = newline_handler?;
931 i += eol_bytes;
932 let skipped_bytes = handler(&data[i..]);
933
934 if skipped_bytes > 0 {
935 buffer.extend(&data[*buffer_ix..i]);
936 *buffer_ix = i + skipped_bytes;
937 }
938 i += skipped_bytes;
939 } else {
940 i += 1;
941 }
942 }
943 return None;
944 }
945 b' ' | b'=' | b'>' | b'<' | b'`' | b'\n' | b'\r' => {
946 return None;
947 }
948 _ => {
949 // unquoted attribute value
950 i += scan_attr_value_chars(&data[i..]);
951 }
952 }
953
954 Some(i)
955}
956
957// Remove backslash escapes and resolve entities
958pub(crate) fn unescape(input: &str) -> CowStr<'_> {
959 let mut result = String::new();
960 let mut mark = 0;
961 let mut i = 0;
962 let bytes = input.as_bytes();
963 while i < bytes.len() {
964 match bytes[i] {
965 b'\\' if i + 1 < bytes.len() && is_ascii_punctuation(bytes[i + 1]) => {
966 result.push_str(&input[mark..i]);
967 mark = i + 1;
968 i += 2;
969 }
970 b'&' => match scan_entity(&bytes[i..]) {
971 (n, Some(value)) => {
972 result.push_str(&input[mark..i]);
973 result.push_str(&value);
974 i += n;
975 mark = i;
976 }
977 _ => i += 1,
978 },
979 b'\r' => {
980 result.push_str(&input[mark..i]);
981 i += 1;
982 mark = i;
983 }
984 _ => i += 1,
985 }
986 }
987 if mark == 0 {
988 input.into()
989 } else {
990 result.push_str(&input[mark..]);
991 result.into()
992 }
993}
994
995/// Assumes `data` is preceded by `<`.
996pub(crate) fn starts_html_block_type_6(data: &[u8]) -> bool {
997 let i: usize = scan_ch(data, c:b'/');
998 let tail: &[u8] = &data[i..];
999 let n: usize = scan_while(data:tail, f:is_ascii_alphanumeric);
1000 if !is_html_tag(&tail[..n]) {
1001 return false;
1002 }
1003 // Starting condition says the next byte must be either a space, a tab,
1004 // the end of the line, the string >, or the string />
1005 let tail: &[u8] = &tail[n..];
1006 tail.is_empty()
1007 || tail[0] == b' '
1008 || tail[0] == b'\t'
1009 || tail[0] == b'\r'
1010 || tail[0] == b'\n'
1011 || tail[0] == b'>'
1012 || tail.len() >= 2 && &tail[..2] == b"/>"
1013}
1014
1015fn is_html_tag(tag: &[u8]) -> bool {
1016 HTML_TAGS
1017 .binary_search_by(|probe: &&str| {
1018 let probe_bytes_iter: Iter<'_, u8> = probe.as_bytes().iter();
1019 let tag_bytes_iter: Iter<'_, u8> = tag.iter();
1020
1021 probe_bytes_iterOption
1022 .zip(tag_bytes_iter)
1023 .find_map(|(&a: u8, &b: u8)| {
1024 // We can compare case insensitively because the probes are
1025 // all lower case alpha strings.
1026 match a.cmp(&(b | 0x20)) {
1027 std::cmp::Ordering::Equal => None,
1028 inequality: Ordering => Some(inequality),
1029 }
1030 })
1031 .unwrap_or_else(|| probe.len().cmp(&tag.len()))
1032 })
1033 .is_ok()
1034}
1035
1036/// Assumes that `data` starts with `<`.
1037/// Returns the index into data directly after the html tag on success.
1038pub(crate) fn scan_html_type_7(data: &[u8]) -> Option<usize> {
1039 // Block type html does not allow for newlines, so we
1040 // do not pass a newline handler.
1041 let (_span: Vec, i: usize) = scan_html_block_inner(data, newline_handler:None)?;
1042 scan_blank_line(&data[i..])?;
1043 Some(i)
1044}
1045
1046/// Assumes that `data` starts with `<`.
1047/// Returns the number of bytes scanned and the html in case of
1048/// success.
1049/// When some bytes were skipped, because the html was split over
1050/// multiple leafs (e.g. over multiple lines in a blockquote),
1051/// the html is returned as a vector of bytes.
1052/// If no bytes were skipped, the buffer will be empty.
1053pub(crate) fn scan_html_block_inner(
1054 data: &[u8],
1055 newline_handler: Option<&dyn Fn(&[u8]) -> usize>,
1056) -> Option<(Vec<u8>, usize)> {
1057 let mut buffer = Vec::new();
1058 let mut last_buf_index = 0;
1059
1060 let close_tag_bytes = scan_ch(&data[1..], b'/');
1061 let l = scan_while(&data[(1 + close_tag_bytes)..], is_ascii_alpha);
1062 if l == 0 {
1063 return None;
1064 }
1065 let mut i = 1 + close_tag_bytes + l;
1066 i += scan_while(&data[i..], is_ascii_letterdigitdash);
1067
1068 if close_tag_bytes == 0 {
1069 loop {
1070 let old_i = i;
1071 loop {
1072 i += scan_whitespace_no_nl(&data[i..]);
1073 if let Some(eol_bytes) = scan_eol(&data[i..]) {
1074 if eol_bytes == 0 {
1075 return None;
1076 }
1077 let handler = newline_handler?;
1078 i += eol_bytes;
1079 let skipped_bytes = handler(&data[i..]);
1080
1081 let data_len = data.len() - i;
1082
1083 debug_assert!(
1084 skipped_bytes <= data_len,
1085 "Handler tried to skip too many bytes, fed {}, skipped {}",
1086 data_len,
1087 skipped_bytes
1088 );
1089
1090 if skipped_bytes > 0 {
1091 buffer.extend(&data[last_buf_index..i]);
1092 i += skipped_bytes;
1093 last_buf_index = i;
1094 }
1095 } else {
1096 break;
1097 }
1098 }
1099 if let Some(b'/') | Some(b'>') = data.get(i) {
1100 break;
1101 }
1102 if old_i == i {
1103 // No whitespace, which is mandatory.
1104 return None;
1105 }
1106 i = scan_attribute(data, i, newline_handler, &mut buffer, &mut last_buf_index)?;
1107 }
1108 }
1109
1110 i += scan_whitespace_no_nl(&data[i..]);
1111
1112 if close_tag_bytes == 0 {
1113 i += scan_ch(&data[i..], b'/');
1114 }
1115
1116 if scan_ch(&data[i..], b'>') == 0 {
1117 None
1118 } else {
1119 i += 1;
1120 if !buffer.is_empty() {
1121 buffer.extend(&data[last_buf_index..i]);
1122 }
1123 Some((buffer, i))
1124 }
1125}
1126
1127/// Returns (next_byte_offset, uri, type)
1128pub(crate) fn scan_autolink(text: &str, start_ix: usize) -> Option<(usize, CowStr<'_>, LinkType)> {
1129 scan_uriOption<(usize, CowStr<'_>, …)>(text, start_ix)
1130 .map(|(bytes: usize, uri: CowStr<'_>)| (bytes, uri, LinkType::Autolink))
1131 .or_else(|| scan_email(text, start_ix).map(|(bytes: usize, uri: CowStr<'_>)| (bytes, uri, LinkType::Email)))
1132}
1133
1134/// Returns (next_byte_offset, uri)
1135fn scan_uri(text: &str, start_ix: usize) -> Option<(usize, CowStr<'_>)> {
1136 let bytes = &text.as_bytes()[start_ix..];
1137
1138 // scheme's first byte must be an ascii letter
1139 if bytes.is_empty() || !is_ascii_alpha(bytes[0]) {
1140 return None;
1141 }
1142
1143 let mut i = 1;
1144
1145 while i < bytes.len() {
1146 let c = bytes[i];
1147 i += 1;
1148 match c {
1149 c if is_ascii_alphanumeric(c) => (),
1150 b'.' | b'-' | b'+' => (),
1151 b':' => break,
1152 _ => return None,
1153 }
1154 }
1155
1156 // scheme length must be between 2 and 32 characters long. scheme
1157 // must be followed by colon
1158 if i < 3 || i > 33 {
1159 return None;
1160 }
1161
1162 while i < bytes.len() {
1163 match bytes[i] {
1164 b'>' => return Some((start_ix + i + 1, text[start_ix..(start_ix + i)].into())),
1165 b'\0'..=b' ' | b'<' => return None,
1166 _ => (),
1167 }
1168 i += 1;
1169 }
1170
1171 None
1172}
1173
1174/// Returns (next_byte_offset, email)
1175fn scan_email(text: &str, start_ix: usize) -> Option<(usize, CowStr<'_>)> {
1176 // using a regex library would be convenient, but doing it by hand is not too bad
1177 let bytes = &text.as_bytes()[start_ix..];
1178 let mut i = 0;
1179
1180 while i < bytes.len() {
1181 let c = bytes[i];
1182 i += 1;
1183 match c {
1184 c if is_ascii_alphanumeric(c) => (),
1185 b'.' | b'!' | b'#' | b'$' | b'%' | b'&' | b'\'' | b'*' | b'+' | b'/' | b'=' | b'?'
1186 | b'^' | b'_' | b'`' | b'{' | b'|' | b'}' | b'~' | b'-' => (),
1187 b'@' => break,
1188 _ => return None,
1189 }
1190 }
1191
1192 loop {
1193 let label_start_ix = i;
1194 let mut fresh_label = true;
1195
1196 while i < bytes.len() {
1197 match bytes[i] {
1198 c if is_ascii_alphanumeric(c) => (),
1199 b'-' if fresh_label => {
1200 return None;
1201 }
1202 b'-' => (),
1203 _ => break,
1204 }
1205 fresh_label = false;
1206 i += 1;
1207 }
1208
1209 if i == label_start_ix || i - label_start_ix > 63 || bytes[i - 1] == b'-' {
1210 return None;
1211 }
1212
1213 if scan_ch(&bytes[i..], b'.') == 0 {
1214 break;
1215 }
1216 i += 1;
1217 }
1218
1219 if scan_ch(&bytes[i..], b'>') == 0 {
1220 return None;
1221 }
1222
1223 Some((start_ix + i + 1, text[start_ix..(start_ix + i)].into()))
1224}
1225
1226/// Scan comment, declaration, or CDATA section, with initial "<!" already consumed.
1227/// Returns byte offset on match.
1228pub(crate) fn scan_inline_html_comment(
1229 bytes: &[u8],
1230 mut ix: usize,
1231 scan_guard: &mut HtmlScanGuard,
1232) -> Option<usize> {
1233 let c = *bytes.get(ix)?;
1234 ix += 1;
1235 match c {
1236 b'-' => {
1237 let dashes = scan_ch_repeat(&bytes[ix..], b'-');
1238 if dashes < 1 {
1239 return None;
1240 }
1241 // Saw "<!--", scan comment.
1242 ix += dashes;
1243 if scan_ch(&bytes[ix..], b'>') == 1 {
1244 return None;
1245 }
1246
1247 while let Some(x) = memchr(b'-', &bytes[ix..]) {
1248 ix += x + 1;
1249 if scan_ch(&bytes[ix..], b'-') == 1 {
1250 ix += 1;
1251 return if scan_ch(&bytes[ix..], b'>') == 1 {
1252 Some(ix + 1)
1253 } else {
1254 None
1255 };
1256 }
1257 }
1258 None
1259 }
1260 b'[' if bytes[ix..].starts_with(b"CDATA[") && ix > scan_guard.cdata => {
1261 ix += b"CDATA[".len();
1262 ix = memchr(b']', &bytes[ix..]).map_or(bytes.len(), |x| ix + x);
1263 let close_brackets = scan_ch_repeat(&bytes[ix..], b']');
1264 ix += close_brackets;
1265
1266 if close_brackets == 0 || scan_ch(&bytes[ix..], b'>') == 0 {
1267 scan_guard.cdata = ix;
1268 None
1269 } else {
1270 Some(ix + 1)
1271 }
1272 }
1273 b'A'..=b'Z' if ix > scan_guard.declaration => {
1274 // Scan declaration.
1275 ix += scan_while(&bytes[ix..], |c| c >= b'A' && c <= b'Z');
1276 let whitespace = scan_while(&bytes[ix..], is_ascii_whitespace);
1277 if whitespace == 0 {
1278 return None;
1279 }
1280 ix += whitespace;
1281 ix = memchr(b'>', &bytes[ix..]).map_or(bytes.len(), |x| ix + x);
1282 if scan_ch(&bytes[ix..], b'>') == 0 {
1283 scan_guard.declaration = ix;
1284 None
1285 } else {
1286 Some(ix + 1)
1287 }
1288 }
1289 _ => None,
1290 }
1291}
1292
1293/// Scan processing directive, with initial "<?" already consumed.
1294/// Returns the next byte offset on success.
1295pub(crate) fn scan_inline_html_processing(
1296 bytes: &[u8],
1297 mut ix: usize,
1298 scan_guard: &mut HtmlScanGuard,
1299) -> Option<usize> {
1300 if ix <= scan_guard.processing {
1301 return None;
1302 }
1303 while let Some(offset: usize) = memchr(needle:b'?', &bytes[ix..]) {
1304 ix += offset + 1;
1305 if scan_ch(&bytes[ix..], c:b'>') == 1 {
1306 return Some(ix + 1);
1307 }
1308 }
1309 scan_guard.processing = ix;
1310 None
1311}
1312
#[cfg(test)]
mod test {
    use super::*;

    /// A very long digit run followed by a byte that is not a list delimiter
    /// is not a list item.
    #[test]
    fn overflow_list() {
        let input = b"4444444444444444444444444444444444444444444444444444444444!";
        assert!(scan_listitem(input).is_none());
    }

    /// Same as above for a shorter digit run followed by `!`.
    #[test]
    fn overflow_by_addition() {
        let input = b"1844674407370955161615!";
        assert!(scan_listitem(input).is_none());
    }
}
1328