1// Copyright 2018 Google LLC
2//
3// Permission is hereby granted, free of charge, to any person obtaining a copy
4// of this software and associated documentation files (the "Software"), to deal
5// in the Software without restriction, including without limitation the rights
6// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7// copies of the Software, and to permit persons to whom the Software is
8// furnished to do so, subject to the following conditions:
9//
10// The above copyright notice and this permission notice shall be included in
11// all copies or substantial portions of the Software.
12//
13// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19// THE SOFTWARE.
20
21//! Link label parsing and matching.
22
23use unicase::UniCase;
24
25use crate::scanners::{is_ascii_whitespace, scan_eol};
26use crate::strings::CowStr;
27
/// Label of a reference, tagged with the kind of reference it names.
///
/// Both variants carry the label text as a [`CowStr`].
pub(crate) enum ReferenceLabel<'a> {
    /// Label naming a link reference.
    Link(CowStr<'a>),
    /// Label naming a footnote reference.
    Footnote(CowStr<'a>),
}
32
/// A link label wrapped in [`UniCase`], so that labels are compared
/// case-insensitively.
pub(crate) type LinkLabel<'a> = UniCase<CowStr<'a>>;
34
35/// Assumes the opening bracket has already been scanned.
36/// The line break handler determines what happens when a linebreak
37/// is found. It is passed the bytes following the line break and
38/// either returns `Some(k)`, where `k` is the number of bytes to skip,
39/// or `None` to abort parsing the label.
40/// Returns the number of bytes read (including closing bracket) and label on success.
41pub(crate) fn scan_link_label_rest<'t>(
42 text: &'t str,
43 linebreak_handler: &dyn Fn(&[u8]) -> Option<usize>,
44) -> Option<(usize, CowStr<'t>)> {
45 let bytes = text.as_bytes();
46 let mut ix = 0;
47 let mut only_white_space = true;
48 let mut codepoints = 0;
49 // no worries, doesn't allocate until we push things onto it
50 let mut label = String::new();
51 let mut mark = 0;
52
53 loop {
54 if codepoints >= 1000 {
55 return None;
56 }
57 match *bytes.get(ix)? {
58 b'[' => return None,
59 b']' => break,
60 b'\\' => {
61 ix += 2;
62 codepoints += 2;
63 only_white_space = false;
64 }
65 b if is_ascii_whitespace(b) => {
66 // normalize labels by collapsing whitespaces, including linebreaks
67 let mut whitespaces = 0;
68 let mut linebreaks = 0;
69 let whitespace_start = ix;
70
71 while ix < bytes.len() && is_ascii_whitespace(bytes[ix]) {
72 if let Some(eol_bytes) = scan_eol(&bytes[ix..]) {
73 linebreaks += 1;
74 if linebreaks > 1 {
75 return None;
76 }
77 ix += eol_bytes;
78 ix += linebreak_handler(&bytes[ix..])?;
79 whitespaces += 2; // indicate that we need to replace
80 } else {
81 whitespaces += if bytes[ix] == b' ' { 1 } else { 2 };
82 ix += 1;
83 }
84 }
85 if whitespaces > 1 {
86 label.push_str(&text[mark..whitespace_start]);
87 label.push(' ');
88 mark = ix;
89 codepoints += ix - whitespace_start;
90 } else {
91 codepoints += 1;
92 }
93 }
94 b => {
95 only_white_space = false;
96 ix += 1;
97 if b & 0b1000_0000 != 0 {
98 codepoints += 1;
99 }
100 }
101 }
102 }
103
104 if only_white_space {
105 None
106 } else {
107 let cow = if mark == 0 {
108 text[..ix].into()
109 } else {
110 label.push_str(&text[mark..ix]);
111 label.into()
112 };
113 Some((ix + 1, cow))
114 }
115}
116
#[cfg(test)]
mod test {
    use super::scan_link_label_rest;

    /// Runs of tabs inside a label are collapsed into single regular spaces.
    #[test]
    fn whitespace_normalization() {
        let input = "«\t\tBlurry Eyes\t\t»][blurry_eyes]";
        let expected_output = "« Blurry Eyes »"; // regular spaces!

        let (_bytes, normalized_label) = scan_link_label_rest(input, &|_| None).unwrap();
        assert_eq!(expected_output, normalized_label.as_ref());
    }

    /// A CRLF line break is accepted when the handler allows the label to continue.
    #[test]
    fn return_carriage_linefeed_ok() {
        let input = "hello\r\nworld\r\n]";
        assert!(scan_link_label_rest(input, &|_| Some(0)).is_some());
    }

    /// An unescaped opening bracket inside the label aborts the scan.
    #[test]
    fn nested_opening_bracket_is_rejected() {
        assert!(scan_link_label_rest("a[b]", &|_| None).is_none());
    }

    /// A label made up of whitespace only is not a valid label.
    #[test]
    fn whitespace_only_label_is_rejected() {
        assert!(scan_link_label_rest("   ]", &|_| None).is_none());
    }
}
136