linklabel.rs source code [crates/pulldown_cmark/src/linklabel.rs]

1	// Copyright 2018 Google LLC
2	//
3	// Permission is hereby granted, free of charge, to any person obtaining a copy
4	// of this software and associated documentation files (the "Software"), to deal
5	// in the Software without restriction, including without limitation the rights
6	// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7	// copies of the Software, and to permit persons to whom the Software is
8	// furnished to do so, subject to the following conditions:
9	//
10	// The above copyright notice and this permission notice shall be included in
11	// all copies or substantial portions of the Software.
12	//
13	// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14	// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15	// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16	// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17	// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18	// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19	// THE SOFTWARE.
20
21	//! Link label parsing and matching.
22
23	use unicase::UniCase;
24
25	use crate::scanners::{is_ascii_whitespace, scan_eol, is_ascii_punctuation};
26	use crate::strings::CowStr;
27
28	#[derive(Debug)]
29	pub(crate) enum ReferenceLabel<'a> {
30	Link(CowStr<'a>),
31	Footnote(CowStr<'a>),
32	}
33
34	pub(crate) type LinkLabel<'a> = UniCase<CowStr<'a>>;
35
36	pub(crate) type FootnoteLabel<'a> = UniCase<CowStr<'a>>;
37
38	/// Assumes the opening bracket has already been scanned.
39	/// The line break handler determines what happens when a linebreak
40	/// is found. It is passed the bytes following the line break and
41	/// either returns `Some(k)`, where `k` is the number of bytes to skip,
42	/// or `None` to abort parsing the label.
43	/// Returns the number of bytes read (including closing bracket) and label on success.
44	pub(crate) fn scan_link_label_rest<'t>(
45	text: &'t str,
46	linebreak_handler: &dyn Fn(&[u8]) -> Option<usize>,
47	is_in_table: bool,
48	) -> Option<(usize, CowStr<'t>)> {
49	let bytes = text.as_bytes();
50	let mut ix = `0`;
51	let mut only_white_space = `true`;
52	let mut codepoints = `0`;
53	// no worries, doesn't allocate until we push things onto it
54	let mut label = String::new();
55	let mut mark = `0`;
56
57	loop {
58	if codepoints >= `1000` {
59	return None;
60	}
61	match *bytes.get(ix)? {
62	b'[' => return None,
63	b']' => break,
64	// Backslash escapes in link references are normally untouched, but
65	// tables are an exception, because they're parsed as-if the tables
66	// were parsed in a discrete pass, changing `\\|` to `\|`, and then
67	// passing the changed string to the inline parser.
68	b'\|' if is_in_table && ix != `0` && bytes.get(ix - `1`) == Some(&b'`\\`') => {
69	// only way to reach this spot is to have `\\\|` (even number of `\` before `\|`)
70	label.push_str(&text[mark..ix - `1`]);
71	label.push('\|');
72	ix += `1`;
73	only_white_space = `false`;
74	mark = ix;
75	}
76	b'`\\`' if is_in_table && bytes.get(ix + `1`) == Some(&b'\|') => {
77	// only way to reach this spot is to have `\\|` (odd number of `\` before `\|`)
78	label.push_str(&text[mark..ix]);
79	label.push('\|');
80	ix += `2`;
81	codepoints += `1`;
82	only_white_space = `false`;
83	mark = ix;
84	}
85	b'`\\`' if is_ascii_punctuation(*bytes.get(ix + `1`)?) => {
86	ix += `2`;
87	codepoints += `2`;
88	only_white_space = `false`;
89	}
90	b if is_ascii_whitespace(b) => {
91	// normalize labels by collapsing whitespaces, including linebreaks
92	let mut whitespaces = `0`;
93	let mut linebreaks = `0`;
94	let whitespace_start = ix;
95
96	while ix < bytes.len() && is_ascii_whitespace(bytes[ix]) {
97	if let Some(eol_bytes) = scan_eol(&bytes[ix..]) {
98	linebreaks += `1`;
99	if linebreaks > `1` {
100	return None;
101	}
102	ix += eol_bytes;
103	ix += linebreak_handler(&bytes[ix..])?;
104	whitespaces += `2`; // indicate that we need to replace
105	} else {
106	whitespaces += if bytes[ix] == b' ' { `1` } else { `2` };
107	ix += `1`;
108	}
109	}
110	if whitespaces > `1` {
111	label.push_str(&text[mark..whitespace_start]);
112	label.push(' ');
113	mark = ix;
114	codepoints += ix - whitespace_start;
115	} else {
116	codepoints += `1`;
117	}
118	}
119	b => {
120	only_white_space = `false`;
121	ix += `1`;
122	if b & `0b1000_0000` != `0` {
123	codepoints += `1`;
124	}
125	}
126	}
127	}
128
129	if only_white_space {
130	None
131	} else {
132	let cow = if mark == `0` {
133	let asciiws = &[' ', '`\r`', '`\n`', '`\t`'][..];
134	text[..ix].trim_matches(asciiws).into()
135	} else {
136	label.push_str(&text[mark..ix]);
137	while matches!(label.as_bytes().last(), Some(&b' ' \| &b'`\r`' \| &b'`\n`' \| &b'`\t`')) {
138	label.pop();
139	}
140	while matches!(label.as_bytes().first(), Some(&b' ' \| &b'`\r`' \| &b'`\n`' \| &b'`\t`')) {
141	label.remove(`0`);
142	}
143	label.into()
144	};
145	Some((ix + `1`, cow))
146	}
147	}
148
#[cfg(test)]
mod test {
    use super::scan_link_label_rest;

    /// Runs of tabs inside a label come out as single regular spaces.
    #[test]
    fn whitespace_normalization() {
        let source = "«\t\tBlurry Eyes\t\t»][blurry_eyes]";
        let abort_on_linebreak = |_: &[u8]| -> Option<usize> { None };

        let scanned = scan_link_label_rest(source, &abort_on_linebreak, false);
        let (_consumed, label) = scanned.unwrap();

        // regular spaces!
        let wanted = "« Blurry Eyes »";
        assert_eq!(wanted, label.as_ref());
    }

    /// A label may contain CRLF line breaks when the handler lets scanning continue.
    #[test]
    fn return_carriage_linefeed_ok() {
        let source = "hello\r\nworld\r\n]";
        let skip_nothing = |_: &[u8]| -> Option<usize> { Some(0) };

        let scanned = scan_link_label_rest(source, &skip_nothing, false);
        assert!(scanned.is_some());
    }
}
168