mod.rs source code [crates/mdbook/src/utils/mod.rs]

1	#![allow(missing_docs)] // FIXME: Document this
2
3	pub mod fs;
4	mod string;
5	pub(crate) mod toml_ext;
6	use crate::errors::Error;
7	use log::error;
8	use once_cell::sync::Lazy;
9	use pulldown_cmark::{html, CodeBlockKind, CowStr, Event, Options, Parser, Tag};
10	use regex::Regex;
11
12	use std::borrow::Cow;
13	use std::collections::HashMap;
14	use std::fmt::Write;
15	use std::path::Path;
16
17	pub use self::string::{
18	take_anchored_lines, take_lines, take_rustdoc_include_anchored_lines,
19	take_rustdoc_include_lines,
20	};
21
22	/// Replaces multiple consecutive whitespace characters with a single space character.
23	pub fn collapse_whitespace(text: &str) -> Cow<'_, str> {
24	static RE: Lazy<Regex> = Lazy::new(\|\| Regex::new(re:r"\s\s+").unwrap());
25	RE.replace_all(text, rep:" ")
26	}
27
/// Turns `content` into a string usable as an HTML element ID.
///
/// Characters are handled one at a time:
///
/// * alphanumeric characters, `_` and `-` are kept, with ASCII letters
///   lowercased (non-ASCII letters keep their case);
/// * every whitespace character becomes a single `-`;
/// * everything else is dropped.
///
/// The result therefore never contains whitespace.
pub fn normalize_id(content: &str) -> String {
    let mut id = String::with_capacity(content.len());
    for ch in content.chars() {
        if ch.is_alphanumeric() || matches!(ch, '_' | '-') {
            id.push(ch.to_ascii_lowercase());
        } else if ch.is_whitespace() {
            id.push('-');
        }
        // Any other character (punctuation, symbols, emoji, ...) is skipped.
    }
    id
}
44
45	/// Generate an ID for use with anchors which is derived from a "normalised"
46	/// string.
47	// This function should be made private when the deprecation expires.
48	#[deprecated(since = "0.4.16", note = "use unique_id_from_content instead")]
49	pub fn id_from_content(content: &str) -> String {
50	let mut content: String = content.to_string();
51
52	// Skip any tags or html-encoded stuff
53	static HTML: Lazy<Regex> = Lazy::new(\|\| Regex::new(re:r"(<.*?>)").unwrap());
54	content = HTML.replace_all(&content, rep:"").into();
55	const REPL_SUB: &[&str] = &["<", ">", "&", "'", """];
56	for sub: &&str in REPL_SUB {
57	content = content.replace(from:sub, to:"");
58	}
59
60	// Remove spaces and hashes indicating a header
61	let trimmed: &str = content.trim().trim_start_matches('#').trim();
62	normalize_id(content:trimmed)
63	}
64
65	/// Generate an ID for use with anchors which is derived from a "normalised"
66	/// string.
67	///
68	/// Each ID returned will be unique, if the same `id_counter` is provided on
69	/// each call.
70	pub fn unique_id_from_content(content: &str, id_counter: &mut HashMap<String, usize>) -> String {
71	let id: String = {
72	#[allow(deprecated)]
73	id_from_content(content)
74	};
75
76	// If we have headers with the same normalized id, append an incrementing counter
77	let id_count: &mut usize = id_counter.entry(id.clone()).or_insert(default:`0`);
78	let unique_id: String = match *id_count {
79	`0` => id,
80	id_count: usize => format!("{}-{}", id, id_count),
81	};
82	*id_count += `1`;
83	unique_id
84	}
85
86	/// Fix links to the correct location.
87	///
88	/// This adjusts links, such as turning `.md` extensions to `.html`.
89	///
90	/// `path` is the path to the page being rendered relative to the root of the
91	/// book. This is used for the `print.html` page so that links on the print
92	/// page go to the original location. Normal page rendering sets `path` to
93	/// None. Ideally, print page links would link to anchors on the print page,
94	/// but that is very difficult.
95	fn adjust_links<'a>(event: Event<'a>, path: Option<&Path>) -> Event<'a> {
96	static SCHEME_LINK: Lazy<Regex> = Lazy::new(\|\| Regex::new(r"^[a-z][a-z0-9+.-]*:").unwrap());
97	static MD_LINK: Lazy<Regex> =
98	Lazy::new(\|\| Regex::new(r"(?P<link>.)\.md(?P<anchor>#.)?").unwrap());
99
100	fn fix<'a>(dest: CowStr<'a>, path: Option<&Path>) -> CowStr<'a> {
101	if dest.starts_with('#') {
102	// Fragment-only link.
103	if let Some(path) = path {
104	let mut base = path.display().to_string();
105	if base.ends_with(".md") {
106	base.replace_range(base.len() - `3`.., ".html");
107	}
108	return format!("{}{}", base, dest).into();
109	} else {
110	return dest;
111	}
112	}
113	// Don't modify links with schemes like `https`.
114	if !SCHEME_LINK.is_match(&dest) {
115	// This is a relative link, adjust it as necessary.
116	let mut fixed_link = String::new();
117	if let Some(path) = path {
118	let base = path
119	.parent()
120	.expect("path can't be empty")
121	.to_str()
122	.expect("utf-8 paths only");
123	if !base.is_empty() {
124	write!(fixed_link, "{}/", base).unwrap();
125	}
126	}
127
128	if let Some(caps) = MD_LINK.captures(&dest) {
129	fixed_link.push_str(&caps["link"]);
130	fixed_link.push_str(".html");
131	if let Some(anchor) = caps.name("anchor") {
132	fixed_link.push_str(anchor.as_str());
133	}
134	} else {
135	fixed_link.push_str(&dest);
136	};
137	return CowStr::from(fixed_link);
138	}
139	dest
140	}
141
142	fn fix_html<'a>(html: CowStr<'a>, path: Option<&Path>) -> CowStr<'a> {
143	// This is a terrible hack, but should be reasonably reliable. Nobody
144	// should ever parse a tag with a regex. However, there isn't anything
145	// in Rust that I know of that is suitable for handling partial html
146	// fragments like those generated by pulldown_cmark.
147	//
148	// There are dozens of HTML tags/attributes that contain paths, so
149	// feel free to add more tags if desired; these are the only ones I
150	// care about right now.
151	static HTML_LINK: Lazy<Regex> =
152	Lazy::new(\|\| Regex::new(r#"(<(?:a\|img) [^>]*?(?:src\|href)=")([^"]+?)""#).unwrap());
153
154	HTML_LINK
155	.replace_all(&html, \|caps: &regex::Captures<'_>\| {
156	let fixed = fix(caps[`2`].into(), path);
157	format!("{}{}`\"`", &caps[`1`], fixed)
158	})
159	.into_owned()
160	.into()
161	}
162
163	match event {
164	Event::Start(Tag::Link(link_type, dest, title)) => {
165	Event::Start(Tag::Link(link_type, fix(dest, path), title))
166	}
167	Event::Start(Tag::Image(link_type, dest, title)) => {
168	Event::Start(Tag::Image(link_type, fix(dest, path), title))
169	}
170	Event::Html(html) => Event::Html(fix_html(html, path)),
171	_ => event,
172	}
173	}
174
175	/// Wrapper around the pulldown-cmark parser for rendering markdown to HTML.
176	pub fn render_markdown(text: &str, curly_quotes: bool) -> String {
177	render_markdown_with_path(text, curly_quotes, path:None)
178	}
179
180	pub fn new_cmark_parser(text: &str, curly_quotes: bool) -> Parser<'_, '_> {
181	let mut opts: Options = Options::empty();
182	opts.insert(Options::ENABLE_TABLES);
183	opts.insert(Options::ENABLE_FOOTNOTES);
184	opts.insert(Options::ENABLE_STRIKETHROUGH);
185	opts.insert(Options::ENABLE_TASKLISTS);
186	opts.insert(Options::ENABLE_HEADING_ATTRIBUTES);
187	if curly_quotes {
188	opts.insert(Options::ENABLE_SMART_PUNCTUATION);
189	}
190	Parser::new_ext(text, options:opts)
191	}
192
193	pub fn render_markdown_with_path(text: &str, curly_quotes: bool, path: Option<&Path>) -> String {
194	let mut s: String = String::with_capacity(text.len() * `3` / `2`);
195	let p: Parser<'_, '_> = new_cmark_parser(text, curly_quotes);
196	let events: impl Iterator> = pimpl Iterator>
197	.map(clean_codeblock_headers)
198	.map(\|event: Event<'_>\| adjust_links(event, path))
199	.flat_map(\|event: Event<'_>\| {
200	let (a: Option>, b: Option>) = wrap_tables(event);
201	a.into_iter().chain(b)
202	});
203
204	html::push_html(&mut s, iter:events);
205	s
206	}
207
208	/// Wraps tables in a `.table-wrapper` class to apply overflow-x rules to.
209	fn wrap_tables(event: Event<'_>) -> (Option<Event<'_>>, Option<Event<'_>>) {
210	match event {
211	Event::Start(Tag::Table(_)) => (
212	Some(Event::Html(r#"<div class="table-wrapper">"#.into())),
213	Some(event),
214	),
215	Event::End(Tag::Table(_)) => (Some(event), Some(Event::Html(r#"</div>"#.into()))),
216	_ => (Some(event), None),
217	}
218	}
219
220	fn clean_codeblock_headers(event: Event<'_>) -> Event<'_> {
221	match event {
222	Event::Start(Tag::CodeBlock(CodeBlockKind::Fenced(ref info: &CowStr<'_>))) => {
223	let info: String = infoimpl Iterator
224	.chars()
225	.map(\|x: char\| match x {
226	' ' \| '`\t`' => ',',
227	_ => x,
228	})
229	.filter(\|ch: &char\| !ch.is_whitespace())
230	.collect();
231
232	Event::Start(Tag::CodeBlock(CodeBlockKind::Fenced(CowStr::from(info))))
233	}
234	_ => event,
235	}
236	}
237
238	/// Prints a "backtrace" of some `Error`.
239	pub fn log_backtrace(e: &Error) {
240	error!("Error: {}", e);
241
242	for cause: &dyn Error in e.chain().skip(`1`) {
243	error!("`\t`Caused By: {}", cause);
244	}
245	}
246
/// Escapes angle brackets in `s` for inclusion in HTML.
///
/// Every `<` becomes `&lt;` and every `>` becomes `&gt;`; all other
/// characters are copied through unchanged.
pub(crate) fn bracket_escape(mut s: &str) -> String {
    let mut escaped = String::with_capacity(s.len());
    let needs_escape: &[char] = &['<', '>'];
    while let Some(next) = s.find(needs_escape) {
        // Copy the untouched text before the bracket in one go.
        escaped.push_str(&s[..next]);
        match s.as_bytes()[next] {
            b'<' => escaped.push_str("&lt;"),
            b'>' => escaped.push_str("&gt;"),
            // `find` only reports positions of `<` or `>`.
            _ => unreachable!(),
        }
        // Both brackets are one byte long, so `next + 1` is a char boundary.
        s = &s[next + 1..];
    }
    escaped.push_str(s);
    escaped
}
262
263	#[cfg(test)]
264	mod tests {
265	use super::bracket_escape;
266
267	mod render_markdown {
268	use super::super::render_markdown;
269
270	#[test]
271	fn preserves_external_links() {
272	assert_eq!(
273	render_markdown("[example](https://www.rust-lang.org/)", `false`),
274	"<p><a href=`\"`https://www.rust-lang.org/`\"`>example</a></p>`\n`"
275	);
276	}
277
278	#[test]
279	fn it_can_adjust_markdown_links() {
280	assert_eq!(
281	render_markdown("[example](example.md)", `false`),
282	"<p><a href=`\"`example.html`\"`>example</a></p>`\n`"
283	);
284	assert_eq!(
285	render_markdown("[example_anchor](example.md#anchor)", `false`),
286	"<p><a href=`\"`example.html#anchor`\"`>example_anchor</a></p>`\n`"
287	);
288
289	// this anchor contains 'md' inside of it
290	assert_eq!(
291	render_markdown("[phantom data](foo.html#phantomdata)", `false`),
292	"<p><a href=`\"`foo.html#phantomdata`\"`>phantom data</a></p>`\n`"
293	);
294	}
295
296	#[test]
297	fn it_can_wrap_tables() {
298	let src = r#"
299	\| Original \| Punycode \| Punycode + Encoding \|
300	\|-----------------\|-----------------\|---------------------\|
301	\| føø \| f-5gaa \| f_5gaa \|
302	"#;
303	let out = r#"
304	<div class="table-wrapper"><table><thead><tr><th>Original</th><th>Punycode</th><th>Punycode + Encoding</th></tr></thead><tbody>
305	<tr><td>føø</td><td>f-5gaa</td><td>f_5gaa</td></tr>
306	</tbody></table>
307	</div>
308	"#.trim();
309	assert_eq!(render_markdown(src, `false`), out);
310	}
311
312	#[test]
313	fn it_can_keep_quotes_straight() {
314	assert_eq!(render_markdown("'one'", `false`), "<p>'one'</p>`\n`");
315	}
316
317	#[test]
318	fn it_can_make_quotes_curly_except_when_they_are_in_code() {
319	let input = r#"
320	'one'
321	```
322	'two'
323	```
324	`'three'` 'four'"#;
325	let expected = r#"<p>‘one’</p>
326	<pre><code>'two'
327	</code></pre>
328	<p><code>'three'</code> ‘four’</p>
329	"#;
330	assert_eq!(render_markdown(input, `true`), expected);
331	}
332
333	#[test]
334	fn whitespace_outside_of_codeblock_header_is_preserved() {
335	let input = r#"
336	some text with spaces
337	```rust
338	fn main() {
339	// code inside is unchanged
340	}
341	```
342	more text with spaces
343	"#;
344
345	let expected = r#"<p>some text with spaces</p>
346	<pre><code class="language-rust">fn main() {
347	// code inside is unchanged
348	}
349	</code></pre>
350	<p>more text with spaces</p>
351	"#;
352	assert_eq!(render_markdown(input, `false`), expected);
353	assert_eq!(render_markdown(input, `true`), expected);
354	}
355
356	#[test]
357	fn rust_code_block_properties_are_passed_as_space_delimited_class() {
358	let input = r#"
359	```rust,no_run,should_panic,property_3
360	```
361	"#;
362
363	let expected = r#"<pre><code class="language-rust,no_run,should_panic,property_3"></code></pre>
364	"#;
365	assert_eq!(render_markdown(input, `false`), expected);
366	assert_eq!(render_markdown(input, `true`), expected);
367	}
368
369	#[test]
370	fn rust_code_block_properties_with_whitespace_are_passed_as_space_delimited_class() {
371	let input = r#"
372	```rust, no_run,,,should_panic , ,property_3
373	```
374	"#;
375
376	let expected = r#"<pre><code class="language-rust,,,,,no_run,,,should_panic,,,,property_3"></code></pre>
377	"#;
378	assert_eq!(render_markdown(input, `false`), expected);
379	assert_eq!(render_markdown(input, `true`), expected);
380	}
381
382	#[test]
383	fn rust_code_block_without_properties_has_proper_html_class() {
384	let input = r#"
385	```rust
386	```
387	"#;
388
389	let expected = r#"<pre><code class="language-rust"></code></pre>
390	"#;
391	assert_eq!(render_markdown(input, `false`), expected);
392	assert_eq!(render_markdown(input, `true`), expected);
393
394	let input = r#"
395	```rust
396	```
397	"#;
398	assert_eq!(render_markdown(input, `false`), expected);
399	assert_eq!(render_markdown(input, `true`), expected);
400	}
401	}
402
403	#[allow(deprecated)]
404	mod id_from_content {
405	use super::super::id_from_content;
406
407	#[test]
408	fn it_generates_anchors() {
409	assert_eq!(
410	id_from_content("## Method-call expressions"),
411	"method-call-expressions"
412	);
413	assert_eq!(id_from_content("## Bold title"), "bold-title");
414	assert_eq!(id_from_content("## `Code` title"), "code-title");
415	assert_eq!(
416	id_from_content("## title <span dir=rtl>foo</span>"),
417	"title-foo"
418	);
419	}
420
421	#[test]
422	fn it_generates_anchors_from_non_ascii_initial() {
423	assert_eq!(
424	id_from_content("## `--passes`: add more rustdoc passes"),
425	"--passes-add-more-rustdoc-passes"
426	);
427	assert_eq!(
428	id_from_content("## 中文標題 CJK title"),
429	"中文標題-cjk-title"
430	);
431	assert_eq!(id_from_content("## Über"), "Über");
432	}
433	}
434
435	mod html_munging {
436	use super::super::{normalize_id, unique_id_from_content};
437
438	#[test]
439	fn it_normalizes_ids() {
440	assert_eq!(
441	normalize_id("`--passes`: add more rustdoc passes"),
442	"--passes-add-more-rustdoc-passes"
443	);
444	assert_eq!(
445	normalize_id("Method-call 🐙 expressions `\u{1f47c}`"),
446	"method-call--expressions-"
447	);
448	assert_eq!(normalize_id("_-_12345"), "_-_12345");
449	assert_eq!(normalize_id("12345"), "12345");
450	assert_eq!(normalize_id("中文"), "中文");
451	assert_eq!(normalize_id("にほんご"), "にほんご");
452	assert_eq!(normalize_id("한국어"), "한국어");
453	assert_eq!(normalize_id(""), "");
454	}
455
456	#[test]
457	fn it_generates_unique_ids_from_content() {
458	// Same id if not given shared state
459	assert_eq!(
460	unique_id_from_content("## 中文標題 CJK title", &mut Default::default()),
461	"中文標題-cjk-title"
462	);
463	assert_eq!(
464	unique_id_from_content("## 中文標題 CJK title", &mut Default::default()),
465	"中文標題-cjk-title"
466	);
467
468	// Different id if given shared state
469	let mut id_counter = Default::default();
470	assert_eq!(unique_id_from_content("## Über", &mut id_counter), "Über");
471	assert_eq!(
472	unique_id_from_content("## 中文標題 CJK title", &mut id_counter),
473	"中文標題-cjk-title"
474	);
475	assert_eq!(unique_id_from_content("## Über", &mut id_counter), "Über-1");
476	assert_eq!(unique_id_from_content("## Über", &mut id_counter), "Über-2");
477	}
478	}
479
480	#[test]
481	fn escaped_brackets() {
482	assert_eq!(bracket_escape(""), "");
483	assert_eq!(bracket_escape("<"), "<");
484	assert_eq!(bracket_escape(">"), ">");
485	assert_eq!(bracket_escape("<>"), "<>");
486	assert_eq!(bracket_escape("<test>"), "<test>");
487	assert_eq!(bracket_escape("a<test>b"), "a<test>b");
488	}
489	}
490