search.rs source code [crates/mdbook/src/renderer/html_handlebars/search.rs]

1	use std::borrow::Cow;
2	use std::collections::{HashMap, HashSet};
3	use std::path::Path;
4
5	use elasticlunr::{Index, IndexBuilder};
6	use once_cell::sync::Lazy;
7	use pulldown_cmark::*;
8
9	use crate::book::{Book, BookItem};
10	use crate::config::Search;
11	use crate::errors::*;
12	use crate::theme::searcher;
13	use crate::utils;
14	use log::{debug, warn};
15	use serde::Serialize;
16
/// Maximum length, in bytes, of a lowercased token that is kept by [`tokenize`].
/// Longer tokens are dropped instead of being indexed.
const MAX_WORD_LENGTH_TO_INDEX: usize = 80;

/// Tokenizes in the same way as elasticlunr-rs (for English), but also drops long tokens.
///
/// The input is split on whitespace and on `-`. Empty fragments are skipped,
/// every remaining fragment is trimmed and lowercased, and fragments whose
/// lowercased form is longer than [`MAX_WORD_LENGTH_TO_INDEX`] bytes are
/// discarded.
///
/// Returns the surviving tokens in the order they appear in `text`.
fn tokenize(text: &str) -> Vec<String> {
    text.split(|c: char| c.is_whitespace() || c == '-')
        .filter(|s| !s.is_empty())
        .map(|s| s.trim().to_lowercase())
        // The length check runs on the lowercased token, measured in bytes.
        .filter(|s| s.len() <= MAX_WORD_LENGTH_TO_INDEX)
        .collect()
}
27
28	/// Creates all files required for search.
29	pub fn create_files(search_config: &Search, destination: &Path, book: &Book) -> Result<()> {
30	let mut index = IndexBuilder::new()
31	.add_field_with_tokenizer("title", Box::new(&tokenize))
32	.add_field_with_tokenizer("body", Box::new(&tokenize))
33	.add_field_with_tokenizer("breadcrumbs", Box::new(&tokenize))
34	.build();
35
36	let mut doc_urls = Vec::with_capacity(book.sections.len());
37
38	for item in book.iter() {
39	render_item(&mut index, search_config, &mut doc_urls, item)?;
40	}
41
42	let index = write_to_json(index, search_config, doc_urls)?;
43	debug!("Writing search index ✓");
44	if index.len() > `10_000_000` {
45	warn!("searchindex.json is very large ({} bytes)", index.len());
46	}
47
48	if search_config.copy_js {
49	utils::fs::write_file(destination, "searchindex.json", index.as_bytes())?;
50	utils::fs::write_file(
51	destination,
52	"searchindex.js",
53	format!("Object.assign(window.search, {});", index).as_bytes(),
54	)?;
55	utils::fs::write_file(destination, "searcher.js", searcher::JS)?;
56	utils::fs::write_file(destination, "mark.min.js", searcher::MARK_JS)?;
57	utils::fs::write_file(destination, "elasticlunr.min.js", searcher::ELASTICLUNR_JS)?;
58	debug!("Copying search files ✓");
59	}
60
61	Ok(())
62	}
63
64	/// Uses the given arguments to construct a search document, then inserts it to the given index.
65	fn add_doc(
66	index: &mut Index,
67	doc_urls: &mut Vec<String>,
68	anchor_base: &str,
69	section_id: &Option<String>,
70	items: &[&str],
71	) {
72	let url: Cow<'_, str> = if let Some(ref id: &String) = *section_id {
73	Cow::Owned(format!("{}#{}", anchor_base, id))
74	} else {
75	Cow::Borrowed(anchor_base)
76	};
77	let url: Cow<'_, str> = utils::collapse_whitespace(text:url.trim());
78	let doc_ref: String = doc_urls.len().to_string();
79	doc_urls.push(url.into());
80
81	let items: impl Iterator> = items.iter().map(\|&x: &str\| utils::collapse_whitespace(text:x.trim()));
82	index.add_doc(&doc_ref, data:items);
83	}
84
85	/// Renders markdown into flat unformatted text and adds it to the search index.
86	fn render_item(
87	index: &mut Index,
88	search_config: &Search,
89	doc_urls: &mut Vec<String>,
90	item: &BookItem,
91	) -> Result<()> {
92	let chapter = match *item {
93	BookItem::Chapter(ref ch) if !ch.is_draft_chapter() => ch,
94	_ => return Ok(()),
95	};
96
97	let chapter_path = chapter
98	.path
99	.as_ref()
100	.expect("Checked that path exists above");
101	let filepath = Path::new(&chapter_path).with_extension("html");
102	let filepath = filepath
103	.to_str()
104	.with_context(\|\| "Could not convert HTML path to str")?;
105	let anchor_base = utils::fs::normalize_path(filepath);
106
107	let mut p = utils::new_cmark_parser(&chapter.content, `false`).peekable();
108
109	let mut in_heading = `false`;
110	let max_section_depth = u32::from(search_config.heading_split_level);
111	let mut section_id = None;
112	let mut heading = String::new();
113	let mut body = String::new();
114	let mut breadcrumbs = chapter.parent_names.clone();
115	let mut footnote_numbers = HashMap::new();
116
117	breadcrumbs.push(chapter.name.clone());
118
119	let mut id_counter = HashMap::new();
120	while let Some(event) = p.next() {
121	match event {
122	Event::Start(Tag::Heading(i, ..)) if i as u32 <= max_section_depth => {
123	if !heading.is_empty() {
124	// Section finished, the next heading is following now
125	// Write the data to the index, and clear it for the next section
126	add_doc(
127	index,
128	doc_urls,
129	&anchor_base,
130	&section_id,
131	&[&heading, &body, &breadcrumbs.join(" » ")],
132	);
133	section_id = None;
134	heading.clear();
135	body.clear();
136	breadcrumbs.pop();
137	}
138
139	in_heading = `true`;
140	}
141	Event::End(Tag::Heading(i, id, _classes)) if i as u32 <= max_section_depth => {
142	in_heading = `false`;
143	section_id = id
144	.map(\|id\| id.to_string())
145	.or_else(\|\| Some(utils::unique_id_from_content(&heading, &mut id_counter)));
146	breadcrumbs.push(heading.clone());
147	}
148	Event::Start(Tag::FootnoteDefinition(name)) => {
149	let number = footnote_numbers.len() + `1`;
150	footnote_numbers.entry(name).or_insert(number);
151	}
152	Event::Html(html) => {
153	let mut html_block = html.into_string();
154
155	// As of pulldown_cmark 0.6, html events are no longer contained
156	// in an HtmlBlock tag. We must collect consecutive Html events
157	// into a block ourselves.
158	while let Some(Event::Html(html)) = p.peek() {
159	html_block.push_str(html);
160	p.next();
161	}
162
163	body.push_str(&clean_html(&html_block));
164	}
165	Event::Start(_) \| Event::End(_) \| Event::Rule \| Event::SoftBreak \| Event::HardBreak => {
166	// Insert spaces where HTML output would usually separate text
167	// to ensure words don't get merged together
168	if in_heading {
169	heading.push(' ');
170	} else {
171	body.push(' ');
172	}
173	}
174	Event::Text(text) \| Event::Code(text) => {
175	if in_heading {
176	heading.push_str(&text);
177	} else {
178	body.push_str(&text);
179	}
180	}
181	Event::FootnoteReference(name) => {
182	let len = footnote_numbers.len() + `1`;
183	let number = footnote_numbers.entry(name).or_insert(len);
184	body.push_str(&format!(" [{}] ", number));
185	}
186	Event::TaskListMarker(_checked) => {}
187	}
188	}
189
190	if !body.is_empty() \|\| !heading.is_empty() {
191	if heading.is_empty() {
192	if let Some(chapter) = breadcrumbs.first() {
193	heading = chapter.clone();
194	}
195	}
196	// Make sure the last section is added to the index
197	add_doc(
198	index,
199	doc_urls,
200	&anchor_base,
201	&section_id,
202	&[&heading, &body, &breadcrumbs.join(" » ")],
203	);
204	}
205
206	Ok(())
207	}
208
209	fn write_to_json(index: Index, search_config: &Search, doc_urls: Vec<String>) -> Result<String> {
210	use elasticlunr::config::{SearchBool, SearchOptions, SearchOptionsField};
211	use std::collections::BTreeMap;
212
213	#[derive(Serialize)]
214	struct ResultsOptions {
215	limit_results: u32,
216	teaser_word_count: u32,
217	}
218
219	#[derive(Serialize)]
220	struct SearchindexJson {
221	/// The options used for displaying search results
222	results_options: ResultsOptions,
223	/// The searchoptions for elasticlunr.js
224	search_options: SearchOptions,
225	/// Used to lookup a document's URL from an integer document ref.
226	doc_urls: Vec<String>,
227	/// The index for elasticlunr.js
228	index: elasticlunr::Index,
229	}
230
231	let mut fields = BTreeMap::new();
232	let mut opt = SearchOptionsField::default();
233	let mut insert_boost = \|key: &str, boost\| {
234	opt.boost = Some(boost);
235	fields.insert(key.into(), opt);
236	};
237	insert_boost("title", search_config.boost_title);
238	insert_boost("body", search_config.boost_paragraph);
239	insert_boost("breadcrumbs", search_config.boost_hierarchy);
240
241	let search_options = SearchOptions {
242	bool: if search_config.use_boolean_and {
243	SearchBool::And
244	} else {
245	SearchBool::Or
246	},
247	expand: search_config.expand,
248	fields,
249	};
250
251	let results_options = ResultsOptions {
252	limit_results: search_config.limit_results,
253	teaser_word_count: search_config.teaser_word_count,
254	};
255
256	let json_contents = SearchindexJson {
257	results_options,
258	search_options,
259	doc_urls,
260	index,
261	};
262
263	// By converting to serde_json::Value as an intermediary, we use a
264	// BTreeMap internally and can force a stable ordering of map keys.
265	let json_contents = serde_json::to_value(&json_contents)?;
266	let json_contents = serde_json::to_string(&json_contents)?;
267
268	Ok(json_contents)
269	}
270
271	fn clean_html(html: &str) -> String {
272	static AMMONIA: Lazy<ammonia::Builder<'static>> = Lazy::new(\|\| {
273	let mut clean_content: HashSet<&str> = HashSet::new();
274	clean_content.insert("script");
275	clean_content.insert("style");
276	let mut builder: Builder<'_> = ammonia::Builder::new();
277	builder&mut Builder<'_>
278	.tags(HashSet::new())
279	.tag_attributes(HashMap::new())
280	.generic_attributes(HashSet::new())
281	.link_rel(None)
282	.allowed_classes(HashMap::new())
283	.clean_content_tags(clean_content);
284	builder
285	});
286	AMMONIA.clean(src:html).to_string()
287	}
288