1use std::borrow::Cow;
2use std::collections::{HashMap, HashSet};
3use std::path::Path;
4
5use elasticlunr::{Index, IndexBuilder};
6use once_cell::sync::Lazy;
7use pulldown_cmark::*;
8
9use crate::book::{Book, BookItem};
10use crate::config::Search;
11use crate::errors::*;
12use crate::theme::searcher;
13use crate::utils;
14use log::{debug, warn};
15use serde::Serialize;
16
/// Longest token, measured in bytes after lowercasing, that is kept for indexing.
const MAX_WORD_LENGTH_TO_INDEX: usize = 80;

/// Tokenizes in the same way as elasticlunr-rs (for English), but also drops long tokens.
///
/// The text is split on every whitespace character and on `-`. Empty pieces
/// are discarded, the remaining pieces are trimmed and lowercased, and any
/// token whose byte length exceeds [`MAX_WORD_LENGTH_TO_INDEX`] is dropped.
fn tokenize(text: &str) -> Vec<String> {
    text.split(|c: char| c.is_whitespace() || c == '-')
        .filter(|s| !s.is_empty())
        .map(|s| s.trim().to_lowercase())
        // `len()` counts bytes, so the limit applies to the UTF-8 size of the
        // lowercased token rather than to its number of characters.
        .filter(|s| s.len() <= MAX_WORD_LENGTH_TO_INDEX)
        .collect()
}
27
28/// Creates all files required for search.
29pub fn create_files(search_config: &Search, destination: &Path, book: &Book) -> Result<()> {
30 let mut index = IndexBuilder::new()
31 .add_field_with_tokenizer("title", Box::new(&tokenize))
32 .add_field_with_tokenizer("body", Box::new(&tokenize))
33 .add_field_with_tokenizer("breadcrumbs", Box::new(&tokenize))
34 .build();
35
36 let mut doc_urls = Vec::with_capacity(book.sections.len());
37
38 for item in book.iter() {
39 render_item(&mut index, search_config, &mut doc_urls, item)?;
40 }
41
42 let index = write_to_json(index, search_config, doc_urls)?;
43 debug!("Writing search index ✓");
44 if index.len() > 10_000_000 {
45 warn!("searchindex.json is very large ({} bytes)", index.len());
46 }
47
48 if search_config.copy_js {
49 utils::fs::write_file(destination, "searchindex.json", index.as_bytes())?;
50 utils::fs::write_file(
51 destination,
52 "searchindex.js",
53 format!("Object.assign(window.search, {});", index).as_bytes(),
54 )?;
55 utils::fs::write_file(destination, "searcher.js", searcher::JS)?;
56 utils::fs::write_file(destination, "mark.min.js", searcher::MARK_JS)?;
57 utils::fs::write_file(destination, "elasticlunr.min.js", searcher::ELASTICLUNR_JS)?;
58 debug!("Copying search files ✓");
59 }
60
61 Ok(())
62}
63
64/// Uses the given arguments to construct a search document, then inserts it to the given index.
65fn add_doc(
66 index: &mut Index,
67 doc_urls: &mut Vec<String>,
68 anchor_base: &str,
69 section_id: &Option<String>,
70 items: &[&str],
71) {
72 let url: Cow<'_, str> = if let Some(ref id: &String) = *section_id {
73 Cow::Owned(format!("{}#{}", anchor_base, id))
74 } else {
75 Cow::Borrowed(anchor_base)
76 };
77 let url: Cow<'_, str> = utils::collapse_whitespace(text:url.trim());
78 let doc_ref: String = doc_urls.len().to_string();
79 doc_urls.push(url.into());
80
81 let items: impl Iterator> = items.iter().map(|&x: &str| utils::collapse_whitespace(text:x.trim()));
82 index.add_doc(&doc_ref, data:items);
83}
84
85/// Renders markdown into flat unformatted text and adds it to the search index.
86fn render_item(
87 index: &mut Index,
88 search_config: &Search,
89 doc_urls: &mut Vec<String>,
90 item: &BookItem,
91) -> Result<()> {
92 let chapter = match *item {
93 BookItem::Chapter(ref ch) if !ch.is_draft_chapter() => ch,
94 _ => return Ok(()),
95 };
96
97 let chapter_path = chapter
98 .path
99 .as_ref()
100 .expect("Checked that path exists above");
101 let filepath = Path::new(&chapter_path).with_extension("html");
102 let filepath = filepath
103 .to_str()
104 .with_context(|| "Could not convert HTML path to str")?;
105 let anchor_base = utils::fs::normalize_path(filepath);
106
107 let mut p = utils::new_cmark_parser(&chapter.content, false).peekable();
108
109 let mut in_heading = false;
110 let max_section_depth = u32::from(search_config.heading_split_level);
111 let mut section_id = None;
112 let mut heading = String::new();
113 let mut body = String::new();
114 let mut breadcrumbs = chapter.parent_names.clone();
115 let mut footnote_numbers = HashMap::new();
116
117 breadcrumbs.push(chapter.name.clone());
118
119 let mut id_counter = HashMap::new();
120 while let Some(event) = p.next() {
121 match event {
122 Event::Start(Tag::Heading(i, ..)) if i as u32 <= max_section_depth => {
123 if !heading.is_empty() {
124 // Section finished, the next heading is following now
125 // Write the data to the index, and clear it for the next section
126 add_doc(
127 index,
128 doc_urls,
129 &anchor_base,
130 &section_id,
131 &[&heading, &body, &breadcrumbs.join(" » ")],
132 );
133 section_id = None;
134 heading.clear();
135 body.clear();
136 breadcrumbs.pop();
137 }
138
139 in_heading = true;
140 }
141 Event::End(Tag::Heading(i, id, _classes)) if i as u32 <= max_section_depth => {
142 in_heading = false;
143 section_id = id
144 .map(|id| id.to_string())
145 .or_else(|| Some(utils::unique_id_from_content(&heading, &mut id_counter)));
146 breadcrumbs.push(heading.clone());
147 }
148 Event::Start(Tag::FootnoteDefinition(name)) => {
149 let number = footnote_numbers.len() + 1;
150 footnote_numbers.entry(name).or_insert(number);
151 }
152 Event::Html(html) => {
153 let mut html_block = html.into_string();
154
155 // As of pulldown_cmark 0.6, html events are no longer contained
156 // in an HtmlBlock tag. We must collect consecutive Html events
157 // into a block ourselves.
158 while let Some(Event::Html(html)) = p.peek() {
159 html_block.push_str(html);
160 p.next();
161 }
162
163 body.push_str(&clean_html(&html_block));
164 }
165 Event::Start(_) | Event::End(_) | Event::Rule | Event::SoftBreak | Event::HardBreak => {
166 // Insert spaces where HTML output would usually separate text
167 // to ensure words don't get merged together
168 if in_heading {
169 heading.push(' ');
170 } else {
171 body.push(' ');
172 }
173 }
174 Event::Text(text) | Event::Code(text) => {
175 if in_heading {
176 heading.push_str(&text);
177 } else {
178 body.push_str(&text);
179 }
180 }
181 Event::FootnoteReference(name) => {
182 let len = footnote_numbers.len() + 1;
183 let number = footnote_numbers.entry(name).or_insert(len);
184 body.push_str(&format!(" [{}] ", number));
185 }
186 Event::TaskListMarker(_checked) => {}
187 }
188 }
189
190 if !body.is_empty() || !heading.is_empty() {
191 if heading.is_empty() {
192 if let Some(chapter) = breadcrumbs.first() {
193 heading = chapter.clone();
194 }
195 }
196 // Make sure the last section is added to the index
197 add_doc(
198 index,
199 doc_urls,
200 &anchor_base,
201 &section_id,
202 &[&heading, &body, &breadcrumbs.join(" » ")],
203 );
204 }
205
206 Ok(())
207}
208
209fn write_to_json(index: Index, search_config: &Search, doc_urls: Vec<String>) -> Result<String> {
210 use elasticlunr::config::{SearchBool, SearchOptions, SearchOptionsField};
211 use std::collections::BTreeMap;
212
213 #[derive(Serialize)]
214 struct ResultsOptions {
215 limit_results: u32,
216 teaser_word_count: u32,
217 }
218
219 #[derive(Serialize)]
220 struct SearchindexJson {
221 /// The options used for displaying search results
222 results_options: ResultsOptions,
223 /// The searchoptions for elasticlunr.js
224 search_options: SearchOptions,
225 /// Used to lookup a document's URL from an integer document ref.
226 doc_urls: Vec<String>,
227 /// The index for elasticlunr.js
228 index: elasticlunr::Index,
229 }
230
231 let mut fields = BTreeMap::new();
232 let mut opt = SearchOptionsField::default();
233 let mut insert_boost = |key: &str, boost| {
234 opt.boost = Some(boost);
235 fields.insert(key.into(), opt);
236 };
237 insert_boost("title", search_config.boost_title);
238 insert_boost("body", search_config.boost_paragraph);
239 insert_boost("breadcrumbs", search_config.boost_hierarchy);
240
241 let search_options = SearchOptions {
242 bool: if search_config.use_boolean_and {
243 SearchBool::And
244 } else {
245 SearchBool::Or
246 },
247 expand: search_config.expand,
248 fields,
249 };
250
251 let results_options = ResultsOptions {
252 limit_results: search_config.limit_results,
253 teaser_word_count: search_config.teaser_word_count,
254 };
255
256 let json_contents = SearchindexJson {
257 results_options,
258 search_options,
259 doc_urls,
260 index,
261 };
262
263 // By converting to serde_json::Value as an intermediary, we use a
264 // BTreeMap internally and can force a stable ordering of map keys.
265 let json_contents = serde_json::to_value(&json_contents)?;
266 let json_contents = serde_json::to_string(&json_contents)?;
267
268 Ok(json_contents)
269}
270
271fn clean_html(html: &str) -> String {
272 static AMMONIA: Lazy<ammonia::Builder<'static>> = Lazy::new(|| {
273 let mut clean_content: HashSet<&str> = HashSet::new();
274 clean_content.insert("script");
275 clean_content.insert("style");
276 let mut builder: Builder<'_> = ammonia::Builder::new();
277 builder&mut Builder<'_>
278 .tags(HashSet::new())
279 .tag_attributes(HashMap::new())
280 .generic_attributes(HashSet::new())
281 .link_rel(None)
282 .allowed_classes(HashMap::new())
283 .clean_content_tags(clean_content);
284 builder
285 });
286 AMMONIA.clean(src:html).to_string()
287}
288