1 | use std::borrow::Cow; |
2 | use std::collections::{HashMap, HashSet}; |
3 | use std::path::Path; |
4 | |
5 | use elasticlunr::{Index, IndexBuilder}; |
6 | use once_cell::sync::Lazy; |
7 | use pulldown_cmark::*; |
8 | |
9 | use crate::book::{Book, BookItem}; |
10 | use crate::config::Search; |
11 | use crate::errors::*; |
12 | use crate::theme::searcher; |
13 | use crate::utils; |
14 | use log::{debug, warn}; |
15 | use serde::Serialize; |
16 | |
/// Longest token, in bytes, that is kept in the search index.
const MAX_WORD_LENGTH_TO_INDEX: usize = 80;

/// Tokenizes in the same way as elasticlunr-rs (for English), but also drops long tokens.
///
/// The text is split on whitespace and on `-`; empty pieces are discarded and
/// every remaining piece is lowercased. Tokens whose UTF-8 length exceeds
/// [`MAX_WORD_LENGTH_TO_INDEX`] bytes are left out of the result.
fn tokenize(text: &str) -> Vec<String> {
    text.split(|c: char| c.is_whitespace() || c == '-')
        .filter(|s| !s.is_empty())
        .map(|s| s.trim().to_lowercase())
        // `len()` counts bytes, so the limit applies to the encoded length.
        .filter(|s| s.len() <= MAX_WORD_LENGTH_TO_INDEX)
        .collect()
}
27 | |
28 | /// Creates all files required for search. |
29 | pub fn create_files(search_config: &Search, destination: &Path, book: &Book) -> Result<()> { |
30 | let mut index = IndexBuilder::new() |
31 | .add_field_with_tokenizer("title" , Box::new(&tokenize)) |
32 | .add_field_with_tokenizer("body" , Box::new(&tokenize)) |
33 | .add_field_with_tokenizer("breadcrumbs" , Box::new(&tokenize)) |
34 | .build(); |
35 | |
36 | let mut doc_urls = Vec::with_capacity(book.sections.len()); |
37 | |
38 | for item in book.iter() { |
39 | render_item(&mut index, search_config, &mut doc_urls, item)?; |
40 | } |
41 | |
42 | let index = write_to_json(index, search_config, doc_urls)?; |
43 | debug!("Writing search index ✓" ); |
44 | if index.len() > 10_000_000 { |
45 | warn!("searchindex.json is very large ( {} bytes)" , index.len()); |
46 | } |
47 | |
48 | if search_config.copy_js { |
49 | utils::fs::write_file(destination, "searchindex.json" , index.as_bytes())?; |
50 | utils::fs::write_file( |
51 | destination, |
52 | "searchindex.js" , |
53 | format!("Object.assign(window.search, {});" , index).as_bytes(), |
54 | )?; |
55 | utils::fs::write_file(destination, "searcher.js" , searcher::JS)?; |
56 | utils::fs::write_file(destination, "mark.min.js" , searcher::MARK_JS)?; |
57 | utils::fs::write_file(destination, "elasticlunr.min.js" , searcher::ELASTICLUNR_JS)?; |
58 | debug!("Copying search files ✓" ); |
59 | } |
60 | |
61 | Ok(()) |
62 | } |
63 | |
64 | /// Uses the given arguments to construct a search document, then inserts it to the given index. |
65 | fn add_doc( |
66 | index: &mut Index, |
67 | doc_urls: &mut Vec<String>, |
68 | anchor_base: &str, |
69 | section_id: &Option<String>, |
70 | items: &[&str], |
71 | ) { |
72 | let url: Cow<'_, str> = if let Some(ref id: &String) = *section_id { |
73 | Cow::Owned(format!(" {}# {}" , anchor_base, id)) |
74 | } else { |
75 | Cow::Borrowed(anchor_base) |
76 | }; |
77 | let url: Cow<'_, str> = utils::collapse_whitespace(text:url.trim()); |
78 | let doc_ref: String = doc_urls.len().to_string(); |
79 | doc_urls.push(url.into()); |
80 | |
81 | let items: impl Iterator- >
= items.iter().map(|&x: &str| utils::collapse_whitespace(text:x.trim())); |
82 | index.add_doc(&doc_ref, data:items); |
83 | } |
84 | |
85 | /// Renders markdown into flat unformatted text and adds it to the search index. |
86 | fn render_item( |
87 | index: &mut Index, |
88 | search_config: &Search, |
89 | doc_urls: &mut Vec<String>, |
90 | item: &BookItem, |
91 | ) -> Result<()> { |
92 | let chapter = match *item { |
93 | BookItem::Chapter(ref ch) if !ch.is_draft_chapter() => ch, |
94 | _ => return Ok(()), |
95 | }; |
96 | |
97 | let chapter_path = chapter |
98 | .path |
99 | .as_ref() |
100 | .expect("Checked that path exists above" ); |
101 | let filepath = Path::new(&chapter_path).with_extension("html" ); |
102 | let filepath = filepath |
103 | .to_str() |
104 | .with_context(|| "Could not convert HTML path to str" )?; |
105 | let anchor_base = utils::fs::normalize_path(filepath); |
106 | |
107 | let mut p = utils::new_cmark_parser(&chapter.content, false).peekable(); |
108 | |
109 | let mut in_heading = false; |
110 | let max_section_depth = u32::from(search_config.heading_split_level); |
111 | let mut section_id = None; |
112 | let mut heading = String::new(); |
113 | let mut body = String::new(); |
114 | let mut breadcrumbs = chapter.parent_names.clone(); |
115 | let mut footnote_numbers = HashMap::new(); |
116 | |
117 | breadcrumbs.push(chapter.name.clone()); |
118 | |
119 | let mut id_counter = HashMap::new(); |
120 | while let Some(event) = p.next() { |
121 | match event { |
122 | Event::Start(Tag::Heading(i, ..)) if i as u32 <= max_section_depth => { |
123 | if !heading.is_empty() { |
124 | // Section finished, the next heading is following now |
125 | // Write the data to the index, and clear it for the next section |
126 | add_doc( |
127 | index, |
128 | doc_urls, |
129 | &anchor_base, |
130 | §ion_id, |
131 | &[&heading, &body, &breadcrumbs.join(" » " )], |
132 | ); |
133 | section_id = None; |
134 | heading.clear(); |
135 | body.clear(); |
136 | breadcrumbs.pop(); |
137 | } |
138 | |
139 | in_heading = true; |
140 | } |
141 | Event::End(Tag::Heading(i, id, _classes)) if i as u32 <= max_section_depth => { |
142 | in_heading = false; |
143 | section_id = id |
144 | .map(|id| id.to_string()) |
145 | .or_else(|| Some(utils::unique_id_from_content(&heading, &mut id_counter))); |
146 | breadcrumbs.push(heading.clone()); |
147 | } |
148 | Event::Start(Tag::FootnoteDefinition(name)) => { |
149 | let number = footnote_numbers.len() + 1; |
150 | footnote_numbers.entry(name).or_insert(number); |
151 | } |
152 | Event::Html(html) => { |
153 | let mut html_block = html.into_string(); |
154 | |
155 | // As of pulldown_cmark 0.6, html events are no longer contained |
156 | // in an HtmlBlock tag. We must collect consecutive Html events |
157 | // into a block ourselves. |
158 | while let Some(Event::Html(html)) = p.peek() { |
159 | html_block.push_str(html); |
160 | p.next(); |
161 | } |
162 | |
163 | body.push_str(&clean_html(&html_block)); |
164 | } |
165 | Event::Start(_) | Event::End(_) | Event::Rule | Event::SoftBreak | Event::HardBreak => { |
166 | // Insert spaces where HTML output would usually separate text |
167 | // to ensure words don't get merged together |
168 | if in_heading { |
169 | heading.push(' ' ); |
170 | } else { |
171 | body.push(' ' ); |
172 | } |
173 | } |
174 | Event::Text(text) | Event::Code(text) => { |
175 | if in_heading { |
176 | heading.push_str(&text); |
177 | } else { |
178 | body.push_str(&text); |
179 | } |
180 | } |
181 | Event::FootnoteReference(name) => { |
182 | let len = footnote_numbers.len() + 1; |
183 | let number = footnote_numbers.entry(name).or_insert(len); |
184 | body.push_str(&format!(" [ {}] " , number)); |
185 | } |
186 | Event::TaskListMarker(_checked) => {} |
187 | } |
188 | } |
189 | |
190 | if !body.is_empty() || !heading.is_empty() { |
191 | if heading.is_empty() { |
192 | if let Some(chapter) = breadcrumbs.first() { |
193 | heading = chapter.clone(); |
194 | } |
195 | } |
196 | // Make sure the last section is added to the index |
197 | add_doc( |
198 | index, |
199 | doc_urls, |
200 | &anchor_base, |
201 | §ion_id, |
202 | &[&heading, &body, &breadcrumbs.join(" » " )], |
203 | ); |
204 | } |
205 | |
206 | Ok(()) |
207 | } |
208 | |
209 | fn write_to_json(index: Index, search_config: &Search, doc_urls: Vec<String>) -> Result<String> { |
210 | use elasticlunr::config::{SearchBool, SearchOptions, SearchOptionsField}; |
211 | use std::collections::BTreeMap; |
212 | |
213 | #[derive (Serialize)] |
214 | struct ResultsOptions { |
215 | limit_results: u32, |
216 | teaser_word_count: u32, |
217 | } |
218 | |
219 | #[derive (Serialize)] |
220 | struct SearchindexJson { |
221 | /// The options used for displaying search results |
222 | results_options: ResultsOptions, |
223 | /// The searchoptions for elasticlunr.js |
224 | search_options: SearchOptions, |
225 | /// Used to lookup a document's URL from an integer document ref. |
226 | doc_urls: Vec<String>, |
227 | /// The index for elasticlunr.js |
228 | index: elasticlunr::Index, |
229 | } |
230 | |
231 | let mut fields = BTreeMap::new(); |
232 | let mut opt = SearchOptionsField::default(); |
233 | let mut insert_boost = |key: &str, boost| { |
234 | opt.boost = Some(boost); |
235 | fields.insert(key.into(), opt); |
236 | }; |
237 | insert_boost("title" , search_config.boost_title); |
238 | insert_boost("body" , search_config.boost_paragraph); |
239 | insert_boost("breadcrumbs" , search_config.boost_hierarchy); |
240 | |
241 | let search_options = SearchOptions { |
242 | bool: if search_config.use_boolean_and { |
243 | SearchBool::And |
244 | } else { |
245 | SearchBool::Or |
246 | }, |
247 | expand: search_config.expand, |
248 | fields, |
249 | }; |
250 | |
251 | let results_options = ResultsOptions { |
252 | limit_results: search_config.limit_results, |
253 | teaser_word_count: search_config.teaser_word_count, |
254 | }; |
255 | |
256 | let json_contents = SearchindexJson { |
257 | results_options, |
258 | search_options, |
259 | doc_urls, |
260 | index, |
261 | }; |
262 | |
263 | // By converting to serde_json::Value as an intermediary, we use a |
264 | // BTreeMap internally and can force a stable ordering of map keys. |
265 | let json_contents = serde_json::to_value(&json_contents)?; |
266 | let json_contents = serde_json::to_string(&json_contents)?; |
267 | |
268 | Ok(json_contents) |
269 | } |
270 | |
271 | fn clean_html(html: &str) -> String { |
272 | static AMMONIA: Lazy<ammonia::Builder<'static>> = Lazy::new(|| { |
273 | let mut clean_content: HashSet<&str> = HashSet::new(); |
274 | clean_content.insert("script" ); |
275 | clean_content.insert("style" ); |
276 | let mut builder: Builder<'_> = ammonia::Builder::new(); |
277 | builder&mut Builder<'_> |
278 | .tags(HashSet::new()) |
279 | .tag_attributes(HashMap::new()) |
280 | .generic_attributes(HashSet::new()) |
281 | .link_rel(None) |
282 | .allowed_classes(HashMap::new()) |
283 | .clean_content_tags(clean_content); |
284 | builder |
285 | }); |
286 | AMMONIA.clean(src:html).to_string() |
287 | } |
288 | |