1 | //!# elasticlunr-rs |
2 | //! |
3 | //! [![Build Status](https://travis-ci.org/mattico/elasticlunr-rs.svg?branch=master)](https://travis-ci.org/mattico/elasticlunr-rs) |
4 | //! [![Documentation](https://docs.rs/elasticlunr-rs/badge.svg)](https://docs.rs/elasticlunr-rs) |
5 | //! [![Crates.io](https://img.shields.io/crates/v/elasticlunr-rs.svg)](https://crates.io/crates/elasticlunr-rs) |
6 | //! |
7 | //! A partial port of [elasticlunr](https://github.com/weixsong/elasticlunr.js) to Rust. Intended to |
8 | //! be used for generating compatible search indices. |
9 | //! |
10 | //! Access to all index-generating functionality is provided. Most users will only need to use the |
11 | //! [`Index`](struct.Index.html) or [`IndexBuilder`](struct.IndexBuilder.html) types. |
12 | //! |
13 | //! The [`Language`] trait can be used to implement a custom language. |
14 | //! |
15 | //! ## Example |
16 | //! |
17 | //! ``` |
18 | //! use std::fs::File; |
19 | //! use std::io::Write; |
20 | //! use elasticlunr::Index; |
21 | //! |
22 | //! let mut index = Index::new(&["title" , "body" ]); |
23 | //! index.add_doc("1" , &["This is a title" , "This is body text!" ]); |
24 | //! // Add more docs... |
25 | //! let mut file = File::create("out.json" ).unwrap(); |
//! file.write_all(index.to_json_pretty().as_bytes()).unwrap();
27 | //! ``` |
28 | |
29 | #[macro_use ] |
30 | extern crate serde_derive; |
31 | |
32 | #[cfg (test)] |
33 | #[macro_use ] |
34 | extern crate maplit; |
35 | |
/// The version of elasticlunr.js this library was designed for.
///
/// `IndexBuilder::build` copies this value into the `version` field of every `Index` it creates.
pub const ELASTICLUNR_VERSION: &str = "0.9.5";
38 | |
39 | pub mod config; |
40 | pub mod document_store; |
41 | pub mod inverted_index; |
42 | pub mod lang; |
43 | pub mod pipeline; |
44 | |
45 | use std::collections::BTreeMap; |
46 | |
47 | use document_store::DocumentStore; |
48 | use inverted_index::InvertedIndex; |
49 | use lang::English; |
50 | pub use lang::Language; |
51 | pub use pipeline::Pipeline; |
52 | |
/// An optional custom tokenizer for a single field.
///
/// `Some` holds a boxed closure that turns a field's text into a list of tokens. `None` means
/// no custom tokenizer was registered for the field, in which case `Index::add_doc` tokenizes
/// with the index's language instead.
type Tokenizer = Option<Box<dyn Fn(&str) -> Vec<String>>>;
54 | |
/// A builder for an `Index` with custom parameters.
///
/// Configure the builder by chaining its methods (each one consumes and returns the builder),
/// then call `build` to obtain the `Index`.
///
/// # Example
/// ```
/// # use elasticlunr::{Index, IndexBuilder};
/// let mut index = IndexBuilder::new()
///     .save_docs(false)
///     .add_fields(&["title", "subtitle", "body"])
///     .set_ref("doc_id")
///     .build();
/// index.add_doc("doc_a", &["Chapter 1", "Welcome to Copenhagen", "..."]);
/// ```
pub struct IndexBuilder {
    /// Passed to `DocumentStore::new` when the index is built.
    save: bool,
    /// Names of the document fields, in the order they were added.
    fields: Vec<String>,
    /// One entry per element of `fields` (same order); `None` when the field has no custom
    /// tokenizer.
    field_tokenizers: Vec<Tokenizer>,
    /// Key under which a document's reference is stored.
    ref_field: String,
    /// Explicit pipeline for the index; when `None`, `build` asks `language` to make one.
    pipeline: Option<Pipeline>,
    /// Language of the index being built.
    language: Box<dyn Language>,
}
75 | |
76 | impl Default for IndexBuilder { |
77 | fn default() -> Self { |
78 | IndexBuilder { |
79 | save: true, |
80 | fields: Vec::new(), |
81 | field_tokenizers: Vec::new(), |
82 | ref_field: "id" .into(), |
83 | pipeline: None, |
84 | language: Box::new(English::new()), |
85 | } |
86 | } |
87 | } |
88 | |
89 | impl IndexBuilder { |
90 | pub fn new() -> Self { |
91 | Default::default() |
92 | } |
93 | |
94 | pub fn with_language(language: Box<dyn Language>) -> Self { |
95 | Self { |
96 | language, |
97 | ..Default::default() |
98 | } |
99 | } |
100 | |
101 | /// Set whether or not documents should be saved in the `Index`'s document store. |
102 | pub fn save_docs(mut self, save: bool) -> Self { |
103 | self.save = save; |
104 | self |
105 | } |
106 | |
107 | /// Add a document field to the `Index`. |
108 | /// |
109 | /// # Panics |
110 | /// |
111 | /// Panics if a field with the name already exists. |
112 | pub fn add_field(mut self, field: &str) -> Self { |
113 | let field = field.into(); |
114 | if self.fields.contains(&field) { |
115 | panic!("Duplicate fields in index: {}" , field); |
116 | } |
117 | self.fields.push(field); |
118 | self.field_tokenizers.push(None); |
119 | self |
120 | } |
121 | |
122 | /// Add a document field to the `Index`, with a custom tokenizer for that field. |
123 | /// |
124 | /// # Panics |
125 | /// |
126 | /// Panics if a field with the name already exists. |
127 | pub fn add_field_with_tokenizer( |
128 | mut self, |
129 | field: &str, |
130 | tokenizer: Box<dyn Fn(&str) -> Vec<String>>, |
131 | ) -> Self { |
132 | let field = field.into(); |
133 | if self.fields.contains(&field) { |
134 | panic!("Duplicate fields in index: {}" , field); |
135 | } |
136 | self.fields.push(field); |
137 | self.field_tokenizers.push(Some(tokenizer)); |
138 | self |
139 | } |
140 | |
141 | /// Add the document fields to the `Index`. |
142 | /// |
143 | /// # Panics |
144 | /// |
145 | /// Panics if two fields have the same name. |
146 | pub fn add_fields<I>(mut self, fields: I) -> Self |
147 | where |
148 | I: IntoIterator, |
149 | I::Item: AsRef<str>, |
150 | { |
151 | for field in fields { |
152 | self = self.add_field(field.as_ref()) |
153 | } |
154 | self |
155 | } |
156 | |
157 | /// Set the key used to store the document reference field. |
158 | pub fn set_ref(mut self, ref_field: &str) -> Self { |
159 | self.ref_field = ref_field.into(); |
160 | self |
161 | } |
162 | |
163 | /// Build an `Index` from this builder. |
164 | pub fn build(self) -> Index { |
165 | let IndexBuilder { |
166 | save, |
167 | fields, |
168 | field_tokenizers, |
169 | ref_field, |
170 | pipeline, |
171 | language, |
172 | } = self; |
173 | |
174 | let index = fields |
175 | .iter() |
176 | .map(|f| (f.clone(), InvertedIndex::new())) |
177 | .collect(); |
178 | |
179 | let pipeline = pipeline.unwrap_or_else(|| language.make_pipeline()); |
180 | |
181 | Index { |
182 | index, |
183 | fields, |
184 | field_tokenizers, |
185 | ref_field, |
186 | document_store: DocumentStore::new(save), |
187 | pipeline, |
188 | version: crate::ELASTICLUNR_VERSION, |
189 | lang: language, |
190 | } |
191 | } |
192 | } |
193 | |
/// An elasticlunr search index.
///
/// Create one with `Index::new`, `Index::with_language` or an `IndexBuilder`, fill it with
/// `add_doc`, and export it with `to_json` / `to_json_pretty`. Field names are serialized in
/// camelCase.
#[derive(Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct Index {
    /// Names of the indexed fields; `add_doc` expects its data in this order.
    fields: Vec<String>,
    /// Custom tokenizers, one slot per entry of `fields`. Not serialized.
    #[serde(skip)]
    field_tokenizers: Vec<Tokenizer>,
    /// Pipeline every token list is run through before it is indexed.
    pipeline: Pipeline,
    /// Key under which a document's reference is stored; serialized as `ref`.
    #[serde(rename = "ref")]
    ref_field: String,
    /// Set to `ELASTICLUNR_VERSION` when the index is built.
    version: &'static str,
    /// One inverted index per field, keyed by field name.
    index: BTreeMap<String, InvertedIndex>,
    /// Store receiving the documents and per-field token counts passed to it by `add_doc`.
    document_store: DocumentStore,
    /// Language of the index; (de)serialized as its name through the `ser_lang` module.
    #[serde(with = "ser_lang")]
    lang: Box<dyn Language>,
}
210 | |
211 | mod ser_lang { |
212 | use crate::Language; |
213 | use serde::de; |
214 | use serde::{Deserializer, Serializer}; |
215 | use std::fmt; |
216 | |
217 | pub fn serialize<S>(lang: &Box<dyn Language>, serializer: S) -> Result<S::Ok, S::Error> |
218 | where |
219 | S: Serializer, |
220 | { |
221 | serializer.serialize_str(&lang.name()) |
222 | } |
223 | |
224 | pub fn deserialize<'de, D>(deserializer: D) -> Result<Box<dyn Language>, D::Error> |
225 | where |
226 | D: Deserializer<'de>, |
227 | { |
228 | deserializer.deserialize_str(LanguageVisitor) |
229 | } |
230 | |
231 | struct LanguageVisitor; |
232 | |
233 | impl<'de> de::Visitor<'de> for LanguageVisitor { |
234 | type Value = Box<dyn Language>; |
235 | |
236 | fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result { |
237 | formatter.write_str("a capitalized language name" ) |
238 | } |
239 | |
240 | fn visit_borrowed_str<E>(self, v: &'de str) -> Result<Self::Value, E> |
241 | where |
242 | E: de::Error, |
243 | { |
244 | match crate::lang::from_name(v) { |
245 | Some(l) => Ok(l), |
246 | None => Err(E::custom(format!("Unknown language name: {}" , v))), |
247 | } |
248 | } |
249 | } |
250 | } |
251 | |
252 | impl Index { |
253 | /// Create a new index with the provided fields. |
254 | /// |
255 | /// # Example |
256 | /// |
257 | /// ``` |
258 | /// # use elasticlunr::{Index}; |
259 | /// let mut index = Index::new(&["title" , "body" ]); |
260 | /// index.add_doc("1" , &["this is a title" , "this is body text" ]); |
261 | /// ``` |
262 | /// |
263 | /// # Panics |
264 | /// |
265 | /// Panics if a field with the name already exists. |
266 | pub fn new<I>(fields: I) -> Self |
267 | where |
268 | I: IntoIterator, |
269 | I::Item: AsRef<str>, |
270 | { |
271 | IndexBuilder::new().add_fields(fields).build() |
272 | } |
273 | |
274 | /// Create a new index with the provided fields for the given |
275 | /// [`Language`](lang/enum.Language.html). |
276 | /// |
277 | /// # Example |
278 | /// |
279 | /// ``` |
280 | /// use elasticlunr::{Index, lang::English}; |
281 | /// let mut index = Index::with_language(Box::new(English::new()), &["title" , "body" ]); |
282 | /// index.add_doc("1" , &["this is a title" , "this is body text" ]); |
283 | /// ``` |
284 | /// |
285 | /// # Panics |
286 | /// |
287 | /// Panics if a field with the name already exists. |
288 | pub fn with_language<I>(lang: Box<dyn Language>, fields: I) -> Self |
289 | where |
290 | I: IntoIterator, |
291 | I::Item: AsRef<str>, |
292 | { |
293 | IndexBuilder::with_language(lang).add_fields(fields).build() |
294 | } |
295 | |
296 | /// Add the data from a document to the index. |
297 | /// |
298 | /// *NOTE: The elements of `data` should be provided in the same order as |
299 | /// the fields used to create the index.* |
300 | /// |
301 | /// # Example |
302 | /// ``` |
303 | /// # use elasticlunr::Index; |
304 | /// let mut index = Index::new(&["title" , "body" ]); |
305 | /// index.add_doc("1" , &["this is a title" , "this is body text" ]); |
306 | /// ``` |
307 | pub fn add_doc<I>(&mut self, doc_ref: &str, data: I) |
308 | where |
309 | I: IntoIterator, |
310 | I::Item: AsRef<str>, |
311 | { |
312 | let mut doc = BTreeMap::new(); |
313 | doc.insert(self.ref_field.clone(), doc_ref.into()); |
314 | let mut token_freq = BTreeMap::new(); |
315 | |
316 | for (i, value) in data.into_iter().enumerate() { |
317 | let field = &self.fields[i]; |
318 | let tokenizer = self.field_tokenizers[i].as_ref(); |
319 | doc.insert(field.clone(), value.as_ref().to_string()); |
320 | |
321 | if field == &self.ref_field { |
322 | continue; |
323 | } |
324 | |
325 | let raw_tokens = if let Some(tokenizer) = tokenizer { |
326 | tokenizer(value.as_ref()) |
327 | } else { |
328 | self.lang.tokenize(value.as_ref()) |
329 | }; |
330 | |
331 | let tokens = self.pipeline.run(raw_tokens); |
332 | |
333 | self.document_store |
334 | .add_field_length(doc_ref, field, tokens.len()); |
335 | |
336 | for token in tokens { |
337 | *token_freq.entry(token).or_insert(0u64) += 1; |
338 | } |
339 | |
340 | for (token, count) in &token_freq { |
341 | let freq = (*count as f64).sqrt(); |
342 | |
343 | self.index |
344 | .get_mut(field) |
345 | .unwrap_or_else(|| panic!("InvertedIndex does not exist for field {}" , field)) |
346 | .add_token(doc_ref, token, freq); |
347 | } |
348 | } |
349 | |
350 | self.document_store.add_doc(doc_ref, doc); |
351 | } |
352 | |
353 | pub fn get_fields(&self) -> &[String] { |
354 | &self.fields |
355 | } |
356 | |
357 | /// Returns the index, serialized to pretty-printed JSON. |
358 | pub fn to_json_pretty(&self) -> String { |
359 | serde_json::to_string_pretty(&self).unwrap() |
360 | } |
361 | |
362 | /// Returns the index, serialized to JSON. |
363 | pub fn to_json(&self) -> String { |
364 | serde_json::to_string(&self).unwrap() |
365 | } |
366 | } |
367 | |
#[cfg(test)]
mod tests {
    use super::*;

    /// Every field handed to the builder appears exactly once in the built index.
    #[test]
    fn add_field_to_builder() {
        let names = ["foo", "bar", "baz"];
        let idx = IndexBuilder::new().add_fields(&names).build();

        let idx_fields = idx.get_fields();
        for f in &names {
            let occurrences = idx_fields.iter().filter(|x| x == f).count();
            assert_eq!(occurrences, 1);
        }
    }

    /// A document added to the index is kept, reference included, in the document store.
    #[test]
    fn adding_document_to_index() {
        let mut idx = Index::new(&["body"]);
        idx.add_doc("1", &["this is a test"]);

        assert_eq!(idx.document_store.len(), 1);

        let stored = idx.document_store.get_doc("1").unwrap();
        assert_eq!(
            stored,
            btreemap! {
                "id".into() => "1".into(),
                "body".into() => "this is a test".into(),
            }
        );
    }

    /// An empty field does not prevent the remaining fields from being indexed.
    #[test]
    fn adding_document_with_empty_field() {
        let mut idx = Index::new(&["title", "body"]);
        idx.add_doc("1", &["", "test"]);

        let body = &idx.index["body"];
        assert_eq!(body.get_doc_frequency("test"), 1);
        assert_eq!(body.get_docs("test").unwrap()["1"], 1.);
    }

    /// Passing the same field name twice is rejected with a panic.
    #[test]
    #[should_panic]
    fn creating_index_with_identical_fields_panics() {
        let _ = Index::new(&["title", "body", "title"]);
    }
}
414 | |