1//!# elasticlunr-rs
2//!
3//! [![Build Status](https://travis-ci.org/mattico/elasticlunr-rs.svg?branch=master)](https://travis-ci.org/mattico/elasticlunr-rs)
4//! [![Documentation](https://docs.rs/elasticlunr-rs/badge.svg)](https://docs.rs/elasticlunr-rs)
5//! [![Crates.io](https://img.shields.io/crates/v/elasticlunr-rs.svg)](https://crates.io/crates/elasticlunr-rs)
6//!
7//! A partial port of [elasticlunr](https://github.com/weixsong/elasticlunr.js) to Rust. Intended to
8//! be used for generating compatible search indices.
9//!
10//! Access to all index-generating functionality is provided. Most users will only need to use the
11//! [`Index`](struct.Index.html) or [`IndexBuilder`](struct.IndexBuilder.html) types.
12//!
13//! The [`Language`] trait can be used to implement a custom language.
14//!
15//! ## Example
16//!
17//! ```
18//! use std::fs::File;
19//! use std::io::Write;
20//! use elasticlunr::Index;
21//!
22//! let mut index = Index::new(&["title", "body"]);
23//! index.add_doc("1", &["This is a title", "This is body text!"]);
24//! // Add more docs...
25//! let mut file = File::create("out.json").unwrap();
//! file.write_all(index.to_json_pretty().as_bytes()).unwrap();
27//! ```
28
29#[macro_use]
30extern crate serde_derive;
31
32#[cfg(test)]
33#[macro_use]
34extern crate maplit;
35
36/// The version of elasticlunr.js this library was designed for.
37pub const ELASTICLUNR_VERSION: &str = "0.9.5";
38
39pub mod config;
40pub mod document_store;
41pub mod inverted_index;
42pub mod lang;
43pub mod pipeline;
44
45use std::collections::BTreeMap;
46
47use document_store::DocumentStore;
48use inverted_index::InvertedIndex;
49use lang::English;
50pub use lang::Language;
51pub use pipeline::Pipeline;
52
/// Signature of a custom tokenizer: receives a field's text and returns its tokens.
type TokenizerFn = dyn Fn(&str) -> Vec<String>;

/// Optional custom tokenizer for one field; `None` means the field has no custom tokenizer.
type Tokenizer = Option<Box<TokenizerFn>>;
54
55/// A builder for an `Index` with custom parameters.
56///
57/// # Example
58/// ```
59/// # use elasticlunr::{Index, IndexBuilder};
60/// let mut index = IndexBuilder::new()
61/// .save_docs(false)
62/// .add_fields(&["title", "subtitle", "body"])
63/// .set_ref("doc_id")
64/// .build();
65/// index.add_doc("doc_a", &["Chapter 1", "Welcome to Copenhagen", "..."]);
66/// ```
67pub struct IndexBuilder {
68 save: bool,
69 fields: Vec<String>,
70 field_tokenizers: Vec<Tokenizer>,
71 ref_field: String,
72 pipeline: Option<Pipeline>,
73 language: Box<dyn Language>,
74}
75
76impl Default for IndexBuilder {
77 fn default() -> Self {
78 IndexBuilder {
79 save: true,
80 fields: Vec::new(),
81 field_tokenizers: Vec::new(),
82 ref_field: "id".into(),
83 pipeline: None,
84 language: Box::new(English::new()),
85 }
86 }
87}
88
89impl IndexBuilder {
90 pub fn new() -> Self {
91 Default::default()
92 }
93
94 pub fn with_language(language: Box<dyn Language>) -> Self {
95 Self {
96 language,
97 ..Default::default()
98 }
99 }
100
101 /// Set whether or not documents should be saved in the `Index`'s document store.
102 pub fn save_docs(mut self, save: bool) -> Self {
103 self.save = save;
104 self
105 }
106
107 /// Add a document field to the `Index`.
108 ///
109 /// # Panics
110 ///
111 /// Panics if a field with the name already exists.
112 pub fn add_field(mut self, field: &str) -> Self {
113 let field = field.into();
114 if self.fields.contains(&field) {
115 panic!("Duplicate fields in index: {}", field);
116 }
117 self.fields.push(field);
118 self.field_tokenizers.push(None);
119 self
120 }
121
122 /// Add a document field to the `Index`, with a custom tokenizer for that field.
123 ///
124 /// # Panics
125 ///
126 /// Panics if a field with the name already exists.
127 pub fn add_field_with_tokenizer(
128 mut self,
129 field: &str,
130 tokenizer: Box<dyn Fn(&str) -> Vec<String>>,
131 ) -> Self {
132 let field = field.into();
133 if self.fields.contains(&field) {
134 panic!("Duplicate fields in index: {}", field);
135 }
136 self.fields.push(field);
137 self.field_tokenizers.push(Some(tokenizer));
138 self
139 }
140
141 /// Add the document fields to the `Index`.
142 ///
143 /// # Panics
144 ///
145 /// Panics if two fields have the same name.
146 pub fn add_fields<I>(mut self, fields: I) -> Self
147 where
148 I: IntoIterator,
149 I::Item: AsRef<str>,
150 {
151 for field in fields {
152 self = self.add_field(field.as_ref())
153 }
154 self
155 }
156
157 /// Set the key used to store the document reference field.
158 pub fn set_ref(mut self, ref_field: &str) -> Self {
159 self.ref_field = ref_field.into();
160 self
161 }
162
163 /// Build an `Index` from this builder.
164 pub fn build(self) -> Index {
165 let IndexBuilder {
166 save,
167 fields,
168 field_tokenizers,
169 ref_field,
170 pipeline,
171 language,
172 } = self;
173
174 let index = fields
175 .iter()
176 .map(|f| (f.clone(), InvertedIndex::new()))
177 .collect();
178
179 let pipeline = pipeline.unwrap_or_else(|| language.make_pipeline());
180
181 Index {
182 index,
183 fields,
184 field_tokenizers,
185 ref_field,
186 document_store: DocumentStore::new(save),
187 pipeline,
188 version: crate::ELASTICLUNR_VERSION,
189 lang: language,
190 }
191 }
192}
193
194/// An elasticlunr search index.
195#[derive(Serialize, Deserialize)]
196#[serde(rename_all = "camelCase")]
197pub struct Index {
198 fields: Vec<String>,
199 #[serde(skip)]
200 field_tokenizers: Vec<Tokenizer>,
201 pipeline: Pipeline,
202 #[serde(rename = "ref")]
203 ref_field: String,
204 version: &'static str,
205 index: BTreeMap<String, InvertedIndex>,
206 document_store: DocumentStore,
207 #[serde(with = "ser_lang")]
208 lang: Box<dyn Language>,
209}
210
211mod ser_lang {
212 use crate::Language;
213 use serde::de;
214 use serde::{Deserializer, Serializer};
215 use std::fmt;
216
217 pub fn serialize<S>(lang: &Box<dyn Language>, serializer: S) -> Result<S::Ok, S::Error>
218 where
219 S: Serializer,
220 {
221 serializer.serialize_str(&lang.name())
222 }
223
224 pub fn deserialize<'de, D>(deserializer: D) -> Result<Box<dyn Language>, D::Error>
225 where
226 D: Deserializer<'de>,
227 {
228 deserializer.deserialize_str(LanguageVisitor)
229 }
230
231 struct LanguageVisitor;
232
233 impl<'de> de::Visitor<'de> for LanguageVisitor {
234 type Value = Box<dyn Language>;
235
236 fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
237 formatter.write_str("a capitalized language name")
238 }
239
240 fn visit_borrowed_str<E>(self, v: &'de str) -> Result<Self::Value, E>
241 where
242 E: de::Error,
243 {
244 match crate::lang::from_name(v) {
245 Some(l) => Ok(l),
246 None => Err(E::custom(format!("Unknown language name: {}", v))),
247 }
248 }
249 }
250}
251
252impl Index {
253 /// Create a new index with the provided fields.
254 ///
255 /// # Example
256 ///
257 /// ```
258 /// # use elasticlunr::{Index};
259 /// let mut index = Index::new(&["title", "body"]);
260 /// index.add_doc("1", &["this is a title", "this is body text"]);
261 /// ```
262 ///
263 /// # Panics
264 ///
265 /// Panics if a field with the name already exists.
266 pub fn new<I>(fields: I) -> Self
267 where
268 I: IntoIterator,
269 I::Item: AsRef<str>,
270 {
271 IndexBuilder::new().add_fields(fields).build()
272 }
273
274 /// Create a new index with the provided fields for the given
275 /// [`Language`](lang/enum.Language.html).
276 ///
277 /// # Example
278 ///
279 /// ```
280 /// use elasticlunr::{Index, lang::English};
281 /// let mut index = Index::with_language(Box::new(English::new()), &["title", "body"]);
282 /// index.add_doc("1", &["this is a title", "this is body text"]);
283 /// ```
284 ///
285 /// # Panics
286 ///
287 /// Panics if a field with the name already exists.
288 pub fn with_language<I>(lang: Box<dyn Language>, fields: I) -> Self
289 where
290 I: IntoIterator,
291 I::Item: AsRef<str>,
292 {
293 IndexBuilder::with_language(lang).add_fields(fields).build()
294 }
295
296 /// Add the data from a document to the index.
297 ///
298 /// *NOTE: The elements of `data` should be provided in the same order as
299 /// the fields used to create the index.*
300 ///
301 /// # Example
302 /// ```
303 /// # use elasticlunr::Index;
304 /// let mut index = Index::new(&["title", "body"]);
305 /// index.add_doc("1", &["this is a title", "this is body text"]);
306 /// ```
307 pub fn add_doc<I>(&mut self, doc_ref: &str, data: I)
308 where
309 I: IntoIterator,
310 I::Item: AsRef<str>,
311 {
312 let mut doc = BTreeMap::new();
313 doc.insert(self.ref_field.clone(), doc_ref.into());
314 let mut token_freq = BTreeMap::new();
315
316 for (i, value) in data.into_iter().enumerate() {
317 let field = &self.fields[i];
318 let tokenizer = self.field_tokenizers[i].as_ref();
319 doc.insert(field.clone(), value.as_ref().to_string());
320
321 if field == &self.ref_field {
322 continue;
323 }
324
325 let raw_tokens = if let Some(tokenizer) = tokenizer {
326 tokenizer(value.as_ref())
327 } else {
328 self.lang.tokenize(value.as_ref())
329 };
330
331 let tokens = self.pipeline.run(raw_tokens);
332
333 self.document_store
334 .add_field_length(doc_ref, field, tokens.len());
335
336 for token in tokens {
337 *token_freq.entry(token).or_insert(0u64) += 1;
338 }
339
340 for (token, count) in &token_freq {
341 let freq = (*count as f64).sqrt();
342
343 self.index
344 .get_mut(field)
345 .unwrap_or_else(|| panic!("InvertedIndex does not exist for field {}", field))
346 .add_token(doc_ref, token, freq);
347 }
348 }
349
350 self.document_store.add_doc(doc_ref, doc);
351 }
352
353 pub fn get_fields(&self) -> &[String] {
354 &self.fields
355 }
356
357 /// Returns the index, serialized to pretty-printed JSON.
358 pub fn to_json_pretty(&self) -> String {
359 serde_json::to_string_pretty(&self).unwrap()
360 }
361
362 /// Returns the index, serialized to JSON.
363 pub fn to_json(&self) -> String {
364 serde_json::to_string(&self).unwrap()
365 }
366}
367
#[cfg(test)]
mod tests {
    use super::*;

    /// Each field handed to the builder appears exactly once in the built index.
    #[test]
    fn add_field_to_builder() {
        let expected = ["foo", "bar", "baz"];
        let built = IndexBuilder::new().add_fields(&expected).build();

        let actual = built.get_fields();
        for name in &expected {
            let hits = actual.iter().filter(|x| x == name).count();
            assert_eq!(hits, 1);
        }
    }

    /// A document added to the index is stored with its reference and field values.
    #[test]
    fn adding_document_to_index() {
        let mut index = Index::new(&["body"]);
        index.add_doc("1", &["this is a test"]);

        assert_eq!(index.document_store.len(), 1);

        let stored = index.document_store.get_doc("1").unwrap();
        assert_eq!(
            stored,
            btreemap! {
                "id".into() => "1".into(),
                "body".into() => "this is a test".into(),
            }
        );
    }

    /// An empty leading field does not prevent a later field from being indexed.
    #[test]
    fn adding_document_with_empty_field() {
        let mut index = Index::new(&["title", "body"]);
        index.add_doc("1", &["", "test"]);

        let body = &index.index["body"];
        assert_eq!(body.get_doc_frequency("test"), 1);
        assert_eq!(body.get_docs("test").unwrap()["1"], 1.);
    }

    /// Passing the same field name twice must panic.
    #[test]
    #[should_panic]
    fn creating_index_with_identical_fields_panics() {
        let _index = Index::new(&["title", "body", "title"]);
    }
}
414