1//! Implements an elasticlunr.js inverted index. Most users do not need to use this module directly.
2
3use std::collections::BTreeMap;
4
5#[derive(Debug, Copy, Clone, Serialize, Deserialize, PartialEq)]
6struct TermFrequency {
7 #[serde(rename = "tf")]
8 pub term_freq: f64,
9}
10
11#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Default)]
12struct IndexItem {
13 pub docs: BTreeMap<String, TermFrequency>,
14 #[serde(rename = "df")]
15 pub doc_freq: i64,
16 #[serde(flatten, serialize_with = "IndexItem::serialize")]
17 pub children: BTreeMap<char, IndexItem>,
18}
19
20impl IndexItem {
21 fn new() -> Self {
22 Default::default()
23 }
24
25 fn serialize<S>(map: &BTreeMap<char, IndexItem>, ser: S) -> Result<S::Ok, S::Error>
26 where
27 S: ::serde::Serializer,
28 {
29 use serde::ser::SerializeMap;
30
31 let mut ser_map = ser.serialize_map(Some(map.len()))?;
32 let mut buf = [0u8; 4];
33 for (key, value) in map {
34 let key = key.encode_utf8(&mut buf);
35 ser_map.serialize_entry(key, value)?;
36 }
37 ser_map.end()
38 }
39
40 fn add_token(&mut self, doc_ref: &str, token: &str, term_freq: f64) {
41 let mut iter = token.chars();
42 if let Some(character) = iter.next() {
43 let mut item = self
44 .children
45 .entry(character)
46 .or_insert_with(IndexItem::new);
47
48 for character in iter {
49 let tmp = item;
50 item = tmp.children.entry(character).or_insert_with(IndexItem::new);
51 }
52
53 if !item.docs.contains_key(doc_ref) {
54 item.doc_freq += 1;
55 }
56 item.docs
57 .insert(doc_ref.into(), TermFrequency { term_freq });
58 }
59 }
60
61 fn get_node(&self, token: &str) -> Option<&IndexItem> {
62 let mut root = self;
63 for ch in token.chars() {
64 if let Some(item) = root.children.get(&ch) {
65 root = item;
66 } else {
67 return None;
68 }
69 }
70
71 Some(root)
72 }
73
74 fn remove_token(&mut self, doc_ref: &str, token: &str) {
75 let mut iter = token.char_indices();
76 if let Some((_, ch)) = iter.next() {
77 if let Some(item) = self.children.get_mut(&ch) {
78 if let Some((idx, _)) = iter.next() {
79 item.remove_token(doc_ref, &token[idx..]);
80 } else if item.docs.contains_key(doc_ref) {
81 item.docs.remove(doc_ref);
82 item.doc_freq -= 1;
83 }
84 }
85 }
86 }
87}
88
89/// Implements an elasticlunr.js inverted index. Most users do not need to use this type directly.
90#[derive(Serialize, Deserialize, Debug, PartialEq, Default)]
91pub struct InvertedIndex {
92 root: IndexItem,
93}
94
95impl InvertedIndex {
96 pub fn new() -> Self {
97 Default::default()
98 }
99
100 pub fn add_token(&mut self, doc_ref: &str, token: &str, term_freq: f64) {
101 self.root.add_token(doc_ref, token, term_freq)
102 }
103
104 pub fn has_token(&self, token: &str) -> bool {
105 self.root.get_node(token).map_or(false, |_| true)
106 }
107
108 pub fn remove_token(&mut self, doc_ref: &str, token: &str) {
109 self.root.remove_token(doc_ref, token)
110 }
111
112 pub fn get_docs(&self, token: &str) -> Option<BTreeMap<String, f64>> {
113 self.root.get_node(token).map(|node| {
114 node.docs
115 .iter()
116 .map(|(k, &v)| (k.clone(), v.term_freq))
117 .collect()
118 })
119 }
120
121 pub fn get_term_frequency(&self, doc_ref: &str, token: &str) -> f64 {
122 self.root
123 .get_node(token)
124 .and_then(|node| node.docs.get(doc_ref))
125 .map_or(0., |docs| docs.term_freq)
126 }
127
128 pub fn get_doc_frequency(&self, token: &str) -> i64 {
129 self.root.get_node(token).map_or(0, |node| node.doc_freq)
130 }
131}
132
133#[cfg(test)]
134mod tests {
135 use super::*;
136
137 #[test]
138 fn adding_token() {
139 let mut inverted_index = InvertedIndex::new();
140 let token = "foo";
141
142 inverted_index.add_token("123", token, 1.);
143 assert_eq!(inverted_index.get_doc_frequency("foo"), 1);
144 assert_eq!(inverted_index.get_term_frequency("123", "foo"), 1.);
145 }
146
147 #[test]
148 fn has_token() {
149 let mut inverted_index = InvertedIndex::new();
150 let token = "foo";
151
152 inverted_index.add_token("123", token, 1.);
153 assert!(inverted_index.has_token(token));
154 assert!(inverted_index.has_token("fo"));
155 assert!(inverted_index.has_token("f"));
156
157 assert!(!inverted_index.has_token("bar"));
158 assert!(!inverted_index.has_token("foo "));
159 assert!(!inverted_index.has_token("foo "))
160 }
161
162 #[test]
163 fn adding_another_document_to_the_token() {
164 let mut inverted_index = InvertedIndex::new();
165 let token = "foo";
166
167 inverted_index.add_token("123", token, 1.);
168 inverted_index.add_token("456", token, 1.);
169
170 assert_eq!(inverted_index.get_term_frequency("123", "foo"), 1.);
171 assert_eq!(inverted_index.get_term_frequency("456", "foo"), 1.);
172 assert_eq!(inverted_index.get_doc_frequency("foo"), 2);
173 }
174
175 #[test]
176 fn df_of_nonexistant_token() {
177 let mut inverted_index = InvertedIndex::new();
178 let token = "foo";
179
180 inverted_index.add_token("123", token, 1.);
181 inverted_index.add_token("456", token, 1.);
182
183 assert_eq!(inverted_index.get_doc_frequency("foo"), 2);
184 assert_eq!(inverted_index.get_doc_frequency("fox"), 0);
185 }
186
187 #[test]
188 fn adding_existing_doc() {
189 let mut inverted_index = InvertedIndex::new();
190 let token = "foo";
191
192 inverted_index.add_token("123", token, 1.);
193 inverted_index.add_token("456", token, 1.);
194 inverted_index.add_token("456", token, 100.);
195
196 assert_eq!(inverted_index.get_term_frequency("456", "foo"), 100.);
197 assert_eq!(inverted_index.get_doc_frequency("foo"), 2);
198 }
199
200 #[test]
201 fn checking_token_exists_in() {
202 let mut inverted_index = InvertedIndex::new();
203 let token = "foo";
204
205 inverted_index.add_token("123", token, 1.);
206
207 assert!(inverted_index.has_token(token));
208 }
209
210 #[test]
211 fn checking_if_a_token_does_not_exist() {
212 let mut inverted_index = InvertedIndex::new();
213 let token = "foo";
214
215 inverted_index.add_token("123", token, 1.);
216 assert!(!inverted_index.has_token("fooo"));
217 assert!(!inverted_index.has_token("bar"));
218 assert!(!inverted_index.has_token("fof"));
219 }
220
221 #[test]
222 fn retrieving_items() {
223 let mut inverted_index = InvertedIndex::new();
224 let token = "foo";
225
226 inverted_index.add_token("123", token, 1.);
227 assert_eq!(
228 inverted_index.get_docs(token).unwrap(),
229 btreemap! {
230 "123".into() => 1.
231 }
232 );
233
234 assert_eq!(inverted_index.get_docs(""), Some(BTreeMap::new()));
235
236 inverted_index.add_token("234", "boo", 100.);
237 inverted_index.add_token("345", "too", 101.);
238
239 assert_eq!(
240 inverted_index.get_docs(token).unwrap(),
241 btreemap! {
242 "123".into() => 1.
243 }
244 );
245
246 inverted_index.add_token("234", token, 100.);
247 inverted_index.add_token("345", token, 101.);
248
249 assert_eq!(
250 inverted_index.get_docs(token).unwrap(),
251 btreemap! {
252 "123".into() => 1.,
253 "234".into() => 100.,
254 "345".into() => 101.,
255 }
256 );
257 }
258
259 #[test]
260 fn retrieving_nonexistant_items() {
261 let inverted_index = InvertedIndex::new();
262
263 assert_eq!(inverted_index.get_docs("foo"), None);
264 assert_eq!(inverted_index.get_docs("fox"), None);
265 }
266
267 #[test]
268 fn df_of_items() {
269 let mut inverted_index = InvertedIndex::new();
270
271 inverted_index.add_token("123", "foo", 1.);
272 inverted_index.add_token("456", "foo", 1.);
273 inverted_index.add_token("789", "bar", 1.);
274
275 assert_eq!(inverted_index.get_doc_frequency("foo"), 2);
276 assert_eq!(inverted_index.get_doc_frequency("bar"), 1);
277 assert_eq!(inverted_index.get_doc_frequency("baz"), 0);
278 assert_eq!(inverted_index.get_doc_frequency("ba"), 0);
279 assert_eq!(inverted_index.get_doc_frequency("b"), 0);
280 assert_eq!(inverted_index.get_doc_frequency("fo"), 0);
281 assert_eq!(inverted_index.get_doc_frequency("f"), 0);
282 }
283
284 #[test]
285 fn removing_document_from_token() {
286 let mut inverted_index = InvertedIndex::new();
287 assert_eq!(inverted_index.get_docs("foo"), None);
288
289 inverted_index.add_token("123", "foo", 1.);
290 assert_eq!(
291 inverted_index.get_docs("foo").unwrap(),
292 btreemap! {
293 "123".into() => 1.,
294 }
295 );
296
297 inverted_index.remove_token("123", "foo");
298 assert_eq!(inverted_index.get_docs("foo"), Some(BTreeMap::new()));
299 assert_eq!(inverted_index.get_doc_frequency("foo"), 0);
300 assert_eq!(inverted_index.has_token("foo"), true);
301 }
302
303 #[test]
304 fn removing_nonexistant_document() {
305 let mut inverted_index = InvertedIndex::new();
306
307 inverted_index.add_token("123", "foo", 1.);
308 inverted_index.add_token("567", "bar", 1.);
309 inverted_index.remove_token("foo", "456");
310
311 assert_eq!(
312 inverted_index.get_docs("foo").unwrap(),
313 btreemap! {
314 "123".into() => 1.
315 }
316 );
317 assert_eq!(inverted_index.get_doc_frequency("foo"), 1);
318 }
319
320 #[test]
321 fn removing_documet_nonexistant_key() {
322 let mut inverted_index = InvertedIndex::new();
323
324 inverted_index.remove_token("123", "foo");
325 assert!(!inverted_index.has_token("foo"));
326 assert_eq!(inverted_index.get_doc_frequency("foo"), 0);
327 }
328
329 #[test]
330 fn get_term_frequency() {
331 let mut inverted_index = InvertedIndex::new();
332 let token = "foo";
333
334 inverted_index.add_token("123", token, 2.);
335 inverted_index.add_token("456", token, 3.);
336
337 assert_eq!(inverted_index.get_term_frequency("123", token), 2.);
338 assert_eq!(inverted_index.get_term_frequency("456", token), 3.);
339 assert_eq!(inverted_index.get_term_frequency("789", token), 0.);
340 }
341
342 #[test]
343 fn get_term_frequency_nonexistant_token() {
344 let mut inverted_index = InvertedIndex::new();
345 let token = "foo";
346
347 inverted_index.add_token("123", token, 2.);
348 inverted_index.add_token("456", token, 3.);
349
350 assert_eq!(inverted_index.get_term_frequency("123", "ken"), 0.);
351 assert_eq!(inverted_index.get_term_frequency("456", "ken"), 0.);
352 }
353
354 #[test]
355 fn get_term_frequency_nonexistant_docref() {
356 let mut inverted_index = InvertedIndex::new();
357 let token = "foo";
358
359 inverted_index.add_token("123", token, 2.);
360 inverted_index.add_token("456", token, 3.);
361
362 assert_eq!(inverted_index.get_term_frequency(token, "12"), 0.);
363 assert_eq!(inverted_index.get_term_frequency(token, "23"), 0.);
364 assert_eq!(inverted_index.get_term_frequency(token, "45"), 0.);
365 }
366
367 #[test]
368 fn get_term_frequency_nonexistant_token_and_docref() {
369 let mut inverted_index = InvertedIndex::new();
370 let token = "foo";
371
372 inverted_index.add_token("123", token, 2.);
373 inverted_index.add_token("456", token, 3.);
374
375 assert_eq!(inverted_index.get_term_frequency("token", "1"), 0.);
376 assert_eq!(inverted_index.get_term_frequency("abc", "2"), 0.);
377 assert_eq!(inverted_index.get_term_frequency("fo", "123"), 0.);
378 }
379}
380