1 | //! Implements an elasticlunr.js inverted index. Most users do not need to use this module directly. |
2 | |
3 | use std::collections::BTreeMap; |
4 | |
5 | #[derive (Debug, Copy, Clone, Serialize, Deserialize, PartialEq)] |
6 | struct TermFrequency { |
7 | #[serde(rename = "tf" )] |
8 | pub term_freq: f64, |
9 | } |
10 | |
11 | #[derive (Serialize, Deserialize, Debug, Clone, PartialEq, Default)] |
12 | struct IndexItem { |
13 | pub docs: BTreeMap<String, TermFrequency>, |
14 | #[serde(rename = "df" )] |
15 | pub doc_freq: i64, |
16 | #[serde(flatten, serialize_with = "IndexItem::serialize" )] |
17 | pub children: BTreeMap<char, IndexItem>, |
18 | } |
19 | |
20 | impl IndexItem { |
21 | fn new() -> Self { |
22 | Default::default() |
23 | } |
24 | |
25 | fn serialize<S>(map: &BTreeMap<char, IndexItem>, ser: S) -> Result<S::Ok, S::Error> |
26 | where |
27 | S: ::serde::Serializer, |
28 | { |
29 | use serde::ser::SerializeMap; |
30 | |
31 | let mut ser_map = ser.serialize_map(Some(map.len()))?; |
32 | let mut buf = [0u8; 4]; |
33 | for (key, value) in map { |
34 | let key = key.encode_utf8(&mut buf); |
35 | ser_map.serialize_entry(key, value)?; |
36 | } |
37 | ser_map.end() |
38 | } |
39 | |
40 | fn add_token(&mut self, doc_ref: &str, token: &str, term_freq: f64) { |
41 | let mut iter = token.chars(); |
42 | if let Some(character) = iter.next() { |
43 | let mut item = self |
44 | .children |
45 | .entry(character) |
46 | .or_insert_with(IndexItem::new); |
47 | |
48 | for character in iter { |
49 | let tmp = item; |
50 | item = tmp.children.entry(character).or_insert_with(IndexItem::new); |
51 | } |
52 | |
53 | if !item.docs.contains_key(doc_ref) { |
54 | item.doc_freq += 1; |
55 | } |
56 | item.docs |
57 | .insert(doc_ref.into(), TermFrequency { term_freq }); |
58 | } |
59 | } |
60 | |
61 | fn get_node(&self, token: &str) -> Option<&IndexItem> { |
62 | let mut root = self; |
63 | for ch in token.chars() { |
64 | if let Some(item) = root.children.get(&ch) { |
65 | root = item; |
66 | } else { |
67 | return None; |
68 | } |
69 | } |
70 | |
71 | Some(root) |
72 | } |
73 | |
74 | fn remove_token(&mut self, doc_ref: &str, token: &str) { |
75 | let mut iter = token.char_indices(); |
76 | if let Some((_, ch)) = iter.next() { |
77 | if let Some(item) = self.children.get_mut(&ch) { |
78 | if let Some((idx, _)) = iter.next() { |
79 | item.remove_token(doc_ref, &token[idx..]); |
80 | } else if item.docs.contains_key(doc_ref) { |
81 | item.docs.remove(doc_ref); |
82 | item.doc_freq -= 1; |
83 | } |
84 | } |
85 | } |
86 | } |
87 | } |
88 | |
89 | /// Implements an elasticlunr.js inverted index. Most users do not need to use this type directly. |
90 | #[derive (Serialize, Deserialize, Debug, PartialEq, Default)] |
91 | pub struct InvertedIndex { |
92 | root: IndexItem, |
93 | } |
94 | |
95 | impl InvertedIndex { |
96 | pub fn new() -> Self { |
97 | Default::default() |
98 | } |
99 | |
100 | pub fn add_token(&mut self, doc_ref: &str, token: &str, term_freq: f64) { |
101 | self.root.add_token(doc_ref, token, term_freq) |
102 | } |
103 | |
104 | pub fn has_token(&self, token: &str) -> bool { |
105 | self.root.get_node(token).map_or(false, |_| true) |
106 | } |
107 | |
108 | pub fn remove_token(&mut self, doc_ref: &str, token: &str) { |
109 | self.root.remove_token(doc_ref, token) |
110 | } |
111 | |
112 | pub fn get_docs(&self, token: &str) -> Option<BTreeMap<String, f64>> { |
113 | self.root.get_node(token).map(|node| { |
114 | node.docs |
115 | .iter() |
116 | .map(|(k, &v)| (k.clone(), v.term_freq)) |
117 | .collect() |
118 | }) |
119 | } |
120 | |
121 | pub fn get_term_frequency(&self, doc_ref: &str, token: &str) -> f64 { |
122 | self.root |
123 | .get_node(token) |
124 | .and_then(|node| node.docs.get(doc_ref)) |
125 | .map_or(0., |docs| docs.term_freq) |
126 | } |
127 | |
128 | pub fn get_doc_frequency(&self, token: &str) -> i64 { |
129 | self.root.get_node(token).map_or(0, |node| node.doc_freq) |
130 | } |
131 | } |
132 | |
133 | #[cfg (test)] |
134 | mod tests { |
135 | use super::*; |
136 | |
137 | #[test ] |
138 | fn adding_token() { |
139 | let mut inverted_index = InvertedIndex::new(); |
140 | let token = "foo" ; |
141 | |
142 | inverted_index.add_token("123" , token, 1.); |
143 | assert_eq!(inverted_index.get_doc_frequency("foo" ), 1); |
144 | assert_eq!(inverted_index.get_term_frequency("123" , "foo" ), 1.); |
145 | } |
146 | |
147 | #[test ] |
148 | fn has_token() { |
149 | let mut inverted_index = InvertedIndex::new(); |
150 | let token = "foo" ; |
151 | |
152 | inverted_index.add_token("123" , token, 1.); |
153 | assert!(inverted_index.has_token(token)); |
154 | assert!(inverted_index.has_token("fo" )); |
155 | assert!(inverted_index.has_token("f" )); |
156 | |
157 | assert!(!inverted_index.has_token("bar" )); |
158 | assert!(!inverted_index.has_token("foo " )); |
159 | assert!(!inverted_index.has_token("foo " )) |
160 | } |
161 | |
162 | #[test ] |
163 | fn adding_another_document_to_the_token() { |
164 | let mut inverted_index = InvertedIndex::new(); |
165 | let token = "foo" ; |
166 | |
167 | inverted_index.add_token("123" , token, 1.); |
168 | inverted_index.add_token("456" , token, 1.); |
169 | |
170 | assert_eq!(inverted_index.get_term_frequency("123" , "foo" ), 1.); |
171 | assert_eq!(inverted_index.get_term_frequency("456" , "foo" ), 1.); |
172 | assert_eq!(inverted_index.get_doc_frequency("foo" ), 2); |
173 | } |
174 | |
175 | #[test ] |
176 | fn df_of_nonexistant_token() { |
177 | let mut inverted_index = InvertedIndex::new(); |
178 | let token = "foo" ; |
179 | |
180 | inverted_index.add_token("123" , token, 1.); |
181 | inverted_index.add_token("456" , token, 1.); |
182 | |
183 | assert_eq!(inverted_index.get_doc_frequency("foo" ), 2); |
184 | assert_eq!(inverted_index.get_doc_frequency("fox" ), 0); |
185 | } |
186 | |
187 | #[test ] |
188 | fn adding_existing_doc() { |
189 | let mut inverted_index = InvertedIndex::new(); |
190 | let token = "foo" ; |
191 | |
192 | inverted_index.add_token("123" , token, 1.); |
193 | inverted_index.add_token("456" , token, 1.); |
194 | inverted_index.add_token("456" , token, 100.); |
195 | |
196 | assert_eq!(inverted_index.get_term_frequency("456" , "foo" ), 100.); |
197 | assert_eq!(inverted_index.get_doc_frequency("foo" ), 2); |
198 | } |
199 | |
200 | #[test ] |
201 | fn checking_token_exists_in() { |
202 | let mut inverted_index = InvertedIndex::new(); |
203 | let token = "foo" ; |
204 | |
205 | inverted_index.add_token("123" , token, 1.); |
206 | |
207 | assert!(inverted_index.has_token(token)); |
208 | } |
209 | |
210 | #[test ] |
211 | fn checking_if_a_token_does_not_exist() { |
212 | let mut inverted_index = InvertedIndex::new(); |
213 | let token = "foo" ; |
214 | |
215 | inverted_index.add_token("123" , token, 1.); |
216 | assert!(!inverted_index.has_token("fooo" )); |
217 | assert!(!inverted_index.has_token("bar" )); |
218 | assert!(!inverted_index.has_token("fof" )); |
219 | } |
220 | |
221 | #[test ] |
222 | fn retrieving_items() { |
223 | let mut inverted_index = InvertedIndex::new(); |
224 | let token = "foo" ; |
225 | |
226 | inverted_index.add_token("123" , token, 1.); |
227 | assert_eq!( |
228 | inverted_index.get_docs(token).unwrap(), |
229 | btreemap! { |
230 | "123" .into() => 1. |
231 | } |
232 | ); |
233 | |
234 | assert_eq!(inverted_index.get_docs("" ), Some(BTreeMap::new())); |
235 | |
236 | inverted_index.add_token("234" , "boo" , 100.); |
237 | inverted_index.add_token("345" , "too" , 101.); |
238 | |
239 | assert_eq!( |
240 | inverted_index.get_docs(token).unwrap(), |
241 | btreemap! { |
242 | "123" .into() => 1. |
243 | } |
244 | ); |
245 | |
246 | inverted_index.add_token("234" , token, 100.); |
247 | inverted_index.add_token("345" , token, 101.); |
248 | |
249 | assert_eq!( |
250 | inverted_index.get_docs(token).unwrap(), |
251 | btreemap! { |
252 | "123" .into() => 1., |
253 | "234" .into() => 100., |
254 | "345" .into() => 101., |
255 | } |
256 | ); |
257 | } |
258 | |
259 | #[test ] |
260 | fn retrieving_nonexistant_items() { |
261 | let inverted_index = InvertedIndex::new(); |
262 | |
263 | assert_eq!(inverted_index.get_docs("foo" ), None); |
264 | assert_eq!(inverted_index.get_docs("fox" ), None); |
265 | } |
266 | |
267 | #[test ] |
268 | fn df_of_items() { |
269 | let mut inverted_index = InvertedIndex::new(); |
270 | |
271 | inverted_index.add_token("123" , "foo" , 1.); |
272 | inverted_index.add_token("456" , "foo" , 1.); |
273 | inverted_index.add_token("789" , "bar" , 1.); |
274 | |
275 | assert_eq!(inverted_index.get_doc_frequency("foo" ), 2); |
276 | assert_eq!(inverted_index.get_doc_frequency("bar" ), 1); |
277 | assert_eq!(inverted_index.get_doc_frequency("baz" ), 0); |
278 | assert_eq!(inverted_index.get_doc_frequency("ba" ), 0); |
279 | assert_eq!(inverted_index.get_doc_frequency("b" ), 0); |
280 | assert_eq!(inverted_index.get_doc_frequency("fo" ), 0); |
281 | assert_eq!(inverted_index.get_doc_frequency("f" ), 0); |
282 | } |
283 | |
284 | #[test ] |
285 | fn removing_document_from_token() { |
286 | let mut inverted_index = InvertedIndex::new(); |
287 | assert_eq!(inverted_index.get_docs("foo" ), None); |
288 | |
289 | inverted_index.add_token("123" , "foo" , 1.); |
290 | assert_eq!( |
291 | inverted_index.get_docs("foo" ).unwrap(), |
292 | btreemap! { |
293 | "123" .into() => 1., |
294 | } |
295 | ); |
296 | |
297 | inverted_index.remove_token("123" , "foo" ); |
298 | assert_eq!(inverted_index.get_docs("foo" ), Some(BTreeMap::new())); |
299 | assert_eq!(inverted_index.get_doc_frequency("foo" ), 0); |
300 | assert_eq!(inverted_index.has_token("foo" ), true); |
301 | } |
302 | |
303 | #[test ] |
304 | fn removing_nonexistant_document() { |
305 | let mut inverted_index = InvertedIndex::new(); |
306 | |
307 | inverted_index.add_token("123" , "foo" , 1.); |
308 | inverted_index.add_token("567" , "bar" , 1.); |
309 | inverted_index.remove_token("foo" , "456" ); |
310 | |
311 | assert_eq!( |
312 | inverted_index.get_docs("foo" ).unwrap(), |
313 | btreemap! { |
314 | "123" .into() => 1. |
315 | } |
316 | ); |
317 | assert_eq!(inverted_index.get_doc_frequency("foo" ), 1); |
318 | } |
319 | |
320 | #[test ] |
321 | fn removing_documet_nonexistant_key() { |
322 | let mut inverted_index = InvertedIndex::new(); |
323 | |
324 | inverted_index.remove_token("123" , "foo" ); |
325 | assert!(!inverted_index.has_token("foo" )); |
326 | assert_eq!(inverted_index.get_doc_frequency("foo" ), 0); |
327 | } |
328 | |
329 | #[test ] |
330 | fn get_term_frequency() { |
331 | let mut inverted_index = InvertedIndex::new(); |
332 | let token = "foo" ; |
333 | |
334 | inverted_index.add_token("123" , token, 2.); |
335 | inverted_index.add_token("456" , token, 3.); |
336 | |
337 | assert_eq!(inverted_index.get_term_frequency("123" , token), 2.); |
338 | assert_eq!(inverted_index.get_term_frequency("456" , token), 3.); |
339 | assert_eq!(inverted_index.get_term_frequency("789" , token), 0.); |
340 | } |
341 | |
342 | #[test ] |
343 | fn get_term_frequency_nonexistant_token() { |
344 | let mut inverted_index = InvertedIndex::new(); |
345 | let token = "foo" ; |
346 | |
347 | inverted_index.add_token("123" , token, 2.); |
348 | inverted_index.add_token("456" , token, 3.); |
349 | |
350 | assert_eq!(inverted_index.get_term_frequency("123" , "ken" ), 0.); |
351 | assert_eq!(inverted_index.get_term_frequency("456" , "ken" ), 0.); |
352 | } |
353 | |
354 | #[test ] |
355 | fn get_term_frequency_nonexistant_docref() { |
356 | let mut inverted_index = InvertedIndex::new(); |
357 | let token = "foo" ; |
358 | |
359 | inverted_index.add_token("123" , token, 2.); |
360 | inverted_index.add_token("456" , token, 3.); |
361 | |
362 | assert_eq!(inverted_index.get_term_frequency(token, "12" ), 0.); |
363 | assert_eq!(inverted_index.get_term_frequency(token, "23" ), 0.); |
364 | assert_eq!(inverted_index.get_term_frequency(token, "45" ), 0.); |
365 | } |
366 | |
367 | #[test ] |
368 | fn get_term_frequency_nonexistant_token_and_docref() { |
369 | let mut inverted_index = InvertedIndex::new(); |
370 | let token = "foo" ; |
371 | |
372 | inverted_index.add_token("123" , token, 2.); |
373 | inverted_index.add_token("456" , token, 3.); |
374 | |
375 | assert_eq!(inverted_index.get_term_frequency("token" , "1" ), 0.); |
376 | assert_eq!(inverted_index.get_term_frequency("abc" , "2" ), 0.); |
377 | assert_eq!(inverted_index.get_term_frequency("fo" , "123" ), 0.); |
378 | } |
379 | } |
380 | |