1use super::{common::StopWordFilter, Language};
2use crate::pipeline::{FnWrapper, Pipeline, PipelineFn};
3use regex::Regex;
4
/// English stop words handed to the `stopWordFilter` pipeline stage.
///
/// NOTE(review): the leading empty-string entry presumably makes the filter drop
/// tokens that the trimmer reduced to nothing (e.g. punctuation-only input) —
/// confirm against `StopWordFilter`.
const WORDS: &[&str] = &[
    "", "a", "able", "about", "across", "after", "all", "almost", "also", "am", "among", "an",
    "and", "any", "are", "as", "at", "be", "because", "been", "but", "by", "can", "cannot",
    "could", "dear", "did", "do", "does", "either", "else", "ever", "every", "for", "from", "get",
    "got", "had", "has", "have", "he", "her", "hers", "him", "his", "how", "however", "i", "if",
    "in", "into", "is", "it", "its", "just", "least", "let", "like", "likely", "may", "me",
    "might", "most", "must", "my", "neither", "no", "nor", "not", "of", "off", "often", "on",
    "only", "or", "other", "our", "own", "rather", "said", "say", "says", "she", "should", "since",
    "so", "some", "than", "that", "the", "their", "them", "then", "there", "these", "they", "this",
    "tis", "to", "too", "twas", "us", "wants", "was", "we", "were", "what", "when", "where",
    "which", "while", "who", "whom", "why", "will", "with", "would", "yet", "you", "your",
];
17
/// English language support.
///
/// Tokenizes on whitespace and builds a pipeline of trimmer, stop-word filter
/// and stemmer (see the `Language` impl below).
#[derive(Clone)]
pub struct English {
    /// Pre-compiled stemmer; cloned into every pipeline built by `make_pipeline`.
    stemmer: Stemmer,
}
22
23impl English {
24 pub fn new() -> Self {
25 let stemmer: Stemmer = Stemmer::new();
26 Self { stemmer }
27 }
28}
29
30impl Language for English {
31 fn name(&self) -> String {
32 "English".into()
33 }
34 fn code(&self) -> String {
35 "en".into()
36 }
37
38 fn tokenize(&self, text: &str) -> Vec<String> {
39 super::tokenize_whitespace(text)
40 }
41
42 fn make_pipeline(&self) -> Pipeline {
43 Pipeline {
44 queue: vec![
45 Box::new(FnWrapper("trimmer".into(), trimmer)),
46 Box::new(StopWordFilter::new("stopWordFilter", WORDS)),
47 Box::new(self.stemmer.clone()),
48 ],
49 }
50 }
51}
52
/// Pipeline stage that strips non-word characters from both ends of a token.
///
/// A character is kept when it is an ASCII letter, an ASCII digit or `_`;
/// every other character (punctuation, whitespace, non-ASCII letters) is
/// removed from the start and end of `token`. Characters in the middle of the
/// token are never touched, so `it's` stays `it's`.
///
/// Always returns `Some`; a token made up entirely of trimmed characters
/// becomes the empty string.
fn trimmer(token: String) -> Option<String> {
    // `is_ascii_alphanumeric` accepts exactly 0-9, a-z and A-Z, i.e. the same
    // set as `char::is_digit(36)`.
    let trimmed = token.trim_matches(|c: char| !c.is_ascii_alphanumeric() && c != '_');
    Some(trimmed.into())
}
60
/// Step 2 suffix table: `(matched suffix, replacement)`.
///
/// `Stemmer::stem` looks up the suffix captured by `re_2` in this table and
/// unwraps the result, so every alternative listed in that regex must have an
/// entry here.
static STEP_2: &[(&str, &str)] = &[
    ("ational", "ate"),
    ("tional", "tion"),
    ("enci", "ence"),
    ("anci", "ance"),
    ("izer", "ize"),
    ("bli", "ble"),
    ("alli", "al"),
    ("entli", "ent"),
    ("eli", "e"),
    ("ousli", "ous"),
    ("ization", "ize"),
    ("ation", "ate"),
    ("ator", "ate"),
    ("alism", "al"),
    ("iveness", "ive"),
    ("fulness", "ful"),
    ("ousness", "ous"),
    ("aliti", "al"),
    ("iviti", "ive"),
    ("biliti", "ble"),
    ("logi", "log"),
];
84
/// Step 3 suffix table: `(matched suffix, replacement)`.
///
/// `Stemmer::stem` looks up the suffix captured by `re_3` in this table and
/// unwraps the result, so every alternative listed in that regex must have an
/// entry here. An empty replacement removes the suffix outright.
static STEP_3: &[(&str, &str)] = &[
    ("icate", "ic"),
    ("ative", ""),
    ("alize", "al"),
    ("iciti", "ic"),
    ("ical", "ic"),
    ("ful", ""),
    ("ness", ""),
];
94
// This is a direct port of the stemmer from elasticlunr.js.
// It is neither very efficient nor idiomatic Rust, but it
// generates identical output.
98
/// Regex-driven Porter stemmer.
///
/// All patterns are compiled once in `Stemmer::new`; the struct is `Clone` so
/// that a copy can be placed in each pipeline.
#[derive(Clone)]
struct Stemmer {
    // Tests applied to a candidate stem, built from the CS!/VS!/V! patterns.
    /// Optional consonant run, then a vowel run followed by a consonant run.
    re_mgr0: Regex,
    /// Like `re_mgr0`, but with a second vowel run + consonant run.
    re_mgr1: Regex,
    /// Exactly one vowel run + consonant run, optionally followed by a vowel run.
    re_meq1: Regex,
    /// Optional consonant run followed by a vowel (the stem contains a vowel).
    re_s_v: Regex,

    // Step 1a: plural endings.
    re_1a: Regex,
    re2_1a: Regex,
    // Step 1b: `eed`, `ed` and `ing` endings, plus the clean-up rules applied
    // once `ed`/`ing` has been removed.
    re_1b: Regex,
    re2_1b: Regex,
    re2_1b_2: Regex,
    re3_1b_2: Regex,
    re4_1b_2: Regex,

    // Step 1c: trailing `y` after a non-vowel.
    re_1c: Regex,
    // Step 2: suffixes replaced via the STEP_2 table.
    re_2: Regex,

    // Step 3: suffixes replaced via the STEP_3 table.
    re_3: Regex,

    // Step 4: suffixes removed outright (`re2_4` handles `sion`/`tion`).
    re_4: Regex,
    re2_4: Regex,

    // Step 5: trailing `e`; `re3_5` is the consonant-vowel-consonant guard.
    re_5: Regex,
    re3_5: Regex,
}
125
126impl PipelineFn for Stemmer {
127 fn name(&self) -> String {
128 "stemmer".into()
129 }
130
131 fn filter(&self, token: String) -> Option<String> {
132 Some(self.stem(token))
133 }
134}
135
// Vowel: a regex character class matching a single `a`, `e`, `i`, `o`, `u` or `y`.
// Written as a macro rather than a `const` so that it expands to a string
// literal, which is what `concat!` needs to splice it into larger patterns.
macro_rules! V {
    () => {
        "[aeiouy]"
    };
}
142
// Consonant sequence: one character that is not `a e i o u` (so a leading `y`
// counts as a consonant), followed by any number of characters that are not
// `a e i o u y`. A macro so that `concat!` can use it as a literal.
macro_rules! CS {
    () => {
        "[^aeiou][^aeiouy]*"
    };
}
149
// Vowel sequence: one of `a e i o u y`, followed by any number of
// `a e i o u` (a `y` is only treated as a vowel at the start of the run).
// A macro so that `concat!` can use it as a literal.
macro_rules! VS {
    () => {
        "[aeiouy][aeiou]*"
    };
}
156
/// Concatenates `strs` in order into one newly allocated `String`.
///
/// Returns an empty string for an empty slice.
#[inline]
fn concat_string(strs: &[&str]) -> String {
    // `concat` sums the piece lengths first and allocates exactly once.
    strs.concat()
}
161
162impl Stemmer {
163 fn new() -> Self {
164 let mgr0 = concat!("^(", CS!(), ")?", VS!(), CS!());
165 let meq1 = concat!("^(", CS!(), ")?", VS!(), CS!(), "(", VS!(), ")?$");
166 let mgr1 = concat!("^(", CS!(), ")?", VS!(), CS!(), VS!(), CS!());
167 let s_v = concat!("^(", CS!(), ")?", V!());
168
169 let re_mgr0 = Regex::new(mgr0).unwrap();
170 let re_mgr1 = Regex::new(mgr1).unwrap();
171 let re_meq1 = Regex::new(meq1).unwrap();
172 let re_s_v = Regex::new(s_v).unwrap();
173
174 let re_1a = Regex::new("^(.+?)(ss|i)es$").unwrap();
175 let re2_1a = Regex::new("^(.+?)([^s])s$").unwrap();
176 let re_1b = Regex::new("^(.+?)eed$").unwrap();
177 let re2_1b = Regex::new("^(.+?)(ed|ing)$").unwrap();
178 let re2_1b_2 = Regex::new("(at|bl|iz)$").unwrap();
179 let re3_1b_2 = Regex::new("([^aeiouylsz]{2})$").unwrap();
180 let re4_1b_2 = Regex::new(concat!("^", CS!(), V!(), "[^aeiouwxy]$")).unwrap();
181
182 let re_1c = Regex::new("^(.+?[^aeiou])y$").unwrap();
183 let re_2 = Regex::new(
184 "^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|\
185 ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$",
186 )
187 .unwrap();
188
189 let re_3 = Regex::new("^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$").unwrap();
190
191 let re_4 = Regex::new(
192 "^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$",
193 )
194 .unwrap();
195 let re2_4 = Regex::new("^(.+?)(s|t)(ion)$").unwrap();
196
197 let re_5 = Regex::new("^(.+?)e$").unwrap();
198 let re3_5 = Regex::new(concat!("^", CS!(), V!(), "[^aeiouwxy]$")).unwrap();
199
200 Stemmer {
201 re_mgr0,
202 re_mgr1,
203 re_meq1,
204 re_s_v,
205 re_1a,
206 re2_1a,
207 re_1b,
208 re2_1b,
209 re2_1b_2,
210 re3_1b_2,
211 re4_1b_2,
212 re_1c,
213 re_2,
214 re_3,
215 re_4,
216 re2_4,
217 re_5,
218 re3_5,
219 }
220 }
221
222 /// Implements the Porter stemming algorithm
223 pub fn stem(&self, mut w: String) -> String {
224 if w.len() < 3 {
225 return w;
226 }
227
228 let starts_with_y = w.as_bytes()[0] == b'y';
229 if starts_with_y {
230 w.remove(0);
231 w.insert(0, 'Y');
232 }
233
234 // TODO: There's probably a better way to handle the
235 // borrowchecker than cloning w a million times
236
237 // Step 1a
238 if let Some(caps) = self.re_1a.captures(&w.clone()) {
239 w = concat_string(&[&caps[1], &caps[2]]);
240 }
241 if let Some(caps) = self.re2_1a.captures(&w.clone()) {
242 w = concat_string(&[&caps[1], &caps[2]]);
243 }
244
245 // Step 1b
246 if let Some(caps) = self.re_1b.captures(&w.clone()) {
247 let stem = &caps[1];
248 if self.re_mgr0.is_match(stem) {
249 w.pop();
250 }
251 } else if let Some(caps) = self.re2_1b.captures(&w.clone()) {
252 let stem = &caps[1];
253 if self.re_s_v.is_match(stem) {
254 w = stem.into();
255
256 let mut re3_1b_2_matched = false;
257
258 if self.re2_1b_2.is_match(&w) {
259 w.push('e');
260 } else if let Some(m) = self.re3_1b_2.find(&w.clone()) {
261 let mut suffix = m.as_str().chars();
262 // Make sure the two characters are the same since we can't use backreferences
263 if suffix.next() == suffix.next() {
264 re3_1b_2_matched = true;
265 w.pop();
266 }
267 }
268
269 // re4_1b_2 still runs if re3_1b_2 matches but
270 // the matched chcaracters are not the same
271 if !re3_1b_2_matched && self.re4_1b_2.is_match(&w) {
272 w.push('e');
273 }
274 }
275 }
276
277 // Step 1c - replace suffix y or Y by i if preceded by a non-vowel which is not the first
278 // letter of the word (so cry -> cri, by -> by, say -> say)
279 if let Some(caps) = self.re_1c.captures(&w.clone()) {
280 let stem = &caps[1];
281 w = concat_string(&[stem, "i"]);
282 }
283
284 // Step 2
285 if let Some(caps) = self.re_2.captures(&w.clone()) {
286 let stem = &caps[1];
287 let suffix = &caps[2];
288 if self.re_mgr0.is_match(stem) {
289 w = concat_string(&[stem, STEP_2.iter().find(|&&(k, _)| k == suffix).unwrap().1]);
290 }
291 }
292
293 // Step 3
294 if let Some(caps) = self.re_3.captures(&w.clone()) {
295 let stem = &caps[1];
296 let suffix = &caps[2];
297 if self.re_mgr0.is_match(stem) {
298 w = concat_string(&[stem, STEP_3.iter().find(|&&(k, _)| k == suffix).unwrap().1]);
299 }
300 }
301
302 // Step 4
303 if let Some(caps) = self.re_4.captures(&w.clone()) {
304 let stem = &caps[1];
305 if self.re_mgr1.is_match(stem) {
306 w = stem.into();
307 }
308 } else if let Some(caps) = self.re2_4.captures(&w.clone()) {
309 let stem = concat_string(&[&caps[1], &caps[2]]);
310 if self.re_mgr1.is_match(&stem) {
311 w = stem;
312 }
313 }
314
315 // Step 5
316 if let Some(caps) = self.re_5.captures(&w.clone()) {
317 let stem = &caps[1];
318 if self.re_mgr1.is_match(stem)
319 || (self.re_meq1.is_match(stem) && !(self.re3_5.is_match(stem)))
320 {
321 w = stem.into();
322 }
323 }
324
325 if w.ends_with("ll") && self.re_mgr1.is_match(&w) {
326 w.pop();
327 }
328
329 // replace the original 'y'
330 if starts_with_y {
331 w.remove(0);
332 w.insert(0, 'y');
333 }
334
335 w
336 }
337}
338
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs the pipeline function `$func` on `$input` and asserts that it
    /// returns `Some($output)`.
    macro_rules! pipeline_eq {
        ($func:expr, $input:expr, $output:expr) => {
            assert_eq!(&$func($input.to_string()).unwrap(), $output);
        };
    }

    #[test]
    fn latin_characters() {
        pipeline_eq!(trimmer, "hello", "hello");
    }

    #[test]
    fn removing_punctuation() {
        // (raw token, expected token after trimming)
        let cases = [
            ("hello.", "hello"),
            ("it's", "it's"),
            ("james'", "james"),
            ("stop!", "stop"),
            ("first,", "first"),
            ("", ""),
            ("[tag]", "tag"),
            ("[[[tag]]]", "tag"),
            ("[[!@#@!hello]]]}}}", "hello"),
            ("~!@@@hello***()()()]]", "hello"),
        ];

        for &(raw, trimmed) in cases.iter() {
            pipeline_eq!(trimmer, raw, trimmed);
        }
    }

    #[test]
    fn test_stemmer() {
        let stemmer = Stemmer::new();

        // (word, expected stem)
        let cases = [
            ("consign", "consign"),
            ("consigned", "consign"),
            ("consigning", "consign"),
            ("consignment", "consign"),
            ("consist", "consist"),
            ("consisted", "consist"),
            ("consistency", "consist"),
            ("consistent", "consist"),
            ("consistently", "consist"),
            ("consisting", "consist"),
            ("consists", "consist"),
            ("consolation", "consol"),
            ("consolations", "consol"),
            ("consolatory", "consolatori"),
            ("console", "consol"),
            ("consoled", "consol"),
            ("consoles", "consol"),
            ("consolidate", "consolid"),
            ("consolidated", "consolid"),
            ("consolidating", "consolid"),
            ("consoling", "consol"),
            ("consols", "consol"),
            ("consonant", "conson"),
            ("consort", "consort"),
            ("consorted", "consort"),
            ("consorting", "consort"),
            ("conspicuous", "conspicu"),
            ("conspicuously", "conspicu"),
            ("conspiracy", "conspiraci"),
            ("conspirator", "conspir"),
            ("conspirators", "conspir"),
            ("conspire", "conspir"),
            ("conspired", "conspir"),
            ("conspiring", "conspir"),
            ("constable", "constabl"),
            ("constables", "constabl"),
            ("constance", "constanc"),
            ("constancy", "constanc"),
            ("constant", "constant"),
            ("knack", "knack"),
            ("knackeries", "knackeri"),
            ("knacks", "knack"),
            ("knag", "knag"),
            ("knave", "knave"),
            ("knaves", "knave"),
            ("knavish", "knavish"),
            ("kneaded", "knead"),
            ("kneading", "knead"),
            ("knee", "knee"),
            ("kneel", "kneel"),
            ("kneeled", "kneel"),
            ("kneeling", "kneel"),
            ("kneels", "kneel"),
            ("knees", "knee"),
            ("knell", "knell"),
            ("knelt", "knelt"),
            ("knew", "knew"),
            ("knick", "knick"),
            ("knif", "knif"),
            ("knife", "knife"),
            ("knight", "knight"),
            ("knights", "knight"),
            ("knit", "knit"),
            ("knits", "knit"),
            ("knitted", "knit"),
            ("knitting", "knit"),
            ("knives", "knive"),
            ("knob", "knob"),
            ("knobs", "knob"),
            ("knock", "knock"),
            ("knocked", "knock"),
            ("knocker", "knocker"),
            ("knockers", "knocker"),
            ("knocking", "knock"),
            ("knocks", "knock"),
            ("knopp", "knopp"),
            ("knot", "knot"),
            ("knots", "knot"),
            ("lay", "lay"),
            ("try", "tri"),
        ];

        for &(word, expected) in cases.iter() {
            let stemmed = stemmer.stem(word.into());
            assert_eq!(&stemmed, expected);
        }
    }
}
459