1 | use super::{common::StopWordFilter, Language}; |
2 | use crate::pipeline::{FnWrapper, Pipeline, PipelineFn}; |
3 | use regex::Regex; |
4 | |
/// English stop words handed to the `stopWordFilter` pipeline stage.
///
/// The list is kept in ascending order, one row per initial letter; the
/// leading empty string is part of the list as well.
#[rustfmt::skip]
const WORDS: &[&str] = &[
    "",
    "a", "able", "about", "across", "after", "all", "almost", "also", "am", "among", "an", "and",
    "any", "are", "as", "at",
    "be", "because", "been", "but", "by",
    "can", "cannot", "could",
    "dear", "did", "do", "does",
    "either", "else", "ever", "every",
    "for", "from",
    "get", "got",
    "had", "has", "have", "he", "her", "hers", "him", "his", "how", "however",
    "i", "if", "in", "into", "is", "it", "its",
    "just",
    "least", "let", "like", "likely",
    "may", "me", "might", "most", "must", "my",
    "neither", "no", "nor", "not",
    "of", "off", "often", "on", "only", "or", "other", "our", "own",
    "rather",
    "said", "say", "says", "she", "should", "since", "so", "some",
    "than", "that", "the", "their", "them", "then", "there", "these", "they", "this", "tis", "to",
    "too", "twas",
    "us",
    "wants", "was", "we", "were", "what", "when", "where", "which", "while", "who", "whom", "why",
    "will", "with", "would",
    "yet", "you", "your",
];
17 | |
18 | #[derive (Clone)] |
19 | pub struct English { |
20 | stemmer: Stemmer, |
21 | } |
22 | |
23 | impl English { |
24 | pub fn new() -> Self { |
25 | let stemmer: Stemmer = Stemmer::new(); |
26 | Self { stemmer } |
27 | } |
28 | } |
29 | |
30 | impl Language for English { |
31 | fn name(&self) -> String { |
32 | "English" .into() |
33 | } |
34 | fn code(&self) -> String { |
35 | "en" .into() |
36 | } |
37 | |
38 | fn tokenize(&self, text: &str) -> Vec<String> { |
39 | super::tokenize_whitespace(text) |
40 | } |
41 | |
42 | fn make_pipeline(&self) -> Pipeline { |
43 | Pipeline { |
44 | queue: vec![ |
45 | Box::new(FnWrapper("trimmer" .into(), trimmer)), |
46 | Box::new(StopWordFilter::new("stopWordFilter" , WORDS)), |
47 | Box::new(self.stemmer.clone()), |
48 | ], |
49 | } |
50 | } |
51 | } |
52 | |
/// Strips leading and trailing characters that are neither base-36 digits
/// (ASCII letters and digits) nor underscores.
///
/// Characters inside the token are left untouched, so `"it's"` keeps its
/// apostrophe. The result is always `Some`; a token made up entirely of
/// trimmed characters comes back as an empty string rather than being dropped.
fn trimmer(token: String) -> Option<String> {
    // `is_digit(36)` accepts exactly `0-9`, `a-z` and `A-Z`.
    let trimmed = token.trim_matches(|c: char| !c.is_digit(36) && c != '_');
    Some(trimmed.to_owned())
}
60 | |
/// Suffix rewrites applied in step 2 of the stemmer, as `(suffix, replacement)` pairs.
///
/// The keys are exactly the alternatives of the step 2 regex; the stemmer
/// looks up the suffix the regex captured and appends the replacement to the stem.
static STEP_2: &[(&str, &str)] = &[
    ("ational", "ate"),
    ("tional", "tion"),
    ("enci", "ence"),
    ("anci", "ance"),
    ("izer", "ize"),
    ("bli", "ble"),
    ("alli", "al"),
    ("entli", "ent"),
    ("eli", "e"),
    ("ousli", "ous"),
    ("ization", "ize"),
    ("ation", "ate"),
    ("ator", "ate"),
    ("alism", "al"),
    ("iveness", "ive"),
    ("fulness", "ful"),
    ("ousness", "ous"),
    ("aliti", "al"),
    ("iviti", "ive"),
    ("biliti", "ble"),
    ("logi", "log"),
];
84 | |
/// Suffix rewrites applied in step 3 of the stemmer, as `(suffix, replacement)` pairs.
///
/// The keys are exactly the alternatives of the step 3 regex; an empty
/// replacement means the suffix is simply removed.
static STEP_3: &[(&str, &str)] = &[
    ("icate", "ic"),
    ("ative", ""),
    ("alize", "al"),
    ("iciti", "ic"),
    ("ical", "ic"),
    ("ful", ""),
    ("ness", ""),
];
94 | |
// This is a direct port of the stemmer from elasticlunr.js.
// It is neither very efficient nor very idiomatic Rust, but it
// generates identical output.
98 | |
99 | #[derive (Clone)] |
100 | struct Stemmer { |
101 | re_mgr0: Regex, |
102 | re_mgr1: Regex, |
103 | re_meq1: Regex, |
104 | re_s_v: Regex, |
105 | |
106 | re_1a: Regex, |
107 | re2_1a: Regex, |
108 | re_1b: Regex, |
109 | re2_1b: Regex, |
110 | re2_1b_2: Regex, |
111 | re3_1b_2: Regex, |
112 | re4_1b_2: Regex, |
113 | |
114 | re_1c: Regex, |
115 | re_2: Regex, |
116 | |
117 | re_3: Regex, |
118 | |
119 | re_4: Regex, |
120 | re2_4: Regex, |
121 | |
122 | re_5: Regex, |
123 | re3_5: Regex, |
124 | } |
125 | |
126 | impl PipelineFn for Stemmer { |
127 | fn name(&self) -> String { |
128 | "stemmer" .into() |
129 | } |
130 | |
131 | fn filter(&self, token: String) -> Option<String> { |
132 | Some(self.stem(token)) |
133 | } |
134 | } |
135 | |
// Regex fragment for a single vowel; `y` counts as a vowel here.
// Expands to a string literal so it can be used inside `concat!`.
macro_rules! V {
    () => (
        "[aeiouy]"
    );
}
142 | |
// Regex fragment for a consonant sequence: one character that is not
// `a`, `e`, `i`, `o` or `u`, followed by any number of characters that are
// not vowels and not `y`.
// Expands to a string literal so it can be used inside `concat!`.
macro_rules! CS {
    () => (
        "[^aeiou][^aeiouy]*"
    );
}
149 | |
// Regex fragment for a vowel sequence: one vowel (`y` included) followed by
// any number of `a`, `e`, `i`, `o` or `u`.
// Expands to a string literal so it can be used inside `concat!`.
macro_rules! VS {
    () => (
        "[aeiouy][aeiou]*"
    );
}
156 | |
/// Joins the pieces in `strs`, in order and without a separator, into one
/// newly allocated `String`. An empty slice yields an empty string.
#[inline]
fn concat_string(strs: &[&str]) -> String {
    strs.concat()
}
161 | |
162 | impl Stemmer { |
163 | fn new() -> Self { |
164 | let mgr0 = concat!("^(" , CS!(), ")?" , VS!(), CS!()); |
165 | let meq1 = concat!("^(" , CS!(), ")?" , VS!(), CS!(), "(" , VS!(), ")?$" ); |
166 | let mgr1 = concat!("^(" , CS!(), ")?" , VS!(), CS!(), VS!(), CS!()); |
167 | let s_v = concat!("^(" , CS!(), ")?" , V!()); |
168 | |
169 | let re_mgr0 = Regex::new(mgr0).unwrap(); |
170 | let re_mgr1 = Regex::new(mgr1).unwrap(); |
171 | let re_meq1 = Regex::new(meq1).unwrap(); |
172 | let re_s_v = Regex::new(s_v).unwrap(); |
173 | |
174 | let re_1a = Regex::new("^(.+?)(ss|i)es$" ).unwrap(); |
175 | let re2_1a = Regex::new("^(.+?)([^s])s$" ).unwrap(); |
176 | let re_1b = Regex::new("^(.+?)eed$" ).unwrap(); |
177 | let re2_1b = Regex::new("^(.+?)(ed|ing)$" ).unwrap(); |
178 | let re2_1b_2 = Regex::new("(at|bl|iz)$" ).unwrap(); |
179 | let re3_1b_2 = Regex::new("([^aeiouylsz]{2})$" ).unwrap(); |
180 | let re4_1b_2 = Regex::new(concat!("^" , CS!(), V!(), "[^aeiouwxy]$" )).unwrap(); |
181 | |
182 | let re_1c = Regex::new("^(.+?[^aeiou])y$" ).unwrap(); |
183 | let re_2 = Regex::new( |
184 | "^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|\ |
185 | ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$" , |
186 | ) |
187 | .unwrap(); |
188 | |
189 | let re_3 = Regex::new("^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$" ).unwrap(); |
190 | |
191 | let re_4 = Regex::new( |
192 | "^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$" , |
193 | ) |
194 | .unwrap(); |
195 | let re2_4 = Regex::new("^(.+?)(s|t)(ion)$" ).unwrap(); |
196 | |
197 | let re_5 = Regex::new("^(.+?)e$" ).unwrap(); |
198 | let re3_5 = Regex::new(concat!("^" , CS!(), V!(), "[^aeiouwxy]$" )).unwrap(); |
199 | |
200 | Stemmer { |
201 | re_mgr0, |
202 | re_mgr1, |
203 | re_meq1, |
204 | re_s_v, |
205 | re_1a, |
206 | re2_1a, |
207 | re_1b, |
208 | re2_1b, |
209 | re2_1b_2, |
210 | re3_1b_2, |
211 | re4_1b_2, |
212 | re_1c, |
213 | re_2, |
214 | re_3, |
215 | re_4, |
216 | re2_4, |
217 | re_5, |
218 | re3_5, |
219 | } |
220 | } |
221 | |
222 | /// Implements the Porter stemming algorithm |
223 | pub fn stem(&self, mut w: String) -> String { |
224 | if w.len() < 3 { |
225 | return w; |
226 | } |
227 | |
228 | let starts_with_y = w.as_bytes()[0] == b'y' ; |
229 | if starts_with_y { |
230 | w.remove(0); |
231 | w.insert(0, 'Y' ); |
232 | } |
233 | |
234 | // TODO: There's probably a better way to handle the |
235 | // borrowchecker than cloning w a million times |
236 | |
237 | // Step 1a |
238 | if let Some(caps) = self.re_1a.captures(&w.clone()) { |
239 | w = concat_string(&[&caps[1], &caps[2]]); |
240 | } |
241 | if let Some(caps) = self.re2_1a.captures(&w.clone()) { |
242 | w = concat_string(&[&caps[1], &caps[2]]); |
243 | } |
244 | |
245 | // Step 1b |
246 | if let Some(caps) = self.re_1b.captures(&w.clone()) { |
247 | let stem = &caps[1]; |
248 | if self.re_mgr0.is_match(stem) { |
249 | w.pop(); |
250 | } |
251 | } else if let Some(caps) = self.re2_1b.captures(&w.clone()) { |
252 | let stem = &caps[1]; |
253 | if self.re_s_v.is_match(stem) { |
254 | w = stem.into(); |
255 | |
256 | let mut re3_1b_2_matched = false; |
257 | |
258 | if self.re2_1b_2.is_match(&w) { |
259 | w.push('e' ); |
260 | } else if let Some(m) = self.re3_1b_2.find(&w.clone()) { |
261 | let mut suffix = m.as_str().chars(); |
262 | // Make sure the two characters are the same since we can't use backreferences |
263 | if suffix.next() == suffix.next() { |
264 | re3_1b_2_matched = true; |
265 | w.pop(); |
266 | } |
267 | } |
268 | |
269 | // re4_1b_2 still runs if re3_1b_2 matches but |
270 | // the matched chcaracters are not the same |
271 | if !re3_1b_2_matched && self.re4_1b_2.is_match(&w) { |
272 | w.push('e' ); |
273 | } |
274 | } |
275 | } |
276 | |
277 | // Step 1c - replace suffix y or Y by i if preceded by a non-vowel which is not the first |
278 | // letter of the word (so cry -> cri, by -> by, say -> say) |
279 | if let Some(caps) = self.re_1c.captures(&w.clone()) { |
280 | let stem = &caps[1]; |
281 | w = concat_string(&[stem, "i" ]); |
282 | } |
283 | |
284 | // Step 2 |
285 | if let Some(caps) = self.re_2.captures(&w.clone()) { |
286 | let stem = &caps[1]; |
287 | let suffix = &caps[2]; |
288 | if self.re_mgr0.is_match(stem) { |
289 | w = concat_string(&[stem, STEP_2.iter().find(|&&(k, _)| k == suffix).unwrap().1]); |
290 | } |
291 | } |
292 | |
293 | // Step 3 |
294 | if let Some(caps) = self.re_3.captures(&w.clone()) { |
295 | let stem = &caps[1]; |
296 | let suffix = &caps[2]; |
297 | if self.re_mgr0.is_match(stem) { |
298 | w = concat_string(&[stem, STEP_3.iter().find(|&&(k, _)| k == suffix).unwrap().1]); |
299 | } |
300 | } |
301 | |
302 | // Step 4 |
303 | if let Some(caps) = self.re_4.captures(&w.clone()) { |
304 | let stem = &caps[1]; |
305 | if self.re_mgr1.is_match(stem) { |
306 | w = stem.into(); |
307 | } |
308 | } else if let Some(caps) = self.re2_4.captures(&w.clone()) { |
309 | let stem = concat_string(&[&caps[1], &caps[2]]); |
310 | if self.re_mgr1.is_match(&stem) { |
311 | w = stem; |
312 | } |
313 | } |
314 | |
315 | // Step 5 |
316 | if let Some(caps) = self.re_5.captures(&w.clone()) { |
317 | let stem = &caps[1]; |
318 | if self.re_mgr1.is_match(stem) |
319 | || (self.re_meq1.is_match(stem) && !(self.re3_5.is_match(stem))) |
320 | { |
321 | w = stem.into(); |
322 | } |
323 | } |
324 | |
325 | if w.ends_with("ll" ) && self.re_mgr1.is_match(&w) { |
326 | w.pop(); |
327 | } |
328 | |
329 | // replace the original 'y' |
330 | if starts_with_y { |
331 | w.remove(0); |
332 | w.insert(0, 'y' ); |
333 | } |
334 | |
335 | w |
336 | } |
337 | } |
338 | |
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that the pipeline function `$func` keeps `$input` and turns it
    /// into `$output`, naming the input when the assertion fails.
    macro_rules! pipeline_eq {
        ($func:expr, $input:expr, $output:expr) => {
            assert_eq!(
                &$func($input.to_string()).unwrap(),
                $output,
                "unexpected result for input {:?}",
                $input
            );
        };
    }

    #[test]
    fn latin_characters() {
        pipeline_eq!(trimmer, "hello", "hello");
    }

    #[test]
    fn removing_punctuation() {
        pipeline_eq!(trimmer, "hello.", "hello");
        pipeline_eq!(trimmer, "it's", "it's");
        pipeline_eq!(trimmer, "james'", "james");
        pipeline_eq!(trimmer, "stop!", "stop");
        pipeline_eq!(trimmer, "first,", "first");
        pipeline_eq!(trimmer, "", "");
        pipeline_eq!(trimmer, "[tag]", "tag");
        pipeline_eq!(trimmer, "[[[tag]]]", "tag");
        pipeline_eq!(trimmer, "[[!@#@!hello]]]}}}", "hello");
        pipeline_eq!(trimmer, "~!@@@hello***()()()]]", "hello");
    }

    #[test]
    fn test_stemmer() {
        // (input word, expected stem)
        let cases = [
            ("consign", "consign"),
            ("consigned", "consign"),
            ("consigning", "consign"),
            ("consignment", "consign"),
            ("consist", "consist"),
            ("consisted", "consist"),
            ("consistency", "consist"),
            ("consistent", "consist"),
            ("consistently", "consist"),
            ("consisting", "consist"),
            ("consists", "consist"),
            ("consolation", "consol"),
            ("consolations", "consol"),
            ("consolatory", "consolatori"),
            ("console", "consol"),
            ("consoled", "consol"),
            ("consoles", "consol"),
            ("consolidate", "consolid"),
            ("consolidated", "consolid"),
            ("consolidating", "consolid"),
            ("consoling", "consol"),
            ("consols", "consol"),
            ("consonant", "conson"),
            ("consort", "consort"),
            ("consorted", "consort"),
            ("consorting", "consort"),
            ("conspicuous", "conspicu"),
            ("conspicuously", "conspicu"),
            ("conspiracy", "conspiraci"),
            ("conspirator", "conspir"),
            ("conspirators", "conspir"),
            ("conspire", "conspir"),
            ("conspired", "conspir"),
            ("conspiring", "conspir"),
            ("constable", "constabl"),
            ("constables", "constabl"),
            ("constance", "constanc"),
            ("constancy", "constanc"),
            ("constant", "constant"),
            ("knack", "knack"),
            ("knackeries", "knackeri"),
            ("knacks", "knack"),
            ("knag", "knag"),
            ("knave", "knave"),
            ("knaves", "knave"),
            ("knavish", "knavish"),
            ("kneaded", "knead"),
            ("kneading", "knead"),
            ("knee", "knee"),
            ("kneel", "kneel"),
            ("kneeled", "kneel"),
            ("kneeling", "kneel"),
            ("kneels", "kneel"),
            ("knees", "knee"),
            ("knell", "knell"),
            ("knelt", "knelt"),
            ("knew", "knew"),
            ("knick", "knick"),
            ("knif", "knif"),
            ("knife", "knife"),
            ("knight", "knight"),
            ("knights", "knight"),
            ("knit", "knit"),
            ("knits", "knit"),
            ("knitted", "knit"),
            ("knitting", "knit"),
            ("knives", "knive"),
            ("knob", "knob"),
            ("knobs", "knob"),
            ("knock", "knock"),
            ("knocked", "knock"),
            ("knocker", "knocker"),
            ("knockers", "knocker"),
            ("knocking", "knock"),
            ("knocks", "knock"),
            ("knopp", "knopp"),
            ("knot", "knot"),
            ("knots", "knot"),
            ("lay", "lay"),
            ("try", "tri"),
        ];

        let stemmer = Stemmer::new();
        for &(input, output) in cases.iter() {
            assert_eq!(
                &stemmer.stem(input.into()),
                output,
                "unexpected stem for {:?}",
                input
            );
        }
    }
}
459 | |