en.rs source code [crates/elasticlunr-rs/src/lang/en.rs]

1	use super::{common::StopWordFilter, Language};
2	use crate::pipeline::{FnWrapper, Pipeline, PipelineFn};
3	use regex::Regex;
4
/// English stop words handed to the `stopWordFilter` pipeline stage.
///
/// The list is kept in ascending alphabetical order. It starts with the empty
/// string — presumably so that tokens which end up empty are dropped by the
/// filter; confirm against `StopWordFilter`.
const WORDS: &[&str] = &[
    "",
    "a", "able", "about", "across", "after", "all", "almost", "also", "am", "among", "an", "and",
    "any", "are", "as", "at",
    "be", "because", "been", "but", "by",
    "can", "cannot", "could",
    "dear", "did", "do", "does",
    "either", "else", "ever", "every",
    "for", "from",
    "get", "got",
    "had", "has", "have", "he", "her", "hers", "him", "his", "how", "however",
    "i", "if", "in", "into", "is", "it", "its",
    "just",
    "least", "let", "like", "likely",
    "may", "me", "might", "most", "must", "my",
    "neither", "no", "nor", "not",
    "of", "off", "often", "on", "only", "or", "other", "our", "own",
    "rather",
    "said", "say", "says", "she", "should", "since", "so", "some",
    "than", "that", "the", "their", "them", "then", "there", "these", "they", "this", "tis", "to",
    "too", "twas",
    "us",
    "wants", "was", "we", "were", "what", "when", "where", "which", "while", "who", "whom", "why",
    "will", "with", "would",
    "yet", "you", "your",
];
17
18	#[derive(Clone)]
19	pub struct English {
20	stemmer: Stemmer,
21	}
22
23	impl English {
24	pub fn new() -> Self {
25	let stemmer: Stemmer = Stemmer::new();
26	Self { stemmer }
27	}
28	}
29
30	impl Language for English {
31	fn name(&self) -> String {
32	"English".into()
33	}
34	fn code(&self) -> String {
35	"en".into()
36	}
37
38	fn tokenize(&self, text: &str) -> Vec<String> {
39	super::tokenize_whitespace(text)
40	}
41
42	fn make_pipeline(&self) -> Pipeline {
43	Pipeline {
44	queue: vec![
45	Box::new(FnWrapper("trimmer".into(), trimmer)),
46	Box::new(StopWordFilter::new("stopWordFilter", WORDS)),
47	Box::new(self.stemmer.clone()),
48	],
49	}
50	}
51	}
52
/// Strips leading and trailing characters that are neither ASCII letters,
/// ASCII digits nor `_` from `token`.
///
/// Characters inside the token are left untouched (`"it's"` stays `"it's"`).
/// Always returns `Some`; a token made up solely of trimmed characters becomes
/// the empty string.
fn trimmer(token: String) -> Option<String> {
    let trimmed = token.trim_matches(|c: char| !c.is_ascii_alphanumeric() && c != '_');
    if trimmed.len() == token.len() {
        // Nothing was stripped: hand the original allocation back.
        return Some(token);
    }
    Some(trimmed.to_owned())
}
60
/// Step 2 suffix table: `(suffix, replacement)`.
///
/// The stemmer looks up the suffix captured by its step 2 regex here and
/// appends the replacement to the stem.
static STEP_2: &[(&str, &str)] = &[
    ("ational", "ate"),
    ("tional", "tion"),
    ("enci", "ence"),
    ("anci", "ance"),
    ("izer", "ize"),
    ("bli", "ble"),
    ("alli", "al"),
    ("entli", "ent"),
    ("eli", "e"),
    ("ousli", "ous"),
    ("ization", "ize"),
    ("ation", "ate"),
    ("ator", "ate"),
    ("alism", "al"),
    ("iveness", "ive"),
    ("fulness", "ful"),
    ("ousness", "ous"),
    ("aliti", "al"),
    ("iviti", "ive"),
    ("biliti", "ble"),
    ("logi", "log"),
];
84
/// Step 3 suffix table: `(suffix, replacement)`.
///
/// An empty replacement means the suffix is removed outright.
static STEP_3: &[(&str, &str)] = &[
    ("icate", "ic"),
    ("ative", ""),
    ("alize", "al"),
    ("iciti", "ic"),
    ("ical", "ic"),
    ("ful", ""),
    ("ness", ""),
];
94
// This is a direct port of the stemmer from elasticlunr.js.
// It is neither very efficient nor very idiomatic Rust, but it
// generates identical output.
98
99	#[derive(Clone)]
100	struct Stemmer {
101	re_mgr0: Regex,
102	re_mgr1: Regex,
103	re_meq1: Regex,
104	re_s_v: Regex,
105
106	re_1a: Regex,
107	re2_1a: Regex,
108	re_1b: Regex,
109	re2_1b: Regex,
110	re2_1b_2: Regex,
111	re3_1b_2: Regex,
112	re4_1b_2: Regex,
113
114	re_1c: Regex,
115	re_2: Regex,
116
117	re_3: Regex,
118
119	re_4: Regex,
120	re2_4: Regex,
121
122	re_5: Regex,
123	re3_5: Regex,
124	}
125
126	impl PipelineFn for Stemmer {
127	fn name(&self) -> String {
128	"stemmer".into()
129	}
130
131	fn filter(&self, token: String) -> Option<String> {
132	Some(self.stem(token))
133	}
134	}
135
// Regex fragment for a single vowel (`y` counts as one).
// Must expand to a bare string literal so it can be used inside `concat!`.
macro_rules! V {
    () => { "[aeiouy]" };
}
142
// Regex fragment for a consonant sequence: one non-vowel followed by any
// number of characters that are neither vowels nor `y`.
// Must expand to a bare string literal so it can be used inside `concat!`.
macro_rules! CS {
    () => { "[^aeiou][^aeiouy]*" };
}
149
// Regex fragment for a vowel sequence: one vowel (or `y`) followed by any
// number of plain vowels.
// Must expand to a bare string literal so it can be used inside `concat!`.
macro_rules! VS {
    () => { "[aeiouy][aeiou]*" };
}
156
/// Joins the given string slices, in order and without a separator, into one
/// newly allocated `String`.
#[inline]
fn concat_string(strs: &[&str]) -> String {
    strs.concat()
}
161
162	impl Stemmer {
163	fn new() -> Self {
164	let mgr0 = concat!("^(", CS!(), ")?", VS!(), CS!());
165	let meq1 = concat!("^(", CS!(), ")?", VS!(), CS!(), "(", VS!(), ")?$");
166	let mgr1 = concat!("^(", CS!(), ")?", VS!(), CS!(), VS!(), CS!());
167	let s_v = concat!("^(", CS!(), ")?", V!());
168
169	let re_mgr0 = Regex::new(mgr0).unwrap();
170	let re_mgr1 = Regex::new(mgr1).unwrap();
171	let re_meq1 = Regex::new(meq1).unwrap();
172	let re_s_v = Regex::new(s_v).unwrap();
173
174	let re_1a = Regex::new("^(.+?)(ss\|i)es$").unwrap();
175	let re2_1a = Regex::new("^(.+?)([^s])s$").unwrap();
176	let re_1b = Regex::new("^(.+?)eed$").unwrap();
177	let re2_1b = Regex::new("^(.+?)(ed\|ing)$").unwrap();
178	let re2_1b_2 = Regex::new("(at\|bl\|iz)$").unwrap();
179	let re3_1b_2 = Regex::new("([^aeiouylsz]{2})$").unwrap();
180	let re4_1b_2 = Regex::new(concat!("^", CS!(), V!(), "[^aeiouwxy]$")).unwrap();
181
182	let re_1c = Regex::new("^(.+?[^aeiou])y$").unwrap();
183	let re_2 = Regex::new(
184	"^(.+?)(ational\|tional\|enci\|anci\|izer\|bli\|alli\|entli\|eli\|ousli\|\
185	ization\|ation\|ator\|alism\|iveness\|fulness\|ousness\|aliti\|iviti\|biliti\|logi)$",
186	)
187	.unwrap();
188
189	let re_3 = Regex::new("^(.+?)(icate\|ative\|alize\|iciti\|ical\|ful\|ness)$").unwrap();
190
191	let re_4 = Regex::new(
192	"^(.+?)(al\|ance\|ence\|er\|ic\|able\|ible\|ant\|ement\|ment\|ent\|ou\|ism\|ate\|iti\|ous\|ive\|ize)$",
193	)
194	.unwrap();
195	let re2_4 = Regex::new("^(.+?)(s\|t)(ion)$").unwrap();
196
197	let re_5 = Regex::new("^(.+?)e$").unwrap();
198	let re3_5 = Regex::new(concat!("^", CS!(), V!(), "[^aeiouwxy]$")).unwrap();
199
200	Stemmer {
201	re_mgr0,
202	re_mgr1,
203	re_meq1,
204	re_s_v,
205	re_1a,
206	re2_1a,
207	re_1b,
208	re2_1b,
209	re2_1b_2,
210	re3_1b_2,
211	re4_1b_2,
212	re_1c,
213	re_2,
214	re_3,
215	re_4,
216	re2_4,
217	re_5,
218	re3_5,
219	}
220	}
221
222	/// Implements the Porter stemming algorithm
223	pub fn stem(&self, mut w: String) -> String {
224	if w.len() < `3` {
225	return w;
226	}
227
228	let starts_with_y = w.as_bytes()[`0`] == b'y';
229	if starts_with_y {
230	w.remove(`0`);
231	w.insert(`0`, 'Y');
232	}
233
234	// TODO: There's probably a better way to handle the
235	// borrowchecker than cloning w a million times
236
237	// Step 1a
238	if let Some(caps) = self.re_1a.captures(&w.clone()) {
239	w = concat_string(&[&caps[`1`], &caps[`2`]]);
240	}
241	if let Some(caps) = self.re2_1a.captures(&w.clone()) {
242	w = concat_string(&[&caps[`1`], &caps[`2`]]);
243	}
244
245	// Step 1b
246	if let Some(caps) = self.re_1b.captures(&w.clone()) {
247	let stem = &caps[`1`];
248	if self.re_mgr0.is_match(stem) {
249	w.pop();
250	}
251	} else if let Some(caps) = self.re2_1b.captures(&w.clone()) {
252	let stem = &caps[`1`];
253	if self.re_s_v.is_match(stem) {
254	w = stem.into();
255
256	let mut re3_1b_2_matched = `false`;
257
258	if self.re2_1b_2.is_match(&w) {
259	w.push('e');
260	} else if let Some(m) = self.re3_1b_2.find(&w.clone()) {
261	let mut suffix = m.as_str().chars();
262	// Make sure the two characters are the same since we can't use backreferences
263	if suffix.next() == suffix.next() {
264	re3_1b_2_matched = `true`;
265	w.pop();
266	}
267	}
268
269	// re4_1b_2 still runs if re3_1b_2 matches but
270	// the matched chcaracters are not the same
271	if !re3_1b_2_matched && self.re4_1b_2.is_match(&w) {
272	w.push('e');
273	}
274	}
275	}
276
277	// Step 1c - replace suffix y or Y by i if preceded by a non-vowel which is not the first
278	// letter of the word (so cry -> cri, by -> by, say -> say)
279	if let Some(caps) = self.re_1c.captures(&w.clone()) {
280	let stem = &caps[`1`];
281	w = concat_string(&[stem, "i"]);
282	}
283
284	// Step 2
285	if let Some(caps) = self.re_2.captures(&w.clone()) {
286	let stem = &caps[`1`];
287	let suffix = &caps[`2`];
288	if self.re_mgr0.is_match(stem) {
289	w = concat_string(&[stem, STEP_2.iter().find(\|&&(k, _)\| k == suffix).unwrap().1]);
290	}
291	}
292
293	// Step 3
294	if let Some(caps) = self.re_3.captures(&w.clone()) {
295	let stem = &caps[`1`];
296	let suffix = &caps[`2`];
297	if self.re_mgr0.is_match(stem) {
298	w = concat_string(&[stem, STEP_3.iter().find(\|&&(k, _)\| k == suffix).unwrap().1]);
299	}
300	}
301
302	// Step 4
303	if let Some(caps) = self.re_4.captures(&w.clone()) {
304	let stem = &caps[`1`];
305	if self.re_mgr1.is_match(stem) {
306	w = stem.into();
307	}
308	} else if let Some(caps) = self.re2_4.captures(&w.clone()) {
309	let stem = concat_string(&[&caps[`1`], &caps[`2`]]);
310	if self.re_mgr1.is_match(&stem) {
311	w = stem;
312	}
313	}
314
315	// Step 5
316	if let Some(caps) = self.re_5.captures(&w.clone()) {
317	let stem = &caps[`1`];
318	if self.re_mgr1.is_match(stem)
319	\|\| (self.re_meq1.is_match(stem) && !(self.re3_5.is_match(stem)))
320	{
321	w = stem.into();
322	}
323	}
324
325	if w.ends_with("ll") && self.re_mgr1.is_match(&w) {
326	w.pop();
327	}
328
329	// replace the original 'y'
330	if starts_with_y {
331	w.remove(`0`);
332	w.insert(`0`, 'y');
333	}
334
335	w
336	}
337	}
338
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs `trimmer` on `input` and checks that it yields `expected`.
    fn assert_trimmed(input: &str, expected: &str) {
        assert_eq!(&trimmer(input.to_string()).unwrap(), expected);
    }

    #[test]
    fn latin_characters() {
        assert_trimmed("hello", "hello");
    }

    #[test]
    fn removing_punctuation() {
        let cases = [
            ("hello.", "hello"),
            ("it's", "it's"),
            ("james'", "james"),
            ("stop!", "stop"),
            ("first,", "first"),
            ("", ""),
            ("[tag]", "tag"),
            ("[[[tag]]]", "tag"),
            ("[[!@#@!hello]]]}}}", "hello"),
            ("~!@@@hello***()()()]]", "hello"),
        ];

        for &(input, expected) in cases.iter() {
            assert_trimmed(input, expected);
        }
    }

    /// `(word, expected stem)` pairs checked by `test_stemmer`.
    const STEMMER_CASES: &[(&str, &str)] = &[
        ("consign", "consign"),
        ("consigned", "consign"),
        ("consigning", "consign"),
        ("consignment", "consign"),
        ("consist", "consist"),
        ("consisted", "consist"),
        ("consistency", "consist"),
        ("consistent", "consist"),
        ("consistently", "consist"),
        ("consisting", "consist"),
        ("consists", "consist"),
        ("consolation", "consol"),
        ("consolations", "consol"),
        ("consolatory", "consolatori"),
        ("console", "consol"),
        ("consoled", "consol"),
        ("consoles", "consol"),
        ("consolidate", "consolid"),
        ("consolidated", "consolid"),
        ("consolidating", "consolid"),
        ("consoling", "consol"),
        ("consols", "consol"),
        ("consonant", "conson"),
        ("consort", "consort"),
        ("consorted", "consort"),
        ("consorting", "consort"),
        ("conspicuous", "conspicu"),
        ("conspicuously", "conspicu"),
        ("conspiracy", "conspiraci"),
        ("conspirator", "conspir"),
        ("conspirators", "conspir"),
        ("conspire", "conspir"),
        ("conspired", "conspir"),
        ("conspiring", "conspir"),
        ("constable", "constabl"),
        ("constables", "constabl"),
        ("constance", "constanc"),
        ("constancy", "constanc"),
        ("constant", "constant"),
        ("knack", "knack"),
        ("knackeries", "knackeri"),
        ("knacks", "knack"),
        ("knag", "knag"),
        ("knave", "knave"),
        ("knaves", "knave"),
        ("knavish", "knavish"),
        ("kneaded", "knead"),
        ("kneading", "knead"),
        ("knee", "knee"),
        ("kneel", "kneel"),
        ("kneeled", "kneel"),
        ("kneeling", "kneel"),
        ("kneels", "kneel"),
        ("knees", "knee"),
        ("knell", "knell"),
        ("knelt", "knelt"),
        ("knew", "knew"),
        ("knick", "knick"),
        ("knif", "knif"),
        ("knife", "knife"),
        ("knight", "knight"),
        ("knights", "knight"),
        ("knit", "knit"),
        ("knits", "knit"),
        ("knitted", "knit"),
        ("knitting", "knit"),
        ("knives", "knive"),
        ("knob", "knob"),
        ("knobs", "knob"),
        ("knock", "knock"),
        ("knocked", "knock"),
        ("knocker", "knocker"),
        ("knockers", "knocker"),
        ("knocking", "knock"),
        ("knocks", "knock"),
        ("knopp", "knopp"),
        ("knot", "knot"),
        ("knots", "knot"),
        ("lay", "lay"),
        ("try", "tri"),
    ];

    #[test]
    fn test_stemmer() {
        let stemmer = Stemmer::new();

        for &(word, expected) in STEMMER_CASES.iter() {
            let stemmed = stemmer.stem(word.to_string());
            assert_eq!(&stemmed, expected, "stemming {:?}", word);
        }
    }
}
459