1use crate::pipeline::PipelineFn;
2use regex::Regex;
3use std::collections::HashSet;
4
/// Pipeline stage that discards tokens found in a fixed set of stop words.
#[derive(Clone)]
pub struct StopWordFilter {
    /// Identifier handed out by the `PipelineFn::name` implementation.
    name: String,
    /// Owned copies of the words to reject; membership is an exact string match.
    stop_words: HashSet<String>,
}

impl StopWordFilter {
    /// Creates a filter called `name` that rejects every entry of `stop_words`.
    ///
    /// The words are copied into an owned `HashSet`, so duplicate entries in
    /// the slice collapse into a single element and an empty slice yields a
    /// filter that rejects nothing.
    pub fn new(name: &str, stop_words: &[&str]) -> Self {
        let name = name.to_owned();
        let stop_words: HashSet<String> = stop_words.iter().copied().map(String::from).collect();
        Self { name, stop_words }
    }
}
19
20impl PipelineFn for StopWordFilter {
21 fn name(&self) -> String {
22 self.name.clone()
23 }
24
25 fn filter(&self, token: String) -> Option<String> {
26 if self.stop_words.contains(&token) {
27 None
28 } else {
29 Some(token)
30 }
31 }
32}
33
34#[derive(Clone)]
35pub struct RegexTrimmer {
36 name: String,
37 trimmer: Regex,
38}
39
40impl RegexTrimmer {
41 pub fn new(name: &str, word_chars: &str) -> Self {
42 let name: String = name.into();
43 let trimmer: Regex = Regex::new(&format!("^[^{0}]+|[^{0}]+$", word_chars)).unwrap();
44 Self { name, trimmer }
45 }
46}
47
48impl PipelineFn for RegexTrimmer {
49 fn name(&self) -> String {
50 self.name.clone()
51 }
52
53 fn filter(&self, token: String) -> Option<String> {
54 let result: Cow<'_, str> = self.trimmer.replace_all(&token, rep:"");
55 if result.is_empty() {
56 None
57 } else if result == token {
58 Some(token)
59 } else {
60 Some(result.into())
61 }
62 }
63}
64
/// Pipeline stage that stems tokens with a `rust_stemmers::Stemmer`.
///
/// Only compiled when the `rust-stemmers` feature is enabled.
#[cfg(feature = "rust-stemmers")]
pub struct RustStemmer {
    /// Identifier handed out by the `PipelineFn::name` implementation.
    name: String,
    /// Stemmer instance created for the algorithm chosen at construction.
    stemmer: rust_stemmers::Stemmer,
}

#[cfg(feature = "rust-stemmers")]
impl RustStemmer {
    /// Builds a stemming stage called `name` that uses the algorithm `algo`.
    pub fn new(name: &str, algo: rust_stemmers::Algorithm) -> Self {
        let name = name.to_owned();
        let stemmer = rust_stemmers::Stemmer::create(algo);
        Self { name, stemmer }
    }
}
80
#[cfg(feature = "rust-stemmers")]
impl PipelineFn for RustStemmer {
    /// Returns an owned copy of the name this stemmer was constructed with.
    fn name(&self) -> String {
        self.name.to_owned()
    }

    /// Stems `token` with the configured stemmer.
    ///
    /// Returns `None` when the stem is empty, the original `String` when the
    /// stem equals the input, and a new `String` holding the stem otherwise.
    fn filter(&self, token: String) -> Option<String> {
        let stemmed = self.stemmer.stem(&token);
        if stemmed.is_empty() {
            return None;
        }
        if stemmed == token {
            // Unchanged by stemming: hand back the caller's allocation as-is.
            return Some(token);
        }
        Some(stemmed.into())
    }
}
98