1 | use crate::pipeline::PipelineFn; |
2 | use regex::Regex; |
3 | use std::collections::HashSet; |
4 | |
/// Pipeline stage that discards tokens contained in a fixed set of stop words.
#[derive(Clone)]
pub struct StopWordFilter {
    /// Identifier handed back by this stage's `name()` method.
    name: String,
    /// Tokens that are dropped; membership is tested by exact string equality.
    stop_words: HashSet<String>,
}
10 | |
11 | impl StopWordFilter { |
12 | pub fn new(name: &str, stop_words: &[&str]) -> Self { |
13 | Self { |
14 | name: name.into(), |
15 | stop_words: stop_words.iter().map(|s: &&str| s.to_string()).collect(), |
16 | } |
17 | } |
18 | } |
19 | |
20 | impl PipelineFn for StopWordFilter { |
21 | fn name(&self) -> String { |
22 | self.name.clone() |
23 | } |
24 | |
25 | fn filter(&self, token: String) -> Option<String> { |
26 | if self.stop_words.contains(&token) { |
27 | None |
28 | } else { |
29 | Some(token) |
30 | } |
31 | } |
32 | } |
33 | |
34 | #[derive (Clone)] |
35 | pub struct RegexTrimmer { |
36 | name: String, |
37 | trimmer: Regex, |
38 | } |
39 | |
40 | impl RegexTrimmer { |
41 | pub fn new(name: &str, word_chars: &str) -> Self { |
42 | let name: String = name.into(); |
43 | let trimmer: Regex = Regex::new(&format!("^[^ {0}]+|[^ {0}]+$" , word_chars)).unwrap(); |
44 | Self { name, trimmer } |
45 | } |
46 | } |
47 | |
48 | impl PipelineFn for RegexTrimmer { |
49 | fn name(&self) -> String { |
50 | self.name.clone() |
51 | } |
52 | |
53 | fn filter(&self, token: String) -> Option<String> { |
54 | let result: Cow<'_, str> = self.trimmer.replace_all(&token, rep:"" ); |
55 | if result.is_empty() { |
56 | None |
57 | } else if result == token { |
58 | Some(token) |
59 | } else { |
60 | Some(result.into()) |
61 | } |
62 | } |
63 | } |
64 | |
/// Pipeline stage that wraps a `rust_stemmers::Stemmer`.
///
/// Only compiled when the `rust-stemmers` feature is enabled.
#[cfg(feature = "rust-stemmers")]
pub struct RustStemmer {
    /// Identifier handed back by this stage's `name()` method.
    name: String,
    /// Stemmer instance applied to each token.
    stemmer: rust_stemmers::Stemmer,
}
70 | |
#[cfg(feature = "rust-stemmers")]
impl RustStemmer {
    /// Creates a stage called `name` backed by a stemmer for `algo`.
    pub fn new(name: &str, algo: rust_stemmers::Algorithm) -> Self {
        let stemmer = rust_stemmers::Stemmer::create(algo);

        Self {
            name: name.to_owned(),
            stemmer,
        }
    }
}
80 | |
#[cfg(feature = "rust-stemmers")]
impl PipelineFn for RustStemmer {
    /// Returns an owned copy of the name this stage was constructed with.
    fn name(&self) -> String {
        self.name.to_owned()
    }

    /// Stems `token` with the wrapped stemmer.
    ///
    /// Returns `None` when the stem is empty, the original `String` when
    /// stemming left the token unchanged, and the stem as a new `String`
    /// otherwise.
    fn filter(&self, token: String) -> Option<String> {
        let stemmed = self.stemmer.stem(&token);

        if stemmed.is_empty() {
            return None;
        }
        if stemmed == token {
            // Unchanged: hand the caller's buffer straight back.
            return Some(token);
        }
        Some(String::from(stemmed))
    }
}
98 | |