1 | //! Intended to be compatible with <https://github.com/MihaiValentin/lunr-languages>. Each supported |
2 | //! language has a trimmer, a stop word filter, and a stemmer. Most users will not need to use |
3 | //! these modules directly. |
4 | |
5 | pub mod common; |
6 | |
7 | use crate::Pipeline; |
8 | |
9 | pub trait Language { |
10 | /// The name of the language in English |
11 | fn name(&self) -> String; |
12 | |
13 | /// The ISO 639-1 language code of the language |
14 | fn code(&self) -> String; |
15 | |
16 | /// Separates the input text into individual tokens. In most languages a token is a word, separated by whitespace. |
17 | fn tokenize(&self, text: &str) -> Vec<String>; |
18 | |
19 | /// Returns the [`Pipeline`] to process the tokens with |
20 | fn make_pipeline(&self) -> Pipeline; |
21 | } |
22 | |
/// Splits a text string into a vector of individual, lowercased tokens.
///
/// Any Unicode whitespace character and the ASCII hyphen (`-`) act as
/// separators. Runs of separators, as well as leading or trailing ones, never
/// produce empty tokens, so an empty or separator-only input yields an empty
/// vector.
pub fn tokenize_whitespace(text: &str) -> Vec<String> {
    text.split(|c: char| c.is_whitespace() || c == '-')
        // Adjacent separators leave empty slices between them; drop those.
        .filter(|token| !token.is_empty())
        // No trimming is needed: every whitespace character is already a
        // split point, so a token can never contain one.
        .map(str::to_lowercase)
        .collect()
}
30 | |
// Generates the language registry from a list of
// `(TypeName, module_name [, #[cfg(...)]])` entries.
//
// For every entry this declares the private module, re-exports the language
// type from it, and wires the type into the `languages`, `from_code` and
// `from_name` lookup functions. When an entry carries an attribute, that
// attribute is attached to everything generated for the entry.
macro_rules! impl_language {
    ($( ( $lang:ident, $module:ident $(, #[$gate:meta] )? ), )+) => {
        $(
            $(#[$gate])?
            mod $module;

            $(#[$gate])?
            pub use $module::$lang;
        )+

        /// Returns one boxed instance of every [`Language`] implementation
        /// available in the crate.
        pub fn languages() -> Vec<Box<dyn Language>> {
            vec![
                $(
                    $(#[$gate])?
                    Box::new($module::$lang::new()),
                )+
            ]
        }

        /// Looks up a [`Language`] by its two-character [ISO 639-1][iso]
        /// language code. The lookup ignores ASCII case. Returns `None` when
        /// the language is not supported.
        ///
        /// *Note:*
        ///
        /// The ISO 639-1 code for Dutch is "nl". However "du" is used for the
        /// module name and pipeline suffix in order to match lunr-languages.
        ///
        /// [iso]: https://en.wikipedia.org/wiki/ISO_639-1
        pub fn from_code(code: &str) -> Option<Box<dyn Language>> {
            let wanted = code.to_ascii_lowercase();
            match wanted.as_str() {
                $(
                    $(#[$gate])?
                    stringify!($module) => Some(Box::new($module::$lang::new())),
                )+
                _ => None,
            }
        }

        /// Looks up a [`Language`] by its English name. Returns `None` when
        /// the language is not supported.
        ///
        /// The comparison is exact, so the first letter must be capitalized
        /// (for example `English`).
        pub fn from_name(name: &str) -> Option<Box<dyn Language>> {
            match name {
                $(
                    $(#[$gate])?
                    stringify!($lang) => Some(Box::new($module::$lang::new())),
                )+
                _ => None,
            }
        }
    };
}
84 | |
// Registry of supported languages. Each entry is
// `(TypeName, module_name [, attribute])`.
//
// English carries no attribute; every other entry is gated behind a cargo
// feature named after its module. Entry order is the order in which
// `languages()` lists the implementations, so do not reorder casually.
//
// Dutch is registered as `du` rather than its ISO 639-1 code `nl`, to match
// lunr-languages.
impl_language! {
    (English, en),
    (Arabic, ar, #[cfg(feature = "ar")]),
    (Chinese, zh, #[cfg(feature = "zh")]),
    (Danish, da, #[cfg(feature = "da")]),
    (Dutch, du, #[cfg(feature = "du")]),
    (Finnish, fi, #[cfg(feature = "fi")]),
    (French, fr, #[cfg(feature = "fr")]),
    (German, de, #[cfg(feature = "de")]),
    (Hungarian, hu, #[cfg(feature = "hu")]),
    (Italian, it, #[cfg(feature = "it")]),
    (Japanese, ja, #[cfg(feature = "ja")]),
    (Korean, ko, #[cfg(feature = "ko")]),
    (Norwegian, no, #[cfg(feature = "no")]),
    (Portuguese, pt, #[cfg(feature = "pt")]),
    (Romanian, ro, #[cfg(feature = "ro")]),
    (Russian, ru, #[cfg(feature = "ru")]),
    (Spanish, es, #[cfg(feature = "es")]),
    (Swedish, sv, #[cfg(feature = "sv")]),
    (Turkish, tr, #[cfg(feature = "tr")]),
}
106 | |
/// Unit tests for [`tokenize_whitespace`].
#[cfg(test)]
mod tests {
    use super::tokenize_whitespace;

    /// Plain words separated by single spaces come back unchanged.
    #[test]
    fn split_simple_strings() {
        let string = "this is a simple string";
        assert_eq!(
            &tokenize_whitespace(string),
            &["this", "is", "a", "simple", "string"]
        );
    }

    /// Leading, trailing and repeated whitespace (spaces, tabs, newlines)
    /// must not produce empty tokens.
    #[test]
    fn multiple_white_space() {
        let string = "   foo \t\n  bar   ";
        assert_eq!(&tokenize_whitespace(string), &["foo", "bar"]);
    }

    /// A hyphen joining two words splits them, and tokens are lowercased.
    #[test]
    fn hyphens() {
        let string = "take the New York-San Francisco flight";
        assert_eq!(
            &tokenize_whitespace(string),
            &["take", "the", "new", "york", "san", "francisco", "flight"]
        );
    }

    /// A free-standing hyphen surrounded by spaces disappears entirely.
    #[test]
    fn splitting_strings_with_hyphens() {
        let string = "Solve for A - B";
        assert_eq!(&tokenize_whitespace(string), &["solve", "for", "a", "b"]);
    }

    /// Input with nothing but separators (or nothing at all) yields no tokens.
    #[test]
    fn empty_and_separator_only_input() {
        assert!(tokenize_whitespace("").is_empty());
        assert!(tokenize_whitespace(" \t - \n").is_empty());
    }
}
141 | |