1 | #[cfg (test)] |
2 | use strum_macros::EnumIter; |
3 | |
4 | use unicode_segmentation::{UnicodeSegmentation}; //, GraphemeCursor}; |
5 | |
/// Describes a condition under which a string is broken into separate words.
///
/// The delimiter boundaries `Hyphen`, `Underscore`, and `Space` remove the character
/// they match from the output. Every other boundary splits between two characters
/// and keeps both of them.
///
/// Helper constructors on this type return `Vec`s holding commonly used groups of
/// boundaries, and [`list_from`](Boundary::list_from) derives a list of boundaries
/// from the patterns found in a string slice.
///
/// Each variant is distinct; no variant duplicates the behavior of others. For
/// example, there is no `DigitLetter` variant, because it would be equivalent to
/// combining `DigitUpper` and `DigitLower`. When such a combination is needed, use
/// one of the functions that return a list of boundaries.
/// ```
/// use convert_case::{Boundary, Case, Casing, Converter};
///
/// assert_eq!(
///     "transformations_in_3d",
///     "TransformationsIn3D"
///         .from_case(Case::Camel)
///         .without_boundaries(&Boundary::digit_letter())
///         .to_case(Case::Snake)
/// );
///
/// let conv = Converter::new()
///     .set_boundaries(&Boundary::list_from("aA "))
///     .to_case(Case::Title);
/// assert_eq!("7empest By Tool", conv.convert("7empest byTool"));
/// ```
#[cfg_attr(test, derive(EnumIter))]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Boundary {
    /// Matches the `-` character, which is removed when splitting.
    /// ```
    /// use convert_case::Boundary;
    /// assert_eq!(
    ///     vec![Boundary::Hyphen],
    ///     Boundary::list_from("-")
    /// );
    /// ```
    Hyphen,

    /// Matches the `_` character, which is removed when splitting.
    /// ```
    /// use convert_case::Boundary;
    /// assert_eq!(
    ///     vec![Boundary::Underscore],
    ///     Boundary::list_from("_")
    /// );
    /// ```
    Underscore,

    /// Matches a single space character, which is removed when splitting.
    /// ```
    /// use convert_case::Boundary;
    /// assert_eq!(
    ///     vec![Boundary::Space],
    ///     Boundary::list_from(" ")
    /// );
    /// ```
    Space,

    /// Matches an uppercase letter immediately followed by a lowercase letter.
    /// Rarely useful, and therefore left out of the [defaults](Boundary::defaults).
    /// ```
    /// use convert_case::Boundary;
    /// assert_eq!(
    ///     vec![Boundary::UpperLower],
    ///     Boundary::list_from("Aa")
    /// );
    /// ```
    UpperLower,

    /// Matches a lowercase letter immediately followed by an uppercase letter.
    /// ```
    /// use convert_case::Boundary;
    /// assert_eq!(
    ///     vec![Boundary::LowerUpper],
    ///     Boundary::list_from("aA")
    /// );
    /// ```
    LowerUpper,

    /// Matches a digit immediately followed by an uppercase letter.
    /// ```
    /// use convert_case::Boundary;
    /// assert_eq!(
    ///     vec![Boundary::DigitUpper],
    ///     Boundary::list_from("1A")
    /// );
    /// ```
    DigitUpper,

    /// Matches an uppercase letter immediately followed by a digit.
    /// ```
    /// use convert_case::Boundary;
    /// assert_eq!(
    ///     vec![Boundary::UpperDigit],
    ///     Boundary::list_from("A1")
    /// );
    /// ```
    UpperDigit,

    /// Matches a digit immediately followed by a lowercase letter.
    /// ```
    /// use convert_case::Boundary;
    /// assert_eq!(
    ///     vec![Boundary::DigitLower],
    ///     Boundary::list_from("1a")
    /// );
    /// ```
    DigitLower,

    /// Matches a lowercase letter immediately followed by a digit.
    /// ```
    /// use convert_case::Boundary;
    /// assert_eq!(
    ///     vec![Boundary::LowerDigit],
    ///     Boundary::list_from("a1")
    /// );
    /// ```
    LowerDigit,

    /// Matches two uppercase letters followed by a lowercase letter; the split falls
    /// between the two uppercase letters. In "HTTPRequest" the pattern is found at
    /// "PRe", producing the words "HTTP" and "Request".
    /// ```
    /// use convert_case::Boundary;
    /// assert_eq!(
    ///     vec![Boundary::Acronym],
    ///     Boundary::list_from("AAa")
    /// );
    /// ```
    Acronym,
}
140 | |
141 | impl Boundary { |
142 | /// Returns a list of all boundaries that are identified within the given string. |
143 | /// Could be a short of writing out all the boundaries in a list directly. This will not |
144 | /// identify boundary `UpperLower` if it also used as part of `Acronym`. |
145 | /// |
146 | /// If you want to be very explicit and not overlap boundaries, it is recommended to use a colon |
147 | /// character. |
148 | /// ``` |
149 | /// use convert_case::Boundary; |
150 | /// use Boundary::*; |
151 | /// assert_eq!( |
152 | /// vec![Hyphen, Space, LowerUpper, UpperDigit, DigitLower], |
153 | /// Boundary::list_from("aA8a -" ) |
154 | /// ); |
155 | /// assert_eq!( |
156 | /// vec![Underscore, LowerUpper, DigitUpper, Acronym], |
157 | /// Boundary::list_from("bD:0B:_:AAa" ) |
158 | /// ); |
159 | /// ``` |
160 | pub fn list_from(s: &str) -> Vec<Self> { |
161 | Boundary::all().iter().filter(|boundary| { |
162 | let left_iter = s.graphemes(true); |
163 | let mid_iter = s.graphemes(true).skip(1); |
164 | let right_iter = s.graphemes(true).skip(2); |
165 | |
166 | let mut one_iter = left_iter.clone(); |
167 | |
168 | // Also capture when the previous pair was both uppercase, so we don't |
169 | // match the UpperLower boundary in the case of Acronym |
170 | let two_iter = left_iter.clone().zip(mid_iter.clone()); |
171 | let mut two_iter_and_upper = two_iter.clone() |
172 | .zip(std::iter::once(false).chain( |
173 | two_iter.map(|(a, b)| grapheme_is_uppercase(a) && grapheme_is_uppercase(b)) |
174 | )); |
175 | |
176 | let mut three_iter = left_iter.zip(mid_iter).zip(right_iter); |
177 | |
178 | one_iter.any(|a| boundary.detect_one(a)) |
179 | || two_iter_and_upper.any(|((a, b), is_acro)| boundary.detect_two(a, b) && !is_acro) |
180 | || three_iter.any(|((a, b), c)| boundary.detect_three(a, b, c)) |
181 | }).copied().collect() |
182 | } |
183 | |
184 | /// The default list of boundaries used when `Casing::to_case` is called directly |
185 | /// and in a `Converter` generated from `Converter::new()`. This includes |
186 | /// all the boundaries except the `UpperLower` boundary. |
187 | /// ``` |
188 | /// use convert_case::Boundary; |
189 | /// use Boundary::*; |
190 | /// assert_eq!( |
191 | /// vec![ |
192 | /// Underscore, Hyphen, Space, LowerUpper, UpperDigit, |
193 | /// DigitUpper, DigitLower, LowerDigit, Acronym, |
194 | /// ], |
195 | /// Boundary::defaults() |
196 | /// ); |
197 | /// ``` |
198 | pub fn defaults() -> Vec<Self> { |
199 | use Boundary::*; |
200 | vec![ |
201 | Underscore, Hyphen, Space, LowerUpper, UpperDigit, DigitUpper, DigitLower, LowerDigit, |
202 | Acronym, |
203 | ] |
204 | } |
205 | |
206 | /// Returns the boundaries that split around single characters: `Hyphen`, |
207 | /// `Underscore`, and `Space`. |
208 | /// ``` |
209 | /// use convert_case::Boundary; |
210 | /// use Boundary::*; |
211 | /// assert_eq!( |
212 | /// vec![Hyphen, Underscore, Space], |
213 | /// Boundary::delims() |
214 | /// ); |
215 | /// ``` |
216 | pub fn delims() -> Vec<Self> { |
217 | use Boundary::*; |
218 | vec![Hyphen, Underscore, Space] |
219 | } |
220 | |
221 | /// Returns the boundaries that involve digits: `DigitUpper`, `DigitLower`, `UpperDigit`, and |
222 | /// `LowerDigit`. |
223 | /// ``` |
224 | /// use convert_case::Boundary; |
225 | /// use Boundary::*; |
226 | /// assert_eq!( |
227 | /// vec![DigitUpper, UpperDigit, DigitLower, LowerDigit], |
228 | /// Boundary::digits() |
229 | /// ); |
230 | /// ``` |
231 | pub fn digits() -> Vec<Self> { |
232 | use Boundary::*; |
233 | vec![DigitUpper, UpperDigit, DigitLower, LowerDigit] |
234 | } |
235 | |
236 | /// Returns the boundaries that are letters followed by digits: `UpperDigit` and `LowerDigit`. |
237 | /// ``` |
238 | /// use convert_case::Boundary; |
239 | /// use Boundary::*; |
240 | /// assert_eq!( |
241 | /// vec![UpperDigit, LowerDigit], |
242 | /// Boundary::letter_digit() |
243 | /// ); |
244 | /// ``` |
245 | pub fn letter_digit() -> Vec<Self> { |
246 | use Boundary::*; |
247 | vec![UpperDigit, LowerDigit] |
248 | } |
249 | |
250 | /// Returns the boundaries that are digits followed by letters: `DigitUpper` and |
251 | /// `DigitLower`. |
252 | /// ``` |
253 | /// use convert_case::Boundary; |
254 | /// use Boundary::*; |
255 | /// assert_eq!( |
256 | /// vec![DigitUpper, DigitLower], |
257 | /// Boundary::digit_letter() |
258 | /// ); |
259 | /// ``` |
260 | pub fn digit_letter() -> Vec<Self> { |
261 | use Boundary::*; |
262 | vec![DigitUpper, DigitLower] |
263 | } |
264 | |
265 | /// Returns all boundaries. Note that this includes the `UpperLower` variant which |
266 | /// might be unhelpful. Please look at [`Boundary::defaults`]. |
267 | /// ``` |
268 | /// use convert_case::Boundary; |
269 | /// use Boundary::*; |
270 | /// assert_eq!( |
271 | /// vec![ |
272 | /// Hyphen, Underscore, Space, LowerUpper, UpperLower, DigitUpper, |
273 | /// UpperDigit, DigitLower, LowerDigit, Acronym, |
274 | /// ], |
275 | /// Boundary::all() |
276 | /// ); |
277 | /// ``` |
278 | pub fn all() -> Vec<Self> { |
279 | use Boundary::*; |
280 | vec![ |
281 | Hyphen, Underscore, Space, LowerUpper, UpperLower, DigitUpper, UpperDigit, |
282 | DigitLower, LowerDigit, Acronym |
283 | ] |
284 | } |
285 | |
286 | fn detect_one(&self, c: &str) -> bool { |
287 | use Boundary::*; |
288 | match self { |
289 | Hyphen => c == "-" , |
290 | Underscore => c == "_" , |
291 | Space => c == " " , |
292 | _ => false, |
293 | } |
294 | } |
295 | |
296 | fn detect_two(&self, c: &str, d: &str) -> bool { |
297 | use Boundary::*; |
298 | match self { |
299 | UpperLower => grapheme_is_uppercase(c) && grapheme_is_lowercase(d), |
300 | LowerUpper => grapheme_is_lowercase(c) && grapheme_is_uppercase(d), |
301 | DigitUpper => grapheme_is_digit(c) && grapheme_is_uppercase(d), |
302 | UpperDigit => grapheme_is_uppercase(c) && grapheme_is_digit(d), |
303 | DigitLower => grapheme_is_digit(c) && grapheme_is_lowercase(d), |
304 | LowerDigit => grapheme_is_lowercase(c) && grapheme_is_digit(d), |
305 | _ => false, |
306 | } |
307 | } |
308 | |
309 | fn detect_three(&self, c: &str, d: &str, e: &str) -> bool { |
310 | use Boundary::*; |
311 | if let Acronym = self { |
312 | grapheme_is_uppercase(c) |
313 | && grapheme_is_uppercase(d) |
314 | && grapheme_is_lowercase(e) |
315 | } else { |
316 | false |
317 | } |
318 | } |
319 | } |
320 | |
/// Returns `true` when `c` is non-empty and consists solely of ASCII digits (`0`-`9`).
///
/// An empty string contains no digit, so it is rejected explicitly rather than
/// being accepted vacuously by `Iterator::all`.
fn grapheme_is_digit(c: &str) -> bool {
    !c.is_empty() && c.chars().all(|ch| ch.is_ascii_digit())
}
324 | |
/// Returns `true` when `c` has distinct upper and lower case forms and is
/// already equal to its uppercase form.
fn grapheme_is_uppercase(c: &str) -> bool {
    let upper = c.to_uppercase();
    // Text whose two case conversions coincide (digits, punctuation, ...) has no
    // case, so it is never treated as uppercase.
    let is_cased = upper != c.to_lowercase();
    is_cased && c == upper
}
328 | |
/// Returns `true` when `c` has distinct upper and lower case forms and is
/// already equal to its lowercase form.
fn grapheme_is_lowercase(c: &str) -> bool {
    let lower = c.to_lowercase();
    // Text whose two case conversions coincide (digits, punctuation, ...) has no
    // case, so it is never treated as lowercase.
    let is_cased = c.to_uppercase() != lower;
    is_cased && c == lower
}
332 | |
333 | pub fn split<T>(s: T, boundaries: &[Boundary]) -> Vec<String> |
334 | where |
335 | T: AsRef<str>, |
336 | { |
337 | use std::iter::once; |
338 | // create split_points function that counts off by graphemes into list |
339 | |
340 | let s = s.as_ref(); |
341 | |
342 | // Some<bool> means the following |
343 | // None: no split |
344 | // Some(false): split between characters |
345 | // Some(true): split consuming characters |
346 | |
347 | let left_iter = s.graphemes(true); |
348 | let mid_iter = s.graphemes(true).skip(1); |
349 | let right_iter = s.graphemes(true).skip(2); |
350 | |
351 | let singles = left_iter.clone(); |
352 | let doubles = left_iter.clone().zip(mid_iter.clone()); |
353 | let triples = left_iter.zip(mid_iter).zip(right_iter); |
354 | |
355 | let singles = singles |
356 | .map(|c| boundaries.iter().any(|b| b.detect_one(c))) |
357 | .map(|split| if split {Some(true)} else {None}); |
358 | let doubles = doubles |
359 | .map(|(c,d)| boundaries.iter().any(|b| b.detect_two(c, d))) |
360 | .map(|split| if split {Some(false)} else {None}); |
361 | let triples = triples |
362 | .map(|((c,d),e)| boundaries.iter().any(|b| b.detect_three(c, d, e))) |
363 | .map(|split| if split {Some(false)} else {None}); |
364 | |
365 | let split_points = singles |
366 | .zip(once(None).chain(doubles)) |
367 | .zip(once(None).chain(triples).chain(once(None))) |
368 | .map(|((s, d), t)| s.or(d).or(t)); |
369 | |
370 | let mut words = Vec::new(); |
371 | let mut word = String::new(); |
372 | for (c, split) in s.graphemes(true).zip(split_points) { |
373 | match split { |
374 | // no split here |
375 | None => word.push_str(c), |
376 | // split here, consume letter |
377 | Some(true) => words.push(std::mem::take(&mut word)), |
378 | // split here, keep letter |
379 | Some(false) => { |
380 | words.push(std::mem::take(&mut word)); |
381 | word.push_str(c); |
382 | } |
383 | } |
384 | } |
385 | words.push(word); |
386 | |
387 | /* |
388 | let mut words = Vec::new(); |
389 | let mut left_idx = 0; |
390 | let mut total_chars = 0; |
391 | let mut skip = 0; |
392 | let mut cur = GraphemeCursor::new(left_idx, s.len(), true); |
393 | |
394 | for (right_idx, split) in split_points.enumerate() { |
395 | match split { |
396 | // no split here |
397 | None => {}, |
398 | // split here, consume letter |
399 | Some(true) => { |
400 | let mut right_bound = left_bound; |
401 | for _ in 0..total_chars { |
402 | right_bound = cur.next_boundary(s, skip).unwrap().unwrap(); |
403 | } |
404 | words.push(&s[left_bound..right_bound]) |
405 | } |
406 | // split here, keep letter |
407 | Some(false) => { |
408 | } |
409 | // dont push an empty string, do nothing |
410 | _ => {} |
411 | } |
412 | } |
413 | */ |
414 | |
415 | words.into_iter().filter(|s| !s.is_empty()).collect() |
416 | } |
417 | |
#[cfg(test)]
mod test {
    use super::*;
    use strum::IntoEnumIterator;

    /// Every variant yielded by the derived iterator must appear in `Boundary::all`.
    #[test]
    fn all_boundaries_in_iter() {
        let listed = Boundary::all();
        assert!(Boundary::iter().all(|variant| listed.contains(&variant)));
    }

    /// Delimiter boundaries split the input and drop the delimiters.
    #[test]
    fn split_on_delims() {
        let expected = vec!["my", "word", "list", "separated", "by", "delims"];
        let words = split("my_word-list separated-by_delims", &Boundary::delims());
        assert_eq!(expected, words)
    }

    /// `list_from` reports exactly the boundaries whose patterns occur in the input.
    #[test]
    fn boundaries_found_in_string() {
        use Boundary::*;
        let cases: Vec<(Vec<Boundary>, &str)> = vec![
            (vec![UpperLower], ".Aaaa"),
            (vec![LowerUpper, UpperLower, LowerDigit], "a8.Aa.aA"),
            (Boundary::digits(), "b1B1b"),
            (vec![Hyphen, Underscore, Space, Acronym], "AAa -_"),
        ];
        for (expected, input) in cases {
            assert_eq!(expected, Boundary::list_from(input));
        }
    }
}
460 | |