1#[cfg(test)]
2use strum_macros::EnumIter;
3
4use unicode_segmentation::{UnicodeSegmentation}; //, GraphemeCursor};
5
/// A boundary defines how a string is split into words. Some boundaries (`Hyphen`, `Underscore`,
/// and `Space`) consume the character they split on, whereas the other boundaries
/// do not.
///
/// The enum offers methods that return `Vec`s containing useful groups of boundaries. It also
/// contains the [`list_from`](Boundary::list_from) method which will generate a list of boundaries
/// based on a string slice.
///
/// Note that all boundaries are distinct and do not share functionality. That is, there is no
/// `DigitLetter` variant, because that would be equivalent to the current `DigitUpper` and
/// `DigitLower` variants combined. For common functionality, consider using
/// one of the provided functions that return a list of boundaries.
/// ```
/// use convert_case::{Boundary, Case, Casing, Converter};
///
/// assert_eq!(
///     "transformations_in_3d",
///     "TransformationsIn3D"
///         .from_case(Case::Camel)
///         .without_boundaries(&Boundary::digit_letter())
///         .to_case(Case::Snake)
/// );
///
/// let conv = Converter::new()
///     .set_boundaries(&Boundary::list_from("aA "))
///     .to_case(Case::Title);
/// assert_eq!("7empest By Tool", conv.convert("7empest byTool"));
/// ```
#[cfg_attr(test, derive(EnumIter))]
#[derive(Clone, Copy, Eq, PartialEq, Debug)]
pub enum Boundary {
    /// Splits on `-`, consuming the character on segmentation.
    /// ```
    /// use convert_case::Boundary;
    /// assert_eq!(
    ///     vec![Boundary::Hyphen],
    ///     Boundary::list_from("-")
    /// );
    /// ```
    Hyphen,

    /// Splits on `_`, consuming the character on segmentation.
    /// ```
    /// use convert_case::Boundary;
    /// assert_eq!(
    ///     vec![Boundary::Underscore],
    ///     Boundary::list_from("_")
    /// );
    /// ```
    Underscore,

    /// Splits on a space, consuming the character on segmentation.
    /// ```
    /// use convert_case::Boundary;
    /// assert_eq!(
    ///     vec![Boundary::Space],
    ///     Boundary::list_from(" ")
    /// );
    /// ```
    Space,

    /// Splits where an uppercase letter is followed by a lowercase letter. This is seldom used,
    /// and is not included in the [defaults](Boundary::defaults).
    /// ```
    /// use convert_case::Boundary;
    /// assert_eq!(
    ///     vec![Boundary::UpperLower],
    ///     Boundary::list_from("Aa")
    /// );
    /// ```
    UpperLower,

    /// Splits where a lowercase letter is followed by an uppercase letter.
    /// ```
    /// use convert_case::Boundary;
    /// assert_eq!(
    ///     vec![Boundary::LowerUpper],
    ///     Boundary::list_from("aA")
    /// );
    /// ```
    LowerUpper,

    /// Splits where a digit is followed by an uppercase letter.
    /// ```
    /// use convert_case::Boundary;
    /// assert_eq!(
    ///     vec![Boundary::DigitUpper],
    ///     Boundary::list_from("1A")
    /// );
    /// ```
    DigitUpper,

    /// Splits where an uppercase letter is followed by a digit.
    /// ```
    /// use convert_case::Boundary;
    /// assert_eq!(
    ///     vec![Boundary::UpperDigit],
    ///     Boundary::list_from("A1")
    /// );
    /// ```
    UpperDigit,

    /// Splits where a digit is followed by a lowercase letter.
    /// ```
    /// use convert_case::Boundary;
    /// assert_eq!(
    ///     vec![Boundary::DigitLower],
    ///     Boundary::list_from("1a")
    /// );
    /// ```
    DigitLower,

    /// Splits where a lowercase letter is followed by a digit.
    /// ```
    /// use convert_case::Boundary;
    /// assert_eq!(
    ///     vec![Boundary::LowerDigit],
    ///     Boundary::list_from("a1")
    /// );
    /// ```
    LowerDigit,

    /// Acronyms are identified by two uppercase letters followed by a lowercase letter.
    /// The word boundary is between the two uppercase letters. For example, "HTTPRequest"
    /// would have an acronym boundary identified at "PRe" and split into "HTTP" and "Request".
    /// ```
    /// use convert_case::Boundary;
    /// assert_eq!(
    ///     vec![Boundary::Acronym],
    ///     Boundary::list_from("AAa")
    /// );
    /// ```
    Acronym,
}
140
141impl Boundary {
142 /// Returns a list of all boundaries that are identified within the given string.
143 /// Could be a short of writing out all the boundaries in a list directly. This will not
144 /// identify boundary `UpperLower` if it also used as part of `Acronym`.
145 ///
146 /// If you want to be very explicit and not overlap boundaries, it is recommended to use a colon
147 /// character.
148 /// ```
149 /// use convert_case::Boundary;
150 /// use Boundary::*;
151 /// assert_eq!(
152 /// vec![Hyphen, Space, LowerUpper, UpperDigit, DigitLower],
153 /// Boundary::list_from("aA8a -")
154 /// );
155 /// assert_eq!(
156 /// vec![Underscore, LowerUpper, DigitUpper, Acronym],
157 /// Boundary::list_from("bD:0B:_:AAa")
158 /// );
159 /// ```
160 pub fn list_from(s: &str) -> Vec<Self> {
161 Boundary::all().iter().filter(|boundary| {
162 let left_iter = s.graphemes(true);
163 let mid_iter = s.graphemes(true).skip(1);
164 let right_iter = s.graphemes(true).skip(2);
165
166 let mut one_iter = left_iter.clone();
167
168 // Also capture when the previous pair was both uppercase, so we don't
169 // match the UpperLower boundary in the case of Acronym
170 let two_iter = left_iter.clone().zip(mid_iter.clone());
171 let mut two_iter_and_upper = two_iter.clone()
172 .zip(std::iter::once(false).chain(
173 two_iter.map(|(a, b)| grapheme_is_uppercase(a) && grapheme_is_uppercase(b))
174 ));
175
176 let mut three_iter = left_iter.zip(mid_iter).zip(right_iter);
177
178 one_iter.any(|a| boundary.detect_one(a))
179 || two_iter_and_upper.any(|((a, b), is_acro)| boundary.detect_two(a, b) && !is_acro)
180 || three_iter.any(|((a, b), c)| boundary.detect_three(a, b, c))
181 }).copied().collect()
182 }
183
184 /// The default list of boundaries used when `Casing::to_case` is called directly
185 /// and in a `Converter` generated from `Converter::new()`. This includes
186 /// all the boundaries except the `UpperLower` boundary.
187 /// ```
188 /// use convert_case::Boundary;
189 /// use Boundary::*;
190 /// assert_eq!(
191 /// vec![
192 /// Underscore, Hyphen, Space, LowerUpper, UpperDigit,
193 /// DigitUpper, DigitLower, LowerDigit, Acronym,
194 /// ],
195 /// Boundary::defaults()
196 /// );
197 /// ```
198 pub fn defaults() -> Vec<Self> {
199 use Boundary::*;
200 vec![
201 Underscore, Hyphen, Space, LowerUpper, UpperDigit, DigitUpper, DigitLower, LowerDigit,
202 Acronym,
203 ]
204 }
205
206 /// Returns the boundaries that split around single characters: `Hyphen`,
207 /// `Underscore`, and `Space`.
208 /// ```
209 /// use convert_case::Boundary;
210 /// use Boundary::*;
211 /// assert_eq!(
212 /// vec![Hyphen, Underscore, Space],
213 /// Boundary::delims()
214 /// );
215 /// ```
216 pub fn delims() -> Vec<Self> {
217 use Boundary::*;
218 vec![Hyphen, Underscore, Space]
219 }
220
221 /// Returns the boundaries that involve digits: `DigitUpper`, `DigitLower`, `UpperDigit`, and
222 /// `LowerDigit`.
223 /// ```
224 /// use convert_case::Boundary;
225 /// use Boundary::*;
226 /// assert_eq!(
227 /// vec![DigitUpper, UpperDigit, DigitLower, LowerDigit],
228 /// Boundary::digits()
229 /// );
230 /// ```
231 pub fn digits() -> Vec<Self> {
232 use Boundary::*;
233 vec![DigitUpper, UpperDigit, DigitLower, LowerDigit]
234 }
235
236 /// Returns the boundaries that are letters followed by digits: `UpperDigit` and `LowerDigit`.
237 /// ```
238 /// use convert_case::Boundary;
239 /// use Boundary::*;
240 /// assert_eq!(
241 /// vec![UpperDigit, LowerDigit],
242 /// Boundary::letter_digit()
243 /// );
244 /// ```
245 pub fn letter_digit() -> Vec<Self> {
246 use Boundary::*;
247 vec![UpperDigit, LowerDigit]
248 }
249
250 /// Returns the boundaries that are digits followed by letters: `DigitUpper` and
251 /// `DigitLower`.
252 /// ```
253 /// use convert_case::Boundary;
254 /// use Boundary::*;
255 /// assert_eq!(
256 /// vec![DigitUpper, DigitLower],
257 /// Boundary::digit_letter()
258 /// );
259 /// ```
260 pub fn digit_letter() -> Vec<Self> {
261 use Boundary::*;
262 vec![DigitUpper, DigitLower]
263 }
264
265 /// Returns all boundaries. Note that this includes the `UpperLower` variant which
266 /// might be unhelpful. Please look at [`Boundary::defaults`].
267 /// ```
268 /// use convert_case::Boundary;
269 /// use Boundary::*;
270 /// assert_eq!(
271 /// vec![
272 /// Hyphen, Underscore, Space, LowerUpper, UpperLower, DigitUpper,
273 /// UpperDigit, DigitLower, LowerDigit, Acronym,
274 /// ],
275 /// Boundary::all()
276 /// );
277 /// ```
278 pub fn all() -> Vec<Self> {
279 use Boundary::*;
280 vec![
281 Hyphen, Underscore, Space, LowerUpper, UpperLower, DigitUpper, UpperDigit,
282 DigitLower, LowerDigit, Acronym
283 ]
284 }
285
286 fn detect_one(&self, c: &str) -> bool {
287 use Boundary::*;
288 match self {
289 Hyphen => c == "-",
290 Underscore => c == "_",
291 Space => c == " ",
292 _ => false,
293 }
294 }
295
296 fn detect_two(&self, c: &str, d: &str) -> bool {
297 use Boundary::*;
298 match self {
299 UpperLower => grapheme_is_uppercase(c) && grapheme_is_lowercase(d),
300 LowerUpper => grapheme_is_lowercase(c) && grapheme_is_uppercase(d),
301 DigitUpper => grapheme_is_digit(c) && grapheme_is_uppercase(d),
302 UpperDigit => grapheme_is_uppercase(c) && grapheme_is_digit(d),
303 DigitLower => grapheme_is_digit(c) && grapheme_is_lowercase(d),
304 LowerDigit => grapheme_is_lowercase(c) && grapheme_is_digit(d),
305 _ => false,
306 }
307 }
308
309 fn detect_three(&self, c: &str, d: &str, e: &str) -> bool {
310 use Boundary::*;
311 if let Acronym = self {
312 grapheme_is_uppercase(c)
313 && grapheme_is_uppercase(d)
314 && grapheme_is_lowercase(e)
315 } else {
316 false
317 }
318 }
319}
320
/// Returns `true` when `c` is non-empty and consists solely of ASCII digits.
///
/// The emptiness check matters because `Iterator::all` is vacuously true for an
/// empty iterator, which would otherwise classify `""` as a digit.
fn grapheme_is_digit(c: &str) -> bool {
    !c.is_empty() && c.chars().all(|ch| ch.is_ascii_digit())
}
324
/// Returns `true` when `c` is cased (its upper- and lowercase forms differ) and it is
/// already equal to its uppercase form. Caseless text such as digits, punctuation or the
/// empty string is never considered uppercase.
fn grapheme_is_uppercase(c: &str) -> bool {
    let upper = c.to_uppercase();
    upper == c && upper != c.to_lowercase()
}
328
/// Returns `true` when `c` is cased (its upper- and lowercase forms differ) and it is
/// already equal to its lowercase form. Caseless text such as digits, punctuation or the
/// empty string is never considered lowercase.
fn grapheme_is_lowercase(c: &str) -> bool {
    let lower = c.to_lowercase();
    lower == c && lower != c.to_uppercase()
}
332
333pub fn split<T>(s: T, boundaries: &[Boundary]) -> Vec<String>
334where
335 T: AsRef<str>,
336{
337 use std::iter::once;
338 // create split_points function that counts off by graphemes into list
339
340 let s = s.as_ref();
341
342 // Some<bool> means the following
343 // None: no split
344 // Some(false): split between characters
345 // Some(true): split consuming characters
346
347 let left_iter = s.graphemes(true);
348 let mid_iter = s.graphemes(true).skip(1);
349 let right_iter = s.graphemes(true).skip(2);
350
351 let singles = left_iter.clone();
352 let doubles = left_iter.clone().zip(mid_iter.clone());
353 let triples = left_iter.zip(mid_iter).zip(right_iter);
354
355 let singles = singles
356 .map(|c| boundaries.iter().any(|b| b.detect_one(c)))
357 .map(|split| if split {Some(true)} else {None});
358 let doubles = doubles
359 .map(|(c,d)| boundaries.iter().any(|b| b.detect_two(c, d)))
360 .map(|split| if split {Some(false)} else {None});
361 let triples = triples
362 .map(|((c,d),e)| boundaries.iter().any(|b| b.detect_three(c, d, e)))
363 .map(|split| if split {Some(false)} else {None});
364
365 let split_points = singles
366 .zip(once(None).chain(doubles))
367 .zip(once(None).chain(triples).chain(once(None)))
368 .map(|((s, d), t)| s.or(d).or(t));
369
370 let mut words = Vec::new();
371 let mut word = String::new();
372 for (c, split) in s.graphemes(true).zip(split_points) {
373 match split {
374 // no split here
375 None => word.push_str(c),
376 // split here, consume letter
377 Some(true) => words.push(std::mem::take(&mut word)),
378 // split here, keep letter
379 Some(false) => {
380 words.push(std::mem::take(&mut word));
381 word.push_str(c);
382 }
383 }
384 }
385 words.push(word);
386
387 /*
388 let mut words = Vec::new();
389 let mut left_idx = 0;
390 let mut total_chars = 0;
391 let mut skip = 0;
392 let mut cur = GraphemeCursor::new(left_idx, s.len(), true);
393
394 for (right_idx, split) in split_points.enumerate() {
395 match split {
396 // no split here
397 None => {},
398 // split here, consume letter
399 Some(true) => {
400 let mut right_bound = left_bound;
401 for _ in 0..total_chars {
402 right_bound = cur.next_boundary(s, skip).unwrap().unwrap();
403 }
404 words.push(&s[left_bound..right_bound])
405 }
406 // split here, keep letter
407 Some(false) => {
408 }
409 // dont push an empty string, do nothing
410 _ => {}
411 }
412 }
413 */
414
415 words.into_iter().filter(|s| !s.is_empty()).collect()
416}
417
#[cfg(test)]
mod test {
    use super::*;
    use strum::IntoEnumIterator;

    /// `Boundary::all()` is written out by hand; every enum variant must appear in it.
    #[test]
    fn all_boundaries_in_iter() {
        let listed = Boundary::all();
        for variant in Boundary::iter() {
            assert!(listed.contains(&variant));
        }
    }

    /// Delimiter boundaries split the string and drop the delimiter itself.
    #[test]
    fn split_on_delims() {
        let words = split("my_word-list separated-by_delims", &Boundary::delims());
        assert_eq!(
            vec!["my", "word", "list", "separated", "by", "delims"],
            words
        )
    }

    /// `list_from` reports the boundaries present in a sample string, in `all()` order.
    #[test]
    fn boundaries_found_in_string() {
        use Boundary::*;

        let cases = vec![
            (".Aaaa", vec![UpperLower]),
            ("a8.Aa.aA", vec![LowerUpper, UpperLower, LowerDigit]),
            ("b1B1b", Boundary::digits()),
            ("AAa -_", vec![Hyphen, Underscore, Space, Acronym]),
        ];

        for (input, expected) in cases {
            assert_eq!(expected, Boundary::list_from(input), "input: {:?}", input);
        }
    }
}
460