| 1 | #[cfg (test)] |
| 2 | use strum_macros::EnumIter; |
| 3 | |
| 4 | use unicode_segmentation::{UnicodeSegmentation}; //, GraphemeCursor}; |
| 5 | |
/// A rule describing where a string is cut into words.
///
/// The delimiter boundaries `Hyphen`, `Underscore`, and `Space` consume the character they
/// split on. Every other boundary splits *between* characters and keeps all of them.
///
/// The enum also provides associated functions that return `Vec`s of commonly grouped
/// boundaries, as well as [`list_from`](Boundary::list_from), which derives a list of
/// boundaries from an example string slice.
///
/// All boundaries are distinct and do not share functionality. There is, for example, no
/// `DigitLetter` variant, because it would be equivalent to the `DigitUpper` and `DigitLower`
/// variants taken together. When a combination is needed, use one of the provided functions
/// that return a list of boundaries.
/// ```
/// use convert_case::{Boundary, Case, Casing, Converter};
///
/// assert_eq!(
///     "transformations_in_3d",
///     "TransformationsIn3D"
///         .from_case(Case::Camel)
///         .without_boundaries(&Boundary::digit_letter())
///         .to_case(Case::Snake)
/// );
///
/// let conv = Converter::new()
///     .set_boundaries(&Boundary::list_from("aA "))
///     .to_case(Case::Title);
/// assert_eq!("7empest By Tool", conv.convert("7empest byTool"));
/// ```
#[cfg_attr(test, derive(EnumIter))]
#[derive(Clone, Copy, Eq, PartialEq, Debug)]
pub enum Boundary {
    /// Splits on `-`. The hyphen itself is consumed by the split.
    /// ```
    /// use convert_case::Boundary;
    /// assert_eq!(
    ///     vec![Boundary::Hyphen],
    ///     Boundary::list_from("-")
    /// );
    /// ```
    Hyphen,

    /// Splits on `_`. The underscore itself is consumed by the split.
    /// ```
    /// use convert_case::Boundary;
    /// assert_eq!(
    ///     vec![Boundary::Underscore],
    ///     Boundary::list_from("_")
    /// );
    /// ```
    Underscore,

    /// Splits on a space. The space itself is consumed by the split.
    /// ```
    /// use convert_case::Boundary;
    /// assert_eq!(
    ///     vec![Boundary::Space],
    ///     Boundary::list_from(" ")
    /// );
    /// ```
    Space,

    /// Splits between an uppercase letter and the lowercase letter that follows it.
    /// This is seldom useful and is therefore left out of the
    /// [defaults](Boundary::defaults).
    /// ```
    /// use convert_case::Boundary;
    /// assert_eq!(
    ///     vec![Boundary::UpperLower],
    ///     Boundary::list_from("Aa")
    /// );
    /// ```
    UpperLower,

    /// Splits between a lowercase letter and the uppercase letter that follows it.
    /// ```
    /// use convert_case::Boundary;
    /// assert_eq!(
    ///     vec![Boundary::LowerUpper],
    ///     Boundary::list_from("aA")
    /// );
    /// ```
    LowerUpper,

    /// Splits between a digit and the uppercase letter that follows it.
    /// ```
    /// use convert_case::Boundary;
    /// assert_eq!(
    ///     vec![Boundary::DigitUpper],
    ///     Boundary::list_from("1A")
    /// );
    /// ```
    DigitUpper,

    /// Splits between an uppercase letter and the digit that follows it.
    /// ```
    /// use convert_case::Boundary;
    /// assert_eq!(
    ///     vec![Boundary::UpperDigit],
    ///     Boundary::list_from("A1")
    /// );
    /// ```
    UpperDigit,

    /// Splits between a digit and the lowercase letter that follows it.
    /// ```
    /// use convert_case::Boundary;
    /// assert_eq!(
    ///     vec![Boundary::DigitLower],
    ///     Boundary::list_from("1a")
    /// );
    /// ```
    DigitLower,

    /// Splits between a lowercase letter and the digit that follows it.
    /// ```
    /// use convert_case::Boundary;
    /// assert_eq!(
    ///     vec![Boundary::LowerDigit],
    ///     Boundary::list_from("a1")
    /// );
    /// ```
    LowerDigit,

    /// Splits at the end of an acronym: two uppercase letters followed by a lowercase
    /// letter, with the word boundary placed between the two uppercase letters. In
    /// "HTTPRequest" the acronym boundary is found at "PRe", which yields the words
    /// "HTTP" and "Request".
    /// ```
    /// use convert_case::Boundary;
    /// assert_eq!(
    ///     vec![Boundary::Acronym],
    ///     Boundary::list_from("AAa")
    /// );
    /// ```
    Acronym,
}
| 140 | |
| 141 | impl Boundary { |
| 142 | /// Returns a list of all boundaries that are identified within the given string. |
| 143 | /// Could be a short of writing out all the boundaries in a list directly. This will not |
| 144 | /// identify boundary `UpperLower` if it also used as part of `Acronym`. |
| 145 | /// |
| 146 | /// If you want to be very explicit and not overlap boundaries, it is recommended to use a colon |
| 147 | /// character. |
| 148 | /// ``` |
| 149 | /// use convert_case::Boundary; |
| 150 | /// use Boundary::*; |
| 151 | /// assert_eq!( |
| 152 | /// vec![Hyphen, Space, LowerUpper, UpperDigit, DigitLower], |
| 153 | /// Boundary::list_from("aA8a -" ) |
| 154 | /// ); |
| 155 | /// assert_eq!( |
| 156 | /// vec![Underscore, LowerUpper, DigitUpper, Acronym], |
| 157 | /// Boundary::list_from("bD:0B:_:AAa" ) |
| 158 | /// ); |
| 159 | /// ``` |
| 160 | pub fn list_from(s: &str) -> Vec<Self> { |
| 161 | Boundary::all().iter().filter(|boundary| { |
| 162 | let left_iter = s.graphemes(true); |
| 163 | let mid_iter = s.graphemes(true).skip(1); |
| 164 | let right_iter = s.graphemes(true).skip(2); |
| 165 | |
| 166 | let mut one_iter = left_iter.clone(); |
| 167 | |
| 168 | // Also capture when the previous pair was both uppercase, so we don't |
| 169 | // match the UpperLower boundary in the case of Acronym |
| 170 | let two_iter = left_iter.clone().zip(mid_iter.clone()); |
| 171 | let mut two_iter_and_upper = two_iter.clone() |
| 172 | .zip(std::iter::once(false).chain( |
| 173 | two_iter.map(|(a, b)| grapheme_is_uppercase(a) && grapheme_is_uppercase(b)) |
| 174 | )); |
| 175 | |
| 176 | let mut three_iter = left_iter.zip(mid_iter).zip(right_iter); |
| 177 | |
| 178 | one_iter.any(|a| boundary.detect_one(a)) |
| 179 | || two_iter_and_upper.any(|((a, b), is_acro)| boundary.detect_two(a, b) && !is_acro) |
| 180 | || three_iter.any(|((a, b), c)| boundary.detect_three(a, b, c)) |
| 181 | }).copied().collect() |
| 182 | } |
| 183 | |
| 184 | /// The default list of boundaries used when `Casing::to_case` is called directly |
| 185 | /// and in a `Converter` generated from `Converter::new()`. This includes |
| 186 | /// all the boundaries except the `UpperLower` boundary. |
| 187 | /// ``` |
| 188 | /// use convert_case::Boundary; |
| 189 | /// use Boundary::*; |
| 190 | /// assert_eq!( |
| 191 | /// vec![ |
| 192 | /// Underscore, Hyphen, Space, LowerUpper, UpperDigit, |
| 193 | /// DigitUpper, DigitLower, LowerDigit, Acronym, |
| 194 | /// ], |
| 195 | /// Boundary::defaults() |
| 196 | /// ); |
| 197 | /// ``` |
| 198 | pub fn defaults() -> Vec<Self> { |
| 199 | use Boundary::*; |
| 200 | vec![ |
| 201 | Underscore, Hyphen, Space, LowerUpper, UpperDigit, DigitUpper, DigitLower, LowerDigit, |
| 202 | Acronym, |
| 203 | ] |
| 204 | } |
| 205 | |
| 206 | /// Returns the boundaries that split around single characters: `Hyphen`, |
| 207 | /// `Underscore`, and `Space`. |
| 208 | /// ``` |
| 209 | /// use convert_case::Boundary; |
| 210 | /// use Boundary::*; |
| 211 | /// assert_eq!( |
| 212 | /// vec![Hyphen, Underscore, Space], |
| 213 | /// Boundary::delims() |
| 214 | /// ); |
| 215 | /// ``` |
| 216 | pub fn delims() -> Vec<Self> { |
| 217 | use Boundary::*; |
| 218 | vec![Hyphen, Underscore, Space] |
| 219 | } |
| 220 | |
| 221 | /// Returns the boundaries that involve digits: `DigitUpper`, `DigitLower`, `UpperDigit`, and |
| 222 | /// `LowerDigit`. |
| 223 | /// ``` |
| 224 | /// use convert_case::Boundary; |
| 225 | /// use Boundary::*; |
| 226 | /// assert_eq!( |
| 227 | /// vec![DigitUpper, UpperDigit, DigitLower, LowerDigit], |
| 228 | /// Boundary::digits() |
| 229 | /// ); |
| 230 | /// ``` |
| 231 | pub fn digits() -> Vec<Self> { |
| 232 | use Boundary::*; |
| 233 | vec![DigitUpper, UpperDigit, DigitLower, LowerDigit] |
| 234 | } |
| 235 | |
| 236 | /// Returns the boundaries that are letters followed by digits: `UpperDigit` and `LowerDigit`. |
| 237 | /// ``` |
| 238 | /// use convert_case::Boundary; |
| 239 | /// use Boundary::*; |
| 240 | /// assert_eq!( |
| 241 | /// vec![UpperDigit, LowerDigit], |
| 242 | /// Boundary::letter_digit() |
| 243 | /// ); |
| 244 | /// ``` |
| 245 | pub fn letter_digit() -> Vec<Self> { |
| 246 | use Boundary::*; |
| 247 | vec![UpperDigit, LowerDigit] |
| 248 | } |
| 249 | |
| 250 | /// Returns the boundaries that are digits followed by letters: `DigitUpper` and |
| 251 | /// `DigitLower`. |
| 252 | /// ``` |
| 253 | /// use convert_case::Boundary; |
| 254 | /// use Boundary::*; |
| 255 | /// assert_eq!( |
| 256 | /// vec![DigitUpper, DigitLower], |
| 257 | /// Boundary::digit_letter() |
| 258 | /// ); |
| 259 | /// ``` |
| 260 | pub fn digit_letter() -> Vec<Self> { |
| 261 | use Boundary::*; |
| 262 | vec![DigitUpper, DigitLower] |
| 263 | } |
| 264 | |
| 265 | /// Returns all boundaries. Note that this includes the `UpperLower` variant which |
| 266 | /// might be unhelpful. Please look at [`Boundary::defaults`]. |
| 267 | /// ``` |
| 268 | /// use convert_case::Boundary; |
| 269 | /// use Boundary::*; |
| 270 | /// assert_eq!( |
| 271 | /// vec![ |
| 272 | /// Hyphen, Underscore, Space, LowerUpper, UpperLower, DigitUpper, |
| 273 | /// UpperDigit, DigitLower, LowerDigit, Acronym, |
| 274 | /// ], |
| 275 | /// Boundary::all() |
| 276 | /// ); |
| 277 | /// ``` |
| 278 | pub fn all() -> Vec<Self> { |
| 279 | use Boundary::*; |
| 280 | vec![ |
| 281 | Hyphen, Underscore, Space, LowerUpper, UpperLower, DigitUpper, UpperDigit, |
| 282 | DigitLower, LowerDigit, Acronym |
| 283 | ] |
| 284 | } |
| 285 | |
| 286 | fn detect_one(&self, c: &str) -> bool { |
| 287 | use Boundary::*; |
| 288 | match self { |
| 289 | Hyphen => c == "-" , |
| 290 | Underscore => c == "_" , |
| 291 | Space => c == " " , |
| 292 | _ => false, |
| 293 | } |
| 294 | } |
| 295 | |
| 296 | fn detect_two(&self, c: &str, d: &str) -> bool { |
| 297 | use Boundary::*; |
| 298 | match self { |
| 299 | UpperLower => grapheme_is_uppercase(c) && grapheme_is_lowercase(d), |
| 300 | LowerUpper => grapheme_is_lowercase(c) && grapheme_is_uppercase(d), |
| 301 | DigitUpper => grapheme_is_digit(c) && grapheme_is_uppercase(d), |
| 302 | UpperDigit => grapheme_is_uppercase(c) && grapheme_is_digit(d), |
| 303 | DigitLower => grapheme_is_digit(c) && grapheme_is_lowercase(d), |
| 304 | LowerDigit => grapheme_is_lowercase(c) && grapheme_is_digit(d), |
| 305 | _ => false, |
| 306 | } |
| 307 | } |
| 308 | |
| 309 | fn detect_three(&self, c: &str, d: &str, e: &str) -> bool { |
| 310 | use Boundary::*; |
| 311 | if let Acronym = self { |
| 312 | grapheme_is_uppercase(c) |
| 313 | && grapheme_is_uppercase(d) |
| 314 | && grapheme_is_lowercase(e) |
| 315 | } else { |
| 316 | false |
| 317 | } |
| 318 | } |
| 319 | } |
| 320 | |
/// Returns `true` when every byte of `c` is an ASCII digit (`0`-`9`).
///
/// Any non-ASCII character makes the result `false`, because none of its UTF-8 bytes
/// is an ASCII digit.
fn grapheme_is_digit(c: &str) -> bool {
    c.bytes().all(|byte| byte.is_ascii_digit())
}
| 324 | |
/// Returns `true` when `c` is cased and already equal to its own uppercase form.
///
/// "Cased" means that the uppercase and lowercase conversions of `c` differ, which
/// rules out digits, punctuation and other caseless text.
fn grapheme_is_uppercase(c: &str) -> bool {
    let upper = c.to_uppercase();
    if upper != c {
        return false;
    }
    // `c` is its own uppercase form; it is a letter only if lowercasing changes it.
    upper != c.to_lowercase()
}
| 328 | |
/// Returns `true` when `c` is cased and already equal to its own lowercase form.
///
/// "Cased" means that the uppercase and lowercase conversions of `c` differ, which
/// rules out digits, punctuation and other caseless text.
fn grapheme_is_lowercase(c: &str) -> bool {
    let lower = c.to_lowercase();
    if lower != c {
        return false;
    }
    // `c` is its own lowercase form; it is a letter only if uppercasing changes it.
    lower != c.to_uppercase()
}
| 332 | |
| 333 | pub fn split<T>(s: T, boundaries: &[Boundary]) -> Vec<String> |
| 334 | where |
| 335 | T: AsRef<str>, |
| 336 | { |
| 337 | use std::iter::once; |
| 338 | // create split_points function that counts off by graphemes into list |
| 339 | |
| 340 | let s = s.as_ref(); |
| 341 | |
| 342 | // Some<bool> means the following |
| 343 | // None: no split |
| 344 | // Some(false): split between characters |
| 345 | // Some(true): split consuming characters |
| 346 | |
| 347 | let left_iter = s.graphemes(true); |
| 348 | let mid_iter = s.graphemes(true).skip(1); |
| 349 | let right_iter = s.graphemes(true).skip(2); |
| 350 | |
| 351 | let singles = left_iter.clone(); |
| 352 | let doubles = left_iter.clone().zip(mid_iter.clone()); |
| 353 | let triples = left_iter.zip(mid_iter).zip(right_iter); |
| 354 | |
| 355 | let singles = singles |
| 356 | .map(|c| boundaries.iter().any(|b| b.detect_one(c))) |
| 357 | .map(|split| if split {Some(true)} else {None}); |
| 358 | let doubles = doubles |
| 359 | .map(|(c,d)| boundaries.iter().any(|b| b.detect_two(c, d))) |
| 360 | .map(|split| if split {Some(false)} else {None}); |
| 361 | let triples = triples |
| 362 | .map(|((c,d),e)| boundaries.iter().any(|b| b.detect_three(c, d, e))) |
| 363 | .map(|split| if split {Some(false)} else {None}); |
| 364 | |
| 365 | let split_points = singles |
| 366 | .zip(once(None).chain(doubles)) |
| 367 | .zip(once(None).chain(triples).chain(once(None))) |
| 368 | .map(|((s, d), t)| s.or(d).or(t)); |
| 369 | |
| 370 | let mut words = Vec::new(); |
| 371 | let mut word = String::new(); |
| 372 | for (c, split) in s.graphemes(true).zip(split_points) { |
| 373 | match split { |
| 374 | // no split here |
| 375 | None => word.push_str(c), |
| 376 | // split here, consume letter |
| 377 | Some(true) => words.push(std::mem::take(&mut word)), |
| 378 | // split here, keep letter |
| 379 | Some(false) => { |
| 380 | words.push(std::mem::take(&mut word)); |
| 381 | word.push_str(c); |
| 382 | } |
| 383 | } |
| 384 | } |
| 385 | words.push(word); |
| 386 | |
| 387 | /* |
| 388 | let mut words = Vec::new(); |
| 389 | let mut left_idx = 0; |
| 390 | let mut total_chars = 0; |
| 391 | let mut skip = 0; |
| 392 | let mut cur = GraphemeCursor::new(left_idx, s.len(), true); |
| 393 | |
| 394 | for (right_idx, split) in split_points.enumerate() { |
| 395 | match split { |
| 396 | // no split here |
| 397 | None => {}, |
| 398 | // split here, consume letter |
| 399 | Some(true) => { |
| 400 | let mut right_bound = left_bound; |
| 401 | for _ in 0..total_chars { |
| 402 | right_bound = cur.next_boundary(s, skip).unwrap().unwrap(); |
| 403 | } |
| 404 | words.push(&s[left_bound..right_bound]) |
| 405 | } |
| 406 | // split here, keep letter |
| 407 | Some(false) => { |
| 408 | } |
| 409 | // dont push an empty string, do nothing |
| 410 | _ => {} |
| 411 | } |
| 412 | } |
| 413 | */ |
| 414 | |
| 415 | words.into_iter().filter(|s| !s.is_empty()).collect() |
| 416 | } |
| 417 | |
#[cfg(test)]
mod test {
    use super::*;
    use strum::IntoEnumIterator;

    /// Every enum variant must be listed by `Boundary::all`.
    #[test]
    fn all_boundaries_in_iter() {
        let listed = Boundary::all();
        assert!(Boundary::iter().all(|variant| listed.contains(&variant)));
    }

    /// Delimiters are consumed and never show up in the resulting words.
    #[test]
    fn split_on_delims() {
        let words = split("my_word-list separated-by_delims", &Boundary::delims());
        assert_eq!(
            vec!["my", "word", "list", "separated", "by", "delims"],
            words
        );
    }

    /// `list_from` reports the boundaries present in a sample string, in `all()` order.
    #[test]
    fn boundaries_found_in_string() {
        use Boundary::*;

        let cases: Vec<(&str, Vec<Boundary>)> = vec![
            (".Aaaa", vec![UpperLower]),
            ("a8.Aa.aA", vec![LowerUpper, UpperLower, LowerDigit]),
            ("b1B1b", Boundary::digits()),
            ("AAa -_", vec![Hyphen, Underscore, Space, Acronym]),
        ];

        for (input, expected) in cases {
            assert_eq!(expected, Boundary::list_from(input), "input: {:?}", input);
        }
    }
}
| 460 | |