segmentation.rs source code [crates/convert_case-0.6.0/src/segmentation.rs]

1	#[cfg(test)]
2	use strum_macros::EnumIter;
3
4	use unicode_segmentation::{UnicodeSegmentation}; //, GraphemeCursor};
5
/// A boundary defines how a string is split into words.  Some boundaries, `Hyphen`, `Underscore`,
/// and `Space`, consume the character they split on, whereas the other boundaries
/// do not.
///
/// The enum offers associated functions that return `Vec`s containing useful groups of
/// boundaries.  It also contains the [`list_from`](Boundary::list_from) function which will
/// generate a list of boundaries based on a string slice.
///
/// Note that all boundaries are distinct and do not share functionality.  That is, there is no
/// `DigitLetter` variant, because that would be equivalent to the current `DigitUpper` and
/// `DigitLower` variants combined.  For common functionality, consider using
/// one of the provided functions that return a list of boundaries.
/// ```
/// use convert_case::{Boundary, Case, Casing, Converter};
///
/// assert_eq!(
///     "transformations_in_3d",
///     "TransformationsIn3D"
///         .from_case(Case::Camel)
///         .without_boundaries(&Boundary::digit_letter())
///         .to_case(Case::Snake)
/// );
///
/// let conv = Converter::new()
///     .set_boundaries(&Boundary::list_from("aA "))
///     .to_case(Case::Title);
/// assert_eq!("7empest By Tool", conv.convert("7empest byTool"));
/// ```
#[cfg_attr(test, derive(EnumIter))]
#[derive(Clone, Copy, Eq, PartialEq, Hash, Debug)]
pub enum Boundary {
    /// Splits on `-`, consuming the character on segmentation.
    /// ```
    /// use convert_case::Boundary;
    /// assert_eq!(
    ///     vec![Boundary::Hyphen],
    ///     Boundary::list_from("-")
    /// );
    /// ```
    Hyphen,

    /// Splits on `_`, consuming the character on segmentation.
    /// ```
    /// use convert_case::Boundary;
    /// assert_eq!(
    ///     vec![Boundary::Underscore],
    ///     Boundary::list_from("_")
    /// );
    /// ```
    Underscore,

    /// Splits on a space, consuming the character on segmentation.
    /// ```
    /// use convert_case::Boundary;
    /// assert_eq!(
    ///     vec![Boundary::Space],
    ///     Boundary::list_from(" ")
    /// );
    /// ```
    Space,

    /// Splits where an uppercase letter is followed by a lowercase letter.  This is seldom used,
    /// and is not included in the [defaults](Boundary::defaults).
    /// ```
    /// use convert_case::Boundary;
    /// assert_eq!(
    ///     vec![Boundary::UpperLower],
    ///     Boundary::list_from("Aa")
    /// );
    /// ```
    UpperLower,

    /// Splits where a lowercase letter is followed by an uppercase letter.
    /// ```
    /// use convert_case::Boundary;
    /// assert_eq!(
    ///     vec![Boundary::LowerUpper],
    ///     Boundary::list_from("aA")
    /// );
    /// ```
    LowerUpper,

    /// Splits where a digit is followed by an uppercase letter.
    /// ```
    /// use convert_case::Boundary;
    /// assert_eq!(
    ///     vec![Boundary::DigitUpper],
    ///     Boundary::list_from("1A")
    /// );
    /// ```
    DigitUpper,

    /// Splits where an uppercase letter is followed by a digit.
    /// ```
    /// use convert_case::Boundary;
    /// assert_eq!(
    ///     vec![Boundary::UpperDigit],
    ///     Boundary::list_from("A1")
    /// );
    /// ```
    UpperDigit,

    /// Splits where a digit is followed by a lowercase letter.
    /// ```
    /// use convert_case::Boundary;
    /// assert_eq!(
    ///     vec![Boundary::DigitLower],
    ///     Boundary::list_from("1a")
    /// );
    /// ```
    DigitLower,

    /// Splits where a lowercase letter is followed by a digit.
    /// ```
    /// use convert_case::Boundary;
    /// assert_eq!(
    ///     vec![Boundary::LowerDigit],
    ///     Boundary::list_from("a1")
    /// );
    /// ```
    LowerDigit,

    /// Acronyms are identified by two uppercase letters followed by a lowercase letter.
    /// The word boundary is between the two uppercase letters.  For example, "HTTPRequest"
    /// would have an acronym boundary identified at "PRe" and split into "HTTP" and "Request".
    /// ```
    /// use convert_case::Boundary;
    /// assert_eq!(
    ///     vec![Boundary::Acronym],
    ///     Boundary::list_from("AAa")
    /// );
    /// ```
    Acronym,
}
140
141	impl Boundary {
142	/// Returns a list of all boundaries that are identified within the given string.
143	/// Could be a short of writing out all the boundaries in a list directly. This will not
144	/// identify boundary `UpperLower` if it also used as part of `Acronym`.
145	///
146	/// If you want to be very explicit and not overlap boundaries, it is recommended to use a colon
147	/// character.
148	/// ```
149	/// use convert_case::Boundary;
150	/// use Boundary::*;
151	/// assert_eq!(
152	/// vec![Hyphen, Space, LowerUpper, UpperDigit, DigitLower],
153	/// Boundary::list_from("aA8a -")
154	/// );
155	/// assert_eq!(
156	/// vec![Underscore, LowerUpper, DigitUpper, Acronym],
157	/// Boundary::list_from("bD:0B:_:AAa")
158	/// );
159	/// ```
160	pub fn list_from(s: &str) -> Vec<Self> {
161	Boundary::all().iter().filter(\|boundary\| {
162	let left_iter = s.graphemes(`true`);
163	let mid_iter = s.graphemes(`true`).skip(`1`);
164	let right_iter = s.graphemes(`true`).skip(`2`);
165
166	let mut one_iter = left_iter.clone();
167
168	// Also capture when the previous pair was both uppercase, so we don't
169	// match the UpperLower boundary in the case of Acronym
170	let two_iter = left_iter.clone().zip(mid_iter.clone());
171	let mut two_iter_and_upper = two_iter.clone()
172	.zip(std::iter::once(`false`).chain(
173	two_iter.map(\|(a, b)\| grapheme_is_uppercase(a) && grapheme_is_uppercase(b))
174	));
175
176	let mut three_iter = left_iter.zip(mid_iter).zip(right_iter);
177
178	one_iter.any(\|a\| boundary.detect_one(a))
179	\|\| two_iter_and_upper.any(\|((a, b), is_acro)\| boundary.detect_two(a, b) && !is_acro)
180	\|\| three_iter.any(\|((a, b), c)\| boundary.detect_three(a, b, c))
181	}).copied().collect()
182	}
183
184	/// The default list of boundaries used when `Casing::to_case` is called directly
185	/// and in a `Converter` generated from `Converter::new()`. This includes
186	/// all the boundaries except the `UpperLower` boundary.
187	/// ```
188	/// use convert_case::Boundary;
189	/// use Boundary::*;
190	/// assert_eq!(
191	/// vec![
192	/// Underscore, Hyphen, Space, LowerUpper, UpperDigit,
193	/// DigitUpper, DigitLower, LowerDigit, Acronym,
194	/// ],
195	/// Boundary::defaults()
196	/// );
197	/// ```
198	pub fn defaults() -> Vec<Self> {
199	use Boundary::*;
200	vec![
201	Underscore, Hyphen, Space, LowerUpper, UpperDigit, DigitUpper, DigitLower, LowerDigit,
202	Acronym,
203	]
204	}
205
206	/// Returns the boundaries that split around single characters: `Hyphen`,
207	/// `Underscore`, and `Space`.
208	/// ```
209	/// use convert_case::Boundary;
210	/// use Boundary::*;
211	/// assert_eq!(
212	/// vec![Hyphen, Underscore, Space],
213	/// Boundary::delims()
214	/// );
215	/// ```
216	pub fn delims() -> Vec<Self> {
217	use Boundary::*;
218	vec![Hyphen, Underscore, Space]
219	}
220
221	/// Returns the boundaries that involve digits: `DigitUpper`, `DigitLower`, `UpperDigit`, and
222	/// `LowerDigit`.
223	/// ```
224	/// use convert_case::Boundary;
225	/// use Boundary::*;
226	/// assert_eq!(
227	/// vec![DigitUpper, UpperDigit, DigitLower, LowerDigit],
228	/// Boundary::digits()
229	/// );
230	/// ```
231	pub fn digits() -> Vec<Self> {
232	use Boundary::*;
233	vec![DigitUpper, UpperDigit, DigitLower, LowerDigit]
234	}
235
236	/// Returns the boundaries that are letters followed by digits: `UpperDigit` and `LowerDigit`.
237	/// ```
238	/// use convert_case::Boundary;
239	/// use Boundary::*;
240	/// assert_eq!(
241	/// vec![UpperDigit, LowerDigit],
242	/// Boundary::letter_digit()
243	/// );
244	/// ```
245	pub fn letter_digit() -> Vec<Self> {
246	use Boundary::*;
247	vec![UpperDigit, LowerDigit]
248	}
249
250	/// Returns the boundaries that are digits followed by letters: `DigitUpper` and
251	/// `DigitLower`.
252	/// ```
253	/// use convert_case::Boundary;
254	/// use Boundary::*;
255	/// assert_eq!(
256	/// vec![DigitUpper, DigitLower],
257	/// Boundary::digit_letter()
258	/// );
259	/// ```
260	pub fn digit_letter() -> Vec<Self> {
261	use Boundary::*;
262	vec![DigitUpper, DigitLower]
263	}
264
265	/// Returns all boundaries. Note that this includes the `UpperLower` variant which
266	/// might be unhelpful. Please look at [`Boundary::defaults`].
267	/// ```
268	/// use convert_case::Boundary;
269	/// use Boundary::*;
270	/// assert_eq!(
271	/// vec![
272	/// Hyphen, Underscore, Space, LowerUpper, UpperLower, DigitUpper,
273	/// UpperDigit, DigitLower, LowerDigit, Acronym,
274	/// ],
275	/// Boundary::all()
276	/// );
277	/// ```
278	pub fn all() -> Vec<Self> {
279	use Boundary::*;
280	vec![
281	Hyphen, Underscore, Space, LowerUpper, UpperLower, DigitUpper, UpperDigit,
282	DigitLower, LowerDigit, Acronym
283	]
284	}
285
286	fn detect_one(&self, c: &str) -> bool {
287	use Boundary::*;
288	match self {
289	Hyphen => c == "-",
290	Underscore => c == "_",
291	Space => c == " ",
292	_ => `false`,
293	}
294	}
295
296	fn detect_two(&self, c: &str, d: &str) -> bool {
297	use Boundary::*;
298	match self {
299	UpperLower => grapheme_is_uppercase(c) && grapheme_is_lowercase(d),
300	LowerUpper => grapheme_is_lowercase(c) && grapheme_is_uppercase(d),
301	DigitUpper => grapheme_is_digit(c) && grapheme_is_uppercase(d),
302	UpperDigit => grapheme_is_uppercase(c) && grapheme_is_digit(d),
303	DigitLower => grapheme_is_digit(c) && grapheme_is_lowercase(d),
304	LowerDigit => grapheme_is_lowercase(c) && grapheme_is_digit(d),
305	_ => `false`,
306	}
307	}
308
309	fn detect_three(&self, c: &str, d: &str, e: &str) -> bool {
310	use Boundary::*;
311	if let Acronym = self {
312	grapheme_is_uppercase(c)
313	&& grapheme_is_uppercase(d)
314	&& grapheme_is_lowercase(e)
315	} else {
316	`false`
317	}
318	}
319	}
320
/// Returns `true` when `c` is non-empty and consists solely of ASCII digits (`0`-`9`).
///
/// The explicit emptiness check matters because `Iterator::all` is vacuously true on an
/// empty iterator, which would otherwise classify `""` as a digit.
fn grapheme_is_digit(c: &str) -> bool {
    !c.is_empty() && c.chars().all(|ch| ch.is_ascii_digit())
}
324
/// Returns `true` when `c` is cased (its uppercase and lowercase forms differ) and `c`
/// is equal to its own uppercase form.
///
/// Caseless input such as digits, punctuation, or the empty string returns `false`.
fn grapheme_is_uppercase(c: &str) -> bool {
    // Compute the uppercase form once; the lowercase form is only built when `c` already
    // equals its uppercase form.
    let upper = c.to_uppercase();
    c == upper && upper != c.to_lowercase()
}
328
/// Returns `true` when `c` is cased (its uppercase and lowercase forms differ) and `c`
/// is equal to its own lowercase form.
///
/// Caseless input such as digits, punctuation, or the empty string returns `false`.
fn grapheme_is_lowercase(c: &str) -> bool {
    // Compute the lowercase form once; the uppercase form is only built when `c` already
    // equals its lowercase form.
    let lower = c.to_lowercase();
    c == lower && lower != c.to_uppercase()
}
332
333	pub fn split<T>(s: T, boundaries: &[Boundary]) -> Vec<String>
334	where
335	T: AsRef<str>,
336	{
337	use std::iter::once;
338	// create split_points function that counts off by graphemes into list
339
340	let s = s.as_ref();
341
342	// Some<bool> means the following
343	// None: no split
344	// Some(false): split between characters
345	// Some(true): split consuming characters
346
347	let left_iter = s.graphemes(`true`);
348	let mid_iter = s.graphemes(`true`).skip(`1`);
349	let right_iter = s.graphemes(`true`).skip(`2`);
350
351	let singles = left_iter.clone();
352	let doubles = left_iter.clone().zip(mid_iter.clone());
353	let triples = left_iter.zip(mid_iter).zip(right_iter);
354
355	let singles = singles
356	.map(\|c\| boundaries.iter().any(\|b\| b.detect_one(c)))
357	.map(\|split\| if split {Some(`true`)} else {None});
358	let doubles = doubles
359	.map(\|(c,d)\| boundaries.iter().any(\|b\| b.detect_two(c, d)))
360	.map(\|split\| if split {Some(`false`)} else {None});
361	let triples = triples
362	.map(\|((c,d),e)\| boundaries.iter().any(\|b\| b.detect_three(c, d, e)))
363	.map(\|split\| if split {Some(`false`)} else {None});
364
365	let split_points = singles
366	.zip(once(None).chain(doubles))
367	.zip(once(None).chain(triples).chain(once(None)))
368	.map(\|((s, d), t)\| s.or(d).or(t));
369
370	let mut words = Vec::new();
371	let mut word = String::new();
372	for (c, split) in s.graphemes(`true`).zip(split_points) {
373	match split {
374	// no split here
375	None => word.push_str(c),
376	// split here, consume letter
377	Some(`true`) => words.push(std::mem::take(&mut word)),
378	// split here, keep letter
379	Some(`false`) => {
380	words.push(std::mem::take(&mut word));
381	word.push_str(c);
382	}
383	}
384	}
385	words.push(word);
386
387	/*
388	let mut words = Vec::new();
389	let mut left_idx = 0;
390	let mut total_chars = 0;
391	let mut skip = 0;
392	let mut cur = GraphemeCursor::new(left_idx, s.len(), true);
393
394	for (right_idx, split) in split_points.enumerate() {
395	match split {
396	// no split here
397	None => {},
398	// split here, consume letter
399	Some(true) => {
400	let mut right_bound = left_bound;
401	for _ in 0..total_chars {
402	right_bound = cur.next_boundary(s, skip).unwrap().unwrap();
403	}
404	words.push(&s[left_bound..right_bound])
405	}
406	// split here, keep letter
407	Some(false) => {
408	}
409	// dont push an empty string, do nothing
410	_ => {}
411	}
412	}
413	*/
414
415	words.into_iter().filter(\|s\| !s.is_empty()).collect()
416	}
417
#[cfg(test)]
mod test {
    use super::*;
    use strum::IntoEnumIterator;

    /// Every enum variant must be listed by `Boundary::all()`.
    #[test]
    fn all_boundaries_in_iter() {
        let listed = Boundary::all();
        for variant in Boundary::iter() {
            assert!(listed.contains(&variant));
        }
    }

    /// Delimiter boundaries consume the delimiter character when splitting.
    #[test]
    fn split_on_delims() {
        let words = split("my_word-list separated-by_delims", &Boundary::delims());
        assert_eq!(
            vec!["my", "word", "list", "separated", "by", "delims"],
            words
        )
    }

    /// `list_from` reports the boundaries present in a string, in `Boundary::all()` order.
    #[test]
    fn boundaries_found_in_string() {
        use Boundary::*;

        assert_eq!(vec![UpperLower], Boundary::list_from(".Aaaa"));
        assert_eq!(
            vec![LowerUpper, UpperLower, LowerDigit],
            Boundary::list_from("a8.Aa.aA")
        );
        assert_eq!(Boundary::digits(), Boundary::list_from("b1B1b"));
        assert_eq!(
            vec![Hyphen, Underscore, Space, Acronym],
            Boundary::list_from("AAa -_")
        );
    }
}
460