mod.rs - Codebrowser

1	/!*
2	Provides routines for extracting literal prefixes and suffixes from an `Hir`.
3	*/
4
5	use std::cmp;
6	use std::fmt;
7	use std::iter;
8	use std::mem;
9	use std::ops;
10
11	use crate::hir::{self, Hir, HirKind};
12
13	/// A set of literal byte strings extracted from a regular expression.
14	///
15	/// Every member of the set is a `Literal`, which is represented by a
16	/// `Vec<u8>`. (Notably, it may contain invalid UTF-8.) Every member is
17	/// said to be either complete* or cut. A complete literal means that*
18	/// it extends until the beginning (or end) of the regular expression. In
19	/// some circumstances, this can be used to indicate a match in the regular
20	/// expression.
21	///
22	/// A key aspect of literal extraction is knowing when to stop. It is not
23	/// feasible to blindly extract all literals from a regular expression, even if
24	/// there are finitely many. For example, the regular expression `[0-9]{10}`
25	/// has `10^10` distinct literals. For this reason, literal extraction is
26	/// bounded to some low number by default using heuristics, but the limits can
27	/// be tweaked.
28	///
29	/// WARNING: Literal extraction uses stack space proportional to the size
30	/// of the `Hir` expression. At some point, this drawback will be eliminated.
31	/// To protect yourself, set a reasonable
32	/// [`nest_limit` on your `Parser`](../../struct.ParserBuilder.html#method.nest_limit).
33	/// This is done for you by default.
34	#[derive(Clone, Eq, PartialEq)]
35	pub struct Literals {
36	lits: Vec<Literal>,
37	limit_size: usize,
38	limit_class: usize,
39	}
40
41	/// A single member of a set of literals extracted from a regular expression.
42	///
43	/// This type has `Deref` and `DerefMut` impls to `Vec<u8>` so that all slice
44	/// and `Vec` operations are available.
45	#[derive(Clone, Eq, Ord)]
46	pub struct Literal {
47	v: Vec<u8>,
48	cut: bool,
49	}
50
51	impl Literals {
52	/// Returns a new empty set of literals using default limits.
53	pub fn empty() -> Literals {
54	Literals { lits: vec![], limit_size: `250`, limit_class: `10` }
55	}
56
57	/// Returns a set of literal prefixes extracted from the given `Hir`.
58	pub fn prefixes(expr: &Hir) -> Literals {
59	let mut lits = Literals::empty();
60	lits.union_prefixes(expr);
61	lits
62	}
63
64	/// Returns a set of literal suffixes extracted from the given `Hir`.
65	pub fn suffixes(expr: &Hir) -> Literals {
66	let mut lits = Literals::empty();
67	lits.union_suffixes(expr);
68	lits
69	}
70
71	/// Get the approximate size limit (in bytes) of this set.
72	pub fn limit_size(&self) -> usize {
73	self.limit_size
74	}
75
76	/// Set the approximate size limit (in bytes) of this set.
77	///
78	/// If extracting a literal would put the set over this limit, then
79	/// extraction stops.
80	///
81	/// The new limits will only apply to additions to this set. Existing
82	/// members remain unchanged, even if the set exceeds the new limit.
83	pub fn set_limit_size(&mut self, size: usize) -> &mut Literals {
84	self.limit_size = size;
85	self
86	}
87
88	/// Get the character class size limit for this set.
89	pub fn limit_class(&self) -> usize {
90	self.limit_class
91	}
92
93	/// Limits the size of character(or byte) classes considered.
94	///
95	/// A value of `0` prevents all character classes from being considered.
96	///
97	/// This limit also applies to case insensitive literals, since each
98	/// character in the case insensitive literal is converted to a class, and
99	/// then case folded.
100	///
101	/// The new limits will only apply to additions to this set. Existing
102	/// members remain unchanged, even if the set exceeds the new limit.
103	pub fn set_limit_class(&mut self, size: usize) -> &mut Literals {
104	self.limit_class = size;
105	self
106	}
107
108	/// Returns the set of literals as a slice. Its order is unspecified.
109	pub fn literals(&self) -> &[Literal] {
110	&self.lits
111	}
112
113	/// Returns the length of the smallest literal.
114	///
115	/// Returns None is there are no literals in the set.
116	pub fn min_len(&self) -> Option<usize> {
117	let mut min = None;
118	for lit in &self.lits {
119	match min {
120	None => min = Some(lit.len()),
121	Some(m) if lit.len() < m => min = Some(lit.len()),
122	_ => {}
123	}
124	}
125	min
126	}
127
128	/// Returns true if all members in this set are complete.
129	pub fn all_complete(&self) -> bool {
130	!self.lits.is_empty() && self.lits.iter().all(\|l\| !l.is_cut())
131	}
132
133	/// Returns true if any member in this set is complete.
134	pub fn any_complete(&self) -> bool {
135	self.lits.iter().any(\|lit\| !lit.is_cut())
136	}
137
138	/// Returns true if this set contains an empty literal.
139	pub fn contains_empty(&self) -> bool {
140	self.lits.iter().any(\|lit\| lit.is_empty())
141	}
142
143	/// Returns true if this set is empty or if all of its members is empty.
144	pub fn is_empty(&self) -> bool {
145	self.lits.is_empty() \|\| self.lits.iter().all(\|lit\| lit.is_empty())
146	}
147
148	/// Returns a new empty set of literals using this set's limits.
149	pub fn to_empty(&self) -> Literals {
150	let mut lits = Literals::empty();
151	lits.set_limit_size(self.limit_size).set_limit_class(self.limit_class);
152	lits
153	}
154
155	/// Returns the longest common prefix of all members in this set.
156	pub fn longest_common_prefix(&self) -> &[u8] {
157	if self.is_empty() {
158	return &[];
159	}
160	let lit0 = &*self.lits[`0`];
161	let mut len = lit0.len();
162	for lit in &self.lits[`1`..] {
163	len = cmp::min(
164	len,
165	lit.iter().zip(lit0).take_while(\|&(a, b)\| a == b).count(),
166	);
167	}
168	&self.lits[`0`][..len]
169	}
170
171	/// Returns the longest common suffix of all members in this set.
172	pub fn longest_common_suffix(&self) -> &[u8] {
173	if self.is_empty() {
174	return &[];
175	}
176	let lit0 = &*self.lits[`0`];
177	let mut len = lit0.len();
178	for lit in &self.lits[`1`..] {
179	len = cmp::min(
180	len,
181	lit.iter()
182	.rev()
183	.zip(lit0.iter().rev())
184	.take_while(\|&(a, b)\| a == b)
185	.count(),
186	);
187	}
188	&self.lits[`0`][self.lits[`0`].len() - len..]
189	}
190
191	/// Returns a new set of literals with the given number of bytes trimmed
192	/// from the suffix of each literal.
193	///
194	/// If any literal would be cut out completely by trimming, then None is
195	/// returned.
196	///
197	/// Any duplicates that are created as a result of this transformation are
198	/// removed.
199	pub fn trim_suffix(&self, num_bytes: usize) -> Option<Literals> {
200	if self.min_len().map(\|len\| len <= num_bytes).unwrap_or(`true`) {
201	return None;
202	}
203	let mut new = self.to_empty();
204	for mut lit in self.lits.iter().cloned() {
205	let new_len = lit.len() - num_bytes;
206	lit.truncate(new_len);
207	lit.cut();
208	new.lits.push(lit);
209	}
210	new.lits.sort();
211	new.lits.dedup();
212	Some(new)
213	}
214
215	/// Returns a new set of prefixes of this set of literals that are
216	/// guaranteed to be unambiguous.
217	///
218	/// Any substring match with a member of the set is returned is guaranteed
219	/// to never overlap with a substring match of another member of the set
220	/// at the same starting position.
221	///
222	/// Given any two members of the returned set, neither is a substring of
223	/// the other.
224	pub fn unambiguous_prefixes(&self) -> Literals {
225	if self.lits.is_empty() {
226	return self.to_empty();
227	}
228	let mut old = self.lits.to_vec();
229	let mut new = self.to_empty();
230	'OUTER: while let Some(mut candidate) = old.pop() {
231	if candidate.is_empty() {
232	continue;
233	}
234	if new.lits.is_empty() {
235	new.lits.push(candidate);
236	continue;
237	}
238	for lit2 in &mut new.lits {
239	if lit2.is_empty() {
240	continue;
241	}
242	if &candidate == lit2 {
243	// If the literal is already in the set, then we can
244	// just drop it. But make sure that cut literals are
245	// infectious!
246	candidate.cut = candidate.cut \|\| lit2.cut;
247	lit2.cut = candidate.cut;
248	continue 'OUTER;
249	}
250	if candidate.len() < lit2.len() {
251	if let Some(i) = position(&candidate, &lit2) {
252	candidate.cut();
253	let mut lit3 = lit2.clone();
254	lit3.truncate(i);
255	lit3.cut();
256	old.push(lit3);
257	lit2.clear();
258	}
259	} else if let Some(i) = position(&lit2, &candidate) {
260	lit2.cut();
261	let mut new_candidate = candidate.clone();
262	new_candidate.truncate(i);
263	new_candidate.cut();
264	old.push(new_candidate);
265	candidate.clear();
266	}
267	// Oops, the candidate is already represented in the set.
268	if candidate.is_empty() {
269	continue 'OUTER;
270	}
271	}
272	new.lits.push(candidate);
273	}
274	new.lits.retain(\|lit\| !lit.is_empty());
275	new.lits.sort();
276	new.lits.dedup();
277	new
278	}
279
280	/// Returns a new set of suffixes of this set of literals that are
281	/// guaranteed to be unambiguous.
282	///
283	/// Any substring match with a member of the set is returned is guaranteed
284	/// to never overlap with a substring match of another member of the set
285	/// at the same ending position.
286	///
287	/// Given any two members of the returned set, neither is a substring of
288	/// the other.
289	pub fn unambiguous_suffixes(&self) -> Literals {
290	// This is a touch wasteful...
291	let mut lits = self.clone();
292	lits.reverse();
293	let mut unamb = lits.unambiguous_prefixes();
294	unamb.reverse();
295	unamb
296	}
297
298	/// Unions the prefixes from the given expression to this set.
299	///
300	/// If prefixes could not be added (for example, this set would exceed its
301	/// size limits or the set of prefixes from `expr` includes the empty
302	/// string), then false is returned.
303	///
304	/// Note that prefix literals extracted from `expr` are said to be complete
305	/// if and only if the literal extends from the beginning of `expr` to the
306	/// end of `expr`.
307	pub fn union_prefixes(&mut self, expr: &Hir) -> bool {
308	let mut lits = self.to_empty();
309	prefixes(expr, &mut lits);
310	!lits.is_empty() && !lits.contains_empty() && self.union(lits)
311	}
312
313	/// Unions the suffixes from the given expression to this set.
314	///
315	/// If suffixes could not be added (for example, this set would exceed its
316	/// size limits or the set of suffixes from `expr` includes the empty
317	/// string), then false is returned.
318	///
319	/// Note that prefix literals extracted from `expr` are said to be complete
320	/// if and only if the literal extends from the end of `expr` to the
321	/// beginning of `expr`.
322	pub fn union_suffixes(&mut self, expr: &Hir) -> bool {
323	let mut lits = self.to_empty();
324	suffixes(expr, &mut lits);
325	lits.reverse();
326	!lits.is_empty() && !lits.contains_empty() && self.union(lits)
327	}
328
329	/// Unions this set with another set.
330	///
331	/// If the union would cause the set to exceed its limits, then the union
332	/// is skipped and it returns false. Otherwise, if the union succeeds, it
333	/// returns true.
334	pub fn union(&mut self, lits: Literals) -> bool {
335	if self.num_bytes() + lits.num_bytes() > self.limit_size {
336	return `false`;
337	}
338	if lits.is_empty() {
339	self.lits.push(Literal::empty());
340	} else {
341	self.lits.extend(lits.lits);
342	}
343	`true`
344	}
345
346	/// Extends this set with another set.
347	///
348	/// The set of literals is extended via a cross product.
349	///
350	/// If a cross product would cause this set to exceed its limits, then the
351	/// cross product is skipped and it returns false. Otherwise, if the cross
352	/// product succeeds, it returns true.
353	pub fn cross_product(&mut self, lits: &Literals) -> bool {
354	if lits.is_empty() {
355	return `true`;
356	}
357	// Check that we make sure we stay in our limits.
358	let mut size_after;
359	if self.is_empty() \|\| !self.any_complete() {
360	size_after = self.num_bytes();
361	for lits_lit in lits.literals() {
362	size_after += lits_lit.len();
363	}
364	} else {
365	size_after = self.lits.iter().fold(`0`, \|accum, lit\| {
366	accum + if lit.is_cut() { lit.len() } else { `0` }
367	});
368	for lits_lit in lits.literals() {
369	for self_lit in self.literals() {
370	if !self_lit.is_cut() {
371	size_after += self_lit.len() + lits_lit.len();
372	}
373	}
374	}
375	}
376	if size_after > self.limit_size {
377	return `false`;
378	}
379
380	let mut base = self.remove_complete();
381	if base.is_empty() {
382	base = vec![Literal::empty()];
383	}
384	for lits_lit in lits.literals() {
385	for mut self_lit in base.clone() {
386	self_lit.extend(&**lits_lit);
387	self_lit.cut = lits_lit.cut;
388	self.lits.push(self_lit);
389	}
390	}
391	`true`
392	}
393
394	/// Extends each literal in this set with the bytes given.
395	///
396	/// If the set is empty, then the given literal is added to the set.
397	///
398	/// If adding any number of bytes to all members of this set causes a limit
399	/// to be exceeded, then no bytes are added and false is returned. If a
400	/// prefix of `bytes` can be fit into this set, then it is used and all
401	/// resulting literals are cut.
402	pub fn cross_add(&mut self, bytes: &[u8]) -> bool {
403	// N.B. This could be implemented by simply calling cross_product with
404	// a literal set containing just `bytes`, but we can be smarter about
405	// taking shorter prefixes of `bytes` if they'll fit.
406	if bytes.is_empty() {
407	return `true`;
408	}
409	if self.lits.is_empty() {
410	let i = cmp::min(self.limit_size, bytes.len());
411	self.lits.push(Literal::new(bytes[..i].to_owned()));
412	self.lits[`0`].cut = i < bytes.len();
413	return !self.lits[`0`].is_cut();
414	}
415	let size = self.num_bytes();
416	if size + self.lits.len() >= self.limit_size {
417	return `false`;
418	}
419	let mut i = `1`;
420	while size + (i * self.lits.len()) <= self.limit_size
421	&& i < bytes.len()
422	{
423	i += `1`;
424	}
425	for lit in &mut self.lits {
426	if !lit.is_cut() {
427	lit.extend(&bytes[..i]);
428	if i < bytes.len() {
429	lit.cut();
430	}
431	}
432	}
433	`true`
434	}
435
436	/// Adds the given literal to this set.
437	///
438	/// Returns false if adding this literal would cause the class to be too
439	/// big.
440	pub fn add(&mut self, lit: Literal) -> bool {
441	if self.num_bytes() + lit.len() > self.limit_size {
442	return `false`;
443	}
444	self.lits.push(lit);
445	`true`
446	}
447
448	/// Extends each literal in this set with the character class given.
449	///
450	/// Returns false if the character class was too big to add.
451	pub fn add_char_class(&mut self, cls: &hir::ClassUnicode) -> bool {
452	self._add_char_class(cls, `false`)
453	}
454
455	/// Extends each literal in this set with the character class given,
456	/// writing the bytes of each character in reverse.
457	///
458	/// Returns false if the character class was too big to add.
459	fn add_char_class_reverse(&mut self, cls: &hir::ClassUnicode) -> bool {
460	self._add_char_class(cls, `true`)
461	}
462
463	fn _add_char_class(
464	&mut self,
465	cls: &hir::ClassUnicode,
466	reverse: bool,
467	) -> bool {
468	use std::char;
469
470	if self.class_exceeds_limits(cls_char_count(cls)) {
471	return `false`;
472	}
473	let mut base = self.remove_complete();
474	if base.is_empty() {
475	base = vec![Literal::empty()];
476	}
477	for r in cls.iter() {
478	let (s, e) = (r.start as u32, r.end as u32 + `1`);
479	for c in (s..e).filter_map(char::from_u32) {
480	for mut lit in base.clone() {
481	let mut bytes = c.to_string().into_bytes();
482	if reverse {
483	bytes.reverse();
484	}
485	lit.extend(&bytes);
486	self.lits.push(lit);
487	}
488	}
489	}
490	`true`
491	}
492
493	/// Extends each literal in this set with the byte class given.
494	///
495	/// Returns false if the byte class was too big to add.
496	pub fn add_byte_class(&mut self, cls: &hir::ClassBytes) -> bool {
497	if self.class_exceeds_limits(cls_byte_count(cls)) {
498	return `false`;
499	}
500	let mut base = self.remove_complete();
501	if base.is_empty() {
502	base = vec![Literal::empty()];
503	}
504	for r in cls.iter() {
505	let (s, e) = (r.start as u32, r.end as u32 + `1`);
506	for b in (s..e).map(\|b\| b as u8) {
507	for mut lit in base.clone() {
508	lit.push(b);
509	self.lits.push(lit);
510	}
511	}
512	}
513	`true`
514	}
515
516	/// Cuts every member of this set. When a member is cut, it can never
517	/// be extended.
518	pub fn cut(&mut self) {
519	for lit in &mut self.lits {
520	lit.cut();
521	}
522	}
523
524	/// Reverses all members in place.
525	pub fn reverse(&mut self) {
526	for lit in &mut self.lits {
527	lit.reverse();
528	}
529	}
530
531	/// Clears this set of all members.
532	pub fn clear(&mut self) {
533	self.lits.clear();
534	}
535
536	/// Pops all complete literals out of this set.
537	fn remove_complete(&mut self) -> Vec<Literal> {
538	let mut base = vec![];
539	for lit in mem::replace(&mut self.lits, vec![]) {
540	if lit.is_cut() {
541	self.lits.push(lit);
542	} else {
543	base.push(lit);
544	}
545	}
546	base
547	}
548
549	/// Returns the total number of bytes in this set.
550	fn num_bytes(&self) -> usize {
551	self.lits.iter().fold(`0`, \|accum, lit\| accum + lit.len())
552	}
553
554	/// Returns true if a character class with the given size would cause this
555	/// set to exceed its limits.
556	///
557	/// The size given should correspond to the number of items in the class.
558	fn class_exceeds_limits(&self, size: usize) -> bool {
559	if size > self.limit_class {
560	return `true`;
561	}
562	// This is an approximation since codepoints in a char class can encode
563	// to 1-4 bytes.
564	let new_byte_count = if self.lits.is_empty() {
565	size
566	} else {
567	self.lits.iter().fold(`0`, \|accum, lit\| {
568	accum
569	+ if lit.is_cut() {
570	// If the literal is cut, then we'll never add
571	// anything to it, so don't count it.
572	`0`
573	} else {
574	(lit.len() + `1`) * size
575	}
576	})
577	};
578	new_byte_count > self.limit_size
579	}
580	}
581
582	fn prefixes(expr: &Hir, lits: &mut Literals) {
583	match *expr.kind() {
584	HirKind::Literal(hir::Literal::Unicode(c)) => {
585	let mut buf = [`0`; `4`];
586	lits.cross_add(c.encode_utf8(&mut buf).as_bytes());
587	}
588	HirKind::Literal(hir::Literal::Byte(b)) => {
589	lits.cross_add(&[b]);
590	}
591	HirKind::Class(hir::Class::Unicode(ref cls)) => {
592	if !lits.add_char_class(cls) {
593	lits.cut();
594	}
595	}
596	HirKind::Class(hir::Class::Bytes(ref cls)) => {
597	if !lits.add_byte_class(cls) {
598	lits.cut();
599	}
600	}
601	HirKind::Group(hir::Group { ref hir, .. }) => {
602	prefixes(&**hir, lits);
603	}
604	HirKind::Repetition(ref x) => match x.kind {
605	hir::RepetitionKind::ZeroOrOne => {
606	repeat_zero_or_one_literals(&x.hir, lits, prefixes);
607	}
608	hir::RepetitionKind::ZeroOrMore => {
609	repeat_zero_or_more_literals(&x.hir, lits, prefixes);
610	}
611	hir::RepetitionKind::OneOrMore => {
612	repeat_one_or_more_literals(&x.hir, lits, prefixes);
613	}
614	hir::RepetitionKind::Range(ref rng) => {
615	let (min, max) = match *rng {
616	hir::RepetitionRange::Exactly(m) => (m, Some(m)),
617	hir::RepetitionRange::AtLeast(m) => (m, None),
618	hir::RepetitionRange::Bounded(m, n) => (m, Some(n)),
619	};
620	repeat_range_literals(
621	&x.hir, min, max, x.greedy, lits, prefixes,
622	)
623	}
624	},
625	HirKind::Concat(ref es) if es.is_empty() => {}
626	HirKind::Concat(ref es) if es.len() == `1` => prefixes(&es[`0`], lits),
627	HirKind::Concat(ref es) => {
628	for e in es {
629	if let HirKind::Anchor(hir::Anchor::StartText) = *e.kind() {
630	if !lits.is_empty() {
631	lits.cut();
632	break;
633	}
634	lits.add(Literal::empty());
635	continue;
636	}
637	let mut lits2 = lits.to_empty();
638	prefixes(e, &mut lits2);
639	if !lits.cross_product(&lits2) \|\| !lits2.any_complete() {
640	// If this expression couldn't yield any literal that
641	// could be extended, then we need to quit. Since we're
642	// short-circuiting, we also need to freeze every member.
643	lits.cut();
644	break;
645	}
646	}
647	}
648	HirKind::Alternation(ref es) => {
649	alternate_literals(es, lits, prefixes);
650	}
651	_ => lits.cut(),
652	}
653	}
654
655	fn suffixes(expr: &Hir, lits: &mut Literals) {
656	match *expr.kind() {
657	HirKind::Literal(hir::Literal::Unicode(c)) => {
658	let mut buf = [`0u8`; `4`];
659	let i = c.encode_utf8(&mut buf).len();
660	let buf = &mut buf[..i];
661	buf.reverse();
662	lits.cross_add(buf);
663	}
664	HirKind::Literal(hir::Literal::Byte(b)) => {
665	lits.cross_add(&[b]);
666	}
667	HirKind::Class(hir::Class::Unicode(ref cls)) => {
668	if !lits.add_char_class_reverse(cls) {
669	lits.cut();
670	}
671	}
672	HirKind::Class(hir::Class::Bytes(ref cls)) => {
673	if !lits.add_byte_class(cls) {
674	lits.cut();
675	}
676	}
677	HirKind::Group(hir::Group { ref hir, .. }) => {
678	suffixes(&**hir, lits);
679	}
680	HirKind::Repetition(ref x) => match x.kind {
681	hir::RepetitionKind::ZeroOrOne => {
682	repeat_zero_or_one_literals(&x.hir, lits, suffixes);
683	}
684	hir::RepetitionKind::ZeroOrMore => {
685	repeat_zero_or_more_literals(&x.hir, lits, suffixes);
686	}
687	hir::RepetitionKind::OneOrMore => {
688	repeat_one_or_more_literals(&x.hir, lits, suffixes);
689	}
690	hir::RepetitionKind::Range(ref rng) => {
691	let (min, max) = match *rng {
692	hir::RepetitionRange::Exactly(m) => (m, Some(m)),
693	hir::RepetitionRange::AtLeast(m) => (m, None),
694	hir::RepetitionRange::Bounded(m, n) => (m, Some(n)),
695	};
696	repeat_range_literals(
697	&x.hir, min, max, x.greedy, lits, suffixes,
698	)
699	}
700	},
701	HirKind::Concat(ref es) if es.is_empty() => {}
702	HirKind::Concat(ref es) if es.len() == `1` => suffixes(&es[`0`], lits),
703	HirKind::Concat(ref es) => {
704	for e in es.iter().rev() {
705	if let HirKind::Anchor(hir::Anchor::EndText) = *e.kind() {
706	if !lits.is_empty() {
707	lits.cut();
708	break;
709	}
710	lits.add(Literal::empty());
711	continue;
712	}
713	let mut lits2 = lits.to_empty();
714	suffixes(e, &mut lits2);
715	if !lits.cross_product(&lits2) \|\| !lits2.any_complete() {
716	// If this expression couldn't yield any literal that
717	// could be extended, then we need to quit. Since we're
718	// short-circuiting, we also need to freeze every member.
719	lits.cut();
720	break;
721	}
722	}
723	}
724	HirKind::Alternation(ref es) => {
725	alternate_literals(es, lits, suffixes);
726	}
727	_ => lits.cut(),
728	}
729	}
730
731	fn repeat_zero_or_one_literals<F: FnMut(&Hir, &mut Literals)>(
732	e: &Hir,
733	lits: &mut Literals,
734	mut f: F,
735	) {
736	f(
737	&Hir::repetition(hir::Repetition {
738	kind: hir::RepetitionKind::ZeroOrMore,
739	// FIXME: Our literal extraction doesn't care about greediness.
740	// Which is partially why we're treating 'e?' as 'e'. Namely,*
741	// 'ab??' yields [Complete(ab), Complete(a)], but it should yield
742	// [Complete(a), Complete(ab)] because of the non-greediness.
743	greedy: `true`,
744	hir: Box::new(e.clone()),
745	}),
746	lits,
747	);
748	}
749
750	fn repeat_zero_or_more_literals<F: FnMut(&Hir, &mut Literals)>(
751	e: &Hir,
752	lits: &mut Literals,
753	mut f: F,
754	) {
755	let (mut lits2, mut lits3) = (lits.clone(), lits.to_empty());
756	lits3.set_limit_size(lits.limit_size() / `2`);
757	f(e, &mut lits3);
758
759	if lits3.is_empty() \|\| !lits2.cross_product(&lits3) {
760	lits.cut();
761	return;
762	}
763	lits2.cut();
764	lits2.add(Literal::empty());
765	if !lits.union(lits2) {
766	lits.cut();
767	}
768	}
769
770	fn repeat_one_or_more_literals<F: FnMut(&Hir, &mut Literals)>(
771	e: &Hir,
772	lits: &mut Literals,
773	mut f: F,
774	) {
775	f(e, lits);
776	lits.cut();
777	}
778
779	fn repeat_range_literals<F: FnMut(&Hir, &mut Literals)>(
780	e: &Hir,
781	min: u32,
782	max: Option<u32>,
783	greedy: bool,
784	lits: &mut Literals,
785	mut f: F,
786	) {
787	if min == `0` {
788	// This is a bit conservative. If `max` is set, then we could
789	// treat this as a finite set of alternations. For now, we
790	// just treat it as `e`.*
791	f(
792	&Hir::repetition(hir::Repetition {
793	kind: hir::RepetitionKind::ZeroOrMore,
794	greedy,
795	hir: Box::new(e.clone()),
796	}),
797	lits,
798	);
799	} else {
800	if min > `0` {
801	let n = cmp::min(lits.limit_size, min as usize);
802	let es = iter::repeat(e.clone()).take(n).collect();
803	f(&Hir::concat(es), lits);
804	if n < min as usize \|\| lits.contains_empty() {
805	lits.cut();
806	}
807	}
808	if max.map_or(`true`, \|max\| min < max) {
809	lits.cut();
810	}
811	}
812	}
813
814	fn alternate_literals<F: FnMut(&Hir, &mut Literals)>(
815	es: &[Hir],
816	lits: &mut Literals,
817	mut f: F,
818	) {
819	let mut lits2 = lits.to_empty();
820	for e in es {
821	let mut lits3 = lits.to_empty();
822	lits3.set_limit_size(lits.limit_size() / `5`);
823	f(e, &mut lits3);
824	if lits3.is_empty() \|\| !lits2.union(lits3) {
825	// If we couldn't find suffixes for any* of the*
826	// alternates, then the entire alternation has to be thrown
827	// away and any existing members must be frozen. Similarly,
828	// if the union couldn't complete, stop and freeze.
829	lits.cut();
830	return;
831	}
832	}
833	if !lits.cross_product(&lits2) {
834	lits.cut();
835	}
836	}
837
838	impl fmt::Debug for Literals {
839	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
840	f.debug_struct("Literals")
841	.field("lits", &self.lits)
842	.field("limit_size", &self.limit_size)
843	.field("limit_class", &self.limit_class)
844	.finish()
845	}
846	}
847
848	impl Literal {
849	/// Returns a new complete literal with the bytes given.
850	pub fn new(bytes: Vec<u8>) -> Literal {
851	Literal { v: bytes, cut: `false` }
852	}
853
854	/// Returns a new complete empty literal.
855	pub fn empty() -> Literal {
856	Literal { v: vec![], cut: `false` }
857	}
858
859	/// Returns true if this literal was "cut."
860	pub fn is_cut(&self) -> bool {
861	self.cut
862	}
863
864	/// Cuts this literal.
865	pub fn cut(&mut self) {
866	self.cut = `true`;
867	}
868	}
869
870	impl PartialEq for Literal {
871	fn eq(&self, other: &Literal) -> bool {
872	self.v == other.v
873	}
874	}
875
876	impl PartialOrd for Literal {
877	fn partial_cmp(&self, other: &Literal) -> Option<cmp::Ordering> {
878	self.v.partial_cmp(&other.v)
879	}
880	}
881
882	impl fmt::Debug for Literal {
883	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
884	if self.is_cut() {
885	write!(f, "Cut({})", escape_unicode(&self.v))
886	} else {
887	write!(f, "Complete({})", escape_unicode(&self.v))
888	}
889	}
890	}
891
892	impl AsRef<[u8]> for Literal {
893	fn as_ref(&self) -> &[u8] {
894	&self.v
895	}
896	}
897
898	impl ops::Deref for Literal {
899	type Target = Vec<u8>;
900	fn deref(&self) -> &Vec<u8> {
901	&self.v
902	}
903	}
904
905	impl ops::DerefMut for Literal {
906	fn deref_mut(&mut self) -> &mut Vec<u8> {
907	&mut self.v
908	}
909	}
910
911	fn position(needle: &[u8], mut haystack: &[u8]) -> Option<usize> {
912	let mut i = `0`;
913	while haystack.len() >= needle.len() {
914	if needle == &haystack[..needle.len()] {
915	return Some(i);
916	}
917	i += `1`;
918	haystack = &haystack[`1`..];
919	}
920	None
921	}
922
923	fn escape_unicode(bytes: &[u8]) -> String {
924	let show = match ::std::str::from_utf8(bytes) {
925	Ok(v) => v.to_string(),
926	Err(_) => escape_bytes(bytes),
927	};
928	let mut space_escaped = String::new();
929	for c in show.chars() {
930	if c.is_whitespace() {
931	let escaped = if c as u32 <= `0x7F` {
932	escape_byte(c as u8)
933	} else if c as u32 <= `0xFFFF` {
934	format!(r"\u{{{:04x}}}", c as u32)
935	} else {
936	format!(r"\U{{{:08x}}}", c as u32)
937	};
938	space_escaped.push_str(&escaped);
939	} else {
940	space_escaped.push(c);
941	}
942	}
943	space_escaped
944	}
945
946	fn escape_bytes(bytes: &[u8]) -> String {
947	let mut s = String::new();
948	for &b in bytes {
949	s.push_str(&escape_byte(b));
950	}
951	s
952	}
953
954	fn escape_byte(byte: u8) -> String {
955	use std::ascii::escape_default;
956
957	let escaped: Vec<u8> = escape_default(byte).collect();
958	String::from_utf8_lossy(&escaped).into_owned()
959	}
960
961	fn cls_char_count(cls: &hir::ClassUnicode) -> usize {
962	cls.iter().map(\|&r\| `1` + (r.end as u32) - (r.start as u32)).sum::<u32>()
963	as usize
964	}
965
966	fn cls_byte_count(cls: &hir::ClassBytes) -> usize {
967	cls.iter().map(\|&r\| `1` + (r.end as u32) - (r.start as u32)).sum::<u32>()
968	as usize
969	}
970
971	#[cfg(test)]
972	mod tests {
973	use std::fmt;
974
975	use super::{escape_bytes, Literal, Literals};
976	use crate::hir::Hir;
977	use crate::ParserBuilder;
978
979	// To make test failures easier to read.
980	#[derive(Debug, Eq, PartialEq)]
981	struct Bytes(Vec<ULiteral>);
982	#[derive(Debug, Eq, PartialEq)]
983	struct Unicode(Vec<ULiteral>);
984
985	fn escape_lits(blits: &[Literal]) -> Vec<ULiteral> {
986	let mut ulits = vec![];
987	for blit in blits {
988	ulits
989	.push(ULiteral { v: escape_bytes(&blit), cut: blit.is_cut() });
990	}
991	ulits
992	}
993
994	fn create_lits<I: IntoIterator<Item = Literal>>(it: I) -> Literals {
995	Literals {
996	lits: it.into_iter().collect(),
997	limit_size: `0`,
998	limit_class: `0`,
999	}
1000	}
1001
1002	// Needs to be pub for 1.3?
1003	#[derive(Clone, Eq, PartialEq)]
1004	pub struct ULiteral {
1005	v: String,
1006	cut: bool,
1007	}
1008
1009	impl ULiteral {
1010	fn is_cut(&self) -> bool {
1011	self.cut
1012	}
1013	}
1014
1015	impl fmt::Debug for ULiteral {
1016	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1017	if self.is_cut() {
1018	write!(f, "Cut({})", self.v)
1019	} else {
1020	write!(f, "Complete({})", self.v)
1021	}
1022	}
1023	}
1024
1025	impl PartialEq<Literal> for ULiteral {
1026	fn eq(&self, other: &Literal) -> bool {
1027	self.v.as_bytes() == &*other.v && self.is_cut() == other.is_cut()
1028	}
1029	}
1030
1031	impl PartialEq<ULiteral> for Literal {
1032	fn eq(&self, other: &ULiteral) -> bool {
1033	&*self.v == other.v.as_bytes() && self.is_cut() == other.is_cut()
1034	}
1035	}
1036
1037	#[allow(non_snake_case)]
1038	fn C(s: &'static str) -> ULiteral {
1039	ULiteral { v: s.to_owned(), cut: `true` }
1040	}
1041	#[allow(non_snake_case)]
1042	fn M(s: &'static str) -> ULiteral {
1043	ULiteral { v: s.to_owned(), cut: `false` }
1044	}
1045
1046	fn prefixes(lits: &mut Literals, expr: &Hir) {
1047	lits.union_prefixes(expr);
1048	}
1049
1050	fn suffixes(lits: &mut Literals, expr: &Hir) {
1051	lits.union_suffixes(expr);
1052	}
1053
1054	macro_rules! assert_lit_eq {
1055	($which:ident, $got_lits:expr, $($expected_lit:expr),*) => {{
1056	let expected: Vec<ULiteral> = vec![$($expected_lit),*];
1057	let lits = $got_lits;
1058	assert_eq!(
1059	$which(expected.clone()),
1060	$which(escape_lits(lits.literals())));
1061	assert_eq!(
1062	!expected.is_empty() && expected.iter().all(\|l\| !l.is_cut()),
1063	lits.all_complete());
1064	assert_eq!(
1065	expected.iter().any(\|l\| !l.is_cut()),
1066	lits.any_complete());
1067	}};
1068	}
1069
1070	macro_rules! test_lit {
1071	($name:ident, $which:ident, $re:expr) => {
1072	test_lit!($name, $which, $re,);
1073	};
1074	($name:ident, $which:ident, $re:expr, $($lit:expr),*) => {
1075	#[test]
1076	fn $name() {
1077	let expr = ParserBuilder::new()
1078	.build()
1079	.parse($re)
1080	.unwrap();
1081	let lits = Literals::$which(&expr);
1082	assert_lit_eq!(Unicode, lits, $($lit),*);
1083
1084	let expr = ParserBuilder::new()
1085	.allow_invalid_utf8(`true`)
1086	.unicode(`false`)
1087	.build()
1088	.parse($re)
1089	.unwrap();
1090	let lits = Literals::$which(&expr);
1091	assert_lit_eq!(Bytes, lits, $($lit),*);
1092	}
1093	};
1094	}
1095
1096	// ************************************************************************
1097	// Tests for prefix literal extraction.
1098	// ************************************************************************
1099
1100	// Elementary tests.
1101	test_lit!(pfx_one_lit1, prefixes, "a", M("a"));
1102	test_lit!(pfx_one_lit2, prefixes, "abc", M("abc"));
1103	test_lit!(pfx_one_lit3, prefixes, "(?u)☃", M("`\\`xe2`\\`x98`\\`x83"));
1104	#[cfg(feature = "unicode-case")]
1105	test_lit!(pfx_one_lit4, prefixes, "(?ui)☃", M("`\\`xe2`\\`x98`\\`x83"));
1106	test_lit!(pfx_class1, prefixes, "[1-4]", M("1"), M("2"), M("3"), M("4"));
1107	test_lit!(
1108	pfx_class2,
1109	prefixes,
1110	"(?u)[☃Ⅰ]",
1111	M("`\\`xe2`\\`x85`\\`xa0"),
1112	M("`\\`xe2`\\`x98`\\`x83")
1113	);
1114	#[cfg(feature = "unicode-case")]
1115	test_lit!(
1116	pfx_class3,
1117	prefixes,
1118	"(?ui)[☃Ⅰ]",
1119	M("`\\`xe2`\\`x85`\\`xa0"),
1120	M("`\\`xe2`\\`x85`\\`xb0"),
1121	M("`\\`xe2`\\`x98`\\`x83")
1122	);
1123	test_lit!(pfx_one_lit_casei1, prefixes, "(?i-u)a", M("A"), M("a"));
1124	test_lit!(
1125	pfx_one_lit_casei2,
1126	prefixes,
1127	"(?i-u)abc",
1128	M("ABC"),
1129	M("aBC"),
1130	M("AbC"),
1131	M("abC"),
1132	M("ABc"),
1133	M("aBc"),
1134	M("Abc"),
1135	M("abc")
1136	);
1137	test_lit!(pfx_group1, prefixes, "(a)", M("a"));
1138	test_lit!(pfx_rep_zero_or_one1, prefixes, "a?");
1139	test_lit!(pfx_rep_zero_or_one2, prefixes, "(?:abc)?");
1140	test_lit!(pfx_rep_zero_or_one_cat1, prefixes, "ab?", C("ab"), M("a"));
1141	// FIXME: This should return [M("a"), M("ab")] because of the non-greedy
1142	// repetition. As a work-around, we rewrite ab?? as ab?, and thus we get*
1143	// a cut literal.
1144	test_lit!(pfx_rep_zero_or_one_cat2, prefixes, "ab??", C("ab"), M("a"));
1145	test_lit!(pfx_rep_zero_or_more1, prefixes, "a*");
1146	test_lit!(pfx_rep_zero_or_more2, prefixes, "(?:abc)*");
1147	test_lit!(pfx_rep_one_or_more1, prefixes, "a+", C("a"));
1148	test_lit!(pfx_rep_one_or_more2, prefixes, "(?:abc)+", C("abc"));
1149	test_lit!(pfx_rep_nested_one_or_more, prefixes, "(?:a+)+", C("a"));
1150	test_lit!(pfx_rep_range1, prefixes, "a{0}");
1151	test_lit!(pfx_rep_range2, prefixes, "a{0,}");
1152	test_lit!(pfx_rep_range3, prefixes, "a{0,1}");
1153	test_lit!(pfx_rep_range4, prefixes, "a{1}", M("a"));
1154	test_lit!(pfx_rep_range5, prefixes, "a{2}", M("aa"));
1155	test_lit!(pfx_rep_range6, prefixes, "a{1,2}", C("a"));
1156	test_lit!(pfx_rep_range7, prefixes, "a{2,3}", C("aa"));
1157
1158	// Test regexes with concatenations.
1159	test_lit!(pfx_cat1, prefixes, "(?:a)(?:b)", M("ab"));
1160	test_lit!(pfx_cat2, prefixes, "[ab]z", M("az"), M("bz"));
1161	test_lit!(
1162	pfx_cat3,
1163	prefixes,
1164	"(?i-u)[ab]z",
1165	M("AZ"),
1166	M("BZ"),
1167	M("aZ"),
1168	M("bZ"),
1169	M("Az"),
1170	M("Bz"),
1171	M("az"),
1172	M("bz")
1173	);
1174	test_lit!(
1175	pfx_cat4,
1176	prefixes,
1177	"[ab][yz]",
1178	M("ay"),
1179	M("by"),
1180	M("az"),
1181	M("bz")
1182	);
1183	test_lit!(pfx_cat5, prefixes, "a*b", C("a"), M("b"));
1184	test_lit!(pfx_cat6, prefixes, "abc", C("a"), C("b"), M("c"));
1185	test_lit!(pfx_cat7, prefixes, "abc+", C("a"), C("b"), C("c"));
1186	test_lit!(pfx_cat8, prefixes, "a*b+c", C("a"), C("b"));
1187	test_lit!(pfx_cat9, prefixes, "ab+c", C("a"), C("b"));
1188	test_lit!(pfx_cat10, prefixes, "ab*", C("ab"), M("a"));
1189	test_lit!(pfx_cat11, prefixes, "ab*c", C("ab"), M("ac"));
1190	test_lit!(pfx_cat12, prefixes, "ab+", C("ab"));
1191	test_lit!(pfx_cat13, prefixes, "ab+c", C("ab"));
1192	test_lit!(pfx_cat14, prefixes, "a^", C("a"));
1193	test_lit!(pfx_cat15, prefixes, "$a");
1194	test_lit!(pfx_cat16, prefixes, r"ab*c", C("ab"), M("ac"));
1195	test_lit!(pfx_cat17, prefixes, r"ab+c", C("ab"));
1196	test_lit!(pfx_cat18, prefixes, r"z*azb", C("z"), M("azb"));
1197	test_lit!(pfx_cat19, prefixes, "a.z", C("a"));
1198
1199	// Test regexes with alternations.
1200	test_lit!(pfx_alt1, prefixes, "a\|b", M("a"), M("b"));
1201	test_lit!(pfx_alt2, prefixes, "[1-3]\|b", M("1"), M("2"), M("3"), M("b"));
1202	test_lit!(pfx_alt3, prefixes, "y(?:a\|b)z", M("yaz"), M("ybz"));
1203	test_lit!(pfx_alt4, prefixes, "a\|b*");
1204	test_lit!(pfx_alt5, prefixes, "a\|b+", M("a"), C("b"));
1205	test_lit!(pfx_alt6, prefixes, "a\|(?:b\|c*)");
1206	test_lit!(
1207	pfx_alt7,
1208	prefixes,
1209	"(a\|b)c\|(a\|ab)c",
1210	C("a"),
1211	C("b"),
1212	M("c"),
1213	C("a"),
1214	C("ab"),
1215	M("c")
1216	);
1217	test_lit!(pfx_alt8, prefixes, "a*b\|c", C("a"), M("b"), M("c"));
1218
1219	// Test regexes with empty assertions.
1220	test_lit!(pfx_empty1, prefixes, "^a", M("a"));
1221	test_lit!(pfx_empty2, prefixes, "a${2}", C("a"));
1222	test_lit!(pfx_empty3, prefixes, "^abc", M("abc"));
1223	test_lit!(pfx_empty4, prefixes, "(?:^abc)\|(?:^z)", M("abc"), M("z"));
1224
1225	// Make sure some curious regexes have no prefixes.
1226	test_lit!(pfx_nothing1, prefixes, ".");
1227	test_lit!(pfx_nothing2, prefixes, "(?s).");
1228	test_lit!(pfx_nothing3, prefixes, "^");
1229	test_lit!(pfx_nothing4, prefixes, "$");
1230	test_lit!(pfx_nothing6, prefixes, "(?m)$");
1231	test_lit!(pfx_nothing7, prefixes, r"\b");
1232	test_lit!(pfx_nothing8, prefixes, r"\B");
1233
1234	// Test a few regexes that defeat any prefix literal detection.
1235	test_lit!(pfx_defeated1, prefixes, ".a");
1236	test_lit!(pfx_defeated2, prefixes, "(?s).a");
1237	test_lit!(pfx_defeated3, prefixes, "abc*");
1238	test_lit!(pfx_defeated4, prefixes, "a\|.");
1239	test_lit!(pfx_defeated5, prefixes, ".\|a");
1240	test_lit!(pfx_defeated6, prefixes, "a\|^");
1241	test_lit!(pfx_defeated7, prefixes, ".(?:a(?:b)(?:c))");
1242	test_lit!(pfx_defeated8, prefixes, "$a");
1243	test_lit!(pfx_defeated9, prefixes, "(?m)$a");
1244	test_lit!(pfx_defeated10, prefixes, r"\ba");
1245	test_lit!(pfx_defeated11, prefixes, r"\Ba");
1246	test_lit!(pfx_defeated12, prefixes, "^*a");
1247	test_lit!(pfx_defeated13, prefixes, "^+a");
1248
1249	test_lit!(
1250	pfx_crazy1,
1251	prefixes,
1252	r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]",
1253	C("Mo`\\`'"),
1254	C("Mu`\\`'"),
1255	C("Moam"),
1256	C("Muam")
1257	);
1258
1259	// ************************************************************************
1260	// Tests for quiting prefix literal search.
1261	// ************************************************************************
1262
1263	macro_rules! test_exhausted {
1264	($name:ident, $which:ident, $re:expr) => {
1265	test_exhausted!($name, $which, $re,);
1266	};
1267	($name:ident, $which:ident, $re:expr, $($lit:expr),*) => {
1268	#[test]
1269	fn $name() {
1270	let expr = ParserBuilder::new()
1271	.build()
1272	.parse($re)
1273	.unwrap();
1274	let mut lits = Literals::empty();
1275	lits.set_limit_size(`20`).set_limit_class(`10`);
1276	$which(&mut lits, &expr);
1277	assert_lit_eq!(Unicode, lits, $($lit),*);
1278
1279	let expr = ParserBuilder::new()
1280	.allow_invalid_utf8(`true`)
1281	.unicode(`false`)
1282	.build()
1283	.parse($re)
1284	.unwrap();
1285	let mut lits = Literals::empty();
1286	lits.set_limit_size(`20`).set_limit_class(`10`);
1287	$which(&mut lits, &expr);
1288	assert_lit_eq!(Bytes, lits, $($lit),*);
1289	}
1290	};
1291	}
1292
1293	// These test use a much lower limit than the default so that we can
1294	// write test cases of reasonable size.
1295	test_exhausted!(pfx_exhausted1, prefixes, "[a-z]");
1296	test_exhausted!(pfx_exhausted2, prefixes, "[a-z]*A");
1297	test_exhausted!(pfx_exhausted3, prefixes, "A[a-z]Z", C("A"));
1298	test_exhausted!(
1299	pfx_exhausted4,
1300	prefixes,
1301	"(?i-u)foobar",
1302	C("FO"),
1303	C("fO"),
1304	C("Fo"),
1305	C("fo")
1306	);
1307	test_exhausted!(
1308	pfx_exhausted5,
1309	prefixes,
1310	"(?:ab){100}",
1311	C("abababababababababab")
1312	);
1313	test_exhausted!(
1314	pfx_exhausted6,
1315	prefixes,
1316	"(?:(?:ab){100})*cd",
1317	C("ababababab"),
1318	M("cd")
1319	);
1320	test_exhausted!(
1321	pfx_exhausted7,
1322	prefixes,
1323	"z(?:(?:ab){100})*cd",
1324	C("zababababab"),
1325	M("zcd")
1326	);
1327	test_exhausted!(
1328	pfx_exhausted8,
1329	prefixes,
1330	"aaaaaaaaaaaaaaaaaaaaz",
1331	C("aaaaaaaaaaaaaaaaaaaa")
1332	);
1333
1334	// ************************************************************************
1335	// Tests for suffix literal extraction.
1336	// ************************************************************************
1337
1338	// Elementary tests.
1339	test_lit!(sfx_one_lit1, suffixes, "a", M("a"));
1340	test_lit!(sfx_one_lit2, suffixes, "abc", M("abc"));
1341	test_lit!(sfx_one_lit3, suffixes, "(?u)☃", M("`\\`xe2`\\`x98`\\`x83"));
1342	#[cfg(feature = "unicode-case")]
1343	test_lit!(sfx_one_lit4, suffixes, "(?ui)☃", M("`\\`xe2`\\`x98`\\`x83"));
1344	test_lit!(sfx_class1, suffixes, "[1-4]", M("1"), M("2"), M("3"), M("4"));
1345	test_lit!(
1346	sfx_class2,
1347	suffixes,
1348	"(?u)[☃Ⅰ]",
1349	M("`\\`xe2`\\`x85`\\`xa0"),
1350	M("`\\`xe2`\\`x98`\\`x83")
1351	);
1352	#[cfg(feature = "unicode-case")]
1353	test_lit!(
1354	sfx_class3,
1355	suffixes,
1356	"(?ui)[☃Ⅰ]",
1357	M("`\\`xe2`\\`x85`\\`xa0"),
1358	M("`\\`xe2`\\`x85`\\`xb0"),
1359	M("`\\`xe2`\\`x98`\\`x83")
1360	);
1361	test_lit!(sfx_one_lit_casei1, suffixes, "(?i-u)a", M("A"), M("a"));
1362	test_lit!(
1363	sfx_one_lit_casei2,
1364	suffixes,
1365	"(?i-u)abc",
1366	M("ABC"),
1367	M("ABc"),
1368	M("AbC"),
1369	M("Abc"),
1370	M("aBC"),
1371	M("aBc"),
1372	M("abC"),
1373	M("abc")
1374	);
1375	test_lit!(sfx_group1, suffixes, "(a)", M("a"));
1376	test_lit!(sfx_rep_zero_or_one1, suffixes, "a?");
1377	test_lit!(sfx_rep_zero_or_one2, suffixes, "(?:abc)?");
1378	test_lit!(sfx_rep_zero_or_more1, suffixes, "a*");
1379	test_lit!(sfx_rep_zero_or_more2, suffixes, "(?:abc)*");
1380	test_lit!(sfx_rep_one_or_more1, suffixes, "a+", C("a"));
1381	test_lit!(sfx_rep_one_or_more2, suffixes, "(?:abc)+", C("abc"));
1382	test_lit!(sfx_rep_nested_one_or_more, suffixes, "(?:a+)+", C("a"));
1383	test_lit!(sfx_rep_range1, suffixes, "a{0}");
1384	test_lit!(sfx_rep_range2, suffixes, "a{0,}");
1385	test_lit!(sfx_rep_range3, suffixes, "a{0,1}");
1386	test_lit!(sfx_rep_range4, suffixes, "a{1}", M("a"));
1387	test_lit!(sfx_rep_range5, suffixes, "a{2}", M("aa"));
1388	test_lit!(sfx_rep_range6, suffixes, "a{1,2}", C("a"));
1389	test_lit!(sfx_rep_range7, suffixes, "a{2,3}", C("aa"));
1390
1391	// Test regexes with concatenations.
1392	test_lit!(sfx_cat1, suffixes, "(?:a)(?:b)", M("ab"));
1393	test_lit!(sfx_cat2, suffixes, "[ab]z", M("az"), M("bz"));
1394	test_lit!(
1395	sfx_cat3,
1396	suffixes,
1397	"(?i-u)[ab]z",
1398	M("AZ"),
1399	M("Az"),
1400	M("BZ"),
1401	M("Bz"),
1402	M("aZ"),
1403	M("az"),
1404	M("bZ"),
1405	M("bz")
1406	);
1407	test_lit!(
1408	sfx_cat4,
1409	suffixes,
1410	"[ab][yz]",
1411	M("ay"),
1412	M("az"),
1413	M("by"),
1414	M("bz")
1415	);
1416	test_lit!(sfx_cat5, suffixes, "a*b", C("ab"), M("b"));
1417	test_lit!(sfx_cat6, suffixes, "abc", C("bc"), C("ac"), M("c"));
1418	test_lit!(sfx_cat7, suffixes, "abc+", C("c"));
1419	test_lit!(sfx_cat8, suffixes, "a*b+c", C("bc"));
1420	test_lit!(sfx_cat9, suffixes, "ab+c", C("c"), C("b"));
1421	test_lit!(sfx_cat10, suffixes, "ab*", C("b"), M("a"));
1422	test_lit!(sfx_cat11, suffixes, "ab*c", C("bc"), M("ac"));
1423	test_lit!(sfx_cat12, suffixes, "ab+", C("b"));
1424	test_lit!(sfx_cat13, suffixes, "ab+c", C("bc"));
1425	test_lit!(sfx_cat14, suffixes, "a^");
1426	test_lit!(sfx_cat15, suffixes, "$a", C("a"));
1427	test_lit!(sfx_cat16, suffixes, r"ab*c", C("bc"), M("ac"));
1428	test_lit!(sfx_cat17, suffixes, r"ab+c", C("bc"));
1429	test_lit!(sfx_cat18, suffixes, r"z*azb", C("zazb"), M("azb"));
1430	test_lit!(sfx_cat19, suffixes, "a.z", C("z"));
1431
1432	// Test regexes with alternations.
1433	test_lit!(sfx_alt1, suffixes, "a\|b", M("a"), M("b"));
1434	test_lit!(sfx_alt2, suffixes, "[1-3]\|b", M("1"), M("2"), M("3"), M("b"));
1435	test_lit!(sfx_alt3, suffixes, "y(?:a\|b)z", M("yaz"), M("ybz"));
1436	test_lit!(sfx_alt4, suffixes, "a\|b*");
1437	test_lit!(sfx_alt5, suffixes, "a\|b+", M("a"), C("b"));
1438	test_lit!(sfx_alt6, suffixes, "a\|(?:b\|c*)");
1439	test_lit!(
1440	sfx_alt7,
1441	suffixes,
1442	"(a\|b)c\|(a\|ab)c",
1443	C("ac"),
1444	C("bc"),
1445	M("c"),
1446	C("ac"),
1447	C("abc"),
1448	M("c")
1449	);
1450	test_lit!(sfx_alt8, suffixes, "a*b\|c", C("ab"), M("b"), M("c"));
1451
1452	// Test regexes with empty assertions.
1453	test_lit!(sfx_empty1, suffixes, "a$", M("a"));
1454	test_lit!(sfx_empty2, suffixes, "${2}a", C("a"));
1455
1456	// Make sure some curious regexes have no suffixes.
1457	test_lit!(sfx_nothing1, suffixes, ".");
1458	test_lit!(sfx_nothing2, suffixes, "(?s).");
1459	test_lit!(sfx_nothing3, suffixes, "^");
1460	test_lit!(sfx_nothing4, suffixes, "$");
1461	test_lit!(sfx_nothing6, suffixes, "(?m)$");
1462	test_lit!(sfx_nothing7, suffixes, r"\b");
1463	test_lit!(sfx_nothing8, suffixes, r"\B");
1464
1465	// Test a few regexes that defeat any suffix literal detection.
1466	test_lit!(sfx_defeated1, suffixes, "a.");
1467	test_lit!(sfx_defeated2, suffixes, "(?s)a.");
1468	test_lit!(sfx_defeated3, suffixes, "abc*");
1469	test_lit!(sfx_defeated4, suffixes, "a\|.");
1470	test_lit!(sfx_defeated5, suffixes, ".\|a");
1471	test_lit!(sfx_defeated6, suffixes, "a\|^");
1472	test_lit!(sfx_defeated7, suffixes, "(?:a(?:b)(?:c)).");
1473	test_lit!(sfx_defeated8, suffixes, "a^");
1474	test_lit!(sfx_defeated9, suffixes, "(?m)a$");
1475	test_lit!(sfx_defeated10, suffixes, r"a\b");
1476	test_lit!(sfx_defeated11, suffixes, r"a\B");
1477	test_lit!(sfx_defeated12, suffixes, "a^*");
1478	test_lit!(sfx_defeated13, suffixes, "a^+");
1479
1480	// These test use a much lower limit than the default so that we can
1481	// write test cases of reasonable size.
1482	test_exhausted!(sfx_exhausted1, suffixes, "[a-z]");
1483	test_exhausted!(sfx_exhausted2, suffixes, "A[a-z]*");
1484	test_exhausted!(sfx_exhausted3, suffixes, "A[a-z]Z", C("Z"));
1485	test_exhausted!(
1486	sfx_exhausted4,
1487	suffixes,
1488	"(?i-u)foobar",
1489	C("AR"),
1490	C("Ar"),
1491	C("aR"),
1492	C("ar")
1493	);
1494	test_exhausted!(
1495	sfx_exhausted5,
1496	suffixes,
1497	"(?:ab){100}",
1498	C("abababababababababab")
1499	);
1500	test_exhausted!(
1501	sfx_exhausted6,
1502	suffixes,
1503	"cd(?:(?:ab){100})*",
1504	C("ababababab"),
1505	M("cd")
1506	);
1507	test_exhausted!(
1508	sfx_exhausted7,
1509	suffixes,
1510	"cd(?:(?:ab){100})*z",
1511	C("abababababz"),
1512	M("cdz")
1513	);
1514	test_exhausted!(
1515	sfx_exhausted8,
1516	suffixes,
1517	"zaaaaaaaaaaaaaaaaaaaa",
1518	C("aaaaaaaaaaaaaaaaaaaa")
1519	);
1520
1521	// ************************************************************************
1522	// Tests for generating unambiguous literal sets.
1523	// ************************************************************************
1524
1525	macro_rules! test_unamb {
1526	($name:ident, $given:expr, $expected:expr) => {
1527	#[test]
1528	fn $name() {
1529	let given: Vec<Literal> = $given
1530	.into_iter()
1531	.map(\|ul\| {
1532	let cut = ul.is_cut();
1533	Literal { v: ul.v.into_bytes(), cut: cut }
1534	})
1535	.collect();
1536	let lits = create_lits(given);
1537	let got = lits.unambiguous_prefixes();
1538	assert_eq!($expected, escape_lits(got.literals()));
1539	}
1540	};
1541	}
1542
1543	test_unamb!(unambiguous1, vec![M("z"), M("azb")], vec![C("a"), C("z")]);
1544	test_unamb!(
1545	unambiguous2,
1546	vec![M("zaaaaaa"), M("aa")],
1547	vec![C("aa"), C("z")]
1548	);
1549	test_unamb!(
1550	unambiguous3,
1551	vec![M("Sherlock"), M("Watson")],
1552	vec![M("Sherlock"), M("Watson")]
1553	);
1554	test_unamb!(unambiguous4, vec![M("abc"), M("bc")], vec![C("a"), C("bc")]);
1555	test_unamb!(unambiguous5, vec![M("bc"), M("abc")], vec![C("a"), C("bc")]);
1556	test_unamb!(unambiguous6, vec![M("a"), M("aa")], vec![C("a")]);
1557	test_unamb!(unambiguous7, vec![M("aa"), M("a")], vec![C("a")]);
1558	test_unamb!(unambiguous8, vec![M("ab"), M("a")], vec![C("a")]);
1559	test_unamb!(
1560	unambiguous9,
1561	vec![M("ac"), M("bc"), M("c"), M("ac"), M("abc"), M("c")],
1562	vec![C("a"), C("b"), C("c")]
1563	);
1564	test_unamb!(
1565	unambiguous10,
1566	vec![M("Mo'"), M("Mu'"), M("Mo"), M("Mu")],
1567	vec![C("Mo"), C("Mu")]
1568	);
1569	test_unamb!(
1570	unambiguous11,
1571	vec![M("zazb"), M("azb")],
1572	vec![C("a"), C("z")]
1573	);
1574	test_unamb!(unambiguous12, vec![M("foo"), C("foo")], vec![C("foo")]);
1575	test_unamb!(
1576	unambiguous13,
1577	vec![M("ABCX"), M("CDAX"), M("BCX")],
1578	vec![C("A"), C("BCX"), C("CD")]
1579	);
1580	test_unamb!(
1581	unambiguous14,
1582	vec![M("IMGX"), M("MVIX"), M("MGX"), M("DSX")],
1583	vec![M("DSX"), C("I"), C("MGX"), C("MV")]
1584	);
1585	test_unamb!(
1586	unambiguous15,
1587	vec![M("IMG_"), M("MG_"), M("CIMG")],
1588	vec![C("C"), C("I"), C("MG_")]
1589	);
1590
1591	// ************************************************************************
1592	// Tests for suffix trimming.
1593	// ************************************************************************
1594	macro_rules! test_trim {
1595	($name:ident, $trim:expr, $given:expr, $expected:expr) => {
1596	#[test]
1597	fn $name() {
1598	let given: Vec<Literal> = $given
1599	.into_iter()
1600	.map(\|ul\| {
1601	let cut = ul.is_cut();
1602	Literal { v: ul.v.into_bytes(), cut: cut }
1603	})
1604	.collect();
1605	let lits = create_lits(given);
1606	let got = lits.trim_suffix($trim).unwrap();
1607	assert_eq!($expected, escape_lits(got.literals()));
1608	}
1609	};
1610	}
1611
1612	test_trim!(trim1, `1`, vec![M("ab"), M("yz")], vec![C("a"), C("y")]);
1613	test_trim!(trim2, `1`, vec![M("abc"), M("abd")], vec![C("ab")]);
1614	test_trim!(trim3, `2`, vec![M("abc"), M("abd")], vec![C("a")]);
1615	test_trim!(trim4, `2`, vec![M("abc"), M("ghij")], vec![C("a"), C("gh")]);
1616
1617	// ************************************************************************
1618	// Tests for longest common prefix.
1619	// ************************************************************************
1620
1621	macro_rules! test_lcp {
1622	($name:ident, $given:expr, $expected:expr) => {
1623	#[test]
1624	fn $name() {
1625	let given: Vec<Literal> = $given
1626	.into_iter()
1627	.map(\|s: &str\| Literal {
1628	v: s.to_owned().into_bytes(),
1629	cut: `false`,
1630	})
1631	.collect();
1632	let lits = create_lits(given);
1633	let got = lits.longest_common_prefix();
1634	assert_eq!($expected, escape_bytes(got));
1635	}
1636	};
1637	}
1638
1639	test_lcp!(lcp1, vec!["a"], "a");
1640	test_lcp!(lcp2, vec![], "");
1641	test_lcp!(lcp3, vec!["a", "b"], "");
1642	test_lcp!(lcp4, vec!["ab", "ab"], "ab");
1643	test_lcp!(lcp5, vec!["ab", "a"], "a");
1644	test_lcp!(lcp6, vec!["a", "ab"], "a");
1645	test_lcp!(lcp7, vec!["ab", "b"], "");
1646	test_lcp!(lcp8, vec!["b", "ab"], "");
1647	test_lcp!(lcp9, vec!["foobar", "foobaz"], "fooba");
1648	test_lcp!(lcp10, vec!["foobar", "foobaz", "a"], "");
1649	test_lcp!(lcp11, vec!["a", "foobar", "foobaz"], "");
1650	test_lcp!(lcp12, vec!["foo", "flub", "flab", "floo"], "f");
1651
1652	// ************************************************************************
1653	// Tests for longest common suffix.
1654	// ************************************************************************
1655
1656	macro_rules! test_lcs {
1657	($name:ident, $given:expr, $expected:expr) => {
1658	#[test]
1659	fn $name() {
1660	let given: Vec<Literal> = $given
1661	.into_iter()
1662	.map(\|s: &str\| Literal {
1663	v: s.to_owned().into_bytes(),
1664	cut: `false`,
1665	})
1666	.collect();
1667	let lits = create_lits(given);
1668	let got = lits.longest_common_suffix();
1669	assert_eq!($expected, escape_bytes(got));
1670	}
1671	};
1672	}
1673
1674	test_lcs!(lcs1, vec!["a"], "a");
1675	test_lcs!(lcs2, vec![], "");
1676	test_lcs!(lcs3, vec!["a", "b"], "");
1677	test_lcs!(lcs4, vec!["ab", "ab"], "ab");
1678	test_lcs!(lcs5, vec!["ab", "a"], "");
1679	test_lcs!(lcs6, vec!["a", "ab"], "");
1680	test_lcs!(lcs7, vec!["ab", "b"], "b");
1681	test_lcs!(lcs8, vec!["b", "ab"], "b");
1682	test_lcs!(lcs9, vec!["barfoo", "bazfoo"], "foo");
1683	test_lcs!(lcs10, vec!["barfoo", "bazfoo", "a"], "");
1684	test_lcs!(lcs11, vec!["a", "barfoo", "bazfoo"], "");
1685	test_lcs!(lcs12, vec!["flub", "bub", "boob", "dub"], "b");
1686	}
1687