collection.rs - Codebrowser

1	use std::collections::BTreeMap;
2	use std::env;
3	use std::fmt::{self, Write};
4	use std::thread;
5
6	use regex;
7	use regex_automata::{DenseDFA, ErrorKind, Regex, RegexBuilder, StateID, DFA};
8	use serde_bytes;
9	use toml;
10
// Parses one TOML test file that is embedded into the binary at compile time
// and merges its tests into the collection named by `$col`.
//
// `$path` is given relative to `../data/tests/`; the same joined path is used
// both as the label passed to `RegexTests::load` and as the file embedded via
// `include_bytes!`.
macro_rules! load {
    ($col:ident, $path:expr) => {{
        let loaded = RegexTests::load(
            concat!("../data/tests/", $path),
            include_bytes!(concat!("../data/tests/", $path)),
        );
        $col.extend(loaded);
    }};
}
19
20	lazy_static! {
21	pub static ref SUITE: RegexTestCollection = {
22	let mut col = RegexTestCollection::new();
23	load!(col, "fowler/basic.toml");
24	load!(col, "fowler/nullsubexpr.toml");
25	load!(col, "fowler/repetition.toml");
26	load!(col, "fowler/repetition-long.toml");
27	load!(col, "crazy.toml");
28	load!(col, "flags.toml");
29	load!(col, "iter.toml");
30	load!(col, "no-unicode.toml");
31	load!(col, "unicode.toml");
32	col
33	};
34	}
35
36	#[derive(Clone, Debug)]
37	pub struct RegexTestCollection {
38	pub by_name: BTreeMap<String, RegexTest>,
39	}
40
41	#[derive(Clone, Debug, Deserialize)]
42	pub struct RegexTests {
43	pub tests: Vec<RegexTest>,
44	}
45
46	#[derive(Clone, Debug, Deserialize)]
47	pub struct RegexTest {
48	pub name: String,
49	#[serde(default)]
50	pub options: Vec<RegexTestOption>,
51	pub pattern: String,
52	#[serde(with = "serde_bytes")]
53	pub input: Vec<u8>,
54	#[serde(rename = "matches")]
55	pub matches: Vec<Match>,
56	#[serde(default)]
57	pub captures: Vec<Option<Match>>,
58	#[serde(default)]
59	pub fowler_line_number: Option<u64>,
60	}
61
62	#[derive(Clone, Copy, Debug, Deserialize, Eq, PartialEq)]
63	#[serde(rename_all = "kebab-case")]
64	pub enum RegexTestOption {
65	Anchored,
66	CaseInsensitive,
67	NoUnicode,
68	Escaped,
69	#[serde(rename = "invalid-utf8")]
70	InvalidUTF8,
71	}
72
73	#[derive(Clone, Copy, Deserialize, Eq, PartialEq)]
74	pub struct Match {
75	pub start: usize,
76	pub end: usize,
77	}
78
79	impl RegexTestCollection {
80	fn new() -> RegexTestCollection {
81	RegexTestCollection { by_name: BTreeMap::new() }
82	}
83
84	fn extend(&mut self, tests: RegexTests) {
85	for test in tests.tests {
86	let name = test.name.clone();
87	if self.by_name.contains_key(&name) {
88	panic!("found duplicate test {}", name);
89	}
90	self.by_name.insert(name, test);
91	}
92	}
93
94	pub fn tests(&self) -> Vec<&RegexTest> {
95	self.by_name.values().collect()
96	}
97	}
98
99	impl RegexTests {
100	fn load(path: &str, slice: &[u8]) -> RegexTests {
101	let mut data: RegexTests = toml::from_slice(slice)
102	.expect(&format!("failed to load {}", path));
103	for test in &mut data.tests {
104	if test.options.contains(&RegexTestOption::Escaped) {
105	test.input = unescape_bytes(&test.input);
106	}
107	}
108	data
109	}
110	}
111
112	#[derive(Debug)]
113	pub struct RegexTester {
114	asserted: bool,
115	results: RegexTestResults,
116	skip_expensive: bool,
117	whitelist: Vec<regex::Regex>,
118	blacklist: Vec<regex::Regex>,
119	}
120
121	impl Drop for RegexTester {
122	fn drop(&mut self) {
123	// If we haven't asserted yet, then the test is probably buggy, so
124	// fail it. But if we're already panicking (e.g., a bug in the regex
125	// engine), then don't double-panic, which causes an immediate abort.
126	if !thread::panicking() && !self.asserted {
127	panic!("must call RegexTester::assert at end of test");
128	}
129	}
130	}
131
132	impl RegexTester {
133	pub fn new() -> RegexTester {
134	let mut tester = RegexTester {
135	asserted: `false`,
136	results: RegexTestResults::default(),
137	skip_expensive: `false`,
138	whitelist: vec![],
139	blacklist: vec![],
140	};
141	for x in env::var("REGEX_TEST").unwrap_or("".to_string()).split(",") {
142	let x = x.trim();
143	if x.is_empty() {
144	continue;
145	}
146	if x.starts_with("-") {
147	tester = tester.blacklist(&x[`1`..]);
148	} else {
149	tester = tester.whitelist(x);
150	}
151	}
152	tester
153	}
154
155	pub fn skip_expensive(mut self) -> RegexTester {
156	self.skip_expensive = `true`;
157	self
158	}
159
160	pub fn whitelist(mut self, name: &str) -> RegexTester {
161	self.whitelist.push(regex::Regex::new(name).unwrap());
162	self
163	}
164
165	pub fn blacklist(mut self, name: &str) -> RegexTester {
166	self.blacklist.push(regex::Regex::new(name).unwrap());
167	self
168	}
169
170	pub fn assert(&mut self) {
171	self.asserted = `true`;
172	self.results.assert();
173	}
174
175	pub fn build_regex<S: StateID>(
176	&self,
177	mut builder: RegexBuilder,
178	test: &RegexTest,
179	) -> Option<Regex<DenseDFA<Vec<S>, S>>> {
180	if self.skip(test) {
181	return None;
182	}
183	self.apply_options(test, &mut builder);
184
185	match builder.build_with_size::<S>(&test.pattern) {
186	Ok(re) => Some(re),
187	Err(err) => {
188	if let ErrorKind::Unsupported(_) = *err.kind() {
189	None
190	} else {
191	panic!(
192	"failed to build {:?} with pattern '{:?}': {}",
193	test.name, test.pattern, err
194	);
195	}
196	}
197	}
198	}
199
200	pub fn test_all<'a, I, T>(&mut self, builder: RegexBuilder, tests: I)
201	where
202	I: IntoIterator<IntoIter = T, Item = &'a RegexTest>,
203	T: Iterator<Item = &'a RegexTest>,
204	{
205	for test in tests {
206	let builder = builder.clone();
207	let re: Regex = match self.build_regex(builder, test) {
208	None => continue,
209	Some(re) => re,
210	};
211	self.test(test, &re);
212	}
213	}
214
215	pub fn test<'a, D: DFA>(&mut self, test: &RegexTest, re: &Regex<D>) {
216	self.test_is_match(test, re);
217	self.test_find(test, re);
218	// Some tests (namely, fowler) are designed only to detect the
219	// first match even if there are more subsequent matches. To that
220	// end, we only test match iteration when the number of matches
221	// expected is not 1, or if the test name has 'iter' in it.
222	if test.name.contains("iter") \|\| test.matches.len() != `1` {
223	self.test_find_iter(test, re);
224	}
225	}
226
227	pub fn test_is_match<'a, D: DFA>(
228	&mut self,
229	test: &RegexTest,
230	re: &Regex<D>,
231	) {
232	self.asserted = `false`;
233
234	let got = re.is_match(&test.input);
235	let expected = test.matches.len() >= `1`;
236	if got == expected {
237	self.results.succeeded.push(test.clone());
238	return;
239	}
240	self.results.failed.push(RegexTestFailure {
241	test: test.clone(),
242	kind: RegexTestFailureKind::IsMatch,
243	});
244	}
245
246	pub fn test_find<'a, D: DFA>(&mut self, test: &RegexTest, re: &Regex<D>) {
247	self.asserted = `false`;
248
249	let got =
250	re.find(&test.input).map(\|(start, end)\| Match { start, end });
251	if got == test.matches.get(`0`).map(\|&m\| m) {
252	self.results.succeeded.push(test.clone());
253	return;
254	}
255	self.results.failed.push(RegexTestFailure {
256	test: test.clone(),
257	kind: RegexTestFailureKind::Find { got },
258	});
259	}
260
261	pub fn test_find_iter<'a, D: DFA>(
262	&mut self,
263	test: &RegexTest,
264	re: &Regex<D>,
265	) {
266	self.asserted = `false`;
267
268	let got: Vec<Match> = re
269	.find_iter(&test.input)
270	.map(\|(start, end)\| Match { start, end })
271	.collect();
272	if got == test.matches {
273	self.results.succeeded.push(test.clone());
274	return;
275	}
276	self.results.failed.push(RegexTestFailure {
277	test: test.clone(),
278	kind: RegexTestFailureKind::FindIter { got },
279	});
280	}
281
282	fn skip(&self, test: &RegexTest) -> bool {
283	if self.skip_expensive {
284	if test.name.starts_with("repetition-long") {
285	return `true`;
286	}
287	}
288	if !self.blacklist.is_empty() {
289	if self.blacklist.iter().any(\|re\| re.is_match(&test.name)) {
290	return `true`;
291	}
292	}
293	if !self.whitelist.is_empty() {
294	if !self.whitelist.iter().any(\|re\| re.is_match(&test.name)) {
295	return `true`;
296	}
297	}
298	`false`
299	}
300
301	fn apply_options(&self, test: &RegexTest, builder: &mut RegexBuilder) {
302	for opt in &test.options {
303	match *opt {
304	RegexTestOption::Anchored => {
305	builder.anchored(`true`);
306	}
307	RegexTestOption::CaseInsensitive => {
308	builder.case_insensitive(`true`);
309	}
310	RegexTestOption::NoUnicode => {
311	builder.unicode(`false`);
312	}
313	RegexTestOption::Escaped => {}
314	RegexTestOption::InvalidUTF8 => {
315	builder.allow_invalid_utf8(`true`);
316	}
317	}
318	}
319	}
320	}
321
322	#[derive(Clone, Debug, Default)]
323	pub struct RegexTestResults {
324	/// Tests that succeeded.
325	pub succeeded: Vec<RegexTest>,
326	/// Failed tests, indexed by group name.
327	pub failed: Vec<RegexTestFailure>,
328	}
329
330	#[derive(Clone, Debug)]
331	pub struct RegexTestFailure {
332	test: RegexTest,
333	kind: RegexTestFailureKind,
334	}
335
336	#[derive(Clone, Debug)]
337	pub enum RegexTestFailureKind {
338	IsMatch,
339	Find { got: Option<Match> },
340	FindIter { got: Vec<Match> },
341	}
342
343	impl RegexTestResults {
344	pub fn assert(&self) {
345	if self.failed.is_empty() {
346	return;
347	}
348	let failures = self
349	.failed
350	.iter()
351	.map(\|f\| f.to_string())
352	.collect::<Vec<String>>()
353	.join("`\n\n`");
354	panic!(
355	"found {} failures:`\n`{}`\n`{}`\n`{}`\n\n`\
356	Set the REGEX_TEST environment variable to filter tests, `\n`\
357	e.g., REGEX_TEST=crazy-misc,-crazy-misc2 runs every test `\n`\
358	whose name contains crazy-misc but not crazy-misc2`\n\n`",
359	self.failed.len(),
360	"~".repeat(`79`),
361	failures.trim(),
362	"~".repeat(`79`)
363	)
364	}
365	}
366
367	impl fmt::Display for RegexTestFailure {
368	fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
369	write!(
370	f,
371	"{}: {}`\n` \
372	options: {:?}`\n` \
373	pattern: {}`\n` \
374	pattern (escape): {}`\n` \
375	input: {}`\n` \
376	input (escape): {}`\n` \
377	input (hex): {}",
378	self.test.name,
379	self.kind.fmt(&self.test)?,
380	self.test.options,
381	self.test.pattern,
382	escape_default(&self.test.pattern),
383	nice_raw_bytes(&self.test.input),
384	escape_bytes(&self.test.input),
385	hex_bytes(&self.test.input)
386	)
387	}
388	}
389
390	impl RegexTestFailureKind {
391	fn fmt(&self, test: &RegexTest) -> Result<String, fmt::Error> {
392	let mut buf = String::new();
393	match *self {
394	RegexTestFailureKind::IsMatch => {
395	if let Some(&m) = test.matches.get(`0`) {
396	write!(buf, "expected match (at {}), but none found", m)?
397	} else {
398	write!(buf, "expected no match, but found a match")?
399	}
400	}
401	RegexTestFailureKind::Find { got } => write!(
402	buf,
403	"expected {:?}, but found {:?}",
404	test.matches.get(`0`),
405	got
406	)?,
407	RegexTestFailureKind::FindIter { ref got } => write!(
408	buf,
409	"expected {:?}, but found {:?}",
410	test.matches, got
411	)?,
412	}
413	Ok(buf)
414	}
415	}
416
417	impl fmt::Display for Match {
418	fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
419	write!(f, "({}, {})", self.start, self.end)
420	}
421	}
422
423	impl fmt::Debug for Match {
424	fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
425	write!(f, "({}, {})", self.start, self.end)
426	}
427	}
428
429	fn nice_raw_bytes(bytes: &[u8]) -> String {
430	use std::str;
431
432	match str::from_utf8(bytes) {
433	Ok(s) => s.to_string(),
434	Err(_) => escape_bytes(bytes),
435	}
436	}
437
/// Escapes every byte with `std::ascii::escape_default` and returns the
/// concatenated result.
fn escape_bytes(bytes: &[u8]) -> String {
    use std::ascii;

    let mut out = String::with_capacity(bytes.len());
    for &byte in bytes {
        // `escape_default` yields only ASCII bytes, so each one maps directly
        // onto a `char`.
        out.extend(ascii::escape_default(byte).map(|b| b as char));
    }
    out
}
447
/// Renders every byte as `\xNN`, with `NN` being two upper-case hex digits.
fn hex_bytes(bytes: &[u8]) -> String {
    // Each byte becomes exactly four characters.
    let mut hex = String::with_capacity(bytes.len() * 4);
    for byte in bytes {
        hex.push_str(&format!(r"\x{:02X}", byte));
    }
    hex
}
451
/// Escapes every character of `s` with `char::escape_default` and returns
/// the concatenated result.
fn escape_default(s: &str) -> String {
    let mut escaped = String::with_capacity(s.len());
    for ch in s.chars() {
        escaped.extend(ch.escape_default());
    }
    escaped
}
455
456	fn unescape_bytes(bytes: &[u8]) -> Vec<u8> {
457	use std::str;
458	use unescape::unescape;
459
460	unescape(&str::from_utf8(bytes).expect("all input must be valid UTF-8"))
461	}
462