1use std::collections::BTreeMap;
2use std::env;
3use std::fmt::{self, Write};
4use std::thread;
5
6use regex;
7use regex_automata::{DenseDFA, ErrorKind, Regex, RegexBuilder, StateID, DFA};
8use serde_bytes;
9use toml;
10
// Parses one TOML test file, embedded into the binary at compile time, and
// adds its tests to `$collection`.
//
// `$file` is a path below `../data/tests/`. It is spliced into `concat!` and
// `include_bytes!`, so it has to be a string literal. The same concatenated
// path is also handed to `RegexTests::load`, which uses it in its panic
// message when parsing fails.
macro_rules! load {
    ($collection:ident, $file:expr) => {
        $collection.extend(RegexTests::load(
            concat!("../data/tests/", $file),
            include_bytes!(concat!("../data/tests/", $file)),
        ));
    };
}
19
20lazy_static! {
21 pub static ref SUITE: RegexTestCollection = {
22 let mut col = RegexTestCollection::new();
23 load!(col, "fowler/basic.toml");
24 load!(col, "fowler/nullsubexpr.toml");
25 load!(col, "fowler/repetition.toml");
26 load!(col, "fowler/repetition-long.toml");
27 load!(col, "crazy.toml");
28 load!(col, "flags.toml");
29 load!(col, "iter.toml");
30 load!(col, "no-unicode.toml");
31 load!(col, "unicode.toml");
32 col
33 };
34}
35
36#[derive(Clone, Debug)]
37pub struct RegexTestCollection {
38 pub by_name: BTreeMap<String, RegexTest>,
39}
40
41#[derive(Clone, Debug, Deserialize)]
42pub struct RegexTests {
43 pub tests: Vec<RegexTest>,
44}
45
46#[derive(Clone, Debug, Deserialize)]
47pub struct RegexTest {
48 pub name: String,
49 #[serde(default)]
50 pub options: Vec<RegexTestOption>,
51 pub pattern: String,
52 #[serde(with = "serde_bytes")]
53 pub input: Vec<u8>,
54 #[serde(rename = "matches")]
55 pub matches: Vec<Match>,
56 #[serde(default)]
57 pub captures: Vec<Option<Match>>,
58 #[serde(default)]
59 pub fowler_line_number: Option<u64>,
60}
61
62#[derive(Clone, Copy, Debug, Deserialize, Eq, PartialEq)]
63#[serde(rename_all = "kebab-case")]
64pub enum RegexTestOption {
65 Anchored,
66 CaseInsensitive,
67 NoUnicode,
68 Escaped,
69 #[serde(rename = "invalid-utf8")]
70 InvalidUTF8,
71}
72
73#[derive(Clone, Copy, Deserialize, Eq, PartialEq)]
74pub struct Match {
75 pub start: usize,
76 pub end: usize,
77}
78
79impl RegexTestCollection {
80 fn new() -> RegexTestCollection {
81 RegexTestCollection { by_name: BTreeMap::new() }
82 }
83
84 fn extend(&mut self, tests: RegexTests) {
85 for test in tests.tests {
86 let name = test.name.clone();
87 if self.by_name.contains_key(&name) {
88 panic!("found duplicate test {}", name);
89 }
90 self.by_name.insert(name, test);
91 }
92 }
93
94 pub fn tests(&self) -> Vec<&RegexTest> {
95 self.by_name.values().collect()
96 }
97}
98
99impl RegexTests {
100 fn load(path: &str, slice: &[u8]) -> RegexTests {
101 let mut data: RegexTests = toml::from_slice(slice)
102 .expect(&format!("failed to load {}", path));
103 for test in &mut data.tests {
104 if test.options.contains(&RegexTestOption::Escaped) {
105 test.input = unescape_bytes(&test.input);
106 }
107 }
108 data
109 }
110}
111
112#[derive(Debug)]
113pub struct RegexTester {
114 asserted: bool,
115 results: RegexTestResults,
116 skip_expensive: bool,
117 whitelist: Vec<regex::Regex>,
118 blacklist: Vec<regex::Regex>,
119}
120
121impl Drop for RegexTester {
122 fn drop(&mut self) {
123 // If we haven't asserted yet, then the test is probably buggy, so
124 // fail it. But if we're already panicking (e.g., a bug in the regex
125 // engine), then don't double-panic, which causes an immediate abort.
126 if !thread::panicking() && !self.asserted {
127 panic!("must call RegexTester::assert at end of test");
128 }
129 }
130}
131
132impl RegexTester {
133 pub fn new() -> RegexTester {
134 let mut tester = RegexTester {
135 asserted: false,
136 results: RegexTestResults::default(),
137 skip_expensive: false,
138 whitelist: vec![],
139 blacklist: vec![],
140 };
141 for x in env::var("REGEX_TEST").unwrap_or("".to_string()).split(",") {
142 let x = x.trim();
143 if x.is_empty() {
144 continue;
145 }
146 if x.starts_with("-") {
147 tester = tester.blacklist(&x[1..]);
148 } else {
149 tester = tester.whitelist(x);
150 }
151 }
152 tester
153 }
154
155 pub fn skip_expensive(mut self) -> RegexTester {
156 self.skip_expensive = true;
157 self
158 }
159
160 pub fn whitelist(mut self, name: &str) -> RegexTester {
161 self.whitelist.push(regex::Regex::new(name).unwrap());
162 self
163 }
164
165 pub fn blacklist(mut self, name: &str) -> RegexTester {
166 self.blacklist.push(regex::Regex::new(name).unwrap());
167 self
168 }
169
170 pub fn assert(&mut self) {
171 self.asserted = true;
172 self.results.assert();
173 }
174
175 pub fn build_regex<S: StateID>(
176 &self,
177 mut builder: RegexBuilder,
178 test: &RegexTest,
179 ) -> Option<Regex<DenseDFA<Vec<S>, S>>> {
180 if self.skip(test) {
181 return None;
182 }
183 self.apply_options(test, &mut builder);
184
185 match builder.build_with_size::<S>(&test.pattern) {
186 Ok(re) => Some(re),
187 Err(err) => {
188 if let ErrorKind::Unsupported(_) = *err.kind() {
189 None
190 } else {
191 panic!(
192 "failed to build {:?} with pattern '{:?}': {}",
193 test.name, test.pattern, err
194 );
195 }
196 }
197 }
198 }
199
200 pub fn test_all<'a, I, T>(&mut self, builder: RegexBuilder, tests: I)
201 where
202 I: IntoIterator<IntoIter = T, Item = &'a RegexTest>,
203 T: Iterator<Item = &'a RegexTest>,
204 {
205 for test in tests {
206 let builder = builder.clone();
207 let re: Regex = match self.build_regex(builder, test) {
208 None => continue,
209 Some(re) => re,
210 };
211 self.test(test, &re);
212 }
213 }
214
215 pub fn test<'a, D: DFA>(&mut self, test: &RegexTest, re: &Regex<D>) {
216 self.test_is_match(test, re);
217 self.test_find(test, re);
218 // Some tests (namely, fowler) are designed only to detect the
219 // first match even if there are more subsequent matches. To that
220 // end, we only test match iteration when the number of matches
221 // expected is not 1, or if the test name has 'iter' in it.
222 if test.name.contains("iter") || test.matches.len() != 1 {
223 self.test_find_iter(test, re);
224 }
225 }
226
227 pub fn test_is_match<'a, D: DFA>(
228 &mut self,
229 test: &RegexTest,
230 re: &Regex<D>,
231 ) {
232 self.asserted = false;
233
234 let got = re.is_match(&test.input);
235 let expected = test.matches.len() >= 1;
236 if got == expected {
237 self.results.succeeded.push(test.clone());
238 return;
239 }
240 self.results.failed.push(RegexTestFailure {
241 test: test.clone(),
242 kind: RegexTestFailureKind::IsMatch,
243 });
244 }
245
246 pub fn test_find<'a, D: DFA>(&mut self, test: &RegexTest, re: &Regex<D>) {
247 self.asserted = false;
248
249 let got =
250 re.find(&test.input).map(|(start, end)| Match { start, end });
251 if got == test.matches.get(0).map(|&m| m) {
252 self.results.succeeded.push(test.clone());
253 return;
254 }
255 self.results.failed.push(RegexTestFailure {
256 test: test.clone(),
257 kind: RegexTestFailureKind::Find { got },
258 });
259 }
260
261 pub fn test_find_iter<'a, D: DFA>(
262 &mut self,
263 test: &RegexTest,
264 re: &Regex<D>,
265 ) {
266 self.asserted = false;
267
268 let got: Vec<Match> = re
269 .find_iter(&test.input)
270 .map(|(start, end)| Match { start, end })
271 .collect();
272 if got == test.matches {
273 self.results.succeeded.push(test.clone());
274 return;
275 }
276 self.results.failed.push(RegexTestFailure {
277 test: test.clone(),
278 kind: RegexTestFailureKind::FindIter { got },
279 });
280 }
281
282 fn skip(&self, test: &RegexTest) -> bool {
283 if self.skip_expensive {
284 if test.name.starts_with("repetition-long") {
285 return true;
286 }
287 }
288 if !self.blacklist.is_empty() {
289 if self.blacklist.iter().any(|re| re.is_match(&test.name)) {
290 return true;
291 }
292 }
293 if !self.whitelist.is_empty() {
294 if !self.whitelist.iter().any(|re| re.is_match(&test.name)) {
295 return true;
296 }
297 }
298 false
299 }
300
301 fn apply_options(&self, test: &RegexTest, builder: &mut RegexBuilder) {
302 for opt in &test.options {
303 match *opt {
304 RegexTestOption::Anchored => {
305 builder.anchored(true);
306 }
307 RegexTestOption::CaseInsensitive => {
308 builder.case_insensitive(true);
309 }
310 RegexTestOption::NoUnicode => {
311 builder.unicode(false);
312 }
313 RegexTestOption::Escaped => {}
314 RegexTestOption::InvalidUTF8 => {
315 builder.allow_invalid_utf8(true);
316 }
317 }
318 }
319 }
320}
321
322#[derive(Clone, Debug, Default)]
323pub struct RegexTestResults {
324 /// Tests that succeeded.
325 pub succeeded: Vec<RegexTest>,
326 /// Failed tests, indexed by group name.
327 pub failed: Vec<RegexTestFailure>,
328}
329
330#[derive(Clone, Debug)]
331pub struct RegexTestFailure {
332 test: RegexTest,
333 kind: RegexTestFailureKind,
334}
335
336#[derive(Clone, Debug)]
337pub enum RegexTestFailureKind {
338 IsMatch,
339 Find { got: Option<Match> },
340 FindIter { got: Vec<Match> },
341}
342
343impl RegexTestResults {
344 pub fn assert(&self) {
345 if self.failed.is_empty() {
346 return;
347 }
348 let failures = self
349 .failed
350 .iter()
351 .map(|f| f.to_string())
352 .collect::<Vec<String>>()
353 .join("\n\n");
354 panic!(
355 "found {} failures:\n{}\n{}\n{}\n\n\
356 Set the REGEX_TEST environment variable to filter tests, \n\
357 e.g., REGEX_TEST=crazy-misc,-crazy-misc2 runs every test \n\
358 whose name contains crazy-misc but not crazy-misc2\n\n",
359 self.failed.len(),
360 "~".repeat(79),
361 failures.trim(),
362 "~".repeat(79)
363 )
364 }
365}
366
367impl fmt::Display for RegexTestFailure {
368 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
369 write!(
370 f,
371 "{}: {}\n \
372 options: {:?}\n \
373 pattern: {}\n \
374 pattern (escape): {}\n \
375 input: {}\n \
376 input (escape): {}\n \
377 input (hex): {}",
378 self.test.name,
379 self.kind.fmt(&self.test)?,
380 self.test.options,
381 self.test.pattern,
382 escape_default(&self.test.pattern),
383 nice_raw_bytes(&self.test.input),
384 escape_bytes(&self.test.input),
385 hex_bytes(&self.test.input)
386 )
387 }
388}
389
390impl RegexTestFailureKind {
391 fn fmt(&self, test: &RegexTest) -> Result<String, fmt::Error> {
392 let mut buf = String::new();
393 match *self {
394 RegexTestFailureKind::IsMatch => {
395 if let Some(&m) = test.matches.get(0) {
396 write!(buf, "expected match (at {}), but none found", m)?
397 } else {
398 write!(buf, "expected no match, but found a match")?
399 }
400 }
401 RegexTestFailureKind::Find { got } => write!(
402 buf,
403 "expected {:?}, but found {:?}",
404 test.matches.get(0),
405 got
406 )?,
407 RegexTestFailureKind::FindIter { ref got } => write!(
408 buf,
409 "expected {:?}, but found {:?}",
410 test.matches, got
411 )?,
412 }
413 Ok(buf)
414 }
415}
416
417impl fmt::Display for Match {
418 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
419 write!(f, "({}, {})", self.start, self.end)
420 }
421}
422
423impl fmt::Debug for Match {
424 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
425 write!(f, "({}, {})", self.start, self.end)
426 }
427}
428
429fn nice_raw_bytes(bytes: &[u8]) -> String {
430 use std::str;
431
432 match str::from_utf8(bytes) {
433 Ok(s) => s.to_string(),
434 Err(_) => escape_bytes(bytes),
435 }
436}
437
/// Escapes every byte with `std::ascii::escape_default`, so the result
/// contains only ASCII characters.
fn escape_bytes(bytes: &[u8]) -> String {
    let mut escaped = String::with_capacity(bytes.len());
    for &byte in bytes {
        // Each escape sequence consists solely of ASCII bytes, so they map
        // one-to-one onto chars.
        escaped.extend(std::ascii::escape_default(byte).map(char::from));
    }
    escaped
}
447
/// Renders every byte as `\xNN`, using two uppercase hex digits.
fn hex_bytes(bytes: &[u8]) -> String {
    use std::fmt::Write;

    // Each byte becomes exactly four characters.
    let mut hex = String::with_capacity(bytes.len() * 4);
    for &byte in bytes {
        write!(hex, r"\x{:02X}", byte)
            .expect("writing to a String cannot fail");
    }
    hex
}
451
/// Escapes every character of `s` with `char::escape_default`.
fn escape_default(s: &str) -> String {
    let mut escaped = String::with_capacity(s.len());
    for ch in s.chars() {
        escaped.extend(ch.escape_default());
    }
    escaped
}
455
456fn unescape_bytes(bytes: &[u8]) -> Vec<u8> {
457 use std::str;
458 use unescape::unescape;
459
460 unescape(&str::from_utf8(bytes).expect("all input must be valid UTF-8"))
461}
462