1 | use std::collections::BTreeMap; |
2 | use std::env; |
3 | use std::fmt::{self, Write}; |
4 | use std::thread; |
5 | |
6 | use regex; |
7 | use regex_automata::{DenseDFA, ErrorKind, Regex, RegexBuilder, StateID, DFA}; |
8 | use serde_bytes; |
9 | use toml; |
10 | |
// Loads one TOML test file into a test collection.
//
// `$col` must name a mutable `RegexTestCollection` binding and `$path` is a
// path relative to `../data/tests/`. The file is embedded into the binary at
// compile time with `include_bytes!`; the same relative path is handed to
// `RegexTests::load` so that a load failure can name the offending file.
macro_rules! load {
    ($col:ident, $path:expr) => {{
        let path = concat!("../data/tests/", $path);
        let contents = include_bytes!(concat!("../data/tests/", $path));
        $col.extend(RegexTests::load(path, contents));
    }};
}
19 | |
20 | lazy_static! { |
21 | pub static ref SUITE: RegexTestCollection = { |
22 | let mut col = RegexTestCollection::new(); |
23 | load!(col, "fowler/basic.toml" ); |
24 | load!(col, "fowler/nullsubexpr.toml" ); |
25 | load!(col, "fowler/repetition.toml" ); |
26 | load!(col, "fowler/repetition-long.toml" ); |
27 | load!(col, "crazy.toml" ); |
28 | load!(col, "flags.toml" ); |
29 | load!(col, "iter.toml" ); |
30 | load!(col, "no-unicode.toml" ); |
31 | load!(col, "unicode.toml" ); |
32 | col |
33 | }; |
34 | } |
35 | |
36 | #[derive(Clone, Debug)] |
37 | pub struct RegexTestCollection { |
38 | pub by_name: BTreeMap<String, RegexTest>, |
39 | } |
40 | |
41 | #[derive(Clone, Debug, Deserialize)] |
42 | pub struct RegexTests { |
43 | pub tests: Vec<RegexTest>, |
44 | } |
45 | |
46 | #[derive(Clone, Debug, Deserialize)] |
47 | pub struct RegexTest { |
48 | pub name: String, |
49 | #[serde(default)] |
50 | pub options: Vec<RegexTestOption>, |
51 | pub pattern: String, |
52 | #[serde(with = "serde_bytes" )] |
53 | pub input: Vec<u8>, |
54 | #[serde(rename = "matches" )] |
55 | pub matches: Vec<Match>, |
56 | #[serde(default)] |
57 | pub captures: Vec<Option<Match>>, |
58 | #[serde(default)] |
59 | pub fowler_line_number: Option<u64>, |
60 | } |
61 | |
62 | #[derive(Clone, Copy, Debug, Deserialize, Eq, PartialEq)] |
63 | #[serde(rename_all = "kebab-case" )] |
64 | pub enum RegexTestOption { |
65 | Anchored, |
66 | CaseInsensitive, |
67 | NoUnicode, |
68 | Escaped, |
69 | #[serde(rename = "invalid-utf8" )] |
70 | InvalidUTF8, |
71 | } |
72 | |
73 | #[derive(Clone, Copy, Deserialize, Eq, PartialEq)] |
74 | pub struct Match { |
75 | pub start: usize, |
76 | pub end: usize, |
77 | } |
78 | |
79 | impl RegexTestCollection { |
80 | fn new() -> RegexTestCollection { |
81 | RegexTestCollection { by_name: BTreeMap::new() } |
82 | } |
83 | |
84 | fn extend(&mut self, tests: RegexTests) { |
85 | for test in tests.tests { |
86 | let name = test.name.clone(); |
87 | if self.by_name.contains_key(&name) { |
88 | panic!("found duplicate test {}" , name); |
89 | } |
90 | self.by_name.insert(name, test); |
91 | } |
92 | } |
93 | |
94 | pub fn tests(&self) -> Vec<&RegexTest> { |
95 | self.by_name.values().collect() |
96 | } |
97 | } |
98 | |
99 | impl RegexTests { |
100 | fn load(path: &str, slice: &[u8]) -> RegexTests { |
101 | let mut data: RegexTests = toml::from_slice(slice) |
102 | .expect(&format!("failed to load {}" , path)); |
103 | for test in &mut data.tests { |
104 | if test.options.contains(&RegexTestOption::Escaped) { |
105 | test.input = unescape_bytes(&test.input); |
106 | } |
107 | } |
108 | data |
109 | } |
110 | } |
111 | |
112 | #[derive(Debug)] |
113 | pub struct RegexTester { |
114 | asserted: bool, |
115 | results: RegexTestResults, |
116 | skip_expensive: bool, |
117 | whitelist: Vec<regex::Regex>, |
118 | blacklist: Vec<regex::Regex>, |
119 | } |
120 | |
121 | impl Drop for RegexTester { |
122 | fn drop(&mut self) { |
123 | // If we haven't asserted yet, then the test is probably buggy, so |
124 | // fail it. But if we're already panicking (e.g., a bug in the regex |
125 | // engine), then don't double-panic, which causes an immediate abort. |
126 | if !thread::panicking() && !self.asserted { |
127 | panic!("must call RegexTester::assert at end of test" ); |
128 | } |
129 | } |
130 | } |
131 | |
132 | impl RegexTester { |
133 | pub fn new() -> RegexTester { |
134 | let mut tester = RegexTester { |
135 | asserted: false, |
136 | results: RegexTestResults::default(), |
137 | skip_expensive: false, |
138 | whitelist: vec![], |
139 | blacklist: vec![], |
140 | }; |
141 | for x in env::var("REGEX_TEST" ).unwrap_or("" .to_string()).split("," ) { |
142 | let x = x.trim(); |
143 | if x.is_empty() { |
144 | continue; |
145 | } |
146 | if x.starts_with("-" ) { |
147 | tester = tester.blacklist(&x[1..]); |
148 | } else { |
149 | tester = tester.whitelist(x); |
150 | } |
151 | } |
152 | tester |
153 | } |
154 | |
155 | pub fn skip_expensive(mut self) -> RegexTester { |
156 | self.skip_expensive = true; |
157 | self |
158 | } |
159 | |
160 | pub fn whitelist(mut self, name: &str) -> RegexTester { |
161 | self.whitelist.push(regex::Regex::new(name).unwrap()); |
162 | self |
163 | } |
164 | |
165 | pub fn blacklist(mut self, name: &str) -> RegexTester { |
166 | self.blacklist.push(regex::Regex::new(name).unwrap()); |
167 | self |
168 | } |
169 | |
170 | pub fn assert(&mut self) { |
171 | self.asserted = true; |
172 | self.results.assert(); |
173 | } |
174 | |
175 | pub fn build_regex<S: StateID>( |
176 | &self, |
177 | mut builder: RegexBuilder, |
178 | test: &RegexTest, |
179 | ) -> Option<Regex<DenseDFA<Vec<S>, S>>> { |
180 | if self.skip(test) { |
181 | return None; |
182 | } |
183 | self.apply_options(test, &mut builder); |
184 | |
185 | match builder.build_with_size::<S>(&test.pattern) { |
186 | Ok(re) => Some(re), |
187 | Err(err) => { |
188 | if let ErrorKind::Unsupported(_) = *err.kind() { |
189 | None |
190 | } else { |
191 | panic!( |
192 | "failed to build {:?} with pattern '{:?}': {}" , |
193 | test.name, test.pattern, err |
194 | ); |
195 | } |
196 | } |
197 | } |
198 | } |
199 | |
200 | pub fn test_all<'a, I, T>(&mut self, builder: RegexBuilder, tests: I) |
201 | where |
202 | I: IntoIterator<IntoIter = T, Item = &'a RegexTest>, |
203 | T: Iterator<Item = &'a RegexTest>, |
204 | { |
205 | for test in tests { |
206 | let builder = builder.clone(); |
207 | let re: Regex = match self.build_regex(builder, test) { |
208 | None => continue, |
209 | Some(re) => re, |
210 | }; |
211 | self.test(test, &re); |
212 | } |
213 | } |
214 | |
215 | pub fn test<'a, D: DFA>(&mut self, test: &RegexTest, re: &Regex<D>) { |
216 | self.test_is_match(test, re); |
217 | self.test_find(test, re); |
218 | // Some tests (namely, fowler) are designed only to detect the |
219 | // first match even if there are more subsequent matches. To that |
220 | // end, we only test match iteration when the number of matches |
221 | // expected is not 1, or if the test name has 'iter' in it. |
222 | if test.name.contains("iter" ) || test.matches.len() != 1 { |
223 | self.test_find_iter(test, re); |
224 | } |
225 | } |
226 | |
227 | pub fn test_is_match<'a, D: DFA>( |
228 | &mut self, |
229 | test: &RegexTest, |
230 | re: &Regex<D>, |
231 | ) { |
232 | self.asserted = false; |
233 | |
234 | let got = re.is_match(&test.input); |
235 | let expected = test.matches.len() >= 1; |
236 | if got == expected { |
237 | self.results.succeeded.push(test.clone()); |
238 | return; |
239 | } |
240 | self.results.failed.push(RegexTestFailure { |
241 | test: test.clone(), |
242 | kind: RegexTestFailureKind::IsMatch, |
243 | }); |
244 | } |
245 | |
246 | pub fn test_find<'a, D: DFA>(&mut self, test: &RegexTest, re: &Regex<D>) { |
247 | self.asserted = false; |
248 | |
249 | let got = |
250 | re.find(&test.input).map(|(start, end)| Match { start, end }); |
251 | if got == test.matches.get(0).map(|&m| m) { |
252 | self.results.succeeded.push(test.clone()); |
253 | return; |
254 | } |
255 | self.results.failed.push(RegexTestFailure { |
256 | test: test.clone(), |
257 | kind: RegexTestFailureKind::Find { got }, |
258 | }); |
259 | } |
260 | |
261 | pub fn test_find_iter<'a, D: DFA>( |
262 | &mut self, |
263 | test: &RegexTest, |
264 | re: &Regex<D>, |
265 | ) { |
266 | self.asserted = false; |
267 | |
268 | let got: Vec<Match> = re |
269 | .find_iter(&test.input) |
270 | .map(|(start, end)| Match { start, end }) |
271 | .collect(); |
272 | if got == test.matches { |
273 | self.results.succeeded.push(test.clone()); |
274 | return; |
275 | } |
276 | self.results.failed.push(RegexTestFailure { |
277 | test: test.clone(), |
278 | kind: RegexTestFailureKind::FindIter { got }, |
279 | }); |
280 | } |
281 | |
282 | fn skip(&self, test: &RegexTest) -> bool { |
283 | if self.skip_expensive { |
284 | if test.name.starts_with("repetition-long" ) { |
285 | return true; |
286 | } |
287 | } |
288 | if !self.blacklist.is_empty() { |
289 | if self.blacklist.iter().any(|re| re.is_match(&test.name)) { |
290 | return true; |
291 | } |
292 | } |
293 | if !self.whitelist.is_empty() { |
294 | if !self.whitelist.iter().any(|re| re.is_match(&test.name)) { |
295 | return true; |
296 | } |
297 | } |
298 | false |
299 | } |
300 | |
301 | fn apply_options(&self, test: &RegexTest, builder: &mut RegexBuilder) { |
302 | for opt in &test.options { |
303 | match *opt { |
304 | RegexTestOption::Anchored => { |
305 | builder.anchored(true); |
306 | } |
307 | RegexTestOption::CaseInsensitive => { |
308 | builder.case_insensitive(true); |
309 | } |
310 | RegexTestOption::NoUnicode => { |
311 | builder.unicode(false); |
312 | } |
313 | RegexTestOption::Escaped => {} |
314 | RegexTestOption::InvalidUTF8 => { |
315 | builder.allow_invalid_utf8(true); |
316 | } |
317 | } |
318 | } |
319 | } |
320 | } |
321 | |
322 | #[derive(Clone, Debug, Default)] |
323 | pub struct RegexTestResults { |
324 | /// Tests that succeeded. |
325 | pub succeeded: Vec<RegexTest>, |
326 | /// Failed tests, indexed by group name. |
327 | pub failed: Vec<RegexTestFailure>, |
328 | } |
329 | |
330 | #[derive(Clone, Debug)] |
331 | pub struct RegexTestFailure { |
332 | test: RegexTest, |
333 | kind: RegexTestFailureKind, |
334 | } |
335 | |
336 | #[derive(Clone, Debug)] |
337 | pub enum RegexTestFailureKind { |
338 | IsMatch, |
339 | Find { got: Option<Match> }, |
340 | FindIter { got: Vec<Match> }, |
341 | } |
342 | |
343 | impl RegexTestResults { |
344 | pub fn assert(&self) { |
345 | if self.failed.is_empty() { |
346 | return; |
347 | } |
348 | let failures = self |
349 | .failed |
350 | .iter() |
351 | .map(|f| f.to_string()) |
352 | .collect::<Vec<String>>() |
353 | .join(" \n\n" ); |
354 | panic!( |
355 | "found {} failures: \n{} \n{} \n{} \n\n\ |
356 | Set the REGEX_TEST environment variable to filter tests, \n\ |
357 | e.g., REGEX_TEST=crazy-misc,-crazy-misc2 runs every test \n\ |
358 | whose name contains crazy-misc but not crazy-misc2 \n\n" , |
359 | self.failed.len(), |
360 | "~" .repeat(79), |
361 | failures.trim(), |
362 | "~" .repeat(79) |
363 | ) |
364 | } |
365 | } |
366 | |
367 | impl fmt::Display for RegexTestFailure { |
368 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
369 | write!( |
370 | f, |
371 | "{}: {} \n \ |
372 | options: {:?} \n \ |
373 | pattern: {} \n \ |
374 | pattern (escape): {} \n \ |
375 | input: {} \n \ |
376 | input (escape): {} \n \ |
377 | input (hex): {}" , |
378 | self.test.name, |
379 | self.kind.fmt(&self.test)?, |
380 | self.test.options, |
381 | self.test.pattern, |
382 | escape_default(&self.test.pattern), |
383 | nice_raw_bytes(&self.test.input), |
384 | escape_bytes(&self.test.input), |
385 | hex_bytes(&self.test.input) |
386 | ) |
387 | } |
388 | } |
389 | |
390 | impl RegexTestFailureKind { |
391 | fn fmt(&self, test: &RegexTest) -> Result<String, fmt::Error> { |
392 | let mut buf = String::new(); |
393 | match *self { |
394 | RegexTestFailureKind::IsMatch => { |
395 | if let Some(&m) = test.matches.get(0) { |
396 | write!(buf, "expected match (at {}), but none found" , m)? |
397 | } else { |
398 | write!(buf, "expected no match, but found a match" )? |
399 | } |
400 | } |
401 | RegexTestFailureKind::Find { got } => write!( |
402 | buf, |
403 | "expected {:?}, but found {:?}" , |
404 | test.matches.get(0), |
405 | got |
406 | )?, |
407 | RegexTestFailureKind::FindIter { ref got } => write!( |
408 | buf, |
409 | "expected {:?}, but found {:?}" , |
410 | test.matches, got |
411 | )?, |
412 | } |
413 | Ok(buf) |
414 | } |
415 | } |
416 | |
417 | impl fmt::Display for Match { |
418 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
419 | write!(f, "({}, {})" , self.start, self.end) |
420 | } |
421 | } |
422 | |
423 | impl fmt::Debug for Match { |
424 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
425 | write!(f, "({}, {})" , self.start, self.end) |
426 | } |
427 | } |
428 | |
429 | fn nice_raw_bytes(bytes: &[u8]) -> String { |
430 | use std::str; |
431 | |
432 | match str::from_utf8(bytes) { |
433 | Ok(s) => s.to_string(), |
434 | Err(_) => escape_bytes(bytes), |
435 | } |
436 | } |
437 | |
/// Escapes every byte with `std::ascii::escape_default` and concatenates the
/// results, so the returned string is always printable ASCII.
fn escape_bytes(bytes: &[u8]) -> String {
    use std::ascii;

    let mut escaped = String::with_capacity(bytes.len());
    for &byte in bytes {
        // `escape_default` only ever yields ASCII bytes, so each one maps
        // directly to a `char`.
        for piece in ascii::escape_default(byte) {
            escaped.push(piece as char);
        }
    }
    escaped
}
447 | |
/// Renders every byte as `\xNN`, where `NN` is the byte's value as two
/// uppercase hexadecimal digits.
fn hex_bytes(bytes: &[u8]) -> String {
    // Each byte becomes exactly four characters.
    let mut hex = String::with_capacity(4 * bytes.len());
    for byte in bytes {
        hex.push_str(&format!(r"\x{:02X}", byte));
    }
    hex
}
451 | |
/// Escapes every character of `s` with `char::escape_default` and
/// concatenates the results.
fn escape_default(s: &str) -> String {
    let mut escaped = String::with_capacity(s.len());
    for ch in s.chars() {
        escaped.extend(ch.escape_default());
    }
    escaped
}
455 | |
456 | fn unescape_bytes(bytes: &[u8]) -> Vec<u8> { |
457 | use std::str; |
458 | use unescape::unescape; |
459 | |
460 | unescape(&str::from_utf8(bytes).expect("all input must be valid UTF-8" )) |
461 | } |
462 | |