1 | #[allow (unused, deprecated)] |
2 | use std::ascii::AsciiExt; |
3 | use std::error::Error; |
4 | use std::fmt; |
5 | use std::iter::Enumerate; |
6 | use std::str::Bytes; |
7 | |
8 | use super::{Mime, MimeIter, Source, ParamSource, Indexed, CHARSET, UTF_8}; |
9 | |
/// The ways in which parsing a media type string can fail.
#[derive(Debug)]
pub enum ParseError {
    /// The input ended before a `/` separating type and subtype was seen.
    MissingSlash,
    /// The input ended while a parameter name was still waiting for its `=`.
    MissingEqual,
    /// The input ended inside a quoted parameter value.
    MissingQuote,
    /// A byte that is not accepted at its position was found.
    ///
    /// `pos` is the byte offset inside the parsed string and `byte` is the
    /// offending byte itself.
    InvalidToken { pos: usize, byte: u8 },
}
20 | |
21 | impl ParseError { |
22 | fn s(&self) -> &str { |
23 | use self::ParseError::*; |
24 | |
25 | match *self { |
26 | MissingSlash => "a slash (/) was missing between the type and subtype" , |
27 | MissingEqual => "an equals sign (=) was missing between a parameter and its value" , |
28 | MissingQuote => "a quote ( \") was missing from a parameter value" , |
29 | InvalidToken { .. } => "an invalid token was encountered" , |
30 | } |
31 | } |
32 | } |
33 | |
34 | impl fmt::Display for ParseError { |
35 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
36 | if let ParseError::InvalidToken { pos: usize, byte: u8 } = *self { |
37 | write!(f, " {}, {:X} at position {}" , self.s(), byte, pos) |
38 | } else { |
39 | f.write_str(self.s()) |
40 | } |
41 | } |
42 | } |
43 | |
44 | impl Error for ParseError { |
45 | // Minimum Rust is 1.15, Error::description was still required then |
46 | #[allow (deprecated)] |
47 | fn description(&self) -> &str { |
48 | self.s() |
49 | } |
50 | } |
51 | |
52 | impl<'a> MimeIter<'a> { |
53 | /// A new iterator over mimes or media types |
54 | pub fn new(s: &'a str) -> Self { |
55 | Self { |
56 | pos: 0, |
57 | source: s, |
58 | } |
59 | } |
60 | } |
61 | |
62 | impl<'a> Iterator for MimeIter<'a> { |
63 | type Item = Result<Mime, &'a str>; |
64 | |
65 | fn next(&mut self) -> Option<Self::Item> { |
66 | let start = self.pos; |
67 | let len = self.source.bytes().len(); |
68 | |
69 | if start >= len { |
70 | return None |
71 | } |
72 | |
73 | // Try parsing the whole remaining slice, until the end |
74 | match parse(&self.source[start ..len]) { |
75 | Ok(value) => { |
76 | self.pos = len; |
77 | Some(Ok(value)) |
78 | } |
79 | Err(ParseError::InvalidToken { pos, .. }) => { |
80 | // The first token is immediately found to be wrong by `parse`. Skip it |
81 | if pos == 0 { |
82 | self.pos += 1; |
83 | return self.next() |
84 | } |
85 | let slice = &self.source[start .. start + pos]; |
86 | // Try parsing the longest slice (until the first invalid token) |
87 | return match parse(slice) { |
88 | Ok(mime) => { |
89 | self.pos = start + pos + 1; |
90 | Some(Ok(mime)) |
91 | } |
92 | Err(_) => { |
93 | if start + pos < len { |
94 | // Skip this invalid slice, |
95 | // try parsing the remaining slice in the next iteration |
96 | self.pos = start + pos; |
97 | Some(Err(slice)) |
98 | } else { |
99 | None |
100 | } |
101 | } |
102 | } |
103 | } |
104 | // Do not process any other error condition: the slice is malformed and |
105 | // no character is found to be invalid: a character is missing |
106 | Err(_) => None, |
107 | } |
108 | } |
109 | } |
110 | |
111 | pub fn parse(s: &str) -> Result<Mime, ParseError> { |
112 | if s == "*/*" { |
113 | return Ok(::STAR_STAR); |
114 | } |
115 | |
116 | let mut iter = s.bytes().enumerate(); |
117 | // toplevel |
118 | let mut start; |
119 | let slash; |
120 | loop { |
121 | match iter.next() { |
122 | Some((_, c)) if is_token(c) => (), |
123 | Some((i, b'/' )) if i > 0 => { |
124 | slash = i; |
125 | start = i + 1; |
126 | break; |
127 | }, |
128 | None => return Err(ParseError::MissingSlash), // EOF and no toplevel is no Mime |
129 | Some((pos, byte)) => return Err(ParseError::InvalidToken { |
130 | pos: pos, |
131 | byte: byte, |
132 | }) |
133 | }; |
134 | |
135 | } |
136 | |
137 | // sublevel |
138 | let mut plus = None; |
139 | loop { |
140 | match iter.next() { |
141 | Some((i, b'+' )) if i > start => { |
142 | plus = Some(i); |
143 | }, |
144 | Some((i, b';' )) if i > start => { |
145 | start = i; |
146 | break; |
147 | }, |
148 | Some((_, c)) if is_token(c) => (), |
149 | None => { |
150 | return Ok(Mime { |
151 | source: Source::Dynamic(s.to_ascii_lowercase()), |
152 | slash: slash, |
153 | plus: plus, |
154 | params: ParamSource::None, |
155 | }); |
156 | }, |
157 | Some((pos, byte)) => return Err(ParseError::InvalidToken { |
158 | pos: pos, |
159 | byte: byte, |
160 | }) |
161 | }; |
162 | } |
163 | |
164 | // params |
165 | let params = params_from_str(s, &mut iter, start)?; |
166 | |
167 | let src = match params { |
168 | ParamSource::Utf8(_) => s.to_ascii_lowercase(), |
169 | ParamSource::Custom(semicolon, ref indices) => lower_ascii_with_params(s, semicolon, indices), |
170 | ParamSource::None => { |
171 | // Chop off the empty list |
172 | s[..start].to_ascii_lowercase() |
173 | } |
174 | }; |
175 | |
176 | Ok(Mime { |
177 | source: Source::Dynamic(src), |
178 | slash: slash, |
179 | plus: plus, |
180 | params: params, |
181 | }) |
182 | } |
183 | |
184 | |
185 | fn params_from_str(s: &str, iter: &mut Enumerate<Bytes>, mut start: usize) -> Result<ParamSource, ParseError> { |
186 | let semicolon = start; |
187 | start += 1; |
188 | let mut params = ParamSource::None; |
189 | 'params: while start < s.len() { |
190 | let name; |
191 | // name |
192 | 'name: loop { |
193 | match iter.next() { |
194 | Some((i, b' ' )) if i == start => { |
195 | start = i + 1; |
196 | continue 'params; |
197 | }, |
198 | Some((_, c)) if is_token(c) => (), |
199 | Some((i, b'=' )) if i > start => { |
200 | name = Indexed(start, i); |
201 | start = i + 1; |
202 | break 'name; |
203 | }, |
204 | None => return Err(ParseError::MissingEqual), |
205 | Some((pos, byte)) => return Err(ParseError::InvalidToken { |
206 | pos: pos, |
207 | byte: byte, |
208 | }), |
209 | } |
210 | } |
211 | |
212 | let value; |
213 | // values must be restrict-name-char or "anything goes" |
214 | let mut is_quoted = false; |
215 | |
216 | 'value: loop { |
217 | if is_quoted { |
218 | match iter.next() { |
219 | Some((i, b'"' )) if i > start => { |
220 | value = Indexed(start, i); |
221 | break 'value; |
222 | }, |
223 | Some((_, c)) if is_restricted_quoted_char(c) => (), |
224 | None => return Err(ParseError::MissingQuote), |
225 | Some((pos, byte)) => return Err(ParseError::InvalidToken { |
226 | pos: pos, |
227 | byte: byte, |
228 | }), |
229 | } |
230 | } else { |
231 | match iter.next() { |
232 | Some((i, b'"' )) if i == start => { |
233 | is_quoted = true; |
234 | start = i + 1; |
235 | }, |
236 | Some((_, c)) if is_token(c) => (), |
237 | Some((i, b';' )) if i > start => { |
238 | value = Indexed(start, i); |
239 | start = i + 1; |
240 | break 'value; |
241 | } |
242 | None => { |
243 | value = Indexed(start, s.len()); |
244 | start = s.len(); |
245 | break 'value; |
246 | }, |
247 | |
248 | Some((pos, byte)) => return Err(ParseError::InvalidToken { |
249 | pos: pos, |
250 | byte: byte, |
251 | }), |
252 | } |
253 | } |
254 | } |
255 | |
256 | if is_quoted { |
257 | 'ws: loop { |
258 | match iter.next() { |
259 | Some((i, b';' )) => { |
260 | // next param |
261 | start = i + 1; |
262 | break 'ws; |
263 | }, |
264 | Some((_, b' ' )) => { |
265 | // skip whitespace |
266 | }, |
267 | None => { |
268 | // eof |
269 | start = s.len(); |
270 | break 'ws; |
271 | }, |
272 | Some((pos, byte)) => return Err(ParseError::InvalidToken { |
273 | pos: pos, |
274 | byte: byte, |
275 | }), |
276 | } |
277 | } |
278 | } |
279 | |
280 | match params { |
281 | ParamSource::Utf8(i) => { |
282 | let i = i + 2; |
283 | let charset = Indexed(i, "charset" .len() + i); |
284 | let utf8 = Indexed(charset.1 + 1, charset.1 + "utf-8" .len() + 1); |
285 | params = ParamSource::Custom(semicolon, vec![ |
286 | (charset, utf8), |
287 | (name, value), |
288 | ]); |
289 | }, |
290 | ParamSource::Custom(_, ref mut vec) => { |
291 | vec.push((name, value)); |
292 | }, |
293 | ParamSource::None => { |
294 | if semicolon + 2 == name.0 && CHARSET == &s[name.0..name.1] { |
295 | if UTF_8 == &s[value.0..value.1] { |
296 | params = ParamSource::Utf8(semicolon); |
297 | continue 'params; |
298 | } |
299 | } |
300 | params = ParamSource::Custom(semicolon, vec![(name, value)]); |
301 | }, |
302 | } |
303 | } |
304 | Ok(params) |
305 | } |
306 | |
307 | fn lower_ascii_with_params(s: &str, semi: usize, params: &[(Indexed, Indexed)]) -> String { |
308 | let mut owned: String = s.to_owned(); |
309 | owned[..semi].make_ascii_lowercase(); |
310 | |
311 | for &(ref name: &Indexed, ref value: &Indexed) in params { |
312 | owned[name.0..name.1].make_ascii_lowercase(); |
313 | // Since we just converted this part of the string to lowercase, |
314 | // we can skip the `Name == &str` unicase check and do a faster |
315 | // memcmp instead. |
316 | if &owned[name.0..name.1] == CHARSET.source { |
317 | owned[value.0..value.1].make_ascii_lowercase(); |
318 | } |
319 | } |
320 | |
321 | owned |
322 | } |
323 | |
324 | // From [RFC6838](http://tools.ietf.org/html/rfc6838#section-4.2): |
325 | // |
326 | // > All registered media types MUST be assigned top-level type and |
327 | // > subtype names. The combination of these names serves to uniquely |
328 | // > identify the media type, and the subtype name facet (or the absence |
329 | // > of one) identifies the registration tree. Both top-level type and |
330 | // > subtype names are case-insensitive. |
331 | // > |
332 | // > Type and subtype names MUST conform to the following ABNF: |
333 | // > |
334 | // > type-name = restricted-name |
335 | // > subtype-name = restricted-name |
336 | // > |
337 | // > restricted-name = restricted-name-first *126restricted-name-chars |
338 | // > restricted-name-first = ALPHA / DIGIT |
339 | // > restricted-name-chars = ALPHA / DIGIT / "!" / "#" / |
340 | // > "$" / "&" / "-" / "^" / "_" |
341 | // > restricted-name-chars =/ "." ; Characters before first dot always |
342 | // > ; specify a facet name |
343 | // > restricted-name-chars =/ "+" ; Characters after last plus always |
344 | // > ; specify a structured syntax suffix |
345 | |
346 | // However, [HTTP](https://tools.ietf.org/html/rfc7231#section-3.1.1.1): |
347 | // |
348 | // > media-type = type "/" subtype *( OWS ";" OWS parameter ) |
349 | // > type = token |
350 | // > subtype = token |
351 | // > parameter = token "=" ( token / quoted-string ) |
352 | // |
353 | // Where token is defined as: |
354 | // |
355 | // > token = 1*tchar |
356 | // > tchar = "!" / "#" / "$" / "%" / "&" / "'" / "*" / "+" / "-" / "." / |
357 | // > "^" / "_" / "`" / "|" / "~" / DIGIT / ALPHA |
358 | // |
// So, clearly, the two specifications disagree ¯\_(ツ)_/¯
360 | |
// Turns a comma-terminated list of integer flags into an array of `bool`s:
// every non-zero flag becomes `true`, every zero becomes `false`.
macro_rules! byte_map {
    ($($bit:expr,)*) => {
        [$($bit != 0,)*]
    };
}
366 | |
// Lookup table indexed by byte value: `true` for every byte that `is_token`
// accepts. Each row below covers 16 consecutive byte values.
static TOKEN_MAP: [bool; 256] = byte_map![
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x00-0x0F: control characters
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x10-0x1F: control characters
    0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, // 0x20-0x2F: space ! " # $ % & ' ( ) * + , - . /
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, // 0x30-0x3F: 0-9 : ; < = > ?
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x40-0x4F: @ A-O
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, // 0x50-0x5F: P-Z [ \ ] ^ _
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x60-0x6F: ` a-o
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, // 0x70-0x7F: p-z { | } ~ DEL
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x80-0xFF: non-ASCII bytes, never tokens
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
];
385 | |
386 | fn is_token(c: u8) -> bool { |
387 | TOKEN_MAP[c as usize] |
388 | } |
389 | |
/// Reports whether byte `c` is accepted inside a quoted parameter value:
/// every byte except the values 0 through 31 and 127.
fn is_restricted_quoted_char(c: u8) -> bool {
    let rejected = c < 32 || c == 127;
    !rejected
}
393 | |
/// Checks every entry of `TOKEN_MAP` against the explicit list of token
/// bytes: ASCII letters, digits and the punctuation listed below.
#[test]
#[allow(warnings)] // ... ranges deprecated
fn test_lookup_tables() {
    for (i, &valid) in TOKEN_MAP.iter().enumerate() {
        let i = i as u8;
        let should = match i {
            b'a'...b'z' |
            b'A'...b'Z' |
            b'0'...b'9' |
            b'!' |
            b'#' |
            b'$' |
            b'%' |
            b'&' |
            // A byte literal holds exactly one (possibly escaped) character.
            b'\'' |
            b'*' |
            b'+' |
            b'-' |
            b'.' |
            b'^' |
            b'_' |
            b'`' |
            b'|' |
            b'~' => true,
            _ => false
        };
        assert_eq!(valid, should, " {:?} ( {}) should be {}", i as char, i, should);
    }
}
423 | |
/// Valid inputs: every entry is yielded as `Ok`, then the iterator ends.
#[test]
fn test_parse_iterator() {
    let json = parse("application/json").unwrap();

    // Two comma-separated media types.
    let mut pair = MimeIter::new("application/json, application/json");
    assert_eq!(pair.next().unwrap().unwrap(), json);
    assert_eq!(pair.next().unwrap().unwrap(), json);
    assert_eq!(pair.next(), None);

    // A single media type.
    let mut single = MimeIter::new("application/json");
    assert_eq!(single.next().unwrap().unwrap(), json);
    assert_eq!(single.next(), None);

    // A trailing, empty parameter list.
    let mut trailing = MimeIter::new("application/json; ");
    assert_eq!(trailing.next().unwrap().unwrap(), json);
    assert_eq!(trailing.next(), None);
}
439 | |
/// An invalid entry between two valid ones is reported as `Err` carrying the
/// offending slice, and iteration continues afterwards.
#[test]
fn test_parse_iterator_invalid() {
    let json = parse("application/json").unwrap();
    let mut mimes = MimeIter::new("application/json, invalid, application/json");

    let first = mimes.next().unwrap();
    assert_eq!(first.unwrap(), json);

    let second = mimes.next().unwrap();
    assert_eq!(second.unwrap_err(), "invalid");

    let third = mimes.next().unwrap();
    assert_eq!(third.unwrap(), json);

    assert_eq!(mimes.next(), None);
}
448 | |
/// When no entry is a media type, the first one is reported as `Err` and the
/// iterator then stops.
///
/// Neither entry contains a `/`, so neither can parse: the first is cut at
/// the `,` and yielded as an error, while the last one reaches the end of the
/// input without an invalid byte, which ends the iteration.
#[test]
fn test_parse_iterator_all_invalid() {
    let mut iter = MimeIter::new("application-json, text-html");
    assert_eq!(iter.next().unwrap().unwrap_err(), "application-json");
    assert_eq!(iter.next(), None);
}
455 | |