1 | // This is a part of Chrono. |
2 | // See README.md and LICENSE.txt for details. |
3 | |
4 | /*! |
5 | * Various scanning routines for the parser. |
6 | */ |
7 | |
8 | use super::{ParseResult, INVALID, OUT_OF_RANGE, TOO_SHORT}; |
9 | use crate::Weekday; |
10 | |
11 | /// Tries to parse the non-negative number from `min` to `max` digits. |
12 | /// |
13 | /// The absence of digits at all is an unconditional error. |
14 | /// More than `max` digits are consumed up to the first `max` digits. |
15 | /// Any number that does not fit in `i64` is an error. |
16 | #[inline ] |
17 | pub(super) fn number(s: &str, min: usize, max: usize) -> ParseResult<(&str, i64)> { |
18 | assert!(min <= max); |
19 | |
20 | // We are only interested in ascii numbers, so we can work with the `str` as bytes. We stop on |
21 | // the first non-numeric byte, which may be another ascii character or beginning of multi-byte |
22 | // UTF-8 character. |
23 | let bytes = s.as_bytes(); |
24 | if bytes.len() < min { |
25 | return Err(TOO_SHORT); |
26 | } |
27 | |
28 | let mut n = 0i64; |
29 | for (i, c) in bytes.iter().take(max).cloned().enumerate() { |
30 | // cloned() = copied() |
31 | if !c.is_ascii_digit() { |
32 | if i < min { |
33 | return Err(INVALID); |
34 | } else { |
35 | return Ok((&s[i..], n)); |
36 | } |
37 | } |
38 | |
39 | n = match n.checked_mul(10).and_then(|n| n.checked_add((c - b'0' ) as i64)) { |
40 | Some(n) => n, |
41 | None => return Err(OUT_OF_RANGE), |
42 | }; |
43 | } |
44 | |
45 | Ok((&s[core::cmp::min(max, bytes.len())..], n)) |
46 | } |
47 | |
48 | /// Tries to consume at least one digits as a fractional second. |
49 | /// Returns the number of whole nanoseconds (0--999,999,999). |
50 | pub(super) fn nanosecond(s: &str) -> ParseResult<(&str, i64)> { |
51 | // record the number of digits consumed for later scaling. |
52 | let origlen: usize = s.len(); |
53 | let (s: &str, v: i64) = number(s, min:1, max:9)?; |
54 | let consumed: usize = origlen - s.len(); |
55 | |
56 | // scale the number accordingly. |
57 | static SCALE: [i64; 10] = |
58 | [0, 100_000_000, 10_000_000, 1_000_000, 100_000, 10_000, 1_000, 100, 10, 1]; |
59 | let v: i64 = v.checked_mul(SCALE[consumed]).ok_or(OUT_OF_RANGE)?; |
60 | |
61 | // if there are more than 9 digits, skip next digits. |
62 | let s: &str = s.trim_start_matches(|c: char| c.is_ascii_digit()); |
63 | |
64 | Ok((s, v)) |
65 | } |
66 | |
67 | /// Tries to consume a fixed number of digits as a fractional second. |
68 | /// Returns the number of whole nanoseconds (0--999,999,999). |
69 | pub(super) fn nanosecond_fixed(s: &str, digits: usize) -> ParseResult<(&str, i64)> { |
70 | // record the number of digits consumed for later scaling. |
71 | let (s: &str, v: i64) = number(s, min:digits, max:digits)?; |
72 | |
73 | // scale the number accordingly. |
74 | static SCALE: [i64; 10] = |
75 | [0, 100_000_000, 10_000_000, 1_000_000, 100_000, 10_000, 1_000, 100, 10, 1]; |
76 | let v: i64 = v.checked_mul(SCALE[digits]).ok_or(OUT_OF_RANGE)?; |
77 | |
78 | Ok((s, v)) |
79 | } |
80 | |
81 | /// Tries to parse the month index (0 through 11) with the first three ASCII letters. |
82 | pub(super) fn short_month0(s: &str) -> ParseResult<(&str, u8)> { |
83 | if s.len() < 3 { |
84 | return Err(TOO_SHORT); |
85 | } |
86 | let buf: &[u8] = s.as_bytes(); |
87 | let month0: u8 = match (buf[0] | 32, buf[1] | 32, buf[2] | 32) { |
88 | (b'j' , b'a' , b'n' ) => 0, |
89 | (b'f' , b'e' , b'b' ) => 1, |
90 | (b'm' , b'a' , b'r' ) => 2, |
91 | (b'a' , b'p' , b'r' ) => 3, |
92 | (b'm' , b'a' , b'y' ) => 4, |
93 | (b'j' , b'u' , b'n' ) => 5, |
94 | (b'j' , b'u' , b'l' ) => 6, |
95 | (b'a' , b'u' , b'g' ) => 7, |
96 | (b's' , b'e' , b'p' ) => 8, |
97 | (b'o' , b'c' , b't' ) => 9, |
98 | (b'n' , b'o' , b'v' ) => 10, |
99 | (b'd' , b'e' , b'c' ) => 11, |
100 | _ => return Err(INVALID), |
101 | }; |
102 | Ok((&s[3..], month0)) |
103 | } |
104 | |
105 | /// Tries to parse the weekday with the first three ASCII letters. |
106 | pub(super) fn short_weekday(s: &str) -> ParseResult<(&str, Weekday)> { |
107 | if s.len() < 3 { |
108 | return Err(TOO_SHORT); |
109 | } |
110 | let buf: &[u8] = s.as_bytes(); |
111 | let weekday: Weekday = match (buf[0] | 32, buf[1] | 32, buf[2] | 32) { |
112 | (b'm' , b'o' , b'n' ) => Weekday::Mon, |
113 | (b't' , b'u' , b'e' ) => Weekday::Tue, |
114 | (b'w' , b'e' , b'd' ) => Weekday::Wed, |
115 | (b't' , b'h' , b'u' ) => Weekday::Thu, |
116 | (b'f' , b'r' , b'i' ) => Weekday::Fri, |
117 | (b's' , b'a' , b't' ) => Weekday::Sat, |
118 | (b's' , b'u' , b'n' ) => Weekday::Sun, |
119 | _ => return Err(INVALID), |
120 | }; |
121 | Ok((&s[3..], weekday)) |
122 | } |
123 | |
124 | /// Tries to parse the month index (0 through 11) with short or long month names. |
125 | /// It prefers long month names to short month names when both are possible. |
126 | pub(super) fn short_or_long_month0(s: &str) -> ParseResult<(&str, u8)> { |
127 | // lowercased month names, minus first three chars |
128 | static LONG_MONTH_SUFFIXES: [&[u8]; 12] = [ |
129 | b"uary" , b"ruary" , b"ch" , b"il" , b"" , b"e" , b"y" , b"ust" , b"tember" , b"ober" , b"ember" , |
130 | b"ember" , |
131 | ]; |
132 | |
133 | let (mut s: &str, month0: u8) = short_month0(s)?; |
134 | |
135 | // tries to consume the suffix if possible |
136 | let suffix: &[u8] = LONG_MONTH_SUFFIXES[month0 as usize]; |
137 | if s.len() >= suffix.len() && s.as_bytes()[..suffix.len()].eq_ignore_ascii_case(suffix) { |
138 | s = &s[suffix.len()..]; |
139 | } |
140 | |
141 | Ok((s, month0)) |
142 | } |
143 | |
144 | /// Tries to parse the weekday with short or long weekday names. |
145 | /// It prefers long weekday names to short weekday names when both are possible. |
146 | pub(super) fn short_or_long_weekday(s: &str) -> ParseResult<(&str, Weekday)> { |
147 | // lowercased weekday names, minus first three chars |
148 | static LONG_WEEKDAY_SUFFIXES: [&[u8]; 7] = |
149 | [b"day" , b"sday" , b"nesday" , b"rsday" , b"day" , b"urday" , b"day" ]; |
150 | |
151 | let (mut s: &str, weekday: Weekday) = short_weekday(s)?; |
152 | |
153 | // tries to consume the suffix if possible |
154 | let suffix: &[u8] = LONG_WEEKDAY_SUFFIXES[weekday.num_days_from_monday() as usize]; |
155 | if s.len() >= suffix.len() && s.as_bytes()[..suffix.len()].eq_ignore_ascii_case(suffix) { |
156 | s = &s[suffix.len()..]; |
157 | } |
158 | |
159 | Ok((s, weekday)) |
160 | } |
161 | |
162 | /// Tries to consume exactly one given character. |
163 | pub(super) fn char(s: &str, c1: u8) -> ParseResult<&str> { |
164 | match s.as_bytes().first() { |
165 | Some(&c: u8) if c == c1 => Ok(&s[1..]), |
166 | Some(_) => Err(INVALID), |
167 | None => Err(TOO_SHORT), |
168 | } |
169 | } |
170 | |
171 | /// Tries to consume one or more whitespace. |
172 | pub(super) fn space(s: &str) -> ParseResult<&str> { |
173 | let s_: &str = s.trim_start(); |
174 | if s_.len() < s.len() { |
175 | Ok(s_) |
176 | } else if s.is_empty() { |
177 | Err(TOO_SHORT) |
178 | } else { |
179 | Err(INVALID) |
180 | } |
181 | } |
182 | |
183 | /// Consumes any number (including zero) of colon or spaces. |
184 | pub(crate) fn colon_or_space(s: &str) -> ParseResult<&str> { |
185 | Ok(s.trim_start_matches(|c: char| c == ':' || c.is_whitespace())) |
186 | } |
187 | |
188 | /// Parse a timezone from `s` and return the offset in seconds. |
189 | /// |
190 | /// The `consume_colon` function is used to parse a mandatory or optional `:` |
191 | /// separator between hours offset and minutes offset. |
192 | /// |
193 | /// The `allow_missing_minutes` flag allows the timezone minutes offset to be |
194 | /// missing from `s`. |
195 | /// |
196 | /// The `allow_tz_minus_sign` flag allows the timezone offset negative character |
197 | /// to also be `−` MINUS SIGN (U+2212) in addition to the typical |
198 | /// ASCII-compatible `-` HYPHEN-MINUS (U+2D). |
199 | /// This is part of [RFC 3339 & ISO 8601]. |
200 | /// |
201 | /// [RFC 3339 & ISO 8601]: https://en.wikipedia.org/w/index.php?title=ISO_8601&oldid=1114309368#Time_offsets_from_UTC |
202 | pub(crate) fn timezone_offset<F>( |
203 | mut s: &str, |
204 | mut consume_colon: F, |
205 | allow_zulu: bool, |
206 | allow_missing_minutes: bool, |
207 | allow_tz_minus_sign: bool, |
208 | ) -> ParseResult<(&str, i32)> |
209 | where |
210 | F: FnMut(&str) -> ParseResult<&str>, |
211 | { |
212 | if allow_zulu { |
213 | if let Some(&b'Z' | &b'z' ) = s.as_bytes().first() { |
214 | return Ok((&s[1..], 0)); |
215 | } |
216 | } |
217 | |
218 | const fn digits(s: &str) -> ParseResult<(u8, u8)> { |
219 | let b = s.as_bytes(); |
220 | if b.len() < 2 { |
221 | Err(TOO_SHORT) |
222 | } else { |
223 | Ok((b[0], b[1])) |
224 | } |
225 | } |
226 | let negative = match s.chars().next() { |
227 | Some('+' ) => { |
228 | // PLUS SIGN (U+2B) |
229 | s = &s['+' .len_utf8()..]; |
230 | |
231 | false |
232 | } |
233 | Some('-' ) => { |
234 | // HYPHEN-MINUS (U+2D) |
235 | s = &s['-' .len_utf8()..]; |
236 | |
237 | true |
238 | } |
239 | Some('−' ) => { |
240 | // MINUS SIGN (U+2212) |
241 | if !allow_tz_minus_sign { |
242 | return Err(INVALID); |
243 | } |
244 | s = &s['−' .len_utf8()..]; |
245 | |
246 | true |
247 | } |
248 | Some(_) => return Err(INVALID), |
249 | None => return Err(TOO_SHORT), |
250 | }; |
251 | |
252 | // hours (00--99) |
253 | let hours = match digits(s)? { |
254 | (h1 @ b'0' ..=b'9' , h2 @ b'0' ..=b'9' ) => i32::from((h1 - b'0' ) * 10 + (h2 - b'0' )), |
255 | _ => return Err(INVALID), |
256 | }; |
257 | s = &s[2..]; |
258 | |
259 | // colons (and possibly other separators) |
260 | s = consume_colon(s)?; |
261 | |
262 | // minutes (00--59) |
263 | // if the next two items are digits then we have to add minutes |
264 | let minutes = if let Ok(ds) = digits(s) { |
265 | match ds { |
266 | (m1 @ b'0' ..=b'5' , m2 @ b'0' ..=b'9' ) => i32::from((m1 - b'0' ) * 10 + (m2 - b'0' )), |
267 | (b'6' ..=b'9' , b'0' ..=b'9' ) => return Err(OUT_OF_RANGE), |
268 | _ => return Err(INVALID), |
269 | } |
270 | } else if allow_missing_minutes { |
271 | 0 |
272 | } else { |
273 | return Err(TOO_SHORT); |
274 | }; |
275 | s = match s.len() { |
276 | len if len >= 2 => &s[2..], |
277 | 0 => s, |
278 | _ => return Err(TOO_SHORT), |
279 | }; |
280 | |
281 | let seconds = hours * 3600 + minutes * 60; |
282 | Ok((s, if negative { -seconds } else { seconds })) |
283 | } |
284 | |
285 | /// Same as `timezone_offset` but also allows for RFC 2822 legacy timezones. |
286 | /// May return `None` which indicates an insufficient offset data (i.e. `-0000`). |
287 | /// See [RFC 2822 Section 4.3]. |
288 | /// |
289 | /// [RFC 2822 Section 4.3]: https://tools.ietf.org/html/rfc2822#section-4.3 |
290 | pub(super) fn timezone_offset_2822(s: &str) -> ParseResult<(&str, i32)> { |
291 | // tries to parse legacy time zone names |
292 | let upto = s.as_bytes().iter().position(|&c| !c.is_ascii_alphabetic()).unwrap_or(s.len()); |
293 | if upto > 0 { |
294 | let name = &s.as_bytes()[..upto]; |
295 | let s = &s[upto..]; |
296 | let offset_hours = |o| Ok((s, o * 3600)); |
297 | // RFC 2822 requires support for some named North America timezones, a small subset of all |
298 | // named timezones. |
299 | if name.eq_ignore_ascii_case(b"gmt" ) |
300 | || name.eq_ignore_ascii_case(b"ut" ) |
301 | || name.eq_ignore_ascii_case(b"z" ) |
302 | { |
303 | return offset_hours(0); |
304 | } else if name.eq_ignore_ascii_case(b"edt" ) { |
305 | return offset_hours(-4); |
306 | } else if name.eq_ignore_ascii_case(b"est" ) || name.eq_ignore_ascii_case(b"cdt" ) { |
307 | return offset_hours(-5); |
308 | } else if name.eq_ignore_ascii_case(b"cst" ) || name.eq_ignore_ascii_case(b"mdt" ) { |
309 | return offset_hours(-6); |
310 | } else if name.eq_ignore_ascii_case(b"mst" ) || name.eq_ignore_ascii_case(b"pdt" ) { |
311 | return offset_hours(-7); |
312 | } else if name.eq_ignore_ascii_case(b"pst" ) { |
313 | return offset_hours(-8); |
314 | } else if name.len() == 1 { |
315 | if let b'a' ..=b'i' | b'k' ..=b'y' | b'A' ..=b'I' | b'K' ..=b'Y' = name[0] { |
316 | // recommended by RFC 2822: consume but treat it as -0000 |
317 | return Ok((s, 0)); |
318 | } |
319 | } |
320 | Err(INVALID) |
321 | } else { |
322 | timezone_offset(s, |s| Ok(s), false, false, false) |
323 | } |
324 | } |
325 | |
326 | /// Tries to consume an RFC2822 comment including preceding ` `. |
327 | /// |
328 | /// Returns the remaining string after the closing parenthesis. |
329 | pub(super) fn comment_2822(s: &str) -> ParseResult<(&str, ())> { |
330 | use CommentState::*; |
331 | |
332 | let s: &str = s.trim_start(); |
333 | |
334 | let mut state: CommentState = Start; |
335 | for (i: usize, c: u8) in s.bytes().enumerate() { |
336 | state = match (state, c) { |
337 | (Start, b'(' ) => Next(1), |
338 | (Next(1), b')' ) => return Ok((&s[i + 1..], ())), |
339 | (Next(depth: usize), b' \\' ) => Escape(depth), |
340 | (Next(depth: usize), b'(' ) => Next(depth + 1), |
341 | (Next(depth: usize), b')' ) => Next(depth - 1), |
342 | (Next(depth: usize), _) | (Escape(depth: usize), _) => Next(depth), |
343 | _ => return Err(INVALID), |
344 | }; |
345 | } |
346 | |
347 | Err(TOO_SHORT) |
348 | } |
349 | |
/// Scanner state used by `comment_2822` while walking the bytes of a comment.
enum CommentState {
    /// Nothing consumed yet; the next byte has to be the opening `(`.
    Start,
    /// Inside a comment; the payload is the current parenthesis nesting depth.
    Next(usize),
    /// The previous byte was a backslash at the given depth; the next byte is taken literally.
    Escape(usize),
}
355 | |
#[cfg(test)]
mod tests {
    use super::{
        comment_2822, nanosecond, nanosecond_fixed, short_or_long_month0, short_or_long_weekday,
        timezone_offset_2822,
    };
    use crate::format::{INVALID, TOO_SHORT};
    use crate::Weekday;

    /// Table of comment inputs with the expected remainder or error.
    #[test]
    fn test_rfc2822_comments() {
        let testdata = [
            ("", Err(TOO_SHORT)),
            (" ", Err(TOO_SHORT)),
            ("x", Err(INVALID)),
            ("(", Err(TOO_SHORT)),
            ("()", Ok("")),
            (" \r\n\t()", Ok("")),
            ("() ", Ok(" ")),
            ("()z", Ok("z")),
            ("(x)", Ok("")),
            ("(())", Ok("")),
            ("((()))", Ok("")),
            ("(x(x(x)x)x)", Ok("")),
            ("( x ( x ( x ) x ) x )", Ok("")),
            (r"(\)", Err(TOO_SHORT)),
            (r"(\()", Ok("")),
            (r"(\))", Ok("")),
            (r"(\\)", Ok("")),
            ("(()())", Ok("")),
            ("( x ( x ) x ( x ) x )", Ok("")),
        ];

        for (test_in, expected) in &testdata {
            let actual = comment_2822(test_in).map(|(rest, ())| rest);
            assert_eq!(
                *expected, actual,
                " {:?} expected to produce {:?}, but produced {:?}.",
                test_in, expected, actual
            );
        }
    }

    /// Named zones are matched case-insensitively; numeric offsets and unknown names too.
    #[test]
    fn test_timezone_offset_2822() {
        let accepted: [(&str, i32); 4] =
            [("cSt", -21600), ("pSt", -28800), ("mSt", -25200), ("-1551", -57060)];
        for (input, offset) in accepted.iter() {
            assert_eq!(timezone_offset_2822(input).unwrap(), ("", *offset));
        }

        assert_eq!(timezone_offset_2822("Gp"), Err(INVALID));
    }

    /// Mixed-case abbreviations parse; trailing non-suffix text is left untouched.
    #[test]
    fn test_short_or_long_month0() {
        let cases: [(&str, &str, u8); 7] = [
            ("JUn", "", 5),
            ("mAy", "", 4),
            ("AuG", "", 7),
            ("Aprâ", "â", 3),
            ("JUl", "", 6),
            ("mAr", "", 2),
            ("Jan", "", 0),
        ];
        for (input, rest, month0) in cases.iter() {
            assert_eq!(short_or_long_month0(input).unwrap(), (*rest, *month0));
        }
    }

    /// An incomplete long name only consumes the abbreviation.
    #[test]
    fn test_short_or_long_weekday() {
        let cases = [("sAtu", "u", Weekday::Sat), ("thu", "", Weekday::Thu)];
        for (input, rest, weekday) in cases.iter() {
            let (actual_rest, actual_weekday) = short_or_long_weekday(input).unwrap();
            assert_eq!(actual_rest, *rest);
            assert_eq!(&actual_weekday, weekday);
        }
    }

    /// Zero digits on empty input is fine; one digit on empty input is not.
    #[test]
    fn test_nanosecond_fixed() {
        let parsed = nanosecond_fixed("", 0usize);
        assert_eq!(parsed.unwrap(), ("", 0));

        let failed = nanosecond_fixed("", 1usize);
        assert!(failed.is_err());
    }

    /// A single digit is scaled to nanoseconds; non-ASCII text after it is left alone.
    #[test]
    fn test_nanosecond() {
        let cases: [(&str, &str, i64); 2] = [("2Ù", "Ù", 200000000), ("8", "", 800000000)];
        for (input, rest, nanos) in cases.iter() {
            assert_eq!(nanosecond(input).unwrap(), (*rest, *nanos));
        }
    }
}
437 | |