| 1 | // This is a part of Chrono. |
| 2 | // See README.md and LICENSE.txt for details. |
| 3 | |
| 4 | /*! |
| 5 | * Various scanning routines for the parser. |
| 6 | */ |
| 7 | |
| 8 | use super::{ParseResult, INVALID, OUT_OF_RANGE, TOO_SHORT}; |
| 9 | use crate::Weekday; |
| 10 | |
| 11 | /// Tries to parse the non-negative number from `min` to `max` digits. |
| 12 | /// |
| 13 | /// The absence of digits at all is an unconditional error. |
| 14 | /// More than `max` digits are consumed up to the first `max` digits. |
| 15 | /// Any number that does not fit in `i64` is an error. |
| 16 | #[inline ] |
| 17 | pub(super) fn number(s: &str, min: usize, max: usize) -> ParseResult<(&str, i64)> { |
| 18 | assert!(min <= max); |
| 19 | |
| 20 | // We are only interested in ascii numbers, so we can work with the `str` as bytes. We stop on |
| 21 | // the first non-numeric byte, which may be another ascii character or beginning of multi-byte |
| 22 | // UTF-8 character. |
| 23 | let bytes = s.as_bytes(); |
| 24 | if bytes.len() < min { |
| 25 | return Err(TOO_SHORT); |
| 26 | } |
| 27 | |
| 28 | let mut n = 0i64; |
| 29 | for (i, c) in bytes.iter().take(max).cloned().enumerate() { |
| 30 | // cloned() = copied() |
| 31 | if !c.is_ascii_digit() { |
| 32 | if i < min { |
| 33 | return Err(INVALID); |
| 34 | } else { |
| 35 | return Ok((&s[i..], n)); |
| 36 | } |
| 37 | } |
| 38 | |
| 39 | n = match n.checked_mul(10).and_then(|n| n.checked_add((c - b'0' ) as i64)) { |
| 40 | Some(n) => n, |
| 41 | None => return Err(OUT_OF_RANGE), |
| 42 | }; |
| 43 | } |
| 44 | |
| 45 | Ok((&s[core::cmp::min(max, bytes.len())..], n)) |
| 46 | } |
| 47 | |
| 48 | /// Tries to consume at least one digits as a fractional second. |
| 49 | /// Returns the number of whole nanoseconds (0--999,999,999). |
| 50 | pub(super) fn nanosecond(s: &str) -> ParseResult<(&str, i64)> { |
| 51 | // record the number of digits consumed for later scaling. |
| 52 | let origlen: usize = s.len(); |
| 53 | let (s: &str, v: i64) = number(s, min:1, max:9)?; |
| 54 | let consumed: usize = origlen - s.len(); |
| 55 | |
| 56 | // scale the number accordingly. |
| 57 | static SCALE: [i64; 10] = |
| 58 | [0, 100_000_000, 10_000_000, 1_000_000, 100_000, 10_000, 1_000, 100, 10, 1]; |
| 59 | let v: i64 = v.checked_mul(SCALE[consumed]).ok_or(OUT_OF_RANGE)?; |
| 60 | |
| 61 | // if there are more than 9 digits, skip next digits. |
| 62 | let s: &str = s.trim_start_matches(|c: char| c.is_ascii_digit()); |
| 63 | |
| 64 | Ok((s, v)) |
| 65 | } |
| 66 | |
| 67 | /// Tries to consume a fixed number of digits as a fractional second. |
| 68 | /// Returns the number of whole nanoseconds (0--999,999,999). |
| 69 | pub(super) fn nanosecond_fixed(s: &str, digits: usize) -> ParseResult<(&str, i64)> { |
| 70 | // record the number of digits consumed for later scaling. |
| 71 | let (s: &str, v: i64) = number(s, min:digits, max:digits)?; |
| 72 | |
| 73 | // scale the number accordingly. |
| 74 | static SCALE: [i64; 10] = |
| 75 | [0, 100_000_000, 10_000_000, 1_000_000, 100_000, 10_000, 1_000, 100, 10, 1]; |
| 76 | let v: i64 = v.checked_mul(SCALE[digits]).ok_or(OUT_OF_RANGE)?; |
| 77 | |
| 78 | Ok((s, v)) |
| 79 | } |
| 80 | |
| 81 | /// Tries to parse the month index (0 through 11) with the first three ASCII letters. |
| 82 | pub(super) fn short_month0(s: &str) -> ParseResult<(&str, u8)> { |
| 83 | if s.len() < 3 { |
| 84 | return Err(TOO_SHORT); |
| 85 | } |
| 86 | let buf: &[u8] = s.as_bytes(); |
| 87 | let month0: u8 = match (buf[0] | 32, buf[1] | 32, buf[2] | 32) { |
| 88 | (b'j' , b'a' , b'n' ) => 0, |
| 89 | (b'f' , b'e' , b'b' ) => 1, |
| 90 | (b'm' , b'a' , b'r' ) => 2, |
| 91 | (b'a' , b'p' , b'r' ) => 3, |
| 92 | (b'm' , b'a' , b'y' ) => 4, |
| 93 | (b'j' , b'u' , b'n' ) => 5, |
| 94 | (b'j' , b'u' , b'l' ) => 6, |
| 95 | (b'a' , b'u' , b'g' ) => 7, |
| 96 | (b's' , b'e' , b'p' ) => 8, |
| 97 | (b'o' , b'c' , b't' ) => 9, |
| 98 | (b'n' , b'o' , b'v' ) => 10, |
| 99 | (b'd' , b'e' , b'c' ) => 11, |
| 100 | _ => return Err(INVALID), |
| 101 | }; |
| 102 | Ok((&s[3..], month0)) |
| 103 | } |
| 104 | |
| 105 | /// Tries to parse the weekday with the first three ASCII letters. |
| 106 | pub(super) fn short_weekday(s: &str) -> ParseResult<(&str, Weekday)> { |
| 107 | if s.len() < 3 { |
| 108 | return Err(TOO_SHORT); |
| 109 | } |
| 110 | let buf: &[u8] = s.as_bytes(); |
| 111 | let weekday: Weekday = match (buf[0] | 32, buf[1] | 32, buf[2] | 32) { |
| 112 | (b'm' , b'o' , b'n' ) => Weekday::Mon, |
| 113 | (b't' , b'u' , b'e' ) => Weekday::Tue, |
| 114 | (b'w' , b'e' , b'd' ) => Weekday::Wed, |
| 115 | (b't' , b'h' , b'u' ) => Weekday::Thu, |
| 116 | (b'f' , b'r' , b'i' ) => Weekday::Fri, |
| 117 | (b's' , b'a' , b't' ) => Weekday::Sat, |
| 118 | (b's' , b'u' , b'n' ) => Weekday::Sun, |
| 119 | _ => return Err(INVALID), |
| 120 | }; |
| 121 | Ok((&s[3..], weekday)) |
| 122 | } |
| 123 | |
| 124 | /// Tries to parse the month index (0 through 11) with short or long month names. |
| 125 | /// It prefers long month names to short month names when both are possible. |
| 126 | pub(super) fn short_or_long_month0(s: &str) -> ParseResult<(&str, u8)> { |
| 127 | // lowercased month names, minus first three chars |
| 128 | static LONG_MONTH_SUFFIXES: [&[u8]; 12] = [ |
| 129 | b"uary" , b"ruary" , b"ch" , b"il" , b"" , b"e" , b"y" , b"ust" , b"tember" , b"ober" , b"ember" , |
| 130 | b"ember" , |
| 131 | ]; |
| 132 | |
| 133 | let (mut s: &str, month0: u8) = short_month0(s)?; |
| 134 | |
| 135 | // tries to consume the suffix if possible |
| 136 | let suffix: &[u8] = LONG_MONTH_SUFFIXES[month0 as usize]; |
| 137 | if s.len() >= suffix.len() && s.as_bytes()[..suffix.len()].eq_ignore_ascii_case(suffix) { |
| 138 | s = &s[suffix.len()..]; |
| 139 | } |
| 140 | |
| 141 | Ok((s, month0)) |
| 142 | } |
| 143 | |
| 144 | /// Tries to parse the weekday with short or long weekday names. |
| 145 | /// It prefers long weekday names to short weekday names when both are possible. |
| 146 | pub(super) fn short_or_long_weekday(s: &str) -> ParseResult<(&str, Weekday)> { |
| 147 | // lowercased weekday names, minus first three chars |
| 148 | static LONG_WEEKDAY_SUFFIXES: [&[u8]; 7] = |
| 149 | [b"day" , b"sday" , b"nesday" , b"rsday" , b"day" , b"urday" , b"day" ]; |
| 150 | |
| 151 | let (mut s: &str, weekday: Weekday) = short_weekday(s)?; |
| 152 | |
| 153 | // tries to consume the suffix if possible |
| 154 | let suffix: &[u8] = LONG_WEEKDAY_SUFFIXES[weekday.num_days_from_monday() as usize]; |
| 155 | if s.len() >= suffix.len() && s.as_bytes()[..suffix.len()].eq_ignore_ascii_case(suffix) { |
| 156 | s = &s[suffix.len()..]; |
| 157 | } |
| 158 | |
| 159 | Ok((s, weekday)) |
| 160 | } |
| 161 | |
| 162 | /// Tries to consume exactly one given character. |
| 163 | pub(super) fn char(s: &str, c1: u8) -> ParseResult<&str> { |
| 164 | match s.as_bytes().first() { |
| 165 | Some(&c: u8) if c == c1 => Ok(&s[1..]), |
| 166 | Some(_) => Err(INVALID), |
| 167 | None => Err(TOO_SHORT), |
| 168 | } |
| 169 | } |
| 170 | |
| 171 | /// Tries to consume one or more whitespace. |
| 172 | pub(super) fn space(s: &str) -> ParseResult<&str> { |
| 173 | let s_: &str = s.trim_start(); |
| 174 | if s_.len() < s.len() { |
| 175 | Ok(s_) |
| 176 | } else if s.is_empty() { |
| 177 | Err(TOO_SHORT) |
| 178 | } else { |
| 179 | Err(INVALID) |
| 180 | } |
| 181 | } |
| 182 | |
| 183 | /// Consumes any number (including zero) of colon or spaces. |
| 184 | pub(crate) fn colon_or_space(s: &str) -> ParseResult<&str> { |
| 185 | Ok(s.trim_start_matches(|c: char| c == ':' || c.is_whitespace())) |
| 186 | } |
| 187 | |
| 188 | /// Parse a timezone from `s` and return the offset in seconds. |
| 189 | /// |
| 190 | /// The `consume_colon` function is used to parse a mandatory or optional `:` |
| 191 | /// separator between hours offset and minutes offset. |
| 192 | /// |
| 193 | /// The `allow_missing_minutes` flag allows the timezone minutes offset to be |
| 194 | /// missing from `s`. |
| 195 | /// |
| 196 | /// The `allow_tz_minus_sign` flag allows the timezone offset negative character |
| 197 | /// to also be `−` MINUS SIGN (U+2212) in addition to the typical |
| 198 | /// ASCII-compatible `-` HYPHEN-MINUS (U+2D). |
| 199 | /// This is part of [RFC 3339 & ISO 8601]. |
| 200 | /// |
| 201 | /// [RFC 3339 & ISO 8601]: https://en.wikipedia.org/w/index.php?title=ISO_8601&oldid=1114309368#Time_offsets_from_UTC |
| 202 | pub(crate) fn timezone_offset<F>( |
| 203 | mut s: &str, |
| 204 | mut consume_colon: F, |
| 205 | allow_zulu: bool, |
| 206 | allow_missing_minutes: bool, |
| 207 | allow_tz_minus_sign: bool, |
| 208 | ) -> ParseResult<(&str, i32)> |
| 209 | where |
| 210 | F: FnMut(&str) -> ParseResult<&str>, |
| 211 | { |
| 212 | if allow_zulu { |
| 213 | if let Some(&b'Z' | &b'z' ) = s.as_bytes().first() { |
| 214 | return Ok((&s[1..], 0)); |
| 215 | } |
| 216 | } |
| 217 | |
| 218 | const fn digits(s: &str) -> ParseResult<(u8, u8)> { |
| 219 | let b = s.as_bytes(); |
| 220 | if b.len() < 2 { |
| 221 | Err(TOO_SHORT) |
| 222 | } else { |
| 223 | Ok((b[0], b[1])) |
| 224 | } |
| 225 | } |
| 226 | let negative = match s.chars().next() { |
| 227 | Some('+' ) => { |
| 228 | // PLUS SIGN (U+2B) |
| 229 | s = &s['+' .len_utf8()..]; |
| 230 | |
| 231 | false |
| 232 | } |
| 233 | Some('-' ) => { |
| 234 | // HYPHEN-MINUS (U+2D) |
| 235 | s = &s['-' .len_utf8()..]; |
| 236 | |
| 237 | true |
| 238 | } |
| 239 | Some('−' ) => { |
| 240 | // MINUS SIGN (U+2212) |
| 241 | if !allow_tz_minus_sign { |
| 242 | return Err(INVALID); |
| 243 | } |
| 244 | s = &s['−' .len_utf8()..]; |
| 245 | |
| 246 | true |
| 247 | } |
| 248 | Some(_) => return Err(INVALID), |
| 249 | None => return Err(TOO_SHORT), |
| 250 | }; |
| 251 | |
| 252 | // hours (00--99) |
| 253 | let hours = match digits(s)? { |
| 254 | (h1 @ b'0' ..=b'9' , h2 @ b'0' ..=b'9' ) => i32::from((h1 - b'0' ) * 10 + (h2 - b'0' )), |
| 255 | _ => return Err(INVALID), |
| 256 | }; |
| 257 | s = &s[2..]; |
| 258 | |
| 259 | // colons (and possibly other separators) |
| 260 | s = consume_colon(s)?; |
| 261 | |
| 262 | // minutes (00--59) |
| 263 | // if the next two items are digits then we have to add minutes |
| 264 | let minutes = if let Ok(ds) = digits(s) { |
| 265 | match ds { |
| 266 | (m1 @ b'0' ..=b'5' , m2 @ b'0' ..=b'9' ) => i32::from((m1 - b'0' ) * 10 + (m2 - b'0' )), |
| 267 | (b'6' ..=b'9' , b'0' ..=b'9' ) => return Err(OUT_OF_RANGE), |
| 268 | _ => return Err(INVALID), |
| 269 | } |
| 270 | } else if allow_missing_minutes { |
| 271 | 0 |
| 272 | } else { |
| 273 | return Err(TOO_SHORT); |
| 274 | }; |
| 275 | s = match s.len() { |
| 276 | len if len >= 2 => &s[2..], |
| 277 | 0 => s, |
| 278 | _ => return Err(TOO_SHORT), |
| 279 | }; |
| 280 | |
| 281 | let seconds = hours * 3600 + minutes * 60; |
| 282 | Ok((s, if negative { -seconds } else { seconds })) |
| 283 | } |
| 284 | |
| 285 | /// Same as `timezone_offset` but also allows for RFC 2822 legacy timezones. |
| 286 | /// May return `None` which indicates an insufficient offset data (i.e. `-0000`). |
| 287 | /// See [RFC 2822 Section 4.3]. |
| 288 | /// |
| 289 | /// [RFC 2822 Section 4.3]: https://tools.ietf.org/html/rfc2822#section-4.3 |
| 290 | pub(super) fn timezone_offset_2822(s: &str) -> ParseResult<(&str, i32)> { |
| 291 | // tries to parse legacy time zone names |
| 292 | let upto = s.as_bytes().iter().position(|&c| !c.is_ascii_alphabetic()).unwrap_or(s.len()); |
| 293 | if upto > 0 { |
| 294 | let name = &s.as_bytes()[..upto]; |
| 295 | let s = &s[upto..]; |
| 296 | let offset_hours = |o| Ok((s, o * 3600)); |
| 297 | // RFC 2822 requires support for some named North America timezones, a small subset of all |
| 298 | // named timezones. |
| 299 | if name.eq_ignore_ascii_case(b"gmt" ) |
| 300 | || name.eq_ignore_ascii_case(b"ut" ) |
| 301 | || name.eq_ignore_ascii_case(b"z" ) |
| 302 | { |
| 303 | return offset_hours(0); |
| 304 | } else if name.eq_ignore_ascii_case(b"edt" ) { |
| 305 | return offset_hours(-4); |
| 306 | } else if name.eq_ignore_ascii_case(b"est" ) || name.eq_ignore_ascii_case(b"cdt" ) { |
| 307 | return offset_hours(-5); |
| 308 | } else if name.eq_ignore_ascii_case(b"cst" ) || name.eq_ignore_ascii_case(b"mdt" ) { |
| 309 | return offset_hours(-6); |
| 310 | } else if name.eq_ignore_ascii_case(b"mst" ) || name.eq_ignore_ascii_case(b"pdt" ) { |
| 311 | return offset_hours(-7); |
| 312 | } else if name.eq_ignore_ascii_case(b"pst" ) { |
| 313 | return offset_hours(-8); |
| 314 | } else if name.len() == 1 { |
| 315 | if let b'a' ..=b'i' | b'k' ..=b'y' | b'A' ..=b'I' | b'K' ..=b'Y' = name[0] { |
| 316 | // recommended by RFC 2822: consume but treat it as -0000 |
| 317 | return Ok((s, 0)); |
| 318 | } |
| 319 | } |
| 320 | Err(INVALID) |
| 321 | } else { |
| 322 | timezone_offset(s, |s| Ok(s), false, false, false) |
| 323 | } |
| 324 | } |
| 325 | |
| 326 | /// Tries to consume an RFC2822 comment including preceding ` `. |
| 327 | /// |
| 328 | /// Returns the remaining string after the closing parenthesis. |
| 329 | pub(super) fn comment_2822(s: &str) -> ParseResult<(&str, ())> { |
| 330 | use CommentState::*; |
| 331 | |
| 332 | let s: &str = s.trim_start(); |
| 333 | |
| 334 | let mut state: CommentState = Start; |
| 335 | for (i: usize, c: u8) in s.bytes().enumerate() { |
| 336 | state = match (state, c) { |
| 337 | (Start, b'(' ) => Next(1), |
| 338 | (Next(1), b')' ) => return Ok((&s[i + 1..], ())), |
| 339 | (Next(depth: usize), b' \\' ) => Escape(depth), |
| 340 | (Next(depth: usize), b'(' ) => Next(depth + 1), |
| 341 | (Next(depth: usize), b')' ) => Next(depth - 1), |
| 342 | (Next(depth: usize), _) | (Escape(depth: usize), _) => Next(depth), |
| 343 | _ => return Err(INVALID), |
| 344 | }; |
| 345 | } |
| 346 | |
| 347 | Err(TOO_SHORT) |
| 348 | } |
| 349 | |
/// Scanner state used by `comment_2822` while it walks the bytes of a comment.
enum CommentState {
    /// Nothing consumed yet; the next byte must be the opening `(`.
    Start,
    /// Inside a comment; the payload is the current parenthesis nesting depth.
    Next(usize),
    /// The previous byte was a backslash; the payload is the nesting depth to resume at
    /// once the escaped byte has been skipped.
    Escape(usize),
}
| 355 | |
#[cfg(test)]
mod tests {
    use super::{
        comment_2822, nanosecond, nanosecond_fixed, short_or_long_month0, short_or_long_weekday,
        timezone_offset_2822,
    };
    use crate::format::{INVALID, TOO_SHORT};
    use crate::Weekday;

    /// Table of inputs and the expected remainder (or error) of `comment_2822`.
    #[test]
    fn test_rfc2822_comments() {
        let cases = [
            ("", Err(TOO_SHORT)),
            (" ", Err(TOO_SHORT)),
            ("x", Err(INVALID)),
            ("(", Err(TOO_SHORT)),
            ("()", Ok("")),
            (" \r\n\t()", Ok("")),
            ("() ", Ok(" ")),
            ("()z", Ok("z")),
            ("(x)", Ok("")),
            ("(())", Ok("")),
            ("((()))", Ok("")),
            ("(x(x(x)x)x)", Ok("")),
            ("( x ( x ( x ) x ) x )", Ok("")),
            (r"(\)", Err(TOO_SHORT)),
            (r"(\()", Ok("")),
            (r"(\))", Ok("")),
            (r"(\\)", Ok("")),
            ("(()())", Ok("")),
            ("( x ( x ) x ( x ) x )", Ok("")),
        ];

        for (input, expected) in cases.iter() {
            let actual = comment_2822(input).map(|(rest, _)| rest);
            assert_eq!(
                *expected, actual,
                "{:?} expected to produce {:?}, but produced {:?}.",
                input, expected, actual
            );
        }
    }

    /// Legacy zone names are case-insensitive; numeric offsets still work.
    #[test]
    fn test_timezone_offset_2822() {
        let accepted = [("cSt", -21600), ("pSt", -28800), ("mSt", -25200), ("-1551", -57060)];
        for &(input, offset) in accepted.iter() {
            assert_eq!(timezone_offset_2822(input), Ok(("", offset)));
        }
        assert_eq!(timezone_offset_2822("Gp"), Err(INVALID));
    }

    /// Mixed-case abbreviations parse and a non-ASCII tail is left untouched.
    #[test]
    fn test_short_or_long_month0() {
        let accepted = [
            ("JUn", "", 5u8),
            ("mAy", "", 4),
            ("AuG", "", 7),
            ("Aprâ", "â", 3),
            ("JUl", "", 6),
            ("mAr", "", 2),
            ("Jan", "", 0),
        ];
        for &(input, rest, month0) in accepted.iter() {
            assert_eq!(short_or_long_month0(input), Ok((rest, month0)));
        }
    }

    /// An incomplete long-name suffix is not consumed.
    #[test]
    fn test_short_or_long_weekday() {
        let accepted = [("sAtu", "u", Weekday::Sat), ("thu", "", Weekday::Thu)];
        for &(input, rest, weekday) in accepted.iter() {
            assert_eq!(short_or_long_weekday(input), Ok((rest, weekday)));
        }
    }

    /// Zero digits on empty input yields zero; one digit on empty input fails.
    #[test]
    fn test_nanosecond_fixed() {
        assert_eq!(nanosecond_fixed("", 0usize), Ok(("", 0)));
        assert!(nanosecond_fixed("", 1usize).is_err());
    }

    /// A single digit is scaled to nanoseconds; a non-ASCII tail is left untouched.
    #[test]
    fn test_nanosecond() {
        let accepted = [("2Ù", "Ù", 200000000i64), ("8", "", 800000000)];
        for &(input, rest, nanos) in accepted.iter() {
            assert_eq!(nanosecond(input), Ok((rest, nanos)));
        }
    }
}
| 437 | |