scan.rs source code [crates/chrono-0.4.35/src/format/scan.rs]

1	// This is a part of Chrono.
2	// See README.md and LICENSE.txt for details.
3
/*!
 * Various scanning routines for the parser.
 */
7
8	use super::{ParseResult, INVALID, OUT_OF_RANGE, TOO_SHORT};
9	use crate::Weekday;
10
11	/// Tries to parse the non-negative number from `min` to `max` digits.
12	///
13	/// The absence of digits at all is an unconditional error.
14	/// More than `max` digits are consumed up to the first `max` digits.
15	/// Any number that does not fit in `i64` is an error.
16	#[inline]
17	pub(super) fn number(s: &str, min: usize, max: usize) -> ParseResult<(&str, i64)> {
18	assert!(min <= max);
19
20	// We are only interested in ascii numbers, so we can work with the `str` as bytes. We stop on
21	// the first non-numeric byte, which may be another ascii character or beginning of multi-byte
22	// UTF-8 character.
23	let bytes = s.as_bytes();
24	if bytes.len() < min {
25	return Err(TOO_SHORT);
26	}
27
28	let mut n = `0i64`;
29	for (i, c) in bytes.iter().take(max).cloned().enumerate() {
30	// cloned() = copied()
31	if !c.is_ascii_digit() {
32	if i < min {
33	return Err(INVALID);
34	} else {
35	return Ok((&s[i..], n));
36	}
37	}
38
39	n = match n.checked_mul(`10`).and_then(\|n\| n.checked_add((c - b'0') as i64)) {
40	Some(n) => n,
41	None => return Err(OUT_OF_RANGE),
42	};
43	}
44
45	Ok((&s[core::cmp::min(max, bytes.len())..], n))
46	}
47
48	/// Tries to consume at least one digits as a fractional second.
49	/// Returns the number of whole nanoseconds (0--999,999,999).
50	pub(super) fn nanosecond(s: &str) -> ParseResult<(&str, i64)> {
51	// record the number of digits consumed for later scaling.
52	let origlen: usize = s.len();
53	let (s: &str, v: i64) = number(s, min:`1`, max:`9`)?;
54	let consumed: usize = origlen - s.len();
55
56	// scale the number accordingly.
57	static SCALE: [i64; `10`] =
58	[`0`, `100_000_000`, `10_000_000`, `1_000_000`, `100_000`, `10_000`, `1_000`, `100`, `10`, `1`];
59	let v: i64 = v.checked_mul(SCALE[consumed]).ok_or(OUT_OF_RANGE)?;
60
61	// if there are more than 9 digits, skip next digits.
62	let s: &str = s.trim_start_matches(\|c: char\| c.is_ascii_digit());
63
64	Ok((s, v))
65	}
66
67	/// Tries to consume a fixed number of digits as a fractional second.
68	/// Returns the number of whole nanoseconds (0--999,999,999).
69	pub(super) fn nanosecond_fixed(s: &str, digits: usize) -> ParseResult<(&str, i64)> {
70	// record the number of digits consumed for later scaling.
71	let (s: &str, v: i64) = number(s, min:digits, max:digits)?;
72
73	// scale the number accordingly.
74	static SCALE: [i64; `10`] =
75	[`0`, `100_000_000`, `10_000_000`, `1_000_000`, `100_000`, `10_000`, `1_000`, `100`, `10`, `1`];
76	let v: i64 = v.checked_mul(SCALE[digits]).ok_or(OUT_OF_RANGE)?;
77
78	Ok((s, v))
79	}
80
81	/// Tries to parse the month index (0 through 11) with the first three ASCII letters.
82	pub(super) fn short_month0(s: &str) -> ParseResult<(&str, u8)> {
83	if s.len() < `3` {
84	return Err(TOO_SHORT);
85	}
86	let buf: &[u8] = s.as_bytes();
87	let month0: u8 = match (buf[`0`] \| `32`, buf[`1`] \| `32`, buf[`2`] \| `32`) {
88	(b'j', b'a', b'n') => `0`,
89	(b'f', b'e', b'b') => `1`,
90	(b'm', b'a', b'r') => `2`,
91	(b'a', b'p', b'r') => `3`,
92	(b'm', b'a', b'y') => `4`,
93	(b'j', b'u', b'n') => `5`,
94	(b'j', b'u', b'l') => `6`,
95	(b'a', b'u', b'g') => `7`,
96	(b's', b'e', b'p') => `8`,
97	(b'o', b'c', b't') => `9`,
98	(b'n', b'o', b'v') => `10`,
99	(b'd', b'e', b'c') => `11`,
100	_ => return Err(INVALID),
101	};
102	Ok((&s[`3`..], month0))
103	}
104
105	/// Tries to parse the weekday with the first three ASCII letters.
106	pub(super) fn short_weekday(s: &str) -> ParseResult<(&str, Weekday)> {
107	if s.len() < `3` {
108	return Err(TOO_SHORT);
109	}
110	let buf: &[u8] = s.as_bytes();
111	let weekday: Weekday = match (buf[`0`] \| `32`, buf[`1`] \| `32`, buf[`2`] \| `32`) {
112	(b'm', b'o', b'n') => Weekday::Mon,
113	(b't', b'u', b'e') => Weekday::Tue,
114	(b'w', b'e', b'd') => Weekday::Wed,
115	(b't', b'h', b'u') => Weekday::Thu,
116	(b'f', b'r', b'i') => Weekday::Fri,
117	(b's', b'a', b't') => Weekday::Sat,
118	(b's', b'u', b'n') => Weekday::Sun,
119	_ => return Err(INVALID),
120	};
121	Ok((&s[`3`..], weekday))
122	}
123
124	/// Tries to parse the month index (0 through 11) with short or long month names.
125	/// It prefers long month names to short month names when both are possible.
126	pub(super) fn short_or_long_month0(s: &str) -> ParseResult<(&str, u8)> {
127	// lowercased month names, minus first three chars
128	static LONG_MONTH_SUFFIXES: [&[u8]; `12`] = [
129	b"uary", b"ruary", b"ch", b"il", b"", b"e", b"y", b"ust", b"tember", b"ober", b"ember",
130	b"ember",
131	];
132
133	let (mut s: &str, month0: u8) = short_month0(s)?;
134
135	// tries to consume the suffix if possible
136	let suffix: &[u8] = LONG_MONTH_SUFFIXES[month0 as usize];
137	if s.len() >= suffix.len() && s.as_bytes()[..suffix.len()].eq_ignore_ascii_case(suffix) {
138	s = &s[suffix.len()..];
139	}
140
141	Ok((s, month0))
142	}
143
144	/// Tries to parse the weekday with short or long weekday names.
145	/// It prefers long weekday names to short weekday names when both are possible.
146	pub(super) fn short_or_long_weekday(s: &str) -> ParseResult<(&str, Weekday)> {
147	// lowercased weekday names, minus first three chars
148	static LONG_WEEKDAY_SUFFIXES: [&[u8]; `7`] =
149	[b"day", b"sday", b"nesday", b"rsday", b"day", b"urday", b"day"];
150
151	let (mut s: &str, weekday: Weekday) = short_weekday(s)?;
152
153	// tries to consume the suffix if possible
154	let suffix: &[u8] = LONG_WEEKDAY_SUFFIXES[weekday.num_days_from_monday() as usize];
155	if s.len() >= suffix.len() && s.as_bytes()[..suffix.len()].eq_ignore_ascii_case(suffix) {
156	s = &s[suffix.len()..];
157	}
158
159	Ok((s, weekday))
160	}
161
162	/// Tries to consume exactly one given character.
163	pub(super) fn char(s: &str, c1: u8) -> ParseResult<&str> {
164	match s.as_bytes().first() {
165	Some(&c: u8) if c == c1 => Ok(&s[`1`..]),
166	Some(_) => Err(INVALID),
167	None => Err(TOO_SHORT),
168	}
169	}
170
171	/// Tries to consume one or more whitespace.
172	pub(super) fn space(s: &str) -> ParseResult<&str> {
173	let s_: &str = s.trim_start();
174	if s_.len() < s.len() {
175	Ok(s_)
176	} else if s.is_empty() {
177	Err(TOO_SHORT)
178	} else {
179	Err(INVALID)
180	}
181	}
182
183	/// Consumes any number (including zero) of colon or spaces.
184	pub(crate) fn colon_or_space(s: &str) -> ParseResult<&str> {
185	Ok(s.trim_start_matches(\|c: char\| c == ':' \|\| c.is_whitespace()))
186	}
187
188	/// Parse a timezone from `s` and return the offset in seconds.
189	///
190	/// The `consume_colon` function is used to parse a mandatory or optional `:`
191	/// separator between hours offset and minutes offset.
192	///
193	/// The `allow_missing_minutes` flag allows the timezone minutes offset to be
194	/// missing from `s`.
195	///
196	/// The `allow_tz_minus_sign` flag allows the timezone offset negative character
197	/// to also be `−` MINUS SIGN (U+2212) in addition to the typical
198	/// ASCII-compatible `-` HYPHEN-MINUS (U+2D).
199	/// This is part of [RFC 3339 & ISO 8601].
200	///
201	/// [RFC 3339 & ISO 8601]: https://en.wikipedia.org/w/index.php?title=ISO_8601&oldid=1114309368#Time_offsets_from_UTC
202	pub(crate) fn timezone_offset<F>(
203	mut s: &str,
204	mut consume_colon: F,
205	allow_zulu: bool,
206	allow_missing_minutes: bool,
207	allow_tz_minus_sign: bool,
208	) -> ParseResult<(&str, i32)>
209	where
210	F: FnMut(&str) -> ParseResult<&str>,
211	{
212	if allow_zulu {
213	if let Some(&b'Z' \| &b'z') = s.as_bytes().first() {
214	return Ok((&s[`1`..], `0`));
215	}
216	}
217
218	const fn digits(s: &str) -> ParseResult<(u8, u8)> {
219	let b = s.as_bytes();
220	if b.len() < `2` {
221	Err(TOO_SHORT)
222	} else {
223	Ok((b[`0`], b[`1`]))
224	}
225	}
226	let negative = match s.chars().next() {
227	Some('+') => {
228	// PLUS SIGN (U+2B)
229	s = &s['+'.len_utf8()..];
230
231	`false`
232	}
233	Some('-') => {
234	// HYPHEN-MINUS (U+2D)
235	s = &s['-'.len_utf8()..];
236
237	`true`
238	}
239	Some('−') => {
240	// MINUS SIGN (U+2212)
241	if !allow_tz_minus_sign {
242	return Err(INVALID);
243	}
244	s = &s['−'.len_utf8()..];
245
246	`true`
247	}
248	Some(_) => return Err(INVALID),
249	None => return Err(TOO_SHORT),
250	};
251
252	// hours (00--99)
253	let hours = match digits(s)? {
254	(h1 @ b'0'..=b'9', h2 @ b'0'..=b'9') => i32::from((h1 - b'0') * `10` + (h2 - b'0')),
255	_ => return Err(INVALID),
256	};
257	s = &s[`2`..];
258
259	// colons (and possibly other separators)
260	s = consume_colon(s)?;
261
262	// minutes (00--59)
263	// if the next two items are digits then we have to add minutes
264	let minutes = if let Ok(ds) = digits(s) {
265	match ds {
266	(m1 @ b'0'..=b'5', m2 @ b'0'..=b'9') => i32::from((m1 - b'0') * `10` + (m2 - b'0')),
267	(b'6'..=b'9', b'0'..=b'9') => return Err(OUT_OF_RANGE),
268	_ => return Err(INVALID),
269	}
270	} else if allow_missing_minutes {
271	`0`
272	} else {
273	return Err(TOO_SHORT);
274	};
275	s = match s.len() {
276	len if len >= `2` => &s[`2`..],
277	`0` => s,
278	_ => return Err(TOO_SHORT),
279	};
280
281	let seconds = hours * `3600` + minutes * `60`;
282	Ok((s, if negative { -seconds } else { seconds }))
283	}
284
285	/// Same as `timezone_offset` but also allows for RFC 2822 legacy timezones.
286	/// May return `None` which indicates an insufficient offset data (i.e. `-0000`).
287	/// See [RFC 2822 Section 4.3].
288	///
289	/// [RFC 2822 Section 4.3]: https://tools.ietf.org/html/rfc2822#section-4.3
290	pub(super) fn timezone_offset_2822(s: &str) -> ParseResult<(&str, i32)> {
291	// tries to parse legacy time zone names
292	let upto = s.as_bytes().iter().position(\|&c\| !c.is_ascii_alphabetic()).unwrap_or(s.len());
293	if upto > `0` {
294	let name = &s.as_bytes()[..upto];
295	let s = &s[upto..];
296	let offset_hours = \|o\| Ok((s, o * `3600`));
297	// RFC 2822 requires support for some named North America timezones, a small subset of all
298	// named timezones.
299	if name.eq_ignore_ascii_case(b"gmt")
300	\|\| name.eq_ignore_ascii_case(b"ut")
301	\|\| name.eq_ignore_ascii_case(b"z")
302	{
303	return offset_hours(`0`);
304	} else if name.eq_ignore_ascii_case(b"edt") {
305	return offset_hours(`-4`);
306	} else if name.eq_ignore_ascii_case(b"est") \|\| name.eq_ignore_ascii_case(b"cdt") {
307	return offset_hours(`-5`);
308	} else if name.eq_ignore_ascii_case(b"cst") \|\| name.eq_ignore_ascii_case(b"mdt") {
309	return offset_hours(`-6`);
310	} else if name.eq_ignore_ascii_case(b"mst") \|\| name.eq_ignore_ascii_case(b"pdt") {
311	return offset_hours(`-7`);
312	} else if name.eq_ignore_ascii_case(b"pst") {
313	return offset_hours(`-8`);
314	} else if name.len() == `1` {
315	if let b'a'..=b'i' \| b'k'..=b'y' \| b'A'..=b'I' \| b'K'..=b'Y' = name[`0`] {
316	// recommended by RFC 2822: consume but treat it as -0000
317	return Ok((s, `0`));
318	}
319	}
320	Err(INVALID)
321	} else {
322	timezone_offset(s, \|s\| Ok(s), `false`, `false`, `false`)
323	}
324	}
325
326	/// Tries to consume an RFC2822 comment including preceding ` `.
327	///
328	/// Returns the remaining string after the closing parenthesis.
329	pub(super) fn comment_2822(s: &str) -> ParseResult<(&str, ())> {
330	use CommentState::*;
331
332	let s: &str = s.trim_start();
333
334	let mut state: CommentState = Start;
335	for (i: usize, c: u8) in s.bytes().enumerate() {
336	state = match (state, c) {
337	(Start, b'(') => Next(`1`),
338	(Next(`1`), b')') => return Ok((&s[i + `1`..], ())),
339	(Next(depth: usize), b'`\\`') => Escape(depth),
340	(Next(depth: usize), b'(') => Next(depth + `1`),
341	(Next(depth: usize), b')') => Next(depth - `1`),
342	(Next(depth: usize), _) \| (Escape(depth: usize), _) => Next(depth),
343	_ => return Err(INVALID),
344	};
345	}
346
347	Err(TOO_SHORT)
348	}
349
350	enum CommentState {
351	Start,
352	Next(usize),
353	Escape(usize),
354	}
355
// Unit tests for the scanning helpers in this module.
#[cfg(test)]
mod tests {
    use super::{
        comment_2822, nanosecond, nanosecond_fixed, short_or_long_month0, short_or_long_weekday,
        timezone_offset_2822,
    };
    use crate::format::{INVALID, TOO_SHORT};
    use crate::Weekday;

    #[test]
    fn test_rfc2822_comments() {
        // (input, expected remainder or error)
        let testdata = [
            ("", Err(TOO_SHORT)),
            (" ", Err(TOO_SHORT)),
            ("x", Err(INVALID)),
            ("(", Err(TOO_SHORT)),
            ("()", Ok("")),
            (" \r\n\t()", Ok("")),
            ("() ", Ok(" ")),
            ("()z", Ok("z")),
            ("(x)", Ok("")),
            ("(())", Ok("")),
            ("((()))", Ok("")),
            ("(x(x(x)x)x)", Ok("")),
            ("( x ( x ( x ) x ) x )", Ok("")),
            (r"(\)", Err(TOO_SHORT)),
            (r"(\()", Ok("")),
            (r"(\))", Ok("")),
            (r"(\\)", Ok("")),
            ("(()())", Ok("")),
            ("( x ( x ) x ( x ) x )", Ok("")),
        ];

        for (test_in, expected) in testdata.iter() {
            let actual = comment_2822(test_in).map(|(s, _)| s);
            assert_eq!(
                *expected, actual,
                "{:?} expected to produce {:?}, but produced {:?}.",
                test_in, expected, actual
            );
        }
    }

    #[test]
    fn test_timezone_offset_2822() {
        assert_eq!(timezone_offset_2822("cSt").unwrap(), ("", -21600));
        assert_eq!(timezone_offset_2822("pSt").unwrap(), ("", -28800));
        assert_eq!(timezone_offset_2822("mSt").unwrap(), ("", -25200));
        assert_eq!(timezone_offset_2822("-1551").unwrap(), ("", -57060));
        assert_eq!(timezone_offset_2822("Gp"), Err(INVALID));
    }

    #[test]
    fn test_short_or_long_month0() {
        assert_eq!(short_or_long_month0("JUn").unwrap(), ("", 5));
        assert_eq!(short_or_long_month0("mAy").unwrap(), ("", 4));
        assert_eq!(short_or_long_month0("AuG").unwrap(), ("", 7));
        // a multi-byte character after the short name must be left untouched
        assert_eq!(short_or_long_month0("Aprâ").unwrap(), ("â", 3));
        assert_eq!(short_or_long_month0("JUl").unwrap(), ("", 6));
        assert_eq!(short_or_long_month0("mAr").unwrap(), ("", 2));
        assert_eq!(short_or_long_month0("Jan").unwrap(), ("", 0));
    }

    #[test]
    fn test_short_or_long_weekday() {
        // a partial long name is not consumed beyond the three-letter prefix
        assert_eq!(short_or_long_weekday("sAtu").unwrap(), ("u", Weekday::Sat));
        assert_eq!(short_or_long_weekday("thu").unwrap(), ("", Weekday::Thu));
    }

    #[test]
    fn test_nanosecond_fixed() {
        assert_eq!(nanosecond_fixed("", 0usize).unwrap(), ("", 0));
        assert!(nanosecond_fixed("", 1usize).is_err());
    }

    #[test]
    fn test_nanosecond() {
        // scanning stops at the first byte of a multi-byte character
        assert_eq!(nanosecond("2Ù").unwrap(), ("Ù", 200000000));
        assert_eq!(nanosecond("8").unwrap(), ("", 800000000));
    }
}
437