literal.rs source code [crates/cexpr/src/literal.rs]

1	// (C) Copyright 2016 Jethro G. Beekman
2	//
3	// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
4	// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
5	// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
6	// option. This file may not be copied, modified, or distributed
7	// except according to those terms.
8	//! Parsing C literals from byte slices.
9	//!
10	//! This will parse a representation of a C literal into a Rust type.
11	//!
12	//! # characters
13	//! Character literals are stored into the `CChar` type, which can hold values
14	//! that are not valid Unicode code points. ASCII characters are represented as
15	//! `char`, literal bytes with the high bit set are converted into the raw
16	//! representation. Escape sequences are supported. If hex and octal escapes
17	//! map to an ASCII character, that is used, otherwise, the raw encoding is
18	//! used, including for values over 255. Unicode escapes are checked for
19	//! validity and mapped to `char`. Character sequences are not supported. Width
20	//! prefixes are ignored.
21	//!
22	//! # strings
23	//! Strings are interpreted as byte vectors. Escape sequences are supported. If
24	//! hex and octal escapes map onto multi-byte characters, they are truncated to
25	//! one 8-bit character. Unicode escapes are converted into their UTF-8
26	//! encoding. Width prefixes are ignored.
27	//!
28	//! # integers
29	//! Integers are read into `i64`. Binary, octal, decimal and hexadecimal are
30	//! all supported. If the literal value is between `i64::MAX` and `u64::MAX`,
31	//! it is bit-cast to `i64`. Values over `u64::MAX` cannot be parsed. Width and
32	//! sign suffixes are ignored. Sign prefixes are not supported.
33	//!
34	//! # real numbers
35	//! Reals are read into `f64`. Width suffixes are ignored. Sign prefixes are
36	//! not supported in the significand. Hexadecimal floating points are not
37	//! supported.
38
39	use std::char;
40	use std::str::{self, FromStr};
41
42	use nom::branch::alt;
43	use nom::bytes::complete::is_not;
44	use nom::bytes::complete::tag;
45	use nom::character::complete::{char, one_of};
46	use nom::combinator::{complete, map, map_opt, opt, recognize};
47	use nom::multi::{fold_many0, many0, many1, many_m_n};
48	use nom::sequence::{delimited, pair, preceded, terminated, tuple};
49	use nom::*;
50
51	use crate::expr::EvalResult;
52	use crate::ToCexprResult;
53
/// A single C character value.
///
/// C character literals can denote values that are not valid Unicode scalar
/// values, so the value is kept in one of two forms.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum CChar {
    /// The value is representable as a Rust `char`.
    Char(char),
    /// Any other value (8-bit characters, unicode surrogates, etc.), kept as
    /// its raw numeric value.
    Raw(u64),
}
62
63	impl From<u8> for CChar {
64	fn from(i: u8) -> CChar {
65	match i {
66	`0`..=`0x7f` => CChar::Char(i as u8 as char),
67	_ => CChar::Raw(i as u64),
68	}
69	}
70	}
71
72	// A non-allocating version of this would be nice...
73	impl std::convert::Into<Vec<u8>> for CChar {
74	fn into(self) -> Vec<u8> {
75	match self {
76	CChar::Char(c: char) => {
77	let mut s: String = String::with_capacity(`4`);
78	s.extend(&[c]);
79	s.into_bytes()
80	}
81	CChar::Raw(i: u64) => {
82	let mut v: Vec = Vec::with_capacity(`1`);
83	v.push(i as u8);
84	v
85	}
86	}
87	}
88	}
89
90	/// ensures the child parser consumes the whole input
91	pub fn full<I: Clone, O, F>(
92	f: F,
93	) -> impl Fn(I) -> nom::IResult<I, O>
94	where
95	I: nom::InputLength,
96	F: Fn(I) -> nom::IResult<I, O>,
97	{
98	move \|input: I\| {
99	let res: Result<(I, O), Err>> = f(input);
100	match res {
101	Ok((i: I, o: O)) => {
102	if i.input_len() == `0` {
103	Ok((i, o))
104	} else {
105	Err(nom::Err::Error(nom::error::Error::new(input:i, code:nom::error::ErrorKind::Complete)))
106	}
107	}
108	r: Result<(I, O), Err>> => r,
109	}
110	}
111	}
112
113	// =================================
114	// ======== matching digits ========
115	// =================================
116
117	macro_rules! byte {
118	($($p: pat)\|* ) => {{
119	fn parser(i: &[u8]) -> crate::nom::IResult<&[u8], u8> {
120	match i.split_first() {
121	$(Some((&c @ $p,rest)))\|* => Ok((rest,c)),
122	Some(_) => Err(nom::Err::Error(nom::error::Error::new(i, nom::error::ErrorKind::OneOf))),
123	None => Err(nom::Err::Incomplete(Needed::new(`1`))),
124	}
125	}
126
127	parser
128	}}
129	}
130
131	fn binary(i: &[u8]) -> nom::IResult<&[u8], u8> {
132	byte!(b'0'..=b'1')(i)
133	}
134
135	fn octal(i: &[u8]) -> nom::IResult<&[u8], u8> {
136	byte!(b'0'..=b'7')(i)
137	}
138
139	fn decimal(i: &[u8]) -> nom::IResult<&[u8], u8> {
140	byte!(b'0'..=b'9')(i)
141	}
142
143	fn hexadecimal(i: &[u8]) -> nom::IResult<&[u8], u8> {
144	byte!(b'0' ..= b'9' \| b'a' ..= b'f' \| b'A' ..= b'F')(i)
145	}
146
147	// ========================================
148	// ======== characters and strings ========
149	// ========================================
150
151	fn escape2char(c: char) -> CChar {
152	CChar::Char(match c {
153	'a' => '`\x07`',
154	'b' => '`\x08`',
155	'f' => '`\x0c`',
156	'n' => '`\n`',
157	'r' => '`\r`',
158	't' => '`\t`',
159	'v' => '`\x0b`',
160	_ => unreachable!("invalid escape {}", c),
161	})
162	}
163
164	fn c_raw_escape(n: Vec<u8>, radix: u32) -> Option<CChar> {
165	strOption::from_utf8(&n)
166	.ok()
167	.and_then(\|i: &str\| u64::from_str_radix(src:i, radix).ok())
168	.map(\|i: u64\| match i {
169	`0`..=`0x7f` => CChar::Char(i as u8 as char),
170	_ => CChar::Raw(i),
171	})
172	}
173
174	fn c_unicode_escape(n: Vec<u8>) -> Option<CChar> {
175	strOption::from_utf8(&n)
176	.ok()
177	.and_then(\|i: &str\| u32::from_str_radix(src:i, radix:`16`).ok())
178	.and_then(char::from_u32)
179	.map(CChar::Char)
180	}
181
182	fn escaped_char(i: &[u8]) -> nom::IResult<&[u8], CChar> {
183	preceded(
184	first:char('`\\`'),
185	second:alt((
186	map(parser:one_of(r#"'"?\"#), f:CChar::Char),
187	map(parser:one_of("abfnrtv"), f:escape2char),
188	map_opt(parser:many_m_n(`1`, `3`, octal), \|v: Vec\| c_raw_escape(n:v, radix:`8`)),
189	map_opt(parser:preceded(char('x'), many1(hexadecimal)), \|v: Vec\| {
190	c_raw_escape(n:v, radix:`16`)
191	}),
192	map_opt(
193	parser:preceded(char('u'), many_m_n(`4`, `4`, hexadecimal)),
194	f:c_unicode_escape,
195	),
196	map_opt(
197	parser:preceded(char('U'), many_m_n(`8`, `8`, hexadecimal)),
198	f:c_unicode_escape,
199	),
200	)),
201	)(i)
202	}
203
204	fn c_width_prefix(i: &[u8]) -> nom::IResult<&[u8], &[u8]> {
205	alt((tag("u8"), tag("u"), tag("U"), tag("L")))(i)
206	}
207
208	fn c_char(i: &[u8]) -> nom::IResult<&[u8], CChar> {
209	delimited(
210	first:terminated(opt(c_width_prefix), char('`\'`')),
211	second:alt((
212	escaped_char,
213	map(byte!(`0` ..= `91` / \=92 / \| `93` ..= `255`), CChar::from),
214	)),
215	third:char('`\'`'),
216	)(i)
217	}
218
219	fn c_string(i: &[u8]) -> nom::IResult<&[u8], Vec<u8>> {
220	delimited(
221	first:alt((preceded(c_width_prefix, char('"')), char('"'))),
222	second:fold_many0(
223	alt((
224	map(escaped_char, \|c: CChar\| c.into()),
225	map(is_not([b'`\\`', b'"']), \|c: &[u8]\| c.into()),
226	)),
227	Vec::new,
228	\|mut v: Vec<u8>, res: Vec<u8>\| {
229	v.extend_from_slice(&res);
230	v
231	},
232	),
233	third:char('"'),
234	)(i)
235	}
236
237	// ================================
238	// ======== parse integers ========
239	// ================================
240
/// Converts the digit bytes `n` to a `u64`, reading them in base `radix`.
///
/// Returns `None` when the bytes are not valid UTF-8 or do not form a number
/// that fits in a `u64` in the given base.
fn c_int_radix(n: Vec<u8>, radix: u32) -> Option<u64> {
    let digits = str::from_utf8(&n).ok()?;
    u64::from_str_radix(digits, radix).ok()
}
246
247	fn take_ul(input: &[u8]) -> IResult<&[u8], &[u8]> {
248	let r: Result<(&[u8], &[u8]), Err<…>> = input.split_at_position(\|c: u8\| c != b'u' && c != b'U' && c != b'l' && c != b'L');
249	match r {
250	Err(Err::Incomplete(_)) => Ok((&input[input.len()..], input)),
251	res: Result<(&[u8], &[u8]), Err<…>> => res,
252	}
253	}
254
255	fn c_int(i: &[u8]) -> nom::IResult<&[u8], i64> {
256	map(
257	terminated(
258	alt((
259	map_opt(preceded(tag("0x"), many1(complete(hexadecimal))), \|v\| {
260	c_int_radix(v, `16`)
261	}),
262	map_opt(preceded(tag("0X"), many1(complete(hexadecimal))), \|v\| {
263	c_int_radix(v, `16`)
264	}),
265	map_opt(preceded(tag("0b"), many1(complete(binary))), \|v\| {
266	c_int_radix(v, `2`)
267	}),
268	map_opt(preceded(tag("0B"), many1(complete(binary))), \|v\| {
269	c_int_radix(v, `2`)
270	}),
271	map_opt(preceded(char('0'), many1(complete(octal))), \|v\| {
272	c_int_radix(v, `8`)
273	}),
274	map_opt(many1(complete(decimal)), \|v\| c_int_radix(v, `10`)),
275	\|input\| Err(crate::nom::Err::Error(nom::error::Error::new(input, crate::nom::ErrorKind::Fix))),
276	)),
277	opt(take_ul),
278	),
279	\|i\| i as i64,
280	)(i)
281	}
282
283	// ==============================
284	// ======== parse floats ========
285	// ==============================
286
287	fn float_width(i: &[u8]) -> nom::IResult<&[u8], u8> {
288	nom::combinator::complete(byte!(b'f' \| b'l' \| b'F' \| b'L'))(i)
289	}
290
291	fn float_exp(i: &[u8]) -> nom::IResult<&[u8], (Option<u8>, Vec<u8>)> {
292	preceded(
293	first:byte!(b'e' \| b'E'),
294	second:pair(first:opt(byte!(b'-' \| b'+')), second:many1(complete(decimal))),
295	)(i)
296	}
297
298	fn c_float(i: &[u8]) -> nom::IResult<&[u8], f64> {
299	map_opt(
300	alt((
301	terminated(
302	recognize(tuple((
303	many1(complete(decimal)),
304	byte!(b'.'),
305	many0(complete(decimal)),
306	))),
307	opt(float_width),
308	),
309	terminated(
310	recognize(tuple((
311	many0(complete(decimal)),
312	byte!(b'.'),
313	many1(complete(decimal)),
314	))),
315	opt(float_width),
316	),
317	terminated(
318	recognize(tuple((
319	many0(complete(decimal)),
320	opt(byte!(b'.')),
321	many1(complete(decimal)),
322	float_exp,
323	))),
324	opt(float_width),
325	),
326	terminated(
327	recognize(tuple((
328	many1(complete(decimal)),
329	opt(byte!(b'.')),
330	many0(complete(decimal)),
331	float_exp,
332	))),
333	opt(float_width),
334	),
335	terminated(recognize(many1(complete(decimal))), float_width),
336	)),
337	\|v\| str::from_utf8(v).ok().and_then(\|i\| f64::from_str(i).ok()),
338	)(i)
339	}
340
341	// ================================
342	// ======== main interface ========
343	// ================================
344
345	fn one_literal(input: &[u8]) -> nom::IResult<&[u8], EvalResult, crate::Error<&[u8]>> {
346	altResult<(&[u8], EvalResult), …>((
347	map(parser:full(c_char), f:EvalResult::Char),
348	map(parser:full(c_int), \|i: i64\| EvalResult::Int(::std::num::Wrapping(i))),
349	map(parser:full(c_float), f:EvalResult::Float),
350	map(parser:full(c_string), f:EvalResult::Str),
351	))(input)
352	.to_cexpr_result()
353	}
354
355	/// Parse a C literal.
356	///
357	/// The input must contain exactly the representation of a single literal
358	/// token, and in particular no whitespace or sign prefixes.
359	pub fn parse(input: &[u8]) -> IResult<&[u8], EvalResult, crate::Error<&[u8]>> {
360	crate::assert_full_parse(result:one_literal(input))
361	}
362