1 | // (C) Copyright 2016 Jethro G. Beekman |
2 | // |
3 | // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or |
4 | // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license |
5 | // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your |
6 | // option. This file may not be copied, modified, or distributed |
7 | // except according to those terms. |
8 | //! Parsing C literals from byte slices. |
9 | //! |
10 | //! This will parse a representation of a C literal into a Rust type. |
11 | //! |
12 | //! # characters |
//! Character literals are stored into the `CChar` type, which can hold values
//! that are not valid Unicode code points. ASCII characters are represented as
//! `char`; literal bytes with the high bit set are converted into the raw
//! representation. Escape sequences are supported. If hex and octal escapes
//! map to an ASCII character, that is used; otherwise, the raw encoding is
//! used, including for values over 255. Unicode escapes are checked for
//! validity and mapped to `char`. Character sequences are not supported. Width
//! prefixes are ignored.
21 | //! |
22 | //! # strings |
23 | //! Strings are interpreted as byte vectors. Escape sequences are supported. If |
24 | //! hex and octal escapes map onto multi-byte characters, they are truncated to |
25 | //! one 8-bit character. Unicode escapes are converted into their UTF-8 |
26 | //! encoding. Width prefixes are ignored. |
27 | //! |
28 | //! # integers |
29 | //! Integers are read into `i64`. Binary, octal, decimal and hexadecimal are |
30 | //! all supported. If the literal value is between `i64::MAX` and `u64::MAX`, |
31 | //! it is bit-cast to `i64`. Values over `u64::MAX` cannot be parsed. Width and |
32 | //! sign suffixes are ignored. Sign prefixes are not supported. |
33 | //! |
34 | //! # real numbers |
35 | //! Reals are read into `f64`. Width suffixes are ignored. Sign prefixes are |
36 | //! not supported in the significand. Hexadecimal floating points are not |
37 | //! supported. |
38 | |
39 | use std::char; |
40 | use std::str::{self, FromStr}; |
41 | |
42 | use nom::branch::alt; |
43 | use nom::bytes::complete::is_not; |
44 | use nom::bytes::complete::tag; |
45 | use nom::character::complete::{char, one_of}; |
46 | use nom::combinator::{complete, map, map_opt, opt, recognize}; |
47 | use nom::multi::{fold_many0, many0, many1, many_m_n}; |
48 | use nom::sequence::{delimited, pair, preceded, terminated, tuple}; |
49 | use nom::*; |
50 | |
51 | use crate::expr::EvalResult; |
52 | use crate::ToCexprResult; |
53 | |
/// Representation of a C character.
///
/// A value is either something Rust can hold in a `char`, or a raw numeric
/// value for everything else.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum CChar {
    /// A character that can be represented as a `char`.
    Char(char),
    /// Any other character (8-bit characters, unicode surrogates, etc.),
    /// stored as its numeric value.
    Raw(u64),
}
62 | |
63 | impl From<u8> for CChar { |
64 | fn from(i: u8) -> CChar { |
65 | match i { |
66 | 0..=0x7f => CChar::Char(i as u8 as char), |
67 | _ => CChar::Raw(i as u64), |
68 | } |
69 | } |
70 | } |
71 | |
72 | // A non-allocating version of this would be nice... |
73 | impl std::convert::Into<Vec<u8>> for CChar { |
74 | fn into(self) -> Vec<u8> { |
75 | match self { |
76 | CChar::Char(c: char) => { |
77 | let mut s: String = String::with_capacity(4); |
78 | s.extend(&[c]); |
79 | s.into_bytes() |
80 | } |
81 | CChar::Raw(i: u64) => { |
82 | let mut v: Vec = Vec::with_capacity(1); |
83 | v.push(i as u8); |
84 | v |
85 | } |
86 | } |
87 | } |
88 | } |
89 | |
90 | /// ensures the child parser consumes the whole input |
91 | pub fn full<I: Clone, O, F>( |
92 | f: F, |
93 | ) -> impl Fn(I) -> nom::IResult<I, O> |
94 | where |
95 | I: nom::InputLength, |
96 | F: Fn(I) -> nom::IResult<I, O>, |
97 | { |
98 | move |input: I| { |
99 | let res: Result<(I, O), Err>> = f(input); |
100 | match res { |
101 | Ok((i: I, o: O)) => { |
102 | if i.input_len() == 0 { |
103 | Ok((i, o)) |
104 | } else { |
105 | Err(nom::Err::Error(nom::error::Error::new(input:i, code:nom::error::ErrorKind::Complete))) |
106 | } |
107 | } |
108 | r: Result<(I, O), Err>> => r, |
109 | } |
110 | } |
111 | } |
112 | |
113 | // ================================= |
114 | // ======== matching digits ======== |
115 | // ================================= |
116 | |
117 | macro_rules! byte { |
118 | ($($p: pat)|* ) => {{ |
119 | fn parser(i: &[u8]) -> crate::nom::IResult<&[u8], u8> { |
120 | match i.split_first() { |
121 | $(Some((&c @ $p,rest)))|* => Ok((rest,c)), |
122 | Some(_) => Err(nom::Err::Error(nom::error::Error::new(i, nom::error::ErrorKind::OneOf))), |
123 | None => Err(nom::Err::Incomplete(Needed::new(1))), |
124 | } |
125 | } |
126 | |
127 | parser |
128 | }} |
129 | } |
130 | |
131 | fn binary(i: &[u8]) -> nom::IResult<&[u8], u8> { |
132 | byte!(b'0' ..=b'1' )(i) |
133 | } |
134 | |
135 | fn octal(i: &[u8]) -> nom::IResult<&[u8], u8> { |
136 | byte!(b'0' ..=b'7' )(i) |
137 | } |
138 | |
139 | fn decimal(i: &[u8]) -> nom::IResult<&[u8], u8> { |
140 | byte!(b'0' ..=b'9' )(i) |
141 | } |
142 | |
143 | fn hexadecimal(i: &[u8]) -> nom::IResult<&[u8], u8> { |
144 | byte!(b'0' ..= b'9' | b'a' ..= b'f' | b'A' ..= b'F' )(i) |
145 | } |
146 | |
147 | // ======================================== |
148 | // ======== characters and strings ======== |
149 | // ======================================== |
150 | |
151 | fn escape2char(c: char) -> CChar { |
152 | CChar::Char(match c { |
153 | 'a' => ' \x07' , |
154 | 'b' => ' \x08' , |
155 | 'f' => ' \x0c' , |
156 | 'n' => ' \n' , |
157 | 'r' => ' \r' , |
158 | 't' => ' \t' , |
159 | 'v' => ' \x0b' , |
160 | _ => unreachable!("invalid escape {}" , c), |
161 | }) |
162 | } |
163 | |
164 | fn c_raw_escape(n: Vec<u8>, radix: u32) -> Option<CChar> { |
165 | strOption::from_utf8(&n) |
166 | .ok() |
167 | .and_then(|i: &str| u64::from_str_radix(src:i, radix).ok()) |
168 | .map(|i: u64| match i { |
169 | 0..=0x7f => CChar::Char(i as u8 as char), |
170 | _ => CChar::Raw(i), |
171 | }) |
172 | } |
173 | |
174 | fn c_unicode_escape(n: Vec<u8>) -> Option<CChar> { |
175 | strOption::from_utf8(&n) |
176 | .ok() |
177 | .and_then(|i: &str| u32::from_str_radix(src:i, radix:16).ok()) |
178 | .and_then(char::from_u32) |
179 | .map(CChar::Char) |
180 | } |
181 | |
182 | fn escaped_char(i: &[u8]) -> nom::IResult<&[u8], CChar> { |
183 | preceded( |
184 | first:char(' \\' ), |
185 | second:alt(( |
186 | map(parser:one_of(r#"'"?\"# ), f:CChar::Char), |
187 | map(parser:one_of("abfnrtv" ), f:escape2char), |
188 | map_opt(parser:many_m_n(1, 3, octal), |v: Vec| c_raw_escape(n:v, radix:8)), |
189 | map_opt(parser:preceded(char('x' ), many1(hexadecimal)), |v: Vec| { |
190 | c_raw_escape(n:v, radix:16) |
191 | }), |
192 | map_opt( |
193 | parser:preceded(char('u' ), many_m_n(4, 4, hexadecimal)), |
194 | f:c_unicode_escape, |
195 | ), |
196 | map_opt( |
197 | parser:preceded(char('U' ), many_m_n(8, 8, hexadecimal)), |
198 | f:c_unicode_escape, |
199 | ), |
200 | )), |
201 | )(i) |
202 | } |
203 | |
204 | fn c_width_prefix(i: &[u8]) -> nom::IResult<&[u8], &[u8]> { |
205 | alt((tag("u8" ), tag("u" ), tag("U" ), tag("L" )))(i) |
206 | } |
207 | |
208 | fn c_char(i: &[u8]) -> nom::IResult<&[u8], CChar> { |
209 | delimited( |
210 | first:terminated(opt(c_width_prefix), char(' \'' )), |
211 | second:alt(( |
212 | escaped_char, |
213 | map(byte!(0 ..= 91 /* \=92 */ | 93 ..= 255), CChar::from), |
214 | )), |
215 | third:char(' \'' ), |
216 | )(i) |
217 | } |
218 | |
219 | fn c_string(i: &[u8]) -> nom::IResult<&[u8], Vec<u8>> { |
220 | delimited( |
221 | first:alt((preceded(c_width_prefix, char('"' )), char('"' ))), |
222 | second:fold_many0( |
223 | alt(( |
224 | map(escaped_char, |c: CChar| c.into()), |
225 | map(is_not([b' \\' , b'"' ]), |c: &[u8]| c.into()), |
226 | )), |
227 | Vec::new, |
228 | |mut v: Vec<u8>, res: Vec<u8>| { |
229 | v.extend_from_slice(&res); |
230 | v |
231 | }, |
232 | ), |
233 | third:char('"' ), |
234 | )(i) |
235 | } |
236 | |
237 | // ================================ |
238 | // ======== parse integers ======== |
239 | // ================================ |
240 | |
/// Converts a run of ASCII digits into a `u64` using the given radix.
///
/// Returns `None` when `n` is not valid UTF-8, is empty, contains a digit
/// that is invalid for `radix`, or denotes a value larger than `u64::MAX`.
fn c_int_radix(n: Vec<u8>, radix: u32) -> Option<u64> {
    let digits = str::from_utf8(&n).ok()?;
    u64::from_str_radix(digits, radix).ok()
}
246 | |
247 | fn take_ul(input: &[u8]) -> IResult<&[u8], &[u8]> { |
248 | let r: Result<(&[u8], &[u8]), Err<…>> = input.split_at_position(|c: u8| c != b'u' && c != b'U' && c != b'l' && c != b'L' ); |
249 | match r { |
250 | Err(Err::Incomplete(_)) => Ok((&input[input.len()..], input)), |
251 | res: Result<(&[u8], &[u8]), Err<…>> => res, |
252 | } |
253 | } |
254 | |
255 | fn c_int(i: &[u8]) -> nom::IResult<&[u8], i64> { |
256 | map( |
257 | terminated( |
258 | alt(( |
259 | map_opt(preceded(tag("0x" ), many1(complete(hexadecimal))), |v| { |
260 | c_int_radix(v, 16) |
261 | }), |
262 | map_opt(preceded(tag("0X" ), many1(complete(hexadecimal))), |v| { |
263 | c_int_radix(v, 16) |
264 | }), |
265 | map_opt(preceded(tag("0b" ), many1(complete(binary))), |v| { |
266 | c_int_radix(v, 2) |
267 | }), |
268 | map_opt(preceded(tag("0B" ), many1(complete(binary))), |v| { |
269 | c_int_radix(v, 2) |
270 | }), |
271 | map_opt(preceded(char('0' ), many1(complete(octal))), |v| { |
272 | c_int_radix(v, 8) |
273 | }), |
274 | map_opt(many1(complete(decimal)), |v| c_int_radix(v, 10)), |
275 | |input| Err(crate::nom::Err::Error(nom::error::Error::new(input, crate::nom::ErrorKind::Fix))), |
276 | )), |
277 | opt(take_ul), |
278 | ), |
279 | |i| i as i64, |
280 | )(i) |
281 | } |
282 | |
283 | // ============================== |
284 | // ======== parse floats ======== |
285 | // ============================== |
286 | |
287 | fn float_width(i: &[u8]) -> nom::IResult<&[u8], u8> { |
288 | nom::combinator::complete(byte!(b'f' | b'l' | b'F' | b'L' ))(i) |
289 | } |
290 | |
291 | fn float_exp(i: &[u8]) -> nom::IResult<&[u8], (Option<u8>, Vec<u8>)> { |
292 | preceded( |
293 | first:byte!(b'e' | b'E' ), |
294 | second:pair(first:opt(byte!(b'-' | b'+' )), second:many1(complete(decimal))), |
295 | )(i) |
296 | } |
297 | |
298 | fn c_float(i: &[u8]) -> nom::IResult<&[u8], f64> { |
299 | map_opt( |
300 | alt(( |
301 | terminated( |
302 | recognize(tuple(( |
303 | many1(complete(decimal)), |
304 | byte!(b'.' ), |
305 | many0(complete(decimal)), |
306 | ))), |
307 | opt(float_width), |
308 | ), |
309 | terminated( |
310 | recognize(tuple(( |
311 | many0(complete(decimal)), |
312 | byte!(b'.' ), |
313 | many1(complete(decimal)), |
314 | ))), |
315 | opt(float_width), |
316 | ), |
317 | terminated( |
318 | recognize(tuple(( |
319 | many0(complete(decimal)), |
320 | opt(byte!(b'.' )), |
321 | many1(complete(decimal)), |
322 | float_exp, |
323 | ))), |
324 | opt(float_width), |
325 | ), |
326 | terminated( |
327 | recognize(tuple(( |
328 | many1(complete(decimal)), |
329 | opt(byte!(b'.' )), |
330 | many0(complete(decimal)), |
331 | float_exp, |
332 | ))), |
333 | opt(float_width), |
334 | ), |
335 | terminated(recognize(many1(complete(decimal))), float_width), |
336 | )), |
337 | |v| str::from_utf8(v).ok().and_then(|i| f64::from_str(i).ok()), |
338 | )(i) |
339 | } |
340 | |
341 | // ================================ |
342 | // ======== main interface ======== |
343 | // ================================ |
344 | |
345 | fn one_literal(input: &[u8]) -> nom::IResult<&[u8], EvalResult, crate::Error<&[u8]>> { |
346 | altResult<(&[u8], EvalResult), …>(( |
347 | map(parser:full(c_char), f:EvalResult::Char), |
348 | map(parser:full(c_int), |i: i64| EvalResult::Int(::std::num::Wrapping(i))), |
349 | map(parser:full(c_float), f:EvalResult::Float), |
350 | map(parser:full(c_string), f:EvalResult::Str), |
351 | ))(input) |
352 | .to_cexpr_result() |
353 | } |
354 | |
355 | /// Parse a C literal. |
356 | /// |
357 | /// The input must contain exactly the representation of a single literal |
358 | /// token, and in particular no whitespace or sign prefixes. |
359 | pub fn parse(input: &[u8]) -> IResult<&[u8], EvalResult, crate::Error<&[u8]>> { |
360 | crate::assert_full_parse(result:one_literal(input)) |
361 | } |
362 | |