| 1 | // (C) Copyright 2016 Jethro G. Beekman |
| 2 | // |
| 3 | // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or |
| 4 | // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license |
| 5 | // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your |
| 6 | // option. This file may not be copied, modified, or distributed |
| 7 | // except according to those terms. |
| 8 | //! Parsing C literals from byte slices. |
| 9 | //! |
| 10 | //! This will parse a representation of a C literal into a Rust type. |
| 11 | //! |
| 12 | //! # characters |
| 13 | //! Character literals are stored into the `CChar` type, which can hold values |
| 14 | //! that are not valid Unicode code points. ASCII characters are represented as |
//! `char`, literal bytes with the high bit set are converted into the raw
| 16 | //! representation. Escape sequences are supported. If hex and octal escapes |
| 17 | //! map to an ASCII character, that is used, otherwise, the raw encoding is |
| 18 | //! used, including for values over 255. Unicode escapes are checked for |
| 19 | //! validity and mapped to `char`. Character sequences are not supported. Width |
| 20 | //! prefixes are ignored. |
| 21 | //! |
| 22 | //! # strings |
| 23 | //! Strings are interpreted as byte vectors. Escape sequences are supported. If |
| 24 | //! hex and octal escapes map onto multi-byte characters, they are truncated to |
| 25 | //! one 8-bit character. Unicode escapes are converted into their UTF-8 |
| 26 | //! encoding. Width prefixes are ignored. |
| 27 | //! |
| 28 | //! # integers |
| 29 | //! Integers are read into `i64`. Binary, octal, decimal and hexadecimal are |
| 30 | //! all supported. If the literal value is between `i64::MAX` and `u64::MAX`, |
| 31 | //! it is bit-cast to `i64`. Values over `u64::MAX` cannot be parsed. Width and |
| 32 | //! sign suffixes are ignored. Sign prefixes are not supported. |
| 33 | //! |
| 34 | //! # real numbers |
| 35 | //! Reals are read into `f64`. Width suffixes are ignored. Sign prefixes are |
| 36 | //! not supported in the significand. Hexadecimal floating points are not |
| 37 | //! supported. |
| 38 | |
| 39 | use std::char; |
| 40 | use std::str::{self, FromStr}; |
| 41 | |
| 42 | use nom::branch::alt; |
| 43 | use nom::bytes::complete::is_not; |
| 44 | use nom::bytes::complete::tag; |
| 45 | use nom::character::complete::{char, one_of}; |
| 46 | use nom::combinator::{complete, map, map_opt, opt, recognize}; |
| 47 | use nom::multi::{fold_many0, many0, many1, many_m_n}; |
| 48 | use nom::sequence::{delimited, pair, preceded, terminated, tuple}; |
| 49 | use nom::*; |
| 50 | |
| 51 | use crate::expr::EvalResult; |
| 52 | use crate::ToCexprResult; |
| 53 | |
/// Representation of a C character
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum CChar {
    /// A character that can be represented as a `char`
    Char(char),
    /// Any other character (8-bit characters, unicode surrogates, etc.)
    Raw(u64),
}

/// Classifies a single byte: ASCII bytes (`0..=0x7f`) become `CChar::Char`,
/// every other byte is kept verbatim as `CChar::Raw`.
impl From<u8> for CChar {
    fn from(i: u8) -> CChar {
        match i {
            0..=0x7f => CChar::Char(i as char),
            _ => CChar::Raw(u64::from(i)),
        }
    }
}

// A non-allocating version of this would be nice...
/// Converts a character into the bytes it contributes to a C string.
///
/// `CChar::Char` yields the UTF-8 encoding of the character (1 to 4 bytes).
/// `CChar::Raw` yields exactly one byte: the value truncated to its low
/// 8 bits.
///
/// Implemented as `From` rather than `Into` so that both
/// `Vec::<u8>::from(c)` and `c.into()` are available (the latter through the
/// standard blanket impl).
impl From<CChar> for Vec<u8> {
    fn from(c: CChar) -> Vec<u8> {
        match c {
            CChar::Char(c) => {
                let mut buf = [0u8; 4];
                c.encode_utf8(&mut buf).as_bytes().to_vec()
            }
            // Truncation to a single byte is intentional.
            CChar::Raw(i) => vec![i as u8],
        }
    }
}
| 89 | |
| 90 | /// ensures the child parser consumes the whole input |
| 91 | pub fn full<I: Clone, O, F>( |
| 92 | f: F, |
| 93 | ) -> impl Fn(I) -> nom::IResult<I, O> |
| 94 | where |
| 95 | I: nom::InputLength, |
| 96 | F: Fn(I) -> nom::IResult<I, O>, |
| 97 | { |
| 98 | move |input: I| { |
| 99 | let res: Result<(I, O), Err>> = f(input); |
| 100 | match res { |
| 101 | Ok((i: I, o: O)) => { |
| 102 | if i.input_len() == 0 { |
| 103 | Ok((i, o)) |
| 104 | } else { |
| 105 | Err(nom::Err::Error(nom::error::Error::new(input:i, code:nom::error::ErrorKind::Complete))) |
| 106 | } |
| 107 | } |
| 108 | r: Result<(I, O), Err>> => r, |
| 109 | } |
| 110 | } |
| 111 | } |
| 112 | |
| 113 | // ================================= |
| 114 | // ======== matching digits ======== |
| 115 | // ================================= |
| 116 | |
| 117 | macro_rules! byte { |
| 118 | ($($p: pat)|* ) => {{ |
| 119 | fn parser(i: &[u8]) -> crate::nom::IResult<&[u8], u8> { |
| 120 | match i.split_first() { |
| 121 | $(Some((&c @ $p,rest)))|* => Ok((rest,c)), |
| 122 | Some(_) => Err(nom::Err::Error(nom::error::Error::new(i, nom::error::ErrorKind::OneOf))), |
| 123 | None => Err(nom::Err::Incomplete(Needed::new(1))), |
| 124 | } |
| 125 | } |
| 126 | |
| 127 | parser |
| 128 | }} |
| 129 | } |
| 130 | |
| 131 | fn binary(i: &[u8]) -> nom::IResult<&[u8], u8> { |
| 132 | byte!(b'0' ..=b'1' )(i) |
| 133 | } |
| 134 | |
| 135 | fn octal(i: &[u8]) -> nom::IResult<&[u8], u8> { |
| 136 | byte!(b'0' ..=b'7' )(i) |
| 137 | } |
| 138 | |
| 139 | fn decimal(i: &[u8]) -> nom::IResult<&[u8], u8> { |
| 140 | byte!(b'0' ..=b'9' )(i) |
| 141 | } |
| 142 | |
| 143 | fn hexadecimal(i: &[u8]) -> nom::IResult<&[u8], u8> { |
| 144 | byte!(b'0' ..= b'9' | b'a' ..= b'f' | b'A' ..= b'F' )(i) |
| 145 | } |
| 146 | |
| 147 | // ======================================== |
| 148 | // ======== characters and strings ======== |
| 149 | // ======================================== |
| 150 | |
| 151 | fn escape2char(c: char) -> CChar { |
| 152 | CChar::Char(match c { |
| 153 | 'a' => ' \x07' , |
| 154 | 'b' => ' \x08' , |
| 155 | 'f' => ' \x0c' , |
| 156 | 'n' => ' \n' , |
| 157 | 'r' => ' \r' , |
| 158 | 't' => ' \t' , |
| 159 | 'v' => ' \x0b' , |
| 160 | _ => unreachable!("invalid escape {}" , c), |
| 161 | }) |
| 162 | } |
| 163 | |
| 164 | fn c_raw_escape(n: Vec<u8>, radix: u32) -> Option<CChar> { |
| 165 | strOption::from_utf8(&n) |
| 166 | .ok() |
| 167 | .and_then(|i: &str| u64::from_str_radix(src:i, radix).ok()) |
| 168 | .map(|i: u64| match i { |
| 169 | 0..=0x7f => CChar::Char(i as u8 as char), |
| 170 | _ => CChar::Raw(i), |
| 171 | }) |
| 172 | } |
| 173 | |
| 174 | fn c_unicode_escape(n: Vec<u8>) -> Option<CChar> { |
| 175 | strOption::from_utf8(&n) |
| 176 | .ok() |
| 177 | .and_then(|i: &str| u32::from_str_radix(src:i, radix:16).ok()) |
| 178 | .and_then(char::from_u32) |
| 179 | .map(CChar::Char) |
| 180 | } |
| 181 | |
| 182 | fn escaped_char(i: &[u8]) -> nom::IResult<&[u8], CChar> { |
| 183 | preceded( |
| 184 | first:char(' \\' ), |
| 185 | second:alt(( |
| 186 | map(parser:one_of(r#"'"?\"# ), f:CChar::Char), |
| 187 | map(parser:one_of("abfnrtv" ), f:escape2char), |
| 188 | map_opt(parser:many_m_n(1, 3, octal), |v: Vec| c_raw_escape(n:v, radix:8)), |
| 189 | map_opt(parser:preceded(char('x' ), many1(hexadecimal)), |v: Vec| { |
| 190 | c_raw_escape(n:v, radix:16) |
| 191 | }), |
| 192 | map_opt( |
| 193 | parser:preceded(char('u' ), many_m_n(4, 4, hexadecimal)), |
| 194 | f:c_unicode_escape, |
| 195 | ), |
| 196 | map_opt( |
| 197 | parser:preceded(char('U' ), many_m_n(8, 8, hexadecimal)), |
| 198 | f:c_unicode_escape, |
| 199 | ), |
| 200 | )), |
| 201 | )(i) |
| 202 | } |
| 203 | |
| 204 | fn c_width_prefix(i: &[u8]) -> nom::IResult<&[u8], &[u8]> { |
| 205 | alt((tag("u8" ), tag("u" ), tag("U" ), tag("L" )))(i) |
| 206 | } |
| 207 | |
| 208 | fn c_char(i: &[u8]) -> nom::IResult<&[u8], CChar> { |
| 209 | delimited( |
| 210 | first:terminated(opt(c_width_prefix), char(' \'' )), |
| 211 | second:alt(( |
| 212 | escaped_char, |
| 213 | map(byte!(0 ..= 91 /* \=92 */ | 93 ..= 255), CChar::from), |
| 214 | )), |
| 215 | third:char(' \'' ), |
| 216 | )(i) |
| 217 | } |
| 218 | |
| 219 | fn c_string(i: &[u8]) -> nom::IResult<&[u8], Vec<u8>> { |
| 220 | delimited( |
| 221 | first:alt((preceded(c_width_prefix, char('"' )), char('"' ))), |
| 222 | second:fold_many0( |
| 223 | alt(( |
| 224 | map(escaped_char, |c: CChar| c.into()), |
| 225 | map(is_not([b' \\' , b'"' ]), |c: &[u8]| c.into()), |
| 226 | )), |
| 227 | Vec::new, |
| 228 | |mut v: Vec<u8>, res: Vec<u8>| { |
| 229 | v.extend_from_slice(&res); |
| 230 | v |
| 231 | }, |
| 232 | ), |
| 233 | third:char('"' ), |
| 234 | )(i) |
| 235 | } |
| 236 | |
| 237 | // ================================ |
| 238 | // ======== parse integers ======== |
| 239 | // ================================ |
| 240 | |
/// Parses ASCII digits `n`, written in base `radix`, into a `u64`.
///
/// Returns `None` when `n` is not valid UTF-8 or does not parse as a `u64`
/// in the given radix (empty input, invalid digit, or a value above
/// `u64::MAX`).
fn c_int_radix(n: Vec<u8>, radix: u32) -> Option<u64> {
    let digits = str::from_utf8(&n).ok()?;
    u64::from_str_radix(digits, radix).ok()
}
| 246 | |
| 247 | fn take_ul(input: &[u8]) -> IResult<&[u8], &[u8]> { |
| 248 | let r: Result<(&[u8], &[u8]), Err<…>> = input.split_at_position(|c: u8| c != b'u' && c != b'U' && c != b'l' && c != b'L' ); |
| 249 | match r { |
| 250 | Err(Err::Incomplete(_)) => Ok((&input[input.len()..], input)), |
| 251 | res: Result<(&[u8], &[u8]), Err<…>> => res, |
| 252 | } |
| 253 | } |
| 254 | |
| 255 | fn c_int(i: &[u8]) -> nom::IResult<&[u8], i64> { |
| 256 | map( |
| 257 | terminated( |
| 258 | alt(( |
| 259 | map_opt(preceded(tag("0x" ), many1(complete(hexadecimal))), |v| { |
| 260 | c_int_radix(v, 16) |
| 261 | }), |
| 262 | map_opt(preceded(tag("0X" ), many1(complete(hexadecimal))), |v| { |
| 263 | c_int_radix(v, 16) |
| 264 | }), |
| 265 | map_opt(preceded(tag("0b" ), many1(complete(binary))), |v| { |
| 266 | c_int_radix(v, 2) |
| 267 | }), |
| 268 | map_opt(preceded(tag("0B" ), many1(complete(binary))), |v| { |
| 269 | c_int_radix(v, 2) |
| 270 | }), |
| 271 | map_opt(preceded(char('0' ), many1(complete(octal))), |v| { |
| 272 | c_int_radix(v, 8) |
| 273 | }), |
| 274 | map_opt(many1(complete(decimal)), |v| c_int_radix(v, 10)), |
| 275 | |input| Err(crate::nom::Err::Error(nom::error::Error::new(input, crate::nom::ErrorKind::Fix))), |
| 276 | )), |
| 277 | opt(take_ul), |
| 278 | ), |
| 279 | |i| i as i64, |
| 280 | )(i) |
| 281 | } |
| 282 | |
| 283 | // ============================== |
| 284 | // ======== parse floats ======== |
| 285 | // ============================== |
| 286 | |
| 287 | fn float_width(i: &[u8]) -> nom::IResult<&[u8], u8> { |
| 288 | nom::combinator::complete(byte!(b'f' | b'l' | b'F' | b'L' ))(i) |
| 289 | } |
| 290 | |
| 291 | fn float_exp(i: &[u8]) -> nom::IResult<&[u8], (Option<u8>, Vec<u8>)> { |
| 292 | preceded( |
| 293 | first:byte!(b'e' | b'E' ), |
| 294 | second:pair(first:opt(byte!(b'-' | b'+' )), second:many1(complete(decimal))), |
| 295 | )(i) |
| 296 | } |
| 297 | |
| 298 | fn c_float(i: &[u8]) -> nom::IResult<&[u8], f64> { |
| 299 | map_opt( |
| 300 | alt(( |
| 301 | terminated( |
| 302 | recognize(tuple(( |
| 303 | many1(complete(decimal)), |
| 304 | byte!(b'.' ), |
| 305 | many0(complete(decimal)), |
| 306 | ))), |
| 307 | opt(float_width), |
| 308 | ), |
| 309 | terminated( |
| 310 | recognize(tuple(( |
| 311 | many0(complete(decimal)), |
| 312 | byte!(b'.' ), |
| 313 | many1(complete(decimal)), |
| 314 | ))), |
| 315 | opt(float_width), |
| 316 | ), |
| 317 | terminated( |
| 318 | recognize(tuple(( |
| 319 | many0(complete(decimal)), |
| 320 | opt(byte!(b'.' )), |
| 321 | many1(complete(decimal)), |
| 322 | float_exp, |
| 323 | ))), |
| 324 | opt(float_width), |
| 325 | ), |
| 326 | terminated( |
| 327 | recognize(tuple(( |
| 328 | many1(complete(decimal)), |
| 329 | opt(byte!(b'.' )), |
| 330 | many0(complete(decimal)), |
| 331 | float_exp, |
| 332 | ))), |
| 333 | opt(float_width), |
| 334 | ), |
| 335 | terminated(recognize(many1(complete(decimal))), float_width), |
| 336 | )), |
| 337 | |v| str::from_utf8(v).ok().and_then(|i| f64::from_str(i).ok()), |
| 338 | )(i) |
| 339 | } |
| 340 | |
| 341 | // ================================ |
| 342 | // ======== main interface ======== |
| 343 | // ================================ |
| 344 | |
| 345 | fn one_literal(input: &[u8]) -> nom::IResult<&[u8], EvalResult, crate::Error<&[u8]>> { |
| 346 | altResult<(&[u8], EvalResult), …>(( |
| 347 | map(parser:full(c_char), f:EvalResult::Char), |
| 348 | map(parser:full(c_int), |i: i64| EvalResult::Int(::std::num::Wrapping(i))), |
| 349 | map(parser:full(c_float), f:EvalResult::Float), |
| 350 | map(parser:full(c_string), f:EvalResult::Str), |
| 351 | ))(input) |
| 352 | .to_cexpr_result() |
| 353 | } |
| 354 | |
| 355 | /// Parse a C literal. |
| 356 | /// |
| 357 | /// The input must contain exactly the representation of a single literal |
| 358 | /// token, and in particular no whitespace or sign prefixes. |
| 359 | pub fn parse(input: &[u8]) -> IResult<&[u8], EvalResult, crate::Error<&[u8]>> { |
| 360 | crate::assert_full_parse(result:one_literal(input)) |
| 361 | } |
| 362 | |