| 1 | //! Functions to parse floating-point numbers. |
| 2 | |
| 3 | use crate::num::dec2flt::common::{ByteSlice, is_8digits}; |
| 4 | use crate::num::dec2flt::decimal::Decimal; |
| 5 | use crate::num::dec2flt::float::RawFloat; |
| 6 | |
| 7 | const MIN_19DIGIT_INT: u64 = 100_0000_0000_0000_0000; |
| 8 | |
| 9 | /// Parse 8 digits, loaded as bytes in little-endian order. |
| 10 | /// |
| 11 | /// This uses the trick where every digit is in [0x030, 0x39], |
| 12 | /// and therefore can be parsed in 3 multiplications, much |
| 13 | /// faster than the normal 8. |
| 14 | /// |
| 15 | /// This is based off the algorithm described in "Fast numeric string to |
| 16 | /// int", available here: <https://johnnylee-sde.github.io/Fast-numeric-string-to-int/>. |
| 17 | fn parse_8digits(mut v: u64) -> u64 { |
| 18 | const MASK: u64 = 0x0000_00FF_0000_00FF; |
| 19 | const MUL1: u64 = 0x000F_4240_0000_0064; |
| 20 | const MUL2: u64 = 0x0000_2710_0000_0001; |
| 21 | v -= 0x3030_3030_3030_3030; |
| 22 | v = (v * 10) + (v >> 8); // will not overflow, fits in 63 bits |
| 23 | let v1: u64 = (v & MASK).wrapping_mul(MUL1); |
| 24 | let v2: u64 = ((v >> 16) & MASK).wrapping_mul(MUL2); |
| 25 | ((v1.wrapping_add(v2) >> 32) as u32) as u64 |
| 26 | } |
| 27 | |
| 28 | /// Parse digits until a non-digit character is found. |
| 29 | fn try_parse_digits(mut s: &[u8], mut x: u64) -> (&[u8], u64) { |
| 30 | // may cause overflows, to be handled later |
| 31 | |
| 32 | while s.len() >= 8 { |
| 33 | let num: u64 = s.read_u64(); |
| 34 | if is_8digits(num) { |
| 35 | x = x.wrapping_mul(1_0000_0000).wrapping_add(parse_8digits(num)); |
| 36 | s = &s[8..]; |
| 37 | } else { |
| 38 | break; |
| 39 | } |
| 40 | } |
| 41 | |
| 42 | s = s.parse_digits(|digit: u8| { |
| 43 | x = x.wrapping_mul(10).wrapping_add(digit as _); |
| 44 | }); |
| 45 | |
| 46 | (s, x) |
| 47 | } |
| 48 | |
| 49 | /// Parse up to 19 digits (the max that can be stored in a 64-bit integer). |
| 50 | fn try_parse_19digits(s_ref: &mut &[u8], x: &mut u64) { |
| 51 | let mut s: &[u8] = *s_ref; |
| 52 | |
| 53 | while *x < MIN_19DIGIT_INT { |
| 54 | if let Some((c: &u8, s_next: &[u8])) = s.split_first() { |
| 55 | let digit: u8 = c.wrapping_sub(b'0' ); |
| 56 | |
| 57 | if digit < 10 { |
| 58 | *x = (*x * 10) + digit as u64; // no overflows here |
| 59 | s = s_next; |
| 60 | } else { |
| 61 | break; |
| 62 | } |
| 63 | } else { |
| 64 | break; |
| 65 | } |
| 66 | } |
| 67 | |
| 68 | *s_ref = s; |
| 69 | } |
| 70 | |
| 71 | /// Parse the scientific notation component of a float. |
| 72 | fn parse_scientific(s_ref: &mut &[u8]) -> Option<i64> { |
| 73 | let mut exponent = 0i64; |
| 74 | let mut negative = false; |
| 75 | |
| 76 | let mut s = *s_ref; |
| 77 | |
| 78 | if let Some((&c, s_next)) = s.split_first() { |
| 79 | negative = c == b'-' ; |
| 80 | if c == b'-' || c == b'+' { |
| 81 | s = s_next; |
| 82 | } |
| 83 | } |
| 84 | |
| 85 | if matches!(s.first(), Some(&x) if x.is_ascii_digit()) { |
| 86 | *s_ref = s.parse_digits(|digit| { |
| 87 | // no overflows here, saturate well before overflow |
| 88 | if exponent < 0x10000 { |
| 89 | exponent = 10 * exponent + digit as i64; |
| 90 | } |
| 91 | }); |
| 92 | if negative { Some(-exponent) } else { Some(exponent) } |
| 93 | } else { |
| 94 | *s_ref = s; |
| 95 | None |
| 96 | } |
| 97 | } |
| 98 | |
| 99 | /// Parse a partial, non-special floating point number. |
| 100 | /// |
| 101 | /// This creates a representation of the float as the |
| 102 | /// significant digits and the decimal exponent. |
| 103 | fn parse_partial_number(mut s: &[u8]) -> Option<(Decimal, usize)> { |
| 104 | debug_assert!(!s.is_empty()); |
| 105 | |
| 106 | // parse initial digits before dot |
| 107 | let mut mantissa = 0_u64; |
| 108 | let start = s; |
| 109 | let tmp = try_parse_digits(s, mantissa); |
| 110 | s = tmp.0; |
| 111 | mantissa = tmp.1; |
| 112 | let mut n_digits = s.offset_from(start); |
| 113 | |
| 114 | // handle dot with the following digits |
| 115 | let mut n_after_dot = 0; |
| 116 | let mut exponent = 0_i64; |
| 117 | let int_end = s; |
| 118 | |
| 119 | if let Some((&b'.' , s_next)) = s.split_first() { |
| 120 | s = s_next; |
| 121 | let before = s; |
| 122 | let tmp = try_parse_digits(s, mantissa); |
| 123 | s = tmp.0; |
| 124 | mantissa = tmp.1; |
| 125 | n_after_dot = s.offset_from(before); |
| 126 | exponent = -n_after_dot as i64; |
| 127 | } |
| 128 | |
| 129 | n_digits += n_after_dot; |
| 130 | if n_digits == 0 { |
| 131 | return None; |
| 132 | } |
| 133 | |
| 134 | // handle scientific format |
| 135 | let mut exp_number = 0_i64; |
| 136 | if let Some((&c, s_next)) = s.split_first() { |
| 137 | if c == b'e' || c == b'E' { |
| 138 | s = s_next; |
| 139 | // If None, we have no trailing digits after exponent, or an invalid float. |
| 140 | exp_number = parse_scientific(&mut s)?; |
| 141 | exponent += exp_number; |
| 142 | } |
| 143 | } |
| 144 | |
| 145 | let len = s.offset_from(start) as _; |
| 146 | |
| 147 | // handle uncommon case with many digits |
| 148 | if n_digits <= 19 { |
| 149 | return Some((Decimal { exponent, mantissa, negative: false, many_digits: false }, len)); |
| 150 | } |
| 151 | |
| 152 | n_digits -= 19; |
| 153 | let mut many_digits = false; |
| 154 | let mut p = start; |
| 155 | while let Some((&c, p_next)) = p.split_first() { |
| 156 | if c == b'.' || c == b'0' { |
| 157 | n_digits -= c.saturating_sub(b'0' - 1) as isize; |
| 158 | p = p_next; |
| 159 | } else { |
| 160 | break; |
| 161 | } |
| 162 | } |
| 163 | if n_digits > 0 { |
| 164 | // at this point we have more than 19 significant digits, let's try again |
| 165 | many_digits = true; |
| 166 | mantissa = 0; |
| 167 | let mut s = start; |
| 168 | try_parse_19digits(&mut s, &mut mantissa); |
| 169 | exponent = if mantissa >= MIN_19DIGIT_INT { |
| 170 | // big int |
| 171 | int_end.offset_from(s) |
| 172 | } else { |
| 173 | s = &s[1..]; |
| 174 | let before = s; |
| 175 | try_parse_19digits(&mut s, &mut mantissa); |
| 176 | -s.offset_from(before) |
| 177 | } as i64; |
| 178 | // add back the explicit part |
| 179 | exponent += exp_number; |
| 180 | } |
| 181 | |
| 182 | Some((Decimal { exponent, mantissa, negative: false, many_digits }, len)) |
| 183 | } |
| 184 | |
| 185 | /// Try to parse a non-special floating point number, |
| 186 | /// as well as two slices with integer and fractional parts |
| 187 | /// and the parsed exponent. |
| 188 | pub fn parse_number(s: &[u8]) -> Option<Decimal> { |
| 189 | if let Some((float: Decimal, rest: usize)) = parse_partial_number(s) { |
| 190 | if rest == s.len() { |
| 191 | return Some(float); |
| 192 | } |
| 193 | } |
| 194 | None |
| 195 | } |
| 196 | |
| 197 | /// Try to parse a special, non-finite float. |
| 198 | pub(crate) fn parse_inf_nan<F: RawFloat>(s: &[u8], negative: bool) -> Option<F> { |
| 199 | // Since a valid string has at most the length 8, we can load |
| 200 | // all relevant characters into a u64 and work from there. |
| 201 | // This also generates much better code. |
| 202 | |
| 203 | let mut register; |
| 204 | let len: usize; |
| 205 | |
| 206 | // All valid strings are either of length 8 or 3. |
| 207 | if s.len() == 8 { |
| 208 | register = s.read_u64(); |
| 209 | len = 8; |
| 210 | } else if s.len() == 3 { |
| 211 | let a = s[0] as u64; |
| 212 | let b = s[1] as u64; |
| 213 | let c = s[2] as u64; |
| 214 | register = (c << 16) | (b << 8) | a; |
| 215 | len = 3; |
| 216 | } else { |
| 217 | return None; |
| 218 | } |
| 219 | |
| 220 | // Clear out the bits which turn ASCII uppercase characters into |
| 221 | // lowercase characters. The resulting string is all uppercase. |
| 222 | // What happens to other characters is irrelevant. |
| 223 | register &= 0xDFDFDFDFDFDFDFDF; |
| 224 | |
| 225 | // u64 values corresponding to relevant cases |
| 226 | const INF_3: u64 = 0x464E49; // "INF" |
| 227 | const INF_8: u64 = 0x5954494E49464E49; // "INFINITY" |
| 228 | const NAN: u64 = 0x4E414E; // "NAN" |
| 229 | |
| 230 | // Match register value to constant to parse string. |
| 231 | // Also match on the string length to catch edge cases |
| 232 | // like "inf\0\0\0\0\0". |
| 233 | let float = match (register, len) { |
| 234 | (INF_3, 3) => F::INFINITY, |
| 235 | (INF_8, 8) => F::INFINITY, |
| 236 | (NAN, 3) => F::NAN, |
| 237 | _ => return None, |
| 238 | }; |
| 239 | |
| 240 | if negative { Some(-float) } else { Some(float) } |
| 241 | } |
| 242 | |