1 | //! Functions to parse floating-point numbers. |
2 | |
3 | use crate::num::dec2flt::common::{ByteSlice, is_8digits}; |
4 | use crate::num::dec2flt::decimal::Decimal; |
5 | use crate::num::dec2flt::float::RawFloat; |
6 | |
7 | const MIN_19DIGIT_INT: u64 = 100_0000_0000_0000_0000; |
8 | |
9 | /// Parse 8 digits, loaded as bytes in little-endian order. |
10 | /// |
11 | /// This uses the trick where every digit is in [0x030, 0x39], |
12 | /// and therefore can be parsed in 3 multiplications, much |
13 | /// faster than the normal 8. |
14 | /// |
15 | /// This is based off the algorithm described in "Fast numeric string to |
16 | /// int", available here: <https://johnnylee-sde.github.io/Fast-numeric-string-to-int/>. |
17 | fn parse_8digits(mut v: u64) -> u64 { |
18 | const MASK: u64 = 0x0000_00FF_0000_00FF; |
19 | const MUL1: u64 = 0x000F_4240_0000_0064; |
20 | const MUL2: u64 = 0x0000_2710_0000_0001; |
21 | v -= 0x3030_3030_3030_3030; |
22 | v = (v * 10) + (v >> 8); // will not overflow, fits in 63 bits |
23 | let v1: u64 = (v & MASK).wrapping_mul(MUL1); |
24 | let v2: u64 = ((v >> 16) & MASK).wrapping_mul(MUL2); |
25 | ((v1.wrapping_add(v2) >> 32) as u32) as u64 |
26 | } |
27 | |
28 | /// Parse digits until a non-digit character is found. |
29 | fn try_parse_digits(mut s: &[u8], mut x: u64) -> (&[u8], u64) { |
30 | // may cause overflows, to be handled later |
31 | |
32 | while s.len() >= 8 { |
33 | let num: u64 = s.read_u64(); |
34 | if is_8digits(num) { |
35 | x = x.wrapping_mul(1_0000_0000).wrapping_add(parse_8digits(num)); |
36 | s = &s[8..]; |
37 | } else { |
38 | break; |
39 | } |
40 | } |
41 | |
42 | s = s.parse_digits(|digit: u8| { |
43 | x = x.wrapping_mul(10).wrapping_add(digit as _); |
44 | }); |
45 | |
46 | (s, x) |
47 | } |
48 | |
49 | /// Parse up to 19 digits (the max that can be stored in a 64-bit integer). |
50 | fn try_parse_19digits(s_ref: &mut &[u8], x: &mut u64) { |
51 | let mut s: &[u8] = *s_ref; |
52 | |
53 | while *x < MIN_19DIGIT_INT { |
54 | if let Some((c: &u8, s_next: &[u8])) = s.split_first() { |
55 | let digit: u8 = c.wrapping_sub(b'0' ); |
56 | |
57 | if digit < 10 { |
58 | *x = (*x * 10) + digit as u64; // no overflows here |
59 | s = s_next; |
60 | } else { |
61 | break; |
62 | } |
63 | } else { |
64 | break; |
65 | } |
66 | } |
67 | |
68 | *s_ref = s; |
69 | } |
70 | |
71 | /// Parse the scientific notation component of a float. |
72 | fn parse_scientific(s_ref: &mut &[u8]) -> Option<i64> { |
73 | let mut exponent = 0i64; |
74 | let mut negative = false; |
75 | |
76 | let mut s = *s_ref; |
77 | |
78 | if let Some((&c, s_next)) = s.split_first() { |
79 | negative = c == b'-' ; |
80 | if c == b'-' || c == b'+' { |
81 | s = s_next; |
82 | } |
83 | } |
84 | |
85 | if matches!(s.first(), Some(&x) if x.is_ascii_digit()) { |
86 | *s_ref = s.parse_digits(|digit| { |
87 | // no overflows here, saturate well before overflow |
88 | if exponent < 0x10000 { |
89 | exponent = 10 * exponent + digit as i64; |
90 | } |
91 | }); |
92 | if negative { Some(-exponent) } else { Some(exponent) } |
93 | } else { |
94 | *s_ref = s; |
95 | None |
96 | } |
97 | } |
98 | |
99 | /// Parse a partial, non-special floating point number. |
100 | /// |
101 | /// This creates a representation of the float as the |
102 | /// significant digits and the decimal exponent. |
103 | fn parse_partial_number(mut s: &[u8]) -> Option<(Decimal, usize)> { |
104 | debug_assert!(!s.is_empty()); |
105 | |
106 | // parse initial digits before dot |
107 | let mut mantissa = 0_u64; |
108 | let start = s; |
109 | let tmp = try_parse_digits(s, mantissa); |
110 | s = tmp.0; |
111 | mantissa = tmp.1; |
112 | let mut n_digits = s.offset_from(start); |
113 | |
114 | // handle dot with the following digits |
115 | let mut n_after_dot = 0; |
116 | let mut exponent = 0_i64; |
117 | let int_end = s; |
118 | |
119 | if let Some((&b'.' , s_next)) = s.split_first() { |
120 | s = s_next; |
121 | let before = s; |
122 | let tmp = try_parse_digits(s, mantissa); |
123 | s = tmp.0; |
124 | mantissa = tmp.1; |
125 | n_after_dot = s.offset_from(before); |
126 | exponent = -n_after_dot as i64; |
127 | } |
128 | |
129 | n_digits += n_after_dot; |
130 | if n_digits == 0 { |
131 | return None; |
132 | } |
133 | |
134 | // handle scientific format |
135 | let mut exp_number = 0_i64; |
136 | if let Some((&c, s_next)) = s.split_first() { |
137 | if c == b'e' || c == b'E' { |
138 | s = s_next; |
139 | // If None, we have no trailing digits after exponent, or an invalid float. |
140 | exp_number = parse_scientific(&mut s)?; |
141 | exponent += exp_number; |
142 | } |
143 | } |
144 | |
145 | let len = s.offset_from(start) as _; |
146 | |
147 | // handle uncommon case with many digits |
148 | if n_digits <= 19 { |
149 | return Some((Decimal { exponent, mantissa, negative: false, many_digits: false }, len)); |
150 | } |
151 | |
152 | n_digits -= 19; |
153 | let mut many_digits = false; |
154 | let mut p = start; |
155 | while let Some((&c, p_next)) = p.split_first() { |
156 | if c == b'.' || c == b'0' { |
157 | n_digits -= c.saturating_sub(b'0' - 1) as isize; |
158 | p = p_next; |
159 | } else { |
160 | break; |
161 | } |
162 | } |
163 | if n_digits > 0 { |
164 | // at this point we have more than 19 significant digits, let's try again |
165 | many_digits = true; |
166 | mantissa = 0; |
167 | let mut s = start; |
168 | try_parse_19digits(&mut s, &mut mantissa); |
169 | exponent = if mantissa >= MIN_19DIGIT_INT { |
170 | // big int |
171 | int_end.offset_from(s) |
172 | } else { |
173 | s = &s[1..]; |
174 | let before = s; |
175 | try_parse_19digits(&mut s, &mut mantissa); |
176 | -s.offset_from(before) |
177 | } as i64; |
178 | // add back the explicit part |
179 | exponent += exp_number; |
180 | } |
181 | |
182 | Some((Decimal { exponent, mantissa, negative: false, many_digits }, len)) |
183 | } |
184 | |
185 | /// Try to parse a non-special floating point number, |
186 | /// as well as two slices with integer and fractional parts |
187 | /// and the parsed exponent. |
188 | pub fn parse_number(s: &[u8]) -> Option<Decimal> { |
189 | if let Some((float: Decimal, rest: usize)) = parse_partial_number(s) { |
190 | if rest == s.len() { |
191 | return Some(float); |
192 | } |
193 | } |
194 | None |
195 | } |
196 | |
197 | /// Try to parse a special, non-finite float. |
198 | pub(crate) fn parse_inf_nan<F: RawFloat>(s: &[u8], negative: bool) -> Option<F> { |
199 | // Since a valid string has at most the length 8, we can load |
200 | // all relevant characters into a u64 and work from there. |
201 | // This also generates much better code. |
202 | |
203 | let mut register; |
204 | let len: usize; |
205 | |
206 | // All valid strings are either of length 8 or 3. |
207 | if s.len() == 8 { |
208 | register = s.read_u64(); |
209 | len = 8; |
210 | } else if s.len() == 3 { |
211 | let a = s[0] as u64; |
212 | let b = s[1] as u64; |
213 | let c = s[2] as u64; |
214 | register = (c << 16) | (b << 8) | a; |
215 | len = 3; |
216 | } else { |
217 | return None; |
218 | } |
219 | |
220 | // Clear out the bits which turn ASCII uppercase characters into |
221 | // lowercase characters. The resulting string is all uppercase. |
222 | // What happens to other characters is irrelevant. |
223 | register &= 0xDFDFDFDFDFDFDFDF; |
224 | |
225 | // u64 values corresponding to relevant cases |
226 | const INF_3: u64 = 0x464E49; // "INF" |
227 | const INF_8: u64 = 0x5954494E49464E49; // "INFINITY" |
228 | const NAN: u64 = 0x4E414E; // "NAN" |
229 | |
230 | // Match register value to constant to parse string. |
231 | // Also match on the string length to catch edge cases |
232 | // like "inf\0\0\0\0\0". |
233 | let float = match (register, len) { |
234 | (INF_3, 3) => F::INFINITY, |
235 | (INF_8, 8) => F::INFINITY, |
236 | (NAN, 3) => F::NAN, |
237 | _ => return None, |
238 | }; |
239 | |
240 | if negative { Some(-float) } else { Some(float) } |
241 | } |
242 | |