1 | //! Functions to parse floating-point numbers. |
2 | |
3 | use crate::num::dec2flt::common::{is_8digits, ByteSlice}; |
4 | use crate::num::dec2flt::float::RawFloat; |
5 | use crate::num::dec2flt::number::Number; |
6 | |
7 | const MIN_19DIGIT_INT: u64 = 100_0000_0000_0000_0000; |
8 | |
9 | /// Parse 8 digits, loaded as bytes in little-endian order. |
10 | /// |
11 | /// This uses the trick where every digit is in [0x030, 0x39], |
12 | /// and therefore can be parsed in 3 multiplications, much |
13 | /// faster than the normal 8. |
14 | /// |
15 | /// This is based off the algorithm described in "Fast numeric string to |
16 | /// int", available here: <https://johnnylee-sde.github.io/Fast-numeric-string-to-int/>. |
17 | fn parse_8digits(mut v: u64) -> u64 { |
18 | const MASK: u64 = 0x0000_00FF_0000_00FF; |
19 | const MUL1: u64 = 0x000F_4240_0000_0064; |
20 | const MUL2: u64 = 0x0000_2710_0000_0001; |
21 | v -= 0x3030_3030_3030_3030; |
22 | v = (v * 10) + (v >> 8); // will not overflow, fits in 63 bits |
23 | let v1: u64 = (v & MASK).wrapping_mul(MUL1); |
24 | let v2: u64 = ((v >> 16) & MASK).wrapping_mul(MUL2); |
25 | ((v1.wrapping_add(v2) >> 32) as u32) as u64 |
26 | } |
27 | |
28 | /// Parse digits until a non-digit character is found. |
29 | fn try_parse_digits(mut s: &[u8], mut x: u64) -> (&[u8], u64) { |
30 | // may cause overflows, to be handled later |
31 | |
32 | while s.len() >= 8 { |
33 | let num: u64 = s.read_u64(); |
34 | if is_8digits(num) { |
35 | x = x.wrapping_mul(1_0000_0000).wrapping_add(parse_8digits(num)); |
36 | s = &s[8..]; |
37 | } else { |
38 | break; |
39 | } |
40 | } |
41 | |
42 | s = s.parse_digits(|digit: u8| { |
43 | x = x.wrapping_mul(10).wrapping_add(digit as _); |
44 | }); |
45 | |
46 | (s, x) |
47 | } |
48 | |
49 | /// Parse up to 19 digits (the max that can be stored in a 64-bit integer). |
50 | fn try_parse_19digits(s_ref: &mut &[u8], x: &mut u64) { |
51 | let mut s: &[u8] = *s_ref; |
52 | |
53 | while *x < MIN_19DIGIT_INT { |
54 | // FIXME: Can't use s.split_first() here yet, |
55 | // see https://github.com/rust-lang/rust/issues/109328 |
56 | if let [c: &u8, s_next: &[u8] @ ..] = s { |
57 | let digit: u8 = c.wrapping_sub(b'0' ); |
58 | |
59 | if digit < 10 { |
60 | *x = (*x * 10) + digit as u64; // no overflows here |
61 | s = s_next; |
62 | } else { |
63 | break; |
64 | } |
65 | } else { |
66 | break; |
67 | } |
68 | } |
69 | |
70 | *s_ref = s; |
71 | } |
72 | |
73 | /// Parse the scientific notation component of a float. |
74 | fn parse_scientific(s_ref: &mut &[u8]) -> Option<i64> { |
75 | let mut exponent = 0i64; |
76 | let mut negative = false; |
77 | |
78 | let mut s = *s_ref; |
79 | |
80 | if let Some((&c, s_next)) = s.split_first() { |
81 | negative = c == b'-' ; |
82 | if c == b'-' || c == b'+' { |
83 | s = s_next; |
84 | } |
85 | } |
86 | |
87 | if matches!(s.first(), Some(&x) if x.is_ascii_digit()) { |
88 | *s_ref = s.parse_digits(|digit| { |
89 | // no overflows here, saturate well before overflow |
90 | if exponent < 0x10000 { |
91 | exponent = 10 * exponent + digit as i64; |
92 | } |
93 | }); |
94 | if negative { Some(-exponent) } else { Some(exponent) } |
95 | } else { |
96 | *s_ref = s; |
97 | None |
98 | } |
99 | } |
100 | |
101 | /// Parse a partial, non-special floating point number. |
102 | /// |
103 | /// This creates a representation of the float as the |
104 | /// significant digits and the decimal exponent. |
105 | fn parse_partial_number(mut s: &[u8]) -> Option<(Number, usize)> { |
106 | debug_assert!(!s.is_empty()); |
107 | |
108 | // parse initial digits before dot |
109 | let mut mantissa = 0_u64; |
110 | let start = s; |
111 | let tmp = try_parse_digits(s, mantissa); |
112 | s = tmp.0; |
113 | mantissa = tmp.1; |
114 | let mut n_digits = s.offset_from(start); |
115 | |
116 | // handle dot with the following digits |
117 | let mut n_after_dot = 0; |
118 | let mut exponent = 0_i64; |
119 | let int_end = s; |
120 | |
121 | if let Some((&b'.' , s_next)) = s.split_first() { |
122 | s = s_next; |
123 | let before = s; |
124 | let tmp = try_parse_digits(s, mantissa); |
125 | s = tmp.0; |
126 | mantissa = tmp.1; |
127 | n_after_dot = s.offset_from(before); |
128 | exponent = -n_after_dot as i64; |
129 | } |
130 | |
131 | n_digits += n_after_dot; |
132 | if n_digits == 0 { |
133 | return None; |
134 | } |
135 | |
136 | // handle scientific format |
137 | let mut exp_number = 0_i64; |
138 | if let Some((&c, s_next)) = s.split_first() { |
139 | if c == b'e' || c == b'E' { |
140 | s = s_next; |
141 | // If None, we have no trailing digits after exponent, or an invalid float. |
142 | exp_number = parse_scientific(&mut s)?; |
143 | exponent += exp_number; |
144 | } |
145 | } |
146 | |
147 | let len = s.offset_from(start) as _; |
148 | |
149 | // handle uncommon case with many digits |
150 | if n_digits <= 19 { |
151 | return Some((Number { exponent, mantissa, negative: false, many_digits: false }, len)); |
152 | } |
153 | |
154 | n_digits -= 19; |
155 | let mut many_digits = false; |
156 | let mut p = start; |
157 | while let Some((&c, p_next)) = p.split_first() { |
158 | if c == b'.' || c == b'0' { |
159 | n_digits -= c.saturating_sub(b'0' - 1) as isize; |
160 | p = p_next; |
161 | } else { |
162 | break; |
163 | } |
164 | } |
165 | if n_digits > 0 { |
166 | // at this point we have more than 19 significant digits, let's try again |
167 | many_digits = true; |
168 | mantissa = 0; |
169 | let mut s = start; |
170 | try_parse_19digits(&mut s, &mut mantissa); |
171 | exponent = if mantissa >= MIN_19DIGIT_INT { |
172 | // big int |
173 | int_end.offset_from(s) |
174 | } else { |
175 | s = &s[1..]; |
176 | let before = s; |
177 | try_parse_19digits(&mut s, &mut mantissa); |
178 | -s.offset_from(before) |
179 | } as i64; |
180 | // add back the explicit part |
181 | exponent += exp_number; |
182 | } |
183 | |
184 | Some((Number { exponent, mantissa, negative: false, many_digits }, len)) |
185 | } |
186 | |
187 | /// Try to parse a non-special floating point number, |
188 | /// as well as two slices with integer and fractional parts |
189 | /// and the parsed exponent. |
190 | pub fn parse_number(s: &[u8]) -> Option<Number> { |
191 | if let Some((float: Number, rest: usize)) = parse_partial_number(s) { |
192 | if rest == s.len() { |
193 | return Some(float); |
194 | } |
195 | } |
196 | None |
197 | } |
198 | |
199 | /// Try to parse a special, non-finite float. |
200 | pub(crate) fn parse_inf_nan<F: RawFloat>(s: &[u8], negative: bool) -> Option<F> { |
201 | // Since a valid string has at most the length 8, we can load |
202 | // all relevant characters into a u64 and work from there. |
203 | // This also generates much better code. |
204 | |
205 | let mut register; |
206 | let len: usize; |
207 | |
208 | // All valid strings are either of length 8 or 3. |
209 | if s.len() == 8 { |
210 | register = s.read_u64(); |
211 | len = 8; |
212 | } else if s.len() == 3 { |
213 | let a = s[0] as u64; |
214 | let b = s[1] as u64; |
215 | let c = s[2] as u64; |
216 | register = (c << 16) | (b << 8) | a; |
217 | len = 3; |
218 | } else { |
219 | return None; |
220 | } |
221 | |
222 | // Clear out the bits which turn ASCII uppercase characters into |
223 | // lowercase characters. The resulting string is all uppercase. |
224 | // What happens to other characters is irrelevant. |
225 | register &= 0xDFDFDFDFDFDFDFDF; |
226 | |
227 | // u64 values corresponding to relevant cases |
228 | const INF_3: u64 = 0x464E49; // "INF" |
229 | const INF_8: u64 = 0x5954494E49464E49; // "INFINITY" |
230 | const NAN: u64 = 0x4E414E; // "NAN" |
231 | |
232 | // Match register value to constant to parse string. |
233 | // Also match on the string length to catch edge cases |
234 | // like "inf\0\0\0\0\0". |
235 | let float = match (register, len) { |
236 | (INF_3, 3) => F::INFINITY, |
237 | (INF_8, 8) => F::INFINITY, |
238 | (NAN, 3) => F::NAN, |
239 | _ => return None, |
240 | }; |
241 | |
242 | if negative { Some(-float) } else { Some(float) } |
243 | } |
244 | |