1 | use std::str::FromStr; |
2 | |
3 | use crate::Error; |
4 | |
/// Byte classification helpers used while scanning text.
pub(crate) trait ByteExt {
    /// Returns `true` if the byte is a numeric sign.
    ///
    /// `[+-]`
    fn is_sign(&self) -> bool;

    /// Returns `true` if the byte is a decimal digit.
    ///
    /// `[0-9]`
    fn is_digit(&self) -> bool;

    /// Returns `true` if the byte is a hexadecimal digit.
    ///
    /// `[0-9A-Fa-f]`
    fn is_hex_digit(&self) -> bool;

    /// Returns `true` if the byte is a whitespace.
    ///
    /// `[ \r\n\t]`
    fn is_space(&self) -> bool;

    /// Returns `true` if the byte is a single or a double quote.
    ///
    /// `['"]`
    fn is_quote(&self) -> bool;

    /// Returns `true` if the byte is an ASCII letter.
    ///
    /// `[A-Za-z]`
    fn is_letter(&self) -> bool;

    /// Returns `true` if the byte may appear in an ASCII ident.
    ///
    /// `[0-9A-Za-z_-]`
    fn is_ascii_ident(&self) -> bool;
}

impl ByteExt for u8 {
    #[inline]
    fn is_sign(&self) -> bool {
        *self == b'+' || *self == b'-'
    }

    #[inline]
    fn is_digit(&self) -> bool {
        self.is_ascii_digit()
    }

    #[inline]
    fn is_hex_digit(&self) -> bool {
        self.is_ascii_hexdigit()
    }

    #[inline]
    fn is_space(&self) -> bool {
        // Deliberately not `is_ascii_whitespace`: that one also accepts form feed.
        [b' ', b'\t', b'\n', b'\r'].contains(self)
    }

    #[inline]
    fn is_quote(&self) -> bool {
        *self == b'\'' || *self == b'"'
    }

    #[inline]
    fn is_letter(&self) -> bool {
        self.is_ascii_alphabetic()
    }

    #[inline]
    fn is_ascii_ident(&self) -> bool {
        self.is_ascii_alphanumeric() || *self == b'-' || *self == b'_'
    }
}
72 | |
/// Character classification helpers used for identifier parsing.
trait CharExt {
    /// Checks if a char can start a name.
    ///
    /// `[_A-Za-z]`, any non-ASCII char, or an escape.
    fn is_name_start(&self) -> bool;

    /// Checks if a char can continue a name.
    ///
    /// `[_A-Za-z0-9-]`, any non-ASCII char, or an escape.
    fn is_name_char(&self) -> bool;

    /// Checks if a char is outside of the ASCII range (U+0080 and above).
    fn is_non_ascii(&self) -> bool;

    /// Checks if a char starts an escape sequence.
    ///
    /// Escapes are not supported yet, so this is always `false`.
    fn is_escape(&self) -> bool;
}

impl CharExt for char {
    #[inline]
    fn is_name_start(&self) -> bool {
        match *self {
            '_' | 'a'..='z' | 'A'..='Z' => true,
            _ => self.is_non_ascii() || self.is_escape(),
        }
    }

    #[inline]
    fn is_name_char(&self) -> bool {
        match *self {
            '_' | 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' => true,
            _ => self.is_non_ascii() || self.is_escape(),
        }
    }

    #[inline]
    fn is_non_ascii(&self) -> bool {
        // Every code point at or above U+0080 is non-ASCII. A higher
        // threshold would wrongly reject common letters such as `é` (U+00E9).
        !self.is_ascii()
    }

    #[inline]
    fn is_escape(&self) -> bool {
        // TODO: escape sequences are not implemented.
        false
    }
}
108 | |
/// A cursor over a borrowed string, used for incremental text parsing.
#[derive(Debug, Eq, PartialEq, Copy, Clone)]
pub struct Stream<'a> {
    /// The whole input text.
    text: &'a str,
    /// The current offset into `text`, in bytes.
    pos: usize,
}
115 | |
116 | impl<'a> From<&'a str> for Stream<'a> { |
117 | #[inline ] |
118 | fn from(text: &'a str) -> Self { |
119 | Stream { text, pos: 0 } |
120 | } |
121 | } |
122 | |
123 | impl<'a> Stream<'a> { |
124 | /// Returns the current position in bytes. |
125 | #[inline ] |
126 | pub fn pos(&self) -> usize { |
127 | self.pos |
128 | } |
129 | |
130 | /// Calculates the current position in chars. |
131 | pub fn calc_char_pos(&self) -> usize { |
132 | self.calc_char_pos_at(self.pos) |
133 | } |
134 | |
135 | /// Calculates the current position in chars. |
136 | pub fn calc_char_pos_at(&self, byte_pos: usize) -> usize { |
137 | let mut pos = 1; |
138 | for (idx, _) in self.text.char_indices() { |
139 | if idx >= byte_pos { |
140 | break; |
141 | } |
142 | |
143 | pos += 1; |
144 | } |
145 | |
146 | pos |
147 | } |
148 | |
149 | /// Sets current position equal to the end. |
150 | /// |
151 | /// Used to indicate end of parsing on error. |
152 | #[inline ] |
153 | pub fn jump_to_end(&mut self) { |
154 | self.pos = self.text.len(); |
155 | } |
156 | |
157 | /// Checks if the stream is reached the end. |
158 | /// |
159 | /// Any [`pos()`] value larger than original text length indicates stream end. |
160 | /// |
161 | /// Accessing stream after reaching end via safe methods will produce |
162 | /// an `UnexpectedEndOfStream` error. |
163 | /// |
164 | /// Accessing stream after reaching end via *_unchecked methods will produce |
165 | /// a Rust's bound checking error. |
166 | /// |
167 | /// [`pos()`]: #method.pos |
168 | #[inline ] |
169 | pub fn at_end(&self) -> bool { |
170 | self.pos >= self.text.len() |
171 | } |
172 | |
173 | /// Returns a byte from a current stream position. |
174 | /// |
175 | /// # Errors |
176 | /// |
177 | /// - `UnexpectedEndOfStream` |
178 | #[inline ] |
179 | pub fn curr_byte(&self) -> Result<u8, Error> { |
180 | if self.at_end() { |
181 | return Err(Error::UnexpectedEndOfStream); |
182 | } |
183 | |
184 | Ok(self.curr_byte_unchecked()) |
185 | } |
186 | |
187 | #[inline ] |
188 | pub fn chars(&self) -> std::str::Chars<'a> { |
189 | self.text[self.pos..].chars() |
190 | } |
191 | |
192 | /// Returns a byte from a current stream position. |
193 | /// |
194 | /// # Panics |
195 | /// |
196 | /// - if the current position is after the end of the data |
197 | #[inline ] |
198 | pub fn curr_byte_unchecked(&self) -> u8 { |
199 | self.text.as_bytes()[self.pos] |
200 | } |
201 | |
202 | /// Checks that current byte is equal to provided. |
203 | /// |
204 | /// Returns `false` if no bytes left. |
205 | #[inline ] |
206 | pub fn is_curr_byte_eq(&self, c: u8) -> bool { |
207 | if !self.at_end() { |
208 | self.curr_byte_unchecked() == c |
209 | } else { |
210 | false |
211 | } |
212 | } |
213 | |
214 | /// Returns a next byte from a current stream position. |
215 | /// |
216 | /// # Errors |
217 | /// |
218 | /// - `UnexpectedEndOfStream` |
219 | #[inline ] |
220 | pub fn next_byte(&self) -> Result<u8, Error> { |
221 | if self.pos + 1 >= self.text.len() { |
222 | return Err(Error::UnexpectedEndOfStream); |
223 | } |
224 | |
225 | Ok(self.text.as_bytes()[self.pos + 1]) |
226 | } |
227 | |
228 | /// Advances by `n` bytes. |
229 | #[inline ] |
230 | pub fn advance(&mut self, n: usize) { |
231 | debug_assert!(self.pos + n <= self.text.len()); |
232 | self.pos += n; |
233 | } |
234 | |
235 | /// Skips whitespaces. |
236 | /// |
237 | /// Accepted values: `' ' \n \r \t`. |
238 | pub fn skip_spaces(&mut self) { |
239 | while !self.at_end() && self.curr_byte_unchecked().is_space() { |
240 | self.advance(1); |
241 | } |
242 | } |
243 | |
244 | /// Checks that the stream starts with a selected text. |
245 | /// |
246 | /// We are using `&[u8]` instead of `&str` for performance reasons. |
247 | #[inline ] |
248 | pub fn starts_with(&self, text: &[u8]) -> bool { |
249 | self.text.as_bytes()[self.pos..].starts_with(text) |
250 | } |
251 | |
252 | /// Consumes current byte if it's equal to the provided byte. |
253 | /// |
254 | /// # Errors |
255 | /// |
256 | /// - `InvalidChar` |
257 | /// - `UnexpectedEndOfStream` |
258 | pub fn consume_byte(&mut self, c: u8) -> Result<(), Error> { |
259 | if self.curr_byte()? != c { |
260 | return Err(Error::InvalidChar( |
261 | vec![self.curr_byte_unchecked(), c], |
262 | self.calc_char_pos(), |
263 | )); |
264 | } |
265 | |
266 | self.advance(1); |
267 | Ok(()) |
268 | } |
269 | |
270 | /// Parses a single [ident](https://drafts.csswg.org/css-syntax-3/#typedef-ident-token). |
271 | /// |
272 | /// # Errors |
273 | /// |
274 | /// - `InvalidIdent` |
275 | pub fn parse_ident(&mut self) -> Result<&'a str, Error> { |
276 | let start = self.pos(); |
277 | |
278 | if self.curr_byte() == Ok(b'-' ) { |
279 | self.advance(1); |
280 | } |
281 | |
282 | let mut iter = self.chars(); |
283 | if let Some(c) = iter.next() { |
284 | if c.is_name_start() { |
285 | self.advance(c.len_utf8()); |
286 | } else { |
287 | return Err(Error::InvalidIdent); |
288 | } |
289 | } |
290 | |
291 | for c in iter { |
292 | if c.is_name_char() { |
293 | self.advance(c.len_utf8()); |
294 | } else { |
295 | break; |
296 | } |
297 | } |
298 | |
299 | if start == self.pos() { |
300 | return Err(Error::InvalidIdent); |
301 | } |
302 | |
303 | let name = self.slice_back(start); |
304 | Ok(name) |
305 | } |
306 | |
307 | /// Consumes a single ident consisting of ASCII characters, if available. |
308 | pub fn consume_ascii_ident(&mut self) -> &'a str { |
309 | let start = self.pos; |
310 | self.skip_bytes(|_, c| c.is_ascii_ident()); |
311 | self.slice_back(start) |
312 | } |
313 | |
314 | /// Parses a single [quoted string](https://drafts.csswg.org/css-syntax-3/#typedef-string-token) |
315 | /// |
316 | /// # Errors |
317 | /// |
318 | /// - `UnexpectedEndOfStream` |
319 | /// - `InvalidValue` |
320 | pub fn parse_quoted_string(&mut self) -> Result<&'a str, Error> { |
321 | // Check for opening quote. |
322 | let quote = self.curr_byte()?; |
323 | |
324 | if quote != b' \'' && quote != b'"' { |
325 | return Err(Error::InvalidValue); |
326 | } |
327 | |
328 | let mut prev = quote; |
329 | self.advance(1); |
330 | |
331 | let start = self.pos(); |
332 | |
333 | while !self.at_end() { |
334 | let curr = self.curr_byte_unchecked(); |
335 | |
336 | // Advance until the closing quote. |
337 | if curr == quote { |
338 | // Check for escaped quote. |
339 | if prev != b' \\' { |
340 | break; |
341 | } |
342 | } |
343 | |
344 | prev = curr; |
345 | self.advance(1); |
346 | } |
347 | |
348 | let value = self.slice_back(start); |
349 | |
350 | // Check for closing quote. |
351 | self.consume_byte(quote)?; |
352 | |
353 | Ok(value) |
354 | } |
355 | |
356 | /// Consumes selected string. |
357 | /// |
358 | /// # Errors |
359 | /// |
360 | /// - `InvalidChar` |
361 | /// - `UnexpectedEndOfStream` |
362 | pub fn consume_string(&mut self, text: &[u8]) -> Result<(), Error> { |
363 | if self.at_end() { |
364 | return Err(Error::UnexpectedEndOfStream); |
365 | } |
366 | |
367 | if !self.starts_with(text) { |
368 | let len = std::cmp::min(text.len(), self.text.len() - self.pos); |
369 | // Collect chars and do not slice a string, |
370 | // because the `len` can be on the char boundary. |
371 | // Which lead to a panic. |
372 | let actual = self.text[self.pos..].chars().take(len).collect(); |
373 | |
374 | // Assume that all input `text` are valid UTF-8 strings, so unwrap is safe. |
375 | let expected = std::str::from_utf8(text).unwrap().to_owned(); |
376 | |
377 | return Err(Error::InvalidString( |
378 | vec![actual, expected], |
379 | self.calc_char_pos(), |
380 | )); |
381 | } |
382 | |
383 | self.advance(text.len()); |
384 | Ok(()) |
385 | } |
386 | |
387 | /// Consumes bytes by the predicate and returns them. |
388 | /// |
389 | /// The result can be empty. |
390 | pub fn consume_bytes<F>(&mut self, f: F) -> &'a str |
391 | where |
392 | F: Fn(&Stream, u8) -> bool, |
393 | { |
394 | let start = self.pos(); |
395 | self.skip_bytes(f); |
396 | self.slice_back(start) |
397 | } |
398 | |
399 | /// Consumes bytes by the predicate. |
400 | pub fn skip_bytes<F>(&mut self, f: F) |
401 | where |
402 | F: Fn(&Stream, u8) -> bool, |
403 | { |
404 | while !self.at_end() { |
405 | let c = self.curr_byte_unchecked(); |
406 | if f(self, c) { |
407 | self.advance(1); |
408 | } else { |
409 | break; |
410 | } |
411 | } |
412 | } |
413 | |
414 | /// Slices data from `pos` to the current position. |
415 | #[inline ] |
416 | pub fn slice_back(&self, pos: usize) -> &'a str { |
417 | &self.text[pos..self.pos] |
418 | } |
419 | |
420 | /// Slices data from the current position to the end. |
421 | #[inline ] |
422 | pub fn slice_tail(&self) -> &'a str { |
423 | &self.text[self.pos..] |
424 | } |
425 | |
426 | /// Parses integer number from the stream. |
427 | /// |
428 | /// Same as [`parse_number()`], but only for integer. Does not refer to any SVG type. |
429 | /// |
430 | /// [`parse_number()`]: #method.parse_number |
431 | pub fn parse_integer(&mut self) -> Result<i32, Error> { |
432 | self.skip_spaces(); |
433 | |
434 | if self.at_end() { |
435 | return Err(Error::InvalidNumber(self.calc_char_pos())); |
436 | } |
437 | |
438 | let start = self.pos(); |
439 | |
440 | // Consume sign. |
441 | if self.curr_byte()?.is_sign() { |
442 | self.advance(1); |
443 | } |
444 | |
445 | // The current char must be a digit. |
446 | if !self.curr_byte()?.is_digit() { |
447 | return Err(Error::InvalidNumber(self.calc_char_pos_at(start))); |
448 | } |
449 | |
450 | self.skip_digits(); |
451 | |
452 | // Use the default i32 parser now. |
453 | let s = self.slice_back(start); |
454 | match i32::from_str(s) { |
455 | Ok(n) => Ok(n), |
456 | Err(_) => Err(Error::InvalidNumber(self.calc_char_pos_at(start))), |
457 | } |
458 | } |
459 | |
460 | /// Parses integer from a list of numbers. |
461 | pub fn parse_list_integer(&mut self) -> Result<i32, Error> { |
462 | if self.at_end() { |
463 | return Err(Error::UnexpectedEndOfStream); |
464 | } |
465 | |
466 | let n = self.parse_integer()?; |
467 | self.skip_spaces(); |
468 | self.parse_list_separator(); |
469 | Ok(n) |
470 | } |
471 | |
472 | /// Parses number or percent from the stream. |
473 | /// |
474 | /// Percent value will be normalized. |
475 | pub fn parse_number_or_percent(&mut self) -> Result<f64, Error> { |
476 | self.skip_spaces(); |
477 | |
478 | let n = self.parse_number()?; |
479 | if self.starts_with(b"%" ) { |
480 | self.advance(1); |
481 | Ok(n / 100.0) |
482 | } else { |
483 | Ok(n) |
484 | } |
485 | } |
486 | |
487 | /// Parses number or percent from a list of numbers and/or percents. |
488 | pub fn parse_list_number_or_percent(&mut self) -> Result<f64, Error> { |
489 | if self.at_end() { |
490 | return Err(Error::UnexpectedEndOfStream); |
491 | } |
492 | |
493 | let l = self.parse_number_or_percent()?; |
494 | self.skip_spaces(); |
495 | self.parse_list_separator(); |
496 | Ok(l) |
497 | } |
498 | |
499 | /// Skips digits. |
500 | pub fn skip_digits(&mut self) { |
501 | self.skip_bytes(|_, c| c.is_digit()); |
502 | } |
503 | |
504 | #[inline ] |
505 | pub(crate) fn parse_list_separator(&mut self) { |
506 | if self.is_curr_byte_eq(b',' ) { |
507 | self.advance(1); |
508 | } |
509 | } |
510 | } |
511 | |
#[rustfmt::skip]
#[cfg(test)]
mod tests {
    use super::*;

    /// A plain decimal literal is parsed to its value.
    #[test]
    fn parse_integer_1() {
        let mut stream = Stream::from("10");
        let parsed = stream.parse_integer();
        assert_eq!(parsed.unwrap(), 10);
    }

    /// A literal that does not fit into `i32` is reported as an invalid
    /// number located at the start of the literal.
    #[test]
    fn parse_err_integer_1() {
        let mut stream = Stream::from("10000000000000");
        let message = stream.parse_integer().unwrap_err().to_string();
        assert_eq!(message, "invalid number at position 1");
    }
}
531 | |