1 | use core::ops::Range; |
2 | use core::str; |
3 | |
4 | use crate::{Error, TextPos}; |
5 | |
/// Result type used by the tokenizer functions in this file, with [`Error`] as the error type.
type Result<T> = core::result::Result<T, Error>;
7 | |
/// Extension methods for XML-subset only operations.
trait XmlCharExt {
    /// Checks if the value is within the
    /// [NameStartChar](https://www.w3.org/TR/xml/#NT-NameStartChar) range.
    fn is_xml_name_start(&self) -> bool;

    /// Checks if the value is within the
    /// [NameChar](https://www.w3.org/TR/xml/#NT-NameChar) range.
    fn is_xml_name(&self) -> bool;

    /// Checks if the value is within the
    /// [Char](https://www.w3.org/TR/xml/#NT-Char) range.
    fn is_xml_char(&self) -> bool;
}

impl XmlCharExt for char {
    #[inline]
    fn is_xml_name_start(&self) -> bool {
        // Check for ASCII first.
        // `is_ascii()` covers exactly U+0000..=U+007F, so the `as u8` cast is lossless.
        if self.is_ascii() {
            return matches!(*self as u8, b'A'..=b'Z' | b'a'..=b'z' | b':' | b'_');
        }

        matches!(*self as u32,
            0x0000C0..=0x0000D6
            | 0x0000D8..=0x0000F6
            | 0x0000F8..=0x0002FF
            | 0x000370..=0x00037D
            | 0x00037F..=0x001FFF
            | 0x00200C..=0x00200D
            | 0x002070..=0x00218F
            | 0x002C00..=0x002FEF
            | 0x003001..=0x00D7FF
            | 0x00F900..=0x00FDCF
            | 0x00FDF0..=0x00FFFD
            | 0x010000..=0x0EFFFF)
    }

    #[inline]
    fn is_xml_name(&self) -> bool {
        // Check for ASCII first.
        // This is the same ASCII set as `XmlByteExt::is_xml_name`; keep the two in sync.
        if self.is_ascii() {
            return matches!(
                *self as u8,
                b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b':' | b'_' | b'-' | b'.'
            );
        }

        matches!(*self as u32, 0x0000B7
            | 0x0000C0..=0x0000D6
            | 0x0000D8..=0x0000F6
            | 0x0000F8..=0x0002FF
            | 0x000300..=0x00036F
            | 0x000370..=0x00037D
            | 0x00037F..=0x001FFF
            | 0x00200C..=0x00200D
            | 0x00203F..=0x002040
            | 0x002070..=0x00218F
            | 0x002C00..=0x002FEF
            | 0x003001..=0x00D7FF
            | 0x00F900..=0x00FDCF
            | 0x00FDF0..=0x00FFFD
            | 0x010000..=0x0EFFFF)
    }

    #[inline]
    fn is_xml_char(&self) -> bool {
        // Does not check for surrogate code points U+D800-U+DFFF,
        // since that check was performed by Rust when the `&str` was constructed.
        let code = *self as u32;
        if code < 0x20 {
            // Below U+0020 only tab, line feed and carriage return are allowed.
            return matches!(code, 0x09 | 0x0A | 0x0D);
        }

        !matches!(code, 0xFFFE | 0xFFFF)
    }
}
81 | |
/// Extension methods for classifying single bytes of an XML document.
trait XmlByteExt {
    /// Checks if byte is a space.
    ///
    /// `[ \r\n\t]`
    fn is_xml_space(&self) -> bool;

    /// Checks if byte is within the ASCII part of the
    /// [NameChar](https://www.w3.org/TR/xml/#NT-NameChar) range.
    fn is_xml_name(&self) -> bool;
}

impl XmlByteExt for u8 {
    #[inline]
    fn is_xml_space(&self) -> bool {
        matches!(*self, b' ' | b'\t' | b'\n' | b'\r')
    }

    #[inline]
    fn is_xml_name(&self) -> bool {
        matches!(
            *self,
            b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b':' | b'_' | b'-' | b'.'
        )
    }
}
104 | |
/// A borrowed piece of the input together with its byte offset.
///
/// Behaves like a `&str`, but additionally remembers where in the input XML
/// the text starts.
#[must_use]
#[derive(Clone, Copy)]
pub struct StrSpan<'input> {
    /// The text covered by the span.
    text: &'input str,
    /// Byte offset of `text` inside the string it was sliced from.
    start: usize,
}

impl<'input> From<&'input str> for StrSpan<'input> {
    /// Wraps a whole string; the resulting span starts at offset 0.
    #[inline]
    fn from(text: &'input str) -> Self {
        Self { start: 0, text }
    }
}

impl<'input> StrSpan<'input> {
    /// Creates a span over `text[start..end]` that remembers `start` as its offset.
    ///
    /// Slicing panics when the bounds are out of range or not on char boundaries.
    #[inline]
    pub fn from_substr(text: &str, start: usize, end: usize) -> StrSpan {
        debug_assert!(start <= end);
        let fragment = &text[start..end];
        StrSpan {
            text: fragment,
            start,
        }
    }

    /// Returns the byte range this span occupies in the string it was sliced from.
    #[inline]
    pub fn range(&self) -> Range<usize> {
        let end = self.start + self.text.len();
        self.start..end
    }

    /// Returns the text covered by the span.
    #[inline]
    pub fn as_str(&self) -> &'input str {
        self.text
    }

    /// Returns `start..end` of the span's own text (offsets are relative to the span).
    #[inline]
    fn slice_region(&self, start: usize, end: usize) -> &'input str {
        let text: &'input str = self.text;
        &text[start..end]
    }
}
148 | |
149 | pub enum Token<'input> { |
150 | // <?target content?> |
151 | ProcessingInstruction(&'input str, Option<&'input str>, Range<usize>), |
152 | |
153 | // <!-- text --> |
154 | Comment(&'input str, Range<usize>), |
155 | |
156 | // <!ENTITY ns_extend "http://test.com"> |
157 | EntityDeclaration(&'input str, StrSpan<'input>), |
158 | |
159 | // <ns:elem |
160 | ElementStart(&'input str, &'input str, usize), |
161 | |
162 | // ns:attr="value" |
163 | Attribute(usize, &'input str, &'input str, StrSpan<'input>), |
164 | |
165 | ElementEnd(ElementEnd<'input>, Range<usize>), |
166 | |
167 | // Contains text between elements including whitespaces. |
168 | // Basically everything between `>` and `<`. |
169 | // Except `]]>`, which is not allowed and will lead to an error. |
170 | Text(&'input str, Range<usize>), |
171 | |
172 | // <![CDATA[text]]> |
173 | Cdata(&'input str, Range<usize>), |
174 | } |
175 | |
/// The way an element tag is terminated (payload of the `ElementEnd` token).
#[derive(Clone, Copy)]
pub enum ElementEnd<'input> {
    /// `>`: the start tag is finished and element content follows.
    Open,
    /// `</ns:name>`: a closing tag, carrying its prefix and local name.
    Close(&'input str, &'input str),
    /// `/>`: the element has no content.
    Empty,
}
186 | |
187 | pub trait XmlEvents<'input> { |
188 | fn token(&mut self, token: Token<'input>) -> Result<()>; |
189 | } |
190 | |
191 | // document ::= prolog element Misc* |
192 | pub fn parse<'input>( |
193 | text: &'input str, |
194 | allow_dtd: bool, |
195 | events: &mut dyn XmlEvents<'input>, |
196 | ) -> Result<()> { |
197 | let s = &mut Stream::new(text); |
198 | |
199 | // Skip UTF-8 BOM. |
200 | if s.starts_with(&[0xEF, 0xBB, 0xBF]) { |
201 | s.advance(3); |
202 | } |
203 | |
204 | if s.starts_with(b"<?xml " ) { |
205 | parse_declaration(s)?; |
206 | } |
207 | |
208 | parse_misc(s, events)?; |
209 | |
210 | s.skip_spaces(); |
211 | if s.starts_with(b"<!DOCTYPE" ) { |
212 | if !allow_dtd { |
213 | return Err(Error::DtdDetected); |
214 | } |
215 | |
216 | parse_doctype(s, events)?; |
217 | parse_misc(s, events)?; |
218 | } |
219 | |
220 | s.skip_spaces(); |
221 | if s.curr_byte().ok() == Some(b'<' ) { |
222 | parse_element(s, events)?; |
223 | } |
224 | |
225 | parse_misc(s, events)?; |
226 | |
227 | if !s.at_end() { |
228 | return Err(Error::UnknownToken(s.gen_text_pos())); |
229 | } |
230 | |
231 | Ok(()) |
232 | } |
233 | |
234 | // Misc ::= Comment | PI | S |
235 | fn parse_misc<'input>(s: &mut Stream<'input>, events: &mut dyn XmlEvents<'input>) -> Result<()> { |
236 | while !s.at_end() { |
237 | s.skip_spaces(); |
238 | if s.starts_with(text:b"<!--" ) { |
239 | parse_comment(s, events)?; |
240 | } else if s.starts_with(text:b"<?" ) { |
241 | parse_pi(s, events)?; |
242 | } else { |
243 | break; |
244 | } |
245 | } |
246 | |
247 | Ok(()) |
248 | } |
249 | |
250 | // XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>' |
251 | // |
252 | // We don't actually return a token for the XML declaration and only validate it. |
253 | fn parse_declaration(s: &mut Stream) -> Result<()> { |
254 | fn consume_spaces(s: &mut Stream) -> Result<()> { |
255 | if s.starts_with_space() { |
256 | s.skip_spaces(); |
257 | } else if !s.starts_with(b"?>" ) && !s.at_end() { |
258 | return Err(Error::InvalidChar2( |
259 | "a whitespace" , |
260 | s.curr_byte_unchecked(), |
261 | s.gen_text_pos(), |
262 | )); |
263 | } |
264 | |
265 | Ok(()) |
266 | } |
267 | |
268 | s.advance(5); // <?xml |
269 | consume_spaces(s)?; |
270 | |
271 | // The `version` "attribute" is mandatory. |
272 | if !s.starts_with(b"version" ) { |
273 | // Will trigger the InvalidString error, which is what we want. |
274 | return s.skip_string(b"version" ); |
275 | } |
276 | let _ = parse_attribute(s)?; |
277 | consume_spaces(s)?; |
278 | |
279 | if s.starts_with(b"encoding" ) { |
280 | let _ = parse_attribute(s)?; |
281 | consume_spaces(s)?; |
282 | } |
283 | |
284 | if s.starts_with(b"standalone" ) { |
285 | let _ = parse_attribute(s)?; |
286 | } |
287 | |
288 | s.skip_spaces(); |
289 | s.skip_string(b"?>" )?; |
290 | |
291 | Ok(()) |
292 | } |
293 | |
294 | // '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->' |
295 | fn parse_comment<'input>(s: &mut Stream<'input>, events: &mut dyn XmlEvents<'input>) -> Result<()> { |
296 | let start: usize = s.pos(); |
297 | s.advance(4); |
298 | let text: &str = s.consume_chars(|s: &Stream<'_>, c: char| !(c == '-' && s.starts_with(text:b"-->" )))?; |
299 | s.skip_string(text:b"-->" )?; |
300 | |
301 | if text.contains("--" ) { |
302 | return Err(Error::InvalidComment(s.gen_text_pos_from(pos:start))); |
303 | } |
304 | |
305 | if text.ends_with('-' ) { |
306 | return Err(Error::InvalidComment(s.gen_text_pos_from(pos:start))); |
307 | } |
308 | |
309 | let range: Range = s.range_from(start); |
310 | events.token(Token::Comment(text, range))?; |
311 | |
312 | Ok(()) |
313 | } |
314 | |
315 | // PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>' |
316 | // PITarget ::= Name - (('X' | 'x') ('M' | 'm') ('L' | 'l')) |
317 | fn parse_pi<'input>(s: &mut Stream<'input>, events: &mut dyn XmlEvents<'input>) -> Result<()> { |
318 | if s.starts_with(text:b"<?xml " ) { |
319 | return Err(Error::UnexpectedDeclaration(s.gen_text_pos())); |
320 | } |
321 | |
322 | let start: usize = s.pos(); |
323 | s.advance(2); |
324 | let target: &str = s.consume_name()?; |
325 | s.skip_spaces(); |
326 | let content: &str = s.consume_chars(|s: &Stream<'_>, c: char| !(c == '?' && s.starts_with(text:b"?>" )))?; |
327 | let content: Option<&str> = if !content.is_empty() { |
328 | Some(content) |
329 | } else { |
330 | None |
331 | }; |
332 | |
333 | s.skip_string(text:b"?>" )?; |
334 | |
335 | let range: Range = s.range_from(start); |
336 | events.token(Token::ProcessingInstruction(target, content, range))?; |
337 | Ok(()) |
338 | } |
339 | |
340 | fn parse_doctype<'input>(s: &mut Stream<'input>, events: &mut dyn XmlEvents<'input>) -> Result<()> { |
341 | let start = s.pos(); |
342 | parse_doctype_start(s)?; |
343 | s.skip_spaces(); |
344 | |
345 | if s.curr_byte() == Ok(b'>' ) { |
346 | s.advance(1); |
347 | return Ok(()); |
348 | } |
349 | |
350 | s.advance(1); // [ |
351 | while !s.at_end() { |
352 | s.skip_spaces(); |
353 | if s.starts_with(b"<!ENTITY" ) { |
354 | parse_entity_decl(s, events)?; |
355 | } else if s.starts_with(b"<!--" ) { |
356 | parse_comment(s, events)?; |
357 | } else if s.starts_with(b"<?" ) { |
358 | parse_pi(s, events)?; |
359 | } else if s.starts_with(b"]" ) { |
360 | // DTD ends with ']' S? '>', therefore we have to skip possible spaces. |
361 | s.advance(1); |
362 | s.skip_spaces(); |
363 | match s.curr_byte() { |
364 | Ok(b'>' ) => { |
365 | s.advance(1); |
366 | break; |
367 | } |
368 | Ok(c) => { |
369 | return Err(Error::InvalidChar2("'>'" , c, s.gen_text_pos())); |
370 | } |
371 | Err(_) => { |
372 | return Err(Error::UnexpectedEndOfStream); |
373 | } |
374 | } |
375 | } else if s.starts_with(b"<!ELEMENT" ) |
376 | || s.starts_with(b"<!ATTLIST" ) |
377 | || s.starts_with(b"<!NOTATION" ) |
378 | { |
379 | if consume_decl(s).is_err() { |
380 | let pos = s.gen_text_pos_from(start); |
381 | return Err(Error::UnknownToken(pos)); |
382 | } |
383 | } else { |
384 | return Err(Error::UnknownToken(s.gen_text_pos())); |
385 | } |
386 | } |
387 | |
388 | Ok(()) |
389 | } |
390 | |
391 | // doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? ('[' intSubset ']' S?)? '>' |
392 | fn parse_doctype_start(s: &mut Stream) -> Result<()> { |
393 | s.advance(9); |
394 | |
395 | s.consume_spaces()?; |
396 | s.skip_name()?; |
397 | s.skip_spaces(); |
398 | |
399 | let _ = parse_external_id(s)?; |
400 | s.skip_spaces(); |
401 | |
402 | let c: u8 = s.curr_byte()?; |
403 | if c != b'[' && c != b'>' { |
404 | return Err(Error::InvalidChar2("'[' or '>'" , c, s.gen_text_pos())); |
405 | } |
406 | |
407 | Ok(()) |
408 | } |
409 | |
410 | // ExternalID ::= 'SYSTEM' S SystemLiteral | 'PUBLIC' S PubidLiteral S SystemLiteral |
411 | fn parse_external_id(s: &mut Stream) -> Result<bool> { |
412 | let v = if s.starts_with(b"SYSTEM" ) || s.starts_with(b"PUBLIC" ) { |
413 | let start = s.pos(); |
414 | s.advance(6); |
415 | let id = s.slice_back(start); |
416 | |
417 | s.consume_spaces()?; |
418 | let quote = s.consume_quote()?; |
419 | let _ = s.consume_bytes(|c| c != quote); |
420 | s.consume_byte(quote)?; |
421 | |
422 | if id == "SYSTEM" { |
423 | // Ok |
424 | } else { |
425 | s.consume_spaces()?; |
426 | let quote = s.consume_quote()?; |
427 | let _ = s.consume_bytes(|c| c != quote); |
428 | s.consume_byte(quote)?; |
429 | } |
430 | |
431 | true |
432 | } else { |
433 | false |
434 | }; |
435 | |
436 | Ok(v) |
437 | } |
438 | |
439 | // EntityDecl ::= GEDecl | PEDecl |
440 | // GEDecl ::= '<!ENTITY' S Name S EntityDef S? '>' |
441 | // PEDecl ::= '<!ENTITY' S '%' S Name S PEDef S? '>' |
442 | fn parse_entity_decl<'input>( |
443 | s: &mut Stream<'input>, |
444 | events: &mut dyn XmlEvents<'input>, |
445 | ) -> Result<()> { |
446 | s.advance(8); |
447 | s.consume_spaces()?; |
448 | |
449 | let is_ge: bool = if s.try_consume_byte(b'%' ) { |
450 | s.consume_spaces()?; |
451 | false |
452 | } else { |
453 | true |
454 | }; |
455 | |
456 | let name: &str = s.consume_name()?; |
457 | s.consume_spaces()?; |
458 | if let Some(definition: StrSpan<'_>) = parse_entity_def(s, is_ge)? { |
459 | events.token(Token::EntityDeclaration(name, definition))?; |
460 | } |
461 | s.skip_spaces(); |
462 | s.consume_byte(b'>' )?; |
463 | |
464 | Ok(()) |
465 | } |
466 | |
467 | // EntityDef ::= EntityValue | (ExternalID NDataDecl?) |
468 | // PEDef ::= EntityValue | ExternalID |
469 | // EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"' | "'" ([^%&'] |
470 | // | PEReference | Reference)* "'" |
471 | // ExternalID ::= 'SYSTEM' S SystemLiteral | 'PUBLIC' S PubidLiteral S SystemLiteral |
472 | // NDataDecl ::= S 'NDATA' S Name |
473 | fn parse_entity_def<'input>( |
474 | s: &mut Stream<'input>, |
475 | is_ge: bool, |
476 | ) -> Result<Option<StrSpan<'input>>> { |
477 | let c = s.curr_byte()?; |
478 | match c { |
479 | b'"' | b' \'' => { |
480 | let quote = s.consume_quote()?; |
481 | let start = s.pos(); |
482 | s.skip_bytes(|c| c != quote); |
483 | let value = s.slice_back_span(start); |
484 | s.consume_byte(quote)?; |
485 | Ok(Some(value)) |
486 | } |
487 | b'S' | b'P' => { |
488 | if parse_external_id(s)? { |
489 | if is_ge { |
490 | s.skip_spaces(); |
491 | if s.starts_with(b"NDATA" ) { |
492 | s.advance(5); |
493 | s.consume_spaces()?; |
494 | s.skip_name()?; |
495 | // TODO: NDataDecl is not supported |
496 | } |
497 | } |
498 | |
499 | Ok(None) |
500 | } else { |
501 | Err(Error::InvalidExternalID(s.gen_text_pos())) |
502 | } |
503 | } |
504 | _ => { |
505 | let pos = s.gen_text_pos(); |
506 | Err(Error::InvalidChar2("a quote, SYSTEM or PUBLIC" , c, pos)) |
507 | } |
508 | } |
509 | } |
510 | |
511 | fn consume_decl(s: &mut Stream) -> Result<()> { |
512 | s.skip_bytes(|c: u8| c != b'>' ); |
513 | s.consume_byte(b'>' )?; |
514 | Ok(()) |
515 | } |
516 | |
517 | // element ::= EmptyElemTag | STag content ETag |
518 | // '<' Name (S Attribute)* S? '>' |
519 | fn parse_element<'input>(s: &mut Stream<'input>, events: &mut dyn XmlEvents<'input>) -> Result<()> { |
520 | let start = s.pos(); |
521 | s.advance(1); // < |
522 | let (prefix, local) = s.consume_qname()?; |
523 | events.token(Token::ElementStart(prefix, local, start))?; |
524 | |
525 | let mut open = false; |
526 | while !s.at_end() { |
527 | let has_space = s.starts_with_space(); |
528 | s.skip_spaces(); |
529 | let start = s.pos(); |
530 | match s.curr_byte()? { |
531 | b'/' => { |
532 | s.advance(1); |
533 | s.consume_byte(b'>' )?; |
534 | let range = s.range_from(start); |
535 | events.token(Token::ElementEnd(ElementEnd::Empty, range))?; |
536 | break; |
537 | } |
538 | b'>' => { |
539 | s.advance(1); |
540 | let range = s.range_from(start); |
541 | events.token(Token::ElementEnd(ElementEnd::Open, range))?; |
542 | open = true; |
543 | break; |
544 | } |
545 | _ => { |
546 | // An attribute must be preceded with a whitespace. |
547 | if !has_space { |
548 | // Will always trigger an error. Which is what we want. |
549 | s.consume_spaces()?; |
550 | } |
551 | |
552 | // Manual inlining of `parse_attribute` for performance. |
553 | // We cannot mark `parse_attribute` as `#[inline(always)]` |
554 | // because it will blow up the binary size. |
555 | let (prefix, local) = s.consume_qname()?; |
556 | s.consume_eq()?; |
557 | let quote = s.consume_quote()?; |
558 | let quote_c = quote as char; |
559 | // The attribute value must not contain the < character. |
560 | let value_start = s.pos(); |
561 | s.skip_chars(|_, c| c != quote_c && c != '<' )?; |
562 | let value = s.slice_back_span(value_start); |
563 | s.consume_byte(quote)?; |
564 | events.token(Token::Attribute(start, prefix, local, value))?; |
565 | } |
566 | } |
567 | } |
568 | |
569 | if open { |
570 | parse_content(s, events)?; |
571 | } |
572 | |
573 | Ok(()) |
574 | } |
575 | |
576 | // Attribute ::= Name Eq AttValue |
577 | fn parse_attribute<'input>( |
578 | s: &mut Stream<'input>, |
579 | ) -> Result<(&'input str, &'input str, StrSpan<'input>)> { |
580 | let (prefix: &str, local: &str) = s.consume_qname()?; |
581 | s.consume_eq()?; |
582 | let quote: u8 = s.consume_quote()?; |
583 | let quote_c: char = quote as char; |
584 | // The attribute value must not contain the < character. |
585 | let value_start: usize = s.pos(); |
586 | s.skip_chars(|_, c: char| c != quote_c && c != '<' )?; |
587 | let value: StrSpan<'_> = s.slice_back_span(pos:value_start); |
588 | s.consume_byte(quote)?; |
589 | Ok((prefix, local, value)) |
590 | } |
591 | |
592 | // content ::= CharData? ((element | Reference | CDSect | PI | Comment) CharData?)* |
593 | pub fn parse_content<'input>( |
594 | s: &mut Stream<'input>, |
595 | events: &mut dyn XmlEvents<'input>, |
596 | ) -> Result<()> { |
597 | while !s.at_end() { |
598 | match s.curr_byte() { |
599 | Ok(b'<' ) => match s.next_byte() { |
600 | Ok(b'!' ) => { |
601 | if s.starts_with(b"<!--" ) { |
602 | parse_comment(s, events)?; |
603 | } else if s.starts_with(b"<![CDATA[" ) { |
604 | parse_cdata(s, events)?; |
605 | } else { |
606 | return Err(Error::UnknownToken(s.gen_text_pos())); |
607 | } |
608 | } |
609 | Ok(b'?' ) => parse_pi(s, events)?, |
610 | Ok(b'/' ) => { |
611 | parse_close_element(s, events)?; |
612 | break; |
613 | } |
614 | Ok(_) => parse_element(s, events)?, |
615 | Err(_) => return Err(Error::UnknownToken(s.gen_text_pos())), |
616 | }, |
617 | Ok(_) => parse_text(s, events)?, |
618 | Err(_) => return Err(Error::UnknownToken(s.gen_text_pos())), |
619 | } |
620 | } |
621 | |
622 | Ok(()) |
623 | } |
624 | |
625 | // CDSect ::= CDStart CData CDEnd |
626 | // CDStart ::= '<![CDATA[' |
627 | // CData ::= (Char* - (Char* ']]>' Char*)) |
628 | // CDEnd ::= ']]>' |
629 | fn parse_cdata<'input>(s: &mut Stream<'input>, events: &mut dyn XmlEvents<'input>) -> Result<()> { |
630 | let start: usize = s.pos(); |
631 | s.advance(9); // <![CDATA[ |
632 | let text: &str = s.consume_chars(|s: &Stream<'_>, c: char| !(c == ']' && s.starts_with(text:b"]]>" )))?; |
633 | s.skip_string(text:b"]]>" )?; |
634 | let range: Range = s.range_from(start); |
635 | events.token(Token::Cdata(text, range))?; |
636 | Ok(()) |
637 | } |
638 | |
639 | // '</' Name S? '>' |
640 | fn parse_close_element<'input>( |
641 | s: &mut Stream<'input>, |
642 | events: &mut dyn XmlEvents<'input>, |
643 | ) -> Result<()> { |
644 | let start: usize = s.pos(); |
645 | s.advance(2); // </ |
646 | |
647 | let (prefix: &str, tag_name: &str) = s.consume_qname()?; |
648 | s.skip_spaces(); |
649 | s.consume_byte(b'>' )?; |
650 | |
651 | let range: Range = s.range_from(start); |
652 | events.token(Token::ElementEnd( |
653 | ElementEnd::Close(prefix, tag_name), |
654 | range, |
655 | ))?; |
656 | Ok(()) |
657 | } |
658 | |
659 | fn parse_text<'input>(s: &mut Stream<'input>, events: &mut dyn XmlEvents<'input>) -> Result<()> { |
660 | let start: usize = s.pos(); |
661 | let text: &str = s.consume_chars(|_, c: char| c != '<' )?; |
662 | |
663 | // According to the spec, `]]>` must not appear inside a Text node. |
664 | // https://www.w3.org/TR/xml/#syntax |
665 | // |
666 | // Search for `>` first, since it's a bit faster than looking for `]]>`. |
667 | if text.contains('>' ) && text.contains("]]>" ) { |
668 | return Err(Error::InvalidCharacterData(s.gen_text_pos())); |
669 | } |
670 | |
671 | let range: Range = s.range_from(start); |
672 | events.token(Token::Text(text, range))?; |
673 | Ok(()) |
674 | } |
675 | |
/// A parsed [Reference](https://www.w3.org/TR/xml/#NT-Reference).
#[derive(Clone, Copy)]
pub enum Reference<'input> {
    /// A reference to a named entity; holds the entity name.
    ///
    /// <https://www.w3.org/TR/xml/#NT-EntityRef>
    Entity(&'input str),

    /// A reference that resolves to a single character.
    ///
    /// <https://www.w3.org/TR/xml/#NT-CharRef>
    Char(char),
}
689 | |
690 | #[derive (Clone)] |
691 | pub struct Stream<'input> { |
692 | pos: usize, |
693 | end: usize, |
694 | span: StrSpan<'input>, |
695 | } |
696 | |
697 | impl<'input> Stream<'input> { |
698 | #[inline ] |
699 | pub fn new(text: &'input str) -> Self { |
700 | Stream { |
701 | pos: 0, |
702 | end: text.len(), |
703 | span: text.into(), |
704 | } |
705 | } |
706 | |
707 | #[inline ] |
708 | pub fn from_substr(text: &'input str, fragment: Range<usize>) -> Self { |
709 | Stream { |
710 | pos: fragment.start, |
711 | end: fragment.end, |
712 | span: text.into(), |
713 | } |
714 | } |
715 | |
716 | #[inline ] |
717 | pub fn pos(&self) -> usize { |
718 | self.pos |
719 | } |
720 | |
721 | #[inline ] |
722 | pub fn at_end(&self) -> bool { |
723 | self.pos >= self.end |
724 | } |
725 | |
726 | #[inline ] |
727 | pub fn curr_byte(&self) -> Result<u8> { |
728 | if self.at_end() { |
729 | return Err(Error::UnexpectedEndOfStream); |
730 | } |
731 | |
732 | Ok(self.curr_byte_unchecked()) |
733 | } |
734 | |
735 | #[inline ] |
736 | pub fn curr_byte_unchecked(&self) -> u8 { |
737 | self.span.text.as_bytes()[self.pos] |
738 | } |
739 | |
740 | #[inline ] |
741 | fn next_byte(&self) -> Result<u8> { |
742 | if self.pos + 1 >= self.end { |
743 | return Err(Error::UnexpectedEndOfStream); |
744 | } |
745 | |
746 | Ok(self.span.as_str().as_bytes()[self.pos + 1]) |
747 | } |
748 | |
749 | #[inline ] |
750 | pub fn advance(&mut self, n: usize) { |
751 | debug_assert!(self.pos + n <= self.end); |
752 | self.pos += n; |
753 | } |
754 | |
755 | #[inline ] |
756 | fn starts_with(&self, text: &[u8]) -> bool { |
757 | self.span.text.as_bytes()[self.pos..self.end].starts_with(text) |
758 | } |
759 | |
760 | fn consume_byte(&mut self, c: u8) -> Result<()> { |
761 | let curr = self.curr_byte()?; |
762 | if curr != c { |
763 | return Err(Error::InvalidChar(c, curr, self.gen_text_pos())); |
764 | } |
765 | |
766 | self.advance(1); |
767 | Ok(()) |
768 | } |
769 | |
770 | // Unlike `consume_byte()` will not return any errors. |
771 | fn try_consume_byte(&mut self, c: u8) -> bool { |
772 | match self.curr_byte() { |
773 | Ok(b) if b == c => { |
774 | self.advance(1); |
775 | true |
776 | } |
777 | _ => false, |
778 | } |
779 | } |
780 | |
781 | fn skip_string(&mut self, text: &'static [u8]) -> Result<()> { |
782 | if !self.starts_with(text) { |
783 | let pos = self.gen_text_pos(); |
784 | |
785 | // Assume that all input `text` are valid UTF-8 strings, so unwrap is safe. |
786 | let expected = str::from_utf8(text).unwrap(); |
787 | |
788 | return Err(Error::InvalidString(expected, pos)); |
789 | } |
790 | |
791 | self.advance(text.len()); |
792 | Ok(()) |
793 | } |
794 | |
795 | #[inline ] |
796 | fn consume_bytes<F: Fn(u8) -> bool>(&mut self, f: F) -> &'input str { |
797 | let start = self.pos; |
798 | self.skip_bytes(f); |
799 | self.slice_back(start) |
800 | } |
801 | |
802 | fn skip_bytes<F: Fn(u8) -> bool>(&mut self, f: F) { |
803 | while !self.at_end() && f(self.curr_byte_unchecked()) { |
804 | self.advance(1); |
805 | } |
806 | } |
807 | |
808 | #[inline ] |
809 | fn consume_chars<F>(&mut self, f: F) -> Result<&'input str> |
810 | where |
811 | F: Fn(&Stream, char) -> bool, |
812 | { |
813 | let start = self.pos; |
814 | self.skip_chars(f)?; |
815 | Ok(self.slice_back(start)) |
816 | } |
817 | |
818 | #[inline ] |
819 | fn skip_chars<F>(&mut self, f: F) -> Result<()> |
820 | where |
821 | F: Fn(&Stream, char) -> bool, |
822 | { |
823 | for c in self.chars() { |
824 | if !c.is_xml_char() { |
825 | return Err(Error::NonXmlChar(c, self.gen_text_pos())); |
826 | } else if f(self, c) { |
827 | self.advance(c.len_utf8()); |
828 | } else { |
829 | break; |
830 | } |
831 | } |
832 | |
833 | Ok(()) |
834 | } |
835 | |
836 | #[inline ] |
837 | fn chars(&self) -> str::Chars<'input> { |
838 | self.span.as_str()[self.pos..self.end].chars() |
839 | } |
840 | |
841 | #[inline ] |
842 | fn slice_back(&self, pos: usize) -> &'input str { |
843 | self.span.slice_region(pos, self.pos) |
844 | } |
845 | |
846 | #[inline ] |
847 | fn slice_back_span(&self, pos: usize) -> StrSpan<'input> { |
848 | StrSpan::from_substr(self.span.text, pos, self.pos) |
849 | } |
850 | |
851 | #[inline ] |
852 | fn range_from(&self, start: usize) -> Range<usize> { |
853 | start..self.pos |
854 | } |
855 | |
856 | #[inline ] |
857 | fn skip_spaces(&mut self) { |
858 | while self.starts_with_space() { |
859 | self.advance(1); |
860 | } |
861 | } |
862 | |
863 | #[inline ] |
864 | fn starts_with_space(&self) -> bool { |
865 | !self.at_end() && self.curr_byte_unchecked().is_xml_space() |
866 | } |
867 | |
868 | // Like `skip_spaces()`, but checks that first char is actually a space. |
869 | fn consume_spaces(&mut self) -> Result<()> { |
870 | if self.at_end() { |
871 | return Err(Error::UnexpectedEndOfStream); |
872 | } |
873 | |
874 | if !self.starts_with_space() { |
875 | return Err(Error::InvalidChar2( |
876 | "a whitespace" , |
877 | self.curr_byte_unchecked(), |
878 | self.gen_text_pos(), |
879 | )); |
880 | } |
881 | |
882 | self.skip_spaces(); |
883 | Ok(()) |
884 | } |
885 | |
886 | /// Consumes according to: <https://www.w3.org/TR/xml/#NT-Reference> |
887 | pub fn try_consume_reference(&mut self) -> Option<Reference<'input>> { |
888 | let start = self.pos(); |
889 | |
890 | // Consume reference on a substream. |
891 | let mut s = self.clone(); |
892 | let result = s.consume_reference()?; |
893 | |
894 | // If the current data is a reference than advance the current stream |
895 | // by number of bytes read by substream. |
896 | self.advance(s.pos() - start); |
897 | Some(result) |
898 | } |
899 | |
900 | #[inline (never)] |
901 | fn consume_reference(&mut self) -> Option<Reference<'input>> { |
902 | if !self.try_consume_byte(b'&' ) { |
903 | return None; |
904 | } |
905 | |
906 | let reference = if self.try_consume_byte(b'#' ) { |
907 | let (value, radix) = if self.try_consume_byte(b'x' ) { |
908 | let value = |
909 | self.consume_bytes(|c| matches!(c, b'0' ..=b'9' | b'A' ..=b'F' | b'a' ..=b'f' )); |
910 | (value, 16) |
911 | } else { |
912 | let value = self.consume_bytes(|c| c.is_ascii_digit()); |
913 | (value, 10) |
914 | }; |
915 | |
916 | let n = u32::from_str_radix(value, radix).ok()?; |
917 | |
918 | let c = char::from_u32(n).unwrap_or(' \u{FFFD}' ); |
919 | if !c.is_xml_char() { |
920 | return None; |
921 | } |
922 | |
923 | Reference::Char(c) |
924 | } else { |
925 | let name = self.consume_name().ok()?; |
926 | match name { |
927 | "quot" => Reference::Char('"' ), |
928 | "amp" => Reference::Char('&' ), |
929 | "apos" => Reference::Char(' \'' ), |
930 | "lt" => Reference::Char('<' ), |
931 | "gt" => Reference::Char('>' ), |
932 | _ => Reference::Entity(name), |
933 | } |
934 | }; |
935 | |
936 | self.consume_byte(b';' ).ok()?; |
937 | |
938 | Some(reference) |
939 | } |
940 | |
941 | /// Consumes according to: <https://www.w3.org/TR/xml/#NT-Name> |
942 | fn consume_name(&mut self) -> Result<&'input str> { |
943 | let start = self.pos(); |
944 | self.skip_name()?; |
945 | |
946 | let name = self.slice_back(start); |
947 | if name.is_empty() { |
948 | return Err(Error::InvalidName(self.gen_text_pos_from(start))); |
949 | } |
950 | |
951 | Ok(name) |
952 | } |
953 | |
954 | /// The same as `consume_name()`, but does not return a consumed name. |
955 | fn skip_name(&mut self) -> Result<()> { |
956 | let start = self.pos(); |
957 | let mut iter = self.chars(); |
958 | if let Some(c) = iter.next() { |
959 | if c.is_xml_name_start() { |
960 | self.advance(c.len_utf8()); |
961 | } else { |
962 | return Err(Error::InvalidName(self.gen_text_pos_from(start))); |
963 | } |
964 | } |
965 | |
966 | for c in iter { |
967 | if c.is_xml_name() { |
968 | self.advance(c.len_utf8()); |
969 | } else { |
970 | break; |
971 | } |
972 | } |
973 | |
974 | Ok(()) |
975 | } |
976 | |
977 | /// Consumes a qualified XML name and returns it. |
978 | /// |
979 | /// Consumes according to: <https://www.w3.org/TR/xml-names/#ns-qualnames> |
980 | #[inline (never)] |
981 | fn consume_qname(&mut self) -> Result<(&'input str, &'input str)> { |
982 | let start = self.pos(); |
983 | |
984 | let mut splitter = None; |
985 | |
986 | while !self.at_end() { |
987 | // Check for ASCII first for performance reasons. |
988 | let b = self.curr_byte_unchecked(); |
989 | if b < 128 { |
990 | if b == b':' { |
991 | if splitter.is_none() { |
992 | splitter = Some(self.pos()); |
993 | self.advance(1); |
994 | } else { |
995 | // Multiple `:` is an error. |
996 | return Err(Error::InvalidName(self.gen_text_pos_from(start))); |
997 | } |
998 | } else if b.is_xml_name() { |
999 | self.advance(1); |
1000 | } else { |
1001 | break; |
1002 | } |
1003 | } else { |
1004 | // Fallback to Unicode code point. |
1005 | match self.chars().nth(0) { |
1006 | Some(c) if c.is_xml_name() => { |
1007 | self.advance(c.len_utf8()); |
1008 | } |
1009 | _ => break, |
1010 | } |
1011 | } |
1012 | } |
1013 | |
1014 | let (prefix, local) = if let Some(splitter) = splitter { |
1015 | let prefix = self.span.slice_region(start, splitter); |
1016 | let local = self.slice_back(splitter + 1); |
1017 | (prefix, local) |
1018 | } else { |
1019 | let local = self.slice_back(start); |
1020 | // Slice an empty prefix. This way we can preserve attribute start position. |
1021 | (self.span.slice_region(start, start), local) |
1022 | }; |
1023 | |
1024 | // Prefix must start with a `NameStartChar`. |
1025 | if let Some(c) = prefix.chars().nth(0) { |
1026 | if !c.is_xml_name_start() { |
1027 | return Err(Error::InvalidName(self.gen_text_pos_from(start))); |
1028 | } |
1029 | } |
1030 | |
1031 | // Local name must start with a `NameStartChar`. |
1032 | if let Some(c) = local.chars().nth(0) { |
1033 | if !c.is_xml_name_start() { |
1034 | return Err(Error::InvalidName(self.gen_text_pos_from(start))); |
1035 | } |
1036 | } else { |
1037 | // If empty - error. |
1038 | return Err(Error::InvalidName(self.gen_text_pos_from(start))); |
1039 | } |
1040 | |
1041 | Ok((prefix, local)) |
1042 | } |
1043 | |
1044 | fn consume_eq(&mut self) -> Result<()> { |
1045 | self.skip_spaces(); |
1046 | self.consume_byte(b'=' )?; |
1047 | self.skip_spaces(); |
1048 | |
1049 | Ok(()) |
1050 | } |
1051 | |
1052 | fn consume_quote(&mut self) -> Result<u8> { |
1053 | let c = self.curr_byte()?; |
1054 | if c == b' \'' || c == b'"' { |
1055 | self.advance(1); |
1056 | Ok(c) |
1057 | } else { |
1058 | Err(Error::InvalidChar2("a quote" , c, self.gen_text_pos())) |
1059 | } |
1060 | } |
1061 | |
1062 | /// Calculates a current absolute position. |
1063 | /// |
1064 | /// This operation is very expensive. Use only for errors. |
1065 | #[inline (never)] |
1066 | pub fn gen_text_pos(&self) -> TextPos { |
1067 | let text = self.span.as_str(); |
1068 | let end = self.pos; |
1069 | |
1070 | let row = Self::calc_curr_row(text, end); |
1071 | let col = Self::calc_curr_col(text, end); |
1072 | TextPos::new(row, col) |
1073 | } |
1074 | |
1075 | /// Calculates an absolute position at `pos`. |
1076 | /// |
1077 | /// This operation is very expensive. Use only for errors. |
1078 | #[inline (never)] |
1079 | pub fn gen_text_pos_from(&self, pos: usize) -> TextPos { |
1080 | let mut s = self.clone(); |
1081 | s.pos = core::cmp::min(pos, s.span.as_str().len()); |
1082 | s.gen_text_pos() |
1083 | } |
1084 | |
1085 | fn calc_curr_row(text: &str, end: usize) -> u32 { |
1086 | let mut row = 1; |
1087 | for c in &text.as_bytes()[..end] { |
1088 | if *c == b' \n' { |
1089 | row += 1; |
1090 | } |
1091 | } |
1092 | |
1093 | row |
1094 | } |
1095 | |
1096 | fn calc_curr_col(text: &str, end: usize) -> u32 { |
1097 | let mut col = 1; |
1098 | for c in text[..end].chars().rev() { |
1099 | if c == ' \n' { |
1100 | break; |
1101 | } else { |
1102 | col += 1; |
1103 | } |
1104 | } |
1105 | |
1106 | col |
1107 | } |
1108 | } |
1109 | |