1use core::ops::Range;
2use core::str;
3
4use crate::{Error, TextPos};
5
6type Result<T> = core::result::Result<T, Error>;
7
/// Character classification helpers restricted to what the XML grammar needs.
trait XmlCharExt {
    /// Returns `true` if the character is allowed by the
    /// [NameStartChar](https://www.w3.org/TR/xml/#NT-NameStartChar) production.
    fn is_xml_name_start(&self) -> bool;

    /// Returns `true` if the character is allowed by the
    /// [NameChar](https://www.w3.org/TR/xml/#NT-NameChar) production.
    fn is_xml_name(&self) -> bool;

    /// Returns `true` if the character is allowed by the
    /// [Char](https://www.w3.org/TR/xml/#NT-Char) production.
    fn is_xml_char(&self) -> bool;
}

impl XmlCharExt for char {
    #[inline]
    fn is_xml_name_start(&self) -> bool {
        match u32::from(*self) {
            // Low code points fit in a single byte, so settle them with a cheap byte test.
            low @ 0..=128 => matches!(low as u8, b'A'..=b'Z' | b'a'..=b'z' | b':' | b'_'),
            0x0000C0..=0x0000D6
            | 0x0000D8..=0x0000F6
            | 0x0000F8..=0x0002FF
            | 0x000370..=0x00037D
            | 0x00037F..=0x001FFF
            | 0x00200C..=0x00200D
            | 0x002070..=0x00218F
            | 0x002C00..=0x002FEF
            | 0x003001..=0x00D7FF
            | 0x00F900..=0x00FDCF
            | 0x00FDF0..=0x00FFFD
            | 0x010000..=0x0EFFFF => true,
            _ => false,
        }
    }

    #[inline]
    fn is_xml_name(&self) -> bool {
        match u32::from(*self) {
            // Low code points are delegated to the byte-level classifier.
            low @ 0..=128 => (low as u8).is_xml_name(),
            0x0000B7
            | 0x0000C0..=0x0000D6
            | 0x0000D8..=0x0000F6
            | 0x0000F8..=0x0002FF
            | 0x000300..=0x00036F
            | 0x000370..=0x00037D
            | 0x00037F..=0x001FFF
            | 0x00200C..=0x00200D
            | 0x00203F..=0x002040
            | 0x002070..=0x00218F
            | 0x002C00..=0x002FEF
            | 0x003001..=0x00D7FF
            | 0x00F900..=0x00FDCF
            | 0x00FDF0..=0x00FFFD
            | 0x010000..=0x0EFFFF => true,
            _ => false,
        }
    }

    #[inline]
    fn is_xml_char(&self) -> bool {
        // Surrogate code points U+D800-U+DFFF need no check here:
        // a Rust `char` can never hold one.
        match u32::from(*self) {
            // Below U+0020 only the whitespace controls are legal.
            ctrl @ 0..=0x1F => (ctrl as u8).is_xml_space(),
            0xFFFE | 0xFFFF => false,
            _ => true,
        }
    }
}

/// Byte-level (ASCII) counterparts of the XML character checks.
trait XmlByteExt {
    /// Returns `true` if the byte is XML whitespace.
    ///
    /// `[ \r\n\t]`
    fn is_xml_space(&self) -> bool;

    /// Returns `true` if the byte is one of the ASCII characters allowed by the
    /// [NameChar](https://www.w3.org/TR/xml/#NT-NameChar) production.
    fn is_xml_name(&self) -> bool;
}

impl XmlByteExt for u8 {
    #[inline]
    fn is_xml_space(&self) -> bool {
        matches!(*self, b'\t' | b'\n' | b'\r' | b' ')
    }

    #[inline]
    fn is_xml_name(&self) -> bool {
        self.is_ascii_alphanumeric() || matches!(*self, b':' | b'_' | b'-' | b'.')
    }
}
104
/// A borrowed piece of the input text that remembers where it came from.
///
/// Behaves like `&str`, but additionally stores the byte offset in the
/// input XML at which the slice starts.
#[must_use]
#[derive(Clone, Copy)]
pub struct StrSpan<'input> {
    text: &'input str,
    start: usize,
}

impl<'input> From<&'input str> for StrSpan<'input> {
    /// Wraps a whole string; the resulting span starts at offset 0.
    #[inline]
    fn from(text: &'input str) -> Self {
        Self { text, start: 0 }
    }
}

impl<'input> StrSpan<'input> {
    /// Creates a span over `text[start..end]` that starts at offset `start`.
    ///
    /// Panics if the range is out of bounds or does not fall on
    /// character boundaries (regular string slicing rules).
    #[inline]
    pub fn from_substr(text: &str, start: usize, end: usize) -> StrSpan {
        debug_assert!(start <= end);
        let text = &text[start..end];
        StrSpan { text, start }
    }

    /// Returns the byte range this span occupies in the original input.
    #[inline]
    pub fn range(&self) -> Range<usize> {
        let end = self.start + self.text.len();
        self.start..end
    }

    /// Returns the spanned text.
    #[inline]
    pub fn as_str(&self) -> &'input str {
        self.text
    }

    /// Returns `start..end` of the spanned text; offsets are relative to the
    /// span's own text, not to the original input.
    #[inline]
    fn slice_region(&self, start: usize, end: usize) -> &'input str {
        let whole: &'input str = self.text;
        &whole[start..end]
    }
}
148
/// A lexical item handed to [`XmlEvents::token`] by the parse functions.
///
/// Every `Range<usize>` payload is the `start..end` stream position range
/// covered by the token.
pub enum Token<'input> {
    /// `<?target content?>`
    ///
    /// Fields: target name, content (`None` when empty), token range.
    ProcessingInstruction(&'input str, Option<&'input str>, Range<usize>),

    /// `<!-- text -->`
    ///
    /// Fields: comment text without the delimiters, token range.
    Comment(&'input str, Range<usize>),

    /// `<!ENTITY ns_extend "http://test.com">`
    ///
    /// Fields: entity name, quoted entity value (without the quotes).
    EntityDeclaration(&'input str, StrSpan<'input>),

    /// `<ns:elem`
    ///
    /// Fields: prefix (empty when absent), local name, position of the `<`.
    ElementStart(&'input str, &'input str, usize),

    /// `ns:attr="value"`
    ///
    /// Fields: position where the attribute name starts, prefix (empty when
    /// absent), local name, value (without the quotes).
    Attribute(usize, &'input str, &'input str, StrSpan<'input>),

    /// End of a start tag, an empty-element tag, or a closing tag.
    ElementEnd(ElementEnd<'input>, Range<usize>),

    /// Contains text between elements including whitespaces.
    /// Basically everything between `>` and `<`.
    /// Except `]]>`, which is not allowed and will lead to an error.
    Text(&'input str, Range<usize>),

    /// `<![CDATA[text]]>`
    ///
    /// Fields: the text between `<![CDATA[` and `]]>`, token range.
    Cdata(&'input str, Range<usize>),
}
175
/// `ElementEnd` token.
///
/// Tells how the tag that was just tokenized ended.
#[derive(Clone, Copy)]
pub enum ElementEnd<'input> {
    /// Indicates `>`: a start tag was closed and element content follows.
    Open,
    /// Indicates `</ns:name>`; carries the prefix (empty when absent)
    /// and the local name of the closing tag.
    Close(&'input str, &'input str),
    /// Indicates `/>`: the element has no content.
    Empty,
}
186
/// Receiver of the tokens produced by the parse functions in this file.
pub trait XmlEvents<'input> {
    /// Called for every token as soon as it has been recognized.
    ///
    /// An `Err` returned from here is propagated by the parse functions
    /// and stops tokenization.
    fn token(&mut self, token: Token<'input>) -> Result<()>;
}
190
191// document ::= prolog element Misc*
192pub fn parse<'input>(
193 text: &'input str,
194 allow_dtd: bool,
195 events: &mut dyn XmlEvents<'input>,
196) -> Result<()> {
197 let s = &mut Stream::new(text);
198
199 // Skip UTF-8 BOM.
200 if s.starts_with(&[0xEF, 0xBB, 0xBF]) {
201 s.advance(3);
202 }
203
204 if s.starts_with(b"<?xml ") {
205 parse_declaration(s)?;
206 }
207
208 parse_misc(s, events)?;
209
210 s.skip_spaces();
211 if s.starts_with(b"<!DOCTYPE") {
212 if !allow_dtd {
213 return Err(Error::DtdDetected);
214 }
215
216 parse_doctype(s, events)?;
217 parse_misc(s, events)?;
218 }
219
220 s.skip_spaces();
221 if s.curr_byte().ok() == Some(b'<') {
222 parse_element(s, events)?;
223 }
224
225 parse_misc(s, events)?;
226
227 if !s.at_end() {
228 return Err(Error::UnknownToken(s.gen_text_pos()));
229 }
230
231 Ok(())
232}
233
234// Misc ::= Comment | PI | S
235fn parse_misc<'input>(s: &mut Stream<'input>, events: &mut dyn XmlEvents<'input>) -> Result<()> {
236 while !s.at_end() {
237 s.skip_spaces();
238 if s.starts_with(text:b"<!--") {
239 parse_comment(s, events)?;
240 } else if s.starts_with(text:b"<?") {
241 parse_pi(s, events)?;
242 } else {
243 break;
244 }
245 }
246
247 Ok(())
248}
249
250// XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
251//
252// We don't actually return a token for the XML declaration and only validate it.
253fn parse_declaration(s: &mut Stream) -> Result<()> {
254 fn consume_spaces(s: &mut Stream) -> Result<()> {
255 if s.starts_with_space() {
256 s.skip_spaces();
257 } else if !s.starts_with(b"?>") && !s.at_end() {
258 return Err(Error::InvalidChar2(
259 "a whitespace",
260 s.curr_byte_unchecked(),
261 s.gen_text_pos(),
262 ));
263 }
264
265 Ok(())
266 }
267
268 s.advance(5); // <?xml
269 consume_spaces(s)?;
270
271 // The `version` "attribute" is mandatory.
272 if !s.starts_with(b"version") {
273 // Will trigger the InvalidString error, which is what we want.
274 return s.skip_string(b"version");
275 }
276 let _ = parse_attribute(s)?;
277 consume_spaces(s)?;
278
279 if s.starts_with(b"encoding") {
280 let _ = parse_attribute(s)?;
281 consume_spaces(s)?;
282 }
283
284 if s.starts_with(b"standalone") {
285 let _ = parse_attribute(s)?;
286 }
287
288 s.skip_spaces();
289 s.skip_string(b"?>")?;
290
291 Ok(())
292}
293
294// '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
295fn parse_comment<'input>(s: &mut Stream<'input>, events: &mut dyn XmlEvents<'input>) -> Result<()> {
296 let start: usize = s.pos();
297 s.advance(4);
298 let text: &str = s.consume_chars(|s: &Stream<'_>, c: char| !(c == '-' && s.starts_with(text:b"-->")))?;
299 s.skip_string(text:b"-->")?;
300
301 if text.contains("--") {
302 return Err(Error::InvalidComment(s.gen_text_pos_from(pos:start)));
303 }
304
305 if text.ends_with('-') {
306 return Err(Error::InvalidComment(s.gen_text_pos_from(pos:start)));
307 }
308
309 let range: Range = s.range_from(start);
310 events.token(Token::Comment(text, range))?;
311
312 Ok(())
313}
314
315// PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
316// PITarget ::= Name - (('X' | 'x') ('M' | 'm') ('L' | 'l'))
317fn parse_pi<'input>(s: &mut Stream<'input>, events: &mut dyn XmlEvents<'input>) -> Result<()> {
318 if s.starts_with(text:b"<?xml ") {
319 return Err(Error::UnexpectedDeclaration(s.gen_text_pos()));
320 }
321
322 let start: usize = s.pos();
323 s.advance(2);
324 let target: &str = s.consume_name()?;
325 s.skip_spaces();
326 let content: &str = s.consume_chars(|s: &Stream<'_>, c: char| !(c == '?' && s.starts_with(text:b"?>")))?;
327 let content: Option<&str> = if !content.is_empty() {
328 Some(content)
329 } else {
330 None
331 };
332
333 s.skip_string(text:b"?>")?;
334
335 let range: Range = s.range_from(start);
336 events.token(Token::ProcessingInstruction(target, content, range))?;
337 Ok(())
338}
339
/// Parses a `<!DOCTYPE ...>` declaration together with its optional
/// internal subset.
///
/// Entity declarations, comments and processing instructions found in the
/// subset are forwarded to `events`. `<!ELEMENT`, `<!ATTLIST` and
/// `<!NOTATION` declarations are skipped up to their closing `>`.
/// Anything else results in `Error::UnknownToken`.
fn parse_doctype<'input>(s: &mut Stream<'input>, events: &mut dyn XmlEvents<'input>) -> Result<()> {
    let start = s.pos();
    parse_doctype_start(s)?;
    s.skip_spaces();

    // No internal subset: the declaration ends right here.
    if s.curr_byte() == Ok(b'>') {
        s.advance(1);
        return Ok(());
    }

    // `parse_doctype_start` only succeeds on `[` or `>`, so this byte is `[`.
    s.advance(1); // [
    while !s.at_end() {
        s.skip_spaces();
        if s.starts_with(b"<!ENTITY") {
            parse_entity_decl(s, events)?;
        } else if s.starts_with(b"<!--") {
            parse_comment(s, events)?;
        } else if s.starts_with(b"<?") {
            parse_pi(s, events)?;
        } else if s.starts_with(b"]") {
            // DTD ends with ']' S? '>', therefore we have to skip possible spaces.
            s.advance(1);
            s.skip_spaces();
            match s.curr_byte() {
                Ok(b'>') => {
                    s.advance(1);
                    break;
                }
                Ok(c) => {
                    return Err(Error::InvalidChar2("'>'", c, s.gen_text_pos()));
                }
                Err(_) => {
                    return Err(Error::UnexpectedEndOfStream);
                }
            }
        } else if s.starts_with(b"<!ELEMENT")
            || s.starts_with(b"<!ATTLIST")
            || s.starts_with(b"<!NOTATION")
        {
            // These declarations are not interpreted, only skipped.
            // A failure is reported at the start of the whole DOCTYPE.
            if consume_decl(s).is_err() {
                let pos = s.gen_text_pos_from(start);
                return Err(Error::UnknownToken(pos));
            }
        } else {
            return Err(Error::UnknownToken(s.gen_text_pos()));
        }
    }

    Ok(())
}
390
391// doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? ('[' intSubset ']' S?)? '>'
392fn parse_doctype_start(s: &mut Stream) -> Result<()> {
393 s.advance(9);
394
395 s.consume_spaces()?;
396 s.skip_name()?;
397 s.skip_spaces();
398
399 let _ = parse_external_id(s)?;
400 s.skip_spaces();
401
402 let c: u8 = s.curr_byte()?;
403 if c != b'[' && c != b'>' {
404 return Err(Error::InvalidChar2("'[' or '>'", c, s.gen_text_pos()));
405 }
406
407 Ok(())
408}
409
410// ExternalID ::= 'SYSTEM' S SystemLiteral | 'PUBLIC' S PubidLiteral S SystemLiteral
411fn parse_external_id(s: &mut Stream) -> Result<bool> {
412 let v = if s.starts_with(b"SYSTEM") || s.starts_with(b"PUBLIC") {
413 let start = s.pos();
414 s.advance(6);
415 let id = s.slice_back(start);
416
417 s.consume_spaces()?;
418 let quote = s.consume_quote()?;
419 let _ = s.consume_bytes(|c| c != quote);
420 s.consume_byte(quote)?;
421
422 if id == "SYSTEM" {
423 // Ok
424 } else {
425 s.consume_spaces()?;
426 let quote = s.consume_quote()?;
427 let _ = s.consume_bytes(|c| c != quote);
428 s.consume_byte(quote)?;
429 }
430
431 true
432 } else {
433 false
434 };
435
436 Ok(v)
437}
438
439// EntityDecl ::= GEDecl | PEDecl
440// GEDecl ::= '<!ENTITY' S Name S EntityDef S? '>'
441// PEDecl ::= '<!ENTITY' S '%' S Name S PEDef S? '>'
442fn parse_entity_decl<'input>(
443 s: &mut Stream<'input>,
444 events: &mut dyn XmlEvents<'input>,
445) -> Result<()> {
446 s.advance(8);
447 s.consume_spaces()?;
448
449 let is_ge: bool = if s.try_consume_byte(b'%') {
450 s.consume_spaces()?;
451 false
452 } else {
453 true
454 };
455
456 let name: &str = s.consume_name()?;
457 s.consume_spaces()?;
458 if let Some(definition: StrSpan<'_>) = parse_entity_def(s, is_ge)? {
459 events.token(Token::EntityDeclaration(name, definition))?;
460 }
461 s.skip_spaces();
462 s.consume_byte(b'>')?;
463
464 Ok(())
465}
466
467// EntityDef ::= EntityValue | (ExternalID NDataDecl?)
468// PEDef ::= EntityValue | ExternalID
469// EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"' | "'" ([^%&']
470// | PEReference | Reference)* "'"
471// ExternalID ::= 'SYSTEM' S SystemLiteral | 'PUBLIC' S PubidLiteral S SystemLiteral
472// NDataDecl ::= S 'NDATA' S Name
473fn parse_entity_def<'input>(
474 s: &mut Stream<'input>,
475 is_ge: bool,
476) -> Result<Option<StrSpan<'input>>> {
477 let c = s.curr_byte()?;
478 match c {
479 b'"' | b'\'' => {
480 let quote = s.consume_quote()?;
481 let start = s.pos();
482 s.skip_bytes(|c| c != quote);
483 let value = s.slice_back_span(start);
484 s.consume_byte(quote)?;
485 Ok(Some(value))
486 }
487 b'S' | b'P' => {
488 if parse_external_id(s)? {
489 if is_ge {
490 s.skip_spaces();
491 if s.starts_with(b"NDATA") {
492 s.advance(5);
493 s.consume_spaces()?;
494 s.skip_name()?;
495 // TODO: NDataDecl is not supported
496 }
497 }
498
499 Ok(None)
500 } else {
501 Err(Error::InvalidExternalID(s.gen_text_pos()))
502 }
503 }
504 _ => {
505 let pos = s.gen_text_pos();
506 Err(Error::InvalidChar2("a quote, SYSTEM or PUBLIC", c, pos))
507 }
508 }
509}
510
511fn consume_decl(s: &mut Stream) -> Result<()> {
512 s.skip_bytes(|c: u8| c != b'>');
513 s.consume_byte(b'>')?;
514 Ok(())
515}
516
// element ::= EmptyElemTag | STag content ETag
// '<' Name (S Attribute)* S? '>'
/// Parses a start tag or an empty-element tag and, for a start tag, the
/// element content that follows it.
///
/// The stream must be positioned at `<`. Emits `Token::ElementStart`, one
/// `Token::Attribute` per attribute and a `Token::ElementEnd` (`Open` or
/// `Empty`). After `Open`, parsing continues with `parse_content`.
fn parse_element<'input>(s: &mut Stream<'input>, events: &mut dyn XmlEvents<'input>) -> Result<()> {
    let start = s.pos();
    s.advance(1); // <
    let (prefix, local) = s.consume_qname()?;
    events.token(Token::ElementStart(prefix, local, start))?;

    // Set once the tag was closed with a plain `>`, i.e. content follows.
    let mut open = false;
    while !s.at_end() {
        // Remember whether whitespace was present before it gets skipped.
        let has_space = s.starts_with_space();
        s.skip_spaces();
        // Shadows the element start: from here on `start` is the position
        // of the tag end or of the attribute being parsed.
        let start = s.pos();
        match s.curr_byte()? {
            b'/' => {
                s.advance(1);
                s.consume_byte(b'>')?;
                let range = s.range_from(start);
                events.token(Token::ElementEnd(ElementEnd::Empty, range))?;
                break;
            }
            b'>' => {
                s.advance(1);
                let range = s.range_from(start);
                events.token(Token::ElementEnd(ElementEnd::Open, range))?;
                open = true;
                break;
            }
            _ => {
                // An attribute must be preceded with a whitespace.
                if !has_space {
                    // Will always trigger an error. Which is what we want.
                    s.consume_spaces()?;
                }

                // Manual inlining of `parse_attribute` for performance.
                // We cannot mark `parse_attribute` as `#[inline(always)]`
                // because it will blow up the binary size.
                let (prefix, local) = s.consume_qname()?;
                s.consume_eq()?;
                let quote = s.consume_quote()?;
                let quote_c = quote as char;
                // The attribute value must not contain the < character.
                let value_start = s.pos();
                s.skip_chars(|_, c| c != quote_c && c != '<')?;
                let value = s.slice_back_span(value_start);
                s.consume_byte(quote)?;
                events.token(Token::Attribute(start, prefix, local, value))?;
            }
        }
    }

    if open {
        parse_content(s, events)?;
    }

    Ok(())
}
575
576// Attribute ::= Name Eq AttValue
577fn parse_attribute<'input>(
578 s: &mut Stream<'input>,
579) -> Result<(&'input str, &'input str, StrSpan<'input>)> {
580 let (prefix: &str, local: &str) = s.consume_qname()?;
581 s.consume_eq()?;
582 let quote: u8 = s.consume_quote()?;
583 let quote_c: char = quote as char;
584 // The attribute value must not contain the < character.
585 let value_start: usize = s.pos();
586 s.skip_chars(|_, c: char| c != quote_c && c != '<')?;
587 let value: StrSpan<'_> = s.slice_back_span(pos:value_start);
588 s.consume_byte(quote)?;
589 Ok((prefix, local, value))
590}
591
592// content ::= CharData? ((element | Reference | CDSect | PI | Comment) CharData?)*
593pub fn parse_content<'input>(
594 s: &mut Stream<'input>,
595 events: &mut dyn XmlEvents<'input>,
596) -> Result<()> {
597 while !s.at_end() {
598 match s.curr_byte() {
599 Ok(b'<') => match s.next_byte() {
600 Ok(b'!') => {
601 if s.starts_with(b"<!--") {
602 parse_comment(s, events)?;
603 } else if s.starts_with(b"<![CDATA[") {
604 parse_cdata(s, events)?;
605 } else {
606 return Err(Error::UnknownToken(s.gen_text_pos()));
607 }
608 }
609 Ok(b'?') => parse_pi(s, events)?,
610 Ok(b'/') => {
611 parse_close_element(s, events)?;
612 break;
613 }
614 Ok(_) => parse_element(s, events)?,
615 Err(_) => return Err(Error::UnknownToken(s.gen_text_pos())),
616 },
617 Ok(_) => parse_text(s, events)?,
618 Err(_) => return Err(Error::UnknownToken(s.gen_text_pos())),
619 }
620 }
621
622 Ok(())
623}
624
625// CDSect ::= CDStart CData CDEnd
626// CDStart ::= '<![CDATA['
627// CData ::= (Char* - (Char* ']]>' Char*))
628// CDEnd ::= ']]>'
629fn parse_cdata<'input>(s: &mut Stream<'input>, events: &mut dyn XmlEvents<'input>) -> Result<()> {
630 let start: usize = s.pos();
631 s.advance(9); // <![CDATA[
632 let text: &str = s.consume_chars(|s: &Stream<'_>, c: char| !(c == ']' && s.starts_with(text:b"]]>")))?;
633 s.skip_string(text:b"]]>")?;
634 let range: Range = s.range_from(start);
635 events.token(Token::Cdata(text, range))?;
636 Ok(())
637}
638
639// '</' Name S? '>'
640fn parse_close_element<'input>(
641 s: &mut Stream<'input>,
642 events: &mut dyn XmlEvents<'input>,
643) -> Result<()> {
644 let start: usize = s.pos();
645 s.advance(2); // </
646
647 let (prefix: &str, tag_name: &str) = s.consume_qname()?;
648 s.skip_spaces();
649 s.consume_byte(b'>')?;
650
651 let range: Range = s.range_from(start);
652 events.token(Token::ElementEnd(
653 ElementEnd::Close(prefix, tag_name),
654 range,
655 ))?;
656 Ok(())
657}
658
659fn parse_text<'input>(s: &mut Stream<'input>, events: &mut dyn XmlEvents<'input>) -> Result<()> {
660 let start: usize = s.pos();
661 let text: &str = s.consume_chars(|_, c: char| c != '<')?;
662
663 // According to the spec, `]]>` must not appear inside a Text node.
664 // https://www.w3.org/TR/xml/#syntax
665 //
666 // Search for `>` first, since it's a bit faster than looking for `]]>`.
667 if text.contains('>') && text.contains("]]>") {
668 return Err(Error::InvalidCharacterData(s.gen_text_pos()));
669 }
670
671 let range: Range = s.range_from(start);
672 events.token(Token::Text(text, range))?;
673 Ok(())
674}
675
/// Representation of the [Reference](https://www.w3.org/TR/xml/#NT-Reference) value.
///
/// Produced by `Stream::try_consume_reference`.
#[derive(Clone, Copy)]
pub enum Reference<'input> {
    /// An entity reference.
    ///
    /// Holds the entity name, i.e. the text between `&` and `;`.
    /// The five predefined entities (`quot`, `amp`, `apos`, `lt`, `gt`)
    /// are never reported here; they are resolved to `Char`.
    ///
    /// <https://www.w3.org/TR/xml/#NT-EntityRef>
    Entity(&'input str),

    /// A character reference.
    ///
    /// Holds the already resolved character of a decimal (`&#N;`) or
    /// hexadecimal (`&#xN;`) reference, or of a predefined entity.
    ///
    /// <https://www.w3.org/TR/xml/#NT-CharRef>
    Char(char),
}
689
/// A byte-position cursor over the input text.
#[derive(Clone)]
pub struct Stream<'input> {
    /// Current byte offset into `span`'s text.
    pos: usize,
    /// Exclusive upper bound for `pos`; the stream is at its end once
    /// `pos >= end`.
    end: usize,
    /// The text being read.
    span: StrSpan<'input>,
}
696
697impl<'input> Stream<'input> {
698 #[inline]
699 pub fn new(text: &'input str) -> Self {
700 Stream {
701 pos: 0,
702 end: text.len(),
703 span: text.into(),
704 }
705 }
706
707 #[inline]
708 pub fn from_substr(text: &'input str, fragment: Range<usize>) -> Self {
709 Stream {
710 pos: fragment.start,
711 end: fragment.end,
712 span: text.into(),
713 }
714 }
715
716 #[inline]
717 pub fn pos(&self) -> usize {
718 self.pos
719 }
720
721 #[inline]
722 pub fn at_end(&self) -> bool {
723 self.pos >= self.end
724 }
725
726 #[inline]
727 pub fn curr_byte(&self) -> Result<u8> {
728 if self.at_end() {
729 return Err(Error::UnexpectedEndOfStream);
730 }
731
732 Ok(self.curr_byte_unchecked())
733 }
734
735 #[inline]
736 pub fn curr_byte_unchecked(&self) -> u8 {
737 self.span.text.as_bytes()[self.pos]
738 }
739
740 #[inline]
741 fn next_byte(&self) -> Result<u8> {
742 if self.pos + 1 >= self.end {
743 return Err(Error::UnexpectedEndOfStream);
744 }
745
746 Ok(self.span.as_str().as_bytes()[self.pos + 1])
747 }
748
749 #[inline]
750 pub fn advance(&mut self, n: usize) {
751 debug_assert!(self.pos + n <= self.end);
752 self.pos += n;
753 }
754
755 #[inline]
756 fn starts_with(&self, text: &[u8]) -> bool {
757 self.span.text.as_bytes()[self.pos..self.end].starts_with(text)
758 }
759
760 fn consume_byte(&mut self, c: u8) -> Result<()> {
761 let curr = self.curr_byte()?;
762 if curr != c {
763 return Err(Error::InvalidChar(c, curr, self.gen_text_pos()));
764 }
765
766 self.advance(1);
767 Ok(())
768 }
769
770 // Unlike `consume_byte()` will not return any errors.
771 fn try_consume_byte(&mut self, c: u8) -> bool {
772 match self.curr_byte() {
773 Ok(b) if b == c => {
774 self.advance(1);
775 true
776 }
777 _ => false,
778 }
779 }
780
781 fn skip_string(&mut self, text: &'static [u8]) -> Result<()> {
782 if !self.starts_with(text) {
783 let pos = self.gen_text_pos();
784
785 // Assume that all input `text` are valid UTF-8 strings, so unwrap is safe.
786 let expected = str::from_utf8(text).unwrap();
787
788 return Err(Error::InvalidString(expected, pos));
789 }
790
791 self.advance(text.len());
792 Ok(())
793 }
794
795 #[inline]
796 fn consume_bytes<F: Fn(u8) -> bool>(&mut self, f: F) -> &'input str {
797 let start = self.pos;
798 self.skip_bytes(f);
799 self.slice_back(start)
800 }
801
802 fn skip_bytes<F: Fn(u8) -> bool>(&mut self, f: F) {
803 while !self.at_end() && f(self.curr_byte_unchecked()) {
804 self.advance(1);
805 }
806 }
807
808 #[inline]
809 fn consume_chars<F>(&mut self, f: F) -> Result<&'input str>
810 where
811 F: Fn(&Stream, char) -> bool,
812 {
813 let start = self.pos;
814 self.skip_chars(f)?;
815 Ok(self.slice_back(start))
816 }
817
818 #[inline]
819 fn skip_chars<F>(&mut self, f: F) -> Result<()>
820 where
821 F: Fn(&Stream, char) -> bool,
822 {
823 for c in self.chars() {
824 if !c.is_xml_char() {
825 return Err(Error::NonXmlChar(c, self.gen_text_pos()));
826 } else if f(self, c) {
827 self.advance(c.len_utf8());
828 } else {
829 break;
830 }
831 }
832
833 Ok(())
834 }
835
836 #[inline]
837 fn chars(&self) -> str::Chars<'input> {
838 self.span.as_str()[self.pos..self.end].chars()
839 }
840
841 #[inline]
842 fn slice_back(&self, pos: usize) -> &'input str {
843 self.span.slice_region(pos, self.pos)
844 }
845
846 #[inline]
847 fn slice_back_span(&self, pos: usize) -> StrSpan<'input> {
848 StrSpan::from_substr(self.span.text, pos, self.pos)
849 }
850
851 #[inline]
852 fn range_from(&self, start: usize) -> Range<usize> {
853 start..self.pos
854 }
855
856 #[inline]
857 fn skip_spaces(&mut self) {
858 while self.starts_with_space() {
859 self.advance(1);
860 }
861 }
862
863 #[inline]
864 fn starts_with_space(&self) -> bool {
865 !self.at_end() && self.curr_byte_unchecked().is_xml_space()
866 }
867
868 // Like `skip_spaces()`, but checks that first char is actually a space.
869 fn consume_spaces(&mut self) -> Result<()> {
870 if self.at_end() {
871 return Err(Error::UnexpectedEndOfStream);
872 }
873
874 if !self.starts_with_space() {
875 return Err(Error::InvalidChar2(
876 "a whitespace",
877 self.curr_byte_unchecked(),
878 self.gen_text_pos(),
879 ));
880 }
881
882 self.skip_spaces();
883 Ok(())
884 }
885
886 /// Consumes according to: <https://www.w3.org/TR/xml/#NT-Reference>
887 pub fn try_consume_reference(&mut self) -> Option<Reference<'input>> {
888 let start = self.pos();
889
890 // Consume reference on a substream.
891 let mut s = self.clone();
892 let result = s.consume_reference()?;
893
894 // If the current data is a reference than advance the current stream
895 // by number of bytes read by substream.
896 self.advance(s.pos() - start);
897 Some(result)
898 }
899
900 #[inline(never)]
901 fn consume_reference(&mut self) -> Option<Reference<'input>> {
902 if !self.try_consume_byte(b'&') {
903 return None;
904 }
905
906 let reference = if self.try_consume_byte(b'#') {
907 let (value, radix) = if self.try_consume_byte(b'x') {
908 let value =
909 self.consume_bytes(|c| matches!(c, b'0'..=b'9' | b'A'..=b'F' | b'a'..=b'f'));
910 (value, 16)
911 } else {
912 let value = self.consume_bytes(|c| c.is_ascii_digit());
913 (value, 10)
914 };
915
916 let n = u32::from_str_radix(value, radix).ok()?;
917
918 let c = char::from_u32(n).unwrap_or('\u{FFFD}');
919 if !c.is_xml_char() {
920 return None;
921 }
922
923 Reference::Char(c)
924 } else {
925 let name = self.consume_name().ok()?;
926 match name {
927 "quot" => Reference::Char('"'),
928 "amp" => Reference::Char('&'),
929 "apos" => Reference::Char('\''),
930 "lt" => Reference::Char('<'),
931 "gt" => Reference::Char('>'),
932 _ => Reference::Entity(name),
933 }
934 };
935
936 self.consume_byte(b';').ok()?;
937
938 Some(reference)
939 }
940
941 /// Consumes according to: <https://www.w3.org/TR/xml/#NT-Name>
942 fn consume_name(&mut self) -> Result<&'input str> {
943 let start = self.pos();
944 self.skip_name()?;
945
946 let name = self.slice_back(start);
947 if name.is_empty() {
948 return Err(Error::InvalidName(self.gen_text_pos_from(start)));
949 }
950
951 Ok(name)
952 }
953
954 /// The same as `consume_name()`, but does not return a consumed name.
955 fn skip_name(&mut self) -> Result<()> {
956 let start = self.pos();
957 let mut iter = self.chars();
958 if let Some(c) = iter.next() {
959 if c.is_xml_name_start() {
960 self.advance(c.len_utf8());
961 } else {
962 return Err(Error::InvalidName(self.gen_text_pos_from(start)));
963 }
964 }
965
966 for c in iter {
967 if c.is_xml_name() {
968 self.advance(c.len_utf8());
969 } else {
970 break;
971 }
972 }
973
974 Ok(())
975 }
976
    /// Consumes a qualified XML name and returns it.
    ///
    /// Consumes according to: <https://www.w3.org/TR/xml-names/#ns-qualnames>
    ///
    /// Returns `(prefix, local)`. The prefix is an empty string when the
    /// name contains no `:`. Fails with `Error::InvalidName` when the name
    /// contains more than one `:`, when the local part is empty, or when
    /// the prefix or the local part does not start with a `NameStartChar`.
    #[inline(never)]
    fn consume_qname(&mut self) -> Result<(&'input str, &'input str)> {
        let start = self.pos();

        // Position of the `:` separating prefix and local name, if any.
        let mut splitter = None;

        while !self.at_end() {
            // Check for ASCII first for performance reasons.
            let b = self.curr_byte_unchecked();
            if b < 128 {
                if b == b':' {
                    if splitter.is_none() {
                        splitter = Some(self.pos());
                        self.advance(1);
                    } else {
                        // Multiple `:` is an error.
                        return Err(Error::InvalidName(self.gen_text_pos_from(start)));
                    }
                } else if b.is_xml_name() {
                    self.advance(1);
                } else {
                    break;
                }
            } else {
                // Fallback to Unicode code point.
                match self.chars().nth(0) {
                    Some(c) if c.is_xml_name() => {
                        self.advance(c.len_utf8());
                    }
                    _ => break,
                }
            }
        }

        let (prefix, local) = if let Some(splitter) = splitter {
            let prefix = self.span.slice_region(start, splitter);
            // `+ 1` skips the `:` itself.
            let local = self.slice_back(splitter + 1);
            (prefix, local)
        } else {
            let local = self.slice_back(start);
            // Slice an empty prefix. This way we can preserve attribute start position.
            (self.span.slice_region(start, start), local)
        };

        // Prefix must start with a `NameStartChar`.
        if let Some(c) = prefix.chars().nth(0) {
            if !c.is_xml_name_start() {
                return Err(Error::InvalidName(self.gen_text_pos_from(start)));
            }
        }

        // Local name must start with a `NameStartChar`.
        if let Some(c) = local.chars().nth(0) {
            if !c.is_xml_name_start() {
                return Err(Error::InvalidName(self.gen_text_pos_from(start)));
            }
        } else {
            // If empty - error.
            return Err(Error::InvalidName(self.gen_text_pos_from(start)));
        }

        Ok((prefix, local))
    }
1043
1044 fn consume_eq(&mut self) -> Result<()> {
1045 self.skip_spaces();
1046 self.consume_byte(b'=')?;
1047 self.skip_spaces();
1048
1049 Ok(())
1050 }
1051
1052 fn consume_quote(&mut self) -> Result<u8> {
1053 let c = self.curr_byte()?;
1054 if c == b'\'' || c == b'"' {
1055 self.advance(1);
1056 Ok(c)
1057 } else {
1058 Err(Error::InvalidChar2("a quote", c, self.gen_text_pos()))
1059 }
1060 }
1061
1062 /// Calculates a current absolute position.
1063 ///
1064 /// This operation is very expensive. Use only for errors.
1065 #[inline(never)]
1066 pub fn gen_text_pos(&self) -> TextPos {
1067 let text = self.span.as_str();
1068 let end = self.pos;
1069
1070 let row = Self::calc_curr_row(text, end);
1071 let col = Self::calc_curr_col(text, end);
1072 TextPos::new(row, col)
1073 }
1074
1075 /// Calculates an absolute position at `pos`.
1076 ///
1077 /// This operation is very expensive. Use only for errors.
1078 #[inline(never)]
1079 pub fn gen_text_pos_from(&self, pos: usize) -> TextPos {
1080 let mut s = self.clone();
1081 s.pos = core::cmp::min(pos, s.span.as_str().len());
1082 s.gen_text_pos()
1083 }
1084
1085 fn calc_curr_row(text: &str, end: usize) -> u32 {
1086 let mut row = 1;
1087 for c in &text.as_bytes()[..end] {
1088 if *c == b'\n' {
1089 row += 1;
1090 }
1091 }
1092
1093 row
1094 }
1095
1096 fn calc_curr_col(text: &str, end: usize) -> u32 {
1097 let mut col = 1;
1098 for c in text[..end].chars().rev() {
1099 if c == '\n' {
1100 break;
1101 } else {
1102 col += 1;
1103 }
1104 }
1105
1106 col
1107 }
1108}
1109