1use alloc::string::{String, ToString};
2use alloc::vec::Vec;
3use core::ops::Range;
4
5use crate::{
6 AttributeData, Document, ExpandedNameIndexed, NamespaceIdx, Namespaces, NodeData, NodeId,
7 NodeKind, ShortRange, StringStorage, TextPos, NS_XMLNS_URI, NS_XML_PREFIX, NS_XML_URI, PI,
8 XMLNS,
9};
10
11use crate::tokenizer::{self, Reference, StrSpan, Stream};
12
13type Result<T> = core::result::Result<T, Error>;
14
/// A list of all possible errors.
///
/// Most variants carry the [`TextPos`] at which the problem was detected;
/// use [`Error::pos`] to obtain it uniformly.
#[derive(Clone, PartialEq, Eq, Hash, Debug)]
pub enum Error {
    /// The `xmlns:xml` attribute must have an <http://www.w3.org/XML/1998/namespace> URI.
    InvalidXmlPrefixUri(TextPos),

    /// Only the `xmlns:xml` attribute can have the <http://www.w3.org/XML/1998/namespace> URI.
    UnexpectedXmlUri(TextPos),

    /// The <http://www.w3.org/2000/xmlns/> URI must not be declared.
    UnexpectedXmlnsUri(TextPos),

    /// `xmlns` can't be used as an element prefix.
    InvalidElementNamePrefix(TextPos),

    /// A namespace was already defined on this element.
    ///
    /// The first value is the namespace prefix.
    DuplicatedNamespace(String, TextPos),

    /// An unknown namespace.
    ///
    /// Indicates that an element or an attribute has an unknown qualified name prefix.
    ///
    /// The first value is a prefix.
    UnknownNamespace(String, TextPos),

    /// Incorrect tree structure.
    ///
    /// expected, actual, position
    #[allow(missing_docs)]
    UnexpectedCloseTag(String, String, TextPos),

    /// Entity value starts with a close tag.
    ///
    /// Example:
    /// ```xml
    /// <!DOCTYPE test [ <!ENTITY p '</p>'> ]>
    /// <root>&p;</root>
    /// ```
    UnexpectedEntityCloseTag(TextPos),

    /// A reference to an entity that was not defined in the DTD.
    ///
    /// The first value is the entity name.
    UnknownEntityReference(String, TextPos),

    /// A malformed entity reference.
    ///
    /// A `&` character inside an attribute value or text indicates an entity reference.
    /// Otherwise, the document is not well-formed.
    MalformedEntityReference(TextPos),

    /// A possible entity reference loop.
    ///
    /// The current depth limit is 10. The max number of references per reference is 255.
    EntityReferenceLoop(TextPos),

    /// Attribute value cannot have a `<` character.
    InvalidAttributeValue(TextPos),

    /// An element has duplicated attributes.
    ///
    /// This also includes namespaces resolving.
    /// So an element like this will lead to an error.
    /// ```xml
    /// <e xmlns:n1='http://www.w3.org' xmlns:n2='http://www.w3.org' n1:a='b1' n2:a='b2'/>
    /// ```
    DuplicatedAttribute(String, TextPos),

    /// The XML document must have at least one element.
    NoRootNode,

    /// The root node was opened but never closed.
    UnclosedRootNode,

    /// An XML document can have only one XML declaration
    /// and it must be at the start of the document.
    UnexpectedDeclaration(TextPos),

    /// An XML with DTD detected.
    ///
    /// This error will be emitted only when `ParsingOptions::allow_dtd` is set to `false`.
    DtdDetected,

    /// Indicates that the [`ParsingOptions::nodes_limit`] was reached.
    NodesLimitReached,

    /// Indicates that too many attributes were parsed.
    AttributesLimitReached,

    /// Indicates that too many namespaces were parsed.
    NamespacesLimitReached,

    /// An invalid name.
    InvalidName(TextPos),

    /// A non-XML character has occurred.
    ///
    /// Valid characters are: <https://www.w3.org/TR/xml/#char32>
    NonXmlChar(char, TextPos),

    /// An invalid/unexpected character.
    ///
    /// expected, actual, position
    InvalidChar(u8, u8, TextPos),

    /// An invalid/unexpected character.
    ///
    /// expected (as a textual description), actual, position
    InvalidChar2(&'static str, u8, TextPos),

    /// An unexpected string.
    ///
    /// Contains what string was expected.
    InvalidString(&'static str, TextPos),

    /// An invalid ExternalID in the DTD.
    InvalidExternalID(TextPos),

    /// A comment cannot contain `--` or end with `-`.
    InvalidComment(TextPos),

    /// A Character Data node contains invalid data.
    ///
    /// Currently, only `]]>` is not allowed.
    InvalidCharacterData(TextPos),

    /// An unknown token.
    UnknownToken(TextPos),

    /// The stream ended earlier than we expected.
    ///
    /// Should only appear on invalid input data.
    UnexpectedEndOfStream,
}
147
148impl Error {
149 /// Returns the error position.
150 pub fn pos(&self) -> TextPos {
151 match *self {
152 Error::InvalidXmlPrefixUri(pos) => pos,
153 Error::UnexpectedXmlUri(pos) => pos,
154 Error::UnexpectedXmlnsUri(pos) => pos,
155 Error::InvalidElementNamePrefix(pos) => pos,
156 Error::DuplicatedNamespace(_, pos) => pos,
157 Error::UnknownNamespace(_, pos) => pos,
158 Error::UnexpectedCloseTag(_, _, pos) => pos,
159 Error::UnexpectedEntityCloseTag(pos) => pos,
160 Error::UnknownEntityReference(_, pos) => pos,
161 Error::MalformedEntityReference(pos) => pos,
162 Error::EntityReferenceLoop(pos) => pos,
163 Error::InvalidAttributeValue(pos) => pos,
164 Error::DuplicatedAttribute(_, pos) => pos,
165 Error::NoRootNode => TextPos::new(1, 1),
166 Error::UnclosedRootNode => TextPos::new(1, 1),
167 Error::UnexpectedDeclaration(pos) => pos,
168 Error::DtdDetected => TextPos::new(1, 1),
169 Error::NodesLimitReached => TextPos::new(1, 1),
170 Error::AttributesLimitReached => TextPos::new(1, 1),
171 Error::NamespacesLimitReached => TextPos::new(1, 1),
172 Error::InvalidName(pos) => pos,
173 Error::NonXmlChar(_, pos) => pos,
174 Error::InvalidChar(_, _, pos) => pos,
175 Error::InvalidChar2(_, _, pos) => pos,
176 Error::InvalidString(_, pos) => pos,
177 Error::InvalidExternalID(pos) => pos,
178 Error::InvalidComment(pos) => pos,
179 Error::InvalidCharacterData(pos) => pos,
180 Error::UnknownToken(pos) => pos,
181 Error::UnexpectedEndOfStream => TextPos::new(1, 1),
182 }
183 }
184}
185
186impl core::fmt::Display for Error {
187 fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
188 match *self {
189 Error::InvalidXmlPrefixUri(pos) => {
190 write!(f, "'xml' namespace prefix mapped to wrong URI at {}", pos)
191 }
192 Error::UnexpectedXmlUri(pos) => {
193 write!(
194 f,
195 "the 'xml' namespace URI is used for not 'xml' prefix at {}",
196 pos
197 )
198 }
199 Error::UnexpectedXmlnsUri(pos) => {
200 write!(
201 f,
202 "the 'xmlns' URI is used at {}, but it must not be declared",
203 pos
204 )
205 }
206 Error::InvalidElementNamePrefix(pos) => {
207 write!(
208 f,
209 "the 'xmlns' prefix is used at {}, but it must not be",
210 pos
211 )
212 }
213 Error::DuplicatedNamespace(ref name, pos) => {
214 write!(f, "namespace '{}' at {} is already defined", name, pos)
215 }
216 Error::UnknownNamespace(ref name, pos) => {
217 write!(f, "an unknown namespace prefix '{}' at {}", name, pos)
218 }
219 Error::UnexpectedCloseTag(ref expected, ref actual, pos) => {
220 write!(
221 f,
222 "expected '{}' tag, not '{}' at {}",
223 expected, actual, pos
224 )
225 }
226 Error::UnexpectedEntityCloseTag(pos) => {
227 write!(f, "unexpected close tag at {}", pos)
228 }
229 Error::MalformedEntityReference(pos) => {
230 write!(f, "malformed entity reference at {}", pos)
231 }
232 Error::UnknownEntityReference(ref name, pos) => {
233 write!(f, "unknown entity reference '{}' at {}", name, pos)
234 }
235 Error::EntityReferenceLoop(pos) => {
236 write!(f, "a possible entity reference loop is detected at {}", pos)
237 }
238 Error::InvalidAttributeValue(pos) => {
239 write!(f, "unescaped '<' found at {}", pos)
240 }
241 Error::DuplicatedAttribute(ref name, pos) => {
242 write!(f, "attribute '{}' at {} is already defined", name, pos)
243 }
244 Error::NoRootNode => {
245 write!(f, "the document does not have a root node")
246 }
247 Error::UnclosedRootNode => {
248 write!(f, "the root node was opened but never closed")
249 }
250 Error::UnexpectedDeclaration(pos) => {
251 write!(f, "unexpected XML declaration at {}", pos)
252 }
253 Error::DtdDetected => {
254 write!(f, "XML with DTD detected")
255 }
256 Error::NodesLimitReached => {
257 write!(f, "nodes limit reached")
258 }
259 Error::AttributesLimitReached => {
260 write!(f, "more than 2^32 attributes were parsed")
261 }
262 Error::NamespacesLimitReached => {
263 write!(f, "more than 2^16 unique namespaces were parsed")
264 }
265 Error::InvalidName(pos) => {
266 write!(f, "invalid name token at {}", pos)
267 }
268 Error::NonXmlChar(c, pos) => {
269 write!(f, "a non-XML character {:?} found at {}", c, pos)
270 }
271 Error::InvalidChar(expected, actual, pos) => {
272 write!(
273 f,
274 "expected '{}' not '{}' at {}",
275 expected as char, actual as char, pos
276 )
277 }
278 Error::InvalidChar2(expected, actual, pos) => {
279 write!(
280 f,
281 "expected {} not '{}' at {}",
282 expected, actual as char, pos
283 )
284 }
285 Error::InvalidString(expected, pos) => {
286 write!(f, "expected '{}' at {}", expected, pos)
287 }
288 Error::InvalidExternalID(pos) => {
289 write!(f, "invalid ExternalID at {}", pos)
290 }
291 Error::InvalidComment(pos) => {
292 write!(f, "comment at {} contains '--'", pos)
293 }
294 Error::InvalidCharacterData(pos) => {
295 write!(f, "']]>' at {} is not allowed inside a character data", pos)
296 }
297 Error::UnknownToken(pos) => {
298 write!(f, "unknown token at {}", pos)
299 }
300 Error::UnexpectedEndOfStream => {
301 write!(f, "unexpected end of stream")
302 }
303 }
304 }
305}
306
// Only compiled when the `std` feature is enabled.
#[cfg(feature = "std")]
impl std::error::Error for Error {
    // Returns a fixed, generic summary; the detailed, variant-specific message
    // is produced by the `Display` implementation.
    // NOTE(review): `std::error::Error::description` is deprecated in favor of
    // `Display`; confirm whether this override is still needed by callers.
    fn description(&self) -> &str {
        "an XML parsing error"
    }
}
313
/// Options that control how a document is parsed.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
pub struct ParsingOptions {
    /// Whether a DTD is allowed in the input.
    ///
    /// When `false`, an XML document with a DTD is rejected with an error.
    /// An empty DTD block is not an error.
    ///
    /// There is currently no option to simply skip the DTD, mainly because
    /// an `UnknownEntityReference` error would be produced later anyway.
    ///
    /// The flag is `false` by default for security reasons. `roxmltree` still
    /// has checks for the billion laughs attack, so this is just an extra
    /// security measure.
    ///
    /// Default: false
    pub allow_dtd: bool,

    /// The maximum number of nodes to parse.
    ///
    /// Useful when dealing with random input to limit memory usage.
    ///
    /// Default: u32::MAX (no limit)
    pub nodes_limit: u32,
}

// Spelled out by hand so that both defaults are visible at a glance.
#[allow(clippy::derivable_impls)]
impl Default for ParsingOptions {
    fn default() -> Self {
        Self {
            nodes_limit: core::u32::MAX,
            allow_dtd: false,
        }
    }
}
350
/// An attribute collected for the element that is currently being parsed.
///
/// Instances are buffered in `Context::current_attributes` and turned into
/// `AttributeData` once the element's namespaces are known.
struct TempAttributeData<'input> {
    /// Namespace prefix of the attribute name; empty when there is none.
    prefix: &'input str,
    /// Local part of the attribute name.
    local: &'input str,
    /// Attribute value after normalization.
    value: StringStorage<'input>,
    /// Position of the attribute as reported by the tokenizer
    /// (used for error positions; presumably an offset into the input text).
    pos: usize,
}
357
impl<'input> Document<'input> {
    /// Parses the input XML string.
    ///
    /// We do not support `&[u8]` or `Reader` because the input must be an already allocated
    /// UTF-8 string.
    ///
    /// This is a shorthand for `Document::parse_with_options(data, ParsingOptions::default())`.
    ///
    /// # Errors
    ///
    /// Returns an [`Error`] when the input cannot be parsed.
    ///
    /// # Examples
    ///
    /// ```
    /// let doc = roxmltree::Document::parse("<e/>").unwrap();
    /// assert_eq!(doc.descendants().count(), 2); // root node + `e` element node
    /// ```
    #[inline]
    pub fn parse(text: &str) -> Result<Document> {
        Self::parse_with_options(text, ParsingOptions::default())
    }

    /// Parses the input XML string using the selected options.
    ///
    /// We do not support `&[u8]` or `Reader` because the input must be an already allocated
    /// UTF-8 string.
    ///
    /// # Errors
    ///
    /// Returns an [`Error`] when the input cannot be parsed or when a limit
    /// set in `opt` is exceeded.
    ///
    /// # Examples
    ///
    /// ```
    /// let opt = roxmltree::ParsingOptions::default();
    /// let doc = roxmltree::Document::parse_with_options("<e/>", opt).unwrap();
    /// assert_eq!(doc.descendants().count(), 2); // root node + `e` element node
    /// ```
    #[inline]
    pub fn parse_with_options(text: &str, opt: ParsingOptions) -> Result<Document> {
        parse(text, opt)
    }
}
394
/// An entity declaration collected from the DTD.
struct Entity<'input> {
    /// Entity name, as used in `&name;` references.
    name: &'input str,
    /// Span of the entity's replacement text inside the input.
    value: StrSpan<'input>,
}
399
/// Name of the element start tag that is currently being parsed.
#[derive(Clone, Copy)]
struct TagNameSpan<'input> {
    /// Namespace prefix of the tag; empty when there is none.
    prefix: &'input str,
    /// Local name of the tag; empty for the "null" span.
    name: &'input str,
    /// Position of the tag start.
    pos: usize,
    /// Position used when reporting errors about the prefix.
    prefix_pos: usize,
}

impl<'input> TagNameSpan<'input> {
    /// Creates a placeholder span with empty names and zero positions.
    #[inline]
    fn new_null() -> Self {
        const EMPTY: &str = "";

        TagNameSpan {
            prefix: EMPTY,
            name: EMPTY,
            pos: 0,
            prefix_pos: 0,
        }
    }
}
419
/// An entity loop detector.
///
/// Limits:
/// - Entities depth is 10.
/// - Maximum number of entity references per entity reference is 255.
///
/// Basically, if a text or an attribute has an entity reference and this reference
/// has more than 10 nested references - this is an error.
///
/// This is useful for simple loops like:
///
/// ```text
/// <!ENTITY a '&b;'>
/// <!ENTITY b '&a;'>
/// ```
///
/// And, if a text or an attribute has an entity reference and it references more
/// than 255 references - this is an error.
///
/// This is useful for cases like the billion laughs attack, where the depth can be
/// pretty small, but the number of references increases exponentially:
///
/// ```text
/// <!ENTITY lol "lol">
/// <!ENTITY lol1 "&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;">
/// <!ENTITY lol2 "&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;">
/// <!ENTITY lol3 "&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;">
/// <!ENTITY lol4 "&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;">
/// ```
#[derive(Default)]
struct LoopDetector {
    /// References depth.
    ///
    /// Zero means that no entity is currently being expanded.
    depth: u8,
    /// Number of references resolved by the root reference.
    ///
    /// Reset to zero whenever `depth` drops back to zero.
    references: u8,
}
456
457impl LoopDetector {
458 #[inline]
459 fn inc_depth(&mut self, stream: &Stream) -> Result<()> {
460 if self.depth < 10 {
461 self.depth += 1;
462 Ok(())
463 } else {
464 Err(Error::EntityReferenceLoop(stream.gen_text_pos()))
465 }
466 }
467
468 #[inline]
469 fn dec_depth(&mut self) {
470 if self.depth > 0 {
471 self.depth -= 1;
472 }
473
474 // Reset references count after reaching zero depth.
475 if self.depth == 0 {
476 self.references = 0;
477 }
478 }
479
480 #[inline]
481 fn inc_references(&mut self, stream: &Stream) -> Result<()> {
482 if self.depth == 0 {
483 // Allow infinite amount of references at zero depth.
484 Ok(())
485 } else {
486 if self.references == core::u8::MAX {
487 return Err(Error::EntityReferenceLoop(stream.gen_text_pos()));
488 }
489
490 self.references += 1;
491 Ok(())
492 }
493 }
494}
495
/// Mutable parser state shared by all token handlers.
struct Context<'input> {
    /// Options the parser was started with.
    opt: ParsingOptions,
    /// Index into `doc.namespaces.tree_order` at which the namespaces declared
    /// on the element currently being parsed begin.
    namespace_start_idx: usize,
    /// Attributes collected for the current element; drained when the element is finished.
    current_attributes: Vec<TempAttributeData<'input>>,
    /// Nodes whose `next_subtree` link is filled in by the next appended node.
    awaiting_subtree: Vec<NodeId>,
    /// Tag prefixes of the currently open elements.
    /// The first entry is an empty prefix pushed for the root node.
    parent_prefixes: Vec<&'input str>,
    /// Entity declarations seen so far.
    entities: Vec<Entity<'input>>,
    /// `true` when the previously processed content was text, in which case
    /// new text is merged into the last text node instead of creating a new one.
    after_text: bool,
    /// Node that receives newly appended children.
    parent_id: NodeId,
    /// Name of the element start tag that is currently being parsed.
    tag_name: TagNameSpan<'input>,
    /// Guard against entity reference loops.
    loop_detector: LoopDetector,
    /// The document under construction.
    doc: Document<'input>,
}
509
510impl<'input> Context<'input> {
511 fn append_node(&mut self, kind: NodeKind<'input>, range: Range<usize>) -> Result<NodeId> {
512 if self.doc.nodes.len() >= self.opt.nodes_limit as usize {
513 return Err(Error::NodesLimitReached);
514 }
515
516 #[cfg(not(feature = "positions"))]
517 let _ = range;
518
519 let new_child_id = NodeId::from(self.doc.nodes.len());
520
521 let appending_element = matches!(kind, NodeKind::Element { .. });
522 self.doc.nodes.push(NodeData {
523 parent: Some(self.parent_id),
524 prev_sibling: None,
525 next_subtree: None,
526 last_child: None,
527 kind,
528 #[cfg(feature = "positions")]
529 range,
530 });
531
532 let last_child_id = self.doc.nodes[self.parent_id.get_usize()].last_child;
533 self.doc.nodes[new_child_id.get_usize()].prev_sibling = last_child_id;
534 self.doc.nodes[self.parent_id.get_usize()].last_child = Some(new_child_id);
535
536 for id in &self.awaiting_subtree {
537 self.doc.nodes[id.get_usize()].next_subtree = Some(new_child_id);
538 }
539 self.awaiting_subtree.clear();
540
541 if !appending_element {
542 self.awaiting_subtree
543 .push(NodeId::from(self.doc.nodes.len() - 1));
544 }
545
546 Ok(new_child_id)
547 }
548
549 fn err_pos_at(&self, pos: usize) -> TextPos {
550 self.doc.text_pos_at(pos)
551 }
552}
553
554fn parse(text: &str, opt: ParsingOptions) -> Result<Document> {
555 // Trying to guess rough nodes and attributes amount.
556 let nodes_capacity = text.bytes().filter(|c| *c == b'<').count();
557 let attributes_capacity = text.bytes().filter(|c| *c == b'=').count();
558
559 // Init document.
560 let mut doc = Document {
561 text,
562 nodes: Vec::with_capacity(nodes_capacity),
563 attributes: Vec::with_capacity(attributes_capacity),
564 namespaces: Namespaces::default(),
565 };
566
567 // Add a root node.
568 doc.nodes.push(NodeData {
569 parent: None,
570 prev_sibling: None,
571 next_subtree: None,
572 last_child: None,
573 kind: NodeKind::Root,
574 #[cfg(feature = "positions")]
575 range: 0..text.len(),
576 });
577
578 doc.namespaces
579 .push_ns(Some(NS_XML_PREFIX), StringStorage::Borrowed(NS_XML_URI))?;
580
581 let mut ctx = Context {
582 opt,
583 namespace_start_idx: 1,
584 current_attributes: Vec::with_capacity(16),
585 entities: Vec::new(),
586 awaiting_subtree: Vec::new(),
587 parent_prefixes: Vec::new(),
588 after_text: false,
589 parent_id: NodeId::new(0),
590 tag_name: TagNameSpan::new_null(),
591 loop_detector: LoopDetector::default(),
592 doc,
593 };
594 ctx.parent_prefixes.push("");
595
596 tokenizer::parse(text, opt.allow_dtd, &mut ctx)?;
597
598 let mut doc = ctx.doc;
599 if !doc.root().children().any(|n| n.is_element()) {
600 return Err(Error::NoRootNode);
601 }
602
603 if ctx.parent_prefixes.len() > 1 {
604 return Err(Error::UnclosedRootNode);
605 }
606
607 doc.nodes.shrink_to_fit();
608 doc.attributes.shrink_to_fit();
609 doc.namespaces.shrink_to_fit();
610
611 Ok(doc)
612}
613
614impl<'input> tokenizer::XmlEvents<'input> for Context<'input> {
615 fn token(&mut self, token: tokenizer::Token<'input>) -> Result<()> {
616 match token {
617 tokenizer::Token::ProcessingInstruction(target, value, range) => {
618 let pi = NodeKind::PI(PI { target, value });
619 self.append_node(pi, range)?;
620 self.after_text = false;
621 }
622 tokenizer::Token::Comment(text, range) => {
623 self.append_node(NodeKind::Comment(StringStorage::Borrowed(text)), range)?;
624 self.after_text = false;
625 }
626 tokenizer::Token::EntityDeclaration(name, definition) => {
627 self.entities.push(Entity {
628 name,
629 value: definition,
630 });
631 }
632 tokenizer::Token::ElementStart(prefix, local, start) => {
633 if prefix == XMLNS {
634 let pos = self.err_pos_at(start + 1);
635 return Err(Error::InvalidElementNamePrefix(pos));
636 }
637
638 self.tag_name = TagNameSpan {
639 prefix,
640 name: local,
641 pos: start,
642 prefix_pos: start + 1,
643 };
644
645 self.after_text = false;
646 }
647 tokenizer::Token::Attribute(attr_start, prefix, local, value) => {
648 process_attribute(attr_start, prefix, local, value, self)?;
649 }
650 tokenizer::Token::ElementEnd(end, range) => {
651 process_element(end, range, self)?;
652 self.after_text = false;
653 }
654 tokenizer::Token::Text(text, range) => {
655 process_text(text, range, self)?;
656 }
657 tokenizer::Token::Cdata(text, range) => {
658 process_cdata(text, range, self)?;
659 }
660 }
661
662 Ok(())
663 }
664}
665
666#[allow(clippy::too_many_arguments)]
667fn process_attribute<'input>(
668 attr_pos: usize,
669 prefix: &'input str,
670 local: &'input str,
671 value: StrSpan<'input>,
672 ctx: &mut Context<'input>,
673) -> Result<()> {
674 let value = normalize_attribute(value, ctx)?;
675
676 if prefix == XMLNS {
677 // The xmlns namespace MUST NOT be declared as the default namespace.
678 if value.as_str() == NS_XMLNS_URI {
679 let pos = ctx.err_pos_at(attr_pos);
680 return Err(Error::UnexpectedXmlnsUri(pos));
681 }
682
683 let is_xml_ns_uri = value.as_str() == NS_XML_URI;
684
685 // The prefix 'xml' is by definition bound to the namespace name
686 // http://www.w3.org/XML/1998/namespace.
687 // It MUST NOT be bound to any other namespace name.
688 if local == NS_XML_PREFIX {
689 if !is_xml_ns_uri {
690 let pos = ctx.err_pos_at(attr_pos);
691 return Err(Error::InvalidXmlPrefixUri(pos));
692 }
693 } else {
694 // The xml namespace MUST NOT be bound to a non-xml prefix.
695 if is_xml_ns_uri {
696 let pos = ctx.err_pos_at(attr_pos);
697 return Err(Error::UnexpectedXmlUri(pos));
698 }
699 }
700
701 // Check for duplicated namespaces.
702 if ctx
703 .doc
704 .namespaces
705 .exists(ctx.namespace_start_idx, Some(local))
706 {
707 let pos = ctx.err_pos_at(attr_pos);
708 return Err(Error::DuplicatedNamespace(local.to_string(), pos));
709 }
710
711 // Xml namespace should not be added to the namespaces.
712 if !is_xml_ns_uri {
713 ctx.doc.namespaces.push_ns(Some(local), value)?;
714 }
715 } else if local == XMLNS {
716 // The xml namespace MUST NOT be declared as the default namespace.
717 if value.as_str() == NS_XML_URI {
718 let pos = ctx.err_pos_at(attr_pos);
719 return Err(Error::UnexpectedXmlUri(pos));
720 }
721
722 // The xmlns namespace MUST NOT be declared as the default namespace.
723 if value.as_str() == NS_XMLNS_URI {
724 let pos = ctx.err_pos_at(attr_pos);
725 return Err(Error::UnexpectedXmlnsUri(pos));
726 }
727
728 ctx.doc.namespaces.push_ns(None, value)?;
729 } else {
730 ctx.current_attributes.push(TempAttributeData {
731 prefix,
732 local,
733 value,
734 pos: attr_pos,
735 });
736 }
737
738 Ok(())
739}
740
/// Handles the end of a tag: `>` (open), `/>` (empty) or `</name>` (close).
///
/// For open and empty tags a new element node is appended using the name stored
/// in `ctx.tag_name` and the attributes and namespaces collected so far.
/// For close tags the name is checked against the current parent element
/// and the parent's own parent becomes the current parent again.
fn process_element<'input>(
    end_token: tokenizer::ElementEnd<'input>,
    token_range: Range<usize>,
    ctx: &mut Context<'input>,
) -> Result<()> {
    // An empty name means that no element start was seen before this end token.
    if ctx.tag_name.name.is_empty() {
        // May occur in XML like this:
        // <!DOCTYPE test [ <!ENTITY p '</p>'> ]>
        // <root>&p;</root>

        if let tokenizer::ElementEnd::Close(..) = end_token {
            return Err(Error::UnexpectedEntityCloseTag(
                ctx.err_pos_at(token_range.start),
            ));
        } else {
            unreachable!("should be already checked by the tokenizer");
        }
    }

    // Namespaces visible on this element. Afterwards, the start index is moved
    // to the end of the list, so that the next element starts with a clean slate.
    let namespaces = ctx.resolve_namespaces();
    ctx.namespace_start_idx = ctx.doc.namespaces.tree_order.len();

    // Turns the buffered attributes into document attributes.
    let attributes = resolve_attributes(namespaces, ctx)?;

    match end_token {
        tokenizer::ElementEnd::Empty => {
            let tag_ns_idx = get_ns_idx_by_prefix(
                namespaces,
                ctx.tag_name.prefix_pos,
                ctx.tag_name.prefix,
                ctx,
            )?;
            let new_element_id = ctx.append_node(
                NodeKind::Element {
                    tag_name: ExpandedNameIndexed {
                        namespace_idx: tag_ns_idx,
                        local_name: ctx.tag_name.name,
                    },
                    attributes,
                    namespaces,
                },
                ctx.tag_name.pos..token_range.end,
            )?;
            // An empty element is already complete, so it immediately waits
            // for the next appended node to become its `next_subtree`.
            ctx.awaiting_subtree.push(new_element_id);
        }
        tokenizer::ElementEnd::Close(prefix, local) => {
            let parent_node = &mut ctx.doc.nodes[ctx.parent_id.get_usize()];
            // should never panic as we start with the single prefix of the
            // root node and always push another one when changing the parent
            let parent_prefix = *ctx.parent_prefixes.last().unwrap();

            // With positions enabled, the element range is extended
            // up to the end of its close tag.
            #[cfg(feature = "positions")]
            {
                parent_node.range.end = token_range.end;
            }

            // The close tag must match the currently open element.
            if let NodeKind::Element { ref tag_name, .. } = parent_node.kind {
                if prefix != parent_prefix || local != tag_name.local_name {
                    return Err(Error::UnexpectedCloseTag(
                        gen_qname_string(parent_prefix, tag_name.local_name),
                        gen_qname_string(prefix, local),
                        ctx.err_pos_at(token_range.start),
                    ));
                }
            }
            // The closed element now waits for the next appended node.
            ctx.awaiting_subtree.push(ctx.parent_id);

            // Move one level up in the tree.
            if let Some(id) = parent_node.parent {
                ctx.parent_id = id;
                ctx.parent_prefixes.pop();
                debug_assert!(!ctx.parent_prefixes.is_empty());
            } else {
                unreachable!("should be already checked by the tokenizer");
            }
        }
        tokenizer::ElementEnd::Open => {
            let tag_ns_idx = get_ns_idx_by_prefix(
                namespaces,
                ctx.tag_name.prefix_pos,
                ctx.tag_name.prefix,
                ctx,
            )?;
            // The new element becomes the parent of everything that follows.
            ctx.parent_id = ctx.append_node(
                NodeKind::Element {
                    tag_name: ExpandedNameIndexed {
                        namespace_idx: tag_ns_idx,
                        local_name: ctx.tag_name.name,
                    },
                    attributes,
                    namespaces,
                },
                ctx.tag_name.pos..token_range.end,
            )?;
            // Remember the prefix to be able to validate the close tag later.
            ctx.parent_prefixes.push(ctx.tag_name.prefix);
        }
    }

    Ok(())
}
840
841impl Context<'_> {
842 fn resolve_namespaces(&mut self) -> ShortRange {
843 if let NodeKind::Element { ref namespaces, .. } =
844 self.doc.nodes[self.parent_id.get_usize()].kind
845 {
846 let parent_ns = *namespaces;
847 if self.namespace_start_idx == self.doc.namespaces.tree_order.len() {
848 return parent_ns;
849 }
850
851 for i in parent_ns.to_urange() {
852 if !self.doc.namespaces.exists(
853 self.namespace_start_idx,
854 self.doc
855 .namespaces
856 .get(self.doc.namespaces.tree_order[i])
857 .name,
858 ) {
859 self.doc.namespaces.push_ref(i);
860 }
861 }
862 }
863
864 (self.namespace_start_idx..self.doc.namespaces.tree_order.len()).into()
865 }
866}
867
868fn resolve_attributes(namespaces: ShortRange, ctx: &mut Context) -> Result<ShortRange> {
869 if ctx.current_attributes.is_empty() {
870 return Ok(ShortRange::new(0, 0));
871 }
872
873 if ctx.doc.attributes.len() + ctx.current_attributes.len() >= core::u32::MAX as usize {
874 return Err(Error::AttributesLimitReached);
875 }
876
877 let start_idx = ctx.doc.attributes.len();
878
879 let current_attributes = core::mem::take(&mut ctx.current_attributes);
880 for attr in current_attributes {
881 let namespace_idx = if attr.prefix == NS_XML_PREFIX {
882 // The prefix 'xml' is by definition bound to the namespace name
883 // http://www.w3.org/XML/1998/namespace. This namespace is added
884 // to the document on creation and is always element 0.
885 Some(NamespaceIdx(0))
886 } else if attr.prefix.is_empty() {
887 // 'The namespace name for an unprefixed attribute name
888 // always has no value.'
889 None
890 } else {
891 get_ns_idx_by_prefix(namespaces, attr.pos, attr.prefix, ctx)?
892 };
893
894 let attr_name = ExpandedNameIndexed {
895 namespace_idx,
896 local_name: attr.local,
897 };
898
899 // Check for duplicated attributes.
900 if ctx.doc.attributes[start_idx..].iter().any(|attr| {
901 attr.name.as_expanded_name(&ctx.doc) == attr_name.as_expanded_name(&ctx.doc)
902 }) {
903 let pos = ctx.err_pos_at(attr.pos);
904 return Err(Error::DuplicatedAttribute(attr.local.to_string(), pos));
905 }
906
907 ctx.doc.attributes.push(AttributeData {
908 name: attr_name,
909 value: attr.value,
910 #[cfg(feature = "positions")]
911 pos: attr.pos,
912 });
913 }
914
915 Ok((start_idx..ctx.doc.attributes.len()).into())
916}
917
/// Handles a text token.
///
/// Text without `&` and `\r` is stored as a borrowed string. Otherwise, the text
/// is processed chunk by chunk: character references and plain bytes are collected
/// into a buffer, while the content of an entity reference is parsed recursively
/// as XML content.
fn process_text<'input>(
    text: &'input str,
    range: Range<usize>,
    ctx: &mut Context<'input>,
) -> Result<()> {
    // Fast path: no references to resolve and no `\r` to handle.
    if !text.bytes().any(|b| b == b'&' || b == b'\r') {
        append_text(StringStorage::Borrowed(text), range, ctx)?;
        ctx.after_text = true;
        return Ok(());
    }

    let mut text_buffer = TextBuffer::new();
    // Set after a character reference was pushed raw outside of an entity;
    // makes the next plain byte to be pushed raw as well and is reset afterwards.
    // TODO: explain why this is required
    let mut is_as_is = false;
    let mut stream = Stream::from_substr(ctx.doc.text, range.clone());
    while !stream.at_end() {
        match parse_next_chunk(&mut stream, &ctx.entities)? {
            // A plain byte of the text.
            NextChunk::Byte(c) => {
                if is_as_is {
                    text_buffer.push_raw(c);
                    is_as_is = false;
                } else {
                    // `TextBuffer` is defined elsewhere;
                    // presumably this is where `\r` gets normalized.
                    text_buffer.push_from_text(c, stream.at_end());
                }
            }
            // A resolved character reference.
            NextChunk::Char(c) => {
                for b in CharToBytes::new(c) {
                    if ctx.loop_detector.depth > 0 {
                        text_buffer.push_from_text(b, stream.at_end());
                    } else {
                        // Characters not from entity should be added as is.
                        // Not sure why... At least `lxml` produces the same result.
                        text_buffer.push_raw(b);
                        is_as_is = true;
                    }
                }
            }
            // The value of a referenced entity.
            NextChunk::Text(fragment) => {
                is_as_is = false;

                // Flush the text collected before the entity reference.
                if !text_buffer.is_empty() {
                    let storage = StringStorage::new_owned(text_buffer.to_str());
                    append_text(storage, range.clone(), ctx)?;
                    text_buffer.clear();
                    ctx.after_text = true;
                }

                ctx.loop_detector.inc_references(&stream)?;
                ctx.loop_detector.inc_depth(&stream)?;

                // Parse the entity value as XML content. The tag name is reset
                // for the duration of the nested parse and restored afterwards.
                let mut stream = Stream::from_substr(ctx.doc.text, fragment.range());
                let prev_tag_name = ctx.tag_name;
                ctx.tag_name = TagNameSpan::new_null();
                tokenizer::parse_content(&mut stream, ctx)?;
                ctx.tag_name = prev_tag_name;
                text_buffer.clear();

                ctx.loop_detector.dec_depth();
            }
        }
    }

    // Flush whatever is left in the buffer.
    if !text_buffer.is_empty() {
        append_text(StringStorage::new_owned(text_buffer.finish()), range, ctx)?;
        ctx.after_text = true;
    }

    Ok(())
}
987
988// While the whole purpose of CDATA is to indicate to an XML library that this text
989// has to be stored as is, carriage return (`\r`) is still has to be replaced with `\n`.
990fn process_cdata<'input>(
991 text: &'input str,
992 range: Range<usize>,
993 ctx: &mut Context<'input>,
994) -> Result<()> {
995 // Add text as is if it has only valid characters.
996 if !text.as_bytes().contains(&b'\r') {
997 append_text(text:StringStorage::Borrowed(text), range, ctx)?;
998 ctx.after_text = true;
999 return Ok(());
1000 }
1001
1002 let mut text_buffer: TextBuffer = TextBuffer::new();
1003 let count: usize = text.chars().count();
1004 for (i: usize, c: char) in text.chars().enumerate() {
1005 for b: u8 in CharToBytes::new(c) {
1006 text_buffer.push_from_text(c:b, at_end:i + 1 == count);
1007 }
1008 }
1009
1010 if !text_buffer.is_empty() {
1011 append_text(text:StringStorage::new_owned(text_buffer.finish()), range, ctx)?;
1012 ctx.after_text = true;
1013 }
1014
1015 Ok(())
1016}
1017
1018fn append_text<'input>(
1019 text: StringStorage<'input>,
1020 range: Range<usize>,
1021 ctx: &mut Context<'input>,
1022) -> Result<()> {
1023 if ctx.after_text {
1024 // Prepend to a previous text node.
1025 if let Some(node: &mut NodeData<'_>) = ctx.doc.nodes.last_mut() {
1026 if let NodeKind::Text(ref mut prev_text: &mut StringStorage<'_>) = node.kind {
1027 let text_str: &str = text.as_str();
1028 let prev_text_str: &str = prev_text.as_str();
1029
1030 let mut concat_text: String = String::with_capacity(text_str.len() + prev_text_str.len());
1031 concat_text.push_str(string:prev_text_str);
1032 concat_text.push_str(string:text_str);
1033 *prev_text = StringStorage::new_owned(concat_text);
1034 }
1035 }
1036 } else {
1037 ctx.append_node(kind:NodeKind::Text(text), range)?;
1038 }
1039
1040 Ok(())
1041}
1042
/// A piece of text produced by `parse_next_chunk`.
enum NextChunk<'a> {
    /// A plain byte that is not the start of a reference.
    Byte(u8),
    /// A character produced by a character reference.
    Char(char),
    /// The value of a referenced entity.
    Text(StrSpan<'a>),
}
1048
1049fn parse_next_chunk<'a>(stream: &mut Stream<'a>, entities: &[Entity<'a>]) -> Result<NextChunk<'a>> {
1050 debug_assert!(!stream.at_end());
1051
1052 // Safe, because we already checked that stream is not at the end.
1053 // But we have an additional `debug_assert` above just in case.
1054 let c = stream.curr_byte_unchecked();
1055
1056 // Check for character/entity references.
1057 if c == b'&' {
1058 let start = stream.pos();
1059 match stream.try_consume_reference() {
1060 Some(Reference::Char(ch)) => Ok(NextChunk::Char(ch)),
1061 Some(Reference::Entity(name)) => entities
1062 .iter()
1063 .find(|e| e.name == name)
1064 .map(|e| NextChunk::Text(e.value))
1065 .ok_or_else(|| {
1066 let pos = stream.gen_text_pos_from(start);
1067 Error::UnknownEntityReference(name.into(), pos)
1068 }),
1069 None => {
1070 let pos = stream.gen_text_pos_from(start);
1071 Err(Error::MalformedEntityReference(pos))
1072 }
1073 }
1074 } else {
1075 stream.advance(1);
1076 Ok(NextChunk::Byte(c))
1077 }
1078}
1079
1080// https://www.w3.org/TR/REC-xml/#AVNormalize
1081fn normalize_attribute<'input>(
1082 text: StrSpan<'input>,
1083 ctx: &mut Context<'input>,
1084) -> Result<StringStorage<'input>> {
1085 if is_normalization_required(&text) {
1086 let mut text_buffer: TextBuffer = TextBuffer::new();
1087 _normalize_attribute(text, &mut text_buffer, ctx)?;
1088 Ok(StringStorage::new_owned(text_buffer.finish()))
1089 } else {
1090 Ok(StringStorage::Borrowed(text.as_str()))
1091 }
1092}
1093
1094#[inline]
1095fn is_normalization_required(text: &StrSpan) -> bool {
1096 // We assume that `&` indicates an entity or a character reference.
1097 // But in rare cases it can be just an another character.
1098
1099 fn check(c: u8) -> bool {
1100 matches!(c, b'&' | b'\t' | b'\n' | b'\r')
1101 }
1102
1103 text.as_str().bytes().any(check)
1104}
1105
1106fn _normalize_attribute(text: StrSpan, buffer: &mut TextBuffer, ctx: &mut Context) -> Result<()> {
1107 let mut stream = Stream::from_substr(ctx.doc.text, text.range());
1108 while !stream.at_end() {
1109 // Safe, because we already checked that the stream is not at the end.
1110 let c = stream.curr_byte_unchecked();
1111
1112 if c != b'&' {
1113 stream.advance(1);
1114 buffer.push_from_attr(c, stream.curr_byte().ok());
1115 continue;
1116 }
1117
1118 // Check for character/entity references.
1119 let start = stream.pos();
1120 match stream.try_consume_reference() {
1121 Some(Reference::Char(ch)) => {
1122 for b in CharToBytes::new(ch) {
1123 if ctx.loop_detector.depth > 0 {
1124 // Escaped `<` inside an ENTITY is an error.
1125 // Escaped `<` outside an ENTITY is ok.
1126 if b == b'<' {
1127 return Err(Error::InvalidAttributeValue(
1128 stream.gen_text_pos_from(start),
1129 ));
1130 }
1131
1132 buffer.push_from_attr(b, None);
1133 } else {
1134 // Characters not from entity should be added as is.
1135 // Not sure why... At least `lxml` produces the same results.
1136 buffer.push_raw(b);
1137 }
1138 }
1139 }
1140 Some(Reference::Entity(name)) => match ctx.entities.iter().find(|e| e.name == name) {
1141 Some(entity) => {
1142 ctx.loop_detector.inc_references(&stream)?;
1143 ctx.loop_detector.inc_depth(&stream)?;
1144 _normalize_attribute(entity.value, buffer, ctx)?;
1145 ctx.loop_detector.dec_depth();
1146 }
1147 None => {
1148 let pos = stream.gen_text_pos_from(start);
1149 return Err(Error::UnknownEntityReference(name.into(), pos));
1150 }
1151 },
1152 None => {
1153 let pos = stream.gen_text_pos_from(start);
1154 return Err(Error::MalformedEntityReference(pos));
1155 }
1156 }
1157 }
1158
1159 Ok(())
1160}
1161
1162fn get_ns_idx_by_prefix(
1163 namespaces: ShortRange,
1164 prefix_pos: usize,
1165 prefix: &str,
1166 ctx: &Context,
1167) -> Result<Option<NamespaceIdx>> {
1168 // Prefix CAN be empty when the default namespace was defined.
1169 //
1170 // Example:
1171 // <e xmlns='http://www.w3.org'/>
1172 let prefix_opt = if prefix.is_empty() {
1173 None
1174 } else {
1175 Some(prefix)
1176 };
1177
1178 let idx = ctx.doc.namespaces.tree_order[namespaces.to_urange()]
1179 .iter()
1180 .find(|idx| ctx.doc.namespaces.get(**idx).name == prefix_opt);
1181
1182 match idx {
1183 Some(idx) => Ok(Some(*idx)),
1184 None => {
1185 if !prefix.is_empty() {
1186 // If an URI was not found and prefix IS NOT empty than
1187 // we have an unknown namespace.
1188 //
1189 // Example:
1190 // <e random:a='b'/>
1191 let pos = ctx.err_pos_at(prefix_pos);
1192 Err(Error::UnknownNamespace(prefix.to_string(), pos))
1193 } else {
1194 // If an URI was not found and prefix IS empty than
1195 // an element or an attribute doesn't have a namespace.
1196 //
1197 // Example:
1198 // <e a='b'/>
1199 Ok(None)
1200 }
1201 }
1202 }
1203}
1204
/// Builds a qualified name: `local` alone when `prefix` is empty,
/// `prefix:local` otherwise.
fn gen_qname_string(prefix: &str, local: &str) -> String {
    match prefix {
        "" => local.to_string(),
        _ => alloc::format!("{}:{}", prefix, local),
    }
}
1212
/// Yields the UTF-8 encoding of a `char` one byte at a time.
struct CharToBytes {
    /// Encoded bytes; slots past the encoded length keep the `0xFF` filler.
    buf: [u8; 4],
    /// Index of the next byte to yield; `4` means the iterator is exhausted.
    idx: u8,
}

impl CharToBytes {
    /// Encodes `c` and positions the iterator at its first byte.
    #[inline]
    fn new(c: char) -> Self {
        // The buffer is pre-filled with `0xFF` so that `next` can tell
        // where the encoded bytes stop.
        let mut buf = [0xFF_u8; 4];
        c.encode_utf8(&mut buf);

        Self { buf, idx: 0 }
    }
}

impl Iterator for CharToBytes {
    type Item = u8;

    #[inline]
    fn next(&mut self) -> Option<u8> {
        match self.buf.get(usize::from(self.idx)).copied() {
            // Reached the filler: nothing left, stay exhausted from now on.
            Some(0xFF) => {
                self.idx = 4;
                None
            }
            Some(byte) => {
                self.idx += 1;
                Some(byte)
            }
            // All four slots were consumed.
            None => None,
        }
    }
}
1248
/// A byte buffer used to assemble normalized text and attribute values.
struct TextBuffer {
    /// Accumulated bytes; expected to form valid UTF-8 once complete.
    buffer: Vec<u8>,
}

impl TextBuffer {
    /// Creates an empty buffer with a small preallocated capacity.
    #[inline]
    fn new() -> Self {
        Self {
            buffer: Vec::with_capacity(32),
        }
    }

    /// Appends a byte without any processing.
    #[inline]
    fn push_raw(&mut self, c: u8) {
        self.buffer.push(c);
    }

    /// Appends a byte of an attribute value, normalizing whitespace.
    ///
    /// `next` is the byte that follows `current`, if any.
    fn push_from_attr(&mut self, current: u8, next: Option<u8>) {
        match (current, next) {
            // \r in \r\n should be ignored.
            (b'\r', Some(b'\n')) => {}
            // \n, \r and \t should be converted into spaces.
            (b'\n', _) | (b'\r', _) | (b'\t', _) => self.buffer.push(b' '),
            (other, _) => self.buffer.push(other),
        }
    }

    /// Appends a byte of a text node, normalizing line endings.
    ///
    /// `\r\n` and any `\r` that is not followed by `\n` are translated
    /// into a single `\n`. `at_end` tells whether `c` is the last byte,
    /// so that a trailing `\r` can be converted right away.
    ///
    /// <https://www.w3.org/TR/xml/#sec-line-ends>
    fn push_from_text(&mut self, c: u8, at_end: bool) {
        // A `\r` stored by the previous call is resolved now that
        // the byte after it is known.
        let after_cr = self.buffer.last() == Some(&b'\r');
        if after_cr {
            if let Some(last) = self.buffer.last_mut() {
                *last = b'\n';
            }
        }

        let to_push = if at_end && c == b'\r' {
            // No further byte will arrive to resolve this `\r`.
            Some(b'\n')
        } else if after_cr && c == b'\n' {
            // Second half of `\r\n`: already covered by the replaced `\r`.
            None
        } else {
            Some(c)
        };

        if let Some(byte) = to_push {
            self.buffer.push(byte);
        }
    }

    /// Removes all bytes from the buffer.
    #[inline]
    fn clear(&mut self) {
        self.buffer.clear();
    }

    /// Returns `true` when the buffer holds no bytes.
    #[inline]
    fn is_empty(&self) -> bool {
        self.buffer.is_empty()
    }

    /// Views the buffer as a string slice.
    ///
    /// Panics if the buffer is not valid UTF-8.
    #[inline]
    fn to_str(&self) -> &str {
        // `unwrap` is safe, because buffer must contain a valid UTF-8 string.
        core::str::from_utf8(&self.buffer).unwrap()
    }

    /// Consumes the buffer and returns its content as a `String`.
    ///
    /// Panics if the buffer is not valid UTF-8.
    #[inline]
    fn finish(self) -> String {
        // `unwrap` is safe, because buffer must contain a valid UTF-8 string.
        String::from_utf8(self.buffer).unwrap()
    }
}
1323