parse.rs source code [crates/roxmltree-0.19.0/src/parse.rs]

1	use alloc::string::{String, ToString};
2	use alloc::vec::Vec;
3	use core::ops::Range;
4
5	use crate::{
6	AttributeData, Document, ExpandedNameIndexed, NamespaceIdx, Namespaces, NodeData, NodeId,
7	NodeKind, ShortRange, StringStorage, TextPos, NS_XMLNS_URI, NS_XML_PREFIX, NS_XML_URI, PI,
8	XMLNS,
9	};
10
11	use crate::tokenizer::{self, Reference, StrSpan, Stream};
12
13	type Result<T> = core::result::Result<T, Error>;
14
15	/// A list of all possible errors.
16	#[derive(Clone, PartialEq, Eq, Hash, Debug)]
17	pub enum Error {
18	/// The `xmlns:xml` attribute must have an <http://www.w3.org/XML/1998/namespace> URI.
19	InvalidXmlPrefixUri(TextPos),
20
21	/// Only the `xmlns:xml` attribute can have the <http://www.w3.org/XML/1998/namespace> URI.
22	UnexpectedXmlUri(TextPos),
23
24	/// The <http://www.w3.org/2000/xmlns/> URI must not be declared.
25	UnexpectedXmlnsUri(TextPos),
26
27	/// `xmlns` can't be used as an element prefix.
28	InvalidElementNamePrefix(TextPos),
29
30	/// A namespace was already defined on this element.
31	DuplicatedNamespace(String, TextPos),
32
33	/// An unknown namespace.
34	///
35	/// Indicates that an element or an attribute has an unknown qualified name prefix.
36	///
37	/// The first value is a prefix.
38	UnknownNamespace(String, TextPos),
39
40	/// Incorrect tree structure.
41	///
42	/// expected, actual, position
43	#[allow(missing_docs)]
44	UnexpectedCloseTag(String, String, TextPos),
45
46	/// Entity value starts with a close tag.
47	///
48	/// Example:
49	/// ```xml
50	/// <!DOCTYPE test [ <!ENTITY p '</p>'> ]>
51	/// <root>&p;</root>
52	/// ```
53	UnexpectedEntityCloseTag(TextPos),
54
55	/// A reference to an entity that was not defined in the DTD.
56	UnknownEntityReference(String, TextPos),
57
58	/// A malformed entity reference.
59	///
60	/// A `&` character inside an attribute value or text indicates an entity reference.
61	/// Otherwise, the document is not well-formed.
62	MalformedEntityReference(TextPos),
63
64	/// A possible entity reference loop.
65	///
66	/// The current depth limit is 10. The max number of references per reference is 255.
67	EntityReferenceLoop(TextPos),
68
69	/// Attribute value cannot have a `<` character.
70	InvalidAttributeValue(TextPos),
71
72	/// An element has a duplicated attributes.
73	///
74	/// This also includes namespaces resolving.
75	/// So an element like this will lead to an error.
76	/// ```xml
77	/// <e xmlns:n1='http://www.w3.org' xmlns:n2='http://www.w3.org' n1:a='b1' n2:a='b2'/>
78	/// ```
79	DuplicatedAttribute(String, TextPos),
80
81	/// The XML document must have at least one element.
82	NoRootNode,
83
84	/// The root node was opened but never closed.
85	UnclosedRootNode,
86
87	/// An XML document can have only one XML declaration
88	/// and it must be at the start of the document.
89	UnexpectedDeclaration(TextPos),
90
91	/// An XML with DTD detected.
92	///
93	/// This error will be emitted only when `ParsingOptions::allow_dtd` is set to `false`.
94	DtdDetected,
95
96	/// Indicates that the [`ParsingOptions::nodes_limit`] was reached.
97	NodesLimitReached,
98
99	/// Indicates that too many attributes were parsed.
100	AttributesLimitReached,
101
102	/// Indicates that too many namespaces were parsed.
103	NamespacesLimitReached,
104
105	/// An invalid name.
106	InvalidName(TextPos),
107
108	/// A non-XML character has occurred.
109	///
110	/// Valid characters are: <https://www.w3.org/TR/xml/#char32>
111	NonXmlChar(char, TextPos),
112
113	/// An invalid/unexpected character.
114	///
115	/// expected, actual, position
116	InvalidChar(u8, u8, TextPos),
117
118	/// An invalid/unexpected character.
119	///
120	/// expected, actual, position
121	InvalidChar2(&'static str, u8, TextPos),
122
123	/// An unexpected string.
124	///
125	/// Contains what string was expected.
126	InvalidString(&'static str, TextPos),
127
128	/// An invalid ExternalID in the DTD.
129	InvalidExternalID(TextPos),
130
131	/// A comment cannot contain `--` or end with `-`.
132	InvalidComment(TextPos),
133
134	/// A Character Data node contains an invalid data.
135	///
136	/// Currently, only `]]>` is not allowed.
137	InvalidCharacterData(TextPos),
138
139	/// An unknown token.
140	UnknownToken(TextPos),
141
142	/// The steam ended earlier than we expected.
143	///
144	/// Should only appear on invalid input data.
145	UnexpectedEndOfStream,
146	}
147
148	impl Error {
149	/// Returns the error position.
150	pub fn pos(&self) -> TextPos {
151	match *self {
152	Error::InvalidXmlPrefixUri(pos) => pos,
153	Error::UnexpectedXmlUri(pos) => pos,
154	Error::UnexpectedXmlnsUri(pos) => pos,
155	Error::InvalidElementNamePrefix(pos) => pos,
156	Error::DuplicatedNamespace(_, pos) => pos,
157	Error::UnknownNamespace(_, pos) => pos,
158	Error::UnexpectedCloseTag(_, _, pos) => pos,
159	Error::UnexpectedEntityCloseTag(pos) => pos,
160	Error::UnknownEntityReference(_, pos) => pos,
161	Error::MalformedEntityReference(pos) => pos,
162	Error::EntityReferenceLoop(pos) => pos,
163	Error::InvalidAttributeValue(pos) => pos,
164	Error::DuplicatedAttribute(_, pos) => pos,
165	Error::NoRootNode => TextPos::new(`1`, `1`),
166	Error::UnclosedRootNode => TextPos::new(`1`, `1`),
167	Error::UnexpectedDeclaration(pos) => pos,
168	Error::DtdDetected => TextPos::new(`1`, `1`),
169	Error::NodesLimitReached => TextPos::new(`1`, `1`),
170	Error::AttributesLimitReached => TextPos::new(`1`, `1`),
171	Error::NamespacesLimitReached => TextPos::new(`1`, `1`),
172	Error::InvalidName(pos) => pos,
173	Error::NonXmlChar(_, pos) => pos,
174	Error::InvalidChar(_, _, pos) => pos,
175	Error::InvalidChar2(_, _, pos) => pos,
176	Error::InvalidString(_, pos) => pos,
177	Error::InvalidExternalID(pos) => pos,
178	Error::InvalidComment(pos) => pos,
179	Error::InvalidCharacterData(pos) => pos,
180	Error::UnknownToken(pos) => pos,
181	Error::UnexpectedEndOfStream => TextPos::new(`1`, `1`),
182	}
183	}
184	}
185
186	impl core::fmt::Display for Error {
187	fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
188	match *self {
189	Error::InvalidXmlPrefixUri(pos) => {
190	write!(f, "'xml' namespace prefix mapped to wrong URI at {}", pos)
191	}
192	Error::UnexpectedXmlUri(pos) => {
193	write!(
194	f,
195	"the 'xml' namespace URI is used for not 'xml' prefix at {}",
196	pos
197	)
198	}
199	Error::UnexpectedXmlnsUri(pos) => {
200	write!(
201	f,
202	"the 'xmlns' URI is used at {}, but it must not be declared",
203	pos
204	)
205	}
206	Error::InvalidElementNamePrefix(pos) => {
207	write!(
208	f,
209	"the 'xmlns' prefix is used at {}, but it must not be",
210	pos
211	)
212	}
213	Error::DuplicatedNamespace(ref name, pos) => {
214	write!(f, "namespace '{}' at {} is already defined", name, pos)
215	}
216	Error::UnknownNamespace(ref name, pos) => {
217	write!(f, "an unknown namespace prefix '{}' at {}", name, pos)
218	}
219	Error::UnexpectedCloseTag(ref expected, ref actual, pos) => {
220	write!(
221	f,
222	"expected '{}' tag, not '{}' at {}",
223	expected, actual, pos
224	)
225	}
226	Error::UnexpectedEntityCloseTag(pos) => {
227	write!(f, "unexpected close tag at {}", pos)
228	}
229	Error::MalformedEntityReference(pos) => {
230	write!(f, "malformed entity reference at {}", pos)
231	}
232	Error::UnknownEntityReference(ref name, pos) => {
233	write!(f, "unknown entity reference '{}' at {}", name, pos)
234	}
235	Error::EntityReferenceLoop(pos) => {
236	write!(f, "a possible entity reference loop is detected at {}", pos)
237	}
238	Error::InvalidAttributeValue(pos) => {
239	write!(f, "unescaped '<' found at {}", pos)
240	}
241	Error::DuplicatedAttribute(ref name, pos) => {
242	write!(f, "attribute '{}' at {} is already defined", name, pos)
243	}
244	Error::NoRootNode => {
245	write!(f, "the document does not have a root node")
246	}
247	Error::UnclosedRootNode => {
248	write!(f, "the root node was opened but never closed")
249	}
250	Error::UnexpectedDeclaration(pos) => {
251	write!(f, "unexpected XML declaration at {}", pos)
252	}
253	Error::DtdDetected => {
254	write!(f, "XML with DTD detected")
255	}
256	Error::NodesLimitReached => {
257	write!(f, "nodes limit reached")
258	}
259	Error::AttributesLimitReached => {
260	write!(f, "more than 2^32 attributes were parsed")
261	}
262	Error::NamespacesLimitReached => {
263	write!(f, "more than 2^16 unique namespaces were parsed")
264	}
265	Error::InvalidName(pos) => {
266	write!(f, "invalid name token at {}", pos)
267	}
268	Error::NonXmlChar(c, pos) => {
269	write!(f, "a non-XML character {:?} found at {}", c, pos)
270	}
271	Error::InvalidChar(expected, actual, pos) => {
272	write!(
273	f,
274	"expected '{}' not '{}' at {}",
275	expected as char, actual as char, pos
276	)
277	}
278	Error::InvalidChar2(expected, actual, pos) => {
279	write!(
280	f,
281	"expected {} not '{}' at {}",
282	expected, actual as char, pos
283	)
284	}
285	Error::InvalidString(expected, pos) => {
286	write!(f, "expected '{}' at {}", expected, pos)
287	}
288	Error::InvalidExternalID(pos) => {
289	write!(f, "invalid ExternalID at {}", pos)
290	}
291	Error::InvalidComment(pos) => {
292	write!(f, "comment at {} contains '--'", pos)
293	}
294	Error::InvalidCharacterData(pos) => {
295	write!(f, "']]>' at {} is not allowed inside a character data", pos)
296	}
297	Error::UnknownToken(pos) => {
298	write!(f, "unknown token at {}", pos)
299	}
300	Error::UnexpectedEndOfStream => {
301	write!(f, "unexpected end of stream")
302	}
303	}
304	}
305	}
306
/// Integrates [`Error`] with the standard error trait when `std` is enabled.
#[cfg(feature = "std")]
impl std::error::Error for Error {
    /// Returns a fixed, generic description; the detailed message is
    /// produced by the `Display` implementation.
    fn description(&self) -> &str {
        const DESCRIPTION: &str = "an XML parsing error";
        DESCRIPTION
    }
}
313
/// Options that control how a document is parsed.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
pub struct ParsingOptions {
    /// Whether a DTD is allowed in the input.
    ///
    /// When `false`, XML with a DTD causes an error.
    /// An empty DTD block is not an error.
    ///
    /// There is currently no option to simply skip the DTD, mainly because
    /// an `UnknownEntityReference` error would follow later anyway.
    ///
    /// This flag is `false` by default for security reasons. `roxmltree`
    /// still has checks against the billion laughs attack, so this is just
    /// an extra security measure.
    ///
    /// Default: false
    pub allow_dtd: bool,

    /// The maximum number of nodes to parse.
    ///
    /// Useful for limiting memory usage when dealing with random input.
    ///
    /// Default: u32::MAX (no limit)
    pub nodes_limit: u32,
}

// Written out by hand so the defaults are visible at a glance; note that
// `nodes_limit` defaults to `u32::MAX`, not zero.
#[allow(clippy::derivable_impls)]
impl Default for ParsingOptions {
    fn default() -> Self {
        Self {
            allow_dtd: false,
            nodes_limit: u32::MAX,
        }
    }
}
350
351	struct TempAttributeData<'input> {
352	prefix: &'input str,
353	local: &'input str,
354	value: StringStorage<'input>,
355	pos: usize,
356	}
357
358	impl<'input> Document<'input> {
359	/// Parses the input XML string.
360	///
361	/// We do not support `&[u8]` or `Reader` because the input must be an already allocated
362	/// UTF-8 string.
363	///
364	/// This is a shorthand for `Document::parse_with_options(data, ParsingOptions::default())`.
365	///
366	/// # Examples
367	///
368	/// ```
369	/// let doc = roxmltree::Document::parse("<e/>").unwrap();
370	/// assert_eq!(doc.descendants().count(), `2`); // root node + `e` element node
371	/// ```
372	#[inline]
373	pub fn parse(text: &str) -> Result<Document> {
374	Self::parse_with_options(text, ParsingOptions::default())
375	}
376
377	/// Parses the input XML string using to selected options.
378	///
379	/// We do not support `&[u8]` or `Reader` because the input must be an already allocated
380	/// UTF-8 string.
381	///
382	/// # Examples
383	///
384	/// ```
385	/// let opt = roxmltree::ParsingOptions::default();
386	/// let doc = roxmltree::Document::parse_with_options("<e/>", opt).unwrap();
387	/// assert_eq!(doc.descendants().count(), `2`); // root node + `e` element node
388	/// ```
389	#[inline]
390	pub fn parse_with_options(text: &str, opt: ParsingOptions) -> Result<Document> {
391	parse(text, opt)
392	}
393	}
394
395	struct Entity<'input> {
396	name: &'input str,
397	value: StrSpan<'input>,
398	}
399
/// The name of the element tag that is currently being parsed.
#[derive(Clone, Copy)]
struct TagNameSpan<'input> {
    /// Qualified name prefix; empty when there is none.
    prefix: &'input str,
    /// Local name; empty in the "null" span.
    name: &'input str,
    /// Offset of the tag start in the input.
    pos: usize,
    /// Offset used when reporting errors about the prefix.
    prefix_pos: usize,
}

impl<'input> TagNameSpan<'input> {
    /// Creates a placeholder span with empty names and zero offsets.
    ///
    /// An empty `name` marks "no element start has been seen yet".
    #[inline]
    fn new_null() -> Self {
        let empty = "";
        TagNameSpan {
            prefix: empty,
            name: empty,
            pos: 0,
            prefix_pos: 0,
        }
    }
}
419
420	/// An entity loop detector.
421	///
422	/// Limits:
423	/// - Entities depth is 10.
424	/// - Maximum number of entity references per entity reference is 255.
425	///
426	/// Basically, if a text or an attribute has an entity reference and this reference
427	/// has more than 10 nested references - this is an error.
428	///
429	/// This is useful for simple loops like:
430	///
431	/// ```text
432	/// <!ENTITY a '&b;'>
433	/// <!ENTITY b '&a;'>
434	/// ```
435	///
436	/// And, if a text or an attribute has an entity reference and it references more
437	/// than 255 references - this is an error.
438	///
439	/// This is useful for cases like billion laughs attack, where depth can be pretty small,
440	/// but the number of references is exponentially increasing:
441	///
442	/// ```text
443	/// <!ENTITY lol "lol">
444	/// <!ENTITY lol1 "&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;">
445	/// <!ENTITY lol2 "&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;">
446	/// <!ENTITY lol3 "&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;">
447	/// <!ENTITY lol4 "&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;">
448	/// ```
449	#[derive(Default)]
450	struct LoopDetector {
451	/// References depth.
452	depth: u8,
453	/// Number of references resolved by the root reference.
454	references: u8,
455	}
456
457	impl LoopDetector {
458	#[inline]
459	fn inc_depth(&mut self, stream: &Stream) -> Result<()> {
460	if self.depth < `10` {
461	self.depth += `1`;
462	Ok(())
463	} else {
464	Err(Error::EntityReferenceLoop(stream.gen_text_pos()))
465	}
466	}
467
468	#[inline]
469	fn dec_depth(&mut self) {
470	if self.depth > `0` {
471	self.depth -= `1`;
472	}
473
474	// Reset references count after reaching zero depth.
475	if self.depth == `0` {
476	self.references = `0`;
477	}
478	}
479
480	#[inline]
481	fn inc_references(&mut self, stream: &Stream) -> Result<()> {
482	if self.depth == `0` {
483	// Allow infinite amount of references at zero depth.
484	Ok(())
485	} else {
486	if self.references == core::u8::MAX {
487	return Err(Error::EntityReferenceLoop(stream.gen_text_pos()));
488	}
489
490	self.references += `1`;
491	Ok(())
492	}
493	}
494	}
495
496	struct Context<'input> {
497	opt: ParsingOptions,
498	namespace_start_idx: usize,
499	current_attributes: Vec<TempAttributeData<'input>>,
500	awaiting_subtree: Vec<NodeId>,
501	parent_prefixes: Vec<&'input str>,
502	entities: Vec<Entity<'input>>,
503	after_text: bool,
504	parent_id: NodeId,
505	tag_name: TagNameSpan<'input>,
506	loop_detector: LoopDetector,
507	doc: Document<'input>,
508	}
509
510	impl<'input> Context<'input> {
511	fn append_node(&mut self, kind: NodeKind<'input>, range: Range<usize>) -> Result<NodeId> {
512	if self.doc.nodes.len() >= self.opt.nodes_limit as usize {
513	return Err(Error::NodesLimitReached);
514	}
515
516	#[cfg(not(feature = "positions"))]
517	let _ = range;
518
519	let new_child_id = NodeId::from(self.doc.nodes.len());
520
521	let appending_element = matches!(kind, NodeKind::Element { .. });
522	self.doc.nodes.push(NodeData {
523	parent: Some(self.parent_id),
524	prev_sibling: None,
525	next_subtree: None,
526	last_child: None,
527	kind,
528	#[cfg(feature = "positions")]
529	range,
530	});
531
532	let last_child_id = self.doc.nodes[self.parent_id.get_usize()].last_child;
533	self.doc.nodes[new_child_id.get_usize()].prev_sibling = last_child_id;
534	self.doc.nodes[self.parent_id.get_usize()].last_child = Some(new_child_id);
535
536	for id in &self.awaiting_subtree {
537	self.doc.nodes[id.get_usize()].next_subtree = Some(new_child_id);
538	}
539	self.awaiting_subtree.clear();
540
541	if !appending_element {
542	self.awaiting_subtree
543	.push(NodeId::from(self.doc.nodes.len() - `1`));
544	}
545
546	Ok(new_child_id)
547	}
548
549	fn err_pos_at(&self, pos: usize) -> TextPos {
550	self.doc.text_pos_at(pos)
551	}
552	}
553
554	fn parse(text: &str, opt: ParsingOptions) -> Result<Document> {
555	// Trying to guess rough nodes and attributes amount.
556	let nodes_capacity = text.bytes().filter(\|c\| *c == b'<').count();
557	let attributes_capacity = text.bytes().filter(\|c\| *c == b'=').count();
558
559	// Init document.
560	let mut doc = Document {
561	text,
562	nodes: Vec::with_capacity(nodes_capacity),
563	attributes: Vec::with_capacity(attributes_capacity),
564	namespaces: Namespaces::default(),
565	};
566
567	// Add a root node.
568	doc.nodes.push(NodeData {
569	parent: None,
570	prev_sibling: None,
571	next_subtree: None,
572	last_child: None,
573	kind: NodeKind::Root,
574	#[cfg(feature = "positions")]
575	range: `0`..text.len(),
576	});
577
578	doc.namespaces
579	.push_ns(Some(NS_XML_PREFIX), StringStorage::Borrowed(NS_XML_URI))?;
580
581	let mut ctx = Context {
582	opt,
583	namespace_start_idx: `1`,
584	current_attributes: Vec::with_capacity(`16`),
585	entities: Vec::new(),
586	awaiting_subtree: Vec::new(),
587	parent_prefixes: Vec::new(),
588	after_text: `false`,
589	parent_id: NodeId::new(`0`),
590	tag_name: TagNameSpan::new_null(),
591	loop_detector: LoopDetector::default(),
592	doc,
593	};
594	ctx.parent_prefixes.push("");
595
596	tokenizer::parse(text, opt.allow_dtd, &mut ctx)?;
597
598	let mut doc = ctx.doc;
599	if !doc.root().children().any(\|n\| n.is_element()) {
600	return Err(Error::NoRootNode);
601	}
602
603	if ctx.parent_prefixes.len() > `1` {
604	return Err(Error::UnclosedRootNode);
605	}
606
607	doc.nodes.shrink_to_fit();
608	doc.attributes.shrink_to_fit();
609	doc.namespaces.shrink_to_fit();
610
611	Ok(doc)
612	}
613
614	impl<'input> tokenizer::XmlEvents<'input> for Context<'input> {
615	fn token(&mut self, token: tokenizer::Token<'input>) -> Result<()> {
616	match token {
617	tokenizer::Token::ProcessingInstruction(target, value, range) => {
618	let pi = NodeKind::PI(PI { target, value });
619	self.append_node(pi, range)?;
620	self.after_text = `false`;
621	}
622	tokenizer::Token::Comment(text, range) => {
623	self.append_node(NodeKind::Comment(StringStorage::Borrowed(text)), range)?;
624	self.after_text = `false`;
625	}
626	tokenizer::Token::EntityDeclaration(name, definition) => {
627	self.entities.push(Entity {
628	name,
629	value: definition,
630	});
631	}
632	tokenizer::Token::ElementStart(prefix, local, start) => {
633	if prefix == XMLNS {
634	let pos = self.err_pos_at(start + `1`);
635	return Err(Error::InvalidElementNamePrefix(pos));
636	}
637
638	self.tag_name = TagNameSpan {
639	prefix,
640	name: local,
641	pos: start,
642	prefix_pos: start + `1`,
643	};
644
645	self.after_text = `false`;
646	}
647	tokenizer::Token::Attribute(attr_start, prefix, local, value) => {
648	process_attribute(attr_start, prefix, local, value, self)?;
649	}
650	tokenizer::Token::ElementEnd(end, range) => {
651	process_element(end, range, self)?;
652	self.after_text = `false`;
653	}
654	tokenizer::Token::Text(text, range) => {
655	process_text(text, range, self)?;
656	}
657	tokenizer::Token::Cdata(text, range) => {
658	process_cdata(text, range, self)?;
659	}
660	}
661
662	Ok(())
663	}
664	}
665
666	#[allow(clippy::too_many_arguments)]
667	fn process_attribute<'input>(
668	attr_pos: usize,
669	prefix: &'input str,
670	local: &'input str,
671	value: StrSpan<'input>,
672	ctx: &mut Context<'input>,
673	) -> Result<()> {
674	let value = normalize_attribute(value, ctx)?;
675
676	if prefix == XMLNS {
677	// The xmlns namespace MUST NOT be declared as the default namespace.
678	if value.as_str() == NS_XMLNS_URI {
679	let pos = ctx.err_pos_at(attr_pos);
680	return Err(Error::UnexpectedXmlnsUri(pos));
681	}
682
683	let is_xml_ns_uri = value.as_str() == NS_XML_URI;
684
685	// The prefix 'xml' is by definition bound to the namespace name
686	// http://www.w3.org/XML/1998/namespace.
687	// It MUST NOT be bound to any other namespace name.
688	if local == NS_XML_PREFIX {
689	if !is_xml_ns_uri {
690	let pos = ctx.err_pos_at(attr_pos);
691	return Err(Error::InvalidXmlPrefixUri(pos));
692	}
693	} else {
694	// The xml namespace MUST NOT be bound to a non-xml prefix.
695	if is_xml_ns_uri {
696	let pos = ctx.err_pos_at(attr_pos);
697	return Err(Error::UnexpectedXmlUri(pos));
698	}
699	}
700
701	// Check for duplicated namespaces.
702	if ctx
703	.doc
704	.namespaces
705	.exists(ctx.namespace_start_idx, Some(local))
706	{
707	let pos = ctx.err_pos_at(attr_pos);
708	return Err(Error::DuplicatedNamespace(local.to_string(), pos));
709	}
710
711	// Xml namespace should not be added to the namespaces.
712	if !is_xml_ns_uri {
713	ctx.doc.namespaces.push_ns(Some(local), value)?;
714	}
715	} else if local == XMLNS {
716	// The xml namespace MUST NOT be declared as the default namespace.
717	if value.as_str() == NS_XML_URI {
718	let pos = ctx.err_pos_at(attr_pos);
719	return Err(Error::UnexpectedXmlUri(pos));
720	}
721
722	// The xmlns namespace MUST NOT be declared as the default namespace.
723	if value.as_str() == NS_XMLNS_URI {
724	let pos = ctx.err_pos_at(attr_pos);
725	return Err(Error::UnexpectedXmlnsUri(pos));
726	}
727
728	ctx.doc.namespaces.push_ns(None, value)?;
729	} else {
730	ctx.current_attributes.push(TempAttributeData {
731	prefix,
732	local,
733	value,
734	pos: attr_pos,
735	});
736	}
737
738	Ok(())
739	}
740
741	fn process_element<'input>(
742	end_token: tokenizer::ElementEnd<'input>,
743	token_range: Range<usize>,
744	ctx: &mut Context<'input>,
745	) -> Result<()> {
746	if ctx.tag_name.name.is_empty() {
747	// May occur in XML like this:
748	// <!DOCTYPE test [ <!ENTITY p '</p>'> ]>
749	// <root>&p;</root>
750
751	if let tokenizer::ElementEnd::Close(..) = end_token {
752	return Err(Error::UnexpectedEntityCloseTag(
753	ctx.err_pos_at(token_range.start),
754	));
755	} else {
756	unreachable!("should be already checked by the tokenizer");
757	}
758	}
759
760	let namespaces = ctx.resolve_namespaces();
761	ctx.namespace_start_idx = ctx.doc.namespaces.tree_order.len();
762
763	let attributes = resolve_attributes(namespaces, ctx)?;
764
765	match end_token {
766	tokenizer::ElementEnd::Empty => {
767	let tag_ns_idx = get_ns_idx_by_prefix(
768	namespaces,
769	ctx.tag_name.prefix_pos,
770	ctx.tag_name.prefix,
771	ctx,
772	)?;
773	let new_element_id = ctx.append_node(
774	NodeKind::Element {
775	tag_name: ExpandedNameIndexed {
776	namespace_idx: tag_ns_idx,
777	local_name: ctx.tag_name.name,
778	},
779	attributes,
780	namespaces,
781	},
782	ctx.tag_name.pos..token_range.end,
783	)?;
784	ctx.awaiting_subtree.push(new_element_id);
785	}
786	tokenizer::ElementEnd::Close(prefix, local) => {
787	let parent_node = &mut ctx.doc.nodes[ctx.parent_id.get_usize()];
788	// should never panic as we start with the single prefix of the
789	// root node and always push another one when changing the parent
790	let parent_prefix = *ctx.parent_prefixes.last().unwrap();
791
792	#[cfg(feature = "positions")]
793	{
794	parent_node.range.end = token_range.end;
795	}
796
797	if let NodeKind::Element { ref tag_name, .. } = parent_node.kind {
798	if prefix != parent_prefix \|\| local != tag_name.local_name {
799	return Err(Error::UnexpectedCloseTag(
800	gen_qname_string(parent_prefix, tag_name.local_name),
801	gen_qname_string(prefix, local),
802	ctx.err_pos_at(token_range.start),
803	));
804	}
805	}
806	ctx.awaiting_subtree.push(ctx.parent_id);
807
808	if let Some(id) = parent_node.parent {
809	ctx.parent_id = id;
810	ctx.parent_prefixes.pop();
811	debug_assert!(!ctx.parent_prefixes.is_empty());
812	} else {
813	unreachable!("should be already checked by the tokenizer");
814	}
815	}
816	tokenizer::ElementEnd::Open => {
817	let tag_ns_idx = get_ns_idx_by_prefix(
818	namespaces,
819	ctx.tag_name.prefix_pos,
820	ctx.tag_name.prefix,
821	ctx,
822	)?;
823	ctx.parent_id = ctx.append_node(
824	NodeKind::Element {
825	tag_name: ExpandedNameIndexed {
826	namespace_idx: tag_ns_idx,
827	local_name: ctx.tag_name.name,
828	},
829	attributes,
830	namespaces,
831	},
832	ctx.tag_name.pos..token_range.end,
833	)?;
834	ctx.parent_prefixes.push(ctx.tag_name.prefix);
835	}
836	}
837
838	Ok(())
839	}
840
841	impl Context<'_> {
842	fn resolve_namespaces(&mut self) -> ShortRange {
843	if let NodeKind::Element { ref namespaces, .. } =
844	self.doc.nodes[self.parent_id.get_usize()].kind
845	{
846	let parent_ns = *namespaces;
847	if self.namespace_start_idx == self.doc.namespaces.tree_order.len() {
848	return parent_ns;
849	}
850
851	for i in parent_ns.to_urange() {
852	if !self.doc.namespaces.exists(
853	self.namespace_start_idx,
854	self.doc
855	.namespaces
856	.get(self.doc.namespaces.tree_order[i])
857	.name,
858	) {
859	self.doc.namespaces.push_ref(i);
860	}
861	}
862	}
863
864	(self.namespace_start_idx..self.doc.namespaces.tree_order.len()).into()
865	}
866	}
867
868	fn resolve_attributes(namespaces: ShortRange, ctx: &mut Context) -> Result<ShortRange> {
869	if ctx.current_attributes.is_empty() {
870	return Ok(ShortRange::new(`0`, `0`));
871	}
872
873	if ctx.doc.attributes.len() + ctx.current_attributes.len() >= core::u32::MAX as usize {
874	return Err(Error::AttributesLimitReached);
875	}
876
877	let start_idx = ctx.doc.attributes.len();
878
879	let current_attributes = core::mem::take(&mut ctx.current_attributes);
880	for attr in current_attributes {
881	let namespace_idx = if attr.prefix == NS_XML_PREFIX {
882	// The prefix 'xml' is by definition bound to the namespace name
883	// http://www.w3.org/XML/1998/namespace. This namespace is added
884	// to the document on creation and is always element 0.
885	Some(NamespaceIdx(`0`))
886	} else if attr.prefix.is_empty() {
887	// 'The namespace name for an unprefixed attribute name
888	// always has no value.'
889	None
890	} else {
891	get_ns_idx_by_prefix(namespaces, attr.pos, attr.prefix, ctx)?
892	};
893
894	let attr_name = ExpandedNameIndexed {
895	namespace_idx,
896	local_name: attr.local,
897	};
898
899	// Check for duplicated attributes.
900	if ctx.doc.attributes[start_idx..].iter().any(\|attr\| {
901	attr.name.as_expanded_name(&ctx.doc) == attr_name.as_expanded_name(&ctx.doc)
902	}) {
903	let pos = ctx.err_pos_at(attr.pos);
904	return Err(Error::DuplicatedAttribute(attr.local.to_string(), pos));
905	}
906
907	ctx.doc.attributes.push(AttributeData {
908	name: attr_name,
909	value: attr.value,
910	#[cfg(feature = "positions")]
911	pos: attr.pos,
912	});
913	}
914
915	Ok((start_idx..ctx.doc.attributes.len()).into())
916	}
917
918	fn process_text<'input>(
919	text: &'input str,
920	range: Range<usize>,
921	ctx: &mut Context<'input>,
922	) -> Result<()> {
923	// Add text as is if it has only valid characters.
924	if !text.bytes().any(\|b\| b == b'&' \|\| b == b'`\r`') {
925	append_text(StringStorage::Borrowed(text), range, ctx)?;
926	ctx.after_text = `true`;
927	return Ok(());
928	}
929
930	let mut text_buffer = TextBuffer::new();
931	let mut is_as_is = `false`; // TODO: explain
932	let mut stream = Stream::from_substr(ctx.doc.text, range.clone());
933	while !stream.at_end() {
934	match parse_next_chunk(&mut stream, &ctx.entities)? {
935	NextChunk::Byte(c) => {
936	if is_as_is {
937	text_buffer.push_raw(c);
938	is_as_is = `false`;
939	} else {
940	text_buffer.push_from_text(c, stream.at_end());
941	}
942	}
943	NextChunk::Char(c) => {
944	for b in CharToBytes::new(c) {
945	if ctx.loop_detector.depth > `0` {
946	text_buffer.push_from_text(b, stream.at_end());
947	} else {
948	// Characters not from entity should be added as is.
949	// Not sure why... At least `lxml` produces the same result.
950	text_buffer.push_raw(b);
951	is_as_is = `true`;
952	}
953	}
954	}
955	NextChunk::Text(fragment) => {
956	is_as_is = `false`;
957
958	if !text_buffer.is_empty() {
959	let storage = StringStorage::new_owned(text_buffer.to_str());
960	append_text(storage, range.clone(), ctx)?;
961	text_buffer.clear();
962	ctx.after_text = `true`;
963	}
964
965	ctx.loop_detector.inc_references(&stream)?;
966	ctx.loop_detector.inc_depth(&stream)?;
967
968	let mut stream = Stream::from_substr(ctx.doc.text, fragment.range());
969	let prev_tag_name = ctx.tag_name;
970	ctx.tag_name = TagNameSpan::new_null();
971	tokenizer::parse_content(&mut stream, ctx)?;
972	ctx.tag_name = prev_tag_name;
973	text_buffer.clear();
974
975	ctx.loop_detector.dec_depth();
976	}
977	}
978	}
979
980	if !text_buffer.is_empty() {
981	append_text(StringStorage::new_owned(text_buffer.finish()), range, ctx)?;
982	ctx.after_text = `true`;
983	}
984
985	Ok(())
986	}
987
988	// While the whole purpose of CDATA is to indicate to an XML library that this text
989	// has to be stored as is, carriage return (`\r`) is still has to be replaced with `\n`.
990	fn process_cdata<'input>(
991	text: &'input str,
992	range: Range<usize>,
993	ctx: &mut Context<'input>,
994	) -> Result<()> {
995	// Add text as is if it has only valid characters.
996	if !text.as_bytes().contains(&b'`\r`') {
997	append_text(text:StringStorage::Borrowed(text), range, ctx)?;
998	ctx.after_text = `true`;
999	return Ok(());
1000	}
1001
1002	let mut text_buffer: TextBuffer = TextBuffer::new();
1003	let count: usize = text.chars().count();
1004	for (i: usize, c: char) in text.chars().enumerate() {
1005	for b: u8 in CharToBytes::new(c) {
1006	text_buffer.push_from_text(c:b, at_end:i + `1` == count);
1007	}
1008	}
1009
1010	if !text_buffer.is_empty() {
1011	append_text(text:StringStorage::new_owned(text_buffer.finish()), range, ctx)?;
1012	ctx.after_text = `true`;
1013	}
1014
1015	Ok(())
1016	}
1017
1018	fn append_text<'input>(
1019	text: StringStorage<'input>,
1020	range: Range<usize>,
1021	ctx: &mut Context<'input>,
1022	) -> Result<()> {
1023	if ctx.after_text {
1024	// Prepend to a previous text node.
1025	if let Some(node: &mut NodeData<'_>) = ctx.doc.nodes.last_mut() {
1026	if let NodeKind::Text(ref mut prev_text: &mut StringStorage<'_>) = node.kind {
1027	let text_str: &str = text.as_str();
1028	let prev_text_str: &str = prev_text.as_str();
1029
1030	let mut concat_text: String = String::with_capacity(text_str.len() + prev_text_str.len());
1031	concat_text.push_str(string:prev_text_str);
1032	concat_text.push_str(string:text_str);
1033	*prev_text = StringStorage::new_owned(concat_text);
1034	}
1035	}
1036	} else {
1037	ctx.append_node(kind:NodeKind::Text(text), range)?;
1038	}
1039
1040	Ok(())
1041	}
1042
1043	enum NextChunk<'a> {
1044	Byte(u8),
1045	Char(char),
1046	Text(StrSpan<'a>),
1047	}
1048
1049	fn parse_next_chunk<'a>(stream: &mut Stream<'a>, entities: &[Entity<'a>]) -> Result<NextChunk<'a>> {
1050	debug_assert!(!stream.at_end());
1051
1052	// Safe, because we already checked that stream is not at the end.
1053	// But we have an additional `debug_assert` above just in case.
1054	let c = stream.curr_byte_unchecked();
1055
1056	// Check for character/entity references.
1057	if c == b'&' {
1058	let start = stream.pos();
1059	match stream.try_consume_reference() {
1060	Some(Reference::Char(ch)) => Ok(NextChunk::Char(ch)),
1061	Some(Reference::Entity(name)) => entities
1062	.iter()
1063	.find(\|e\| e.name == name)
1064	.map(\|e\| NextChunk::Text(e.value))
1065	.ok_or_else(\|\| {
1066	let pos = stream.gen_text_pos_from(start);
1067	Error::UnknownEntityReference(name.into(), pos)
1068	}),
1069	None => {
1070	let pos = stream.gen_text_pos_from(start);
1071	Err(Error::MalformedEntityReference(pos))
1072	}
1073	}
1074	} else {
1075	stream.advance(`1`);
1076	Ok(NextChunk::Byte(c))
1077	}
1078	}
1079
1080	// https://www.w3.org/TR/REC-xml/#AVNormalize
1081	fn normalize_attribute<'input>(
1082	text: StrSpan<'input>,
1083	ctx: &mut Context<'input>,
1084	) -> Result<StringStorage<'input>> {
1085	if is_normalization_required(&text) {
1086	let mut text_buffer: TextBuffer = TextBuffer::new();
1087	_normalize_attribute(text, &mut text_buffer, ctx)?;
1088	Ok(StringStorage::new_owned(text_buffer.finish()))
1089	} else {
1090	Ok(StringStorage::Borrowed(text.as_str()))
1091	}
1092	}
1093
1094	#[inline]
1095	fn is_normalization_required(text: &StrSpan) -> bool {
1096	// We assume that `&` indicates an entity or a character reference.
1097	// But in rare cases it can be just an another character.
1098
1099	fn check(c: u8) -> bool {
1100	matches!(c, b'&' \| b'`\t`' \| b'`\n`' \| b'`\r`')
1101	}
1102
1103	text.as_str().bytes().any(check)
1104	}
1105
1106	fn _normalize_attribute(text: StrSpan, buffer: &mut TextBuffer, ctx: &mut Context) -> Result<()> {
1107	let mut stream = Stream::from_substr(ctx.doc.text, text.range());
1108	while !stream.at_end() {
1109	// Safe, because we already checked that the stream is not at the end.
1110	let c = stream.curr_byte_unchecked();
1111
1112	if c != b'&' {
1113	stream.advance(`1`);
1114	buffer.push_from_attr(c, stream.curr_byte().ok());
1115	continue;
1116	}
1117
1118	// Check for character/entity references.
1119	let start = stream.pos();
1120	match stream.try_consume_reference() {
1121	Some(Reference::Char(ch)) => {
1122	for b in CharToBytes::new(ch) {
1123	if ctx.loop_detector.depth > `0` {
1124	// Escaped `<` inside an ENTITY is an error.
1125	// Escaped `<` outside an ENTITY is ok.
1126	if b == b'<' {
1127	return Err(Error::InvalidAttributeValue(
1128	stream.gen_text_pos_from(start),
1129	));
1130	}
1131
1132	buffer.push_from_attr(b, None);
1133	} else {
1134	// Characters not from entity should be added as is.
1135	// Not sure why... At least `lxml` produces the same results.
1136	buffer.push_raw(b);
1137	}
1138	}
1139	}
1140	Some(Reference::Entity(name)) => match ctx.entities.iter().find(\|e\| e.name == name) {
1141	Some(entity) => {
1142	ctx.loop_detector.inc_references(&stream)?;
1143	ctx.loop_detector.inc_depth(&stream)?;
1144	_normalize_attribute(entity.value, buffer, ctx)?;
1145	ctx.loop_detector.dec_depth();
1146	}
1147	None => {
1148	let pos = stream.gen_text_pos_from(start);
1149	return Err(Error::UnknownEntityReference(name.into(), pos));
1150	}
1151	},
1152	None => {
1153	let pos = stream.gen_text_pos_from(start);
1154	return Err(Error::MalformedEntityReference(pos));
1155	}
1156	}
1157	}
1158
1159	Ok(())
1160	}
1161
1162	fn get_ns_idx_by_prefix(
1163	namespaces: ShortRange,
1164	prefix_pos: usize,
1165	prefix: &str,
1166	ctx: &Context,
1167	) -> Result<Option<NamespaceIdx>> {
1168	// Prefix CAN be empty when the default namespace was defined.
1169	//
1170	// Example:
1171	// <e xmlns='http://www.w3.org'/>
1172	let prefix_opt = if prefix.is_empty() {
1173	None
1174	} else {
1175	Some(prefix)
1176	};
1177
1178	let idx = ctx.doc.namespaces.tree_order[namespaces.to_urange()]
1179	.iter()
1180	.find(\|idx\| ctx.doc.namespaces.get(**idx).name == prefix_opt);
1181
1182	match idx {
1183	Some(idx) => Ok(Some(*idx)),
1184	None => {
1185	if !prefix.is_empty() {
1186	// If an URI was not found and prefix IS NOT empty than
1187	// we have an unknown namespace.
1188	//
1189	// Example:
1190	// <e random:a='b'/>
1191	let pos = ctx.err_pos_at(prefix_pos);
1192	Err(Error::UnknownNamespace(prefix.to_string(), pos))
1193	} else {
1194	// If an URI was not found and prefix IS empty than
1195	// an element or an attribute doesn't have a namespace.
1196	//
1197	// Example:
1198	// <e a='b'/>
1199	Ok(None)
1200	}
1201	}
1202	}
1203	}
1204
/// Builds a qualified name: `local` when `prefix` is empty,
/// `prefix:local` otherwise.
fn gen_qname_string(prefix: &str, local: &str) -> String {
    if prefix.is_empty() {
        return local.to_string();
    }

    let mut qname = String::with_capacity(prefix.len() + 1 + local.len());
    qname.push_str(prefix);
    qname.push(':');
    qname.push_str(local);
    qname
}
1212
/// Iterates over the UTF-8 bytes of a single `char`.
struct CharToBytes {
    /// UTF-8 encoding of the character; unused trailing slots hold `0xFF`.
    buf: [u8; 4],
    /// Index of the next byte to yield.
    idx: u8,
}

impl CharToBytes {
    /// Encodes `c` as UTF-8 and prepares to yield its bytes in order.
    #[inline]
    fn new(c: char) -> Self {
        // `0xFF` never occurs in UTF-8 encoded text,
        // so it can mark the slots that the encoding did not fill.
        let mut encoded = [0xFF_u8; 4];
        c.encode_utf8(&mut encoded);

        Self {
            buf: encoded,
            idx: 0,
        }
    }
}

impl Iterator for CharToBytes {
    type Item = u8;

    #[inline]
    fn next(&mut self) -> Option<Self::Item> {
        match self.buf.get(usize::from(self.idx)).copied() {
            // A real byte of the encoding.
            Some(byte) if byte != 0xFF => {
                self.idx += 1;
                Some(byte)
            }
            // Padding reached: mark the iterator as exhausted.
            Some(_) => {
                self.idx = 4;
                None
            }
            // All four slots were already yielded.
            None => None,
        }
    }
}
1248
/// A byte buffer used to assemble normalized text and attribute values.
struct TextBuffer {
    buffer: Vec<u8>,
}

impl TextBuffer {
    /// Creates an empty buffer with a small preallocated capacity.
    #[inline]
    fn new() -> Self {
        Self {
            buffer: Vec::with_capacity(32),
        }
    }

    /// Appends `c` without any processing.
    #[inline]
    fn push_raw(&mut self, c: u8) {
        self.buffer.push(c);
    }

    /// Appends a byte of an attribute value.
    ///
    /// `next` is the byte that follows `current`, if any.
    fn push_from_attr(&mut self, current: u8, next: Option<u8>) {
        let normalized = match (current, next) {
            // \r in \r\n should be ignored.
            (b'\r', Some(b'\n')) => return,
            // \n, \r and \t should be converted into spaces.
            (b'\n', _) | (b'\r', _) | (b'\t', _) => b' ',
            (other, _) => other,
        };

        self.buffer.push(normalized);
    }

    /// Appends a byte of a text node.
    ///
    /// Translates \r\n and any \r that is not followed by \n
    /// into a single \n character. `at_end` tells that `c` is the last byte.
    ///
    /// https://www.w3.org/TR/xml/#sec-line-ends
    fn push_from_text(&mut self, c: u8, at_end: bool) {
        // A \r stored by the previous call is still waiting for its successor.
        let pending_cr = self.buffer.last() == Some(&b'\r');
        if pending_cr {
            if let Some(last) = self.buffer.last_mut() {
                *last = b'\n';
            }
        }

        // A \r at the very end has no successor, so it is translated right away.
        let final_cr = at_end && c == b'\r';

        if final_cr {
            self.buffer.push(b'\n');
        } else if !(pending_cr && c == b'\n') {
            // The \n of a \r\n pair is already covered by the translated \r.
            self.buffer.push(c);
        }
    }

    /// Removes all bytes from the buffer.
    #[inline]
    fn clear(&mut self) {
        self.buffer.clear();
    }

    /// Returns `true` when the buffer holds no bytes.
    #[inline]
    fn is_empty(&self) -> bool {
        self.buffer.is_empty()
    }

    /// Returns the buffer content as a string slice.
    ///
    /// Panics when the content is not valid UTF-8.
    #[inline]
    fn to_str(&self) -> &str {
        // `unwrap` is safe, because buffer must contain a valid UTF-8 string.
        core::str::from_utf8(&self.buffer).unwrap()
    }

    /// Consumes the buffer and returns its content as a `String`.
    ///
    /// Panics when the content is not valid UTF-8.
    #[inline]
    fn finish(self) -> String {
        // `unwrap` is safe, because buffer must contain a valid UTF-8 string.
        String::from_utf8(self.buffer).unwrap()
    }
}
1323