parse.rs source code [crates/roxmltree/src/parse.rs]

1	use alloc::string::{String, ToString};
2	use alloc::vec::Vec;
3	use core::ops::Range;
4
5	use crate::{
6	AttributeData, Document, ExpandedNameIndexed, NamespaceIdx, Namespaces, NodeData, NodeId,
7	NodeKind, ShortRange, StringStorage, TextPos, NS_XMLNS_URI, NS_XML_PREFIX, NS_XML_URI, PI,
8	XMLNS,
9	};
10
11	use crate::tokenizer::{self, Reference, StrSpan, Stream};
12
/// Module-local `Result` alias whose error type is always this module's [`Error`].
type Result<T> = core::result::Result<T, Error>;
14
/// A list of all possible errors.
#[derive(Clone, PartialEq, Eq, Hash, Debug)]
pub enum Error {
    /// The `xmlns:xml` attribute must have an <http://www.w3.org/XML/1998/namespace> URI.
    InvalidXmlPrefixUri(TextPos),

    /// Only the `xmlns:xml` attribute can have the <http://www.w3.org/XML/1998/namespace> URI.
    UnexpectedXmlUri(TextPos),

    /// The <http://www.w3.org/2000/xmlns/> URI must not be declared.
    UnexpectedXmlnsUri(TextPos),

    /// `xmlns` can't be used as an element prefix.
    InvalidElementNamePrefix(TextPos),

    /// A namespace was already defined on this element.
    ///
    /// The first value is the namespace prefix.
    DuplicatedNamespace(String, TextPos),

    /// An unknown namespace.
    ///
    /// Indicates that an element or an attribute has an unknown qualified name prefix.
    ///
    /// The first value is a prefix.
    UnknownNamespace(String, TextPos),

    /// Incorrect tree structure.
    ///
    /// expected, actual, position
    #[allow(missing_docs)]
    UnexpectedCloseTag(String, String, TextPos),

    /// Entity value starts with a close tag.
    ///
    /// Example:
    /// ```xml
    /// <!DOCTYPE test [ <!ENTITY p '</p>'> ]>
    /// <root>&p;</root>
    /// ```
    UnexpectedEntityCloseTag(TextPos),

    /// A reference to an entity that was not defined in the DTD.
    ///
    /// The first value is the entity name.
    UnknownEntityReference(String, TextPos),

    /// A malformed entity reference.
    ///
    /// A `&` character inside an attribute value or text indicates an entity reference.
    /// Otherwise, the document is not well-formed.
    MalformedEntityReference(TextPos),

    /// A possible entity reference loop.
    ///
    /// The current depth limit is 10. The max number of references per reference is 255.
    EntityReferenceLoop(TextPos),

    /// Attribute value cannot have a `<` character.
    InvalidAttributeValue(TextPos),

    /// An element has duplicated attributes.
    ///
    /// This also includes namespaces resolving.
    /// So an element like this will lead to an error.
    /// ```xml
    /// <e xmlns:n1='http://www.w3.org' xmlns:n2='http://www.w3.org' n1:a='b1' n2:a='b2'/>
    /// ```
    DuplicatedAttribute(String, TextPos),

    /// The XML document must have at least one element.
    NoRootNode,

    /// The root node was opened but never closed.
    UnclosedRootNode,

    /// An XML document can have only one XML declaration
    /// and it must be at the start of the document.
    UnexpectedDeclaration(TextPos),

    /// An XML with DTD detected.
    ///
    /// This error will be emitted only when `ParsingOptions::allow_dtd` is set to `false`.
    DtdDetected,

    /// Indicates that the [`ParsingOptions::nodes_limit`] was reached.
    NodesLimitReached,

    /// Indicates that too many attributes were parsed.
    AttributesLimitReached,

    /// Indicates that too many namespaces were parsed.
    NamespacesLimitReached,

    /// An invalid name.
    InvalidName(TextPos),

    /// A non-XML character has occurred.
    ///
    /// Valid characters are: <https://www.w3.org/TR/xml/#char32>
    NonXmlChar(char, TextPos),

    /// An invalid/unexpected character.
    ///
    /// expected, actual, position
    InvalidChar(u8, u8, TextPos),

    /// An invalid/unexpected character.
    ///
    /// expected, actual, position
    InvalidChar2(&'static str, u8, TextPos),

    /// An unexpected string.
    ///
    /// Contains what string was expected.
    InvalidString(&'static str, TextPos),

    /// An invalid ExternalID in the DTD.
    InvalidExternalID(TextPos),

    /// A comment cannot contain `--` or end with `-`.
    InvalidComment(TextPos),

    /// A Character Data node contains invalid data.
    ///
    /// Currently, only `]]>` is not allowed.
    InvalidCharacterData(TextPos),

    /// An unknown token.
    UnknownToken(TextPos),

    /// The stream ended earlier than we expected.
    ///
    /// Should only appear on invalid input data.
    UnexpectedEndOfStream,
}
147
148	impl Error {
149	/// Returns the error position.
150	pub fn pos(&self) -> TextPos {
151	match *self {
152	Error::InvalidXmlPrefixUri(pos) => pos,
153	Error::UnexpectedXmlUri(pos) => pos,
154	Error::UnexpectedXmlnsUri(pos) => pos,
155	Error::InvalidElementNamePrefix(pos) => pos,
156	Error::DuplicatedNamespace(_, pos) => pos,
157	Error::UnknownNamespace(_, pos) => pos,
158	Error::UnexpectedCloseTag(_, _, pos) => pos,
159	Error::UnexpectedEntityCloseTag(pos) => pos,
160	Error::UnknownEntityReference(_, pos) => pos,
161	Error::MalformedEntityReference(pos) => pos,
162	Error::EntityReferenceLoop(pos) => pos,
163	Error::InvalidAttributeValue(pos) => pos,
164	Error::DuplicatedAttribute(_, pos) => pos,
165	Error::NoRootNode => TextPos::new(`1`, `1`),
166	Error::UnclosedRootNode => TextPos::new(`1`, `1`),
167	Error::UnexpectedDeclaration(pos) => pos,
168	Error::DtdDetected => TextPos::new(`1`, `1`),
169	Error::NodesLimitReached => TextPos::new(`1`, `1`),
170	Error::AttributesLimitReached => TextPos::new(`1`, `1`),
171	Error::NamespacesLimitReached => TextPos::new(`1`, `1`),
172	Error::InvalidName(pos) => pos,
173	Error::NonXmlChar(_, pos) => pos,
174	Error::InvalidChar(_, _, pos) => pos,
175	Error::InvalidChar2(_, _, pos) => pos,
176	Error::InvalidString(_, pos) => pos,
177	Error::InvalidExternalID(pos) => pos,
178	Error::InvalidComment(pos) => pos,
179	Error::InvalidCharacterData(pos) => pos,
180	Error::UnknownToken(pos) => pos,
181	Error::UnexpectedEndOfStream => TextPos::new(`1`, `1`),
182	}
183	}
184	}
185
186	impl core::fmt::Display for Error {
187	fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
188	match *self {
189	Error::InvalidXmlPrefixUri(pos) => {
190	write!(f, "'xml' namespace prefix mapped to wrong URI at {}", pos)
191	}
192	Error::UnexpectedXmlUri(pos) => {
193	write!(
194	f,
195	"the 'xml' namespace URI is used for not 'xml' prefix at {}",
196	pos
197	)
198	}
199	Error::UnexpectedXmlnsUri(pos) => {
200	write!(
201	f,
202	"the 'xmlns' URI is used at {}, but it must not be declared",
203	pos
204	)
205	}
206	Error::InvalidElementNamePrefix(pos) => {
207	write!(
208	f,
209	"the 'xmlns' prefix is used at {}, but it must not be",
210	pos
211	)
212	}
213	Error::DuplicatedNamespace(ref name, pos) => {
214	write!(f, "namespace '{}' at {} is already defined", name, pos)
215	}
216	Error::UnknownNamespace(ref name, pos) => {
217	write!(f, "an unknown namespace prefix '{}' at {}", name, pos)
218	}
219	Error::UnexpectedCloseTag(ref expected, ref actual, pos) => {
220	write!(
221	f,
222	"expected '{}' tag, not '{}' at {}",
223	expected, actual, pos
224	)
225	}
226	Error::UnexpectedEntityCloseTag(pos) => {
227	write!(f, "unexpected close tag at {}", pos)
228	}
229	Error::MalformedEntityReference(pos) => {
230	write!(f, "malformed entity reference at {}", pos)
231	}
232	Error::UnknownEntityReference(ref name, pos) => {
233	write!(f, "unknown entity reference '{}' at {}", name, pos)
234	}
235	Error::EntityReferenceLoop(pos) => {
236	write!(f, "a possible entity reference loop is detected at {}", pos)
237	}
238	Error::InvalidAttributeValue(pos) => {
239	write!(f, "unescaped '<' found at {}", pos)
240	}
241	Error::DuplicatedAttribute(ref name, pos) => {
242	write!(f, "attribute '{}' at {} is already defined", name, pos)
243	}
244	Error::NoRootNode => {
245	write!(f, "the document does not have a root node")
246	}
247	Error::UnclosedRootNode => {
248	write!(f, "the root node was opened but never closed")
249	}
250	Error::UnexpectedDeclaration(pos) => {
251	write!(f, "unexpected XML declaration at {}", pos)
252	}
253	Error::DtdDetected => {
254	write!(f, "XML with DTD detected")
255	}
256	Error::NodesLimitReached => {
257	write!(f, "nodes limit reached")
258	}
259	Error::AttributesLimitReached => {
260	write!(f, "more than 2^32 attributes were parsed")
261	}
262	Error::NamespacesLimitReached => {
263	write!(f, "more than 2^16 unique namespaces were parsed")
264	}
265	Error::InvalidName(pos) => {
266	write!(f, "invalid name token at {}", pos)
267	}
268	Error::NonXmlChar(c, pos) => {
269	write!(f, "a non-XML character {:?} found at {}", c, pos)
270	}
271	Error::InvalidChar(expected, actual, pos) => {
272	write!(
273	f,
274	"expected '{}' not '{}' at {}",
275	expected as char, actual as char, pos
276	)
277	}
278	Error::InvalidChar2(expected, actual, pos) => {
279	write!(
280	f,
281	"expected {} not '{}' at {}",
282	expected, actual as char, pos
283	)
284	}
285	Error::InvalidString(expected, pos) => {
286	write!(f, "expected '{}' at {}", expected, pos)
287	}
288	Error::InvalidExternalID(pos) => {
289	write!(f, "invalid ExternalID at {}", pos)
290	}
291	Error::InvalidComment(pos) => {
292	write!(f, "comment at {} contains '--'", pos)
293	}
294	Error::InvalidCharacterData(pos) => {
295	write!(f, "']]>' at {} is not allowed inside a character data", pos)
296	}
297	Error::UnknownToken(pos) => {
298	write!(f, "unknown token at {}", pos)
299	}
300	Error::UnexpectedEndOfStream => {
301	write!(f, "unexpected end of stream")
302	}
303	}
304	}
305	}
306
// Only available with the `std` feature, because `std::error::Error` lives in `std`.
#[cfg(feature = "std")]
impl std::error::Error for Error {
    // Fixed, variant-independent description; the detailed per-variant message
    // comes from the `Display` implementation.
    fn description(&self) -> &str {
        "an XML parsing error"
    }
}
313
/// Parsing options.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
pub struct ParsingOptions {
    /// Allow DTD parsing.
    ///
    /// When set to `false`, XML with DTD will cause an error.
    /// Empty DTD block is not an error.
    ///
    /// Currently, there is no option to simply skip DTD.
    /// Mainly because you will get `UnknownEntityReference` error later anyway.
    ///
    /// This flag is set to `false` by default for security reasons,
    /// but `roxmltree` still has checks for billion laughs attack,
    /// so this is just an extra security measure.
    ///
    /// Default: false
    pub allow_dtd: bool,

    /// Sets the maximum number of nodes to parse.
    ///
    /// Useful when dealing with random input to limit memory usage.
    ///
    /// Default: u32::MAX (no limit)
    pub nodes_limit: u32,
}

// Explicit for readability.
#[allow(clippy::derivable_impls)]
impl Default for ParsingOptions {
    /// Returns the default options: DTD disabled and no node limit.
    fn default() -> Self {
        ParsingOptions {
            allow_dtd: false,
            nodes_limit: u32::MAX,
        }
    }
}
350
/// An attribute collected while its element's start tag is still being parsed.
///
/// Instances are buffered in `Context::current_attributes` and turned into
/// `AttributeData` once the element is finished.
struct TempAttributeData<'input> {
    /// Qualified-name prefix (may be empty).
    prefix: &'input str,
    /// Local part of the attribute name.
    local: &'input str,
    /// Normalized attribute value.
    value: StringStorage<'input>,
    /// Byte range reported by the tokenizer for this attribute.
    range: Range<usize>,
    #[allow(unused)] // only used for feature "positions"
    qname_len: u16,
    #[allow(unused)] // only used for feature "positions"
    eq_len: u8,
}
361
impl<'input> Document<'input> {
    /// Parses the input XML string.
    ///
    /// We do not support `&[u8]` or `Reader` because the input must be an already allocated
    /// UTF-8 string.
    ///
    /// This is a shorthand for `Document::parse_with_options(data, ParsingOptions::default())`.
    ///
    /// # Examples
    ///
    /// ```
    /// let doc = roxmltree::Document::parse("<e/>").unwrap();
    /// assert_eq!(doc.descendants().count(), 2); // root node + `e` element node
    /// ```
    #[inline]
    pub fn parse(text: &str) -> Result<Document> {
        Self::parse_with_options(text, ParsingOptions::default())
    }

    /// Parses the input XML string using the selected options.
    ///
    /// We do not support `&[u8]` or `Reader` because the input must be an already allocated
    /// UTF-8 string.
    ///
    /// # Examples
    ///
    /// ```
    /// let opt = roxmltree::ParsingOptions::default();
    /// let doc = roxmltree::Document::parse_with_options("<e/>", opt).unwrap();
    /// assert_eq!(doc.descendants().count(), 2); // root node + `e` element node
    /// ```
    #[inline]
    pub fn parse_with_options(text: &str, opt: ParsingOptions) -> Result<Document> {
        parse(text, opt)
    }
}
398
/// An entity declared in the DTD, recorded from an `EntityDeclaration` token.
struct Entity<'input> {
    /// Entity name, as referenced by `&name;`.
    name: &'input str,
    /// Span of the entity's replacement text inside the input.
    value: StrSpan<'input>,
}
403
/// Name and source offsets of the element start tag currently being parsed.
#[derive(Clone, Copy)]
struct TagNameSpan<'input> {
    /// Qualified-name prefix (empty when the tag has none).
    prefix: &'input str,
    /// Local tag name. An empty name marks the "null" span (no open start tag).
    name: &'input str,
    /// Offset reported with the element-start token.
    pos: usize,
    /// Offset used when reporting prefix-related errors.
    prefix_pos: usize,
}

impl<'input> TagNameSpan<'input> {
    /// Creates the "null" span: empty prefix and name, both offsets zero.
    #[inline]
    fn new_null() -> Self {
        Self {
            prefix: "",
            name: "",
            pos: 0,
            prefix_pos: 0,
        }
    }
}
423
/// An entity loop detector.
///
/// Limits:
/// - Entities depth is 10.
/// - Maximum number of entity references per entity reference is 255.
///
/// Basically, if a text or an attribute has an entity reference and this reference
/// has more than 10 nested references - this is an error.
///
/// This is useful for simple loops like:
///
/// ```text
/// <!ENTITY a '&b;'>
/// <!ENTITY b '&a;'>
/// ```
///
/// And, if a text or an attribute has an entity reference and it references more
/// than 255 references - this is an error.
///
/// This is useful for cases like billion laughs attack, where depth can be pretty small,
/// but the number of references is exponentially increasing:
///
/// ```text
/// <!ENTITY lol "lol">
/// <!ENTITY lol1 "&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;">
/// <!ENTITY lol2 "&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;">
/// <!ENTITY lol3 "&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;">
/// <!ENTITY lol4 "&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;">
/// ```
#[derive(Default)]
struct LoopDetector {
    /// References depth.
    depth: u8,
    /// Number of references resolved by the root reference.
    references: u8,
}
460
461	impl LoopDetector {
462	#[inline]
463	fn inc_depth(&mut self, stream: &Stream) -> Result<()> {
464	if self.depth < `10` {
465	self.depth += `1`;
466	Ok(())
467	} else {
468	Err(Error::EntityReferenceLoop(stream.gen_text_pos()))
469	}
470	}
471
472	#[inline]
473	fn dec_depth(&mut self) {
474	if self.depth > `0` {
475	self.depth -= `1`;
476	}
477
478	// Reset references count after reaching zero depth.
479	if self.depth == `0` {
480	self.references = `0`;
481	}
482	}
483
484	#[inline]
485	fn inc_references(&mut self, stream: &Stream) -> Result<()> {
486	if self.depth == `0` {
487	// Allow infinite amount of references at zero depth.
488	Ok(())
489	} else {
490	if self.references == core::u8::MAX {
491	return Err(Error::EntityReferenceLoop(stream.gen_text_pos()));
492	}
493
494	self.references += `1`;
495	Ok(())
496	}
497	}
498	}
499
/// Mutable state shared by all token handlers while a document is being built.
struct Context<'input> {
    /// Options the parser was started with.
    opt: ParsingOptions,
    /// Index into `doc.namespaces.tree_order` where the namespaces declared on
    /// the element currently being parsed start.
    namespace_start_idx: usize,
    /// Attributes collected for the element currently being parsed.
    current_attributes: Vec<TempAttributeData<'input>>,
    /// Nodes whose `next_subtree` link is filled in when the next node is appended.
    awaiting_subtree: Vec<NodeId>,
    /// Tag prefixes of the currently open elements; seeded with `""` before parsing starts.
    parent_prefixes: Vec<&'input str>,
    /// Entities declared so far.
    entities: Vec<Entity<'input>>,
    /// `true` when the previously handled token produced text, so that
    /// following text is merged into the last text node instead of creating a new one.
    after_text: bool,
    /// Node that newly appended nodes become children of.
    parent_id: NodeId,
    /// Name of the start tag currently being parsed.
    tag_name: TagNameSpan<'input>,
    /// Guards entity expansion against loops and exponential growth.
    loop_detector: LoopDetector,
    /// The document under construction.
    doc: Document<'input>,
}
513
514	impl<'input> Context<'input> {
515	fn append_node(&mut self, kind: NodeKind<'input>, range: Range<usize>) -> Result<NodeId> {
516	if self.doc.nodes.len() >= self.opt.nodes_limit as usize {
517	return Err(Error::NodesLimitReached);
518	}
519
520	#[cfg(not(feature = "positions"))]
521	let _ = range;
522
523	let new_child_id = NodeId::from(self.doc.nodes.len());
524
525	let appending_element = matches!(kind, NodeKind::Element { .. });
526	self.doc.nodes.push(NodeData {
527	parent: Some(self.parent_id),
528	prev_sibling: None,
529	next_subtree: None,
530	last_child: None,
531	kind,
532	#[cfg(feature = "positions")]
533	range,
534	});
535
536	let last_child_id = self.doc.nodes[self.parent_id.get_usize()].last_child;
537	self.doc.nodes[new_child_id.get_usize()].prev_sibling = last_child_id;
538	self.doc.nodes[self.parent_id.get_usize()].last_child = Some(new_child_id);
539
540	for id in &self.awaiting_subtree {
541	self.doc.nodes[id.get_usize()].next_subtree = Some(new_child_id);
542	}
543	self.awaiting_subtree.clear();
544
545	if !appending_element {
546	self.awaiting_subtree
547	.push(NodeId::from(self.doc.nodes.len() - `1`));
548	}
549
550	Ok(new_child_id)
551	}
552
553	fn err_pos_at(&self, pos: usize) -> TextPos {
554	self.doc.text_pos_at(pos)
555	}
556	}
557
558	fn parse(text: &str, opt: ParsingOptions) -> Result<Document> {
559	// Trying to guess rough nodes and attributes amount.
560	let nodes_capacity = text.bytes().filter(\|c\| *c == b'<').count();
561	let attributes_capacity = text.bytes().filter(\|c\| *c == b'=').count();
562
563	// Init document.
564	let mut doc = Document {
565	text,
566	nodes: Vec::with_capacity(nodes_capacity),
567	attributes: Vec::with_capacity(attributes_capacity),
568	namespaces: Namespaces::default(),
569	};
570
571	// Add a root node.
572	doc.nodes.push(NodeData {
573	parent: None,
574	prev_sibling: None,
575	next_subtree: None,
576	last_child: None,
577	kind: NodeKind::Root,
578	#[cfg(feature = "positions")]
579	range: `0`..text.len(),
580	});
581
582	doc.namespaces
583	.push_ns(Some(NS_XML_PREFIX), StringStorage::Borrowed(NS_XML_URI))?;
584
585	let mut ctx = Context {
586	opt,
587	namespace_start_idx: `1`,
588	current_attributes: Vec::with_capacity(`16`),
589	entities: Vec::new(),
590	awaiting_subtree: Vec::new(),
591	parent_prefixes: Vec::new(),
592	after_text: `false`,
593	parent_id: NodeId::new(`0`),
594	tag_name: TagNameSpan::new_null(),
595	loop_detector: LoopDetector::default(),
596	doc,
597	};
598	ctx.parent_prefixes.push("");
599
600	tokenizer::parse(text, opt.allow_dtd, &mut ctx)?;
601
602	let mut doc = ctx.doc;
603	if !doc.root().children().any(\|n\| n.is_element()) {
604	return Err(Error::NoRootNode);
605	}
606
607	if ctx.parent_prefixes.len() > `1` {
608	return Err(Error::UnclosedRootNode);
609	}
610
611	doc.nodes.shrink_to_fit();
612	doc.attributes.shrink_to_fit();
613	doc.namespaces.shrink_to_fit();
614
615	Ok(doc)
616	}
617
618	impl<'input> tokenizer::XmlEvents<'input> for Context<'input> {
619	fn token(&mut self, token: tokenizer::Token<'input>) -> Result<()> {
620	match token {
621	tokenizer::Token::ProcessingInstruction(target, value, range) => {
622	let pi = NodeKind::PI(PI { target, value });
623	self.append_node(pi, range)?;
624	self.after_text = `false`;
625	}
626	tokenizer::Token::Comment(text, range) => {
627	self.append_node(NodeKind::Comment(StringStorage::Borrowed(text)), range)?;
628	self.after_text = `false`;
629	}
630	tokenizer::Token::EntityDeclaration(name, definition) => {
631	self.entities.push(Entity {
632	name,
633	value: definition,
634	});
635	}
636	tokenizer::Token::ElementStart(prefix, local, start) => {
637	if prefix == XMLNS {
638	let pos = self.err_pos_at(start + `1`);
639	return Err(Error::InvalidElementNamePrefix(pos));
640	}
641
642	self.tag_name = TagNameSpan {
643	prefix,
644	name: local,
645	pos: start,
646	prefix_pos: start + `1`,
647	};
648
649	self.after_text = `false`;
650	}
651	tokenizer::Token::Attribute(range, qname_len, eq_len, prefix, local, value) => {
652	process_attribute(range, qname_len, eq_len, prefix, local, value, self)?;
653	}
654	tokenizer::Token::ElementEnd(end, range) => {
655	process_element(end, range, self)?;
656	self.after_text = `false`;
657	}
658	tokenizer::Token::Text(text, range) => {
659	process_text(text, range, self)?;
660	}
661	tokenizer::Token::Cdata(text, range) => {
662	process_cdata(text, range, self)?;
663	}
664	}
665
666	Ok(())
667	}
668	}
669
670	#[allow(clippy::too_many_arguments)]
671	fn process_attribute<'input>(
672	range: Range<usize>,
673	qname_len: u16,
674	eq_len: u8,
675	prefix: &'input str,
676	local: &'input str,
677	value: StrSpan<'input>,
678	ctx: &mut Context<'input>,
679	) -> Result<()> {
680	let value = normalize_attribute(value, ctx)?;
681
682	if prefix == XMLNS {
683	// The xmlns namespace MUST NOT be declared as the default namespace.
684	if value.as_str() == NS_XMLNS_URI {
685	let pos = ctx.err_pos_at(range.start);
686	return Err(Error::UnexpectedXmlnsUri(pos));
687	}
688
689	let is_xml_ns_uri = value.as_str() == NS_XML_URI;
690
691	// The prefix 'xml' is by definition bound to the namespace name
692	// http://www.w3.org/XML/1998/namespace.
693	// It MUST NOT be bound to any other namespace name.
694	if local == NS_XML_PREFIX {
695	if !is_xml_ns_uri {
696	let pos = ctx.err_pos_at(range.start);
697	return Err(Error::InvalidXmlPrefixUri(pos));
698	}
699	} else {
700	// The xml namespace MUST NOT be bound to a non-xml prefix.
701	if is_xml_ns_uri {
702	let pos = ctx.err_pos_at(range.start);
703	return Err(Error::UnexpectedXmlUri(pos));
704	}
705	}
706
707	// Check for duplicated namespaces.
708	if ctx
709	.doc
710	.namespaces
711	.exists(ctx.namespace_start_idx, Some(local))
712	{
713	let pos = ctx.err_pos_at(range.start);
714	return Err(Error::DuplicatedNamespace(local.to_string(), pos));
715	}
716
717	// Xml namespace should not be added to the namespaces.
718	if !is_xml_ns_uri {
719	ctx.doc.namespaces.push_ns(Some(local), value)?;
720	}
721	} else if local == XMLNS {
722	// The xml namespace MUST NOT be declared as the default namespace.
723	if value.as_str() == NS_XML_URI {
724	let pos = ctx.err_pos_at(range.start);
725	return Err(Error::UnexpectedXmlUri(pos));
726	}
727
728	// The xmlns namespace MUST NOT be declared as the default namespace.
729	if value.as_str() == NS_XMLNS_URI {
730	let pos = ctx.err_pos_at(range.start);
731	return Err(Error::UnexpectedXmlnsUri(pos));
732	}
733
734	ctx.doc.namespaces.push_ns(None, value)?;
735	} else {
736	ctx.current_attributes.push(TempAttributeData {
737	prefix,
738	local,
739	value,
740	range,
741	qname_len,
742	eq_len,
743	});
744	}
745
746	Ok(())
747	}
748
749	fn process_element<'input>(
750	end_token: tokenizer::ElementEnd<'input>,
751	token_range: Range<usize>,
752	ctx: &mut Context<'input>,
753	) -> Result<()> {
754	if ctx.tag_name.name.is_empty() {
755	// May occur in XML like this:
756	// <!DOCTYPE test [ <!ENTITY p '</p>'> ]>
757	// <root>&p;</root>
758
759	if let tokenizer::ElementEnd::Close(..) = end_token {
760	return Err(Error::UnexpectedEntityCloseTag(
761	ctx.err_pos_at(token_range.start),
762	));
763	} else {
764	unreachable!("should be already checked by the tokenizer");
765	}
766	}
767
768	let namespaces = ctx.resolve_namespaces();
769	ctx.namespace_start_idx = ctx.doc.namespaces.tree_order.len();
770
771	let attributes = resolve_attributes(namespaces, ctx)?;
772
773	match end_token {
774	tokenizer::ElementEnd::Empty => {
775	let tag_ns_idx = get_ns_idx_by_prefix(
776	namespaces,
777	ctx.tag_name.prefix_pos,
778	ctx.tag_name.prefix,
779	ctx,
780	)?;
781	let new_element_id = ctx.append_node(
782	NodeKind::Element {
783	tag_name: ExpandedNameIndexed {
784	namespace_idx: tag_ns_idx,
785	local_name: ctx.tag_name.name,
786	},
787	attributes,
788	namespaces,
789	},
790	ctx.tag_name.pos..token_range.end,
791	)?;
792	ctx.awaiting_subtree.push(new_element_id);
793	}
794	tokenizer::ElementEnd::Close(prefix, local) => {
795	let parent_node = &mut ctx.doc.nodes[ctx.parent_id.get_usize()];
796	// should never panic as we start with the single prefix of the
797	// root node and always push another one when changing the parent
798	let parent_prefix = *ctx.parent_prefixes.last().unwrap();
799
800	#[cfg(feature = "positions")]
801	{
802	parent_node.range.end = token_range.end;
803	}
804
805	if let NodeKind::Element { ref tag_name, .. } = parent_node.kind {
806	if prefix != parent_prefix \|\| local != tag_name.local_name {
807	return Err(Error::UnexpectedCloseTag(
808	gen_qname_string(parent_prefix, tag_name.local_name),
809	gen_qname_string(prefix, local),
810	ctx.err_pos_at(token_range.start),
811	));
812	}
813	}
814	ctx.awaiting_subtree.push(ctx.parent_id);
815
816	if let Some(id) = parent_node.parent {
817	ctx.parent_id = id;
818	ctx.parent_prefixes.pop();
819	debug_assert!(!ctx.parent_prefixes.is_empty());
820	} else {
821	unreachable!("should be already checked by the tokenizer");
822	}
823	}
824	tokenizer::ElementEnd::Open => {
825	let tag_ns_idx = get_ns_idx_by_prefix(
826	namespaces,
827	ctx.tag_name.prefix_pos,
828	ctx.tag_name.prefix,
829	ctx,
830	)?;
831	ctx.parent_id = ctx.append_node(
832	NodeKind::Element {
833	tag_name: ExpandedNameIndexed {
834	namespace_idx: tag_ns_idx,
835	local_name: ctx.tag_name.name,
836	},
837	attributes,
838	namespaces,
839	},
840	ctx.tag_name.pos..token_range.end,
841	)?;
842	ctx.parent_prefixes.push(ctx.tag_name.prefix);
843	}
844	}
845
846	Ok(())
847	}
848
849	impl Context<'_> {
850	fn resolve_namespaces(&mut self) -> ShortRange {
851	if let NodeKind::Element { ref namespaces, .. } =
852	self.doc.nodes[self.parent_id.get_usize()].kind
853	{
854	let parent_ns = *namespaces;
855	if self.namespace_start_idx == self.doc.namespaces.tree_order.len() {
856	return parent_ns;
857	}
858
859	for i in parent_ns.to_urange() {
860	if !self.doc.namespaces.exists(
861	self.namespace_start_idx,
862	self.doc
863	.namespaces
864	.get(self.doc.namespaces.tree_order[i])
865	.name,
866	) {
867	self.doc.namespaces.push_ref(i);
868	}
869	}
870	}
871
872	(self.namespace_start_idx..self.doc.namespaces.tree_order.len()).into()
873	}
874	}
875
876	fn resolve_attributes(namespaces: ShortRange, ctx: &mut Context) -> Result<ShortRange> {
877	if ctx.current_attributes.is_empty() {
878	return Ok(ShortRange::new(`0`, `0`));
879	}
880
881	if ctx.doc.attributes.len() + ctx.current_attributes.len() >= core::u32::MAX as usize {
882	return Err(Error::AttributesLimitReached);
883	}
884
885	let start_idx = ctx.doc.attributes.len();
886
887	let current_attributes = core::mem::take(&mut ctx.current_attributes);
888	for attr in current_attributes {
889	let namespace_idx = if attr.prefix == NS_XML_PREFIX {
890	// The prefix 'xml' is by definition bound to the namespace name
891	// http://www.w3.org/XML/1998/namespace. This namespace is added
892	// to the document on creation and is always element 0.
893	Some(NamespaceIdx(`0`))
894	} else if attr.prefix.is_empty() {
895	// 'The namespace name for an unprefixed attribute name
896	// always has no value.'
897	None
898	} else {
899	get_ns_idx_by_prefix(namespaces, attr.range.start, attr.prefix, ctx)?
900	};
901
902	let attr_name = ExpandedNameIndexed {
903	namespace_idx,
904	local_name: attr.local,
905	};
906
907	// Check for duplicated attributes.
908	if ctx.doc.attributes[start_idx..].iter().any(\|attr\| {
909	attr.name.as_expanded_name(&ctx.doc) == attr_name.as_expanded_name(&ctx.doc)
910	}) {
911	let pos = ctx.err_pos_at(attr.range.start);
912	return Err(Error::DuplicatedAttribute(attr.local.to_string(), pos));
913	}
914
915	ctx.doc.attributes.push(AttributeData {
916	name: attr_name,
917	value: attr.value,
918	#[cfg(feature = "positions")]
919	range: attr.range,
920	#[cfg(feature = "positions")]
921	qname_len: attr.qname_len,
922	#[cfg(feature = "positions")]
923	eq_len: attr.eq_len,
924	});
925	}
926
927	Ok((start_idx..ctx.doc.attributes.len()).into())
928	}
929
930	fn process_text<'input>(
931	text: &'input str,
932	range: Range<usize>,
933	ctx: &mut Context<'input>,
934	) -> Result<()> {
935	// Add text as is if it has only valid characters.
936	if !text.bytes().any(\|b\| b == b'&' \|\| b == b'`\r`') {
937	append_text(StringStorage::Borrowed(text), range, ctx)?;
938	ctx.after_text = `true`;
939	return Ok(());
940	}
941
942	let mut text_buffer = TextBuffer::new();
943	let mut is_as_is = `false`; // TODO: explain
944	let mut stream = Stream::from_substr(ctx.doc.text, range.clone());
945	while !stream.at_end() {
946	match parse_next_chunk(&mut stream, &ctx.entities)? {
947	NextChunk::Byte(c) => {
948	if is_as_is {
949	text_buffer.push_raw(c);
950	is_as_is = `false`;
951	} else {
952	text_buffer.push_from_text(c, stream.at_end());
953	}
954	}
955	NextChunk::Char(c) => {
956	for b in CharToBytes::new(c) {
957	if ctx.loop_detector.depth > `0` {
958	text_buffer.push_from_text(b, stream.at_end());
959	} else {
960	// Characters not from entity should be added as is.
961	// Not sure why... At least `lxml` produces the same result.
962	text_buffer.push_raw(b);
963	is_as_is = `true`;
964	}
965	}
966	}
967	NextChunk::Text(fragment) => {
968	is_as_is = `false`;
969
970	if !text_buffer.is_empty() {
971	let storage = StringStorage::new_owned(text_buffer.to_str());
972	append_text(storage, range.clone(), ctx)?;
973	text_buffer.clear();
974	ctx.after_text = `true`;
975	}
976
977	ctx.loop_detector.inc_references(&stream)?;
978	ctx.loop_detector.inc_depth(&stream)?;
979
980	let mut stream = Stream::from_substr(ctx.doc.text, fragment.range());
981	let prev_tag_name = ctx.tag_name;
982	ctx.tag_name = TagNameSpan::new_null();
983	tokenizer::parse_content(&mut stream, ctx)?;
984	ctx.tag_name = prev_tag_name;
985	text_buffer.clear();
986
987	ctx.loop_detector.dec_depth();
988	}
989	}
990	}
991
992	if !text_buffer.is_empty() {
993	append_text(StringStorage::new_owned(text_buffer.finish()), range, ctx)?;
994	ctx.after_text = `true`;
995	}
996
997	Ok(())
998	}
999
1000	// While the whole purpose of CDATA is to indicate to an XML library that this text
1001	// has to be stored as is, carriage return (`\r`) is still has to be replaced with `\n`.
1002	fn process_cdata<'input>(
1003	text: &'input str,
1004	range: Range<usize>,
1005	ctx: &mut Context<'input>,
1006	) -> Result<()> {
1007	// Add text as is if it has only valid characters.
1008	if !text.as_bytes().contains(&b'`\r`') {
1009	append_text(text:StringStorage::Borrowed(text), range, ctx)?;
1010	ctx.after_text = `true`;
1011	return Ok(());
1012	}
1013
1014	let mut text_buffer: TextBuffer = TextBuffer::new();
1015	let count: usize = text.chars().count();
1016	for (i: usize, c: char) in text.chars().enumerate() {
1017	for b: u8 in CharToBytes::new(c) {
1018	text_buffer.push_from_text(c:b, at_end:i + `1` == count);
1019	}
1020	}
1021
1022	if !text_buffer.is_empty() {
1023	append_text(text:StringStorage::new_owned(text_buffer.finish()), range, ctx)?;
1024	ctx.after_text = `true`;
1025	}
1026
1027	Ok(())
1028	}
1029
1030	fn append_text<'input>(
1031	text: StringStorage<'input>,
1032	range: Range<usize>,
1033	ctx: &mut Context<'input>,
1034	) -> Result<()> {
1035	if ctx.after_text {
1036	// Prepend to a previous text node.
1037	if let Some(node: &mut NodeData<'input>) = ctx.doc.nodes.last_mut() {
1038	if let NodeKind::Text(ref mut prev_text: &mut StringStorage<'_>) = node.kind {
1039	let text_str: &str = text.as_str();
1040	let prev_text_str: &str = prev_text.as_str();
1041
1042	let mut concat_text: String = String::with_capacity(text_str.len() + prev_text_str.len());
1043	concat_text.push_str(string:prev_text_str);
1044	concat_text.push_str(string:text_str);
1045	*prev_text = StringStorage::new_owned(concat_text);
1046	}
1047	}
1048	} else {
1049	ctx.append_node(kind:NodeKind::Text(text), range)?;
1050	}
1051
1052	Ok(())
1053	}
1054
/// One unit produced by `parse_next_chunk` while scanning text.
enum NextChunk<'a> {
    /// A plain byte copied from the input.
    Byte(u8),
    /// A character produced by a character reference.
    Char(char),
    /// The value of a referenced entity, to be parsed as content.
    Text(StrSpan<'a>),
}
1060
1061	fn parse_next_chunk<'a>(stream: &mut Stream<'a>, entities: &[Entity<'a>]) -> Result<NextChunk<'a>> {
1062	debug_assert!(!stream.at_end());
1063
1064	// Safe, because we already checked that stream is not at the end.
1065	// But we have an additional `debug_assert` above just in case.
1066	let c = stream.curr_byte_unchecked();
1067
1068	// Check for character/entity references.
1069	if c == b'&' {
1070	let start = stream.pos();
1071	match stream.try_consume_reference() {
1072	Some(Reference::Char(ch)) => Ok(NextChunk::Char(ch)),
1073	Some(Reference::Entity(name)) => entities
1074	.iter()
1075	.find(\|e\| e.name == name)
1076	.map(\|e\| NextChunk::Text(e.value))
1077	.ok_or_else(\|\| {
1078	let pos = stream.gen_text_pos_from(start);
1079	Error::UnknownEntityReference(name.into(), pos)
1080	}),
1081	None => {
1082	let pos = stream.gen_text_pos_from(start);
1083	Err(Error::MalformedEntityReference(pos))
1084	}
1085	}
1086	} else {
1087	stream.advance(`1`);
1088	Ok(NextChunk::Byte(c))
1089	}
1090	}
1091
1092	// https://www.w3.org/TR/REC-xml/#AVNormalize
1093	fn normalize_attribute<'input>(
1094	text: StrSpan<'input>,
1095	ctx: &mut Context<'input>,
1096	) -> Result<StringStorage<'input>> {
1097	if is_normalization_required(&text) {
1098	let mut text_buffer: TextBuffer = TextBuffer::new();
1099	_normalize_attribute(text, &mut text_buffer, ctx)?;
1100	Ok(StringStorage::new_owned(text_buffer.finish()))
1101	} else {
1102	Ok(StringStorage::Borrowed(text.as_str()))
1103	}
1104	}
1105
1106	#[inline]
1107	fn is_normalization_required(text: &StrSpan) -> bool {
1108	// We assume that `&` indicates an entity or a character reference.
1109	// But in rare cases it can be just an another character.
1110
1111	fn check(c: u8) -> bool {
1112	matches!(c, b'&' \| b'`\t`' \| b'`\n`' \| b'`\r`')
1113	}
1114
1115	text.as_str().bytes().any(check)
1116	}
1117
1118	fn _normalize_attribute(text: StrSpan, buffer: &mut TextBuffer, ctx: &mut Context) -> Result<()> {
1119	let mut stream = Stream::from_substr(ctx.doc.text, text.range());
1120	while !stream.at_end() {
1121	// Safe, because we already checked that the stream is not at the end.
1122	let c = stream.curr_byte_unchecked();
1123
1124	if c != b'&' {
1125	stream.advance(`1`);
1126	buffer.push_from_attr(c, stream.curr_byte().ok());
1127	continue;
1128	}
1129
1130	// Check for character/entity references.
1131	let start = stream.pos();
1132	match stream.try_consume_reference() {
1133	Some(Reference::Char(ch)) => {
1134	for b in CharToBytes::new(ch) {
1135	if ctx.loop_detector.depth > `0` {
1136	// Escaped `<` inside an ENTITY is an error.
1137	// Escaped `<` outside an ENTITY is ok.
1138	if b == b'<' {
1139	return Err(Error::InvalidAttributeValue(
1140	stream.gen_text_pos_from(start),
1141	));
1142	}
1143
1144	buffer.push_from_attr(b, None);
1145	} else {
1146	// Characters not from entity should be added as is.
1147	// Not sure why... At least `lxml` produces the same results.
1148	buffer.push_raw(b);
1149	}
1150	}
1151	}
1152	Some(Reference::Entity(name)) => match ctx.entities.iter().find(\|e\| e.name == name) {
1153	Some(entity) => {
1154	ctx.loop_detector.inc_references(&stream)?;
1155	ctx.loop_detector.inc_depth(&stream)?;
1156	_normalize_attribute(entity.value, buffer, ctx)?;
1157	ctx.loop_detector.dec_depth();
1158	}
1159	None => {
1160	let pos = stream.gen_text_pos_from(start);
1161	return Err(Error::UnknownEntityReference(name.into(), pos));
1162	}
1163	},
1164	None => {
1165	let pos = stream.gen_text_pos_from(start);
1166	return Err(Error::MalformedEntityReference(pos));
1167	}
1168	}
1169	}
1170
1171	Ok(())
1172	}
1173
1174	fn get_ns_idx_by_prefix(
1175	namespaces: ShortRange,
1176	prefix_pos: usize,
1177	prefix: &str,
1178	ctx: &Context,
1179	) -> Result<Option<NamespaceIdx>> {
1180	// Prefix CAN be empty when the default namespace was defined.
1181	//
1182	// Example:
1183	// <e xmlns='http://www.w3.org'/>
1184	let prefix_opt = if prefix.is_empty() {
1185	None
1186	} else {
1187	Some(prefix)
1188	};
1189
1190	let idx = ctx.doc.namespaces.tree_order[namespaces.to_urange()]
1191	.iter()
1192	.find(\|idx\| ctx.doc.namespaces.get(**idx).name == prefix_opt);
1193
1194	match idx {
1195	Some(idx) => Ok(Some(*idx)),
1196	None => {
1197	if !prefix.is_empty() {
1198	// If an URI was not found and prefix IS NOT empty than
1199	// we have an unknown namespace.
1200	//
1201	// Example:
1202	// <e random:a='b'/>
1203	let pos = ctx.err_pos_at(prefix_pos);
1204	Err(Error::UnknownNamespace(prefix.to_string(), pos))
1205	} else {
1206	// If an URI was not found and prefix IS empty than
1207	// an element or an attribute doesn't have a namespace.
1208	//
1209	// Example:
1210	// <e a='b'/>
1211	Ok(None)
1212	}
1213	}
1214	}
1215	}
1216
/// Builds a qualified name string from its parts.
///
/// Returns `local` alone when `prefix` is empty and `prefix:local` otherwise.
fn gen_qname_string(prefix: &str, local: &str) -> String {
    if prefix.is_empty() {
        return local.to_string();
    }

    // The final length is known, so allocate exactly once.
    let mut qname = String::with_capacity(prefix.len() + 1 + local.len());
    qname.push_str(prefix);
    qname.push(':');
    qname.push_str(local);
    qname
}
1224
/// Iterate over `char` by `u8`.
///
/// Yields the UTF-8 encoding of a single character one byte at a time.
struct CharToBytes {
    /// UTF-8 bytes of the character; unused trailing slots hold `0xFF`.
    buf: [u8; 4],
    /// Index of the next byte to yield; `4` once the iterator is exhausted.
    idx: u8,
}

impl CharToBytes {
    /// Creates an iterator over the UTF-8 bytes of `c`.
    #[inline]
    fn new(c: char) -> Self {
        // `0xFF` never occurs in valid UTF-8, so it can mark the unused tail.
        let mut buf = [0xFF_u8; 4];
        c.encode_utf8(&mut buf);

        CharToBytes { buf, idx: 0 }
    }
}

impl Iterator for CharToBytes {
    type Item = u8;

    #[inline]
    fn next(&mut self) -> Option<Self::Item> {
        // `get` returns `None` once all four slots were consumed.
        let b = *self.buf.get(usize::from(self.idx))?;

        if b == 0xFF {
            // Reached the padding: mark the iterator as exhausted.
            self.idx = 4;
            return None;
        }

        self.idx += 1;
        Some(b)
    }
}
1260
/// A byte buffer used to assemble normalized text and attribute values.
///
/// Bytes are pushed one at a time. Once a value is complete, the content
/// must be valid UTF-8, otherwise `to_str` and `finish` panic.
struct TextBuffer {
    buffer: Vec<u8>,
}

impl TextBuffer {
    /// Creates an empty buffer with a small preallocated capacity.
    #[inline]
    fn new() -> Self {
        TextBuffer {
            buffer: Vec::with_capacity(32),
        }
    }

    /// Appends a byte without any processing.
    #[inline]
    fn push_raw(&mut self, c: u8) {
        self.buffer.push(c);
    }

    /// Appends a byte of an attribute value.
    ///
    /// `next` is the byte that follows `current` in the input, if any.
    fn push_from_attr(&mut self, current: u8, next: Option<u8>) {
        // \r in \r\n should be ignored.
        if current == b'\r' && next == Some(b'\n') {
            return;
        }

        // \n, \r and \t should be converted into spaces.
        let normalized = match current {
            b'\n' | b'\r' | b'\t' => b' ',
            other => other,
        };

        self.buffer.push(normalized);
    }

    /// Appends a byte of character data.
    ///
    /// Translates \r\n and any \r that is not followed by \n into a single \n character.
    /// `at_end` must be set when `c` is the last byte of the input.
    ///
    /// https://www.w3.org/TR/xml/#sec-line-ends
    fn push_from_text(&mut self, c: u8, at_end: bool) {
        if let Some(last) = self.buffer.last_mut() {
            if *last == b'\r' {
                // The pending \r becomes a \n no matter what follows.
                *last = b'\n';

                // \r\n collapses into the \n that was just written.
                if c == b'\n' {
                    return;
                }
            }
        }

        // A trailing \r has no follower that could resolve it later,
        // so it is translated right away.
        let out = if at_end && c == b'\r' { b'\n' } else { c };
        self.buffer.push(out);
    }

    /// Removes all bytes while keeping the allocated capacity.
    #[inline]
    fn clear(&mut self) {
        self.buffer.clear();
    }

    /// Returns `true` when no bytes were pushed.
    #[inline]
    fn is_empty(&self) -> bool {
        self.buffer.is_empty()
    }

    /// Returns the content as a string slice.
    ///
    /// # Panics
    ///
    /// Panics when the buffer does not contain valid UTF-8.
    #[inline]
    fn to_str(&self) -> &str {
        // `unwrap` is safe, because buffer must contain a valid UTF-8 string.
        core::str::from_utf8(&self.buffer).unwrap()
    }

    /// Consumes the buffer and returns its content as a `String`.
    ///
    /// # Panics
    ///
    /// Panics when the buffer does not contain valid UTF-8.
    #[inline]
    fn finish(self) -> String {
        // `unwrap` is safe, because buffer must contain a valid UTF-8 string.
        String::from_utf8(self.buffer).unwrap()
    }
}
1335