tokenizer.rs source code [crates/roxmltree-0.19.0/src/tokenizer.rs]

1	use core::ops::Range;
2	use core::str;
3
4	use crate::{Error, TextPos};
5
6	type Result<T> = core::result::Result<T, Error>;
7
8	/// Extension methods for XML-subset only operations.
9	trait XmlCharExt {
10	/// Checks if the value is within the
11	/// [NameStartChar](https://www.w3.org/TR/xml/#NT-NameStartChar) range.
12	fn is_xml_name_start(&self) -> bool;
13
14	/// Checks if the value is within the
15	/// [NameChar](https://www.w3.org/TR/xml/#NT-NameChar) range.
16	fn is_xml_name(&self) -> bool;
17
18	/// Checks if the value is within the
19	/// [Char](https://www.w3.org/TR/xml/#NT-Char) range.
20	fn is_xml_char(&self) -> bool;
21	}
22
23	impl XmlCharExt for char {
24	#[inline]
25	fn is_xml_name_start(&self) -> bool {
26	// Check for ASCII first.
27	if self as u32* <= `128` {
28	return matches!(*self as u8, b'A'..=b'Z' \| b'a'..=b'z' \| b':' \| b'_');
29	}
30
31	matches!(self as u32*,
32	`0x0000C0`..=`0x0000D6`
33	\| `0x0000D8`..=`0x0000F6`
34	\| `0x0000F8`..=`0x0002FF`
35	\| `0x000370`..=`0x00037D`
36	\| `0x00037F`..=`0x001FFF`
37	\| `0x00200C`..=`0x00200D`
38	\| `0x002070`..=`0x00218F`
39	\| `0x002C00`..=`0x002FEF`
40	\| `0x003001`..=`0x00D7FF`
41	\| `0x00F900`..=`0x00FDCF`
42	\| `0x00FDF0`..=`0x00FFFD`
43	\| `0x010000`..=`0x0EFFFF`)
44	}
45
46	#[inline]
47	fn is_xml_name(&self) -> bool {
48	// Check for ASCII first.
49	if self as u32* <= `128` {
50	return (*self as u8).is_xml_name();
51	}
52
53	matches!(self as u32*, `0x0000B7`
54	\| `0x0000C0`..=`0x0000D6`
55	\| `0x0000D8`..=`0x0000F6`
56	\| `0x0000F8`..=`0x0002FF`
57	\| `0x000300`..=`0x00036F`
58	\| `0x000370`..=`0x00037D`
59	\| `0x00037F`..=`0x001FFF`
60	\| `0x00200C`..=`0x00200D`
61	\| `0x00203F`..=`0x002040`
62	\| `0x002070`..=`0x00218F`
63	\| `0x002C00`..=`0x002FEF`
64	\| `0x003001`..=`0x00D7FF`
65	\| `0x00F900`..=`0x00FDCF`
66	\| `0x00FDF0`..=`0x00FFFD`
67	\| `0x010000`..=`0x0EFFFF`)
68	}
69
70	#[inline]
71	fn is_xml_char(&self) -> bool {
72	// Does not check for surrogate code points U+D800-U+DFFF,
73	// since that check was performed by Rust when the `&str` was constructed.
74	if (self as u32*) < `0x20` {
75	return (*self as u8).is_xml_space();
76	}
77
78	!matches!(self as u32*, `0xFFFF` \| `0xFFFE`)
79	}
80	}
81
/// Extension methods for byte-level (ASCII-only) XML character checks.
trait XmlByteExt {
    /// Checks if byte is a space.
    ///
    /// `[ \r\n\t]`
    fn is_xml_space(&self) -> bool;

    /// Checks if byte is within the ASCII part of the
    /// [NameChar](https://www.w3.org/TR/xml/#NT-NameChar) range.
    fn is_xml_name(&self) -> bool;
}

impl XmlByteExt for u8 {
    #[inline]
    fn is_xml_space(&self) -> bool {
        matches!(*self, b' ' | b'\t' | b'\n' | b'\r')
    }

    #[inline]
    fn is_xml_name(&self) -> bool {
        matches!(
            *self,
            b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b':' | b'_' | b'-' | b'.'
        )
    }
}
104
/// A string slice.
///
/// Like `&str`, but also contains the position in the input XML
/// from which it was parsed.
#[must_use]
#[derive(Clone, Copy)]
pub struct StrSpan<'input> {
    /// The sliced text itself.
    text: &'input str,
    /// Byte offset reported as the beginning of [`StrSpan::range`].
    start: usize,
}

impl<'input> From<&'input str> for StrSpan<'input> {
    /// Wraps a whole string; the resulting span starts at offset 0.
    #[inline]
    fn from(text: &'input str) -> Self {
        StrSpan { text, start: 0 }
    }
}

impl<'input> StrSpan<'input> {
    /// Creates a span covering `text[start..end]` that remembers `start`
    /// as its offset.
    ///
    /// # Panics
    ///
    /// Panics if `start..end` is not a valid char-boundary range of `text`
    /// (regular string slicing rules).
    #[inline]
    pub fn from_substr(text: &str, start: usize, end: usize) -> StrSpan<'_> {
        debug_assert!(start <= end);
        StrSpan {
            text: &text[start..end],
            start,
        }
    }

    /// Returns the byte range `start..start + len` of this span.
    #[inline]
    pub fn range(&self) -> Range<usize> {
        self.start..(self.start + self.text.len())
    }

    /// Returns the underlying text.
    #[inline]
    pub fn as_str(&self) -> &'input str {
        self.text
    }

    /// Returns `text[start..end]`, with indices relative to this span's
    /// own text (not to the recorded `start` offset).
    #[inline]
    fn slice_region(&self, start: usize, end: usize) -> &'input str {
        &self.text[start..end]
    }
}
148
/// A token produced by the tokenizer and delivered through
/// [`XmlEvents::token`].
///
/// All `Range<usize>` fields are byte ranges measured from the token's
/// starting position to the stream position after it was consumed.
pub enum Token<'input> {
    /// `<?target content?>`
    ///
    /// Fields: target name, optional (non-empty) content, range.
    ProcessingInstruction(&'input str, Option<&'input str>, Range<usize>),

    /// `<!-- text -->`
    ///
    /// Fields: comment text (without the delimiters), range.
    Comment(&'input str, Range<usize>),

    /// `<!ENTITY ns_extend "http://test.com">`
    ///
    /// Fields: entity name, quoted entity value as a span.
    EntityDeclaration(&'input str, StrSpan<'input>),

    /// `<ns:elem`
    ///
    /// Fields: prefix (empty when absent), local name, position of `<`.
    ElementStart(&'input str, &'input str, usize),

    /// `ns:attr="value"`
    ///
    /// Fields: position where the attribute starts, prefix (empty when
    /// absent), local name, value span (without the quotes).
    Attribute(usize, &'input str, &'input str, StrSpan<'input>),

    /// The end of an element tag; see [`ElementEnd`] for the three forms.
    ElementEnd(ElementEnd<'input>, Range<usize>),

    /// Contains text between elements including whitespaces.
    /// Basically everything between `>` and `<`.
    /// Except `]]>`, which is not allowed and will lead to an error.
    Text(&'input str, Range<usize>),

    /// `<![CDATA[text]]>`
    ///
    /// Fields: the text between `<![CDATA[` and `]]>`, range.
    Cdata(&'input str, Range<usize>),
}
175
/// `ElementEnd` token.
///
/// Describes how an element tag was terminated.
#[derive(Clone, Copy)]
pub enum ElementEnd<'input> {
    /// Indicates `>`: the start tag was closed and element content follows.
    Open,
    /// Indicates `</ns:name>`; carries the prefix (empty when absent)
    /// and the local tag name.
    Close(&'input str, &'input str),
    /// Indicates `/>`: an empty element.
    Empty,
}
186
/// Receiver of the tokens produced by the tokenizer.
pub trait XmlEvents<'input> {
    /// Called for each token in the order the tokenizer produces them.
    ///
    /// Every call site in this file propagates a returned `Err` with `?`,
    /// so returning an error aborts tokenization with that error.
    fn token(&mut self, token: Token<'input>) -> Result<()>;
}
190
191	// document ::= prolog element Misc*
192	pub fn parse<'input>(
193	text: &'input str,
194	allow_dtd: bool,
195	events: &mut dyn XmlEvents<'input>,
196	) -> Result<()> {
197	let s = &mut Stream::new(text);
198
199	// Skip UTF-8 BOM.
200	if s.starts_with(&[`0xEF`, `0xBB`, `0xBF`]) {
201	s.advance(`3`);
202	}
203
204	if s.starts_with(b"<?xml ") {
205	parse_declaration(s)?;
206	}
207
208	parse_misc(s, events)?;
209
210	s.skip_spaces();
211	if s.starts_with(b"<!DOCTYPE") {
212	if !allow_dtd {
213	return Err(Error::DtdDetected);
214	}
215
216	parse_doctype(s, events)?;
217	parse_misc(s, events)?;
218	}
219
220	s.skip_spaces();
221	if s.curr_byte().ok() == Some(b'<') {
222	parse_element(s, events)?;
223	}
224
225	parse_misc(s, events)?;
226
227	if !s.at_end() {
228	return Err(Error::UnknownToken(s.gen_text_pos()));
229	}
230
231	Ok(())
232	}
233
234	// Misc ::= Comment \| PI \| S
235	fn parse_misc<'input>(s: &mut Stream<'input>, events: &mut dyn XmlEvents<'input>) -> Result<()> {
236	while !s.at_end() {
237	s.skip_spaces();
238	if s.starts_with(text:b"<!--") {
239	parse_comment(s, events)?;
240	} else if s.starts_with(text:b"<?") {
241	parse_pi(s, events)?;
242	} else {
243	break;
244	}
245	}
246
247	Ok(())
248	}
249
250	// XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
251	//
252	// We don't actually return a token for the XML declaration and only validate it.
253	fn parse_declaration(s: &mut Stream) -> Result<()> {
254	fn consume_spaces(s: &mut Stream) -> Result<()> {
255	if s.starts_with_space() {
256	s.skip_spaces();
257	} else if !s.starts_with(b"?>") && !s.at_end() {
258	return Err(Error::InvalidChar2(
259	"a whitespace",
260	s.curr_byte_unchecked(),
261	s.gen_text_pos(),
262	));
263	}
264
265	Ok(())
266	}
267
268	s.advance(`5`); // <?xml
269	consume_spaces(s)?;
270
271	// The `version` "attribute" is mandatory.
272	if !s.starts_with(b"version") {
273	// Will trigger the InvalidString error, which is what we want.
274	return s.skip_string(b"version");
275	}
276	let _ = parse_attribute(s)?;
277	consume_spaces(s)?;
278
279	if s.starts_with(b"encoding") {
280	let _ = parse_attribute(s)?;
281	consume_spaces(s)?;
282	}
283
284	if s.starts_with(b"standalone") {
285	let _ = parse_attribute(s)?;
286	}
287
288	s.skip_spaces();
289	s.skip_string(b"?>")?;
290
291	Ok(())
292	}
293
294	// '<!--' ((Char - '-') \| ('-' (Char - '-'))) '-->'*
295	fn parse_comment<'input>(s: &mut Stream<'input>, events: &mut dyn XmlEvents<'input>) -> Result<()> {
296	let start: usize = s.pos();
297	s.advance(`4`);
298	let text: &str = s.consume_chars(\|s: &Stream<'_>, c: char\| !(c == '-' && s.starts_with(text:b"-->")))?;
299	s.skip_string(text:b"-->")?;
300
301	if text.contains("--") {
302	return Err(Error::InvalidComment(s.gen_text_pos_from(pos:start)));
303	}
304
305	if text.ends_with('-') {
306	return Err(Error::InvalidComment(s.gen_text_pos_from(pos:start)));
307	}
308
309	let range: Range = s.range_from(start);
310	events.token(Token::Comment(text, range))?;
311
312	Ok(())
313	}
314
315	// PI ::= '<?' PITarget (S (Char - (Char* '?>' Char)))? '?>'
316	// PITarget ::= Name - (('X' \| 'x') ('M' \| 'm') ('L' \| 'l'))
317	fn parse_pi<'input>(s: &mut Stream<'input>, events: &mut dyn XmlEvents<'input>) -> Result<()> {
318	if s.starts_with(text:b"<?xml ") {
319	return Err(Error::UnexpectedDeclaration(s.gen_text_pos()));
320	}
321
322	let start: usize = s.pos();
323	s.advance(`2`);
324	let target: &str = s.consume_name()?;
325	s.skip_spaces();
326	let content: &str = s.consume_chars(\|s: &Stream<'_>, c: char\| !(c == '?' && s.starts_with(text:b"?>")))?;
327	let content: Option<&str> = if !content.is_empty() {
328	Some(content)
329	} else {
330	None
331	};
332
333	s.skip_string(text:b"?>")?;
334
335	let range: Range = s.range_from(start);
336	events.token(Token::ProcessingInstruction(target, content, range))?;
337	Ok(())
338	}
339
340	fn parse_doctype<'input>(s: &mut Stream<'input>, events: &mut dyn XmlEvents<'input>) -> Result<()> {
341	let start = s.pos();
342	parse_doctype_start(s)?;
343	s.skip_spaces();
344
345	if s.curr_byte() == Ok(b'>') {
346	s.advance(`1`);
347	return Ok(());
348	}
349
350	s.advance(`1`); // [
351	while !s.at_end() {
352	s.skip_spaces();
353	if s.starts_with(b"<!ENTITY") {
354	parse_entity_decl(s, events)?;
355	} else if s.starts_with(b"<!--") {
356	parse_comment(s, events)?;
357	} else if s.starts_with(b"<?") {
358	parse_pi(s, events)?;
359	} else if s.starts_with(b"]") {
360	// DTD ends with ']' S? '>', therefore we have to skip possible spaces.
361	s.advance(`1`);
362	s.skip_spaces();
363	match s.curr_byte() {
364	Ok(b'>') => {
365	s.advance(`1`);
366	break;
367	}
368	Ok(c) => {
369	return Err(Error::InvalidChar2("'>'", c, s.gen_text_pos()));
370	}
371	Err(_) => {
372	return Err(Error::UnexpectedEndOfStream);
373	}
374	}
375	} else if s.starts_with(b"<!ELEMENT")
376	\|\| s.starts_with(b"<!ATTLIST")
377	\|\| s.starts_with(b"<!NOTATION")
378	{
379	if consume_decl(s).is_err() {
380	let pos = s.gen_text_pos_from(start);
381	return Err(Error::UnknownToken(pos));
382	}
383	} else {
384	return Err(Error::UnknownToken(s.gen_text_pos()));
385	}
386	}
387
388	Ok(())
389	}
390
391	// doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? ('[' intSubset ']' S?)? '>'
392	fn parse_doctype_start(s: &mut Stream) -> Result<()> {
393	s.advance(`9`);
394
395	s.consume_spaces()?;
396	s.skip_name()?;
397	s.skip_spaces();
398
399	let _ = parse_external_id(s)?;
400	s.skip_spaces();
401
402	let c: u8 = s.curr_byte()?;
403	if c != b'[' && c != b'>' {
404	return Err(Error::InvalidChar2("'[' or '>'", c, s.gen_text_pos()));
405	}
406
407	Ok(())
408	}
409
410	// ExternalID ::= 'SYSTEM' S SystemLiteral \| 'PUBLIC' S PubidLiteral S SystemLiteral
411	fn parse_external_id(s: &mut Stream) -> Result<bool> {
412	let v = if s.starts_with(b"SYSTEM") \|\| s.starts_with(b"PUBLIC") {
413	let start = s.pos();
414	s.advance(`6`);
415	let id = s.slice_back(start);
416
417	s.consume_spaces()?;
418	let quote = s.consume_quote()?;
419	let _ = s.consume_bytes(\|c\| c != quote);
420	s.consume_byte(quote)?;
421
422	if id == "SYSTEM" {
423	// Ok
424	} else {
425	s.consume_spaces()?;
426	let quote = s.consume_quote()?;
427	let _ = s.consume_bytes(\|c\| c != quote);
428	s.consume_byte(quote)?;
429	}
430
431	`true`
432	} else {
433	`false`
434	};
435
436	Ok(v)
437	}
438
439	// EntityDecl ::= GEDecl \| PEDecl
440	// GEDecl ::= '<!ENTITY' S Name S EntityDef S? '>'
441	// PEDecl ::= '<!ENTITY' S '%' S Name S PEDef S? '>'
442	fn parse_entity_decl<'input>(
443	s: &mut Stream<'input>,
444	events: &mut dyn XmlEvents<'input>,
445	) -> Result<()> {
446	s.advance(`8`);
447	s.consume_spaces()?;
448
449	let is_ge: bool = if s.try_consume_byte(b'%') {
450	s.consume_spaces()?;
451	`false`
452	} else {
453	`true`
454	};
455
456	let name: &str = s.consume_name()?;
457	s.consume_spaces()?;
458	if let Some(definition: StrSpan<'_>) = parse_entity_def(s, is_ge)? {
459	events.token(Token::EntityDeclaration(name, definition))?;
460	}
461	s.skip_spaces();
462	s.consume_byte(b'>')?;
463
464	Ok(())
465	}
466
467	// EntityDef ::= EntityValue \| (ExternalID NDataDecl?)
468	// PEDef ::= EntityValue \| ExternalID
469	// EntityValue ::= '"' ([^%&"] \| PEReference \| Reference) '"' \| "'" ([^%&']*
470	// \| PEReference \| Reference) "'"*
471	// ExternalID ::= 'SYSTEM' S SystemLiteral \| 'PUBLIC' S PubidLiteral S SystemLiteral
472	// NDataDecl ::= S 'NDATA' S Name
473	fn parse_entity_def<'input>(
474	s: &mut Stream<'input>,
475	is_ge: bool,
476	) -> Result<Option<StrSpan<'input>>> {
477	let c = s.curr_byte()?;
478	match c {
479	b'"' \| b'`\'`' => {
480	let quote = s.consume_quote()?;
481	let start = s.pos();
482	s.skip_bytes(\|c\| c != quote);
483	let value = s.slice_back_span(start);
484	s.consume_byte(quote)?;
485	Ok(Some(value))
486	}
487	b'S' \| b'P' => {
488	if parse_external_id(s)? {
489	if is_ge {
490	s.skip_spaces();
491	if s.starts_with(b"NDATA") {
492	s.advance(`5`);
493	s.consume_spaces()?;
494	s.skip_name()?;
495	// TODO: NDataDecl is not supported
496	}
497	}
498
499	Ok(None)
500	} else {
501	Err(Error::InvalidExternalID(s.gen_text_pos()))
502	}
503	}
504	_ => {
505	let pos = s.gen_text_pos();
506	Err(Error::InvalidChar2("a quote, SYSTEM or PUBLIC", c, pos))
507	}
508	}
509	}
510
511	fn consume_decl(s: &mut Stream) -> Result<()> {
512	s.skip_bytes(\|c: u8\| c != b'>');
513	s.consume_byte(b'>')?;
514	Ok(())
515	}
516
517	// element ::= EmptyElemTag \| STag content ETag
518	// '<' Name (S Attribute) S? '>'*
519	fn parse_element<'input>(s: &mut Stream<'input>, events: &mut dyn XmlEvents<'input>) -> Result<()> {
520	let start = s.pos();
521	s.advance(`1`); // <
522	let (prefix, local) = s.consume_qname()?;
523	events.token(Token::ElementStart(prefix, local, start))?;
524
525	let mut open = `false`;
526	while !s.at_end() {
527	let has_space = s.starts_with_space();
528	s.skip_spaces();
529	let start = s.pos();
530	match s.curr_byte()? {
531	b'/' => {
532	s.advance(`1`);
533	s.consume_byte(b'>')?;
534	let range = s.range_from(start);
535	events.token(Token::ElementEnd(ElementEnd::Empty, range))?;
536	break;
537	}
538	b'>' => {
539	s.advance(`1`);
540	let range = s.range_from(start);
541	events.token(Token::ElementEnd(ElementEnd::Open, range))?;
542	open = `true`;
543	break;
544	}
545	_ => {
546	// An attribute must be preceded with a whitespace.
547	if !has_space {
548	// Will always trigger an error. Which is what we want.
549	s.consume_spaces()?;
550	}
551
552	// Manual inlining of `parse_attribute` for performance.
553	// We cannot mark `parse_attribute` as `#[inline(always)]`
554	// because it will blow up the binary size.
555	let (prefix, local) = s.consume_qname()?;
556	s.consume_eq()?;
557	let quote = s.consume_quote()?;
558	let quote_c = quote as char;
559	// The attribute value must not contain the < character.
560	let value_start = s.pos();
561	s.skip_chars(\|_, c\| c != quote_c && c != '<')?;
562	let value = s.slice_back_span(value_start);
563	s.consume_byte(quote)?;
564	events.token(Token::Attribute(start, prefix, local, value))?;
565	}
566	}
567	}
568
569	if open {
570	parse_content(s, events)?;
571	}
572
573	Ok(())
574	}
575
576	// Attribute ::= Name Eq AttValue
577	fn parse_attribute<'input>(
578	s: &mut Stream<'input>,
579	) -> Result<(&'input str, &'input str, StrSpan<'input>)> {
580	let (prefix: &str, local: &str) = s.consume_qname()?;
581	s.consume_eq()?;
582	let quote: u8 = s.consume_quote()?;
583	let quote_c: char = quote as char;
584	// The attribute value must not contain the < character.
585	let value_start: usize = s.pos();
586	s.skip_chars(\|_, c: char\| c != quote_c && c != '<')?;
587	let value: StrSpan<'_> = s.slice_back_span(pos:value_start);
588	s.consume_byte(quote)?;
589	Ok((prefix, local, value))
590	}
591
592	// content ::= CharData? ((element \| Reference \| CDSect \| PI \| Comment) CharData?)*
593	pub fn parse_content<'input>(
594	s: &mut Stream<'input>,
595	events: &mut dyn XmlEvents<'input>,
596	) -> Result<()> {
597	while !s.at_end() {
598	match s.curr_byte() {
599	Ok(b'<') => match s.next_byte() {
600	Ok(b'!') => {
601	if s.starts_with(b"<!--") {
602	parse_comment(s, events)?;
603	} else if s.starts_with(b"<![CDATA[") {
604	parse_cdata(s, events)?;
605	} else {
606	return Err(Error::UnknownToken(s.gen_text_pos()));
607	}
608	}
609	Ok(b'?') => parse_pi(s, events)?,
610	Ok(b'/') => {
611	parse_close_element(s, events)?;
612	break;
613	}
614	Ok(_) => parse_element(s, events)?,
615	Err(_) => return Err(Error::UnknownToken(s.gen_text_pos())),
616	},
617	Ok(_) => parse_text(s, events)?,
618	Err(_) => return Err(Error::UnknownToken(s.gen_text_pos())),
619	}
620	}
621
622	Ok(())
623	}
624
625	// CDSect ::= CDStart CData CDEnd
626	// CDStart ::= '<![CDATA['
627	// CData ::= (Char - (Char* ']]>' Char))
628	// CDEnd ::= ']]>'
629	fn parse_cdata<'input>(s: &mut Stream<'input>, events: &mut dyn XmlEvents<'input>) -> Result<()> {
630	let start: usize = s.pos();
631	s.advance(`9`); // <![CDATA[
632	let text: &str = s.consume_chars(\|s: &Stream<'_>, c: char\| !(c == ']' && s.starts_with(text:b"]]>")))?;
633	s.skip_string(text:b"]]>")?;
634	let range: Range = s.range_from(start);
635	events.token(Token::Cdata(text, range))?;
636	Ok(())
637	}
638
639	// '</' Name S? '>'
640	fn parse_close_element<'input>(
641	s: &mut Stream<'input>,
642	events: &mut dyn XmlEvents<'input>,
643	) -> Result<()> {
644	let start: usize = s.pos();
645	s.advance(`2`); // </
646
647	let (prefix: &str, tag_name: &str) = s.consume_qname()?;
648	s.skip_spaces();
649	s.consume_byte(b'>')?;
650
651	let range: Range = s.range_from(start);
652	events.token(Token::ElementEnd(
653	ElementEnd::Close(prefix, tag_name),
654	range,
655	))?;
656	Ok(())
657	}
658
659	fn parse_text<'input>(s: &mut Stream<'input>, events: &mut dyn XmlEvents<'input>) -> Result<()> {
660	let start: usize = s.pos();
661	let text: &str = s.consume_chars(\|_, c: char\| c != '<')?;
662
663	// According to the spec, `]]>` must not appear inside a Text node.
664	// https://www.w3.org/TR/xml/#syntax
665	//
666	// Search for `>` first, since it's a bit faster than looking for `]]>`.
667	if text.contains('>') && text.contains("]]>") {
668	return Err(Error::InvalidCharacterData(s.gen_text_pos()));
669	}
670
671	let range: Range = s.range_from(start);
672	events.token(Token::Text(text, range))?;
673	Ok(())
674	}
675
/// Representation of the [Reference](https://www.w3.org/TR/xml/#NT-Reference) value.
///
/// Produced by `Stream::try_consume_reference`.
#[derive(Clone, Copy)]
pub enum Reference<'input> {
    /// An entity reference.
    ///
    /// Carries the entity name. The five predefined entities (`quot`, `amp`,
    /// `apos`, `lt`, `gt`) are never reported here; they are resolved to
    /// [`Reference::Char`] instead.
    ///
    /// <https://www.w3.org/TR/xml/#NT-EntityRef>
    Entity(&'input str),

    /// A character reference.
    ///
    /// Carries the referenced character, either from a numeric reference
    /// (`&#...;` / `&#x...;`) or from a predefined entity.
    ///
    /// <https://www.w3.org/TR/xml/#NT-CharRef>
    Char(char),
}
689
690	#[derive(Clone)]
691	pub struct Stream<'input> {
692	pos: usize,
693	end: usize,
694	span: StrSpan<'input>,
695	}
696
697	impl<'input> Stream<'input> {
698	#[inline]
699	pub fn new(text: &'input str) -> Self {
700	Stream {
701	pos: `0`,
702	end: text.len(),
703	span: text.into(),
704	}
705	}
706
707	#[inline]
708	pub fn from_substr(text: &'input str, fragment: Range<usize>) -> Self {
709	Stream {
710	pos: fragment.start,
711	end: fragment.end,
712	span: text.into(),
713	}
714	}
715
716	#[inline]
717	pub fn pos(&self) -> usize {
718	self.pos
719	}
720
721	#[inline]
722	pub fn at_end(&self) -> bool {
723	self.pos >= self.end
724	}
725
726	#[inline]
727	pub fn curr_byte(&self) -> Result<u8> {
728	if self.at_end() {
729	return Err(Error::UnexpectedEndOfStream);
730	}
731
732	Ok(self.curr_byte_unchecked())
733	}
734
735	#[inline]
736	pub fn curr_byte_unchecked(&self) -> u8 {
737	self.span.text.as_bytes()[self.pos]
738	}
739
740	#[inline]
741	fn next_byte(&self) -> Result<u8> {
742	if self.pos + `1` >= self.end {
743	return Err(Error::UnexpectedEndOfStream);
744	}
745
746	Ok(self.span.as_str().as_bytes()[self.pos + `1`])
747	}
748
749	#[inline]
750	pub fn advance(&mut self, n: usize) {
751	debug_assert!(self.pos + n <= self.end);
752	self.pos += n;
753	}
754
755	#[inline]
756	fn starts_with(&self, text: &[u8]) -> bool {
757	self.span.text.as_bytes()[self.pos..self.end].starts_with(text)
758	}
759
760	fn consume_byte(&mut self, c: u8) -> Result<()> {
761	let curr = self.curr_byte()?;
762	if curr != c {
763	return Err(Error::InvalidChar(c, curr, self.gen_text_pos()));
764	}
765
766	self.advance(`1`);
767	Ok(())
768	}
769
770	// Unlike `consume_byte()` will not return any errors.
771	fn try_consume_byte(&mut self, c: u8) -> bool {
772	match self.curr_byte() {
773	Ok(b) if b == c => {
774	self.advance(`1`);
775	`true`
776	}
777	_ => `false`,
778	}
779	}
780
781	fn skip_string(&mut self, text: &'static [u8]) -> Result<()> {
782	if !self.starts_with(text) {
783	let pos = self.gen_text_pos();
784
785	// Assume that all input `text` are valid UTF-8 strings, so unwrap is safe.
786	let expected = str::from_utf8(text).unwrap();
787
788	return Err(Error::InvalidString(expected, pos));
789	}
790
791	self.advance(text.len());
792	Ok(())
793	}
794
795	#[inline]
796	fn consume_bytes<F: Fn(u8) -> bool>(&mut self, f: F) -> &'input str {
797	let start = self.pos;
798	self.skip_bytes(f);
799	self.slice_back(start)
800	}
801
802	fn skip_bytes<F: Fn(u8) -> bool>(&mut self, f: F) {
803	while !self.at_end() && f(self.curr_byte_unchecked()) {
804	self.advance(`1`);
805	}
806	}
807
808	#[inline]
809	fn consume_chars<F>(&mut self, f: F) -> Result<&'input str>
810	where
811	F: Fn(&Stream, char) -> bool,
812	{
813	let start = self.pos;
814	self.skip_chars(f)?;
815	Ok(self.slice_back(start))
816	}
817
818	#[inline]
819	fn skip_chars<F>(&mut self, f: F) -> Result<()>
820	where
821	F: Fn(&Stream, char) -> bool,
822	{
823	for c in self.chars() {
824	if !c.is_xml_char() {
825	return Err(Error::NonXmlChar(c, self.gen_text_pos()));
826	} else if f(self, c) {
827	self.advance(c.len_utf8());
828	} else {
829	break;
830	}
831	}
832
833	Ok(())
834	}
835
836	#[inline]
837	fn chars(&self) -> str::Chars<'input> {
838	self.span.as_str()[self.pos..self.end].chars()
839	}
840
841	#[inline]
842	fn slice_back(&self, pos: usize) -> &'input str {
843	self.span.slice_region(pos, self.pos)
844	}
845
846	#[inline]
847	fn slice_back_span(&self, pos: usize) -> StrSpan<'input> {
848	StrSpan::from_substr(self.span.text, pos, self.pos)
849	}
850
851	#[inline]
852	fn range_from(&self, start: usize) -> Range<usize> {
853	start..self.pos
854	}
855
856	#[inline]
857	fn skip_spaces(&mut self) {
858	while self.starts_with_space() {
859	self.advance(`1`);
860	}
861	}
862
863	#[inline]
864	fn starts_with_space(&self) -> bool {
865	!self.at_end() && self.curr_byte_unchecked().is_xml_space()
866	}
867
868	// Like `skip_spaces()`, but checks that first char is actually a space.
869	fn consume_spaces(&mut self) -> Result<()> {
870	if self.at_end() {
871	return Err(Error::UnexpectedEndOfStream);
872	}
873
874	if !self.starts_with_space() {
875	return Err(Error::InvalidChar2(
876	"a whitespace",
877	self.curr_byte_unchecked(),
878	self.gen_text_pos(),
879	));
880	}
881
882	self.skip_spaces();
883	Ok(())
884	}
885
886	/// Consumes according to: <https://www.w3.org/TR/xml/#NT-Reference>
887	pub fn try_consume_reference(&mut self) -> Option<Reference<'input>> {
888	let start = self.pos();
889
890	// Consume reference on a substream.
891	let mut s = self.clone();
892	let result = s.consume_reference()?;
893
894	// If the current data is a reference than advance the current stream
895	// by number of bytes read by substream.
896	self.advance(s.pos() - start);
897	Some(result)
898	}
899
900	#[inline(never)]
901	fn consume_reference(&mut self) -> Option<Reference<'input>> {
902	if !self.try_consume_byte(b'&') {
903	return None;
904	}
905
906	let reference = if self.try_consume_byte(b'#') {
907	let (value, radix) = if self.try_consume_byte(b'x') {
908	let value =
909	self.consume_bytes(\|c\| matches!(c, b'0'..=b'9' \| b'A'..=b'F' \| b'a'..=b'f'));
910	(value, `16`)
911	} else {
912	let value = self.consume_bytes(\|c\| c.is_ascii_digit());
913	(value, `10`)
914	};
915
916	let n = u32::from_str_radix(value, radix).ok()?;
917
918	let c = char::from_u32(n).unwrap_or('`\u{FFFD}`');
919	if !c.is_xml_char() {
920	return None;
921	}
922
923	Reference::Char(c)
924	} else {
925	let name = self.consume_name().ok()?;
926	match name {
927	"quot" => Reference::Char('"'),
928	"amp" => Reference::Char('&'),
929	"apos" => Reference::Char('`\'`'),
930	"lt" => Reference::Char('<'),
931	"gt" => Reference::Char('>'),
932	_ => Reference::Entity(name),
933	}
934	};
935
936	self.consume_byte(b';').ok()?;
937
938	Some(reference)
939	}
940
941	/// Consumes according to: <https://www.w3.org/TR/xml/#NT-Name>
942	fn consume_name(&mut self) -> Result<&'input str> {
943	let start = self.pos();
944	self.skip_name()?;
945
946	let name = self.slice_back(start);
947	if name.is_empty() {
948	return Err(Error::InvalidName(self.gen_text_pos_from(start)));
949	}
950
951	Ok(name)
952	}
953
954	/// The same as `consume_name()`, but does not return a consumed name.
955	fn skip_name(&mut self) -> Result<()> {
956	let start = self.pos();
957	let mut iter = self.chars();
958	if let Some(c) = iter.next() {
959	if c.is_xml_name_start() {
960	self.advance(c.len_utf8());
961	} else {
962	return Err(Error::InvalidName(self.gen_text_pos_from(start)));
963	}
964	}
965
966	for c in iter {
967	if c.is_xml_name() {
968	self.advance(c.len_utf8());
969	} else {
970	break;
971	}
972	}
973
974	Ok(())
975	}
976
977	/// Consumes a qualified XML name and returns it.
978	///
979	/// Consumes according to: <https://www.w3.org/TR/xml-names/#ns-qualnames>
980	#[inline(never)]
981	fn consume_qname(&mut self) -> Result<(&'input str, &'input str)> {
982	let start = self.pos();
983
984	let mut splitter = None;
985
986	while !self.at_end() {
987	// Check for ASCII first for performance reasons.
988	let b = self.curr_byte_unchecked();
989	if b < `128` {
990	if b == b':' {
991	if splitter.is_none() {
992	splitter = Some(self.pos());
993	self.advance(`1`);
994	} else {
995	// Multiple `:` is an error.
996	return Err(Error::InvalidName(self.gen_text_pos_from(start)));
997	}
998	} else if b.is_xml_name() {
999	self.advance(`1`);
1000	} else {
1001	break;
1002	}
1003	} else {
1004	// Fallback to Unicode code point.
1005	match self.chars().nth(`0`) {
1006	Some(c) if c.is_xml_name() => {
1007	self.advance(c.len_utf8());
1008	}
1009	_ => break,
1010	}
1011	}
1012	}
1013
1014	let (prefix, local) = if let Some(splitter) = splitter {
1015	let prefix = self.span.slice_region(start, splitter);
1016	let local = self.slice_back(splitter + `1`);
1017	(prefix, local)
1018	} else {
1019	let local = self.slice_back(start);
1020	// Slice an empty prefix. This way we can preserve attribute start position.
1021	(self.span.slice_region(start, start), local)
1022	};
1023
1024	// Prefix must start with a `NameStartChar`.
1025	if let Some(c) = prefix.chars().nth(`0`) {
1026	if !c.is_xml_name_start() {
1027	return Err(Error::InvalidName(self.gen_text_pos_from(start)));
1028	}
1029	}
1030
1031	// Local name must start with a `NameStartChar`.
1032	if let Some(c) = local.chars().nth(`0`) {
1033	if !c.is_xml_name_start() {
1034	return Err(Error::InvalidName(self.gen_text_pos_from(start)));
1035	}
1036	} else {
1037	// If empty - error.
1038	return Err(Error::InvalidName(self.gen_text_pos_from(start)));
1039	}
1040
1041	Ok((prefix, local))
1042	}
1043
1044	fn consume_eq(&mut self) -> Result<()> {
1045	self.skip_spaces();
1046	self.consume_byte(b'=')?;
1047	self.skip_spaces();
1048
1049	Ok(())
1050	}
1051
1052	fn consume_quote(&mut self) -> Result<u8> {
1053	let c = self.curr_byte()?;
1054	if c == b'`\'`' \|\| c == b'"' {
1055	self.advance(`1`);
1056	Ok(c)
1057	} else {
1058	Err(Error::InvalidChar2("a quote", c, self.gen_text_pos()))
1059	}
1060	}
1061
1062	/// Calculates a current absolute position.
1063	///
1064	/// This operation is very expensive. Use only for errors.
1065	#[inline(never)]
1066	pub fn gen_text_pos(&self) -> TextPos {
1067	let text = self.span.as_str();
1068	let end = self.pos;
1069
1070	let row = Self::calc_curr_row(text, end);
1071	let col = Self::calc_curr_col(text, end);
1072	TextPos::new(row, col)
1073	}
1074
1075	/// Calculates an absolute position at `pos`.
1076	///
1077	/// This operation is very expensive. Use only for errors.
1078	#[inline(never)]
1079	pub fn gen_text_pos_from(&self, pos: usize) -> TextPos {
1080	let mut s = self.clone();
1081	s.pos = core::cmp::min(pos, s.span.as_str().len());
1082	s.gen_text_pos()
1083	}
1084
1085	fn calc_curr_row(text: &str, end: usize) -> u32 {
1086	let mut row = `1`;
1087	for c in &text.as_bytes()[..end] {
1088	if *c == b'`\n`' {
1089	row += `1`;
1090	}
1091	}
1092
1093	row
1094	}
1095
1096	fn calc_curr_col(text: &str, end: usize) -> u32 {
1097	let mut col = `1`;
1098	for c in text[..end].chars().rev() {
1099	if c == '`\n`' {
1100	break;
1101	} else {
1102	col += `1`;
1103	}
1104	}
1105
1106	col
1107	}
1108	}
1109