// tokenizer.rs — source: crates/roxmltree/src/tokenizer.rs

1	use core::ops::Range;
2	use core::str;
3
4	use crate::{Error, TextPos};
5
6	type Result<T> = core::result::Result<T, Error>;
7
8	/// Extension methods for XML-subset only operations.
9	trait XmlCharExt {
10	/// Checks if the value is within the
11	/// [NameStartChar](https://www.w3.org/TR/xml/#NT-NameStartChar) range.
12	fn is_xml_name_start(&self) -> bool;
13
14	/// Checks if the value is within the
15	/// [NameChar](https://www.w3.org/TR/xml/#NT-NameChar) range.
16	fn is_xml_name(&self) -> bool;
17
18	/// Checks if the value is within the
19	/// [Char](https://www.w3.org/TR/xml/#NT-Char) range.
20	fn is_xml_char(&self) -> bool;
21	}
22
23	impl XmlCharExt for char {
24	#[inline]
25	fn is_xml_name_start(&self) -> bool {
26	// Check for ASCII first.
27	if self as u32* <= `128` {
28	return matches!(*self as u8, b'A'..=b'Z' \| b'a'..=b'z' \| b':' \| b'_');
29	}
30
31	matches!(self as u32*,
32	`0x0000C0`..=`0x0000D6`
33	\| `0x0000D8`..=`0x0000F6`
34	\| `0x0000F8`..=`0x0002FF`
35	\| `0x000370`..=`0x00037D`
36	\| `0x00037F`..=`0x001FFF`
37	\| `0x00200C`..=`0x00200D`
38	\| `0x002070`..=`0x00218F`
39	\| `0x002C00`..=`0x002FEF`
40	\| `0x003001`..=`0x00D7FF`
41	\| `0x00F900`..=`0x00FDCF`
42	\| `0x00FDF0`..=`0x00FFFD`
43	\| `0x010000`..=`0x0EFFFF`)
44	}
45
46	#[inline]
47	fn is_xml_name(&self) -> bool {
48	// Check for ASCII first.
49	if self as u32* <= `128` {
50	return (*self as u8).is_xml_name();
51	}
52
53	matches!(self as u32*, `0x0000B7`
54	\| `0x0000C0`..=`0x0000D6`
55	\| `0x0000D8`..=`0x0000F6`
56	\| `0x0000F8`..=`0x0002FF`
57	\| `0x000300`..=`0x00036F`
58	\| `0x000370`..=`0x00037D`
59	\| `0x00037F`..=`0x001FFF`
60	\| `0x00200C`..=`0x00200D`
61	\| `0x00203F`..=`0x002040`
62	\| `0x002070`..=`0x00218F`
63	\| `0x002C00`..=`0x002FEF`
64	\| `0x003001`..=`0x00D7FF`
65	\| `0x00F900`..=`0x00FDCF`
66	\| `0x00FDF0`..=`0x00FFFD`
67	\| `0x010000`..=`0x0EFFFF`)
68	}
69
70	#[inline]
71	fn is_xml_char(&self) -> bool {
72	// Does not check for surrogate code points U+D800-U+DFFF,
73	// since that check was performed by Rust when the `&str` was constructed.
74	if (self as u32*) < `0x20` {
75	return (*self as u8).is_xml_space();
76	}
77
78	!matches!(self as u32*, `0xFFFF` \| `0xFFFE`)
79	}
80	}
81
/// Byte-level counterparts of the XML character classes, valid for ASCII input only.
trait XmlByteExt {
    /// Checks if byte is a space.
    ///
    /// `[ \r\n\t]`
    fn is_xml_space(&self) -> bool;

    /// Checks if byte is within the ASCII subset of the
    /// [NameChar](https://www.w3.org/TR/xml/#NT-NameChar) range.
    fn is_xml_name(&self) -> bool;
}

impl XmlByteExt for u8 {
    #[inline]
    fn is_xml_space(&self) -> bool {
        matches!(*self, b' ' | b'\t' | b'\n' | b'\r')
    }

    #[inline]
    fn is_xml_name(&self) -> bool {
        matches!(*self, b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b':' | b'_' | b'-' | b'.')
    }
}
104
/// A string slice.
///
/// Like `&str`, but also contains the position in the input XML
/// from which it was parsed.
#[must_use]
#[derive(Clone, Copy)]
pub struct StrSpan<'input> {
    /// The borrowed text of this span.
    text: &'input str,
    /// Byte offset at which `text` begins in the string it was sliced from.
    start: usize,
}

impl<'input> From<&'input str> for StrSpan<'input> {
    /// Wraps a whole string; the resulting span starts at offset 0.
    #[inline]
    fn from(text: &'input str) -> Self {
        StrSpan { text, start: 0 }
    }
}

impl<'input> StrSpan<'input> {
    /// Creates a span over `text[start..end]` that remembers `start` as its offset.
    ///
    /// # Panics
    ///
    /// Panics if `start..end` is not a valid char-boundary range of `text`.
    #[inline]
    pub fn from_substr(text: &str, start: usize, end: usize) -> StrSpan<'_> {
        debug_assert!(start <= end);
        StrSpan {
            text: &text[start..end],
            start,
        }
    }

    /// Returns the byte range this span occupies in the string it was sliced from.
    #[inline]
    pub fn range(&self) -> Range<usize> {
        self.start..(self.start + self.text.len())
    }

    /// Returns the text of the span.
    #[inline]
    pub fn as_str(&self) -> &'input str {
        self.text
    }

    /// Returns `text[start..end]`, where both offsets are relative to this span's text.
    ///
    /// # Panics
    ///
    /// Panics if `start..end` is not a valid char-boundary range of the span's text.
    #[inline]
    fn slice_region(&self, start: usize, end: usize) -> &'input str {
        &self.text[start..end]
    }
}
148
149	pub enum Token<'input> {
150	// <?target content?>
151	ProcessingInstruction(&'input str, Option<&'input str>, Range<usize>),
152
153	// <!-- text -->
154	Comment(&'input str, Range<usize>),
155
156	// <!ENTITY ns_extend "http://test.com">
157	EntityDeclaration(&'input str, StrSpan<'input>),
158
159	// <ns:elem
160	ElementStart(&'input str, &'input str, usize),
161
162	// ns:attr="value"
163	Attribute(Range<usize>, u16, u8, &'input str, &'input str, StrSpan<'input>),
164
165	ElementEnd(ElementEnd<'input>, Range<usize>),
166
167	// Contains text between elements including whitespaces.
168	// Basically everything between `>` and `<`.
169	// Except `]]>`, which is not allowed and will lead to an error.
170	Text(&'input str, Range<usize>),
171
172	// <![CDATA[text]]>
173	Cdata(&'input str, Range<usize>),
174	}
175
/// `ElementEnd` token.
#[derive(Clone, Copy)]
pub enum ElementEnd<'input> {
    /// Indicates `>`
    Open,
    /// Indicates `</ns:name>`; carries the prefix (empty when absent) and the local name.
    Close(&'input str, &'input str),
    /// Indicates `/>`
    Empty,
}
186
187	pub trait XmlEvents<'input> {
188	fn token(&mut self, token: Token<'input>) -> Result<()>;
189	}
190
191	// document ::= prolog element Misc*
192	pub fn parse<'input>(
193	text: &'input str,
194	allow_dtd: bool,
195	events: &mut dyn XmlEvents<'input>,
196	) -> Result<()> {
197	let s = &mut Stream::new(text);
198
199	// Skip UTF-8 BOM.
200	if s.starts_with(&[`0xEF`, `0xBB`, `0xBF`]) {
201	s.advance(`3`);
202	}
203
204	if s.starts_with(b"<?xml ") {
205	parse_declaration(s)?;
206	}
207
208	parse_misc(s, events)?;
209
210	s.skip_spaces();
211	if s.starts_with(b"<!DOCTYPE") {
212	if !allow_dtd {
213	return Err(Error::DtdDetected);
214	}
215
216	parse_doctype(s, events)?;
217	parse_misc(s, events)?;
218	}
219
220	s.skip_spaces();
221	if s.curr_byte().ok() == Some(b'<') {
222	parse_element(s, events)?;
223	}
224
225	parse_misc(s, events)?;
226
227	if !s.at_end() {
228	return Err(Error::UnknownToken(s.gen_text_pos()));
229	}
230
231	Ok(())
232	}
233
234	// Misc ::= Comment \| PI \| S
235	fn parse_misc<'input>(s: &mut Stream<'input>, events: &mut dyn XmlEvents<'input>) -> Result<()> {
236	while !s.at_end() {
237	s.skip_spaces();
238	if s.starts_with(text:b"<!--") {
239	parse_comment(s, events)?;
240	} else if s.starts_with(text:b"<?") {
241	parse_pi(s, events)?;
242	} else {
243	break;
244	}
245	}
246
247	Ok(())
248	}
249
250	// XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
251	//
252	// We don't actually return a token for the XML declaration and only validate it.
253	fn parse_declaration(s: &mut Stream) -> Result<()> {
254	fn consume_spaces(s: &mut Stream) -> Result<()> {
255	if s.starts_with_space() {
256	s.skip_spaces();
257	} else if !s.starts_with(b"?>") && !s.at_end() {
258	return Err(Error::InvalidChar2(
259	"a whitespace",
260	s.curr_byte_unchecked(),
261	s.gen_text_pos(),
262	));
263	}
264
265	Ok(())
266	}
267
268	s.advance(`5`); // <?xml
269	consume_spaces(s)?;
270
271	// The `version` "attribute" is mandatory.
272	if !s.starts_with(b"version") {
273	// Will trigger the InvalidString error, which is what we want.
274	return s.skip_string(b"version");
275	}
276	let _ = parse_attribute(s)?;
277	consume_spaces(s)?;
278
279	if s.starts_with(b"encoding") {
280	let _ = parse_attribute(s)?;
281	consume_spaces(s)?;
282	}
283
284	if s.starts_with(b"standalone") {
285	let _ = parse_attribute(s)?;
286	}
287
288	s.skip_spaces();
289	s.skip_string(b"?>")?;
290
291	Ok(())
292	}
293
294	// '<!--' ((Char - '-') \| ('-' (Char - '-'))) '-->'*
295	fn parse_comment<'input>(s: &mut Stream<'input>, events: &mut dyn XmlEvents<'input>) -> Result<()> {
296	let start: usize = s.pos();
297	s.advance(`4`);
298	let text: &'input str = s.consume_chars(\|s: &Stream<'_>, c: char\| !(c == '-' && s.starts_with(text:b"-->")))?;
299	s.skip_string(text:b"-->")?;
300
301	if text.contains("--") {
302	return Err(Error::InvalidComment(s.gen_text_pos_from(pos:start)));
303	}
304
305	if text.ends_with('-') {
306	return Err(Error::InvalidComment(s.gen_text_pos_from(pos:start)));
307	}
308
309	let range: Range = s.range_from(start);
310	events.token(Token::Comment(text, range))?;
311
312	Ok(())
313	}
314
315	// PI ::= '<?' PITarget (S (Char - (Char* '?>' Char)))? '?>'
316	// PITarget ::= Name - (('X' \| 'x') ('M' \| 'm') ('L' \| 'l'))
317	fn parse_pi<'input>(s: &mut Stream<'input>, events: &mut dyn XmlEvents<'input>) -> Result<()> {
318	if s.starts_with(text:b"<?xml ") {
319	return Err(Error::UnexpectedDeclaration(s.gen_text_pos()));
320	}
321
322	let start: usize = s.pos();
323	s.advance(`2`);
324	let target: &'input str = s.consume_name()?;
325	s.skip_spaces();
326	let content: &'input str = s.consume_chars(\|s: &Stream<'_>, c: char\| !(c == '?' && s.starts_with(text:b"?>")))?;
327	let content: Option<&'input str> = if !content.is_empty() {
328	Some(content)
329	} else {
330	None
331	};
332
333	s.skip_string(text:b"?>")?;
334
335	let range: Range = s.range_from(start);
336	events.token(Token::ProcessingInstruction(target, content, range))?;
337	Ok(())
338	}
339
340	fn parse_doctype<'input>(s: &mut Stream<'input>, events: &mut dyn XmlEvents<'input>) -> Result<()> {
341	let start = s.pos();
342	parse_doctype_start(s)?;
343	s.skip_spaces();
344
345	if s.curr_byte() == Ok(b'>') {
346	s.advance(`1`);
347	return Ok(());
348	}
349
350	s.advance(`1`); // [
351	while !s.at_end() {
352	s.skip_spaces();
353	if s.starts_with(b"<!ENTITY") {
354	parse_entity_decl(s, events)?;
355	} else if s.starts_with(b"<!--") {
356	parse_comment(s, events)?;
357	} else if s.starts_with(b"<?") {
358	parse_pi(s, events)?;
359	} else if s.starts_with(b"]") {
360	// DTD ends with ']' S? '>', therefore we have to skip possible spaces.
361	s.advance(`1`);
362	s.skip_spaces();
363	match s.curr_byte() {
364	Ok(b'>') => {
365	s.advance(`1`);
366	break;
367	}
368	Ok(c) => {
369	return Err(Error::InvalidChar2("'>'", c, s.gen_text_pos()));
370	}
371	Err(_) => {
372	return Err(Error::UnexpectedEndOfStream);
373	}
374	}
375	} else if s.starts_with(b"<!ELEMENT")
376	\|\| s.starts_with(b"<!ATTLIST")
377	\|\| s.starts_with(b"<!NOTATION")
378	{
379	if consume_decl(s).is_err() {
380	let pos = s.gen_text_pos_from(start);
381	return Err(Error::UnknownToken(pos));
382	}
383	} else {
384	return Err(Error::UnknownToken(s.gen_text_pos()));
385	}
386	}
387
388	Ok(())
389	}
390
391	// doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? ('[' intSubset ']' S?)? '>'
392	fn parse_doctype_start(s: &mut Stream) -> Result<()> {
393	s.advance(`9`);
394
395	s.consume_spaces()?;
396	s.skip_name()?;
397	s.skip_spaces();
398
399	let _ = parse_external_id(s)?;
400	s.skip_spaces();
401
402	let c: u8 = s.curr_byte()?;
403	if c != b'[' && c != b'>' {
404	return Err(Error::InvalidChar2("'[' or '>'", c, s.gen_text_pos()));
405	}
406
407	Ok(())
408	}
409
410	// ExternalID ::= 'SYSTEM' S SystemLiteral \| 'PUBLIC' S PubidLiteral S SystemLiteral
411	fn parse_external_id(s: &mut Stream) -> Result<bool> {
412	let v = if s.starts_with(b"SYSTEM") \|\| s.starts_with(b"PUBLIC") {
413	let start = s.pos();
414	s.advance(`6`);
415	let id = s.slice_back(start);
416
417	s.consume_spaces()?;
418	let quote = s.consume_quote()?;
419	let _ = s.consume_bytes(\|c\| c != quote);
420	s.consume_byte(quote)?;
421
422	if id == "SYSTEM" {
423	// Ok
424	} else {
425	s.consume_spaces()?;
426	let quote = s.consume_quote()?;
427	let _ = s.consume_bytes(\|c\| c != quote);
428	s.consume_byte(quote)?;
429	}
430
431	`true`
432	} else {
433	`false`
434	};
435
436	Ok(v)
437	}
438
439	// EntityDecl ::= GEDecl \| PEDecl
440	// GEDecl ::= '<!ENTITY' S Name S EntityDef S? '>'
441	// PEDecl ::= '<!ENTITY' S '%' S Name S PEDef S? '>'
442	fn parse_entity_decl<'input>(
443	s: &mut Stream<'input>,
444	events: &mut dyn XmlEvents<'input>,
445	) -> Result<()> {
446	s.advance(`8`);
447	s.consume_spaces()?;
448
449	let is_ge: bool = if s.try_consume_byte(b'%') {
450	s.consume_spaces()?;
451	`false`
452	} else {
453	`true`
454	};
455
456	let name: &'input str = s.consume_name()?;
457	s.consume_spaces()?;
458	if let Some(definition: StrSpan<'_>) = parse_entity_def(s, is_ge)? {
459	events.token(Token::EntityDeclaration(name, definition))?;
460	}
461	s.skip_spaces();
462	s.consume_byte(b'>')?;
463
464	Ok(())
465	}
466
467	// EntityDef ::= EntityValue \| (ExternalID NDataDecl?)
468	// PEDef ::= EntityValue \| ExternalID
469	// EntityValue ::= '"' ([^%&"] \| PEReference \| Reference) '"' \| "'" ([^%&']*
470	// \| PEReference \| Reference) "'"*
471	// ExternalID ::= 'SYSTEM' S SystemLiteral \| 'PUBLIC' S PubidLiteral S SystemLiteral
472	// NDataDecl ::= S 'NDATA' S Name
473	fn parse_entity_def<'input>(
474	s: &mut Stream<'input>,
475	is_ge: bool,
476	) -> Result<Option<StrSpan<'input>>> {
477	let c = s.curr_byte()?;
478	match c {
479	b'"' \| b'`\'`' => {
480	let quote = s.consume_quote()?;
481	let start = s.pos();
482	s.skip_bytes(\|c\| c != quote);
483	let value = s.slice_back_span(start);
484	s.consume_byte(quote)?;
485	Ok(Some(value))
486	}
487	b'S' \| b'P' => {
488	if parse_external_id(s)? {
489	if is_ge {
490	s.skip_spaces();
491	if s.starts_with(b"NDATA") {
492	s.advance(`5`);
493	s.consume_spaces()?;
494	s.skip_name()?;
495	// TODO: NDataDecl is not supported
496	}
497	}
498
499	Ok(None)
500	} else {
501	Err(Error::InvalidExternalID(s.gen_text_pos()))
502	}
503	}
504	_ => {
505	let pos = s.gen_text_pos();
506	Err(Error::InvalidChar2("a quote, SYSTEM or PUBLIC", c, pos))
507	}
508	}
509	}
510
511	fn consume_decl(s: &mut Stream) -> Result<()> {
512	s.skip_bytes(\|c: u8\| c != b'>');
513	s.consume_byte(b'>')?;
514	Ok(())
515	}
516
517	// element ::= EmptyElemTag \| STag content ETag
518	// '<' Name (S Attribute) S? '>'*
519	fn parse_element<'input>(s: &mut Stream<'input>, events: &mut dyn XmlEvents<'input>) -> Result<()> {
520	let start = s.pos();
521	s.advance(`1`); // <
522	let (prefix, local) = s.consume_qname()?;
523	events.token(Token::ElementStart(prefix, local, start))?;
524
525	let mut open = `false`;
526	while !s.at_end() {
527	let has_space = s.starts_with_space();
528	s.skip_spaces();
529	let start = s.pos();
530	match s.curr_byte()? {
531	b'/' => {
532	s.advance(`1`);
533	s.consume_byte(b'>')?;
534	let range = s.range_from(start);
535	events.token(Token::ElementEnd(ElementEnd::Empty, range))?;
536	break;
537	}
538	b'>' => {
539	s.advance(`1`);
540	let range = s.range_from(start);
541	events.token(Token::ElementEnd(ElementEnd::Open, range))?;
542	open = `true`;
543	break;
544	}
545	_ => {
546	// An attribute must be preceded with a whitespace.
547	if !has_space {
548	// Will always trigger an error. Which is what we want.
549	s.consume_spaces()?;
550	}
551
552	// Manual inlining of `parse_attribute` for performance.
553	// We cannot mark `parse_attribute` as `#[inline(always)]`
554	// because it will blow up the binary size.
555	let (prefix, local) = s.consume_qname()?;
556	let qname_end = s.pos();
557	let qname_len = u16::try_from(qname_end - start).unwrap_or(u16::MAX);
558	s.consume_eq()?;
559	let eq_len = u8::try_from(s.pos() - qname_end).unwrap_or(u8::MAX);
560	let quote = s.consume_quote()?;
561	let quote_c = quote as char;
562	// The attribute value must not contain the < character.
563	let value_start = s.pos();
564	s.skip_chars(\|_, c\| c != quote_c && c != '<')?;
565	let value = s.slice_back_span(value_start);
566	s.consume_byte(quote)?;
567	let end = s.pos();
568	events.token(Token::Attribute(start..end, qname_len, eq_len, prefix, local, value))?;
569	}
570	}
571	}
572
573	if open {
574	parse_content(s, events)?;
575	}
576
577	Ok(())
578	}
579
580	// Attribute ::= Name Eq AttValue
581	fn parse_attribute<'input>(
582	s: &mut Stream<'input>,
583	) -> Result<(&'input str, &'input str, StrSpan<'input>)> {
584	let (prefix: &'input str, local: &'input str) = s.consume_qname()?;
585	s.consume_eq()?;
586	let quote: u8 = s.consume_quote()?;
587	let quote_c: char = quote as char;
588	// The attribute value must not contain the < character.
589	let value_start: usize = s.pos();
590	s.skip_chars(\|_, c: char\| c != quote_c && c != '<')?;
591	let value: StrSpan<'_> = s.slice_back_span(pos:value_start);
592	s.consume_byte(quote)?;
593	Ok((prefix, local, value))
594	}
595
596	// content ::= CharData? ((element \| Reference \| CDSect \| PI \| Comment) CharData?)*
597	pub fn parse_content<'input>(
598	s: &mut Stream<'input>,
599	events: &mut dyn XmlEvents<'input>,
600	) -> Result<()> {
601	while !s.at_end() {
602	match s.curr_byte() {
603	Ok(b'<') => match s.next_byte() {
604	Ok(b'!') => {
605	if s.starts_with(b"<!--") {
606	parse_comment(s, events)?;
607	} else if s.starts_with(b"<![CDATA[") {
608	parse_cdata(s, events)?;
609	} else {
610	return Err(Error::UnknownToken(s.gen_text_pos()));
611	}
612	}
613	Ok(b'?') => parse_pi(s, events)?,
614	Ok(b'/') => {
615	parse_close_element(s, events)?;
616	break;
617	}
618	Ok(_) => parse_element(s, events)?,
619	Err(_) => return Err(Error::UnknownToken(s.gen_text_pos())),
620	},
621	Ok(_) => parse_text(s, events)?,
622	Err(_) => return Err(Error::UnknownToken(s.gen_text_pos())),
623	}
624	}
625
626	Ok(())
627	}
628
629	// CDSect ::= CDStart CData CDEnd
630	// CDStart ::= '<![CDATA['
631	// CData ::= (Char - (Char* ']]>' Char))
632	// CDEnd ::= ']]>'
633	fn parse_cdata<'input>(s: &mut Stream<'input>, events: &mut dyn XmlEvents<'input>) -> Result<()> {
634	let start: usize = s.pos();
635	s.advance(`9`); // <![CDATA[
636	let text: &'input str = s.consume_chars(\|s: &Stream<'_>, c: char\| !(c == ']' && s.starts_with(text:b"]]>")))?;
637	s.skip_string(text:b"]]>")?;
638	let range: Range = s.range_from(start);
639	events.token(Token::Cdata(text, range))?;
640	Ok(())
641	}
642
643	// '</' Name S? '>'
644	fn parse_close_element<'input>(
645	s: &mut Stream<'input>,
646	events: &mut dyn XmlEvents<'input>,
647	) -> Result<()> {
648	let start: usize = s.pos();
649	s.advance(`2`); // </
650
651	let (prefix: &'input str, tag_name: &'input str) = s.consume_qname()?;
652	s.skip_spaces();
653	s.consume_byte(b'>')?;
654
655	let range: Range = s.range_from(start);
656	events.token(Token::ElementEnd(
657	ElementEnd::Close(prefix, tag_name),
658	range,
659	))?;
660	Ok(())
661	}
662
663	fn parse_text<'input>(s: &mut Stream<'input>, events: &mut dyn XmlEvents<'input>) -> Result<()> {
664	let start: usize = s.pos();
665	let text: &'input str = s.consume_chars(\|_, c: char\| c != '<')?;
666
667	// According to the spec, `]]>` must not appear inside a Text node.
668	// https://www.w3.org/TR/xml/#syntax
669	//
670	// Search for `>` first, since it's a bit faster than looking for `]]>`.
671	if text.contains('>') && text.contains("]]>") {
672	return Err(Error::InvalidCharacterData(s.gen_text_pos()));
673	}
674
675	let range: Range = s.range_from(start);
676	events.token(Token::Text(text, range))?;
677	Ok(())
678	}
679
/// Representation of the [Reference](https://www.w3.org/TR/xml/#NT-Reference) value.
#[derive(Clone, Copy)]
pub enum Reference<'input> {
    /// An entity reference; carries the entity name without `&` and `;`.
    ///
    /// <https://www.w3.org/TR/xml/#NT-EntityRef>
    Entity(&'input str),

    /// A character reference, already resolved to the character it denotes.
    ///
    /// <https://www.w3.org/TR/xml/#NT-CharRef>
    Char(char),
}
693
694	#[derive(Clone)]
695	pub struct Stream<'input> {
696	pos: usize,
697	end: usize,
698	span: StrSpan<'input>,
699	}
700
701	impl<'input> Stream<'input> {
702	#[inline]
703	pub fn new(text: &'input str) -> Self {
704	Stream {
705	pos: `0`,
706	end: text.len(),
707	span: text.into(),
708	}
709	}
710
711	#[inline]
712	pub fn from_substr(text: &'input str, fragment: Range<usize>) -> Self {
713	Stream {
714	pos: fragment.start,
715	end: fragment.end,
716	span: text.into(),
717	}
718	}
719
720	#[inline]
721	pub fn pos(&self) -> usize {
722	self.pos
723	}
724
725	#[inline]
726	pub fn at_end(&self) -> bool {
727	self.pos >= self.end
728	}
729
730	#[inline]
731	pub fn curr_byte(&self) -> Result<u8> {
732	if self.at_end() {
733	return Err(Error::UnexpectedEndOfStream);
734	}
735
736	Ok(self.curr_byte_unchecked())
737	}
738
739	#[inline]
740	pub fn curr_byte_unchecked(&self) -> u8 {
741	self.span.text.as_bytes()[self.pos]
742	}
743
744	#[inline]
745	fn next_byte(&self) -> Result<u8> {
746	if self.pos + `1` >= self.end {
747	return Err(Error::UnexpectedEndOfStream);
748	}
749
750	Ok(self.span.as_str().as_bytes()[self.pos + `1`])
751	}
752
753	#[inline]
754	pub fn advance(&mut self, n: usize) {
755	debug_assert!(self.pos + n <= self.end);
756	self.pos += n;
757	}
758
759	#[inline]
760	fn starts_with(&self, text: &[u8]) -> bool {
761	self.span.text.as_bytes()[self.pos..self.end].starts_with(text)
762	}
763
764	fn consume_byte(&mut self, c: u8) -> Result<()> {
765	let curr = self.curr_byte()?;
766	if curr != c {
767	return Err(Error::InvalidChar(c, curr, self.gen_text_pos()));
768	}
769
770	self.advance(`1`);
771	Ok(())
772	}
773
774	// Unlike `consume_byte()` will not return any errors.
775	fn try_consume_byte(&mut self, c: u8) -> bool {
776	match self.curr_byte() {
777	Ok(b) if b == c => {
778	self.advance(`1`);
779	`true`
780	}
781	_ => `false`,
782	}
783	}
784
785	fn skip_string(&mut self, text: &'static [u8]) -> Result<()> {
786	if !self.starts_with(text) {
787	let pos = self.gen_text_pos();
788
789	// Assume that all input `text` are valid UTF-8 strings, so unwrap is safe.
790	let expected = str::from_utf8(text).unwrap();
791
792	return Err(Error::InvalidString(expected, pos));
793	}
794
795	self.advance(text.len());
796	Ok(())
797	}
798
799	#[inline]
800	fn consume_bytes<F: Fn(u8) -> bool>(&mut self, f: F) -> &'input str {
801	let start = self.pos;
802	self.skip_bytes(f);
803	self.slice_back(start)
804	}
805
806	fn skip_bytes<F: Fn(u8) -> bool>(&mut self, f: F) {
807	while !self.at_end() && f(self.curr_byte_unchecked()) {
808	self.advance(`1`);
809	}
810	}
811
812	#[inline]
813	fn consume_chars<F>(&mut self, f: F) -> Result<&'input str>
814	where
815	F: Fn(&Stream, char) -> bool,
816	{
817	let start = self.pos;
818	self.skip_chars(f)?;
819	Ok(self.slice_back(start))
820	}
821
822	#[inline]
823	fn skip_chars<F>(&mut self, f: F) -> Result<()>
824	where
825	F: Fn(&Stream, char) -> bool,
826	{
827	for c in self.chars() {
828	if !c.is_xml_char() {
829	return Err(Error::NonXmlChar(c, self.gen_text_pos()));
830	} else if f(self, c) {
831	self.advance(c.len_utf8());
832	} else {
833	break;
834	}
835	}
836
837	Ok(())
838	}
839
840	#[inline]
841	fn chars(&self) -> str::Chars<'input> {
842	self.span.as_str()[self.pos..self.end].chars()
843	}
844
845	#[inline]
846	fn slice_back(&self, pos: usize) -> &'input str {
847	self.span.slice_region(pos, self.pos)
848	}
849
850	#[inline]
851	fn slice_back_span(&self, pos: usize) -> StrSpan<'input> {
852	StrSpan::from_substr(self.span.text, pos, self.pos)
853	}
854
855	#[inline]
856	fn range_from(&self, start: usize) -> Range<usize> {
857	start..self.pos
858	}
859
860	#[inline]
861	fn skip_spaces(&mut self) {
862	while self.starts_with_space() {
863	self.advance(`1`);
864	}
865	}
866
867	#[inline]
868	fn starts_with_space(&self) -> bool {
869	!self.at_end() && self.curr_byte_unchecked().is_xml_space()
870	}
871
872	// Like `skip_spaces()`, but checks that first char is actually a space.
873	fn consume_spaces(&mut self) -> Result<()> {
874	if self.at_end() {
875	return Err(Error::UnexpectedEndOfStream);
876	}
877
878	if !self.starts_with_space() {
879	return Err(Error::InvalidChar2(
880	"a whitespace",
881	self.curr_byte_unchecked(),
882	self.gen_text_pos(),
883	));
884	}
885
886	self.skip_spaces();
887	Ok(())
888	}
889
890	/// Consumes according to: <https://www.w3.org/TR/xml/#NT-Reference>
891	pub fn try_consume_reference(&mut self) -> Option<Reference<'input>> {
892	let start = self.pos();
893
894	// Consume reference on a substream.
895	let mut s = self.clone();
896	let result = s.consume_reference()?;
897
898	// If the current data is a reference than advance the current stream
899	// by number of bytes read by substream.
900	self.advance(s.pos() - start);
901	Some(result)
902	}
903
904	#[inline(never)]
905	fn consume_reference(&mut self) -> Option<Reference<'input>> {
906	if !self.try_consume_byte(b'&') {
907	return None;
908	}
909
910	let reference = if self.try_consume_byte(b'#') {
911	let (value, radix) = if self.try_consume_byte(b'x') {
912	let value =
913	self.consume_bytes(\|c\| matches!(c, b'0'..=b'9' \| b'A'..=b'F' \| b'a'..=b'f'));
914	(value, `16`)
915	} else {
916	let value = self.consume_bytes(\|c\| c.is_ascii_digit());
917	(value, `10`)
918	};
919
920	let n = u32::from_str_radix(value, radix).ok()?;
921
922	let c = char::from_u32(n).unwrap_or('`\u{FFFD}`');
923	if !c.is_xml_char() {
924	return None;
925	}
926
927	Reference::Char(c)
928	} else {
929	let name = self.consume_name().ok()?;
930	match name {
931	"quot" => Reference::Char('"'),
932	"amp" => Reference::Char('&'),
933	"apos" => Reference::Char('`\'`'),
934	"lt" => Reference::Char('<'),
935	"gt" => Reference::Char('>'),
936	_ => Reference::Entity(name),
937	}
938	};
939
940	self.consume_byte(b';').ok()?;
941
942	Some(reference)
943	}
944
945	/// Consumes according to: <https://www.w3.org/TR/xml/#NT-Name>
946	fn consume_name(&mut self) -> Result<&'input str> {
947	let start = self.pos();
948	self.skip_name()?;
949
950	let name = self.slice_back(start);
951	if name.is_empty() {
952	return Err(Error::InvalidName(self.gen_text_pos_from(start)));
953	}
954
955	Ok(name)
956	}
957
958	/// The same as `consume_name()`, but does not return a consumed name.
959	fn skip_name(&mut self) -> Result<()> {
960	let start = self.pos();
961	let mut iter = self.chars();
962	if let Some(c) = iter.next() {
963	if c.is_xml_name_start() {
964	self.advance(c.len_utf8());
965	} else {
966	return Err(Error::InvalidName(self.gen_text_pos_from(start)));
967	}
968	}
969
970	for c in iter {
971	if c.is_xml_name() {
972	self.advance(c.len_utf8());
973	} else {
974	break;
975	}
976	}
977
978	Ok(())
979	}
980
981	/// Consumes a qualified XML name and returns it.
982	///
983	/// Consumes according to: <https://www.w3.org/TR/xml-names/#ns-qualnames>
984	#[inline(never)]
985	fn consume_qname(&mut self) -> Result<(&'input str, &'input str)> {
986	let start = self.pos();
987
988	let mut splitter = None;
989
990	while !self.at_end() {
991	// Check for ASCII first for performance reasons.
992	let b = self.curr_byte_unchecked();
993	if b < `128` {
994	if b == b':' {
995	if splitter.is_none() {
996	splitter = Some(self.pos());
997	self.advance(`1`);
998	} else {
999	// Multiple `:` is an error.
1000	return Err(Error::InvalidName(self.gen_text_pos_from(start)));
1001	}
1002	} else if b.is_xml_name() {
1003	self.advance(`1`);
1004	} else {
1005	break;
1006	}
1007	} else {
1008	// Fallback to Unicode code point.
1009	match self.chars().nth(`0`) {
1010	Some(c) if c.is_xml_name() => {
1011	self.advance(c.len_utf8());
1012	}
1013	_ => break,
1014	}
1015	}
1016	}
1017
1018	let (prefix, local) = if let Some(splitter) = splitter {
1019	let prefix = self.span.slice_region(start, splitter);
1020	let local = self.slice_back(splitter + `1`);
1021	(prefix, local)
1022	} else {
1023	let local = self.slice_back(start);
1024	// Slice an empty prefix. This way we can preserve attribute start position.
1025	(self.span.slice_region(start, start), local)
1026	};
1027
1028	// Prefix must start with a `NameStartChar`.
1029	if let Some(c) = prefix.chars().nth(`0`) {
1030	if !c.is_xml_name_start() {
1031	return Err(Error::InvalidName(self.gen_text_pos_from(start)));
1032	}
1033	}
1034
1035	// Local name must start with a `NameStartChar`.
1036	if let Some(c) = local.chars().nth(`0`) {
1037	if !c.is_xml_name_start() {
1038	return Err(Error::InvalidName(self.gen_text_pos_from(start)));
1039	}
1040	} else {
1041	// If empty - error.
1042	return Err(Error::InvalidName(self.gen_text_pos_from(start)));
1043	}
1044
1045	Ok((prefix, local))
1046	}
1047
1048	fn consume_eq(&mut self) -> Result<()> {
1049	self.skip_spaces();
1050	self.consume_byte(b'=')?;
1051	self.skip_spaces();
1052
1053	Ok(())
1054	}
1055
1056	fn consume_quote(&mut self) -> Result<u8> {
1057	let c = self.curr_byte()?;
1058	if c == b'`\'`' \|\| c == b'"' {
1059	self.advance(`1`);
1060	Ok(c)
1061	} else {
1062	Err(Error::InvalidChar2("a quote", c, self.gen_text_pos()))
1063	}
1064	}
1065
1066	/// Calculates a current absolute position.
1067	///
1068	/// This operation is very expensive. Use only for errors.
1069	#[inline(never)]
1070	pub fn gen_text_pos(&self) -> TextPos {
1071	let text = self.span.as_str();
1072	let end = self.pos;
1073
1074	let row = Self::calc_curr_row(text, end);
1075	let col = Self::calc_curr_col(text, end);
1076	TextPos::new(row, col)
1077	}
1078
1079	/// Calculates an absolute position at `pos`.
1080	///
1081	/// This operation is very expensive. Use only for errors.
1082	#[inline(never)]
1083	pub fn gen_text_pos_from(&self, pos: usize) -> TextPos {
1084	let mut s = self.clone();
1085	s.pos = core::cmp::min(pos, s.span.as_str().len());
1086	s.gen_text_pos()
1087	}
1088
1089	fn calc_curr_row(text: &str, end: usize) -> u32 {
1090	let mut row = `1`;
1091	for c in &text.as_bytes()[..end] {
1092	if *c == b'`\n`' {
1093	row += `1`;
1094	}
1095	}
1096
1097	row
1098	}
1099
1100	fn calc_curr_col(text: &str, end: usize) -> u32 {
1101	let mut col = `1`;
1102	for c in text[..end].chars().rev() {
1103	if c == '`\n`' {
1104	break;
1105	} else {
1106	col += `1`;
1107	}
1108	}
1109
1110	col
1111	}
1112	}
1113