1//! Contains simple lexer for XML documents.
2//!
//! This module is for internal use. Use the `reader` module to do parsing.
4
5use crate::common::{is_name_char, is_whitespace_char, is_xml10_char, is_xml11_char, Position, TextPosition};
6use crate::reader::error::SyntaxError;
7use crate::reader::{Error, ErrorKind};
8use crate::util::{CharReader, Encoding};
9use std::collections::VecDeque;
10use std::io::Read;
11use std::{fmt, result};
12
13use super::ParserConfig2;
14
/// `Token` represents a single lexeme of an XML document. These lexemes
/// are used to perform actual parsing.
#[derive(Copy, Clone, PartialEq, Eq, Debug)]
pub(crate) enum Token {
    /// `<?`
    ProcessingInstructionStart,
    /// `?>`
    ProcessingInstructionEnd,
    /// `<!DOCTYPE…`
    DoctypeStart,
    /// `<`
    OpeningTagStart,
    /// `</`
    ClosingTagStart,
    /// `>`
    TagEnd,
    /// `/>`
    EmptyTagEnd,
    /// `<!--`
    CommentStart,
    /// `-->`
    CommentEnd,
    /// Any non-special character, including whitespace (the lexer emits
    /// e.g. `Character(' ')` and `Character('\t')` — see the test module).
    Character(char),
    /// `=`
    EqualsSign,
    /// `'`
    SingleQuote,
    /// `"`
    DoubleQuote,
    /// `<![CDATA[`
    CDataStart,
    /// `]]>`
    CDataEnd,
    /// `&`
    ReferenceStart,
    /// `;`
    ReferenceEnd,
    /// `<!` of `ENTITY`
    MarkupDeclarationStart,
    /// End of file
    Eof,
}
58
59impl fmt::Display for Token {
60 #[cold]
61 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
62 match *self {
63 Token::Character(c) => c.fmt(f),
64 other => match other {
65 Token::OpeningTagStart => "<",
66 Token::ProcessingInstructionStart => "<?",
67 Token::DoctypeStart => "<!DOCTYPE",
68 Token::ClosingTagStart => "</",
69 Token::CommentStart => "<!--",
70 Token::CDataStart => "<![CDATA[",
71 Token::TagEnd => ">",
72 Token::EmptyTagEnd => "/>",
73 Token::ProcessingInstructionEnd => "?>",
74 Token::CommentEnd => "-->",
75 Token::CDataEnd => "]]>",
76 Token::ReferenceStart => "&",
77 Token::ReferenceEnd => ";",
78 Token::EqualsSign => "=",
79 Token::SingleQuote => "'",
80 Token::DoubleQuote => "\"",
81 Token::MarkupDeclarationStart => "<!",
82 Token::Eof | Token::Character(_) => {
83 debug_assert!(false);
84 ""
85 },
86 }.fmt(f),
87 }
88 }
89}
90
91impl Token {
92 pub const fn as_static_str(self) -> Option<&'static str> {
93 match self {
94 Self::OpeningTagStart => Some("<"),
95 Self::ProcessingInstructionStart => Some("<?"),
96 Self::DoctypeStart => Some("<!DOCTYPE"),
97 Self::ClosingTagStart => Some("</"),
98 Self::CommentStart => Some("<!--"),
99 Self::CDataStart => Some("<![CDATA["),
100 Self::TagEnd => Some(">"),
101 Self::EmptyTagEnd => Some("/>"),
102 Self::ProcessingInstructionEnd => Some("?>"),
103 Self::CommentEnd => Some("-->"),
104 Self::CDataEnd => Some("]]>"),
105 Self::ReferenceStart => Some("&"),
106 Self::ReferenceEnd => Some(";"),
107 Self::EqualsSign => Some("="),
108 Self::SingleQuote => Some("'"),
109 Self::DoubleQuote => Some("\""),
110 _ => None
111 }
112 }
113
114 // using String.push_str(token.to_string()) is simply way too slow
115 pub fn push_to_string(self, target: &mut String) {
116 match self {
117 Self::Character(c) => {
118 debug_assert!(is_xml10_char(c) || is_xml11_char(c));
119 target.push(c);
120 },
121 _ => if let Some(s) = self.as_static_str() {
122 target.push_str(s);
123 }
124 }
125 }
126}
127
#[derive(Copy, Clone)]
enum State {
    /// Default state
    Normal,
    /// Triggered on '<'
    TagStarted,
    /// Triggered on '<!'
    CommentOrCDataOrDoctypeStarted,
    /// Triggered on '<!-'
    CommentStarted,
    /// Triggered on '<!D' up to '<!DOCTYPE'
    DoctypeStarted(DoctypeStartedSubstate),
    /// Other items like `<!ELEMENT` in DTD
    InsideMarkupDeclaration,
    /// Triggered after `DoctypeStarted` to handle sub elements
    InsideDoctype,
    /// Triggered on '<![' up to '<![CDATA'
    CDataStarted(CDataStartedSubstate),
    /// Triggered on '?'
    ProcessingInstructionClosing,
    /// Triggered on '/'
    EmptyTagClosing,
    /// Triggered on '-' up to '--'
    CommentClosing(ClosingSubstate),
    /// Triggered on ']' up to ']]' inside CDATA
    CDataClosing(ClosingSubstate),
    /// Triggered on ']' up to ']]' outside CDATA
    InvalidCDataClosing(ClosingSubstate),
    /// After `<!--`
    InsideComment,
    /// After `<![CDATA[`
    InsideCdata,
    /// After `<?`
    InsideProcessingInstruction,
    /// Inside a quoted string in a markup declaration, e.g. `<!ENTITY "here">`
    InsideMarkupDeclarationQuotedString(QuoteStyle),
}
165
/// Which quote character opened the current quoted string.
#[derive(Copy, Clone, Eq, PartialEq)]
enum QuoteStyle {
    Single, Double
}

/// Progress through a two-character closing sequence (`--` or `]]`).
#[derive(Copy, Clone)]
enum ClosingSubstate {
    First, Second
}

/// Progress through the characters of `<!DOCTYPE`.
#[derive(Copy, Clone)]
#[allow(clippy::upper_case_acronyms)]
enum DoctypeStartedSubstate {
    D, DO, DOC, DOCT, DOCTY, DOCTYP
}

/// Progress through the characters of `<![CDATA[` (`E` = just after `<![`).
#[derive(Copy, Clone)]
#[allow(clippy::upper_case_acronyms)]
enum CDataStartedSubstate {
    E, C, CD, CDA, CDAT, CDATA
}
187
/// `Result` represents lexing result. It is either a token or an error message.
/// By default `T` is "maybe a token": `None` means "more input is needed".
pub(crate) type Result<T = Option<Token>, E = Error> = result::Result<T, E>;

/// Helps to set up a dispatch table for lexing large unambiguous tokens like
/// `<![CDATA[` or `<!DOCTYPE `.
///
/// Each intermediate substate advances to the next substate on its expected
/// character and reports an error (via `handle_error`, passing the chunk
/// matched so far) on anything else; the final substate evaluates `$e`.
macro_rules! dispatch_on_enum_state(
    ($_self:ident, $s:expr, $c:expr, $is:expr,
     $($st:ident; $stc:expr ; $next_st:ident ; $chunk:expr),+;
     $end_st:ident ; $end_c:expr ; $end_chunk:expr ; $e:expr) => (
        match $s {
            $(
            $st => match $c {
                $stc => Ok($_self.move_to($is($next_st))),
                _ => $_self.handle_error($chunk, $c)
            },
            )+
            $end_st => match $c {
                $end_c => $e,
                _ => $_self.handle_error($end_chunk, $c)
            }
        }
    )
);
211
/// `Lexer` is a lexer for XML documents, which implements pull API.
///
/// Main method is `next_token` which accepts an `std::io::Read` instance and
/// tries to read the next lexeme from it.
///
/// In test builds the `skip_errors` flag (set via `disable_errors`) makes
/// invalid lexemes be re-emitted as plain `Character` tokens; in normal
/// builds invalid lexemes are always reported as `Err` objects.
pub(crate) struct Lexer {
    /// Current state-machine state.
    st: State,
    /// Decoding reader that yields `char`s from the raw byte stream.
    reader: CharReader,
    /// Position of the start of the token currently being produced.
    pos: TextPosition,
    /// Position of the read head (the next char to be consumed).
    head_pos: TextPosition,
    /// Pushed-back / reparsed characters; always consumed before new input.
    char_queue: VecDeque<char>,
    /// Default state to go back to after a tag end (may be `InsideDoctype`)
    normal_state: State,
    /// True while in the middle of lexing a multi-character token.
    inside_token: bool,
    /// True once EOF has been reported, so the stream is not read again.
    eof_handled: bool,
    /// Current entity-expansion nesting depth (guards runaway expansion).
    reparse_depth: u8,
    #[cfg(test)]
    skip_errors: bool,

    max_entity_expansion_depth: u8,
    max_entity_expansion_length: usize,
}
238
impl Position for Lexer {
    #[inline]
    /// Returns the position of the last token produced by the lexer
    /// (not the read head, which may be further ahead).
    fn position(&self) -> TextPosition { self.pos }
}
244
245impl Lexer {
246 /// Returns a new lexer with default state.
247 pub(crate) fn new(config: &ParserConfig2) -> Self {
248 Self {
249 reader: CharReader::new(),
250 pos: TextPosition::new(),
251 head_pos: TextPosition::new(),
252 char_queue: VecDeque::with_capacity(4), // TODO: check size
253 st: State::Normal,
254 normal_state: State::Normal,
255 inside_token: false,
256 eof_handled: false,
257 reparse_depth: 0,
258 #[cfg(test)]
259 skip_errors: false,
260
261 max_entity_expansion_depth: config.max_entity_expansion_depth,
262 max_entity_expansion_length: config.max_entity_expansion_length,
263 }
264 }
265
    /// Returns the encoding currently assumed by the underlying reader.
    pub(crate) fn encoding(&self) -> Encoding {
        self.reader.encoding
    }

    /// Overrides the encoding used to decode the underlying byte stream.
    pub(crate) fn set_encoding(&mut self, encoding: Encoding) {
        self.reader.encoding = encoding;
    }

    /// Disables error handling so `next_token` will return `Some(Chunk(..))`
    /// upon invalid lexeme with this lexeme content.
    #[cfg(test)] fn disable_errors(&mut self) { self.skip_errors = true; }

    /// Reset the eof handled flag of the lexer.
    #[inline]
    pub fn reset_eof_handled(&mut self) { self.eof_handled = false; }
281
    /// Tries to read the next token from the buffer.
    ///
    /// It is possible to pass different instances of `BufReader` each time
    /// this method is called, but the resulting behavior is undefined in this case.
    ///
    /// Return value:
    /// * `Err(reason) where reason: reader::Error` - when an error occurs;
    /// * `Ok(Token::Eof)` - upon end of stream is reached;
    /// * `Ok(token) where token: Token` - in case a complete-token has been read from the stream.
    pub fn next_token<B: Read>(&mut self, b: &mut B) -> Result<Token> {
        // Already reached end of buffer
        if self.eof_handled {
            return Ok(Token::Eof);
        }

        if !self.inside_token {
            // Remember where this token starts so errors point at its first char.
            self.pos = self.head_pos;
            self.inside_token = true;
        }

        // Check if we have saved a char or two for ourselves
        while let Some(c) = self.char_queue.pop_front() {
            if let Some(t) = self.dispatch_char(c)? {
                self.inside_token = false;
                return Ok(t);
            }
        }
        // if char_queue is empty, all circular reparsing is done
        self.reparse_depth = 0;
        while let Some(c) = self.reader.next_char_from(b)? {
            // Queued chars above deliberately do not advance the head position;
            // only freshly read input does.
            if c == '\n' {
                self.head_pos.new_line();
            } else {
                self.head_pos.advance(1);
            }

            if let Some(t) = self.dispatch_char(c)? {
                self.inside_token = false;
                return Ok(t);
            }
        }

        self.end_of_stream()
    }
326
    /// Resolves the current state once the input is exhausted: either a final
    /// token for states with a pending valid character, an error for states
    /// in the middle of a construct, or `Eof` in the normal state.
    #[inline(never)]
    fn end_of_stream(&mut self) -> Result<Token> {
        // Handle end of stream
        self.eof_handled = true;
        self.pos = self.head_pos;
        match self.st {
            // EOF inside a CDATA section has its own dedicated error.
            State::InsideCdata | State::CDataClosing(_) => Err(self.error(SyntaxError::UnclosedCdata)),
            // All states that are mid-construct: generic unexpected-EOF error.
            State::TagStarted | State::CommentOrCDataOrDoctypeStarted |
            State::CommentStarted | State::CDataStarted(_)| State::DoctypeStarted(_) |
            State::CommentClosing(ClosingSubstate::Second) |
            State::InsideComment | State::InsideMarkupDeclaration |
            State::InsideProcessingInstruction | State::ProcessingInstructionClosing |
            State::InsideDoctype | State::InsideMarkupDeclarationQuotedString(_) =>
                Err(self.error(SyntaxError::UnexpectedEof)),
            // A pending '/' or '-' that never became '/>' or '-->' is just a character.
            State::EmptyTagClosing =>
                Ok(Token::Character('/')),
            State::CommentClosing(ClosingSubstate::First) =>
                Ok(Token::Character('-')),
            State::InvalidCDataClosing(ClosingSubstate::First) =>
                Ok(Token::Character(']')),
            // "]]" at EOF: emit the first ']' now and push the second back so the
            // next call can emit it too (hence eof_handled is reset).
            State::InvalidCDataClosing(ClosingSubstate::Second) => {
                self.eof_handled = false;
                Ok(self.move_to_with_unread(State::Normal, &[']'], Token::Character(']')))
            },
            State::Normal => Ok(Token::Eof),
        }
    }
354
    /// Builds a syntax error at the position of the current token.
    #[cold]
    #[allow(clippy::needless_pass_by_value)]
    fn error(&self, e: SyntaxError) -> Error {
        Error {
            pos: self.position(),
            kind: ErrorKind::Syntax(e.to_cow()),
        }
    }

    /// Routes one character to the handler for the current state.
    ///
    /// Returns `Ok(Some(token))` when the character completes a token,
    /// `Ok(None)` when more input is needed, `Err` on a syntax error.
    #[inline(never)]
    fn dispatch_char(&mut self, c: char) -> Result {
        match self.st {
            State::Normal => Ok(self.normal(c)),
            State::TagStarted => self.tag_opened(c),
            State::EmptyTagClosing => Ok(Some(self.empty_element_closing(c))),
            State::CommentOrCDataOrDoctypeStarted => self.comment_or_cdata_or_doctype_started(c),
            State::InsideCdata => Ok(self.inside_cdata(c)),
            State::CDataStarted(s) => self.cdata_started(c, s),
            State::InsideComment => Ok(self.inside_comment_state(c)),
            State::CommentStarted => self.comment_started(c),
            State::InsideProcessingInstruction => Ok(self.inside_processing_instruction(c)),
            State::ProcessingInstructionClosing => Ok(Some(self.processing_instruction_closing(c))),
            State::CommentClosing(s) => self.comment_closing(c, s),
            State::CDataClosing(s) => Ok(self.cdata_closing(c, s)),
            State::InsideDoctype => Ok(self.inside_doctype(c)),
            State::DoctypeStarted(s) => self.doctype_started(c, s),
            State::InvalidCDataClosing(s) => Ok(self.invalid_cdata_closing(c, s)),
            State::InsideMarkupDeclaration => self.markup_declaration(c),
            State::InsideMarkupDeclarationQuotedString(q) => Ok(Some(self.markup_declaration_string(c, q))),
        }
    }

    /// Switches to a new state without emitting a token.
    #[inline]
    fn move_to(&mut self, st: State) -> Option<Token> {
        self.st = st;
        None
    }

    /// Switches to a new state and emits `token`.
    #[inline]
    fn move_to_with(&mut self, st: State, token: Token) -> Token {
        self.st = st;
        token
    }

    /// Switches to a new state, also making it the default "normal" state
    /// (used when entering and leaving the DOCTYPE), and emits `token`.
    #[inline]
    fn move_to_and_reset_normal(&mut self, st: State, token: Token) -> Token {
        self.normal_state = st;
        self.st = st;
        token
    }

    /// Switches to a new state, pushes `cs` back onto the queue so they are
    /// re-examined before fresh input, and emits `token`.
    fn move_to_with_unread(&mut self, st: State, cs: &[char], token: Token) -> Token {
        for c in cs.iter().rev().copied() {
            self.char_queue.push_front(c);
        }
        self.move_to_with(st, token)
    }
412
413 pub(crate) fn reparse(&mut self, markup: &str) -> Result<()> {
414 if markup.is_empty() {
415 return Ok(());
416 }
417
418 self.reparse_depth += 1;
419 if self.reparse_depth > self.max_entity_expansion_depth || self.char_queue.len() > self.max_entity_expansion_length {
420 return Err(self.error(SyntaxError::EntityTooBig));
421 }
422
423 self.eof_handled = false;
424 self.char_queue.reserve(markup.len());
425 for c in markup.chars().rev() {
426 self.char_queue.push_front(c);
427 }
428
429 Ok(())
430 }
431
    /// Reports an invalid lexeme: `chunk` is the prefix matched so far and `c`
    /// the offending character. In test builds with `skip_errors` set, the
    /// whole sequence is instead replayed as plain `Character` tokens.
    fn handle_error(&mut self, chunk: &'static str, c: char) -> Result {
        debug_assert!(!chunk.is_empty());

        #[cfg(test)]
        if self.skip_errors {
            // Emit the chunk's first char now; queue the rest plus `c`.
            let mut chars = chunk.chars();
            let first = chars.next().unwrap_or('\0');
            self.char_queue.extend(chars);
            self.char_queue.push_back(c);
            return Ok(Some(self.move_to_with(State::Normal, Token::Character(first))));
        }
        Err(self.error(SyntaxError::UnexpectedTokenBefore(chunk, c)))
    }
445
446 /// Encountered a char
447 fn normal(&mut self, c: char) -> Option<Token> {
448 match c {
449 '<' => self.move_to(State::TagStarted),
450 '>' => Some(Token::TagEnd),
451 '/' => self.move_to(State::EmptyTagClosing),
452 '=' => Some(Token::EqualsSign),
453 '"' => Some(Token::DoubleQuote),
454 '\'' => Some(Token::SingleQuote),
455 ']' => self.move_to(State::InvalidCDataClosing(ClosingSubstate::First)),
456 '&' => Some(Token::ReferenceStart),
457 ';' => Some(Token::ReferenceEnd),
458 _ => Some(Token::Character(c))
459 }
460 }
461
462 fn inside_cdata(&mut self, c: char) -> Option<Token> {
463 match c {
464 ']' => self.move_to(State::CDataClosing(ClosingSubstate::First)),
465 _ => Some(Token::Character(c)),
466 }
467 }
468
469 fn inside_processing_instruction(&mut self, c: char) -> Option<Token> {
470 // These tokens are used by `<?xml?>` parser
471 match c {
472 '?' => self.move_to(State::ProcessingInstructionClosing),
473 '<' => Some(Token::OpeningTagStart),
474 '>' => Some(Token::TagEnd),
475 '/' => Some(Token::ClosingTagStart),
476 '=' => Some(Token::EqualsSign),
477 '"' => Some(Token::DoubleQuote),
478 '\'' => Some(Token::SingleQuote),
479 '&' => Some(Token::ReferenceStart),
480 ';' => Some(Token::ReferenceEnd),
481 _ => Some(Token::Character(c))
482 }
483 }
484
485 fn inside_comment_state(&mut self, c: char) -> Option<Token> {
486 match c {
487 '-' => self.move_to(State::CommentClosing(ClosingSubstate::First)),
488 _ => Some(Token::Character(c)),
489 }
490 }
491
492 /// Encountered '<'
493 fn tag_opened(&mut self, c: char) -> Result {
494 match c {
495 '?' => Ok(Some(self.move_to_with(State::InsideProcessingInstruction, Token::ProcessingInstructionStart))),
496 '/' => Ok(Some(self.move_to_with(self.normal_state, Token::ClosingTagStart))),
497 '!' => Ok(self.move_to(State::CommentOrCDataOrDoctypeStarted)),
498 _ if is_whitespace_char(c) => Ok(Some(self.move_to_with_unread(self.normal_state, &[c], Token::OpeningTagStart))),
499 _ if is_name_char(c) => Ok(Some(self.move_to_with_unread(self.normal_state, &[c], Token::OpeningTagStart))),
500 _ => self.handle_error("<", c)
501 }
502 }
503
    /// Encountered '<!'
    fn comment_or_cdata_or_doctype_started(&mut self, c: char) -> Result {
        match c {
            '-' => Ok(self.move_to(State::CommentStarted)),
            '[' => Ok(self.move_to(State::CDataStarted(CDataStartedSubstate::E))),
            'D' => Ok(self.move_to(State::DoctypeStarted(DoctypeStartedSubstate::D))),
            // Markup declarations (presumably `<!ENTITY`, `<!ATTLIST`,
            // `<!NOTATION` — TODO confirm) are only valid inside the DTD.
            'E' | 'A' | 'N' if matches!(self.normal_state, State::InsideDoctype) => {
                Ok(Some(self.move_to_with_unread(State::InsideMarkupDeclaration, &[c], Token::MarkupDeclarationStart)))
            },
            _ => self.handle_error("<!", c),
        }
    }
516
517 /// Encountered '<!-'
518 fn comment_started(&mut self, c: char) -> Result {
519 match c {
520 '-' => Ok(Some(self.move_to_with(State::InsideComment, Token::CommentStart))),
521 _ => self.handle_error("<!-", c),
522 }
523 }
524
    /// Encountered '<!['
    ///
    /// Walks through the fixed `<![CDATA[` prefix one substate per character;
    /// any mismatch reports the chunk matched so far.
    fn cdata_started(&mut self, c: char, s: CDataStartedSubstate) -> Result {
        use self::CDataStartedSubstate::{C, CD, CDA, CDAT, CDATA, E};
        dispatch_on_enum_state!(self, s, c, State::CDataStarted,
            E     ; 'C' ; C     ; "<![",
            C     ; 'D' ; CD    ; "<![C",
            CD    ; 'A' ; CDA   ; "<![CD",
            CDA   ; 'T' ; CDAT  ; "<![CDA",
            CDAT  ; 'A' ; CDATA ; "<![CDAT";
            CDATA ; '[' ; "<![CDATA" ; Ok(Some(self.move_to_with(State::InsideCdata, Token::CDataStart)))
        )
    }
537
    /// Encountered '<!…' that isn't DOCTYPE or CDATA
    fn markup_declaration(&mut self, c: char) -> Result {
        match c {
            // A nested '<' inside a markup declaration is not valid.
            '<' => self.handle_error("<!", c),
            '>' => Ok(Some(self.move_to_with(self.normal_state, Token::TagEnd))),
            '&' => Ok(Some(Token::ReferenceStart)),
            ';' => Ok(Some(Token::ReferenceEnd)),
            // Quotes switch into the quoted-string state so that '>' inside
            // quotes does not terminate the declaration.
            '"' => Ok(Some(self.move_to_with(State::InsideMarkupDeclarationQuotedString(QuoteStyle::Double), Token::DoubleQuote))),
            '\'' => Ok(Some(self.move_to_with(State::InsideMarkupDeclarationQuotedString(QuoteStyle::Single), Token::SingleQuote))),
            _ => Ok(Some(Token::Character(c))),
        }
    }
550
551 fn markup_declaration_string(&mut self, c: char, q: QuoteStyle) -> Token {
552 match c {
553 '"' if q == QuoteStyle::Double => self.move_to_with(State::InsideMarkupDeclaration, Token::DoubleQuote),
554 '\'' if q == QuoteStyle::Single => self.move_to_with(State::InsideMarkupDeclaration, Token::SingleQuote),
555 _ => Token::Character(c),
556 }
557 }
558
    /// Encountered '<!D'
    ///
    /// Walks through the fixed `<!DOCTYPE` prefix one substate per character;
    /// on success the "normal" state becomes `InsideDoctype` until the
    /// matching '>' is seen.
    fn doctype_started(&mut self, c: char, s: DoctypeStartedSubstate) -> Result {
        use self::DoctypeStartedSubstate::{D, DO, DOC, DOCT, DOCTY, DOCTYP};
        dispatch_on_enum_state!(self, s, c, State::DoctypeStarted,
            D      ; 'O' ; DO     ; "<!D",
            DO     ; 'C' ; DOC    ; "<!DO",
            DOC    ; 'T' ; DOCT   ; "<!DOC",
            DOCT   ; 'Y' ; DOCTY  ; "<!DOCT",
            DOCTY  ; 'P' ; DOCTYP ; "<!DOCTY";
            DOCTYP ; 'E' ; "<!DOCTYP" ; Ok(Some(self.move_to_and_reset_normal(State::InsideDoctype, Token::DoctypeStart)))
        )
    }
571
572 /// State used while awaiting the closing bracket for the <!DOCTYPE tag
573 fn inside_doctype(&mut self, c: char) -> Option<Token> {
574 match c {
575 '>' => Some(self.move_to_and_reset_normal(State::Normal, Token::TagEnd)),
576 '<' => self.move_to(State::TagStarted),
577 '&' => Some(Token::ReferenceStart),
578 ';' => Some(Token::ReferenceEnd),
579 '"' => Some(Token::DoubleQuote),
580 '\'' => Some(Token::SingleQuote),
581 _ => Some(Token::Character(c)),
582 }
583 }
584
585 /// Encountered '?'
586 fn processing_instruction_closing(&mut self, c: char) -> Token {
587 match c {
588 '>' => self.move_to_with(self.normal_state, Token::ProcessingInstructionEnd),
589 _ => self.move_to_with_unread(State::InsideProcessingInstruction, &[c], Token::Character('?')),
590 }
591 }
592
593 /// Encountered '/'
594 fn empty_element_closing(&mut self, c: char) -> Token {
595 match c {
596 '>' => self.move_to_with(self.normal_state, Token::EmptyTagEnd),
597 _ => self.move_to_with_unread(self.normal_state, &[c], Token::Character('/')),
598 }
599 }
600
601 /// Encountered '-'
602 fn comment_closing(&mut self, c: char, s: ClosingSubstate) -> Result {
603 match s {
604 ClosingSubstate::First => match c {
605 '-' => Ok(self.move_to(State::CommentClosing(ClosingSubstate::Second))),
606 _ => Ok(Some(self.move_to_with_unread(State::InsideComment, &[c], Token::Character('-')))),
607 },
608 ClosingSubstate::Second => match c {
609 '>' => Ok(Some(self.move_to_with(self.normal_state, Token::CommentEnd))),
610 // double dash not followed by a greater-than is a hard error inside comment
611 _ => self.handle_error("--", c),
612 },
613 }
614 }
615
616 /// Encountered ']'
617 fn cdata_closing(&mut self, c: char, s: ClosingSubstate) -> Option<Token> {
618 match s {
619 ClosingSubstate::First => match c {
620 ']' => self.move_to(State::CDataClosing(ClosingSubstate::Second)),
621 _ => Some(self.move_to_with_unread(State::InsideCdata, &[c], Token::Character(']'))),
622 },
623 ClosingSubstate::Second => match c {
624 '>' => Some(self.move_to_with(State::Normal, Token::CDataEnd)),
625 _ => Some(self.move_to_with_unread(State::InsideCdata, &[']', c], Token::Character(']'))),
626 },
627 }
628 }
629
630 /// Encountered ']'
631 fn invalid_cdata_closing(&mut self, c: char, s: ClosingSubstate) -> Option<Token> {
632 match s {
633 ClosingSubstate::First => match c {
634 ']' => self.move_to(State::InvalidCDataClosing(ClosingSubstate::Second)),
635 _ => Some(self.move_to_with_unread(State::Normal, &[c], Token::Character(']'))),
636 },
637 ClosingSubstate::Second => match c {
638 '>' => Some(self.move_to_with(self.normal_state, Token::CDataEnd)),
639 _ => Some(self.move_to_with_unread(State::Normal, &[']', c], Token::Character(']'))),
640 },
641 }
642 }
643}
644
#[cfg(test)]
mod tests {
    use crate::{common::Position, reader::ParserConfig2};
    use std::io::{BufReader, Cursor};

    use super::{Lexer, Token};

    /// Asserts that the lexer yields exactly the given sequence of tokens.
    macro_rules! assert_oks(
        (for $lex:ident and $buf:ident ; $($e:expr)+) => ({
            $(
                assert_eq!(Ok($e), $lex.next_token(&mut $buf));
            )+
        })
    );

    /// Asserts that the next lexer result is an error at the given row/column.
    /// (The `$s` message argument is currently unused.)
    macro_rules! assert_err(
        (for $lex:ident and $buf:ident expect row $r:expr ; $c:expr, $s:expr) => ({
            let err = $lex.next_token(&mut $buf);
            assert!(err.is_err());
            let err = err.unwrap_err();
            assert_eq!($r as u64, err.position().row);
            assert_eq!($c as u64, err.position().column);
        })
    );

    /// Asserts that the lexer reports end of input.
    macro_rules! assert_none(
        (for $lex:ident and $buf:ident) => (
            assert_eq!(Ok(Token::Eof), $lex.next_token(&mut $buf))
        )
    );

    /// Builds a default-configured lexer plus a buffered reader over `s`.
    fn make_lex_and_buf(s: &str) -> (Lexer, BufReader<Cursor<Vec<u8>>>) {
        (Lexer::new(&ParserConfig2::default()), BufReader::new(Cursor::new(s.to_owned().into_bytes())))
    }

    #[test]
    fn tricky_pi() {
        let (mut lex, mut buf) = make_lex_and_buf(r"<?x<!-- &??><x>");

        assert_oks!(for lex and buf ;
            Token::ProcessingInstructionStart
            Token::Character('x')
            Token::OpeningTagStart // processing of <?xml?> relies on the extra tokens
            Token::Character('!')
            Token::Character('-')
            Token::Character('-')
            Token::Character(' ')
            Token::ReferenceStart
            Token::Character('?')
            Token::ProcessingInstructionEnd
            Token::OpeningTagStart
            Token::Character('x')
            Token::TagEnd
        );
        assert_none!(for lex and buf);
    }

    #[test]
    fn reparser() {
        let (mut lex, mut buf) = make_lex_and_buf(r"&a;");

        assert_oks!(for lex and buf ;
            Token::ReferenceStart
            Token::Character('a')
            Token::ReferenceEnd
        );
        // Injected entity expansion is lexed before any further stream input.
        lex.reparse("<hi/>").unwrap();
        assert_oks!(for lex and buf ;
            Token::OpeningTagStart
            Token::Character('h')
            Token::Character('i')
            Token::EmptyTagEnd
        );
        assert_none!(for lex and buf);
    }

    #[test]
    fn simple_lexer_test() {
        // Note: the raw string contains a literal tab between `d` and `</b>`.
        let (mut lex, mut buf) = make_lex_and_buf(
            r#"<a p='q'> x<b z="y">d	</b></a><p/> <?nm ?> <!-- a c --> &nbsp;"#
        );

        assert_oks!(for lex and buf ;
            Token::OpeningTagStart
            Token::Character('a')
            Token::Character(' ')
            Token::Character('p')
            Token::EqualsSign
            Token::SingleQuote
            Token::Character('q')
            Token::SingleQuote
            Token::TagEnd
            Token::Character(' ')
            Token::Character('x')
            Token::OpeningTagStart
            Token::Character('b')
            Token::Character(' ')
            Token::Character('z')
            Token::EqualsSign
            Token::DoubleQuote
            Token::Character('y')
            Token::DoubleQuote
            Token::TagEnd
            Token::Character('d')
            Token::Character('\t')
            Token::ClosingTagStart
            Token::Character('b')
            Token::TagEnd
            Token::ClosingTagStart
            Token::Character('a')
            Token::TagEnd
            Token::OpeningTagStart
            Token::Character('p')
            Token::EmptyTagEnd
            Token::Character(' ')
            Token::ProcessingInstructionStart
            Token::Character('n')
            Token::Character('m')
            Token::Character(' ')
            Token::ProcessingInstructionEnd
            Token::Character(' ')
            Token::CommentStart
            Token::Character(' ')
            Token::Character('a')
            Token::Character(' ')
            Token::Character('c')
            Token::Character(' ')
            Token::CommentEnd
            Token::Character(' ')
            Token::ReferenceStart
            Token::Character('n')
            Token::Character('b')
            Token::Character('s')
            Token::Character('p')
            Token::ReferenceEnd
        );
        assert_none!(for lex and buf);
    }

    #[test]
    fn special_chars_test() {
        let (mut lex, mut buf) = make_lex_and_buf(
            r"?x!+ // -| ]z]]"
        );

        assert_oks!(for lex and buf ;
            Token::Character('?')
            Token::Character('x')
            Token::Character('!')
            Token::Character('+')
            Token::Character(' ')
            Token::Character('/')
            Token::Character('/')
            Token::Character(' ')
            Token::Character('-')
            Token::Character('|')
            Token::Character(' ')
            Token::Character(']')
            Token::Character('z')
            Token::Character(']')
            Token::Character(']')
        );
        assert_none!(for lex and buf);
    }

    #[test]
    fn cdata_test() {
        let (mut lex, mut buf) = make_lex_and_buf(
            r"<a><![CDATA[x y ?]]> </a>"
        );

        assert_oks!(for lex and buf ;
            Token::OpeningTagStart
            Token::Character('a')
            Token::TagEnd
            Token::CDataStart
            Token::Character('x')
            Token::Character(' ')
            Token::Character('y')
            Token::Character(' ')
            Token::Character('?')
            Token::CDataEnd
            Token::Character(' ')
            Token::ClosingTagStart
            Token::Character('a')
            Token::TagEnd
        );
        assert_none!(for lex and buf);
    }

    #[test]
    fn cdata_closers_test() {
        let (mut lex, mut buf) = make_lex_and_buf(
            r"<![CDATA[] > ]> ]]><!---->]]<a>"
        );

        assert_oks!(for lex and buf ;
            Token::CDataStart
            Token::Character(']')
            Token::Character(' ')
            Token::Character('>')
            Token::Character(' ')
            Token::Character(']')
            Token::Character('>')
            Token::Character(' ')
            Token::CDataEnd
            Token::CommentStart
            Token::CommentEnd
            Token::Character(']')
            Token::Character(']')
            Token::OpeningTagStart
            Token::Character('a')
            Token::TagEnd
        );
        assert_none!(for lex and buf);
    }

    #[test]
    fn doctype_test() {
        let (mut lex, mut buf) = make_lex_and_buf(
            r"<a><!DOCTYPE ab xx z> "
        );
        assert_oks!(for lex and buf ;
            Token::OpeningTagStart
            Token::Character('a')
            Token::TagEnd
            Token::DoctypeStart
            Token::Character(' ')
            Token::Character('a')
            Token::Character('b')
            Token::Character(' ')
            Token::Character('x')
            Token::Character('x')
            Token::Character(' ')
            Token::Character('z')
            Token::TagEnd
            Token::Character(' ')
        );
        assert_none!(for lex and buf);
    }

    #[test]
    fn tricky_comments() {
        // A single '-' inside a comment must not terminate it.
        let (mut lex, mut buf) = make_lex_and_buf(
            r"<a><!-- C ->--></a>"
        );
        assert_oks!(for lex and buf ;
            Token::OpeningTagStart
            Token::Character('a')
            Token::TagEnd
            Token::CommentStart
            Token::Character(' ')
            Token::Character('C')
            Token::Character(' ')
            Token::Character('-')
            Token::Character('>')
            Token::CommentEnd
            Token::ClosingTagStart
            Token::Character('a')
            Token::TagEnd
        );
        assert_none!(for lex and buf);
    }

    #[test]
    fn doctype_with_internal_subset_test() {
        // '>' inside the quoted string must not close the declaration.
        let (mut lex, mut buf) = make_lex_and_buf(
            r#"<a><!DOCTYPE ab[<!ELEMENT ba ">>>"> ]> "#
        );
        assert_oks!(for lex and buf ;
            Token::OpeningTagStart
            Token::Character('a')
            Token::TagEnd
            Token::DoctypeStart
            Token::Character(' ')
            Token::Character('a')
            Token::Character('b')
            Token::Character('[')
            Token::MarkupDeclarationStart
            Token::Character('E')
            Token::Character('L')
            Token::Character('E')
            Token::Character('M')
            Token::Character('E')
            Token::Character('N')
            Token::Character('T')
            Token::Character(' ')
            Token::Character('b')
            Token::Character('a')
            Token::Character(' ')
            Token::DoubleQuote
            Token::Character('>')
            Token::Character('>')
            Token::Character('>')
            Token::DoubleQuote
            Token::TagEnd
            Token::Character(' ')
            Token::Character(']')
            Token::TagEnd
            Token::Character(' ')
        );
        assert_none!(for lex and buf);
    }

    #[test]
    fn doctype_internal_pi_comment() {
        let (mut lex, mut buf) = make_lex_and_buf(
            "<!DOCTYPE a [\n<!ELEMENT l ANY> <!-- <?non?>--> <?pi > ?> \n]>"
        );
        assert_oks!(for lex and buf ;
            Token::DoctypeStart
            Token::Character(' ')
            Token::Character('a')
            Token::Character(' ')
            Token::Character('[')
            Token::Character('\n')
            Token::MarkupDeclarationStart
            Token::Character('E')
            Token::Character('L')
            Token::Character('E')
            Token::Character('M')
            Token::Character('E')
            Token::Character('N')
            Token::Character('T')
            Token::Character(' ')
            Token::Character('l')
            Token::Character(' ')
            Token::Character('A')
            Token::Character('N')
            Token::Character('Y')
            Token::TagEnd
            Token::Character(' ')
            Token::CommentStart
            Token::Character(' ')
            Token::Character('<')
            Token::Character('?')
            Token::Character('n')
            Token::Character('o')
            Token::Character('n')
            Token::Character('?')
            Token::Character('>')
            Token::CommentEnd
            Token::Character(' ')
            Token::ProcessingInstructionStart
            Token::Character('p')
            Token::Character('i')
            Token::Character(' ')
            Token::TagEnd // not really
            Token::Character(' ')
            Token::ProcessingInstructionEnd
            Token::Character(' ')
            Token::Character('\n')
            Token::Character(']')
            Token::TagEnd // DTD
        );
        assert_none!(for lex and buf);
    }

    #[test]
    fn end_of_stream_handling_ok() {
        // Pending single characters at EOF are emitted as `Character` tokens.
        macro_rules! eof_check(
            ($data:expr ; $token:expr) => ({
                let (mut lex, mut buf) = make_lex_and_buf($data);
                assert_oks!(for lex and buf ; $token);
                assert_none!(for lex and buf);
            })
        );
        eof_check!("?" ; Token::Character('?'));
        eof_check!("/" ; Token::Character('/'));
        eof_check!("-" ; Token::Character('-'));
        eof_check!("]" ; Token::Character(']'));
        eof_check!("]" ; Token::Character(']'));
        eof_check!("]" ; Token::Character(']'));
    }

    #[test]
    fn end_of_stream_handling_error() {
        // EOF in the middle of a multi-character construct is an error at
        // the position right after the consumed prefix.
        macro_rules! eof_check(
            ($data:expr; $r:expr, $c:expr) => ({
                let (mut lex, mut buf) = make_lex_and_buf($data);
                assert_err!(for lex and buf expect row $r ; $c, "Unexpected end of stream");
                assert_none!(for lex and buf);
            })
        );
        eof_check!("<" ; 0, 1);
        eof_check!("<!" ; 0, 2);
        eof_check!("<!-" ; 0, 3);
        eof_check!("<![" ; 0, 3);
        eof_check!("<![C" ; 0, 4);
        eof_check!("<![CD" ; 0, 5);
        eof_check!("<![CDA" ; 0, 6);
        eof_check!("<![CDAT" ; 0, 7);
        eof_check!("<![CDATA" ; 0, 8);
    }

    #[test]
    fn error_in_comment_or_cdata_prefix() {
        let (mut lex, mut buf) = make_lex_and_buf("<!x");
        assert_err!(for lex and buf expect row 0 ; 0,
            "Unexpected token '<!' before 'x'"
        );

        // With errors disabled, the invalid lexeme is replayed as characters.
        let (mut lex, mut buf) = make_lex_and_buf("<!x");
        lex.disable_errors();
        assert_oks!(for lex and buf ;
            Token::Character('<')
            Token::Character('!')
            Token::Character('x')
        );
        assert_none!(for lex and buf);
    }

    #[test]
    fn error_in_comment_started() {
        let (mut lex, mut buf) = make_lex_and_buf("<!-\t");
        assert_err!(for lex and buf expect row 0 ; 0,
            "Unexpected token '<!-' before '\t'"
        );

        let (mut lex, mut buf) = make_lex_and_buf("<!-\t");
        lex.disable_errors();
        assert_oks!(for lex and buf ;
            Token::Character('<')
            Token::Character('!')
            Token::Character('-')
            Token::Character('\t')
        );
        assert_none!(for lex and buf);
    }

    #[test]
    fn error_in_comment_two_dashes_not_at_end() {
        // Forced into the comment state: "--" not followed by '>' is an error.
        let (mut lex, mut buf) = make_lex_and_buf("--x");
        lex.st = super::State::InsideComment;
        assert_err!(for lex and buf expect row 0; 0,
            "Unexpected token '--' before 'x'"
        );

        // In the normal state the same input is just three characters.
        let (mut lex, mut buf) = make_lex_and_buf("--x");
        assert_oks!(for lex and buf ;
            Token::Character('-')
            Token::Character('-')
            Token::Character('x')
        );
    }

    /// Checks both the error position and the skip_errors character replay
    /// for an invalid fixed-prefix lexeme.
    macro_rules! check_case(
        ($chunk:expr, $app:expr; $data:expr; $r:expr, $c:expr, $s:expr) => ({
            let (mut lex, mut buf) = make_lex_and_buf($data);
            assert_err!(for lex and buf expect row $r ; $c, $s);

            let (mut lex, mut buf) = make_lex_and_buf($data);
            lex.disable_errors();
            for c in $chunk.chars() {
                assert_eq!(Ok(Token::Character(c)), lex.next_token(&mut buf));
            }
            assert_oks!(for lex and buf ;
                Token::Character($app)
            );
            assert_none!(for lex and buf);
        })
    );

    #[test]
    fn token_size() {
        assert_eq!(4, std::mem::size_of::<Token>());
        assert_eq!(2, std::mem::size_of::<super::State>());
    }

    #[test]
    fn error_in_cdata_started() {
        check_case!("<![",      '['; "<![["      ; 0, 0, "Unexpected token '<![' before '['");
        check_case!("<![C",     '['; "<![C["     ; 0, 0, "Unexpected token '<![C' before '['");
        check_case!("<![CD",    '['; "<![CD["    ; 0, 0, "Unexpected token '<![CD' before '['");
        check_case!("<![CDA",   '['; "<![CDA["   ; 0, 0, "Unexpected token '<![CDA' before '['");
        check_case!("<![CDAT",  '['; "<![CDAT["  ; 0, 0, "Unexpected token '<![CDAT' before '['");
        check_case!("<![CDATA", '|'; "<![CDATA|" ; 0, 0, "Unexpected token '<![CDATA' before '|'");
    }

    #[test]
    fn error_in_doctype_started() {
        check_case!("<!D",      'a'; "<!Da"      ; 0, 0, "Unexpected token '<!D' before 'a'");
        check_case!("<!DO",     'b'; "<!DOb"     ; 0, 0, "Unexpected token '<!DO' before 'b'");
        check_case!("<!DOC",    'c'; "<!DOCc"    ; 0, 0, "Unexpected token '<!DOC' before 'c'");
        check_case!("<!DOCT",   'd'; "<!DOCTd"   ; 0, 0, "Unexpected token '<!DOCT' before 'd'");
        check_case!("<!DOCTY",  'e'; "<!DOCTYe"  ; 0, 0, "Unexpected token '<!DOCTY' before 'e'");
        check_case!("<!DOCTYP", 'f'; "<!DOCTYPf" ; 0, 0, "Unexpected token '<!DOCTYP' before 'f'");
    }

    #[test]
    fn issue_98_cdata_ending_with_right_bracket() {
        let (mut lex, mut buf) = make_lex_and_buf(
            r"<![CDATA[Foo [Bar]]]>"
        );

        assert_oks!(for lex and buf ;
            Token::CDataStart
            Token::Character('F')
            Token::Character('o')
            Token::Character('o')
            Token::Character(' ')
            Token::Character('[')
            Token::Character('B')
            Token::Character('a')
            Token::Character('r')
            Token::Character(']')
            Token::CDataEnd
        );
        assert_none!(for lex and buf);
    }
}
1156