//! Contains a simple lexer for XML documents.
//!
//! This module is for internal use. Use the `xml::reader` module to do parsing.
4
5
6use crate::reader::ErrorKind;
7use crate::reader::error::SyntaxError;
8use std::collections::VecDeque;
9use std::fmt;
10use std::io::Read;
11use std::result;
12use crate::common::{is_name_char, is_whitespace_char, Position, TextPosition, is_xml10_char, is_xml11_char};
13use crate::reader::Error;
14use crate::util::{CharReader, Encoding};
15
16use super::ParserConfig2;
17
/// `Token` represents a single lexeme of an XML document. These lexemes
/// are used to perform actual parsing.
#[derive(Copy, Clone, PartialEq, Eq, Debug)]
pub(crate) enum Token {
    /// `<?`
    ProcessingInstructionStart,
    /// `?>`
    ProcessingInstructionEnd,
    /// `<!DOCTYPE`
    DoctypeStart,
    /// `<`
    OpeningTagStart,
    /// `</`
    ClosingTagStart,
    /// `>`
    TagEnd,
    /// `/>`
    EmptyTagEnd,
    /// `<!--`
    CommentStart,
    /// `-->`
    CommentEnd,
    /// Any single non-special character, including whitespace.
    Character(char),
    /// `=`
    EqualsSign,
    /// `'`
    SingleQuote,
    /// `"`
    DoubleQuote,
    /// `<![CDATA[`
    CDataStart,
    /// `]]>`
    CDataEnd,
    /// `&`
    ReferenceStart,
    /// `;`
    ReferenceEnd,
    /// `<!` of `<!ENTITY` and other markup declarations inside a DTD
    MarkupDeclarationStart,
}
59
60impl fmt::Display for Token {
61 #[cold]
62 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
63 match *self {
64 Token::Character(c) => c.fmt(f),
65 other => match other {
66 Token::OpeningTagStart => "<",
67 Token::ProcessingInstructionStart => "<?",
68 Token::DoctypeStart => "<!DOCTYPE",
69 Token::ClosingTagStart => "</",
70 Token::CommentStart => "<!--",
71 Token::CDataStart => "<![CDATA[",
72 Token::TagEnd => ">",
73 Token::EmptyTagEnd => "/>",
74 Token::ProcessingInstructionEnd => "?>",
75 Token::CommentEnd => "-->",
76 Token::CDataEnd => "]]>",
77 Token::ReferenceStart => "&",
78 Token::ReferenceEnd => ";",
79 Token::EqualsSign => "=",
80 Token::SingleQuote => "'",
81 Token::DoubleQuote => "\"",
82 Token::MarkupDeclarationStart => "<!",
83 _ => unreachable!()
84 }.fmt(f),
85 }
86 }
87}
88
89impl Token {
90 pub fn as_static_str(&self) -> Option<&'static str> {
91 match *self {
92 Token::OpeningTagStart => Some("<"),
93 Token::ProcessingInstructionStart => Some("<?"),
94 Token::DoctypeStart => Some("<!DOCTYPE"),
95 Token::ClosingTagStart => Some("</"),
96 Token::CommentStart => Some("<!--"),
97 Token::CDataStart => Some("<![CDATA["),
98 Token::TagEnd => Some(">"),
99 Token::EmptyTagEnd => Some("/>"),
100 Token::ProcessingInstructionEnd => Some("?>"),
101 Token::CommentEnd => Some("-->"),
102 Token::CDataEnd => Some("]]>"),
103 Token::ReferenceStart => Some("&"),
104 Token::ReferenceEnd => Some(";"),
105 Token::EqualsSign => Some("="),
106 Token::SingleQuote => Some("'"),
107 Token::DoubleQuote => Some("\""),
108 _ => None
109 }
110 }
111
112 // using String.push_str(token.to_string()) is simply way too slow
113 pub fn push_to_string(&self, target: &mut String) {
114 match *self {
115 Token::Character(c) => {
116 debug_assert!(is_xml10_char(c) || is_xml11_char(c));
117 target.push(c)
118 },
119 _ => if let Some(s) = self.as_static_str() {
120 target.push_str(s);
121 }
122 }
123 }
124}
125
#[derive(Copy, Clone)]
enum State {
    /// Default state
    Normal,
    /// Triggered on '<'
    TagStarted,
    /// Triggered on '<!'
    CommentOrCDataOrDoctypeStarted,
    /// Triggered on '<!-'
    CommentStarted,
    /// Triggered on '<!D' up to '<!DOCTYPE'
    DoctypeStarted(DoctypeStartedSubstate),
    /// Other items like `<!ELEMENT` in DTD
    InsideMarkupDeclaration,
    /// Triggered after DoctypeStarted to handle sub elements
    InsideDoctype,
    /// Triggered on '<![' up to '<![CDATA'
    CDataStarted(CDataStartedSubstate),
    /// Triggered on '?'
    ProcessingInstructionClosing,
    /// Triggered on '/'
    EmptyTagClosing,
    /// Triggered on '-' up to '--'
    CommentClosing(ClosingSubstate),
    /// Triggered on ']' up to ']]' inside CDATA
    CDataClosing(ClosingSubstate),
    /// Triggered on ']' up to ']]' outside CDATA
    InvalidCDataClosing(ClosingSubstate),
    /// After `<!--`
    InsideComment,
    /// After `<![CDATA[`
    InsideCdata,
    /// After `<?`
    InsideProcessingInstruction,
    /// Inside a quoted string in a markup declaration, e.g. `<!ENTITY "here">`
    InsideMarkupDeclarationQuotedString(QuoteStyle),
}
163
/// Which quote character delimits the current markup-declaration string.
#[derive(Copy, Clone, Eq, PartialEq)]
enum QuoteStyle {
    Single, Double
}
168
/// Number of closing-sequence characters seen so far (e.g. `-` vs `--`).
#[derive(Copy, Clone)]
enum ClosingSubstate {
    First, Second
}
173
/// Progress through the literal `DOCTYPE` after `<!`.
#[derive(Copy, Clone)]
enum DoctypeStartedSubstate {
    D, DO, DOC, DOCT, DOCTY, DOCTYP
}
178
/// Progress through the literal `CDATA` after `<![` (`E` = nothing matched yet).
#[derive(Copy, Clone)]
enum CDataStartedSubstate {
    E, C, CD, CDA, CDAT, CDATA
}
183
/// `Result` represents a lexing result. It is either a token (or end of
/// stream, `Ok(None)`) or an [`Error`].
pub(crate) type Result<T = Option<Token>, E = Error> = result::Result<T, E>;
186
/// Helps to set up a dispatch table for lexing large unambiguous tokens like
/// `<![CDATA[` or `<!DOCTYPE `.
///
/// Each `$st ; $stc ; $next_st ; $chunk` row means: in substate `$st` the only
/// accepted character is `$stc`, which advances to substate `$next_st`; any
/// other character reports an error for the partially-matched text `$chunk`.
/// The final row evaluates `$e` once the last expected character arrives.
macro_rules! dispatch_on_enum_state(
    ($_self:ident, $s:expr, $c:expr, $is:expr,
     $($st:ident; $stc:expr ; $next_st:ident ; $chunk:expr),+;
     $end_st:ident ; $end_c:expr ; $end_chunk:expr ; $e:expr) => (
        match $s {
            $(
            $st => match $c {
                $stc => $_self.move_to($is($next_st)),
                _ => $_self.handle_error($chunk, $c)
            },
            )+
            $end_st => match $c {
                $end_c => $e,
                _ => $_self.handle_error($end_chunk, $c)
            }
        }
    )
);
207
/// `Lexer` is a lexer for XML documents, which implements pull API.
///
/// Main method is `next_token` which accepts an `std::io::Read` instance and
/// tries to read the next lexeme from it.
///
/// In test builds only, the `skip_errors` flag (set via `disable_errors`)
/// makes invalid lexemes be replayed as plain `Character` tokens instead of
/// being reported as `Err` values.
pub(crate) struct Lexer {
    /// Current state of the lexing state machine.
    st: State,
    /// Decodes characters from the raw byte stream.
    reader: CharReader,
    /// Position where the token currently being lexed starts.
    pos: TextPosition,
    /// Position of the reading head (next character to be consumed).
    head_pos: TextPosition,
    /// Characters pushed back for re-lexing, plus reparsed entity text.
    char_queue: VecDeque<char>,
    /// Default state to go back to after a tag end (may be `InsideDoctype`)
    normal_state: State,
    /// True while in the middle of lexing a token; keeps `pos` stable.
    inside_token: bool,
    /// True once end of input has been reported to the caller.
    eof_handled: bool,
    /// Current nesting depth of `reparse` calls (entity expansion).
    reparse_depth: u8,
    #[cfg(test)]
    skip_errors: bool,

    /// Limit on `reparse_depth`; exceeding it yields `EntityTooBig`.
    max_entity_expansion_depth: u8,
    /// Limit on queued reparse characters; exceeding it yields `EntityTooBig`.
    max_entity_expansion_length: usize,
}
234
impl Position for Lexer {
    #[inline]
    /// Returns the position of the last token produced by the lexer
    /// (i.e. the start of that token, not the reading head).
    fn position(&self) -> TextPosition { self.pos }
}
240
241impl Lexer {
242 /// Returns a new lexer with default state.
243 pub(crate) fn new(config: &ParserConfig2) -> Lexer {
244 Lexer {
245 reader: CharReader::new(),
246 pos: TextPosition::new(),
247 head_pos: TextPosition::new(),
248 char_queue: VecDeque::with_capacity(4), // TODO: check size
249 st: State::Normal,
250 normal_state: State::Normal,
251 inside_token: false,
252 eof_handled: false,
253 reparse_depth: 0,
254 #[cfg(test)]
255 skip_errors: false,
256
257 max_entity_expansion_depth: config.max_entity_expansion_depth,
258 max_entity_expansion_length: config.max_entity_expansion_length,
259 }
260 }
261
262 pub(crate) fn encoding(&mut self) -> Encoding {
263 self.reader.encoding
264 }
265
    /// Overrides the character encoding used by the underlying reader
    /// (e.g. after an encoding declaration has been parsed).
    pub(crate) fn set_encoding(&mut self, encoding: Encoding) {
        self.reader.encoding = encoding;
    }
269
    /// Disables error reporting, so an invalid lexeme is replayed through
    /// `next_token` as individual `Character` tokens instead of producing an
    /// `Err` (see `handle_error`). Test-only.
    #[cfg(test)] fn disable_errors(&mut self) { self.skip_errors = true; }
273
    /// Reset the eof handled flag of the lexer, allowing lexing to resume
    /// after more input becomes available (e.g. after `reparse`).
    #[inline]
    pub fn reset_eof_handled(&mut self) { self.eof_handled = false; }
277
278 /// Tries to read the next token from the buffer.
279 ///
280 /// It is possible to pass different instaces of `BufReader` each time
281 /// this method is called, but the resulting behavior is undefined in this case.
282 ///
283 /// Return value:
284 /// * `Err(reason) where reason: reader::Error` - when an error occurs;
285 /// * `Ok(None)` - upon end of stream is reached;
286 /// * `Ok(Some(token)) where token: Token` - in case a complete-token has been read from the stream.
287 pub fn next_token<B: Read>(&mut self, b: &mut B) -> Result {
288 // Already reached end of buffer
289 if self.eof_handled {
290 return Ok(None);
291 }
292
293 if !self.inside_token {
294 self.pos = self.head_pos;
295 self.inside_token = true;
296 }
297
298 // Check if we have saved a char or two for ourselves
299 while let Some(c) = self.char_queue.pop_front() {
300 match self.dispatch_char(c)? {
301 Some(t) => {
302 self.inside_token = false;
303 return Ok(Some(t));
304 }
305 None => {} // continue
306 }
307 }
308 // if char_queue is empty, all circular reparsing is done
309 self.reparse_depth = 0;
310 loop {
311 let c = match self.reader.next_char_from(b)? {
312 Some(c) => c, // got next char
313 None => break, // nothing to read left
314 };
315
316 if c == '\n' {
317 self.head_pos.new_line();
318 } else {
319 self.head_pos.advance(1);
320 }
321
322 match self.dispatch_char(c)? {
323 Some(t) => {
324 self.inside_token = false;
325 return Ok(Some(t));
326 }
327 None => {
328 // continue
329 }
330 }
331 }
332
333 self.end_of_stream()
334 }
335
    /// Decides what to report when input is exhausted, based on the state the
    /// state machine stopped in.
    #[inline(never)]
    fn end_of_stream(&mut self) -> Result {
        // Handle end of stream
        self.eof_handled = true;
        self.pos = self.head_pos;
        match self.st {
            // EOF in the middle of a CDATA section is its own error.
            State::InsideCdata | State::CDataClosing(_) => Err(self.error(SyntaxError::UnclosedCdata)),
            // EOF in the middle of any other partially-lexed construct.
            State::TagStarted | State::CommentOrCDataOrDoctypeStarted |
            State::CommentStarted | State::CDataStarted(_)| State::DoctypeStarted(_) |
            State::CommentClosing(ClosingSubstate::Second) |
            State::InsideComment | State::InsideMarkupDeclaration |
            State::InsideProcessingInstruction | State::ProcessingInstructionClosing |
            State::InsideDoctype | State::InsideMarkupDeclarationQuotedString(_) =>
                Err(self.error(SyntaxError::UnexpectedEof)),
            // A lone trailing '/', '-' or ']' was not a token start after all:
            // flush it as a plain character.
            State::EmptyTagClosing =>
                Ok(Some(Token::Character('/'))),
            State::CommentClosing(ClosingSubstate::First) =>
                Ok(Some(Token::Character('-'))),
            State::InvalidCDataClosing(ClosingSubstate::First) =>
                Ok(Some(Token::Character(']'))),
            State::InvalidCDataClosing(ClosingSubstate::Second) => {
                // Two pending ']' characters: emit one now, queue the other
                // and clear `eof_handled` so one more call drains it.
                self.eof_handled = false;
                self.move_to_with_unread(State::Normal, &[']'], Token::Character(']'))
            },
            State::Normal =>
                Ok(None),
        }
    }
364
    /// Builds an `Error` for the given syntax error at the current token position.
    #[cold]
    fn error(&self, e: SyntaxError) -> Error {
        Error {
            pos: self.position(),
            kind: ErrorKind::Syntax(e.to_cow()),
        }
    }
372
373
    /// Routes a single character to the handler for the current state.
    ///
    /// Returns `Ok(Some(token))` when `c` completes a token, `Ok(None)` when
    /// more input is needed, or `Err` on an invalid lexeme.
    #[inline(never)]
    fn dispatch_char(&mut self, c: char) -> Result {
        match self.st {
            State::Normal => self.normal(c),
            State::TagStarted => self.tag_opened(c),
            State::EmptyTagClosing => self.empty_element_closing(c),
            State::CommentOrCDataOrDoctypeStarted => self.comment_or_cdata_or_doctype_started(c),
            State::InsideCdata => self.inside_cdata(c),
            State::CDataStarted(s) => self.cdata_started(c, s),
            State::InsideComment => self.inside_comment_state(c),
            State::CommentStarted => self.comment_started(c),
            State::InsideProcessingInstruction => self.inside_processing_instruction(c),
            State::ProcessingInstructionClosing => self.processing_instruction_closing(c),
            State::CommentClosing(s) => self.comment_closing(c, s),
            State::CDataClosing(s) => self.cdata_closing(c, s),
            State::InsideDoctype => self.inside_doctype(c),
            State::DoctypeStarted(s) => self.doctype_started(c, s),
            State::InvalidCDataClosing(s) => self.invalid_cdata_closing(c, s),
            State::InsideMarkupDeclaration => self.markup_declaration(c),
            State::InsideMarkupDeclarationQuotedString(q) => self.markup_declaration_string(c, q),
        }
    }
396
    /// Switches the state machine to `st` without emitting a token.
    #[inline]
    fn move_to(&mut self, st: State) -> Result {
        self.st = st;
        Ok(None)
    }
402
    /// Switches the state machine to `st` and emits `token`.
    #[inline]
    fn move_to_with(&mut self, st: State, token: Token) -> Result {
        self.st = st;
        Ok(Some(token))
    }
408
    /// Switches to `st`, also making it the new default ("normal") state,
    /// and emits `token`. Used when entering/leaving the DOCTYPE subset.
    #[inline]
    fn move_to_and_reset_normal(&mut self, st: State, token: Token) -> Result {
        self.normal_state = st;
        self.st = st;
        Ok(Some(token))
    }
415
416 fn move_to_with_unread(&mut self, st: State, cs: &[char], token: Token) -> Result {
417 for c in cs.iter().rev().copied() {
418 self.char_queue.push_front(c);
419 }
420 self.move_to_with(st, token)
421 }
422
    /// Feeds expanded entity text (`markup`) back into the lexer so it is
    /// re-lexed before any further input from the reader.
    ///
    /// Enforces `max_entity_expansion_depth` and `max_entity_expansion_length`
    /// limits — presumably as a guard against runaway entity expansion
    /// ("billion laughs"-style inputs).
    pub(crate) fn reparse(&mut self, markup: &str) -> Result<()> {
        if markup.is_empty() {
            return Ok(());
        }

        self.reparse_depth += 1;
        if self.reparse_depth > self.max_entity_expansion_depth || self.char_queue.len() > self.max_entity_expansion_length {
            return Err(self.error(SyntaxError::EntityTooBig))
        }

        // EOF may already have been reported before the entity was expanded.
        self.eof_handled = false;
        self.char_queue.reserve(markup.len());
        // Push in reverse so the queue front yields characters in document order.
        for c in markup.chars().rev() {
            self.char_queue.push_front(c);
        }

        Ok(())
    }
441
    /// Reports an unexpected character `c` seen after the partially-matched
    /// token text `chunk`.
    ///
    /// In test builds with `skip_errors` set, the partial text and `c` are
    /// replayed as plain `Character` tokens instead of failing.
    fn handle_error(&mut self, chunk: &'static str, c: char) -> Result {
        debug_assert!(!chunk.is_empty());

        #[cfg(test)]
        if self.skip_errors {
            // Emit the first char of `chunk` now; queue the rest plus `c`.
            let mut chars = chunk.chars();
            let first = chars.next().unwrap_or('\0');
            self.char_queue.extend(chars);
            self.char_queue.push_back(c);
            return self.move_to_with(State::Normal, Token::Character(first));
        }
        Err(self.error(SyntaxError::UnexpectedTokenBefore(chunk, c)))
    }
455
456 /// Encountered a char
457 fn normal(&mut self, c: char) -> Result {
458 match c {
459 '<' => self.move_to(State::TagStarted),
460 '>' => Ok(Some(Token::TagEnd)),
461 '/' => self.move_to(State::EmptyTagClosing),
462 '=' => Ok(Some(Token::EqualsSign)),
463 '"' => Ok(Some(Token::DoubleQuote)),
464 '\'' => Ok(Some(Token::SingleQuote)),
465 ']' => self.move_to(State::InvalidCDataClosing(ClosingSubstate::First)),
466 '&' => Ok(Some(Token::ReferenceStart)),
467 ';' => Ok(Some(Token::ReferenceEnd)),
468 _ => Ok(Some(Token::Character(c)))
469 }
470 }
471
472 fn inside_cdata(&mut self, c: char) -> Result {
473 match c {
474 ']' => self.move_to(State::CDataClosing(ClosingSubstate::First)),
475 _ => Ok(Some(Token::Character(c)))
476 }
477 }
478
479 fn inside_processing_instruction(&mut self, c: char) -> Result {
480 // These tokens are used by `<?xml?>` parser
481 match c {
482 '?' => self.move_to(State::ProcessingInstructionClosing),
483 '<' => Ok(Some(Token::OpeningTagStart)),
484 '>' => Ok(Some(Token::TagEnd)),
485 '/' => Ok(Some(Token::ClosingTagStart)),
486 '=' => Ok(Some(Token::EqualsSign)),
487 '"' => Ok(Some(Token::DoubleQuote)),
488 '\'' => Ok(Some(Token::SingleQuote)),
489 '&' => Ok(Some(Token::ReferenceStart)),
490 ';' => Ok(Some(Token::ReferenceEnd)),
491 _ => Ok(Some(Token::Character(c)))
492 }
493 }
494
495 fn inside_comment_state(&mut self, c: char) -> Result {
496 match c {
497 '-' => self.move_to(State::CommentClosing(ClosingSubstate::First)),
498 _ => Ok(Some(Token::Character(c)))
499 }
500 }
501
502 /// Encountered '<'
503 fn tag_opened(&mut self, c: char) -> Result {
504 match c {
505 '?' => self.move_to_with(State::InsideProcessingInstruction, Token::ProcessingInstructionStart),
506 '/' => self.move_to_with(self.normal_state, Token::ClosingTagStart),
507 '!' => self.move_to(State::CommentOrCDataOrDoctypeStarted),
508 _ if is_whitespace_char(c) => self.move_to_with_unread(self.normal_state, &[c], Token::OpeningTagStart),
509 _ if is_name_char(c) => self.move_to_with_unread(self.normal_state, &[c], Token::OpeningTagStart),
510 _ => self.handle_error("<", c)
511 }
512 }
513
    /// Encountered '<!'
    fn comment_or_cdata_or_doctype_started(&mut self, c: char) -> Result {
        match c {
            '-' => self.move_to(State::CommentStarted),
            '[' => self.move_to(State::CDataStarted(CDataStartedSubstate::E)),
            'D' => self.move_to(State::DoctypeStarted(DoctypeStartedSubstate::D)),
            // Markup declarations such as `<!ELEMENT`, `<!ATTLIST`, `<!ENTITY`,
            // `<!NOTATION` are only recognized inside the DTD internal subset.
            'E' | 'A' | 'N' if matches!(self.normal_state, State::InsideDoctype) => {
                self.move_to_with_unread(State::InsideMarkupDeclaration, &[c], Token::MarkupDeclarationStart)
            },
            _ => self.handle_error("<!", c),
        }
    }
526
527 /// Encountered '<!-'
528 fn comment_started(&mut self, c: char) -> Result {
529 match c {
530 '-' => self.move_to_with(State::InsideComment, Token::CommentStart),
531 _ => self.handle_error("<!-", c),
532 }
533 }
534
    /// Encountered '<!['
    ///
    /// Walks the literal `CDATA[` one character at a time via the dispatch
    /// macro; any mismatch reports the partially-matched text.
    fn cdata_started(&mut self, c: char, s: CDataStartedSubstate) -> Result {
        use self::CDataStartedSubstate::{C, CD, CDA, CDAT, CDATA, E};
        dispatch_on_enum_state!(self, s, c, State::CDataStarted,
            E     ; 'C' ; C     ; "<![",
            C     ; 'D' ; CD    ; "<![C",
            CD    ; 'A' ; CDA   ; "<![CD",
            CDA   ; 'T' ; CDAT  ; "<![CDA",
            CDAT  ; 'A' ; CDATA ; "<![CDAT";
            CDATA ; '[' ; "<![CDATA" ; self.move_to_with(State::InsideCdata, Token::CDataStart)
        )
    }
547
    /// Encountered '<!…' that isn't DOCTYPE or CDATA
    fn markup_declaration(&mut self, c: char) -> Result {
        match c {
            // A '<' inside a markup declaration is never valid here.
            '<' => self.handle_error("<!", c),
            '>' => self.move_to_with(self.normal_state, Token::TagEnd),
            '&' => Ok(Some(Token::ReferenceStart)),
            ';' => Ok(Some(Token::ReferenceEnd)),
            // Quotes switch into the quoted-string sub-state so '>' inside
            // quoted values does not terminate the declaration.
            '"' => self.move_to_with(State::InsideMarkupDeclarationQuotedString(QuoteStyle::Double), Token::DoubleQuote),
            '\'' => self.move_to_with(State::InsideMarkupDeclarationQuotedString(QuoteStyle::Single), Token::SingleQuote),
            _ => Ok(Some(Token::Character(c))),
        }
    }
560
561 fn markup_declaration_string(&mut self, c: char, q: QuoteStyle) -> Result {
562 match c {
563 '"' if q == QuoteStyle::Double => self.move_to_with(State::InsideMarkupDeclaration, Token::DoubleQuote),
564 '\'' if q == QuoteStyle::Single => self.move_to_with(State::InsideMarkupDeclaration, Token::SingleQuote),
565 _ => Ok(Some(Token::Character(c))),
566 }
567 }
568
    /// Encountered '<!D'
    ///
    /// Walks the literal `OCTYPE` one character at a time via the dispatch
    /// macro; any mismatch reports the partially-matched text.
    fn doctype_started(&mut self, c: char, s: DoctypeStartedSubstate) -> Result {
        use self::DoctypeStartedSubstate::{D, DO, DOC, DOCT, DOCTY, DOCTYP};
        dispatch_on_enum_state!(self, s, c, State::DoctypeStarted,
            D      ; 'O' ; DO     ; "<!D",
            DO     ; 'C' ; DOC    ; "<!DO",
            DOC    ; 'T' ; DOCT   ; "<!DOC",
            DOCT   ; 'Y' ; DOCTY  ; "<!DOCT",
            DOCTY  ; 'P' ; DOCTYP ; "<!DOCTY";
            DOCTYP ; 'E' ; "<!DOCTYP" ; self.move_to_and_reset_normal(State::InsideDoctype, Token::DoctypeStart)
        )
    }
581
582 /// State used while awaiting the closing bracket for the <!DOCTYPE tag
583 fn inside_doctype(&mut self, c: char) -> Result {
584 match c {
585 '>' => self.move_to_and_reset_normal(State::Normal, Token::TagEnd),
586 '<' => self.move_to(State::TagStarted),
587 '&' => Ok(Some(Token::ReferenceStart)),
588 ';' => Ok(Some(Token::ReferenceEnd)),
589 '"' => Ok(Some(Token::DoubleQuote)),
590 '\'' => Ok(Some(Token::SingleQuote)),
591 _ => Ok(Some(Token::Character(c))),
592 }
593 }
594
595 /// Encountered '?'
596 fn processing_instruction_closing(&mut self, c: char) -> Result {
597 match c {
598 '>' => self.move_to_with(self.normal_state, Token::ProcessingInstructionEnd),
599 _ => self.move_to_with_unread(State::InsideProcessingInstruction, &[c], Token::Character('?')),
600 }
601 }
602
603 /// Encountered '/'
604 fn empty_element_closing(&mut self, c: char) -> Result {
605 match c {
606 '>' => self.move_to_with(self.normal_state, Token::EmptyTagEnd),
607 _ => self.move_to_with_unread(self.normal_state, &[c], Token::Character('/')),
608 }
609 }
610
    /// Encountered '-' inside a comment (possible start of `-->`)
    fn comment_closing(&mut self, c: char, s: ClosingSubstate) -> Result {
        match s {
            ClosingSubstate::First => match c {
                '-' => self.move_to(State::CommentClosing(ClosingSubstate::Second)),
                // Lone '-': emit it and replay `c` inside the comment.
                _ => self.move_to_with_unread(State::InsideComment, &[c], Token::Character('-')),
            },
            ClosingSubstate::Second => match c {
                '>' => self.move_to_with(self.normal_state, Token::CommentEnd),
                // double dash not followed by a greater-than is a hard error inside comment
                _ => self.handle_error("--", c),
            },
        }
    }
625
    /// Encountered ']' inside CDATA (possible start of `]]>`)
    fn cdata_closing(&mut self, c: char, s: ClosingSubstate) -> Result {
        match s {
            ClosingSubstate::First => match c {
                ']' => self.move_to(State::CDataClosing(ClosingSubstate::Second)),
                // Lone ']': emit it and replay `c` inside CDATA.
                _ => self.move_to_with_unread(State::InsideCdata, &[c], Token::Character(']')),
            },
            ClosingSubstate::Second => match c {
                '>' => self.move_to_with(State::Normal, Token::CDataEnd),
                // ']]' not followed by '>': emit one ']' and replay the
                // second ']' plus `c`, so `]]]>` still terminates correctly.
                _ => self.move_to_with_unread(State::InsideCdata, &[']', c], Token::Character(']')),
            },
        }
    }
639
    /// Encountered ']' outside CDATA (possible `]]>`, which is not allowed
    /// in character data; the parser decides how to report it)
    fn invalid_cdata_closing(&mut self, c: char, s: ClosingSubstate) -> Result {
        match s {
            ClosingSubstate::First => match c {
                ']' => self.move_to(State::InvalidCDataClosing(ClosingSubstate::Second)),
                // Lone ']': emit it and replay `c` in the default state.
                _ => self.move_to_with_unread(State::Normal, &[c], Token::Character(']')),
            },
            ClosingSubstate::Second => match c {
                // A full `]]>` outside CDATA is still lexed as `CDataEnd`.
                '>' => self.move_to_with(self.normal_state, Token::CDataEnd),
                _ => self.move_to_with_unread(State::Normal, &[']', c], Token::Character(']')),
            },
        }
    }
653}
654
#[cfg(test)]
mod tests {
    use crate::{common::Position, reader::ParserConfig2};
    use std::io::{BufReader, Cursor};

    use super::{Lexer, Token};

    // Asserts that the lexer produces exactly the given sequence of tokens.
    macro_rules! assert_oks(
        (for $lex:ident and $buf:ident ; $($e:expr)+) => ({
            $(
            assert_eq!(Ok(Some($e)), $lex.next_token(&mut $buf));
            )+
        })
    );

    // Asserts that the next call fails at the given row and column.
    // Note: the `$s` message argument is accepted but not checked.
    macro_rules! assert_err(
        (for $lex:ident and $buf:ident expect row $r:expr ; $c:expr, $s:expr) => ({
            let err = $lex.next_token(&mut $buf);
            assert!(err.is_err());
            let err = err.unwrap_err();
            assert_eq!($r as u64, err.position().row);
            assert_eq!($c as u64, err.position().column);
        })
    );

    // Asserts that the lexer reports end of stream.
    macro_rules! assert_none(
        (for $lex:ident and $buf:ident) => (
            assert_eq!(Ok(None), $lex.next_token(&mut $buf))
        )
    );

    fn make_lex_and_buf(s: &str) -> (Lexer, BufReader<Cursor<Vec<u8>>>) {
        (Lexer::new(&ParserConfig2::default()), BufReader::new(Cursor::new(s.to_owned().into_bytes())))
    }

    #[test]
    fn tricky_pi() {
        let (mut lex, mut buf) = make_lex_and_buf(r#"<?x<!-- &??><x>"#);

        assert_oks!(for lex and buf ;
            Token::ProcessingInstructionStart
            Token::Character('x')
            Token::OpeningTagStart // processing of <?xml?> relies on the extra tokens
            Token::Character('!')
            Token::Character('-')
            Token::Character('-')
            Token::Character(' ')
            Token::ReferenceStart
            Token::Character('?')
            Token::ProcessingInstructionEnd
            Token::OpeningTagStart
            Token::Character('x')
            Token::TagEnd
        );
        assert_none!(for lex and buf);
    }

    #[test]
    fn reparser() {
        let (mut lex, mut buf) = make_lex_and_buf(r#"&a;"#);

        assert_oks!(for lex and buf ;
            Token::ReferenceStart
            Token::Character('a')
            Token::ReferenceEnd
        );
        lex.reparse("<hi/>").unwrap();
        assert_oks!(for lex and buf ;
            Token::OpeningTagStart
            Token::Character('h')
            Token::Character('i')
            Token::EmptyTagEnd
        );
        assert_none!(for lex and buf);
    }

    #[test]
    fn simple_lexer_test() {
        let (mut lex, mut buf) = make_lex_and_buf(
            r#"<a p='q'> x<b z="y">d	</b></a><p/> <?nm ?> <!-- a c --> &nbsp;"#
        );

        assert_oks!(for lex and buf ;
            Token::OpeningTagStart
            Token::Character('a')
            Token::Character(' ')
            Token::Character('p')
            Token::EqualsSign
            Token::SingleQuote
            Token::Character('q')
            Token::SingleQuote
            Token::TagEnd
            Token::Character(' ')
            Token::Character('x')
            Token::OpeningTagStart
            Token::Character('b')
            Token::Character(' ')
            Token::Character('z')
            Token::EqualsSign
            Token::DoubleQuote
            Token::Character('y')
            Token::DoubleQuote
            Token::TagEnd
            Token::Character('d')
            Token::Character('\t')
            Token::ClosingTagStart
            Token::Character('b')
            Token::TagEnd
            Token::ClosingTagStart
            Token::Character('a')
            Token::TagEnd
            Token::OpeningTagStart
            Token::Character('p')
            Token::EmptyTagEnd
            Token::Character(' ')
            Token::ProcessingInstructionStart
            Token::Character('n')
            Token::Character('m')
            Token::Character(' ')
            Token::ProcessingInstructionEnd
            Token::Character(' ')
            Token::CommentStart
            Token::Character(' ')
            Token::Character('a')
            Token::Character(' ')
            Token::Character('c')
            Token::Character(' ')
            Token::CommentEnd
            Token::Character(' ')
            Token::ReferenceStart
            Token::Character('n')
            Token::Character('b')
            Token::Character('s')
            Token::Character('p')
            Token::ReferenceEnd
        );
        assert_none!(for lex and buf);
    }

    #[test]
    fn special_chars_test() {
        let (mut lex, mut buf) = make_lex_and_buf(
            r#"?x!+ // -| ]z]]"#
        );

        assert_oks!(for lex and buf ;
            Token::Character('?')
            Token::Character('x')
            Token::Character('!')
            Token::Character('+')
            Token::Character(' ')
            Token::Character('/')
            Token::Character('/')
            Token::Character(' ')
            Token::Character('-')
            Token::Character('|')
            Token::Character(' ')
            Token::Character(']')
            Token::Character('z')
            Token::Character(']')
            Token::Character(']')
        );
        assert_none!(for lex and buf);
    }

    #[test]
    fn cdata_test() {
        let (mut lex, mut buf) = make_lex_and_buf(
            r#"<a><![CDATA[x y ?]]> </a>"#
        );

        assert_oks!(for lex and buf ;
            Token::OpeningTagStart
            Token::Character('a')
            Token::TagEnd
            Token::CDataStart
            Token::Character('x')
            Token::Character(' ')
            Token::Character('y')
            Token::Character(' ')
            Token::Character('?')
            Token::CDataEnd
            Token::Character(' ')
            Token::ClosingTagStart
            Token::Character('a')
            Token::TagEnd
        );
        assert_none!(for lex and buf);
    }

    #[test]
    fn cdata_closers_test() {
        let (mut lex, mut buf) = make_lex_and_buf(
            r#"<![CDATA[] > ]> ]]><!---->]]<a>"#
        );

        assert_oks!(for lex and buf ;
            Token::CDataStart
            Token::Character(']')
            Token::Character(' ')
            Token::Character('>')
            Token::Character(' ')
            Token::Character(']')
            Token::Character('>')
            Token::Character(' ')
            Token::CDataEnd
            Token::CommentStart
            Token::CommentEnd
            Token::Character(']')
            Token::Character(']')
            Token::OpeningTagStart
            Token::Character('a')
            Token::TagEnd
        );
        assert_none!(for lex and buf);
    }

    #[test]
    fn doctype_test() {
        let (mut lex, mut buf) = make_lex_and_buf(
            r#"<a><!DOCTYPE ab xx z> "#
        );
        assert_oks!(for lex and buf ;
            Token::OpeningTagStart
            Token::Character('a')
            Token::TagEnd
            Token::DoctypeStart
            Token::Character(' ')
            Token::Character('a')
            Token::Character('b')
            Token::Character(' ')
            Token::Character('x')
            Token::Character('x')
            Token::Character(' ')
            Token::Character('z')
            Token::TagEnd
            Token::Character(' ')
        );
        assert_none!(for lex and buf);
    }

    #[test]
    fn tricky_comments() {
        let (mut lex, mut buf) = make_lex_and_buf(
            r#"<a><!-- C ->--></a>"#
        );
        assert_oks!(for lex and buf ;
            Token::OpeningTagStart
            Token::Character('a')
            Token::TagEnd
            Token::CommentStart
            Token::Character(' ')
            Token::Character('C')
            Token::Character(' ')
            Token::Character('-')
            Token::Character('>')
            Token::CommentEnd
            Token::ClosingTagStart
            Token::Character('a')
            Token::TagEnd
        );
        assert_none!(for lex and buf);
    }

    #[test]
    fn doctype_with_internal_subset_test() {
        let (mut lex, mut buf) = make_lex_and_buf(
            r#"<a><!DOCTYPE ab[<!ELEMENT ba ">>>"> ]> "#
        );
        assert_oks!(for lex and buf ;
            Token::OpeningTagStart
            Token::Character('a')
            Token::TagEnd
            Token::DoctypeStart
            Token::Character(' ')
            Token::Character('a')
            Token::Character('b')
            Token::Character('[')
            Token::MarkupDeclarationStart
            Token::Character('E')
            Token::Character('L')
            Token::Character('E')
            Token::Character('M')
            Token::Character('E')
            Token::Character('N')
            Token::Character('T')
            Token::Character(' ')
            Token::Character('b')
            Token::Character('a')
            Token::Character(' ')
            Token::DoubleQuote
            Token::Character('>')
            Token::Character('>')
            Token::Character('>')
            Token::DoubleQuote
            Token::TagEnd
            Token::Character(' ')
            Token::Character(']')
            Token::TagEnd
            Token::Character(' ')
        );
        assert_none!(for lex and buf);
    }

    #[test]
    fn doctype_internal_pi_comment() {
        let (mut lex, mut buf) = make_lex_and_buf(
            "<!DOCTYPE a [\n<!ELEMENT l ANY> <!-- <?non?>--> <?pi > ?> \n]>"
        );
        assert_oks!(for lex and buf ;
            Token::DoctypeStart
            Token::Character(' ')
            Token::Character('a')
            Token::Character(' ')
            Token::Character('[')
            Token::Character('\n')
            Token::MarkupDeclarationStart
            Token::Character('E')
            Token::Character('L')
            Token::Character('E')
            Token::Character('M')
            Token::Character('E')
            Token::Character('N')
            Token::Character('T')
            Token::Character(' ')
            Token::Character('l')
            Token::Character(' ')
            Token::Character('A')
            Token::Character('N')
            Token::Character('Y')
            Token::TagEnd
            Token::Character(' ')
            Token::CommentStart
            Token::Character(' ')
            Token::Character('<')
            Token::Character('?')
            Token::Character('n')
            Token::Character('o')
            Token::Character('n')
            Token::Character('?')
            Token::Character('>')
            Token::CommentEnd
            Token::Character(' ')
            Token::ProcessingInstructionStart
            Token::Character('p')
            Token::Character('i')
            Token::Character(' ')
            Token::TagEnd // not really
            Token::Character(' ')
            Token::ProcessingInstructionEnd
            Token::Character(' ')
            Token::Character('\n')
            Token::Character(']')
            Token::TagEnd // DTD
        );
        assert_none!(for lex and buf);
    }

    #[test]
    fn end_of_stream_handling_ok() {
        macro_rules! eof_check(
            ($data:expr ; $token:expr) => ({
                let (mut lex, mut buf) = make_lex_and_buf($data);
                assert_oks!(for lex and buf ; $token);
                assert_none!(for lex and buf);
            })
        );
        eof_check!("?" ; Token::Character('?'));
        eof_check!("/" ; Token::Character('/'));
        eof_check!("-" ; Token::Character('-'));
        eof_check!("]" ; Token::Character(']'));
        eof_check!("]" ; Token::Character(']'));
        eof_check!("]" ; Token::Character(']'));
    }

    #[test]
    fn end_of_stream_handling_error() {
        macro_rules! eof_check(
            ($data:expr; $r:expr, $c:expr) => ({
                let (mut lex, mut buf) = make_lex_and_buf($data);
                assert_err!(for lex and buf expect row $r ; $c, "Unexpected end of stream");
                assert_none!(for lex and buf);
            })
        );
        eof_check!("<" ; 0, 1);
        eof_check!("<!" ; 0, 2);
        eof_check!("<!-" ; 0, 3);
        eof_check!("<![" ; 0, 3);
        eof_check!("<![C" ; 0, 4);
        eof_check!("<![CD" ; 0, 5);
        eof_check!("<![CDA" ; 0, 6);
        eof_check!("<![CDAT" ; 0, 7);
        eof_check!("<![CDATA" ; 0, 8);
    }

    #[test]
    fn error_in_comment_or_cdata_prefix() {
        let (mut lex, mut buf) = make_lex_and_buf("<!x");
        assert_err!(for lex and buf expect row 0 ; 0,
            "Unexpected token '<!' before 'x'"
        );

        // With errors disabled the invalid lexeme is replayed as characters.
        let (mut lex, mut buf) = make_lex_and_buf("<!x");
        lex.disable_errors();
        assert_oks!(for lex and buf ;
            Token::Character('<')
            Token::Character('!')
            Token::Character('x')
        );
        assert_none!(for lex and buf);
    }

    #[test]
    fn error_in_comment_started() {
        let (mut lex, mut buf) = make_lex_and_buf("<!-\t");
        assert_err!(for lex and buf expect row 0 ; 0,
            "Unexpected token '<!-' before '\t'"
        );

        let (mut lex, mut buf) = make_lex_and_buf("<!-\t");
        lex.disable_errors();
        assert_oks!(for lex and buf ;
            Token::Character('<')
            Token::Character('!')
            Token::Character('-')
            Token::Character('\t')
        );
        assert_none!(for lex and buf);
    }

    #[test]
    fn error_in_comment_two_dashes_not_at_end() {
        let (mut lex, mut buf) = make_lex_and_buf("--x");
        // Force the comment state: '--' is only an error inside a comment.
        lex.st = super::State::InsideComment;
        assert_err!(for lex and buf expect row 0; 0,
            "Unexpected token '--' before 'x'"
        );

        let (mut lex, mut buf) = make_lex_and_buf("--x");
        assert_oks!(for lex and buf ;
            Token::Character('-')
            Token::Character('-')
            Token::Character('x')
        );
    }

    // Checks both the error position and the skip-errors replay of `$chunk`
    // followed by the unexpected character `$app`.
    macro_rules! check_case(
        ($chunk:expr, $app:expr; $data:expr; $r:expr, $c:expr, $s:expr) => ({
            let (mut lex, mut buf) = make_lex_and_buf($data);
            assert_err!(for lex and buf expect row $r ; $c, $s);

            let (mut lex, mut buf) = make_lex_and_buf($data);
            lex.disable_errors();
            for c in $chunk.chars() {
                assert_eq!(Ok(Some(Token::Character(c))), lex.next_token(&mut buf));
            }
            assert_oks!(for lex and buf ;
                Token::Character($app)
            );
            assert_none!(for lex and buf);
        })
    );

    #[test]
    fn token_size() {
        assert_eq!(4, std::mem::size_of::<Token>());
        assert_eq!(2, std::mem::size_of::<super::State>());
    }

    #[test]
    fn error_in_cdata_started() {
        check_case!("<![",      '['; "<![["      ; 0, 0, "Unexpected token '<![' before '['");
        check_case!("<![C",     '['; "<![C["     ; 0, 0, "Unexpected token '<![C' before '['");
        check_case!("<![CD",    '['; "<![CD["    ; 0, 0, "Unexpected token '<![CD' before '['");
        check_case!("<![CDA",   '['; "<![CDA["   ; 0, 0, "Unexpected token '<![CDA' before '['");
        check_case!("<![CDAT",  '['; "<![CDAT["  ; 0, 0, "Unexpected token '<![CDAT' before '['");
        check_case!("<![CDATA", '|'; "<![CDATA|" ; 0, 0, "Unexpected token '<![CDATA' before '|'");
    }

    #[test]
    fn error_in_doctype_started() {
        check_case!("<!D",      'a'; "<!Da"      ; 0, 0, "Unexpected token '<!D' before 'a'");
        check_case!("<!DO",     'b'; "<!DOb"     ; 0, 0, "Unexpected token '<!DO' before 'b'");
        check_case!("<!DOC",    'c'; "<!DOCc"    ; 0, 0, "Unexpected token '<!DOC' before 'c'");
        check_case!("<!DOCT",   'd'; "<!DOCTd"   ; 0, 0, "Unexpected token '<!DOCT' before 'd'");
        check_case!("<!DOCTY",  'e'; "<!DOCTYe"  ; 0, 0, "Unexpected token '<!DOCTY' before 'e'");
        check_case!("<!DOCTYP", 'f'; "<!DOCTYPf" ; 0, 0, "Unexpected token '<!DOCTYP' before 'f'");
    }



    #[test]
    fn issue_98_cdata_ending_with_right_bracket() {
        let (mut lex, mut buf) = make_lex_and_buf(
            r#"<![CDATA[Foo [Bar]]]>"#
        );

        assert_oks!(for lex and buf ;
            Token::CDataStart
            Token::Character('F')
            Token::Character('o')
            Token::Character('o')
            Token::Character(' ')
            Token::Character('[')
            Token::Character('B')
            Token::Character('a')
            Token::Character('r')
            Token::Character(']')
            Token::CDataEnd
        );
        assert_none!(for lex and buf);
    }
}
1168