1// Copyright 2014-2017 The html5ever Project Developers. See the
2// COPYRIGHT file at the top-level directory of this distribution.
3//
4// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7// option. This file may not be copied, modified, or distributed
8// except according to those terms.
9
10//! The HTML5 tokenizer.
11
12pub use self::interface::{CharacterTokens, EOFToken, NullCharacterToken, ParseError};
13pub use self::interface::{CommentToken, DoctypeToken, TagToken, Token};
14pub use self::interface::{Doctype, EndTag, StartTag, Tag, TagKind};
15pub use self::interface::{TokenSink, TokenSinkResult};
16
17use self::states::{DoctypeIdKind, Public, System};
18use self::states::{DoubleEscaped, Escaped};
19use self::states::{DoubleQuoted, SingleQuoted, Unquoted};
20use self::states::{Rawtext, Rcdata, ScriptData, ScriptDataEscaped};
21
22use self::char_ref::{CharRef, CharRefTokenizer};
23
24use crate::util::str::lower_ascii_letter;
25
26use log::{debug, trace};
27use mac::format_if;
28use markup5ever::{namespace_url, ns, small_char_set};
29use std::borrow::Cow::{self, Borrowed};
30use std::cell::{Cell, RefCell, RefMut};
31use std::collections::BTreeMap;
32use std::mem;
33
34pub use crate::buffer_queue::{BufferQueue, FromSet, NotFromSet, SetResult};
35use crate::tendril::StrTendril;
36use crate::{Attribute, LocalName, QualName, SmallCharSet};
37
38mod char_ref;
39mod interface;
40pub mod states;
41
/// Result of a single step of the tokenizer state machine.
pub enum ProcessResult<Handle> {
    /// Keep stepping the state machine.
    Continue,
    /// Stop stepping for now (e.g. more input is needed, or EOF was emitted).
    Suspend,
    /// The token sink suspended tokenization and handed back a node handle
    /// (produced when a tag emission returns `TokenSinkResult::Script`).
    Script(Handle),
}
47
/// Result of running the tokenizer until it can make no more progress.
#[must_use]
#[derive(Debug)]
pub enum TokenizerResult<Handle> {
    /// The available input was consumed (or more input is required).
    Done,
    /// Tokenization paused; the caller must deal with the returned handle
    /// (see `ProcessResult::Script`) before feeding more input.
    Script(Handle),
}
54
55fn option_push(opt_str: &mut Option<StrTendril>, c: char) {
56 match *opt_str {
57 Some(ref mut s: &mut Tendril) => s.push_char(c),
58 None => *opt_str = Some(StrTendril::from_char(c)),
59 }
60}
61
/// Tokenizer options, with an impl for `Default`.
#[derive(Clone)]
pub struct TokenizerOpts {
    /// Report all parse errors described in the spec, at some
    /// performance penalty? Default: false
    pub exact_errors: bool,

    /// Discard a `U+FEFF BYTE ORDER MARK` if we see one at the beginning
    /// of the stream? Default: true
    pub discard_bom: bool,

    /// Keep a record of how long we spent in each state? Printed
    /// when `end()` is called. Default: false
    pub profile: bool,

    /// Initial state override. Only the test runner should use
    /// a non-`None` value!
    pub initial_state: Option<states::State>,

    /// Last start tag. Only the test runner should use a
    /// non-`None` value!
    ///
    /// FIXME: Can't use Tendril because we want TokenizerOpts
    /// to be Send.
    pub last_start_tag_name: Option<String>,
}
88
89impl Default for TokenizerOpts {
90 fn default() -> TokenizerOpts {
91 TokenizerOpts {
92 exact_errors: false,
93 discard_bom: true,
94 profile: false,
95 initial_state: None,
96 last_start_tag_name: None,
97 }
98 }
99}
100
/// The HTML tokenizer.
///
/// All mutable state lives in `Cell`/`RefCell` fields, so the public
/// methods (`feed`, etc.) take `&self`.
pub struct Tokenizer<Sink> {
    /// Options controlling the behavior of the tokenizer.
    opts: TokenizerOpts,

    /// Destination for tokens we emit.
    pub sink: Sink,

    /// The abstract machine state as described in the spec.
    state: Cell<states::State>,

    /// Are we at the end of the file, once buffers have been processed
    /// completely? This affects whether we will wait for lookahead or not.
    at_eof: Cell<bool>,

    /// Tokenizer for character references, if we're tokenizing
    /// one at the moment.
    char_ref_tokenizer: RefCell<Option<Box<CharRefTokenizer>>>,

    /// Current input character. Just consumed, may reconsume.
    current_char: Cell<char>,

    /// Should we reconsume the current input character?
    reconsume: Cell<bool>,

    /// Did we just consume \r, translating it to \n? In that case we need
    /// to ignore the next character if it's \n.
    ignore_lf: Cell<bool>,

    /// Discard a U+FEFF BYTE ORDER MARK if we see one? Only done at the
    /// beginning of the stream.
    discard_bom: Cell<bool>,

    /// Current tag kind.
    current_tag_kind: Cell<TagKind>,

    /// Current tag name.
    current_tag_name: RefCell<StrTendril>,

    /// Current tag is self-closing?
    current_tag_self_closing: Cell<bool>,

    /// Current tag attributes.
    current_tag_attrs: RefCell<Vec<Attribute>>,

    /// Current attribute name.
    current_attr_name: RefCell<StrTendril>,

    /// Current attribute value.
    current_attr_value: RefCell<StrTendril>,

    /// Current comment.
    current_comment: RefCell<StrTendril>,

    /// Current doctype token.
    current_doctype: RefCell<Doctype>,

    /// Last start tag name, for use in checking "appropriate end tag".
    last_start_tag_name: RefCell<Option<LocalName>>,

    /// The "temporary buffer" mentioned in the spec.
    temp_buf: RefCell<StrTendril>,

    /// Record of how many ns we spent in each state, if profiling is enabled.
    state_profile: RefCell<BTreeMap<states::State, u64>>,

    /// Record of how many ns we spent in the token sink.
    time_in_sink: Cell<u64>,

    /// Track current line
    current_line: Cell<u64>,
}
173
174impl<Sink: TokenSink> Tokenizer<Sink> {
175 /// Create a new tokenizer which feeds tokens to a particular `TokenSink`.
176 pub fn new(sink: Sink, mut opts: TokenizerOpts) -> Tokenizer<Sink> {
177 let start_tag_name = opts
178 .last_start_tag_name
179 .take()
180 .map(|s| LocalName::from(&*s));
181 let state = opts.initial_state.unwrap_or(states::Data);
182 let discard_bom = opts.discard_bom;
183 Tokenizer {
184 opts,
185 sink,
186 state: Cell::new(state),
187 char_ref_tokenizer: RefCell::new(None),
188 at_eof: Cell::new(false),
189 current_char: Cell::new('\0'),
190 reconsume: Cell::new(false),
191 ignore_lf: Cell::new(false),
192 discard_bom: Cell::new(discard_bom),
193 current_tag_kind: Cell::new(StartTag),
194 current_tag_name: RefCell::new(StrTendril::new()),
195 current_tag_self_closing: Cell::new(false),
196 current_tag_attrs: RefCell::new(vec![]),
197 current_attr_name: RefCell::new(StrTendril::new()),
198 current_attr_value: RefCell::new(StrTendril::new()),
199 current_comment: RefCell::new(StrTendril::new()),
200 current_doctype: RefCell::new(Doctype::default()),
201 last_start_tag_name: RefCell::new(start_tag_name),
202 temp_buf: RefCell::new(StrTendril::new()),
203 state_profile: RefCell::new(BTreeMap::new()),
204 time_in_sink: Cell::new(0),
205 current_line: Cell::new(1),
206 }
207 }
208
209 /// Feed an input string into the tokenizer.
210 pub fn feed(&self, input: &BufferQueue) -> TokenizerResult<Sink::Handle> {
211 if input.is_empty() {
212 return TokenizerResult::Done;
213 }
214
215 if self.discard_bom.get() {
216 if let Some(c) = input.peek() {
217 if c == '\u{feff}' {
218 input.next();
219 }
220 } else {
221 return TokenizerResult::Done;
222 }
223 };
224
225 self.run(input)
226 }
227
    /// Switch the tokenizer into the PLAINTEXT state, where the remaining
    /// input is emitted as character data (see the `states::Plaintext`
    /// handler in `step`).
    pub fn set_plaintext_state(&self) {
        self.state.set(states::Plaintext);
    }
231
    /// Hand a token to the sink, timing the call when profiling is enabled
    /// so sink time can be subtracted from per-state totals in `run`.
    fn process_token(&self, token: Token) -> TokenSinkResult<Sink::Handle> {
        if self.opts.profile {
            let (ret, dt) = time!(self.sink.process_token(token, self.current_line.get()));
            self.time_in_sink.set(self.time_in_sink.get() + dt);
            ret
        } else {
            self.sink.process_token(token, self.current_line.get())
        }
    }
241
    /// Hand a token to the sink, asserting that the sink replies `Continue`.
    /// Used for tokens (characters, comments, errors, EOF) for which the
    /// sink is not expected to redirect the tokenizer.
    fn process_token_and_continue(&self, token: Token) {
        assert!(matches!(
            self.process_token(token),
            TokenSinkResult::Continue
        ));
    }
248
    //§ preprocessing-the-input-stream
    // Get the next input character, which might be the character
    // 'c' that we already consumed from the buffers.
    //
    // Performs the spec's input preprocessing: folds \r and \r\n into \n,
    // counts lines, and (under `exact_errors`) reports control characters
    // and non-characters. Returns `None` if skipping the \n half of a \r\n
    // pair leaves no character available.
    fn get_preprocessed_char(&self, mut c: char, input: &BufferQueue) -> Option<char> {
        if self.ignore_lf.get() {
            self.ignore_lf.set(false);
            if c == '\n' {
                // This \n is the second half of a \r\n pair; skip it.
                c = input.next()?;
            }
        }

        if c == '\r' {
            // Normalize to \n and remember to drop an immediately
            // following \n.
            self.ignore_lf.set(true);
            c = '\n';
        }

        if c == '\n' {
            self.current_line.set(self.current_line.get() + 1);
        }

        // Only checked in exact-errors mode: control characters and
        // Unicode non-characters (U+FDD0..U+FDEF and U+xxFFFE/U+xxFFFF).
        if self.opts.exact_errors
            && match c as u32 {
                0x01..=0x08 | 0x0B | 0x0E..=0x1F | 0x7F..=0x9F | 0xFDD0..=0xFDEF => true,
                n if (n & 0xFFFE) == 0xFFFE => true,
                _ => false,
            }
        {
            let msg = format!("Bad character {c}");
            self.emit_error(Cow::Owned(msg));
        }

        trace!("got character {}", c);
        self.current_char.set(c);
        Some(c)
    }
284
285 //ยง tokenization
286 // Get the next input character, if one is available.
287 fn get_char(&self, input: &BufferQueue) -> Option<char> {
288 if self.reconsume.get() {
289 self.reconsume.set(false);
290 Some(self.current_char.get())
291 } else {
292 input
293 .next()
294 .and_then(|c| self.get_preprocessed_char(c, input))
295 }
296 }
297
    /// Pop either a single character from `set` (`FromSet`) or a run of
    /// characters not in `set` (`NotFromSet`); `None` means no input.
    /// This is the fast path used by the data-like states.
    fn pop_except_from(&self, input: &BufferQueue, set: SmallCharSet) -> Option<SetResult> {
        // Bail to the slow path for various corner cases.
        // This means that `FromSet` can contain characters not in the set!
        // It shouldn't matter because the fallback `FromSet` case should
        // always do the same thing as the `NotFromSet` case.
        if self.opts.exact_errors || self.reconsume.get() || self.ignore_lf.get() {
            return self.get_char(input).map(FromSet);
        }

        let d = input.pop_except_from(set);
        trace!("got characters {:?}", d);
        match d {
            Some(FromSet(c)) => self.get_preprocessed_char(c, input).map(FromSet),

            // NB: We don't set self.current_char for a run of characters not
            // in the set. It shouldn't matter for the codepaths that use
            // this.
            _ => d,
        }
    }
318
    // Check if the next characters are an ASCII case-insensitive match. See
    // BufferQueue::eat.
    //
    // NB: this doesn't set the current input character.
    //
    // Returns Some(true) on a match, Some(false) on a definite non-match
    // (including when the pattern can't complete before EOF), and None when
    // more input is needed to decide. In the None case the characters
    // examined so far are stashed in `temp_buf` and pushed back onto the
    // front of the queue at the start of the next attempt.
    fn eat(&self, input: &BufferQueue, pat: &str, eq: fn(&u8, &u8) -> bool) -> Option<bool> {
        // A pending \r\n fold-up: drop the \n half before matching.
        if self.ignore_lf.get() {
            self.ignore_lf.set(false);
            if self.peek(input) == Some('\n') {
                self.discard_char(input);
            }
        }

        // Re-queue anything stashed by an earlier partial attempt.
        input.push_front(mem::take(&mut self.temp_buf.borrow_mut()));
        match input.eat(pat, eq) {
            None if self.at_eof.get() => Some(false),
            None => {
                // Not enough input yet: stash the remainder and suspend.
                while let Some(data) = input.next() {
                    self.temp_buf.borrow_mut().push_char(data);
                }
                None
            },
            Some(matched) => Some(matched),
        }
    }
343
344 /// Run the state machine for as long as we can.
345 fn run(&self, input: &BufferQueue) -> TokenizerResult<Sink::Handle> {
346 if self.opts.profile {
347 loop {
348 let state = self.state.get();
349 let old_sink = self.time_in_sink.get();
350 let (run, mut dt) = time!(self.step(input));
351 dt -= (self.time_in_sink.get() - old_sink);
352 let new = match self.state_profile.borrow_mut().get_mut(&state) {
353 Some(x) => {
354 *x += dt;
355 false
356 },
357 None => true,
358 };
359 if new {
360 // do this here because of borrow shenanigans
361 self.state_profile.borrow_mut().insert(state, dt);
362 }
363 match run {
364 ProcessResult::Continue => (),
365 ProcessResult::Suspend => break,
366 ProcessResult::Script(node) => return TokenizerResult::Script(node),
367 }
368 }
369 } else {
370 loop {
371 match self.step(input) {
372 ProcessResult::Continue => (),
373 ProcessResult::Suspend => break,
374 ProcessResult::Script(node) => return TokenizerResult::Script(node),
375 }
376 }
377 }
378 TokenizerResult::Done
379 }
380
    /// Report the current input character as a parse error; the detailed
    /// message (including the state) is only built in exact-errors mode.
    fn bad_char_error(&self) {
        let msg = format_if!(
            self.opts.exact_errors,
            "Bad character",
            "Saw {} in state {:?}",
            self.current_char.get(),
            self.state.get()
        );
        self.emit_error(msg);
    }
391
    /// Report an unexpected end-of-file parse error; the detailed message
    /// (including the state) is only built in exact-errors mode.
    fn bad_eof_error(&self) {
        let msg = format_if!(
            self.opts.exact_errors,
            "Unexpected EOF",
            "Saw EOF in state {:?}",
            self.state.get()
        );
        self.emit_error(msg);
    }
401
402 fn emit_char(&self, c: char) {
403 self.process_token_and_continue(match c {
404 '\0' => NullCharacterToken,
405 _ => CharacterTokens(StrTendril::from_char(c)),
406 });
407 }
408
    // Emit a whole run of characters as one token.
    // The string must not contain '\0'!
    fn emit_chars(&self, b: StrTendril) {
        self.process_token_and_continue(CharacterTokens(b));
    }
413
    /// Emit the tag currently being built (after finishing any pending
    /// attribute) and map the sink's response to a `ProcessResult`: the
    /// sink may redirect the tokenizer to Plaintext or raw-data states,
    /// or suspend it to run a script.
    fn emit_current_tag(&self) -> ProcessResult<Sink::Handle> {
        self.finish_attribute();

        let name = LocalName::from(&**self.current_tag_name.borrow());
        self.current_tag_name.borrow_mut().clear();

        match self.current_tag_kind.get() {
            StartTag => {
                // Remember the name for "appropriate end tag" checks.
                *self.last_start_tag_name.borrow_mut() = Some(name.clone());
            },
            EndTag => {
                // The spec forbids attributes and self-closing on end tags.
                if !self.current_tag_attrs.borrow().is_empty() {
                    self.emit_error(Borrowed("Attributes on an end tag"));
                }
                if self.current_tag_self_closing.get() {
                    self.emit_error(Borrowed("Self-closing end tag"));
                }
            },
        }

        let token = TagToken(Tag {
            kind: self.current_tag_kind.get(),
            name,
            self_closing: self.current_tag_self_closing.get(),
            attrs: std::mem::take(&mut self.current_tag_attrs.borrow_mut()),
        });

        match self.process_token(token) {
            TokenSinkResult::Continue => ProcessResult::Continue,
            TokenSinkResult::Plaintext => {
                self.state.set(states::Plaintext);
                ProcessResult::Continue
            },
            TokenSinkResult::Script(node) => {
                self.state.set(states::Data);
                ProcessResult::Script(node)
            },
            TokenSinkResult::RawData(kind) => {
                self.state.set(states::RawData(kind));
                ProcessResult::Continue
            },
        }
    }
457
458 fn emit_temp_buf(&self) {
459 // FIXME: Make sure that clearing on emit is spec-compatible.
460 let buf = mem::take(&mut *self.temp_buf.borrow_mut());
461 self.emit_chars(buf);
462 }
463
    /// Empty the temporary buffer in place.
    fn clear_temp_buf(&self) {
        // Do this without a new allocation.
        self.temp_buf.borrow_mut().clear();
    }
468
469 fn emit_current_comment(&self) {
470 let comment = mem::take(&mut *self.current_comment.borrow_mut());
471 self.process_token_and_continue(CommentToken(comment));
472 }
473
474 fn discard_tag(&self) {
475 self.current_tag_name.borrow_mut().clear();
476 self.current_tag_self_closing.set(false);
477 *self.current_tag_attrs.borrow_mut() = vec![];
478 }
479
480 fn create_tag(&self, kind: TagKind, c: char) {
481 self.discard_tag();
482 self.current_tag_name.borrow_mut().push_char(c);
483 self.current_tag_kind.set(kind);
484 }
485
486 fn have_appropriate_end_tag(&self) -> bool {
487 match self.last_start_tag_name.borrow().as_ref() {
488 Some(last) => {
489 (self.current_tag_kind.get() == EndTag)
490 && (**self.current_tag_name.borrow() == **last)
491 },
492 None => false,
493 }
494 }
495
    /// Finish any attribute in progress and start a new one whose name
    /// begins with `c`.
    fn create_attribute(&self, c: char) {
        self.finish_attribute();

        self.current_attr_name.borrow_mut().push_char(c);
    }
501
    /// Finish the attribute currently being built, if any, appending it to
    /// the current tag. A duplicate name is reported as a parse error and
    /// the new attribute is dropped.
    fn finish_attribute(&self) {
        if self.current_attr_name.borrow().is_empty() {
            return;
        }

        // Check for a duplicate attribute.
        // FIXME: the spec says we should error as soon as the name is finished.
        let dup = {
            let name = &*self.current_attr_name.borrow();
            self.current_tag_attrs
                .borrow()
                .iter()
                .any(|a| *a.name.local == **name)
        };

        if dup {
            self.emit_error(Borrowed("Duplicate attribute"));
            self.current_attr_name.borrow_mut().clear();
            self.current_attr_value.borrow_mut().clear();
        } else {
            let name = LocalName::from(&**self.current_attr_name.borrow());
            self.current_attr_name.borrow_mut().clear();
            self.current_tag_attrs.borrow_mut().push(Attribute {
                // The tree builder will adjust the namespace if necessary.
                // This only happens in foreign elements.
                name: QualName::new(None, ns!(), name),
                value: mem::take(&mut self.current_attr_value.borrow_mut()),
            });
        }
    }
532
    /// Emit the doctype currently being built, resetting it to default.
    fn emit_current_doctype(&self) {
        let doctype = self.current_doctype.take();
        self.process_token_and_continue(DoctypeToken(doctype));
    }
537
    /// Mutably borrow the public or system identifier (selected by `kind`)
    /// of the doctype being built.
    fn doctype_id(&self, kind: DoctypeIdKind) -> RefMut<Option<StrTendril>> {
        let current_doctype = self.current_doctype.borrow_mut();
        match kind {
            Public => RefMut::map(current_doctype, |d| &mut d.public_id),
            System => RefMut::map(current_doctype, |d| &mut d.system_id),
        }
    }
545
546 fn clear_doctype_id(&self, kind: DoctypeIdKind) {
547 let mut id = self.doctype_id(kind);
548 match *id {
549 Some(ref mut s) => s.clear(),
550 None => *id = Some(StrTendril::new()),
551 }
552 }
553
    /// Begin tokenizing a character reference; the flag passed to the
    /// sub-tokenizer records whether we are inside an attribute value.
    fn consume_char_ref(&self) {
        *self.char_ref_tokenizer.borrow_mut() = Some(Box::new(CharRefTokenizer::new(matches!(
            self.state.get(),
            states::AttributeValue(_)
        ))));
    }
560
    /// Emit the end-of-file token.
    fn emit_eof(&self) {
        self.process_token_and_continue(EOFToken);
    }
564
565 fn peek(&self, input: &BufferQueue) -> Option<char> {
566 if self.reconsume.get() {
567 Some(self.current_char.get())
568 } else {
569 input.peek()
570 }
571 }
572
    /// Throw away the next raw input character.
    fn discard_char(&self, input: &BufferQueue) {
        // peek() deals in un-processed characters (no newline normalization), while get_char()
        // does.
        //
        // since discard_char is supposed to be used in combination with peek(), discard_char must
        // discard a single raw input character, not a normalized newline.
        if self.reconsume.get() {
            // The pending character is the one waiting to be reconsumed.
            self.reconsume.set(false);
        } else {
            input.next();
        }
    }
585
    /// Report a parse error to the sink as a `ParseError` token.
    fn emit_error(&self, error: Cow<'static, str>) {
        self.process_token_and_continue(ParseError(error));
    }
589}
//§ END
591
// Shorthand for common state machine behaviors.
//
// Each rule maps a short command name (used by the `go!` DSL below) onto a
// method call or field mutation on the tokenizer `$me`.
macro_rules! shorthand (
    ( $me:ident : emit $c:expr ) => ( $me.emit_char($c) );
    ( $me:ident : create_tag $kind:ident $c:expr ) => ( $me.create_tag($kind, $c) );
    ( $me:ident : push_tag $c:expr ) => ( $me.current_tag_name.borrow_mut().push_char($c) );
    ( $me:ident : discard_tag ) => ( $me.discard_tag() );
    ( $me:ident : discard_char $input:expr ) => ( $me.discard_char($input) );
    ( $me:ident : push_temp $c:expr ) => ( $me.temp_buf.borrow_mut().push_char($c) );
    ( $me:ident : emit_temp ) => ( $me.emit_temp_buf() );
    ( $me:ident : clear_temp ) => ( $me.clear_temp_buf() );
    ( $me:ident : create_attr $c:expr ) => ( $me.create_attribute($c) );
    ( $me:ident : push_name $c:expr ) => ( $me.current_attr_name.borrow_mut().push_char($c) );
    ( $me:ident : push_value $c:expr ) => ( $me.current_attr_value.borrow_mut().push_char($c) );
    ( $me:ident : append_value $c:expr ) => ( $me.current_attr_value.borrow_mut().push_tendril($c));
    ( $me:ident : push_comment $c:expr ) => ( $me.current_comment.borrow_mut().push_char($c) );
    ( $me:ident : append_comment $c:expr ) => ( $me.current_comment.borrow_mut().push_slice($c) );
    ( $me:ident : emit_comment ) => ( $me.emit_current_comment() );
    ( $me:ident : clear_comment ) => ( $me.current_comment.borrow_mut().clear() );
    ( $me:ident : create_doctype ) => ( *$me.current_doctype.borrow_mut() = Doctype::default() );
    ( $me:ident : push_doctype_name $c:expr ) => ( option_push(&mut $me.current_doctype.borrow_mut().name, $c) );
    ( $me:ident : push_doctype_id $k:ident $c:expr ) => ( option_push(&mut $me.doctype_id($k), $c) );
    ( $me:ident : clear_doctype_id $k:ident ) => ( $me.clear_doctype_id($k) );
    ( $me:ident : force_quirks ) => ( $me.current_doctype.borrow_mut().force_quirks = true);
    ( $me:ident : emit_doctype ) => ( $me.emit_current_doctype() );
    ( $me:ident : error ) => ( $me.bad_char_error() );
    ( $me:ident : error_eof ) => ( $me.bad_eof_error() );
);
619
// Tracing of tokenizer actions. This adds significant bloat and compile time,
// so it's behind a cfg flag.
//
// With the flag on, each shorthand command is logged via `trace!` before it
// runs; with it off, `sh_trace!` forwards straight to `shorthand!`.
#[cfg(feature = "trace_tokenizer")]
macro_rules! sh_trace ( ( $me:ident : $($cmds:tt)* ) => ({
    trace!("  {:?}", stringify!($($cmds)*));
    shorthand!($me : $($cmds)*);
}));

#[cfg(not(feature = "trace_tokenizer"))]
macro_rules! sh_trace ( ( $me:ident : $($cmds:tt)* ) => ( shorthand!($me: $($cmds)*) ) );
630
// A little DSL for sequencing shorthand actions.
//
// A `go!` invocation is a `;`-separated list of shorthand commands,
// optionally ending with a state transition (`to`, `reconsume`,
// `emit_tag`, `consume_char_ref`, `eof`) that early-returns a
// `ProcessResult` from the enclosing `step` function.
macro_rules! go (
    // A pattern like $($cmd:tt)* ; $($rest:tt)* causes parse ambiguity.
    // We have to tell the parser how much lookahead we need.

    ( $me:ident : $a:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a); go!($me: $($rest)*); });
    ( $me:ident : $a:tt $b:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b); go!($me: $($rest)*); });
    ( $me:ident : $a:tt $b:tt $c:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b $c); go!($me: $($rest)*); });
    ( $me:ident : $a:tt $b:tt $c:tt $d:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b $c $d); go!($me: $($rest)*); });

    // These can only come at the end.

    ( $me:ident : to $s:ident ) => ({ $me.state.set(states::$s); return ProcessResult::Continue; });
    ( $me:ident : to $s:ident $k1:expr ) => ({ $me.state.set(states::$s($k1)); return ProcessResult::Continue; });
    ( $me:ident : to $s:ident $k1:ident $k2:expr ) => ({ $me.state.set(states::$s($k1($k2))); return ProcessResult::Continue; });

    ( $me:ident : reconsume $s:ident ) => ({ $me.reconsume.set(true); go!($me: to $s); });
    ( $me:ident : reconsume $s:ident $k1:expr ) => ({ $me.reconsume.set(true); go!($me: to $s $k1); });
    ( $me:ident : reconsume $s:ident $k1:ident $k2:expr ) => ({ $me.reconsume.set(true); go!($me: to $s $k1 $k2); });

    ( $me:ident : consume_char_ref ) => ({ $me.consume_char_ref(); return ProcessResult::Continue; });

    // We have a default next state after emitting a tag, but the sink can override.
    ( $me:ident : emit_tag $s:ident ) => ({
        $me.state.set(states::$s);
        return $me.emit_current_tag();
    });

    ( $me:ident : eof ) => ({ $me.emit_eof(); return ProcessResult::Suspend; });

    // If nothing else matched, it's a single command
    ( $me:ident : $($cmd:tt)+ ) => ( sh_trace!($me: $($cmd)+) );

    // or nothing.
    ( $me:ident : ) => (());
);
667
// Run `$cmds` when `$x` matches any of the given patterns; otherwise do
// nothing.
macro_rules! go_match ( ( $me:ident : $x:expr, $($pats:pat),+ => $($cmds:tt)* ) => (
    match $x {
        $($pats)|+ => go!($me: $($cmds)*),
        _ => (),
    }
));
674
// This is a macro because it can cause early return
// from the function where it is used.
//
// Suspends the state machine (returns ProcessResult::Suspend) when no
// character is available.
macro_rules! get_char ( ($me:expr, $input:expr) => (
    unwrap_or_return!($me.get_char($input), ProcessResult::Suspend)
));
680
// Like `get_char!`, but peeks without consuming; suspends when no input.
macro_rules! peek ( ($me:expr, $input:expr) => (
    unwrap_or_return!($me.peek($input), ProcessResult::Suspend)
));
684
// Fast-path character-run pop; suspends when no input is available.
macro_rules! pop_except_from ( ($me:expr, $input:expr, $set:expr) => (
    unwrap_or_return!($me.pop_except_from($input, $set), ProcessResult::Suspend)
));
688
// ASCII case-insensitive lookahead match; suspends when undecidable.
macro_rules! eat ( ($me:expr, $input:expr, $pat:expr) => (
    unwrap_or_return!($me.eat($input, $pat, u8::eq_ignore_ascii_case), ProcessResult::Suspend)
));
692
// Case-sensitive lookahead match; suspends when undecidable.
macro_rules! eat_exact ( ($me:expr, $input:expr, $pat:expr) => (
    unwrap_or_return!($me.eat($input, $pat, u8::eq), ProcessResult::Suspend)
));
696
697impl<Sink: TokenSink> Tokenizer<Sink> {
698 // Run the state machine for a while.
699 // Return true if we should be immediately re-invoked
700 // (this just simplifies control flow vs. break / continue).
701 #[allow(clippy::never_loop)]
702 fn step(&self, input: &BufferQueue) -> ProcessResult<Sink::Handle> {
703 if self.char_ref_tokenizer.borrow().is_some() {
704 return self.step_char_ref_tokenizer(input);
705 }
706
707 trace!("processing in state {:?}", self.state);
708 match self.state.get() {
709 //ยง data-state
710 states::Data => loop {
711 match pop_except_from!(self, input, small_char_set!('\r' '\0' '&' '<' '\n')) {
712 FromSet('\0') => go!(self: error; emit '\0'),
713 FromSet('&') => go!(self: consume_char_ref),
714 FromSet('<') => go!(self: to TagOpen),
715 FromSet(c) => go!(self: emit c),
716 NotFromSet(b) => self.emit_chars(b),
717 }
718 },
719
720 //ยง rcdata-state
721 states::RawData(Rcdata) => loop {
722 match pop_except_from!(self, input, small_char_set!('\r' '\0' '&' '<' '\n')) {
723 FromSet('\0') => go!(self: error; emit '\u{fffd}'),
724 FromSet('&') => go!(self: consume_char_ref),
725 FromSet('<') => go!(self: to RawLessThanSign Rcdata),
726 FromSet(c) => go!(self: emit c),
727 NotFromSet(b) => self.emit_chars(b),
728 }
729 },
730
731 //ยง rawtext-state
732 states::RawData(Rawtext) => loop {
733 match pop_except_from!(self, input, small_char_set!('\r' '\0' '<' '\n')) {
734 FromSet('\0') => go!(self: error; emit '\u{fffd}'),
735 FromSet('<') => go!(self: to RawLessThanSign Rawtext),
736 FromSet(c) => go!(self: emit c),
737 NotFromSet(b) => self.emit_chars(b),
738 }
739 },
740
741 //ยง script-data-state
742 states::RawData(ScriptData) => loop {
743 match pop_except_from!(self, input, small_char_set!('\r' '\0' '<' '\n')) {
744 FromSet('\0') => go!(self: error; emit '\u{fffd}'),
745 FromSet('<') => go!(self: to RawLessThanSign ScriptData),
746 FromSet(c) => go!(self: emit c),
747 NotFromSet(b) => self.emit_chars(b),
748 }
749 },
750
751 //ยง script-data-escaped-state
752 states::RawData(ScriptDataEscaped(Escaped)) => loop {
753 match pop_except_from!(self, input, small_char_set!('\r' '\0' '-' '<' '\n')) {
754 FromSet('\0') => go!(self: error; emit '\u{fffd}'),
755 FromSet('-') => go!(self: emit '-'; to ScriptDataEscapedDash Escaped),
756 FromSet('<') => go!(self: to RawLessThanSign ScriptDataEscaped Escaped),
757 FromSet(c) => go!(self: emit c),
758 NotFromSet(b) => self.emit_chars(b),
759 }
760 },
761
762 //ยง script-data-double-escaped-state
763 states::RawData(ScriptDataEscaped(DoubleEscaped)) => loop {
764 match pop_except_from!(self, input, small_char_set!('\r' '\0' '-' '<' '\n')) {
765 FromSet('\0') => go!(self: error; emit '\u{fffd}'),
766 FromSet('-') => go!(self: emit '-'; to ScriptDataEscapedDash DoubleEscaped),
767 FromSet('<') => {
768 go!(self: emit '<'; to RawLessThanSign ScriptDataEscaped DoubleEscaped)
769 },
770 FromSet(c) => go!(self: emit c),
771 NotFromSet(b) => self.emit_chars(b),
772 }
773 },
774
775 //ยง plaintext-state
776 states::Plaintext => loop {
777 match pop_except_from!(self, input, small_char_set!('\r' '\0' '\n')) {
778 FromSet('\0') => go!(self: error; emit '\u{fffd}'),
779 FromSet(c) => go!(self: emit c),
780 NotFromSet(b) => self.emit_chars(b),
781 }
782 },
783
784 //ยง tag-open-state
785 states::TagOpen => loop {
786 match get_char!(self, input) {
787 '!' => go!(self: to MarkupDeclarationOpen),
788 '/' => go!(self: to EndTagOpen),
789 '?' => go!(self: error; clear_comment; reconsume BogusComment),
790 c => match lower_ascii_letter(c) {
791 Some(cl) => go!(self: create_tag StartTag cl; to TagName),
792 None => go!(self: error; emit '<'; reconsume Data),
793 },
794 }
795 },
796
797 //ยง end-tag-open-state
798 states::EndTagOpen => loop {
799 match get_char!(self, input) {
800 '>' => go!(self: error; to Data),
801 c => match lower_ascii_letter(c) {
802 Some(cl) => go!(self: create_tag EndTag cl; to TagName),
803 None => go!(self: error; clear_comment; reconsume BogusComment),
804 },
805 }
806 },
807
808 //ยง tag-name-state
809 states::TagName => loop {
810 match get_char!(self, input) {
811 '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeAttributeName),
812 '/' => go!(self: to SelfClosingStartTag),
813 '>' => go!(self: emit_tag Data),
814 '\0' => go!(self: error; push_tag '\u{fffd}'),
815 c => go!(self: push_tag (c.to_ascii_lowercase())),
816 }
817 },
818
819 //ยง script-data-escaped-less-than-sign-state
820 states::RawLessThanSign(ScriptDataEscaped(Escaped)) => loop {
821 match get_char!(self, input) {
822 '/' => go!(self: clear_temp; to RawEndTagOpen ScriptDataEscaped Escaped),
823 c => match lower_ascii_letter(c) {
824 Some(cl) => go!(self: clear_temp; push_temp cl; emit '<'; emit c;
825 to ScriptDataEscapeStart DoubleEscaped),
826 None => go!(self: emit '<'; reconsume RawData ScriptDataEscaped Escaped),
827 },
828 }
829 },
830
831 //ยง script-data-double-escaped-less-than-sign-state
832 states::RawLessThanSign(ScriptDataEscaped(DoubleEscaped)) => loop {
833 match get_char!(self, input) {
834 '/' => go!(self: clear_temp; emit '/'; to ScriptDataDoubleEscapeEnd),
835 _ => go!(self: reconsume RawData ScriptDataEscaped DoubleEscaped),
836 }
837 },
838
839 //ยง rcdata-less-than-sign-state rawtext-less-than-sign-state script-data-less-than-sign-state
840 // otherwise
841 states::RawLessThanSign(kind) => loop {
842 match get_char!(self, input) {
843 '/' => go!(self: clear_temp; to RawEndTagOpen kind),
844 '!' if kind == ScriptData => {
845 go!(self: emit '<'; emit '!'; to ScriptDataEscapeStart Escaped)
846 },
847 _ => go!(self: emit '<'; reconsume RawData kind),
848 }
849 },
850
851 //ยง rcdata-end-tag-open-state rawtext-end-tag-open-state script-data-end-tag-open-state script-data-escaped-end-tag-open-state
852 states::RawEndTagOpen(kind) => loop {
853 let c = get_char!(self, input);
854 match lower_ascii_letter(c) {
855 Some(cl) => go!(self: create_tag EndTag cl; push_temp c; to RawEndTagName kind),
856 None => go!(self: emit '<'; emit '/'; reconsume RawData kind),
857 }
858 },
859
860 //ยง rcdata-end-tag-name-state rawtext-end-tag-name-state script-data-end-tag-name-state script-data-escaped-end-tag-name-state
861 states::RawEndTagName(kind) => loop {
862 let c = get_char!(self, input);
863 if self.have_appropriate_end_tag() {
864 match c {
865 '\t' | '\n' | '\x0C' | ' ' => go!(self: clear_temp; to BeforeAttributeName),
866 '/' => go!(self: clear_temp; to SelfClosingStartTag),
867 '>' => go!(self: clear_temp; emit_tag Data),
868 _ => (),
869 }
870 }
871
872 match lower_ascii_letter(c) {
873 Some(cl) => go!(self: push_tag cl; push_temp c),
874 None => {
875 go!(self: discard_tag; emit '<'; emit '/'; emit_temp; reconsume RawData kind)
876 },
877 }
878 },
879
880 //ยง script-data-double-escape-start-state
881 states::ScriptDataEscapeStart(DoubleEscaped) => loop {
882 let c = get_char!(self, input);
883 match c {
884 '\t' | '\n' | '\x0C' | ' ' | '/' | '>' => {
885 let esc = if &**self.temp_buf.borrow() == "script" {
886 DoubleEscaped
887 } else {
888 Escaped
889 };
890 go!(self: emit c; to RawData ScriptDataEscaped esc);
891 },
892 _ => match lower_ascii_letter(c) {
893 Some(cl) => go!(self: push_temp cl; emit c),
894 None => go!(self: reconsume RawData ScriptDataEscaped Escaped),
895 },
896 }
897 },
898
899 //ยง script-data-escape-start-state
900 states::ScriptDataEscapeStart(Escaped) => loop {
901 match get_char!(self, input) {
902 '-' => go!(self: emit '-'; to ScriptDataEscapeStartDash),
903 _ => go!(self: reconsume RawData ScriptData),
904 }
905 },
906
907 //ยง script-data-escape-start-dash-state
908 states::ScriptDataEscapeStartDash => loop {
909 match get_char!(self, input) {
910 '-' => go!(self: emit '-'; to ScriptDataEscapedDashDash Escaped),
911 _ => go!(self: reconsume RawData ScriptData),
912 }
913 },
914
915 //ยง script-data-escaped-dash-state script-data-double-escaped-dash-state
916 states::ScriptDataEscapedDash(kind) => loop {
917 match get_char!(self, input) {
918 '-' => go!(self: emit '-'; to ScriptDataEscapedDashDash kind),
919 '<' => {
920 if kind == DoubleEscaped {
921 go!(self: emit '<');
922 }
923 go!(self: to RawLessThanSign ScriptDataEscaped kind);
924 },
925 '\0' => go!(self: error; emit '\u{fffd}'; to RawData ScriptDataEscaped kind),
926 c => go!(self: emit c; to RawData ScriptDataEscaped kind),
927 }
928 },
929
930 //ยง script-data-escaped-dash-dash-state script-data-double-escaped-dash-dash-state
931 states::ScriptDataEscapedDashDash(kind) => loop {
932 match get_char!(self, input) {
933 '-' => go!(self: emit '-'),
934 '<' => {
935 if kind == DoubleEscaped {
936 go!(self: emit '<');
937 }
938 go!(self: to RawLessThanSign ScriptDataEscaped kind);
939 },
940 '>' => go!(self: emit '>'; to RawData ScriptData),
941 '\0' => go!(self: error; emit '\u{fffd}'; to RawData ScriptDataEscaped kind),
942 c => go!(self: emit c; to RawData ScriptDataEscaped kind),
943 }
944 },
945
946 //ยง script-data-double-escape-end-state
947 states::ScriptDataDoubleEscapeEnd => loop {
948 let c = get_char!(self, input);
949 match c {
950 '\t' | '\n' | '\x0C' | ' ' | '/' | '>' => {
951 let esc = if &**self.temp_buf.borrow() == "script" {
952 Escaped
953 } else {
954 DoubleEscaped
955 };
956 go!(self: emit c; to RawData ScriptDataEscaped esc);
957 },
958 _ => match lower_ascii_letter(c) {
959 Some(cl) => go!(self: push_temp cl; emit c),
960 None => go!(self: reconsume RawData ScriptDataEscaped DoubleEscaped),
961 },
962 }
963 },
964
965 //ยง before-attribute-name-state
966 states::BeforeAttributeName => loop {
967 match get_char!(self, input) {
968 '\t' | '\n' | '\x0C' | ' ' => (),
969 '/' => go!(self: to SelfClosingStartTag),
970 '>' => go!(self: emit_tag Data),
971 '\0' => go!(self: error; create_attr '\u{fffd}'; to AttributeName),
972 c => match lower_ascii_letter(c) {
973 Some(cl) => go!(self: create_attr cl; to AttributeName),
974 None => {
975 go_match!(self: c,
976 '"' , '\'' , '<' , '=' => error);
977 go!(self: create_attr c; to AttributeName);
978 },
979 },
980 }
981 },
982
983 //ยง attribute-name-state
984 states::AttributeName => loop {
985 match get_char!(self, input) {
986 '\t' | '\n' | '\x0C' | ' ' => go!(self: to AfterAttributeName),
987 '/' => go!(self: to SelfClosingStartTag),
988 '=' => go!(self: to BeforeAttributeValue),
989 '>' => go!(self: emit_tag Data),
990 '\0' => go!(self: error; push_name '\u{fffd}'),
991 c => match lower_ascii_letter(c) {
992 Some(cl) => go!(self: push_name cl),
993 None => {
994 go_match!(self: c,
995 '"' , '\'' , '<' => error);
996 go!(self: push_name c);
997 },
998 },
999 }
1000 },
1001
1002 //ยง after-attribute-name-state
1003 states::AfterAttributeName => loop {
1004 match get_char!(self, input) {
1005 '\t' | '\n' | '\x0C' | ' ' => (),
1006 '/' => go!(self: to SelfClosingStartTag),
1007 '=' => go!(self: to BeforeAttributeValue),
1008 '>' => go!(self: emit_tag Data),
1009 '\0' => go!(self: error; create_attr '\u{fffd}'; to AttributeName),
1010 c => match lower_ascii_letter(c) {
1011 Some(cl) => go!(self: create_attr cl; to AttributeName),
1012 None => {
1013 go_match!(self: c,
1014 '"' , '\'' , '<' => error);
1015 go!(self: create_attr c; to AttributeName);
1016 },
1017 },
1018 }
1019 },
1020
1021 //ยง before-attribute-value-state
1022 // Use peek so we can handle the first attr character along with the rest,
1023 // hopefully in the same zero-copy buffer.
1024 states::BeforeAttributeValue => loop {
1025 match peek!(self, input) {
1026 '\t' | '\n' | '\r' | '\x0C' | ' ' => go!(self: discard_char input),
1027 '"' => go!(self: discard_char input; to AttributeValue DoubleQuoted),
1028 '\'' => go!(self: discard_char input; to AttributeValue SingleQuoted),
1029 '>' => go!(self: discard_char input; error; emit_tag Data),
1030 _ => go!(self: to AttributeValue Unquoted),
1031 }
1032 },
1033
1034 //ยง attribute-value-(double-quoted)-state
1035 states::AttributeValue(DoubleQuoted) => loop {
1036 match pop_except_from!(self, input, small_char_set!('\r' '"' '&' '\0' '\n')) {
1037 FromSet('"') => go!(self: to AfterAttributeValueQuoted),
1038 FromSet('&') => go!(self: consume_char_ref),
1039 FromSet('\0') => go!(self: error; push_value '\u{fffd}'),
1040 FromSet(c) => go!(self: push_value c),
1041 NotFromSet(ref b) => go!(self: append_value b),
1042 }
1043 },
1044
1045 //ยง attribute-value-(single-quoted)-state
1046 states::AttributeValue(SingleQuoted) => loop {
1047 match pop_except_from!(self, input, small_char_set!('\r' '\'' '&' '\0' '\n')) {
1048 FromSet('\'') => go!(self: to AfterAttributeValueQuoted),
1049 FromSet('&') => go!(self: consume_char_ref),
1050 FromSet('\0') => go!(self: error; push_value '\u{fffd}'),
1051 FromSet(c) => go!(self: push_value c),
1052 NotFromSet(ref b) => go!(self: append_value b),
1053 }
1054 },
1055
1056 //ยง attribute-value-(unquoted)-state
1057 states::AttributeValue(Unquoted) => loop {
1058 match pop_except_from!(
1059 self,
1060 input,
1061 small_char_set!('\r' '\t' '\n' '\x0C' ' ' '&' '>' '\0')
1062 ) {
1063 FromSet('\t') | FromSet('\n') | FromSet('\x0C') | FromSet(' ') => {
1064 go!(self: to BeforeAttributeName)
1065 },
1066 FromSet('&') => go!(self: consume_char_ref),
1067 FromSet('>') => go!(self: emit_tag Data),
1068 FromSet('\0') => go!(self: error; push_value '\u{fffd}'),
1069 FromSet(c) => {
1070 go_match!(self: c,
1071 '"' , '\'' , '<' , '=' , '`' => error);
1072 go!(self: push_value c);
1073 },
1074 NotFromSet(ref b) => go!(self: append_value b),
1075 }
1076 },
1077
1078 //ยง after-attribute-value-(quoted)-state
1079 states::AfterAttributeValueQuoted => loop {
1080 match get_char!(self, input) {
1081 '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeAttributeName),
1082 '/' => go!(self: to SelfClosingStartTag),
1083 '>' => go!(self: emit_tag Data),
1084 _ => go!(self: error; reconsume BeforeAttributeName),
1085 }
1086 },
1087
1088 //ยง self-closing-start-tag-state
1089 states::SelfClosingStartTag => loop {
1090 match get_char!(self, input) {
1091 '>' => {
1092 self.current_tag_self_closing.set(true);
1093 go!(self: emit_tag Data);
1094 },
1095 _ => go!(self: error; reconsume BeforeAttributeName),
1096 }
1097 },
1098
1099 //ยง comment-start-state
1100 states::CommentStart => loop {
1101 match get_char!(self, input) {
1102 '-' => go!(self: to CommentStartDash),
1103 '\0' => go!(self: error; push_comment '\u{fffd}'; to Comment),
1104 '>' => go!(self: error; emit_comment; to Data),
1105 c => go!(self: push_comment c; to Comment),
1106 }
1107 },
1108
1109 //ยง comment-start-dash-state
1110 states::CommentStartDash => loop {
1111 match get_char!(self, input) {
1112 '-' => go!(self: to CommentEnd),
1113 '\0' => go!(self: error; append_comment "-\u{fffd}"; to Comment),
1114 '>' => go!(self: error; emit_comment; to Data),
1115 c => go!(self: push_comment '-'; push_comment c; to Comment),
1116 }
1117 },
1118
1119 //ยง comment-state
1120 states::Comment => loop {
1121 match get_char!(self, input) {
1122 c @ '<' => go!(self: push_comment c; to CommentLessThanSign),
1123 '-' => go!(self: to CommentEndDash),
1124 '\0' => go!(self: error; push_comment '\u{fffd}'),
1125 c => go!(self: push_comment c),
1126 }
1127 },
1128
1129 //ยง comment-less-than-sign-state
1130 states::CommentLessThanSign => loop {
1131 match get_char!(self, input) {
1132 c @ '!' => go!(self: push_comment c; to CommentLessThanSignBang),
1133 c @ '<' => go!(self: push_comment c),
1134 _ => go!(self: reconsume Comment),
1135 }
1136 },
1137
1138 //ยง comment-less-than-sign-bang
1139 states::CommentLessThanSignBang => loop {
1140 match get_char!(self, input) {
1141 '-' => go!(self: to CommentLessThanSignBangDash),
1142 _ => go!(self: reconsume Comment),
1143 }
1144 },
1145
1146 //ยง comment-less-than-sign-bang-dash
1147 states::CommentLessThanSignBangDash => loop {
1148 match get_char!(self, input) {
1149 '-' => go!(self: to CommentLessThanSignBangDashDash),
1150 _ => go!(self: reconsume CommentEndDash),
1151 }
1152 },
1153
1154 //ยง comment-less-than-sign-bang-dash-dash
1155 states::CommentLessThanSignBangDashDash => loop {
1156 match get_char!(self, input) {
1157 '>' => go!(self: reconsume CommentEnd),
1158 _ => go!(self: error; reconsume CommentEnd),
1159 }
1160 },
1161
1162 //ยง comment-end-dash-state
1163 states::CommentEndDash => loop {
1164 match get_char!(self, input) {
1165 '-' => go!(self: to CommentEnd),
1166 '\0' => go!(self: error; append_comment "-\u{fffd}"; to Comment),
1167 c => go!(self: push_comment '-'; push_comment c; to Comment),
1168 }
1169 },
1170
1171 //ยง comment-end-state
1172 states::CommentEnd => loop {
1173 match get_char!(self, input) {
1174 '>' => go!(self: emit_comment; to Data),
1175 '!' => go!(self: to CommentEndBang),
1176 '-' => go!(self: push_comment '-'),
1177 _ => go!(self: append_comment "--"; reconsume Comment),
1178 }
1179 },
1180
1181 //ยง comment-end-bang-state
1182 states::CommentEndBang => loop {
1183 match get_char!(self, input) {
1184 '-' => go!(self: append_comment "--!"; to CommentEndDash),
1185 '>' => go!(self: error; emit_comment; to Data),
1186 '\0' => go!(self: error; append_comment "--!\u{fffd}"; to Comment),
1187 c => go!(self: append_comment "--!"; push_comment c; to Comment),
1188 }
1189 },
1190
1191 //ยง doctype-state
1192 states::Doctype => loop {
1193 match get_char!(self, input) {
1194 '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeDoctypeName),
1195 '>' => go!(self: reconsume BeforeDoctypeName),
1196 _ => go!(self: error; reconsume BeforeDoctypeName),
1197 }
1198 },
1199
1200 //ยง before-doctype-name-state
1201 states::BeforeDoctypeName => loop {
1202 match get_char!(self, input) {
1203 '\t' | '\n' | '\x0C' | ' ' => (),
1204 '\0' => {
1205 go!(self: error; create_doctype; push_doctype_name '\u{fffd}'; to DoctypeName)
1206 },
1207 '>' => go!(self: error; create_doctype; force_quirks; emit_doctype; to Data),
1208 c => go!(self: create_doctype; push_doctype_name (c.to_ascii_lowercase());
1209 to DoctypeName),
1210 }
1211 },
1212
1213 //ยง doctype-name-state
1214 states::DoctypeName => loop {
1215 match get_char!(self, input) {
1216 '\t' | '\n' | '\x0C' | ' ' => go!(self: clear_temp; to AfterDoctypeName),
1217 '>' => go!(self: emit_doctype; to Data),
1218 '\0' => go!(self: error; push_doctype_name '\u{fffd}'),
1219 c => go!(self: push_doctype_name (c.to_ascii_lowercase())),
1220 }
1221 },
1222
1223 //ยง after-doctype-name-state
1224 states::AfterDoctypeName => loop {
1225 if eat!(self, input, "public") {
1226 go!(self: to AfterDoctypeKeyword Public);
1227 } else if eat!(self, input, "system") {
1228 go!(self: to AfterDoctypeKeyword System);
1229 } else {
1230 match get_char!(self, input) {
1231 '\t' | '\n' | '\x0C' | ' ' => (),
1232 '>' => go!(self: emit_doctype; to Data),
1233 _ => go!(self: error; force_quirks; reconsume BogusDoctype),
1234 }
1235 }
1236 },
1237
1238 //ยง after-doctype-public-keyword-state after-doctype-system-keyword-state
1239 states::AfterDoctypeKeyword(kind) => loop {
1240 match get_char!(self, input) {
1241 '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeDoctypeIdentifier kind),
1242 '"' => {
1243 go!(self: error; clear_doctype_id kind; to DoctypeIdentifierDoubleQuoted kind)
1244 },
1245 '\'' => {
1246 go!(self: error; clear_doctype_id kind; to DoctypeIdentifierSingleQuoted kind)
1247 },
1248 '>' => go!(self: error; force_quirks; emit_doctype; to Data),
1249 _ => go!(self: error; force_quirks; reconsume BogusDoctype),
1250 }
1251 },
1252
1253 //ยง before-doctype-public-identifier-state before-doctype-system-identifier-state
1254 states::BeforeDoctypeIdentifier(kind) => loop {
1255 match get_char!(self, input) {
1256 '\t' | '\n' | '\x0C' | ' ' => (),
1257 '"' => go!(self: clear_doctype_id kind; to DoctypeIdentifierDoubleQuoted kind),
1258 '\'' => go!(self: clear_doctype_id kind; to DoctypeIdentifierSingleQuoted kind),
1259 '>' => go!(self: error; force_quirks; emit_doctype; to Data),
1260 _ => go!(self: error; force_quirks; reconsume BogusDoctype),
1261 }
1262 },
1263
1264 //ยง doctype-public-identifier-(double-quoted)-state doctype-system-identifier-(double-quoted)-state
1265 states::DoctypeIdentifierDoubleQuoted(kind) => loop {
1266 match get_char!(self, input) {
1267 '"' => go!(self: to AfterDoctypeIdentifier kind),
1268 '\0' => go!(self: error; push_doctype_id kind '\u{fffd}'),
1269 '>' => go!(self: error; force_quirks; emit_doctype; to Data),
1270 c => go!(self: push_doctype_id kind c),
1271 }
1272 },
1273
1274 //ยง doctype-public-identifier-(single-quoted)-state doctype-system-identifier-(single-quoted)-state
1275 states::DoctypeIdentifierSingleQuoted(kind) => loop {
1276 match get_char!(self, input) {
1277 '\'' => go!(self: to AfterDoctypeIdentifier kind),
1278 '\0' => go!(self: error; push_doctype_id kind '\u{fffd}'),
1279 '>' => go!(self: error; force_quirks; emit_doctype; to Data),
1280 c => go!(self: push_doctype_id kind c),
1281 }
1282 },
1283
1284 //ยง after-doctype-public-identifier-state
1285 states::AfterDoctypeIdentifier(Public) => loop {
1286 match get_char!(self, input) {
1287 '\t' | '\n' | '\x0C' | ' ' => {
1288 go!(self: to BetweenDoctypePublicAndSystemIdentifiers)
1289 },
1290 '>' => go!(self: emit_doctype; to Data),
1291 '"' => {
1292 go!(self: error; clear_doctype_id System; to DoctypeIdentifierDoubleQuoted System)
1293 },
1294 '\'' => {
1295 go!(self: error; clear_doctype_id System; to DoctypeIdentifierSingleQuoted System)
1296 },
1297 _ => go!(self: error; force_quirks; reconsume BogusDoctype),
1298 }
1299 },
1300
1301 //ยง after-doctype-system-identifier-state
1302 states::AfterDoctypeIdentifier(System) => loop {
1303 match get_char!(self, input) {
1304 '\t' | '\n' | '\x0C' | ' ' => (),
1305 '>' => go!(self: emit_doctype; to Data),
1306 _ => go!(self: error; reconsume BogusDoctype),
1307 }
1308 },
1309
1310 //ยง between-doctype-public-and-system-identifiers-state
1311 states::BetweenDoctypePublicAndSystemIdentifiers => loop {
1312 match get_char!(self, input) {
1313 '\t' | '\n' | '\x0C' | ' ' => (),
1314 '>' => go!(self: emit_doctype; to Data),
1315 '"' => {
1316 go!(self: clear_doctype_id System; to DoctypeIdentifierDoubleQuoted System)
1317 },
1318 '\'' => {
1319 go!(self: clear_doctype_id System; to DoctypeIdentifierSingleQuoted System)
1320 },
1321 _ => go!(self: error; force_quirks; reconsume BogusDoctype),
1322 }
1323 },
1324
1325 //ยง bogus-doctype-state
1326 states::BogusDoctype => loop {
1327 match get_char!(self, input) {
1328 '>' => go!(self: emit_doctype; to Data),
1329 '\0' => go!(self: error),
1330 _ => (),
1331 }
1332 },
1333
1334 //ยง bogus-comment-state
1335 states::BogusComment => loop {
1336 match get_char!(self, input) {
1337 '>' => go!(self: emit_comment; to Data),
1338 '\0' => go!(self: error; push_comment '\u{fffd}'),
1339 c => go!(self: push_comment c),
1340 }
1341 },
1342
1343 //ยง markup-declaration-open-state
1344 states::MarkupDeclarationOpen => loop {
1345 if eat_exact!(self, input, "--") {
1346 go!(self: clear_comment; to CommentStart);
1347 } else if eat!(self, input, "doctype") {
1348 go!(self: to Doctype);
1349 } else {
1350 if self
1351 .sink
1352 .adjusted_current_node_present_but_not_in_html_namespace()
1353 && eat_exact!(self, input, "[CDATA[")
1354 {
1355 go!(self: clear_temp; to CdataSection);
1356 }
1357 go!(self: error; clear_comment; to BogusComment);
1358 }
1359 },
1360
1361 //ยง cdata-section-state
1362 states::CdataSection => loop {
1363 match get_char!(self, input) {
1364 ']' => go!(self: to CdataSectionBracket),
1365 '\0' => go!(self: emit_temp; emit '\0'),
1366 c => go!(self: push_temp c),
1367 }
1368 },
1369
1370 //ยง cdata-section-bracket
1371 states::CdataSectionBracket => match get_char!(self, input) {
1372 ']' => go!(self: to CdataSectionEnd),
1373 _ => go!(self: push_temp ']'; reconsume CdataSection),
1374 },
1375
1376 //ยง cdata-section-end
1377 states::CdataSectionEnd => loop {
1378 match get_char!(self, input) {
1379 ']' => go!(self: push_temp ']'),
1380 '>' => go!(self: emit_temp; to Data),
1381 _ => go!(self: push_temp ']'; push_temp ']'; reconsume CdataSection),
1382 }
1383 },
1384 //ยง END
1385 }
1386 }
1387
1388 fn step_char_ref_tokenizer(&self, input: &BufferQueue) -> ProcessResult<Sink::Handle> {
1389 // FIXME HACK: Take and replace the tokenizer so we don't
1390 // double-mut-borrow self. This is why it's boxed.
1391 let mut tok = self.char_ref_tokenizer.take().unwrap();
1392 let outcome = tok.step(self, input);
1393
1394 let progress = match outcome {
1395 char_ref::Done => {
1396 self.process_char_ref(tok.get_result());
1397 return ProcessResult::Continue;
1398 },
1399
1400 char_ref::Stuck => ProcessResult::Suspend,
1401 char_ref::Progress => ProcessResult::Continue,
1402 };
1403
1404 *self.char_ref_tokenizer.borrow_mut() = Some(tok);
1405 progress
1406 }
1407
1408 fn process_char_ref(&self, char_ref: CharRef) {
1409 let CharRef {
1410 mut chars,
1411 mut num_chars,
1412 } = char_ref;
1413
1414 if num_chars == 0 {
1415 chars[0] = '&';
1416 num_chars = 1;
1417 }
1418
1419 for i in 0..num_chars {
1420 let c = chars[i as usize];
1421 match self.state.get() {
1422 states::Data | states::RawData(states::Rcdata) => go!(self: emit c),
1423
1424 states::AttributeValue(_) => go!(self: push_value c),
1425
1426 _ => panic!(
1427 "state {:?} should not be reachable in process_char_ref",
1428 self.state.get()
1429 ),
1430 }
1431 }
1432 }
1433
1434 /// Indicate that we have reached the end of the input.
1435 pub fn end(&self) {
1436 // Handle EOF in the char ref sub-tokenizer, if there is one.
1437 // Do this first because it might un-consume stuff.
1438 let input = BufferQueue::default();
1439 match self.char_ref_tokenizer.take() {
1440 None => (),
1441 Some(mut tok) => {
1442 tok.end_of_file(self, &input);
1443 self.process_char_ref(tok.get_result());
1444 },
1445 }
1446
1447 // Process all remaining buffered input.
1448 // If we're waiting for lookahead, we're not gonna get it.
1449 self.at_eof.set(true);
1450 assert!(matches!(self.run(&input), TokenizerResult::Done));
1451 assert!(input.is_empty());
1452
1453 loop {
1454 match self.eof_step() {
1455 ProcessResult::Continue => (),
1456 ProcessResult::Suspend => break,
1457 ProcessResult::Script(_) => unreachable!(),
1458 }
1459 }
1460
1461 self.sink.end();
1462
1463 if self.opts.profile {
1464 self.dump_profile();
1465 }
1466 }
1467
1468 fn dump_profile(&self) {
1469 let mut results: Vec<(states::State, u64)> = self
1470 .state_profile
1471 .borrow()
1472 .iter()
1473 .map(|(s, t)| (*s, *t))
1474 .collect();
1475 results.sort_by(|&(_, x), &(_, y)| y.cmp(&x));
1476
1477 let total: u64 = results
1478 .iter()
1479 .map(|&(_, t)| t)
1480 .fold(0, ::std::ops::Add::add);
1481 println!("\nTokenizer profile, in nanoseconds");
1482 println!(
1483 "\n{:12} total in token sink",
1484 self.time_in_sink.get()
1485 );
1486 println!("\n{total:12} total in tokenizer");
1487
1488 for (k, v) in results.into_iter() {
1489 let pct = 100.0 * (v as f64) / (total as f64);
1490 println!("{v:12} {pct:4.1}% {k:?}");
1491 }
1492 }
1493
    /// Take a single step of the state machine once the input is exhausted.
    ///
    /// Each arm implements the end-of-file behaviour of the corresponding
    /// tokenizer state: emit whatever partially-built construct is pending
    /// (often alongside a parse error) and head toward the Data state, or
    /// reconsume in another state so that state's EOF rule applies.
    fn eof_step(&self) -> ProcessResult<Sink::Handle> {
        debug!("processing EOF in state {:?}", self.state.get());
        match self.state.get() {
            // Plain text-emitting states: nothing is buffered.
            states::Data
            | states::RawData(Rcdata)
            | states::RawData(Rawtext)
            | states::RawData(ScriptData)
            | states::Plaintext => go!(self: eof),

            // States inside a tag: EOF is a parse error; drop back to Data.
            states::TagName
            | states::RawData(ScriptDataEscaped(_))
            | states::BeforeAttributeName
            | states::AttributeName
            | states::AfterAttributeName
            | states::AttributeValue(_)
            | states::AfterAttributeValueQuoted
            | states::SelfClosingStartTag
            | states::ScriptDataEscapedDash(_)
            | states::ScriptDataEscapedDashDash(_) => go!(self: error_eof; to Data),

            states::BeforeAttributeValue => go!(self: reconsume AttributeValue Unquoted),

            // A lone '<' (and '</') already consumed is emitted as character data.
            states::TagOpen => go!(self: error_eof; emit '<'; to Data),

            states::EndTagOpen => go!(self: error_eof; emit '<'; emit '/'; to Data),

            states::RawLessThanSign(ScriptDataEscaped(DoubleEscaped)) => {
                go!(self: to RawData ScriptDataEscaped DoubleEscaped)
            },

            states::RawLessThanSign(kind) => go!(self: emit '<'; to RawData kind),

            states::RawEndTagOpen(kind) => go!(self: emit '<'; emit '/'; to RawData kind),

            // Flush the partially-collected end-tag name back out as raw text.
            states::RawEndTagName(kind) => {
                go!(self: emit '<'; emit '/'; emit_temp; to RawData kind)
            },

            states::ScriptDataEscapeStart(kind) => go!(self: to RawData ScriptDataEscaped kind),

            states::ScriptDataEscapeStartDash => go!(self: to RawData ScriptData),

            states::ScriptDataDoubleEscapeEnd => {
                go!(self: to RawData ScriptDataEscaped DoubleEscaped)
            },

            // Unterminated comment: report the error and emit what was collected.
            states::CommentStart
            | states::CommentStartDash
            | states::Comment
            | states::CommentEndDash
            | states::CommentEnd
            | states::CommentEndBang => go!(self: error_eof; emit_comment; to Data),

            states::CommentLessThanSign | states::CommentLessThanSignBang => {
                go!(self: reconsume Comment)
            },

            states::CommentLessThanSignBangDash => go!(self: reconsume CommentEndDash),

            states::CommentLessThanSignBangDashDash => go!(self: reconsume CommentEnd),

            // Unterminated doctype: emit it with force-quirks set.
            states::Doctype | states::BeforeDoctypeName => {
                go!(self: error_eof; create_doctype; force_quirks; emit_doctype; to Data)
            },

            states::DoctypeName
            | states::AfterDoctypeName
            | states::AfterDoctypeKeyword(_)
            | states::BeforeDoctypeIdentifier(_)
            | states::DoctypeIdentifierDoubleQuoted(_)
            | states::DoctypeIdentifierSingleQuoted(_)
            | states::AfterDoctypeIdentifier(_)
            | states::BetweenDoctypePublicAndSystemIdentifiers => {
                go!(self: error_eof; force_quirks; emit_doctype; to Data)
            },

            states::BogusDoctype => go!(self: emit_doctype; to Data),

            states::BogusComment => go!(self: emit_comment; to Data),

            states::MarkupDeclarationOpen => go!(self: error; to BogusComment),

            // Unterminated CDATA section: flush buffered characters first.
            states::CdataSection => go!(self: emit_temp; error_eof; to Data),

            states::CdataSectionBracket => go!(self: push_temp ']'; to CdataSection),

            states::CdataSectionEnd => go!(self: push_temp ']'; push_temp ']'; to CdataSection),
        }
    }
1583}
1584
#[cfg(test)]
#[allow(non_snake_case)]
mod test {
    use super::option_push; // private items
    use crate::tendril::{SliceExt, StrTendril};

    use super::{TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts};

    use super::interface::{CharacterTokens, EOFToken, NullCharacterToken, ParseError};
    use super::interface::{EndTag, StartTag, Tag, TagKind};
    use super::interface::{TagToken, Token};

    use markup5ever::buffer_queue::BufferQueue;
    use std::cell::RefCell;

    use crate::LocalName;

    // LinesMatch implements the TokenSink trait. It is used for testing to see
    // if current_line is being updated when process_token is called. The lines
    // vector is a collection of the line numbers that each token is on.
    struct LinesMatch {
        tokens: RefCell<Vec<Token>>,
        current_str: RefCell<StrTendril>,
        lines: RefCell<Vec<(Token, u64)>>,
    }

    impl LinesMatch {
        fn new() -> LinesMatch {
            LinesMatch {
                tokens: RefCell::new(vec![]),
                current_str: RefCell::new(StrTendril::new()),
                lines: RefCell::new(vec![]),
            }
        }

        // Record a finished token together with the line it was seen on.
        fn push(&self, token: Token, line_number: u64) {
            self.finish_str();
            self.lines.borrow_mut().push((token, line_number));
        }

        // Flush accumulated character data as a single CharacterTokens token.
        fn finish_str(&self) {
            if self.current_str.borrow().len() > 0 {
                let s = self.current_str.take();
                self.tokens.borrow_mut().push(CharacterTokens(s));
            }
        }
    }

    impl TokenSink for LinesMatch {
        type Handle = ();

        fn process_token(&self, token: Token, line_number: u64) -> TokenSinkResult<Self::Handle> {
            match token {
                CharacterTokens(b) => {
                    self.current_str.borrow_mut().push_slice(&b);
                },

                NullCharacterToken => {
                    self.current_str.borrow_mut().push_char('\0');
                },

                ParseError(_) => {
                    panic!("unexpected parse error");
                },

                TagToken(mut t) => {
                    // The spec seems to indicate that one can emit
                    // erroneous end tags with attrs, but the test
                    // cases don't contain them.
                    match t.kind {
                        EndTag => {
                            t.self_closing = false;
                            t.attrs = vec![];
                        },
                        _ => t.attrs.sort_by(|a1, a2| a1.name.cmp(&a2.name)),
                    }
                    self.push(TagToken(t), line_number);
                },

                EOFToken => (),

                _ => self.push(token, line_number),
            }
            TokenSinkResult::Continue
        }
    }

    // Take in tokens, process them, and return vector with line
    // numbers that each token is on
    fn tokenize(input: Vec<StrTendril>, opts: TokenizerOpts) -> Vec<(Token, u64)> {
        let sink = LinesMatch::new();
        let tok = Tokenizer::new(sink, opts);
        let buffer = BufferQueue::default();
        for chunk in input.into_iter() {
            buffer.push_back(chunk);
            let _ = tok.feed(&buffer);
        }
        tok.end();
        tok.sink.lines.take()
    }

    // Create a tag token
    fn create_tag(token: StrTendril, tagkind: TagKind) -> Token {
        let name = LocalName::from(&*token);

        TagToken(Tag {
            kind: tagkind,
            name,
            self_closing: false,
            attrs: vec![],
        })
    }

    // Tokenizer options shared by the line-tracking tests below; previously
    // duplicated verbatim in each test.
    fn line_tracking_opts() -> TokenizerOpts {
        TokenizerOpts {
            exact_errors: false,
            discard_bom: true,
            profile: false,
            initial_state: None,
            last_start_tag_name: None,
        }
    }

    // Expected (token, line) pairs for the nested <a><b></b></a> input used
    // by the line-tracking tests; independent of the line ending used.
    fn expected_line_tags() -> Vec<(Token, u64)> {
        vec![
            (create_tag(StrTendril::from("a"), StartTag), 1),
            (create_tag(StrTendril::from("b"), StartTag), 2),
            (create_tag(StrTendril::from("b"), EndTag), 3),
            (create_tag(StrTendril::from("a"), EndTag), 4),
        ]
    }

    #[test]
    fn push_to_None_gives_singleton() {
        let mut s: Option<StrTendril> = None;
        option_push(&mut s, 'x');
        assert_eq!(s, Some("x".to_tendril()));
    }

    #[test]
    fn push_to_empty_appends() {
        let mut s: Option<StrTendril> = Some(StrTendril::new());
        option_push(&mut s, 'x');
        assert_eq!(s, Some("x".to_tendril()));
    }

    #[test]
    fn push_to_nonempty_appends() {
        let mut s: Option<StrTendril> = Some(StrTendril::from_slice("y"));
        option_push(&mut s, 'x');
        assert_eq!(s, Some("yx".to_tendril()));
    }

    // Tokens arriving on LF-terminated lines are tagged with the right line.
    #[test]
    fn check_lines() {
        let vector = vec![
            StrTendril::from("<a>\n"),
            StrTendril::from("<b>\n"),
            StrTendril::from("</b>\n"),
            StrTendril::from("</a>\n"),
        ];
        let results = tokenize(vector, line_tracking_opts());
        assert_eq!(results, expected_line_tags());
    }

    // CRLF line endings must count lines the same way as bare LF.
    #[test]
    fn check_lines_with_new_line() {
        let vector = vec![
            StrTendril::from("<a>\r\n"),
            StrTendril::from("<b>\r\n"),
            StrTendril::from("</b>\r\n"),
            StrTendril::from("</a>\r\n"),
        ];
        let results = tokenize(vector, line_tracking_opts());
        assert_eq!(results, expected_line_tags());
    }
}
1769