1// Copyright 2014-2017 The html5ever Project Developers. See the
2// COPYRIGHT file at the top-level directory of this distribution.
3//
4// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7// option. This file may not be copied, modified, or distributed
8// except according to those terms.
9
10//! The HTML5 tokenizer.
11
12pub use self::interface::{CharacterTokens, EOFToken, NullCharacterToken, ParseError};
13pub use self::interface::{CommentToken, DoctypeToken, TagToken, Token};
14pub use self::interface::{Doctype, EndTag, StartTag, Tag, TagKind};
15pub use self::interface::{TokenSink, TokenSinkResult};
16
17use self::states::{DoctypeIdKind, Public, System};
18use self::states::{DoubleEscaped, Escaped};
19use self::states::{DoubleQuoted, SingleQuoted, Unquoted};
20use self::states::{Rawtext, Rcdata, ScriptData, ScriptDataEscaped};
21
22use self::char_ref::{CharRef, CharRefTokenizer};
23
24use crate::util::str::lower_ascii_letter;
25
26use log::{debug, trace};
27use mac::{_tt_as_expr_hack, format_if, matches};
28use markup5ever::{namespace_url, ns, small_char_set};
29use std::borrow::Cow::{self, Borrowed};
30use std::collections::BTreeMap;
31use std::default::Default;
32use std::mem::replace;
33
34pub use crate::buffer_queue::{BufferQueue, FromSet, NotFromSet, SetResult};
35use crate::tendril::StrTendril;
36use crate::{Attribute, LocalName, QualName, SmallCharSet};
37
38mod char_ref;
39mod interface;
40pub mod states;
41
/// Result of a single step of the tokenizer state machine.
pub enum ProcessResult<Handle> {
    /// Keep stepping; more progress can be made on the current input.
    Continue,
    /// Stop stepping: we need more input, or EOF was emitted.
    Suspend,
    /// A script handle surfaced by the sink; the caller must deal with it
    /// before tokenizing resumes.
    Script(Handle),
}
47
/// Overall result of feeding/running the tokenizer, reported to the driver.
#[must_use]
pub enum TokenizerResult<Handle> {
    /// All currently available input has been processed.
    Done,
    /// Tokenizing stopped at a script handle; resume feeding after
    /// handling it.
    Script(Handle),
}
53
54fn option_push(opt_str: &mut Option<StrTendril>, c: char) {
55 match *opt_str {
56 Some(ref mut s: &mut Tendril) => s.push_char(c),
57 None => *opt_str = Some(StrTendril::from_char(c)),
58 }
59}
60
/// Tokenizer options, with an impl for `Default`.
#[derive(Clone)]
pub struct TokenizerOpts {
    /// Report all parse errors described in the spec, at some
    /// performance penalty? Default: false
    pub exact_errors: bool,

    /// Discard a `U+FEFF BYTE ORDER MARK` if we see one at the beginning
    /// of the stream? Default: true
    pub discard_bom: bool,

    /// Keep a record of how long (in nanoseconds) we spent in each state?
    /// Printed when `end()` is called. Default: false
    pub profile: bool,

    /// Initial state override. Only the test runner should use
    /// a non-`None` value!
    pub initial_state: Option<states::State>,

    /// Last start tag name seen before this tokenizer was created.
    /// Only the test runner should use a non-`None` value!
    ///
    /// FIXME: Can't use Tendril because we want TokenizerOpts
    /// to be Send.
    pub last_start_tag_name: Option<String>,
}
87
88impl Default for TokenizerOpts {
89 fn default() -> TokenizerOpts {
90 TokenizerOpts {
91 exact_errors: false,
92 discard_bom: true,
93 profile: false,
94 initial_state: None,
95 last_start_tag_name: None,
96 }
97 }
98}
99
/// The HTML tokenizer.
pub struct Tokenizer<Sink> {
    /// Options controlling the behavior of the tokenizer.
    opts: TokenizerOpts,

    /// Destination for tokens we emit.
    pub sink: Sink,

    /// The abstract machine state as described in the spec.
    state: states::State,

    /// Are we at the end of the file, once buffers have been processed
    /// completely? This affects whether we will wait for lookahead or not.
    at_eof: bool,

    /// Tokenizer for character references, if we're tokenizing
    /// one at the moment. Boxed because it is usually absent.
    char_ref_tokenizer: Option<Box<CharRefTokenizer>>,

    /// Current input character. Just consumed, may reconsume.
    current_char: char,

    /// Should we reconsume the current input character?
    reconsume: bool,

    /// Did we just consume \r, translating it to \n? In that case we need
    /// to ignore the next character if it's \n.
    ignore_lf: bool,

    /// Discard a U+FEFF BYTE ORDER MARK if we see one? Only done at the
    /// beginning of the stream.
    discard_bom: bool,

    /// Current tag kind (start or end).
    current_tag_kind: TagKind,

    /// Current tag name, accumulated while in a tag-name state.
    current_tag_name: StrTendril,

    /// Current tag is self-closing?
    current_tag_self_closing: bool,

    /// Current tag attributes, completed ones only (see `finish_attribute`).
    current_tag_attrs: Vec<Attribute>,

    /// Current attribute name, accumulated while in an attribute state.
    current_attr_name: StrTendril,

    /// Current attribute value.
    current_attr_value: StrTendril,

    /// Current comment.
    current_comment: StrTendril,

    /// Current doctype token.
    current_doctype: Doctype,

    /// Last start tag name, for use in checking "appropriate end tag".
    last_start_tag_name: Option<LocalName>,

    /// The "temporary buffer" mentioned in the spec.
    temp_buf: StrTendril,

    /// Record of how many ns we spent in each state, if profiling is enabled.
    state_profile: BTreeMap<states::State, u64>,

    /// Record of how many ns we spent in the token sink.
    time_in_sink: u64,

    /// Track current line (1-based; see `new`).
    current_line: u64,
}
172
173impl<Sink: TokenSink> Tokenizer<Sink> {
    /// Create a new tokenizer which feeds tokens to a particular `TokenSink`.
    pub fn new(sink: Sink, mut opts: TokenizerOpts) -> Tokenizer<Sink> {
        // Move the test-only override out of the options so it is not
        // retained in `self.opts`.
        let start_tag_name = opts
            .last_start_tag_name
            .take()
            .map(|s| LocalName::from(&*s));
        let state = opts.initial_state.unwrap_or(states::Data);
        let discard_bom = opts.discard_bom;
        Tokenizer {
            opts,
            sink,
            state,
            char_ref_tokenizer: None,
            at_eof: false,
            current_char: '\0',
            reconsume: false,
            ignore_lf: false,
            discard_bom,
            current_tag_kind: StartTag,
            current_tag_name: StrTendril::new(),
            current_tag_self_closing: false,
            current_tag_attrs: vec![],
            current_attr_name: StrTendril::new(),
            current_attr_value: StrTendril::new(),
            current_comment: StrTendril::new(),
            current_doctype: Doctype::new(),
            last_start_tag_name: start_tag_name,
            temp_buf: StrTendril::new(),
            state_profile: BTreeMap::new(),
            time_in_sink: 0,
            current_line: 1,
        }
    }
207
208 /// Feed an input string into the tokenizer.
209 pub fn feed(&mut self, input: &mut BufferQueue) -> TokenizerResult<Sink::Handle> {
210 if input.is_empty() {
211 return TokenizerResult::Done;
212 }
213
214 if self.discard_bom {
215 if let Some(c) = input.peek() {
216 if c == '\u{feff}' {
217 input.next();
218 }
219 } else {
220 return TokenizerResult::Done;
221 }
222 };
223
224 self.run(input)
225 }
226
    /// Switch the tokenizer into the PLAINTEXT state, in which all further
    /// input is emitted as character tokens.
    pub fn set_plaintext_state(&mut self) {
        self.state = states::Plaintext;
    }
230
    /// Forward a token to the sink, attributing the time spent there to
    /// `time_in_sink` when profiling is enabled.
    fn process_token(&mut self, token: Token) -> TokenSinkResult<Sink::Handle> {
        if self.opts.profile {
            let (ret, dt) = time!(self.sink.process_token(token, self.current_line));
            self.time_in_sink += dt;
            ret
        } else {
            self.sink.process_token(token, self.current_line)
        }
    }
240
    /// Forward a token to the sink, asserting that the sink replies
    /// `Continue` (used for tokens that must not redirect tokenization).
    fn process_token_and_continue(&mut self, token: Token) {
        assert!(matches!(
            self.process_token(token),
            TokenSinkResult::Continue
        ));
    }
247
    //§ preprocessing-the-input-stream
    // Get the next input character, which might be the character
    // 'c' that we already consumed from the buffers.
    fn get_preprocessed_char(&mut self, mut c: char, input: &mut BufferQueue) -> Option<char> {
        // The previous character was '\r' (already translated to '\n'),
        // so a directly following '\n' is skipped: CRLF -> one newline.
        if self.ignore_lf {
            self.ignore_lf = false;
            if c == '\n' {
                c = unwrap_or_return!(input.next(), None);
            }
        }

        // Translate '\r' to '\n', and arrange to skip a following '\n'.
        if c == '\r' {
            self.ignore_lf = true;
            c = '\n';
        }

        if c == '\n' {
            self.current_line += 1;
        }

        // In exact-errors mode, report control characters and
        // non-characters that the spec disallows in the input stream.
        if self.opts.exact_errors &&
            match c as u32 {
                0x01..=0x08 | 0x0B | 0x0E..=0x1F | 0x7F..=0x9F | 0xFDD0..=0xFDEF => true,
                n if (n & 0xFFFE) == 0xFFFE => true,
                _ => false,
            }
        {
            let msg = format!("Bad character {}", c);
            self.emit_error(Cow::Owned(msg));
        }

        trace!("got character {}", c);
        self.current_char = c;
        Some(c)
    }
283
284 //§ tokenization
285 // Get the next input character, if one is available.
286 fn get_char(&mut self, input: &mut BufferQueue) -> Option<char> {
287 if self.reconsume {
288 self.reconsume = false;
289 Some(self.current_char)
290 } else {
291 input
292 .next()
293 .and_then(|c| self.get_preprocessed_char(c, input))
294 }
295 }
296
    /// Pop either one character from `set` or a whole run of characters
    /// not in `set`, so runs of plain text can be moved without copying.
    fn pop_except_from(&mut self, input: &mut BufferQueue, set: SmallCharSet) -> Option<SetResult> {
        // Bail to the slow path for various corner cases.
        // This means that `FromSet` can contain characters not in the set!
        // It shouldn't matter because the fallback `FromSet` case should
        // always do the same thing as the `NotFromSet` case.
        if self.opts.exact_errors || self.reconsume || self.ignore_lf {
            return self.get_char(input).map(FromSet);
        }

        let d = input.pop_except_from(set);
        trace!("got characters {:?}", d);
        match d {
            Some(FromSet(c)) => self.get_preprocessed_char(c, input).map(FromSet),

            // NB: We don't set self.current_char for a run of characters not
            // in the set. It shouldn't matter for the codepaths that use
            // this.
            _ => d,
        }
    }
317
    // Check if the next characters are an ASCII case-insensitive match. See
    // BufferQueue::eat.
    //
    // NB: this doesn't do input stream preprocessing or set the current input
    // character.
    //
    // Returns Some(true/false) once the match is decided, or None when we
    // ran out of input before deciding; in that case consumed characters
    // are stashed in temp_buf and pushed back on the next call.
    fn eat(
        &mut self,
        input: &mut BufferQueue,
        pat: &str,
        eq: fn(&u8, &u8) -> bool,
    ) -> Option<bool> {
        // Restore any partial match stashed by a previous, undecided call.
        input.push_front(replace(&mut self.temp_buf, StrTendril::new()));
        match input.eat(pat, eq) {
            None if self.at_eof => Some(false),
            None => {
                // Not enough input to decide: buffer what's left and retry
                // when more input arrives.
                while let Some(c) = input.next() {
                    self.temp_buf.push_char(c);
                }
                None
            },
            Some(matched) => Some(matched),
        }
    }
341
342 /// Run the state machine for as long as we can.
343 fn run(&mut self, input: &mut BufferQueue) -> TokenizerResult<Sink::Handle> {
344 if self.opts.profile {
345 loop {
346 let state = self.state;
347 let old_sink = self.time_in_sink;
348 let (run, mut dt) = time!(self.step(input));
349 dt -= (self.time_in_sink - old_sink);
350 let new = match self.state_profile.get_mut(&state) {
351 Some(x) => {
352 *x += dt;
353 false
354 },
355 None => true,
356 };
357 if new {
358 // do this here because of borrow shenanigans
359 self.state_profile.insert(state, dt);
360 }
361 match run {
362 ProcessResult::Continue => (),
363 ProcessResult::Suspend => break,
364 ProcessResult::Script(node) => return TokenizerResult::Script(node),
365 }
366 }
367 } else {
368 loop {
369 match self.step(input) {
370 ProcessResult::Continue => (),
371 ProcessResult::Suspend => break,
372 ProcessResult::Script(node) => return TokenizerResult::Script(node),
373 }
374 }
375 }
376 TokenizerResult::Done
377 }
378
    /// Emit a parse error for the current input character. The detailed
    /// message is only formatted in exact-errors mode.
    fn bad_char_error(&mut self) {
        let msg = format_if!(
            self.opts.exact_errors,
            "Bad character",
            "Saw {} in state {:?}",
            self.current_char,
            self.state
        );
        self.emit_error(msg);
    }
389
    /// Emit a parse error for an unexpected end of file. The detailed
    /// message is only formatted in exact-errors mode.
    fn bad_eof_error(&mut self) {
        let msg = format_if!(
            self.opts.exact_errors,
            "Unexpected EOF",
            "Saw EOF in state {:?}",
            self.state
        );
        self.emit_error(msg);
    }
399
400 fn emit_char(&mut self, c: char) {
401 self.process_token_and_continue(match c {
402 '\0' => NullCharacterToken,
403 _ => CharacterTokens(StrTendril::from_char(c)),
404 });
405 }
406
    /// Emit a run of characters as one token.
    // The string must not contain '\0'!
    fn emit_chars(&mut self, b: StrTendril) {
        self.process_token_and_continue(CharacterTokens(b));
    }
411
    /// Finish the tag under construction, emit it, and apply whatever
    /// state change the sink requests in response.
    fn emit_current_tag(&mut self) -> ProcessResult<Sink::Handle> {
        self.finish_attribute();

        let name = LocalName::from(&*self.current_tag_name);
        self.current_tag_name.clear();

        match self.current_tag_kind {
            StartTag => {
                // Remember the name for "appropriate end tag" checks.
                self.last_start_tag_name = Some(name.clone());
            },
            EndTag => {
                // End tags may carry neither attributes nor the
                // self-closing flag; report both as parse errors.
                if !self.current_tag_attrs.is_empty() {
                    self.emit_error(Borrowed("Attributes on an end tag"));
                }
                if self.current_tag_self_closing {
                    self.emit_error(Borrowed("Self-closing end tag"));
                }
            },
        }

        let token = TagToken(Tag {
            kind: self.current_tag_kind,
            name,
            self_closing: self.current_tag_self_closing,
            attrs: replace(&mut self.current_tag_attrs, vec![]),
        });

        // The sink can redirect tokenization after seeing the tag
        // (e.g. into PLAINTEXT or a raw-text state, or pause for a script).
        match self.process_token(token) {
            TokenSinkResult::Continue => ProcessResult::Continue,
            TokenSinkResult::Plaintext => {
                self.state = states::Plaintext;
                ProcessResult::Continue
            },
            TokenSinkResult::Script(node) => {
                self.state = states::Data;
                ProcessResult::Script(node)
            },
            TokenSinkResult::RawData(kind) => {
                self.state = states::RawData(kind);
                ProcessResult::Continue
            },
        }
    }
455
    /// Emit the temporary buffer as character tokens, leaving it empty.
    fn emit_temp_buf(&mut self) {
        // FIXME: Make sure that clearing on emit is spec-compatible.
        let buf = replace(&mut self.temp_buf, StrTendril::new());
        self.emit_chars(buf);
    }
461
    /// Empty the temporary buffer in place.
    fn clear_temp_buf(&mut self) {
        // Do this without a new allocation.
        self.temp_buf.clear();
    }
466
    /// Emit the comment under construction and reset it.
    fn emit_current_comment(&mut self) {
        let comment = replace(&mut self.current_comment, StrTendril::new());
        self.process_token_and_continue(CommentToken(comment));
    }
471
472 fn discard_tag(&mut self) {
473 self.current_tag_name.clear();
474 self.current_tag_self_closing = false;
475 self.current_tag_attrs = vec![];
476 }
477
    /// Start a fresh tag of the given kind whose name begins with `c`,
    /// discarding any previous tag state.
    fn create_tag(&mut self, kind: TagKind, c: char) {
        self.discard_tag();
        self.current_tag_name.push_char(c);
        self.current_tag_kind = kind;
    }
483
484 fn have_appropriate_end_tag(&self) -> bool {
485 match self.last_start_tag_name.as_ref() {
486 Some(last) => (self.current_tag_kind == EndTag) && (*self.current_tag_name == **last),
487 None => false,
488 }
489 }
490
    /// Commit the attribute under construction (if any) and begin a new
    /// one whose name starts with `c`.
    fn create_attribute(&mut self, c: char) {
        self.finish_attribute();

        self.current_attr_name.push_char(c);
    }
496
    /// Attach the attribute under construction to the current tag, or drop
    /// it with a parse error if its name duplicates an earlier attribute.
    /// No-op if no attribute name has been started.
    fn finish_attribute(&mut self) {
        if self.current_attr_name.is_empty() {
            return;
        }

        // Check for a duplicate attribute.
        // FIXME: the spec says we should error as soon as the name is finished.
        // FIXME: linear time search, do we care?
        let dup = {
            let name = &*self.current_attr_name;
            self.current_tag_attrs
                .iter()
                .any(|a| &*a.name.local == name)
        };

        if dup {
            self.emit_error(Borrowed("Duplicate attribute"));
            self.current_attr_name.clear();
            self.current_attr_value.clear();
        } else {
            let name = LocalName::from(&*self.current_attr_name);
            self.current_attr_name.clear();
            self.current_tag_attrs.push(Attribute {
                // The tree builder will adjust the namespace if necessary.
                // This only happens in foreign elements.
                name: QualName::new(None, ns!(), name),
                value: replace(&mut self.current_attr_value, StrTendril::new()),
            });
        }
    }
527
    /// Emit the doctype under construction and reset it.
    fn emit_current_doctype(&mut self) {
        let doctype = replace(&mut self.current_doctype, Doctype::new());
        self.process_token_and_continue(DoctypeToken(doctype));
    }
532
    /// Select the public or system identifier field of the current doctype.
    fn doctype_id(&mut self, kind: DoctypeIdKind) -> &mut Option<StrTendril> {
        match kind {
            Public => &mut self.current_doctype.public_id,
            System => &mut self.current_doctype.system_id,
        }
    }
539
540 fn clear_doctype_id(&mut self, kind: DoctypeIdKind) {
541 let id = self.doctype_id(kind);
542 match *id {
543 Some(ref mut s) => s.clear(),
544 None => *id = Some(StrTendril::new()),
545 }
546 }
547
    /// Start the character-reference sub-tokenizer; `step` dispatches to it
    /// while it is present.
    fn consume_char_ref(&mut self, addnl_allowed: Option<char>) {
        // NB: The char ref tokenizer assumes we have an additional allowed
        // character iff we're tokenizing in an attribute value.
        self.char_ref_tokenizer = Some(Box::new(CharRefTokenizer::new(addnl_allowed)));
    }
553
    /// Emit the end-of-file token.
    fn emit_eof(&mut self) {
        self.process_token_and_continue(EOFToken);
    }
557
558 fn peek(&mut self, input: &BufferQueue) -> Option<char> {
559 if self.reconsume {
560 Some(self.current_char)
561 } else {
562 input.peek()
563 }
564 }
565
    /// Consume and drop the next input character.
    fn discard_char(&mut self, input: &mut BufferQueue) {
        self.get_char(input);
    }
569
    /// Report a parse error to the sink.
    fn emit_error(&mut self, error: Cow<'static, str>) {
        self.process_token_and_continue(ParseError(error));
    }
573}
574//§ END
575
// Shorthand for common state machine behaviors.
//
// Each rule maps a short command name (used by the `go!` DSL below) onto a
// method call or field manipulation on the tokenizer `$me`.
macro_rules! shorthand (
    ( $me:ident : emit $c:expr ) => ( $me.emit_char($c) );
    ( $me:ident : create_tag $kind:ident $c:expr ) => ( $me.create_tag($kind, $c) );
    ( $me:ident : push_tag $c:expr ) => ( $me.current_tag_name.push_char($c) );
    ( $me:ident : discard_tag ) => ( $me.discard_tag() );
    ( $me:ident : discard_char $input:expr ) => ( $me.discard_char($input) );
    ( $me:ident : push_temp $c:expr ) => ( $me.temp_buf.push_char($c) );
    ( $me:ident : emit_temp ) => ( $me.emit_temp_buf() );
    ( $me:ident : clear_temp ) => ( $me.clear_temp_buf() );
    ( $me:ident : create_attr $c:expr ) => ( $me.create_attribute($c) );
    ( $me:ident : push_name $c:expr ) => ( $me.current_attr_name.push_char($c) );
    ( $me:ident : push_value $c:expr ) => ( $me.current_attr_value.push_char($c) );
    ( $me:ident : append_value $c:expr ) => ( $me.current_attr_value.push_tendril($c) );
    ( $me:ident : push_comment $c:expr ) => ( $me.current_comment.push_char($c) );
    ( $me:ident : append_comment $c:expr ) => ( $me.current_comment.push_slice($c) );
    ( $me:ident : emit_comment ) => ( $me.emit_current_comment() );
    ( $me:ident : clear_comment ) => ( $me.current_comment.clear() );
    ( $me:ident : create_doctype ) => ( $me.current_doctype = Doctype::new() );
    ( $me:ident : push_doctype_name $c:expr ) => ( option_push(&mut $me.current_doctype.name, $c) );
    ( $me:ident : push_doctype_id $k:ident $c:expr ) => ( option_push($me.doctype_id($k), $c) );
    ( $me:ident : clear_doctype_id $k:ident ) => ( $me.clear_doctype_id($k) );
    ( $me:ident : force_quirks ) => ( $me.current_doctype.force_quirks = true );
    ( $me:ident : emit_doctype ) => ( $me.emit_current_doctype() );
    ( $me:ident : error ) => ( $me.bad_char_error() );
    ( $me:ident : error_eof ) => ( $me.bad_eof_error() );
);
603
// Tracing of tokenizer actions. This adds significant bloat and compile time,
// so it's behind a cfg flag.
//
// Fixed two latent bugs in the (normally cfg'd-out) tracing variant:
// the transcriber wrote `shorthand!($me:expr : ...)` — fragment specifiers
// are only meaningful in the matcher, so this passed literal `: expr`
// tokens that `shorthand!` cannot match — and `{:s}` is pre-1.0 format
// syntax rejected by modern rustc.
#[cfg(trace_tokenizer)]
macro_rules! sh_trace ( ( $me:ident : $($cmds:tt)* ) => ({
    trace!(" {}", stringify!($($cmds)*));
    shorthand!($me: $($cmds)*);
}));

// Without tracing, forward directly to `shorthand!`.
#[cfg(not(trace_tokenizer))]
macro_rules! sh_trace ( ( $me:ident : $($cmds:tt)* ) => ( shorthand!($me: $($cmds)*) ) );
614
// A little DSL for sequencing shorthand actions.
//
// A `go!` invocation is a `;`-separated list of `shorthand!` commands,
// optionally terminated by a control action (`to`, `reconsume`,
// `consume_char_ref`, `emit_tag`, `eof`) that changes state and returns
// early from the enclosing function.
macro_rules! go (
    // A pattern like $($cmd:tt)* ; $($rest:tt)* causes parse ambiguity.
    // We have to tell the parser how much lookahead we need.

    ( $me:ident : $a:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a); go!($me: $($rest)*); });
    ( $me:ident : $a:tt $b:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b); go!($me: $($rest)*); });
    ( $me:ident : $a:tt $b:tt $c:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b $c); go!($me: $($rest)*); });
    ( $me:ident : $a:tt $b:tt $c:tt $d:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b $c $d); go!($me: $($rest)*); });

    // These can only come at the end.

    ( $me:ident : to $s:ident ) => ({ $me.state = states::$s; return ProcessResult::Continue; });
    ( $me:ident : to $s:ident $k1:expr ) => ({ $me.state = states::$s($k1); return ProcessResult::Continue; });
    ( $me:ident : to $s:ident $k1:ident $k2:expr ) => ({ $me.state = states::$s($k1($k2)); return ProcessResult::Continue; });

    ( $me:ident : reconsume $s:ident ) => ({ $me.reconsume = true; go!($me: to $s); });
    ( $me:ident : reconsume $s:ident $k1:expr ) => ({ $me.reconsume = true; go!($me: to $s $k1); });
    ( $me:ident : reconsume $s:ident $k1:ident $k2:expr ) => ({ $me.reconsume = true; go!($me: to $s $k1 $k2); });

    ( $me:ident : consume_char_ref ) => ({ $me.consume_char_ref(None); return ProcessResult::Continue; });
    ( $me:ident : consume_char_ref $addnl:expr ) => ({ $me.consume_char_ref(Some($addnl)); return ProcessResult::Continue; });

    // We have a default next state after emitting a tag, but the sink can override.
    ( $me:ident : emit_tag $s:ident ) => ({
        $me.state = states::$s;
        return $me.emit_current_tag();
    });

    ( $me:ident : eof ) => ({ $me.emit_eof(); return ProcessResult::Suspend; });

    // If nothing else matched, it's a single command
    ( $me:ident : $($cmd:tt)+ ) => ( sh_trace!($me: $($cmd)+) );

    // or nothing.
    ( $me:ident : ) => (());
);
652
// Run `$cmds` when `$x` matches any one of the given patterns; otherwise
// do nothing.
macro_rules! go_match ( ( $me:ident : $x:expr, $($pats:pat),+ => $($cmds:tt)* ) => (
    match $x {
        $($pats)|+ => go!($me: $($cmds)*),
        _ => (),
    }
));
659
// This is a macro because it can cause early return
// from the function where it is used.
macro_rules! get_char ( ($me:expr, $input:expr) => (
    unwrap_or_return!($me.get_char($input), ProcessResult::Suspend)
));

// Like `get_char!`, but peeks without consuming; suspends on empty input.
macro_rules! peek ( ($me:expr, $input:expr) => (
    unwrap_or_return!($me.peek($input), ProcessResult::Suspend)
));

// Pop a character in `$set` or a run of characters outside it;
// suspends on empty input.
macro_rules! pop_except_from ( ($me:expr, $input:expr, $set:expr) => (
    unwrap_or_return!($me.pop_except_from($input, $set), ProcessResult::Suspend)
));

// ASCII case-insensitive lookahead match; suspends while undecided.
macro_rules! eat ( ($me:expr, $input:expr, $pat:expr) => (
    unwrap_or_return!($me.eat($input, $pat, u8::eq_ignore_ascii_case), ProcessResult::Suspend)
));

// Byte-exact lookahead match; suspends while undecided.
macro_rules! eat_exact ( ($me:expr, $input:expr, $pat:expr) => (
    unwrap_or_return!($me.eat($input, $pat, u8::eq), ProcessResult::Suspend)
));
681
682impl<Sink: TokenSink> Tokenizer<Sink> {
683 // Run the state machine for a while.
684 // Return true if we should be immediately re-invoked
685 // (this just simplifies control flow vs. break / continue).
686 #[allow(clippy::never_loop)]
687 fn step(&mut self, input: &mut BufferQueue) -> ProcessResult<Sink::Handle> {
688 if self.char_ref_tokenizer.is_some() {
689 return self.step_char_ref_tokenizer(input);
690 }
691
692 trace!("processing in state {:?}", self.state);
693 match self.state {
694 //§ data-state
695 states::Data => loop {
696 match pop_except_from!(self, input, small_char_set!('\r' '\0' '&' '<' '\n')) {
697 FromSet('\0') => go!(self: error; emit '\0'),
698 FromSet('&') => go!(self: consume_char_ref),
699 FromSet('<') => go!(self: to TagOpen),
700 FromSet(c) => go!(self: emit c),
701 NotFromSet(b) => self.emit_chars(b),
702 }
703 },
704
705 //§ rcdata-state
706 states::RawData(Rcdata) => loop {
707 match pop_except_from!(self, input, small_char_set!('\r' '\0' '&' '<' '\n')) {
708 FromSet('\0') => go!(self: error; emit '\u{fffd}'),
709 FromSet('&') => go!(self: consume_char_ref),
710 FromSet('<') => go!(self: to RawLessThanSign Rcdata),
711 FromSet(c) => go!(self: emit c),
712 NotFromSet(b) => self.emit_chars(b),
713 }
714 },
715
716 //§ rawtext-state
717 states::RawData(Rawtext) => loop {
718 match pop_except_from!(self, input, small_char_set!('\r' '\0' '<' '\n')) {
719 FromSet('\0') => go!(self: error; emit '\u{fffd}'),
720 FromSet('<') => go!(self: to RawLessThanSign Rawtext),
721 FromSet(c) => go!(self: emit c),
722 NotFromSet(b) => self.emit_chars(b),
723 }
724 },
725
726 //§ script-data-state
727 states::RawData(ScriptData) => loop {
728 match pop_except_from!(self, input, small_char_set!('\r' '\0' '<' '\n')) {
729 FromSet('\0') => go!(self: error; emit '\u{fffd}'),
730 FromSet('<') => go!(self: to RawLessThanSign ScriptData),
731 FromSet(c) => go!(self: emit c),
732 NotFromSet(b) => self.emit_chars(b),
733 }
734 },
735
736 //§ script-data-escaped-state
737 states::RawData(ScriptDataEscaped(Escaped)) => loop {
738 match pop_except_from!(self, input, small_char_set!('\r' '\0' '-' '<' '\n')) {
739 FromSet('\0') => go!(self: error; emit '\u{fffd}'),
740 FromSet('-') => go!(self: emit '-'; to ScriptDataEscapedDash Escaped),
741 FromSet('<') => go!(self: to RawLessThanSign ScriptDataEscaped Escaped),
742 FromSet(c) => go!(self: emit c),
743 NotFromSet(b) => self.emit_chars(b),
744 }
745 },
746
747 //§ script-data-double-escaped-state
748 states::RawData(ScriptDataEscaped(DoubleEscaped)) => loop {
749 match pop_except_from!(self, input, small_char_set!('\r' '\0' '-' '<' '\n')) {
750 FromSet('\0') => go!(self: error; emit '\u{fffd}'),
751 FromSet('-') => go!(self: emit '-'; to ScriptDataEscapedDash DoubleEscaped),
752 FromSet('<') => {
753 go!(self: emit '<'; to RawLessThanSign ScriptDataEscaped DoubleEscaped)
754 },
755 FromSet(c) => go!(self: emit c),
756 NotFromSet(b) => self.emit_chars(b),
757 }
758 },
759
760 //§ plaintext-state
761 states::Plaintext => loop {
762 match pop_except_from!(self, input, small_char_set!('\r' '\0' '\n')) {
763 FromSet('\0') => go!(self: error; emit '\u{fffd}'),
764 FromSet(c) => go!(self: emit c),
765 NotFromSet(b) => self.emit_chars(b),
766 }
767 },
768
769 //§ tag-open-state
770 states::TagOpen => loop {
771 match get_char!(self, input) {
772 '!' => go!(self: clear_temp; to MarkupDeclarationOpen),
773 '/' => go!(self: to EndTagOpen),
774 '?' => go!(self: error; clear_comment; push_comment '?'; to BogusComment),
775 c => match lower_ascii_letter(c) {
776 Some(cl) => go!(self: create_tag StartTag cl; to TagName),
777 None => go!(self: error; emit '<'; reconsume Data),
778 },
779 }
780 },
781
782 //§ end-tag-open-state
783 states::EndTagOpen => loop {
784 match get_char!(self, input) {
785 '>' => go!(self: error; to Data),
786 '\0' => {
787 go!(self: error; clear_comment; push_comment '\u{fffd}'; to BogusComment)
788 },
789 c => match lower_ascii_letter(c) {
790 Some(cl) => go!(self: create_tag EndTag cl; to TagName),
791 None => go!(self: error; clear_comment; push_comment c; to BogusComment),
792 },
793 }
794 },
795
796 //§ tag-name-state
797 states::TagName => loop {
798 match get_char!(self, input) {
799 '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeAttributeName),
800 '/' => go!(self: to SelfClosingStartTag),
801 '>' => go!(self: emit_tag Data),
802 '\0' => go!(self: error; push_tag '\u{fffd}'),
803 c => go!(self: push_tag (c.to_ascii_lowercase())),
804 }
805 },
806
807 //§ script-data-escaped-less-than-sign-state
808 states::RawLessThanSign(ScriptDataEscaped(Escaped)) => loop {
809 match get_char!(self, input) {
810 '/' => go!(self: clear_temp; to RawEndTagOpen ScriptDataEscaped Escaped),
811 c => match lower_ascii_letter(c) {
812 Some(cl) => go!(self: clear_temp; push_temp cl; emit '<'; emit c;
813 to ScriptDataEscapeStart DoubleEscaped),
814 None => go!(self: emit '<'; reconsume RawData ScriptDataEscaped Escaped),
815 },
816 }
817 },
818
819 //§ script-data-double-escaped-less-than-sign-state
820 states::RawLessThanSign(ScriptDataEscaped(DoubleEscaped)) => loop {
821 match get_char!(self, input) {
822 '/' => go!(self: clear_temp; emit '/'; to ScriptDataDoubleEscapeEnd),
823 _ => go!(self: reconsume RawData ScriptDataEscaped DoubleEscaped),
824 }
825 },
826
827 //§ rcdata-less-than-sign-state rawtext-less-than-sign-state script-data-less-than-sign-state
828 // otherwise
829 states::RawLessThanSign(kind) => loop {
830 match get_char!(self, input) {
831 '/' => go!(self: clear_temp; to RawEndTagOpen kind),
832 '!' if kind == ScriptData => {
833 go!(self: emit '<'; emit '!'; to ScriptDataEscapeStart Escaped)
834 },
835 _ => go!(self: emit '<'; reconsume RawData kind),
836 }
837 },
838
839 //§ rcdata-end-tag-open-state rawtext-end-tag-open-state script-data-end-tag-open-state script-data-escaped-end-tag-open-state
840 states::RawEndTagOpen(kind) => loop {
841 let c = get_char!(self, input);
842 match lower_ascii_letter(c) {
843 Some(cl) => go!(self: create_tag EndTag cl; push_temp c; to RawEndTagName kind),
844 None => go!(self: emit '<'; emit '/'; reconsume RawData kind),
845 }
846 },
847
848 //§ rcdata-end-tag-name-state rawtext-end-tag-name-state script-data-end-tag-name-state script-data-escaped-end-tag-name-state
849 states::RawEndTagName(kind) => loop {
850 let c = get_char!(self, input);
851 if self.have_appropriate_end_tag() {
852 match c {
853 '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeAttributeName),
854 '/' => go!(self: to SelfClosingStartTag),
855 '>' => go!(self: emit_tag Data),
856 _ => (),
857 }
858 }
859
860 match lower_ascii_letter(c) {
861 Some(cl) => go!(self: push_tag cl; push_temp c),
862 None => {
863 go!(self: discard_tag; emit '<'; emit '/'; emit_temp; reconsume RawData kind)
864 },
865 }
866 },
867
868 //§ script-data-double-escape-start-state
869 states::ScriptDataEscapeStart(DoubleEscaped) => loop {
870 let c = get_char!(self, input);
871 match c {
872 '\t' | '\n' | '\x0C' | ' ' | '/' | '>' => {
873 let esc = if &*self.temp_buf == "script" {
874 DoubleEscaped
875 } else {
876 Escaped
877 };
878 go!(self: emit c; to RawData ScriptDataEscaped esc);
879 },
880 _ => match lower_ascii_letter(c) {
881 Some(cl) => go!(self: push_temp cl; emit c),
882 None => go!(self: reconsume RawData ScriptDataEscaped Escaped),
883 },
884 }
885 },
886
887 //§ script-data-escape-start-state
888 states::ScriptDataEscapeStart(Escaped) => loop {
889 match get_char!(self, input) {
890 '-' => go!(self: emit '-'; to ScriptDataEscapeStartDash),
891 _ => go!(self: reconsume RawData ScriptData),
892 }
893 },
894
895 //§ script-data-escape-start-dash-state
896 states::ScriptDataEscapeStartDash => loop {
897 match get_char!(self, input) {
898 '-' => go!(self: emit '-'; to ScriptDataEscapedDashDash Escaped),
899 _ => go!(self: reconsume RawData ScriptData),
900 }
901 },
902
903 //§ script-data-escaped-dash-state script-data-double-escaped-dash-state
904 states::ScriptDataEscapedDash(kind) => loop {
905 match get_char!(self, input) {
906 '-' => go!(self: emit '-'; to ScriptDataEscapedDashDash kind),
907 '<' => {
908 if kind == DoubleEscaped {
909 go!(self: emit '<');
910 }
911 go!(self: to RawLessThanSign ScriptDataEscaped kind);
912 },
913 '\0' => go!(self: error; emit '\u{fffd}'; to RawData ScriptDataEscaped kind),
914 c => go!(self: emit c; to RawData ScriptDataEscaped kind),
915 }
916 },
917
918 //§ script-data-escaped-dash-dash-state script-data-double-escaped-dash-dash-state
919 states::ScriptDataEscapedDashDash(kind) => loop {
920 match get_char!(self, input) {
921 '-' => go!(self: emit '-'),
922 '<' => {
923 if kind == DoubleEscaped {
924 go!(self: emit '<');
925 }
926 go!(self: to RawLessThanSign ScriptDataEscaped kind);
927 },
928 '>' => go!(self: emit '>'; to RawData ScriptData),
929 '\0' => go!(self: error; emit '\u{fffd}'; to RawData ScriptDataEscaped kind),
930 c => go!(self: emit c; to RawData ScriptDataEscaped kind),
931 }
932 },
933
934 //§ script-data-double-escape-end-state
935 states::ScriptDataDoubleEscapeEnd => loop {
936 let c = get_char!(self, input);
937 match c {
938 '\t' | '\n' | '\x0C' | ' ' | '/' | '>' => {
939 let esc = if &*self.temp_buf == "script" {
940 Escaped
941 } else {
942 DoubleEscaped
943 };
944 go!(self: emit c; to RawData ScriptDataEscaped esc);
945 },
946 _ => match lower_ascii_letter(c) {
947 Some(cl) => go!(self: push_temp cl; emit c),
948 None => go!(self: reconsume RawData ScriptDataEscaped DoubleEscaped),
949 },
950 }
951 },
952
953 //§ before-attribute-name-state
954 states::BeforeAttributeName => loop {
955 match get_char!(self, input) {
956 '\t' | '\n' | '\x0C' | ' ' => (),
957 '/' => go!(self: to SelfClosingStartTag),
958 '>' => go!(self: emit_tag Data),
959 '\0' => go!(self: error; create_attr '\u{fffd}'; to AttributeName),
960 c => match lower_ascii_letter(c) {
961 Some(cl) => go!(self: create_attr cl; to AttributeName),
962 None => {
963 go_match!(self: c,
964 '"' , '\'' , '<' , '=' => error);
965 go!(self: create_attr c; to AttributeName);
966 },
967 },
968 }
969 },
970
971 //§ attribute-name-state
972 states::AttributeName => loop {
973 match get_char!(self, input) {
974 '\t' | '\n' | '\x0C' | ' ' => go!(self: to AfterAttributeName),
975 '/' => go!(self: to SelfClosingStartTag),
976 '=' => go!(self: to BeforeAttributeValue),
977 '>' => go!(self: emit_tag Data),
978 '\0' => go!(self: error; push_name '\u{fffd}'),
979 c => match lower_ascii_letter(c) {
980 Some(cl) => go!(self: push_name cl),
981 None => {
982 go_match!(self: c,
983 '"' , '\'' , '<' => error);
984 go!(self: push_name c);
985 },
986 },
987 }
988 },
989
990 //§ after-attribute-name-state
991 states::AfterAttributeName => loop {
992 match get_char!(self, input) {
993 '\t' | '\n' | '\x0C' | ' ' => (),
994 '/' => go!(self: to SelfClosingStartTag),
995 '=' => go!(self: to BeforeAttributeValue),
996 '>' => go!(self: emit_tag Data),
997 '\0' => go!(self: error; create_attr '\u{fffd}'; to AttributeName),
998 c => match lower_ascii_letter(c) {
999 Some(cl) => go!(self: create_attr cl; to AttributeName),
1000 None => {
1001 go_match!(self: c,
1002 '"' , '\'' , '<' => error);
1003 go!(self: create_attr c; to AttributeName);
1004 },
1005 },
1006 }
1007 },
1008
1009 //§ before-attribute-value-state
1010 // Use peek so we can handle the first attr character along with the rest,
1011 // hopefully in the same zero-copy buffer.
1012 states::BeforeAttributeValue => loop {
1013 match peek!(self, input) {
1014 '\t' | '\n' | '\r' | '\x0C' | ' ' => go!(self: discard_char input),
1015 '"' => go!(self: discard_char input; to AttributeValue DoubleQuoted),
1016 '\'' => go!(self: discard_char input; to AttributeValue SingleQuoted),
1017 '\0' => {
1018 go!(self: discard_char input; error; push_value '\u{fffd}'; to AttributeValue Unquoted)
1019 },
1020 '>' => go!(self: discard_char input; error; emit_tag Data),
1021 _ => go!(self: to AttributeValue Unquoted),
1022 }
1023 },
1024
1025 //§ attribute-value-(double-quoted)-state
1026 states::AttributeValue(DoubleQuoted) => loop {
1027 match pop_except_from!(self, input, small_char_set!('\r' '"' '&' '\0' '\n')) {
1028 FromSet('"') => go!(self: to AfterAttributeValueQuoted),
1029 FromSet('&') => go!(self: consume_char_ref '"'),
1030 FromSet('\0') => go!(self: error; push_value '\u{fffd}'),
1031 FromSet(c) => go!(self: push_value c),
1032 NotFromSet(ref b) => go!(self: append_value b),
1033 }
1034 },
1035
1036 //§ attribute-value-(single-quoted)-state
1037 states::AttributeValue(SingleQuoted) => loop {
1038 match pop_except_from!(self, input, small_char_set!('\r' '\'' '&' '\0' '\n')) {
1039 FromSet('\'') => go!(self: to AfterAttributeValueQuoted),
1040 FromSet('&') => go!(self: consume_char_ref '\''),
1041 FromSet('\0') => go!(self: error; push_value '\u{fffd}'),
1042 FromSet(c) => go!(self: push_value c),
1043 NotFromSet(ref b) => go!(self: append_value b),
1044 }
1045 },
1046
1047 //§ attribute-value-(unquoted)-state
1048 states::AttributeValue(Unquoted) => loop {
1049 match pop_except_from!(
1050 self,
1051 input,
1052 small_char_set!('\r' '\t' '\n' '\x0C' ' ' '&' '>' '\0')
1053 ) {
1054 FromSet('\t') | FromSet('\n') | FromSet('\x0C') | FromSet(' ') => {
1055 go!(self: to BeforeAttributeName)
1056 },
1057 FromSet('&') => go!(self: consume_char_ref '>'),
1058 FromSet('>') => go!(self: emit_tag Data),
1059 FromSet('\0') => go!(self: error; push_value '\u{fffd}'),
1060 FromSet(c) => {
1061 go_match!(self: c,
1062 '"' , '\'' , '<' , '=' , '`' => error);
1063 go!(self: push_value c);
1064 },
1065 NotFromSet(ref b) => go!(self: append_value b),
1066 }
1067 },
1068
1069 //§ after-attribute-value-(quoted)-state
1070 states::AfterAttributeValueQuoted => loop {
1071 match get_char!(self, input) {
1072 '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeAttributeName),
1073 '/' => go!(self: to SelfClosingStartTag),
1074 '>' => go!(self: emit_tag Data),
1075 _ => go!(self: error; reconsume BeforeAttributeName),
1076 }
1077 },
1078
1079 //§ self-closing-start-tag-state
1080 states::SelfClosingStartTag => loop {
1081 match get_char!(self, input) {
1082 '>' => {
1083 self.current_tag_self_closing = true;
1084 go!(self: emit_tag Data);
1085 },
1086 _ => go!(self: error; reconsume BeforeAttributeName),
1087 }
1088 },
1089
1090 //§ comment-start-state
1091 states::CommentStart => loop {
1092 match get_char!(self, input) {
1093 '-' => go!(self: to CommentStartDash),
1094 '\0' => go!(self: error; push_comment '\u{fffd}'; to Comment),
1095 '>' => go!(self: error; emit_comment; to Data),
1096 c => go!(self: push_comment c; to Comment),
1097 }
1098 },
1099
1100 //§ comment-start-dash-state
1101 states::CommentStartDash => loop {
1102 match get_char!(self, input) {
1103 '-' => go!(self: to CommentEnd),
1104 '\0' => go!(self: error; append_comment "-\u{fffd}"; to Comment),
1105 '>' => go!(self: error; emit_comment; to Data),
1106 c => go!(self: push_comment '-'; push_comment c; to Comment),
1107 }
1108 },
1109
1110 //§ comment-state
1111 states::Comment => loop {
1112 match get_char!(self, input) {
1113 '-' => go!(self: to CommentEndDash),
1114 '\0' => go!(self: error; push_comment '\u{fffd}'),
1115 c => go!(self: push_comment c),
1116 }
1117 },
1118
1119 //§ comment-end-dash-state
1120 states::CommentEndDash => loop {
1121 match get_char!(self, input) {
1122 '-' => go!(self: to CommentEnd),
1123 '\0' => go!(self: error; append_comment "-\u{fffd}"; to Comment),
1124 c => go!(self: push_comment '-'; push_comment c; to Comment),
1125 }
1126 },
1127
1128 //§ comment-end-state
1129 states::CommentEnd => loop {
1130 match get_char!(self, input) {
1131 '>' => go!(self: emit_comment; to Data),
1132 '\0' => go!(self: error; append_comment "--\u{fffd}"; to Comment),
1133 '!' => go!(self: error; to CommentEndBang),
1134 '-' => go!(self: error; push_comment '-'),
1135 c => go!(self: error; append_comment "--"; push_comment c; to Comment),
1136 }
1137 },
1138
1139 //§ comment-end-bang-state
1140 states::CommentEndBang => loop {
1141 match get_char!(self, input) {
1142 '-' => go!(self: append_comment "--!"; to CommentEndDash),
1143 '>' => go!(self: emit_comment; to Data),
1144 '\0' => go!(self: error; append_comment "--!\u{fffd}"; to Comment),
1145 c => go!(self: append_comment "--!"; push_comment c; to Comment),
1146 }
1147 },
1148
1149 //§ doctype-state
1150 states::Doctype => loop {
1151 match get_char!(self, input) {
1152 '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeDoctypeName),
1153 _ => go!(self: error; reconsume BeforeDoctypeName),
1154 }
1155 },
1156
1157 //§ before-doctype-name-state
1158 states::BeforeDoctypeName => loop {
1159 match get_char!(self, input) {
1160 '\t' | '\n' | '\x0C' | ' ' => (),
1161 '\0' => {
1162 go!(self: error; create_doctype; push_doctype_name '\u{fffd}'; to DoctypeName)
1163 },
1164 '>' => go!(self: error; create_doctype; force_quirks; emit_doctype; to Data),
1165 c => go!(self: create_doctype; push_doctype_name (c.to_ascii_lowercase());
1166 to DoctypeName),
1167 }
1168 },
1169
1170 //§ doctype-name-state
1171 states::DoctypeName => loop {
1172 match get_char!(self, input) {
1173 '\t' | '\n' | '\x0C' | ' ' => go!(self: clear_temp; to AfterDoctypeName),
1174 '>' => go!(self: emit_doctype; to Data),
1175 '\0' => go!(self: error; push_doctype_name '\u{fffd}'),
1176 c => go!(self: push_doctype_name (c.to_ascii_lowercase())),
1177 }
1178 },
1179
1180 //§ after-doctype-name-state
1181 states::AfterDoctypeName => loop {
1182 if eat!(self, input, "public") {
1183 go!(self: to AfterDoctypeKeyword Public);
1184 } else if eat!(self, input, "system") {
1185 go!(self: to AfterDoctypeKeyword System);
1186 } else {
1187 match get_char!(self, input) {
1188 '\t' | '\n' | '\x0C' | ' ' => (),
1189 '>' => go!(self: emit_doctype; to Data),
1190 _ => go!(self: error; force_quirks; to BogusDoctype),
1191 }
1192 }
1193 },
1194
1195 //§ after-doctype-public-keyword-state after-doctype-system-keyword-state
1196 states::AfterDoctypeKeyword(kind) => loop {
1197 match get_char!(self, input) {
1198 '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeDoctypeIdentifier kind),
1199 '"' => {
1200 go!(self: error; clear_doctype_id kind; to DoctypeIdentifierDoubleQuoted kind)
1201 },
1202 '\'' => {
1203 go!(self: error; clear_doctype_id kind; to DoctypeIdentifierSingleQuoted kind)
1204 },
1205 '>' => go!(self: error; force_quirks; emit_doctype; to Data),
1206 _ => go!(self: error; force_quirks; to BogusDoctype),
1207 }
1208 },
1209
1210 //§ before-doctype-public-identifier-state before-doctype-system-identifier-state
1211 states::BeforeDoctypeIdentifier(kind) => loop {
1212 match get_char!(self, input) {
1213 '\t' | '\n' | '\x0C' | ' ' => (),
1214 '"' => go!(self: clear_doctype_id kind; to DoctypeIdentifierDoubleQuoted kind),
1215 '\'' => go!(self: clear_doctype_id kind; to DoctypeIdentifierSingleQuoted kind),
1216 '>' => go!(self: error; force_quirks; emit_doctype; to Data),
1217 _ => go!(self: error; force_quirks; to BogusDoctype),
1218 }
1219 },
1220
1221 //§ doctype-public-identifier-(double-quoted)-state doctype-system-identifier-(double-quoted)-state
1222 states::DoctypeIdentifierDoubleQuoted(kind) => loop {
1223 match get_char!(self, input) {
1224 '"' => go!(self: to AfterDoctypeIdentifier kind),
1225 '\0' => go!(self: error; push_doctype_id kind '\u{fffd}'),
1226 '>' => go!(self: error; force_quirks; emit_doctype; to Data),
1227 c => go!(self: push_doctype_id kind c),
1228 }
1229 },
1230
1231 //§ doctype-public-identifier-(single-quoted)-state doctype-system-identifier-(single-quoted)-state
1232 states::DoctypeIdentifierSingleQuoted(kind) => loop {
1233 match get_char!(self, input) {
1234 '\'' => go!(self: to AfterDoctypeIdentifier kind),
1235 '\0' => go!(self: error; push_doctype_id kind '\u{fffd}'),
1236 '>' => go!(self: error; force_quirks; emit_doctype; to Data),
1237 c => go!(self: push_doctype_id kind c),
1238 }
1239 },
1240
1241 //§ after-doctype-public-identifier-state
1242 states::AfterDoctypeIdentifier(Public) => loop {
1243 match get_char!(self, input) {
1244 '\t' | '\n' | '\x0C' | ' ' => {
1245 go!(self: to BetweenDoctypePublicAndSystemIdentifiers)
1246 },
1247 '>' => go!(self: emit_doctype; to Data),
1248 '"' => {
1249 go!(self: error; clear_doctype_id System; to DoctypeIdentifierDoubleQuoted System)
1250 },
1251 '\'' => {
1252 go!(self: error; clear_doctype_id System; to DoctypeIdentifierSingleQuoted System)
1253 },
1254 _ => go!(self: error; force_quirks; to BogusDoctype),
1255 }
1256 },
1257
1258 //§ after-doctype-system-identifier-state
1259 states::AfterDoctypeIdentifier(System) => loop {
1260 match get_char!(self, input) {
1261 '\t' | '\n' | '\x0C' | ' ' => (),
1262 '>' => go!(self: emit_doctype; to Data),
1263 _ => go!(self: error; to BogusDoctype),
1264 }
1265 },
1266
1267 //§ between-doctype-public-and-system-identifiers-state
1268 states::BetweenDoctypePublicAndSystemIdentifiers => loop {
1269 match get_char!(self, input) {
1270 '\t' | '\n' | '\x0C' | ' ' => (),
1271 '>' => go!(self: emit_doctype; to Data),
1272 '"' => {
1273 go!(self: clear_doctype_id System; to DoctypeIdentifierDoubleQuoted System)
1274 },
1275 '\'' => {
1276 go!(self: clear_doctype_id System; to DoctypeIdentifierSingleQuoted System)
1277 },
1278 _ => go!(self: error; force_quirks; to BogusDoctype),
1279 }
1280 },
1281
1282 //§ bogus-doctype-state
1283 states::BogusDoctype => loop {
1284 match get_char!(self, input) {
1285 '>' => go!(self: emit_doctype; to Data),
1286 _ => (),
1287 }
1288 },
1289
1290 //§ bogus-comment-state
1291 states::BogusComment => loop {
1292 match get_char!(self, input) {
1293 '>' => go!(self: emit_comment; to Data),
1294 '\0' => go!(self: push_comment '\u{fffd}'),
1295 c => go!(self: push_comment c),
1296 }
1297 },
1298
1299 //§ markup-declaration-open-state
1300 states::MarkupDeclarationOpen => loop {
1301 if eat_exact!(self, input, "--") {
1302 go!(self: clear_comment; to CommentStart);
1303 } else if eat!(self, input, "doctype") {
1304 go!(self: to Doctype);
1305 } else {
1306 if self
1307 .sink
1308 .adjusted_current_node_present_but_not_in_html_namespace()
1309 {
1310 if eat_exact!(self, input, "[CDATA[") {
1311 go!(self: clear_temp; to CdataSection);
1312 }
1313 }
1314 go!(self: error; to BogusComment);
1315 }
1316 },
1317
1318 //§ cdata-section-state
1319 states::CdataSection => loop {
1320 match get_char!(self, input) {
1321 ']' => go!(self: to CdataSectionBracket),
1322 '\0' => go!(self: emit_temp; emit '\0'),
1323 c => go!(self: push_temp c),
1324 }
1325 },
1326
1327 //§ cdata-section-bracket
1328 states::CdataSectionBracket => match get_char!(self, input) {
1329 ']' => go!(self: to CdataSectionEnd),
1330 _ => go!(self: push_temp ']'; reconsume CdataSection),
1331 },
1332
1333 //§ cdata-section-end
1334 states::CdataSectionEnd => loop {
1335 match get_char!(self, input) {
1336 ']' => go!(self: push_temp ']'),
1337 '>' => go!(self: emit_temp; to Data),
1338 _ => go!(self: push_temp ']'; push_temp ']'; reconsume CdataSection),
1339 }
1340 },
1341 //§ END
1342 }
1343 }
1344
1345 fn step_char_ref_tokenizer(&mut self, input: &mut BufferQueue) -> ProcessResult<Sink::Handle> {
1346 // FIXME HACK: Take and replace the tokenizer so we don't
1347 // double-mut-borrow self. This is why it's boxed.
1348 let mut tok = self.char_ref_tokenizer.take().unwrap();
1349 let outcome = tok.step(self, input);
1350
1351 let progress = match outcome {
1352 char_ref::Done => {
1353 self.process_char_ref(tok.get_result());
1354 return ProcessResult::Continue;
1355 },
1356
1357 char_ref::Stuck => ProcessResult::Suspend,
1358 char_ref::Progress => ProcessResult::Continue,
1359 };
1360
1361 self.char_ref_tokenizer = Some(tok);
1362 progress
1363 }
1364
1365 fn process_char_ref(&mut self, char_ref: CharRef) {
1366 let CharRef {
1367 mut chars,
1368 mut num_chars,
1369 } = char_ref;
1370
1371 if num_chars == 0 {
1372 chars[0] = '&';
1373 num_chars = 1;
1374 }
1375
1376 for i in 0..num_chars {
1377 let c = chars[i as usize];
1378 match self.state {
1379 states::Data | states::RawData(states::Rcdata) => go!(self: emit c),
1380
1381 states::AttributeValue(_) => go!(self: push_value c),
1382
1383 _ => panic!(
1384 "state {:?} should not be reachable in process_char_ref",
1385 self.state
1386 ),
1387 }
1388 }
1389 }
1390
    /// Indicate that we have reached the end of the input.
    ///
    /// Flushes the character-reference sub-tokenizer (if active), drains any
    /// remaining buffered input, applies the per-state EOF rules until the
    /// state machine settles, and finally notifies the sink.
    pub fn end(&mut self) {
        // Handle EOF in the char ref sub-tokenizer, if there is one.
        // Do this first because it might un-consume stuff.
        let mut input = BufferQueue::new();
        match self.char_ref_tokenizer.take() {
            None => (),
            Some(mut tok) => {
                tok.end_of_file(self, &mut input);
                self.process_char_ref(tok.get_result());
            },
        }

        // Process all remaining buffered input.
        // If we're waiting for lookahead, we're not gonna get it.
        self.at_eof = true;
        assert!(matches!(self.run(&mut input), TokenizerResult::Done));
        assert!(input.is_empty());

        // Run the EOF rule for the current state repeatedly; each step may
        // transition states, so iterate until one suspends.
        loop {
            match self.eof_step() {
                ProcessResult::Continue => (),
                ProcessResult::Suspend => break,
                // No script execution can be triggered at EOF.
                ProcessResult::Script(_) => unreachable!(),
            }
        }

        self.sink.end();

        if self.opts.profile {
            self.dump_profile();
        }
    }
1424
1425 fn dump_profile(&self) {
1426 let mut results: Vec<(states::State, u64)> =
1427 self.state_profile.iter().map(|(s, t)| (*s, *t)).collect();
1428 results.sort_by(|&(_, x), &(_, y)| y.cmp(&x));
1429
1430 let total: u64 = results
1431 .iter()
1432 .map(|&(_, t)| t)
1433 .fold(0, ::std::ops::Add::add);
1434 println!("\nTokenizer profile, in nanoseconds");
1435 println!("\n{:12} total in token sink", self.time_in_sink);
1436 println!("\n{:12} total in tokenizer", total);
1437
1438 for (k, v) in results.into_iter() {
1439 let pct = 100.0 * (v as f64) / (total as f64);
1440 println!("{:12} {:4.1}% {:?}", v, pct, k);
1441 }
1442 }
1443
    /// Apply the EOF rule for the current tokenizer state.
    ///
    /// Called repeatedly from `end()` after all input is consumed. Each call
    /// may emit tokens, report a parse error, and/or transition states;
    /// returns `Suspend` (via the `eof` action) once the machine is done.
    fn eof_step(&mut self) -> ProcessResult<Sink::Handle> {
        debug!("processing EOF in state {:?}", self.state);
        match self.state {
            // States where EOF simply ends the token stream.
            states::Data |
            states::RawData(Rcdata) |
            states::RawData(Rawtext) |
            states::RawData(ScriptData) |
            states::Plaintext => go!(self: eof),

            // Mid-tag states: EOF here is a parse error; abandon the tag.
            states::TagName |
            states::RawData(ScriptDataEscaped(_)) |
            states::BeforeAttributeName |
            states::AttributeName |
            states::AfterAttributeName |
            states::BeforeAttributeValue |
            states::AttributeValue(_) |
            states::AfterAttributeValueQuoted |
            states::SelfClosingStartTag |
            states::ScriptDataEscapedDash(_) |
            states::ScriptDataEscapedDashDash(_) => go!(self: error_eof; to Data),

            // A lone '<' (or '</') at EOF is emitted as literal text.
            states::TagOpen => go!(self: error_eof; emit '<'; to Data),

            states::EndTagOpen => go!(self: error_eof; emit '<'; emit '/'; to Data),

            states::RawLessThanSign(ScriptDataEscaped(DoubleEscaped)) => {
                go!(self: to RawData ScriptDataEscaped DoubleEscaped)
            },

            states::RawLessThanSign(kind) => go!(self: emit '<'; to RawData kind),

            states::RawEndTagOpen(kind) => go!(self: emit '<'; emit '/'; to RawData kind),

            states::RawEndTagName(kind) => {
                go!(self: emit '<'; emit '/'; emit_temp; to RawData kind)
            },

            states::ScriptDataEscapeStart(kind) => go!(self: to RawData ScriptDataEscaped kind),

            states::ScriptDataEscapeStartDash => go!(self: to RawData ScriptData),

            states::ScriptDataDoubleEscapeEnd => {
                go!(self: to RawData ScriptDataEscaped DoubleEscaped)
            },

            // An unterminated comment is emitted with whatever was collected.
            states::CommentStart |
            states::CommentStartDash |
            states::Comment |
            states::CommentEndDash |
            states::CommentEnd |
            states::CommentEndBang => go!(self: error_eof; emit_comment; to Data),

            // An unterminated doctype forces quirks mode.
            states::Doctype | states::BeforeDoctypeName => {
                go!(self: error_eof; create_doctype; force_quirks; emit_doctype; to Data)
            },

            states::DoctypeName |
            states::AfterDoctypeName |
            states::AfterDoctypeKeyword(_) |
            states::BeforeDoctypeIdentifier(_) |
            states::DoctypeIdentifierDoubleQuoted(_) |
            states::DoctypeIdentifierSingleQuoted(_) |
            states::AfterDoctypeIdentifier(_) |
            states::BetweenDoctypePublicAndSystemIdentifiers => {
                go!(self: error_eof; force_quirks; emit_doctype; to Data)
            },

            states::BogusDoctype => go!(self: emit_doctype; to Data),

            states::BogusComment => go!(self: emit_comment; to Data),

            states::MarkupDeclarationOpen => go!(self: error; to BogusComment),

            // An unterminated CDATA section flushes its buffered text.
            states::CdataSection => go!(self: emit_temp; error_eof; to Data),

            states::CdataSectionBracket => go!(self: push_temp ']'; to CdataSection),

            states::CdataSectionEnd => go!(self: push_temp ']'; push_temp ']'; to CdataSection),
        }
    }
1524}
1525
#[cfg(test)]
#[allow(non_snake_case)]
mod test {
    use super::option_push; // private items
    use crate::tendril::{SliceExt, StrTendril};

    use super::{TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts};

    use super::interface::{CharacterTokens, EOFToken, NullCharacterToken, ParseError};
    use super::interface::{EndTag, StartTag, Tag, TagKind};
    use super::interface::{TagToken, Token};

    use markup5ever::buffer_queue::BufferQueue;
    use std::mem::replace;

    use crate::LocalName;

    // LinesMatch implements the TokenSink trait. It is used for testing to see
    // if current_line is being updated when process_token is called. The lines
    // vector is a collection of the line numbers that each token is on.
    struct LinesMatch {
        tokens: Vec<Token>,
        current_str: StrTendril,
        lines: Vec<(Token, u64)>,
    }

    impl LinesMatch {
        fn new() -> LinesMatch {
            LinesMatch {
                tokens: vec![],
                current_str: StrTendril::new(),
                lines: vec![],
            }
        }

        // Record a token together with the line number it was seen on,
        // flushing any buffered character data first.
        fn push(&mut self, token: Token, line_number: u64) {
            self.finish_str();
            self.lines.push((token, line_number));
        }

        // Flush buffered character data into `tokens` as one CharacterTokens.
        fn finish_str(&mut self) {
            if !self.current_str.is_empty() {
                let s = replace(&mut self.current_str, StrTendril::new());
                self.tokens.push(CharacterTokens(s));
            }
        }
    }

    impl TokenSink for LinesMatch {
        type Handle = ();

        fn process_token(
            &mut self,
            token: Token,
            line_number: u64,
        ) -> TokenSinkResult<Self::Handle> {
            match token {
                CharacterTokens(b) => {
                    self.current_str.push_slice(&b);
                },

                NullCharacterToken => {
                    self.current_str.push_char('\0');
                },

                ParseError(_) => {
                    panic!("unexpected parse error");
                },

                TagToken(mut t) => {
                    // The spec seems to indicate that one can emit
                    // erroneous end tags with attrs, but the test
                    // cases don't contain them.
                    match t.kind {
                        EndTag => {
                            t.self_closing = false;
                            t.attrs = vec![];
                        },
                        _ => t.attrs.sort_by(|a1, a2| a1.name.cmp(&a2.name)),
                    }
                    self.push(TagToken(t), line_number);
                },

                EOFToken => (),

                _ => self.push(token, line_number),
            }
            TokenSinkResult::Continue
        }
    }

    // Take in tokens, process them, and return vector with line
    // numbers that each token is on
    fn tokenize(input: Vec<StrTendril>, opts: TokenizerOpts) -> Vec<(Token, u64)> {
        let sink = LinesMatch::new();
        let mut tok = Tokenizer::new(sink, opts);
        let mut buffer = BufferQueue::new();
        for chunk in input.into_iter() {
            buffer.push_back(chunk);
            let _ = tok.feed(&mut buffer);
        }
        tok.end();
        tok.sink.lines
    }

    // Create a tag token
    fn create_tag(token: StrTendril, tagkind: TagKind) -> Token {
        TagToken(Tag {
            kind: tagkind,
            name: LocalName::from(&*token),
            self_closing: false,
            attrs: vec![],
        })
    }

    // Common options shared by the line-number tests.
    fn test_opts() -> TokenizerOpts {
        TokenizerOpts {
            exact_errors: false,
            discard_bom: true,
            profile: false,
            initial_state: None,
            last_start_tag_name: None,
        }
    }

    #[test]
    fn push_to_None_gives_singleton() {
        let mut s: Option<StrTendril> = None;
        option_push(&mut s, 'x');
        assert_eq!(s, Some("x".to_tendril()));
    }

    #[test]
    fn push_to_empty_appends() {
        let mut s: Option<StrTendril> = Some(StrTendril::new());
        option_push(&mut s, 'x');
        assert_eq!(s, Some("x".to_tendril()));
    }

    #[test]
    fn push_to_nonempty_appends() {
        let mut s: Option<StrTendril> = Some(StrTendril::from_slice("y"));
        option_push(&mut s, 'x');
        assert_eq!(s, Some("yx".to_tendril()));
    }

    #[test]
    fn check_lines() {
        let vector = vec![
            StrTendril::from("<a>\n"),
            StrTendril::from("<b>\n"),
            StrTendril::from("</b>\n"),
            StrTendril::from("</a>\n"),
        ];
        let expected = vec![
            (create_tag(StrTendril::from("a"), StartTag), 1),
            (create_tag(StrTendril::from("b"), StartTag), 2),
            (create_tag(StrTendril::from("b"), EndTag), 3),
            (create_tag(StrTendril::from("a"), EndTag), 4),
        ];
        let results = tokenize(vector, test_opts());
        assert_eq!(results, expected);
    }

    #[test]
    fn check_lines_with_new_line() {
        let vector = vec![
            StrTendril::from("<a>\r\n"),
            StrTendril::from("<b>\r\n"),
            StrTendril::from("</b>\r\n"),
            StrTendril::from("</a>\r\n"),
        ];
        let expected = vec![
            (create_tag(StrTendril::from("a"), StartTag), 1),
            (create_tag(StrTendril::from("b"), StartTag), 2),
            (create_tag(StrTendril::from("b"), EndTag), 3),
            (create_tag(StrTendril::from("a"), EndTag), 4),
        ];
        let results = tokenize(vector, test_opts());
        assert_eq!(results, expected);
    }
}
1714