1// Copyright 2014-2017 The html5ever Project Developers. See the
2// COPYRIGHT file at the top-level directory of this distribution.
3//
4// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7// option. This file may not be copied, modified, or distributed
8// except according to those terms.
9
10//! The HTML5 tokenizer.
11
12pub use self::interface::{CharacterTokens, EOFToken, NullCharacterToken, ParseError};
13pub use self::interface::{CommentToken, DoctypeToken, TagToken, Token};
14pub use self::interface::{Doctype, EndTag, StartTag, Tag, TagKind};
15pub use self::interface::{TokenSink, TokenSinkResult};
16
17use self::states::{DoctypeIdKind, Public, System};
18use self::states::{DoubleEscaped, Escaped};
19use self::states::{DoubleQuoted, SingleQuoted, Unquoted};
20use self::states::{Rawtext, Rcdata, ScriptData, ScriptDataEscaped};
21
22use self::char_ref::{CharRef, CharRefTokenizer};
23
24use crate::util::str::lower_ascii_letter;
25
26use log::{debug, trace};
27use mac::format_if;
28use markup5ever::{namespace_url, ns, small_char_set};
29use std::borrow::Cow::{self, Borrowed};
30use std::cell::{Cell, RefCell, RefMut};
31use std::collections::BTreeMap;
32use std::mem;
33
34pub use crate::buffer_queue::{BufferQueue, FromSet, NotFromSet, SetResult};
35use crate::tendril::StrTendril;
36use crate::{Attribute, LocalName, QualName, SmallCharSet};
37
38mod char_ref;
39mod interface;
40pub mod states;
41
/// Result of a single step of the tokenizer state machine.
pub enum ProcessResult<Handle> {
    /// Keep stepping the state machine.
    Continue,
    /// Stop stepping for now (e.g. more input is needed, or EOF was emitted).
    Suspend,
    /// The token sink suspended tokenization and handed back a node handle
    /// (produced when a tag emission returns `TokenSinkResult::Script`).
    Script(Handle),
}
47
/// Result of running the tokenizer until it can make no more progress.
#[must_use]
#[derive(Debug)]
pub enum TokenizerResult<Handle> {
    /// The available input was consumed (or more input is required).
    Done,
    /// Tokenization paused; the caller must deal with the returned handle
    /// (see `ProcessResult::Script`) before feeding more input.
    Script(Handle),
}
54
55fn option_push(opt_str: &mut Option<StrTendril>, c: char) {
56 match *opt_str {
57 Some(ref mut s: &mut Tendril) => s.push_char(c),
58 None => *opt_str = Some(StrTendril::from_char(c)),
59 }
60}
61
/// Tokenizer options, with an impl for `Default`.
#[derive(Clone)]
pub struct TokenizerOpts {
    /// Report all parse errors described in the spec, at some
    /// performance penalty? Default: false
    pub exact_errors: bool,

    /// Discard a `U+FEFF BYTE ORDER MARK` if we see one at the beginning
    /// of the stream? Default: true
    pub discard_bom: bool,

    /// Keep a record of how long we spent in each state? Printed
    /// when `end()` is called. Default: false
    pub profile: bool,

    /// Initial state override. Only the test runner should use
    /// a non-`None` value!
    pub initial_state: Option<states::State>,

    /// Last start tag. Only the test runner should use a
    /// non-`None` value!
    ///
    /// FIXME: Can't use Tendril because we want TokenizerOpts
    /// to be Send.
    pub last_start_tag_name: Option<String>,
}
88
89impl Default for TokenizerOpts {
90 fn default() -> TokenizerOpts {
91 TokenizerOpts {
92 exact_errors: false,
93 discard_bom: true,
94 profile: false,
95 initial_state: None,
96 last_start_tag_name: None,
97 }
98 }
99}
100
/// The HTML tokenizer.
///
/// All mutable state lives in `Cell`/`RefCell` fields, so the public
/// methods (`feed`, etc.) take `&self`.
pub struct Tokenizer<Sink> {
    /// Options controlling the behavior of the tokenizer.
    opts: TokenizerOpts,

    /// Destination for tokens we emit.
    pub sink: Sink,

    /// The abstract machine state as described in the spec.
    state: Cell<states::State>,

    /// Are we at the end of the file, once buffers have been processed
    /// completely? This affects whether we will wait for lookahead or not.
    at_eof: Cell<bool>,

    /// Tokenizer for character references, if we're tokenizing
    /// one at the moment.
    char_ref_tokenizer: RefCell<Option<Box<CharRefTokenizer>>>,

    /// Current input character. Just consumed, may reconsume.
    current_char: Cell<char>,

    /// Should we reconsume the current input character?
    reconsume: Cell<bool>,

    /// Did we just consume \r, translating it to \n? In that case we need
    /// to ignore the next character if it's \n.
    ignore_lf: Cell<bool>,

    /// Discard a U+FEFF BYTE ORDER MARK if we see one? Only done at the
    /// beginning of the stream.
    discard_bom: Cell<bool>,

    /// Current tag kind.
    current_tag_kind: Cell<TagKind>,

    /// Current tag name.
    current_tag_name: RefCell<StrTendril>,

    /// Current tag is self-closing?
    current_tag_self_closing: Cell<bool>,

    /// Current tag attributes.
    current_tag_attrs: RefCell<Vec<Attribute>>,

    /// Current attribute name.
    current_attr_name: RefCell<StrTendril>,

    /// Current attribute value.
    current_attr_value: RefCell<StrTendril>,

    /// Current comment.
    current_comment: RefCell<StrTendril>,

    /// Current doctype token.
    current_doctype: RefCell<Doctype>,

    /// Last start tag name, for use in checking "appropriate end tag".
    last_start_tag_name: RefCell<Option<LocalName>>,

    /// The "temporary buffer" mentioned in the spec.
    temp_buf: RefCell<StrTendril>,

    /// Record of how many ns we spent in each state, if profiling is enabled.
    state_profile: RefCell<BTreeMap<states::State, u64>>,

    /// Record of how many ns we spent in the token sink.
    time_in_sink: Cell<u64>,

    /// Track current line
    current_line: Cell<u64>,
}
173
174impl<Sink: TokenSink> Tokenizer<Sink> {
175 /// Create a new tokenizer which feeds tokens to a particular `TokenSink`.
176 pub fn new(sink: Sink, mut opts: TokenizerOpts) -> Tokenizer<Sink> {
177 let start_tag_name = opts
178 .last_start_tag_name
179 .take()
180 .map(|s| LocalName::from(&*s));
181 let state = opts.initial_state.unwrap_or(states::Data);
182 let discard_bom = opts.discard_bom;
183 Tokenizer {
184 opts,
185 sink,
186 state: Cell::new(state),
187 char_ref_tokenizer: RefCell::new(None),
188 at_eof: Cell::new(false),
189 current_char: Cell::new('\0'),
190 reconsume: Cell::new(false),
191 ignore_lf: Cell::new(false),
192 discard_bom: Cell::new(discard_bom),
193 current_tag_kind: Cell::new(StartTag),
194 current_tag_name: RefCell::new(StrTendril::new()),
195 current_tag_self_closing: Cell::new(false),
196 current_tag_attrs: RefCell::new(vec![]),
197 current_attr_name: RefCell::new(StrTendril::new()),
198 current_attr_value: RefCell::new(StrTendril::new()),
199 current_comment: RefCell::new(StrTendril::new()),
200 current_doctype: RefCell::new(Doctype::default()),
201 last_start_tag_name: RefCell::new(start_tag_name),
202 temp_buf: RefCell::new(StrTendril::new()),
203 state_profile: RefCell::new(BTreeMap::new()),
204 time_in_sink: Cell::new(0),
205 current_line: Cell::new(1),
206 }
207 }
208
209 /// Feed an input string into the tokenizer.
210 pub fn feed(&self, input: &BufferQueue) -> TokenizerResult<Sink::Handle> {
211 if input.is_empty() {
212 return TokenizerResult::Done;
213 }
214
215 if self.discard_bom.get() {
216 if let Some(c) = input.peek() {
217 if c == '\u{feff}' {
218 input.next();
219 }
220 } else {
221 return TokenizerResult::Done;
222 }
223 };
224
225 self.run(input)
226 }
227
    /// Switch the tokenizer into the PLAINTEXT state, where the remaining
    /// input is emitted as character data (see the `states::Plaintext`
    /// handler in `step`).
    pub fn set_plaintext_state(&self) {
        self.state.set(states::Plaintext);
    }
231
    /// Hand a token to the sink, timing the call when profiling is enabled
    /// so sink time can be subtracted from per-state totals in `run`.
    fn process_token(&self, token: Token) -> TokenSinkResult<Sink::Handle> {
        if self.opts.profile {
            let (ret, dt) = time!(self.sink.process_token(token, self.current_line.get()));
            self.time_in_sink.set(self.time_in_sink.get() + dt);
            ret
        } else {
            self.sink.process_token(token, self.current_line.get())
        }
    }
241
    /// Hand a token to the sink, asserting that the sink replies `Continue`.
    /// Used for tokens (characters, comments, errors, EOF) for which the
    /// sink is not expected to redirect the tokenizer.
    fn process_token_and_continue(&self, token: Token) {
        assert!(matches!(
            self.process_token(token),
            TokenSinkResult::Continue
        ));
    }
248
    //§ preprocessing-the-input-stream
    // Get the next input character, which might be the character
    // 'c' that we already consumed from the buffers.
    //
    // Performs the spec's input preprocessing: folds \r and \r\n into \n,
    // counts lines, and (under `exact_errors`) reports control characters
    // and non-characters. Returns `None` if skipping the \n half of a \r\n
    // pair leaves no character available.
    fn get_preprocessed_char(&self, mut c: char, input: &BufferQueue) -> Option<char> {
        if self.ignore_lf.get() {
            self.ignore_lf.set(false);
            if c == '\n' {
                // This \n is the second half of a \r\n pair; skip it.
                c = input.next()?;
            }
        }

        if c == '\r' {
            // Normalize to \n and remember to drop an immediately
            // following \n.
            self.ignore_lf.set(true);
            c = '\n';
        }

        if c == '\n' {
            self.current_line.set(self.current_line.get() + 1);
        }

        // Only checked in exact-errors mode: control characters and
        // Unicode non-characters (U+FDD0..U+FDEF and U+xxFFFE/U+xxFFFF).
        if self.opts.exact_errors
            && match c as u32 {
                0x01..=0x08 | 0x0B | 0x0E..=0x1F | 0x7F..=0x9F | 0xFDD0..=0xFDEF => true,
                n if (n & 0xFFFE) == 0xFFFE => true,
                _ => false,
            }
        {
            let msg = format!("Bad character {c}");
            self.emit_error(Cow::Owned(msg));
        }

        trace!("got character {}", c);
        self.current_char.set(c);
        Some(c)
    }
284
285 //ยง tokenization
286 // Get the next input character, if one is available.
287 fn get_char(&self, input: &BufferQueue) -> Option<char> {
288 if self.reconsume.get() {
289 self.reconsume.set(false);
290 Some(self.current_char.get())
291 } else {
292 input
293 .next()
294 .and_then(|c| self.get_preprocessed_char(c, input))
295 }
296 }
297
    /// Pop either a single character from `set` (`FromSet`) or a run of
    /// characters not in `set` (`NotFromSet`); `None` means no input.
    /// This is the fast path used by the data-like states.
    fn pop_except_from(&self, input: &BufferQueue, set: SmallCharSet) -> Option<SetResult> {
        // Bail to the slow path for various corner cases.
        // This means that `FromSet` can contain characters not in the set!
        // It shouldn't matter because the fallback `FromSet` case should
        // always do the same thing as the `NotFromSet` case.
        if self.opts.exact_errors || self.reconsume.get() || self.ignore_lf.get() {
            return self.get_char(input).map(FromSet);
        }

        let d = input.pop_except_from(set);
        trace!("got characters {:?}", d);
        match d {
            Some(FromSet(c)) => self.get_preprocessed_char(c, input).map(FromSet),

            // NB: We don't set self.current_char for a run of characters not
            // in the set. It shouldn't matter for the codepaths that use
            // this.
            _ => d,
        }
    }
318
    // Check if the next characters are an ASCII case-insensitive match. See
    // BufferQueue::eat.
    //
    // NB: this doesn't set the current input character.
    //
    // Returns Some(true) on a match, Some(false) on a definite non-match
    // (including when the pattern can't complete before EOF), and None when
    // more input is needed to decide. In the None case the characters
    // examined so far are stashed in `temp_buf` and pushed back onto the
    // front of the queue at the start of the next attempt.
    fn eat(&self, input: &BufferQueue, pat: &str, eq: fn(&u8, &u8) -> bool) -> Option<bool> {
        // A pending \r\n fold-up: drop the \n half before matching.
        if self.ignore_lf.get() {
            self.ignore_lf.set(false);
            if self.peek(input) == Some('\n') {
                self.discard_char(input);
            }
        }

        // Re-queue anything stashed by an earlier partial attempt.
        input.push_front(mem::take(&mut self.temp_buf.borrow_mut()));
        match input.eat(pat, eq) {
            None if self.at_eof.get() => Some(false),
            None => {
                // Not enough input yet: stash the remainder and suspend.
                while let Some(data) = input.next() {
                    self.temp_buf.borrow_mut().push_char(data);
                }
                None
            },
            Some(matched) => Some(matched),
        }
    }
343
344 /// Run the state machine for as long as we can.
345 fn run(&self, input: &BufferQueue) -> TokenizerResult<Sink::Handle> {
346 if self.opts.profile {
347 loop {
348 let state = self.state.get();
349 let old_sink = self.time_in_sink.get();
350 let (run, mut dt) = time!(self.step(input));
351 dt -= (self.time_in_sink.get() - old_sink);
352 let new = match self.state_profile.borrow_mut().get_mut(&state) {
353 Some(x) => {
354 *x += dt;
355 false
356 },
357 None => true,
358 };
359 if new {
360 // do this here because of borrow shenanigans
361 self.state_profile.borrow_mut().insert(state, dt);
362 }
363 match run {
364 ProcessResult::Continue => (),
365 ProcessResult::Suspend => break,
366 ProcessResult::Script(node) => return TokenizerResult::Script(node),
367 }
368 }
369 } else {
370 loop {
371 match self.step(input) {
372 ProcessResult::Continue => (),
373 ProcessResult::Suspend => break,
374 ProcessResult::Script(node) => return TokenizerResult::Script(node),
375 }
376 }
377 }
378 TokenizerResult::Done
379 }
380
    /// Report the current input character as a parse error; the detailed
    /// message (including the state) is only built in exact-errors mode.
    fn bad_char_error(&self) {
        let msg = format_if!(
            self.opts.exact_errors,
            "Bad character",
            "Saw {} in state {:?}",
            self.current_char.get(),
            self.state.get()
        );
        self.emit_error(msg);
    }
391
    /// Report an unexpected end-of-file parse error; the detailed message
    /// (including the state) is only built in exact-errors mode.
    fn bad_eof_error(&self) {
        let msg = format_if!(
            self.opts.exact_errors,
            "Unexpected EOF",
            "Saw EOF in state {:?}",
            self.state.get()
        );
        self.emit_error(msg);
    }
401
402 fn emit_char(&self, c: char) {
403 self.process_token_and_continue(match c {
404 '\0' => NullCharacterToken,
405 _ => CharacterTokens(StrTendril::from_char(c)),
406 });
407 }
408
    // Emit a whole run of characters as one token.
    // The string must not contain '\0'!
    fn emit_chars(&self, b: StrTendril) {
        self.process_token_and_continue(CharacterTokens(b));
    }
413
    /// Emit the tag currently being built (after finishing any pending
    /// attribute) and map the sink's response to a `ProcessResult`: the
    /// sink may redirect the tokenizer to Plaintext or raw-data states,
    /// or suspend it to run a script.
    fn emit_current_tag(&self) -> ProcessResult<Sink::Handle> {
        self.finish_attribute();

        let name = LocalName::from(&**self.current_tag_name.borrow());
        self.current_tag_name.borrow_mut().clear();

        match self.current_tag_kind.get() {
            StartTag => {
                // Remember the name for "appropriate end tag" checks.
                *self.last_start_tag_name.borrow_mut() = Some(name.clone());
            },
            EndTag => {
                // The spec forbids attributes and self-closing on end tags.
                if !self.current_tag_attrs.borrow().is_empty() {
                    self.emit_error(Borrowed("Attributes on an end tag"));
                }
                if self.current_tag_self_closing.get() {
                    self.emit_error(Borrowed("Self-closing end tag"));
                }
            },
        }

        let token = TagToken(Tag {
            kind: self.current_tag_kind.get(),
            name,
            self_closing: self.current_tag_self_closing.get(),
            attrs: std::mem::take(&mut self.current_tag_attrs.borrow_mut()),
        });

        match self.process_token(token) {
            TokenSinkResult::Continue => ProcessResult::Continue,
            TokenSinkResult::Plaintext => {
                self.state.set(states::Plaintext);
                ProcessResult::Continue
            },
            TokenSinkResult::Script(node) => {
                self.state.set(states::Data);
                ProcessResult::Script(node)
            },
            TokenSinkResult::RawData(kind) => {
                self.state.set(states::RawData(kind));
                ProcessResult::Continue
            },
        }
    }
457
458 fn emit_temp_buf(&self) {
459 // FIXME: Make sure that clearing on emit is spec-compatible.
460 let buf = mem::take(&mut *self.temp_buf.borrow_mut());
461 self.emit_chars(buf);
462 }
463
    /// Empty the temporary buffer in place.
    fn clear_temp_buf(&self) {
        // Do this without a new allocation.
        self.temp_buf.borrow_mut().clear();
    }
468
469 fn emit_current_comment(&self) {
470 let comment = mem::take(&mut *self.current_comment.borrow_mut());
471 self.process_token_and_continue(CommentToken(comment));
472 }
473
474 fn discard_tag(&self) {
475 self.current_tag_name.borrow_mut().clear();
476 self.current_tag_self_closing.set(false);
477 *self.current_tag_attrs.borrow_mut() = vec![];
478 }
479
480 fn create_tag(&self, kind: TagKind, c: char) {
481 self.discard_tag();
482 self.current_tag_name.borrow_mut().push_char(c);
483 self.current_tag_kind.set(kind);
484 }
485
486 fn have_appropriate_end_tag(&self) -> bool {
487 match self.last_start_tag_name.borrow().as_ref() {
488 Some(last) => {
489 (self.current_tag_kind.get() == EndTag)
490 && (**self.current_tag_name.borrow() == **last)
491 },
492 None => false,
493 }
494 }
495
    /// Finish any attribute in progress and start a new one whose name
    /// begins with `c`.
    fn create_attribute(&self, c: char) {
        self.finish_attribute();

        self.current_attr_name.borrow_mut().push_char(c);
    }
501
    /// Finish the attribute currently being built, if any, appending it to
    /// the current tag. A duplicate name is reported as a parse error and
    /// the new attribute is dropped.
    fn finish_attribute(&self) {
        if self.current_attr_name.borrow().is_empty() {
            return;
        }

        // Check for a duplicate attribute.
        // FIXME: the spec says we should error as soon as the name is finished.
        let dup = {
            let name = &*self.current_attr_name.borrow();
            self.current_tag_attrs
                .borrow()
                .iter()
                .any(|a| *a.name.local == **name)
        };

        if dup {
            self.emit_error(Borrowed("Duplicate attribute"));
            self.current_attr_name.borrow_mut().clear();
            self.current_attr_value.borrow_mut().clear();
        } else {
            let name = LocalName::from(&**self.current_attr_name.borrow());
            self.current_attr_name.borrow_mut().clear();
            self.current_tag_attrs.borrow_mut().push(Attribute {
                // The tree builder will adjust the namespace if necessary.
                // This only happens in foreign elements.
                name: QualName::new(None, ns!(), name),
                value: mem::take(&mut self.current_attr_value.borrow_mut()),
            });
        }
    }
532
    /// Emit the doctype currently being built, resetting it to default.
    fn emit_current_doctype(&self) {
        let doctype = self.current_doctype.take();
        self.process_token_and_continue(DoctypeToken(doctype));
    }
537
    /// Mutably borrow the public or system identifier (selected by `kind`)
    /// of the doctype being built.
    fn doctype_id(&self, kind: DoctypeIdKind) -> RefMut<Option<StrTendril>> {
        let current_doctype = self.current_doctype.borrow_mut();
        match kind {
            Public => RefMut::map(current_doctype, |d| &mut d.public_id),
            System => RefMut::map(current_doctype, |d| &mut d.system_id),
        }
    }
545
546 fn clear_doctype_id(&self, kind: DoctypeIdKind) {
547 let mut id = self.doctype_id(kind);
548 match *id {
549 Some(ref mut s) => s.clear(),
550 None => *id = Some(StrTendril::new()),
551 }
552 }
553
    /// Begin tokenizing a character reference; the flag passed to the
    /// sub-tokenizer records whether we are inside an attribute value.
    fn consume_char_ref(&self) {
        *self.char_ref_tokenizer.borrow_mut() = Some(Box::new(CharRefTokenizer::new(matches!(
            self.state.get(),
            states::AttributeValue(_)
        ))));
    }
560
    /// Emit the end-of-file token.
    fn emit_eof(&self) {
        self.process_token_and_continue(EOFToken);
    }
564
565 fn peek(&self, input: &BufferQueue) -> Option<char> {
566 if self.reconsume.get() {
567 Some(self.current_char.get())
568 } else {
569 input.peek()
570 }
571 }
572
    /// Throw away the next raw input character.
    fn discard_char(&self, input: &BufferQueue) {
        // peek() deals in un-processed characters (no newline normalization), while get_char()
        // does.
        //
        // since discard_char is supposed to be used in combination with peek(), discard_char must
        // discard a single raw input character, not a normalized newline.
        if self.reconsume.get() {
            // The pending character is the one waiting to be reconsumed.
            self.reconsume.set(false);
        } else {
            input.next();
        }
    }
585
    /// Report a parse error to the sink as a `ParseError` token.
    fn emit_error(&self, error: Cow<'static, str>) {
        self.process_token_and_continue(ParseError(error));
    }
589}
//§ END
591
// Shorthand for common state machine behaviors.
//
// Each rule maps a short command name (used by the `go!` DSL below) onto a
// method call or field mutation on the tokenizer `$me`.
macro_rules! shorthand (
    ( $me:ident : emit $c:expr ) => ( $me.emit_char($c) );
    ( $me:ident : create_tag $kind:ident $c:expr ) => ( $me.create_tag($kind, $c) );
    ( $me:ident : push_tag $c:expr ) => ( $me.current_tag_name.borrow_mut().push_char($c) );
    ( $me:ident : discard_tag ) => ( $me.discard_tag() );
    ( $me:ident : discard_char $input:expr ) => ( $me.discard_char($input) );
    ( $me:ident : push_temp $c:expr ) => ( $me.temp_buf.borrow_mut().push_char($c) );
    ( $me:ident : emit_temp ) => ( $me.emit_temp_buf() );
    ( $me:ident : clear_temp ) => ( $me.clear_temp_buf() );
    ( $me:ident : create_attr $c:expr ) => ( $me.create_attribute($c) );
    ( $me:ident : push_name $c:expr ) => ( $me.current_attr_name.borrow_mut().push_char($c) );
    ( $me:ident : push_value $c:expr ) => ( $me.current_attr_value.borrow_mut().push_char($c) );
    ( $me:ident : append_value $c:expr ) => ( $me.current_attr_value.borrow_mut().push_tendril($c));
    ( $me:ident : push_comment $c:expr ) => ( $me.current_comment.borrow_mut().push_char($c) );
    ( $me:ident : append_comment $c:expr ) => ( $me.current_comment.borrow_mut().push_slice($c) );
    ( $me:ident : emit_comment ) => ( $me.emit_current_comment() );
    ( $me:ident : clear_comment ) => ( $me.current_comment.borrow_mut().clear() );
    ( $me:ident : create_doctype ) => ( *$me.current_doctype.borrow_mut() = Doctype::default() );
    ( $me:ident : push_doctype_name $c:expr ) => ( option_push(&mut $me.current_doctype.borrow_mut().name, $c) );
    ( $me:ident : push_doctype_id $k:ident $c:expr ) => ( option_push(&mut $me.doctype_id($k), $c) );
    ( $me:ident : clear_doctype_id $k:ident ) => ( $me.clear_doctype_id($k) );
    ( $me:ident : force_quirks ) => ( $me.current_doctype.borrow_mut().force_quirks = true);
    ( $me:ident : emit_doctype ) => ( $me.emit_current_doctype() );
    ( $me:ident : error ) => ( $me.bad_char_error() );
    ( $me:ident : error_eof ) => ( $me.bad_eof_error() );
);
619
// Tracing of tokenizer actions. This adds significant bloat and compile time,
// so it's behind a cfg flag.
//
// With the flag on, each shorthand command is logged via `trace!` before it
// runs; with it off, `sh_trace!` forwards straight to `shorthand!`.
#[cfg(feature = "trace_tokenizer")]
macro_rules! sh_trace ( ( $me:ident : $($cmds:tt)* ) => ({
    trace!("  {:?}", stringify!($($cmds)*));
    shorthand!($me : $($cmds)*);
}));

#[cfg(not(feature = "trace_tokenizer"))]
macro_rules! sh_trace ( ( $me:ident : $($cmds:tt)* ) => ( shorthand!($me: $($cmds)*) ) );
630
// A little DSL for sequencing shorthand actions.
//
// A `go!` invocation is a `;`-separated list of shorthand commands,
// optionally ending with a state transition (`to`, `reconsume`,
// `emit_tag`, `consume_char_ref`, `eof`) that early-returns a
// `ProcessResult` from the enclosing `step` function.
macro_rules! go (
    // A pattern like $($cmd:tt)* ; $($rest:tt)* causes parse ambiguity.
    // We have to tell the parser how much lookahead we need.

    ( $me:ident : $a:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a); go!($me: $($rest)*); });
    ( $me:ident : $a:tt $b:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b); go!($me: $($rest)*); });
    ( $me:ident : $a:tt $b:tt $c:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b $c); go!($me: $($rest)*); });
    ( $me:ident : $a:tt $b:tt $c:tt $d:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b $c $d); go!($me: $($rest)*); });

    // These can only come at the end.

    ( $me:ident : to $s:ident ) => ({ $me.state.set(states::$s); return ProcessResult::Continue; });
    ( $me:ident : to $s:ident $k1:expr ) => ({ $me.state.set(states::$s($k1)); return ProcessResult::Continue; });
    ( $me:ident : to $s:ident $k1:ident $k2:expr ) => ({ $me.state.set(states::$s($k1($k2))); return ProcessResult::Continue; });

    ( $me:ident : reconsume $s:ident ) => ({ $me.reconsume.set(true); go!($me: to $s); });
    ( $me:ident : reconsume $s:ident $k1:expr ) => ({ $me.reconsume.set(true); go!($me: to $s $k1); });
    ( $me:ident : reconsume $s:ident $k1:ident $k2:expr ) => ({ $me.reconsume.set(true); go!($me: to $s $k1 $k2); });

    ( $me:ident : consume_char_ref ) => ({ $me.consume_char_ref(); return ProcessResult::Continue; });

    // We have a default next state after emitting a tag, but the sink can override.
    ( $me:ident : emit_tag $s:ident ) => ({
        $me.state.set(states::$s);
        return $me.emit_current_tag();
    });

    ( $me:ident : eof ) => ({ $me.emit_eof(); return ProcessResult::Suspend; });

    // If nothing else matched, it's a single command
    ( $me:ident : $($cmd:tt)+ ) => ( sh_trace!($me: $($cmd)+) );

    // or nothing.
    ( $me:ident : ) => (());
);
667
// Run `$cmds` when `$x` matches any of the given patterns; otherwise do
// nothing.
macro_rules! go_match ( ( $me:ident : $x:expr, $($pats:pat),+ => $($cmds:tt)* ) => (
    match $x {
        $($pats)|+ => go!($me: $($cmds)*),
        _ => (),
    }
));
674
// This is a macro because it can cause early return
// from the function where it is used.
//
// Suspends the state machine (returns ProcessResult::Suspend) when no
// character is available.
macro_rules! get_char ( ($me:expr, $input:expr) => (
    unwrap_or_return!($me.get_char($input), ProcessResult::Suspend)
));
680
// Like `get_char!`, but peeks without consuming; suspends when no input.
macro_rules! peek ( ($me:expr, $input:expr) => (
    unwrap_or_return!($me.peek($input), ProcessResult::Suspend)
));
684
// Fast-path character-run pop; suspends when no input is available.
macro_rules! pop_except_from ( ($me:expr, $input:expr, $set:expr) => (
    unwrap_or_return!($me.pop_except_from($input, $set), ProcessResult::Suspend)
));
688
// ASCII case-insensitive lookahead match; suspends when undecidable.
macro_rules! eat ( ($me:expr, $input:expr, $pat:expr) => (
    unwrap_or_return!($me.eat($input, $pat, u8::eq_ignore_ascii_case), ProcessResult::Suspend)
));
692
// Case-sensitive lookahead match; suspends when undecidable.
macro_rules! eat_exact ( ($me:expr, $input:expr, $pat:expr) => (
    unwrap_or_return!($me.eat($input, $pat, u8::eq), ProcessResult::Suspend)
));
696
697impl<Sink: TokenSink> Tokenizer<Sink> {
698 // Run the state machine for a while.
699 // Return true if we should be immediately re-invoked
700 // (this just simplifies control flow vs. break / continue).
701 #[allow(clippy::never_loop)]
702 fn step(&self, input: &BufferQueue) -> ProcessResult<Sink::Handle> {
703 if self.char_ref_tokenizer.borrow().is_some() {
704 return self.step_char_ref_tokenizer(input);
705 }
706
707 trace!("processing in state {:?}", self.state);
708 match self.state.get() {
709 //ยง data-state
710 states::Data => loop {
711 match pop_except_from!(self, input, small_char_set!('\r' '\0' '&' '<' '\n')) {
712 FromSet('\0') => go!(self: error; emit '\0'),
713 FromSet('&') => go!(self: consume_char_ref),
714 FromSet('<') => go!(self: to TagOpen),
715 FromSet(c) => go!(self: emit c),
716 NotFromSet(b) => self.emit_chars(b),
717 }
718 },
719
720 //ยง rcdata-state
721 states::RawData(Rcdata) => loop {
722 match pop_except_from!(self, input, small_char_set!('\r' '\0' '&' '<' '\n')) {
723 FromSet('\0') => go!(self: error; emit '\u{fffd}'),
724 FromSet('&') => go!(self: consume_char_ref),
725 FromSet('<') => go!(self: to RawLessThanSign Rcdata),
726 FromSet(c) => go!(self: emit c),
727 NotFromSet(b) => self.emit_chars(b),
728 }
729 },
730
731 //ยง rawtext-state
732 states::RawData(Rawtext) => loop {
733 match pop_except_from!(self, input, small_char_set!('\r' '\0' '<' '\n')) {
734 FromSet('\0') => go!(self: error; emit '\u{fffd}'),
735 FromSet('<') => go!(self: to RawLessThanSign Rawtext),
736 FromSet(c) => go!(self: emit c),
737 NotFromSet(b) => self.emit_chars(b),
738 }
739 },
740
741 //ยง script-data-state
742 states::RawData(ScriptData) => loop {
743 match pop_except_from!(self, input, small_char_set!('\r' '\0' '<' '\n')) {
744 FromSet('\0') => go!(self: error; emit '\u{fffd}'),
745 FromSet('<') => go!(self: to RawLessThanSign ScriptData),
746 FromSet(c) => go!(self: emit c),
747 NotFromSet(b) => self.emit_chars(b),
748 }
749 },
750
751 //ยง script-data-escaped-state
752 states::RawData(ScriptDataEscaped(Escaped)) => loop {
753 match pop_except_from!(self, input, small_char_set!('\r' '\0' '-' '<' '\n')) {
754 FromSet('\0') => go!(self: error; emit '\u{fffd}'),
755 FromSet('-') => go!(self: emit '-'; to ScriptDataEscapedDash Escaped),
756 FromSet('<') => go!(self: to RawLessThanSign ScriptDataEscaped Escaped),
757 FromSet(c) => go!(self: emit c),
758 NotFromSet(b) => self.emit_chars(b),
759 }
760 },
761
762 //ยง script-data-double-escaped-state
763 states::RawData(ScriptDataEscaped(DoubleEscaped)) => loop {
764 match pop_except_from!(self, input, small_char_set!('\r' '\0' '-' '<' '\n')) {
765 FromSet('\0') => go!(self: error; emit '\u{fffd}'),
766 FromSet('-') => go!(self: emit '-'; to ScriptDataEscapedDash DoubleEscaped),
767 FromSet('<') => {
768 go!(self: emit '<'; to RawLessThanSign ScriptDataEscaped DoubleEscaped)
769 },
770 FromSet(c) => go!(self: emit c),
771 NotFromSet(b) => self.emit_chars(b),
772 }
773 },
774
775 //ยง plaintext-state
776 states::Plaintext => loop {
777 match pop_except_from!(self, input, small_char_set!('\r' '\0' '\n')) {
778 FromSet('\0') => go!(self: error; emit '\u{fffd}'),
779 FromSet(c) => go!(self: emit c),
780 NotFromSet(b) => self.emit_chars(b),
781 }
782 },
783
784 //ยง tag-open-state
785 states::TagOpen => loop {
786 match get_char!(self, input) {
787 '!' => go!(self: to MarkupDeclarationOpen),
788 '/' => go!(self: to EndTagOpen),
789 '?' => go!(self: error; clear_comment; reconsume BogusComment),
790 c => match lower_ascii_letter(c) {
791 Some(cl) => go!(self: create_tag StartTag cl; to TagName),
792 None => go!(self: error; emit '<'; reconsume Data),
793 },
794 }
795 },
796
797 //ยง end-tag-open-state
798 states::EndTagOpen => loop {
799 match get_char!(self, input) {
800 '>' => go!(self: error; to Data),
801 c => match lower_ascii_letter(c) {
802 Some(cl) => go!(self: create_tag EndTag cl; to TagName),
803 None => go!(self: error; clear_comment; reconsume BogusComment),
804 },
805 }
806 },
807
808 //ยง tag-name-state
809 states::TagName => loop {
810 match get_char!(self, input) {
811 '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeAttributeName),
812 '/' => go!(self: to SelfClosingStartTag),
813 '>' => go!(self: emit_tag Data),
814 '\0' => go!(self: error; push_tag '\u{fffd}'),
815 c => go!(self: push_tag (c.to_ascii_lowercase())),
816 }
817 },
818
819 //ยง script-data-escaped-less-than-sign-state
820 states::RawLessThanSign(ScriptDataEscaped(Escaped)) => loop {
821 match get_char!(self, input) {
822 '/' => go!(self: clear_temp; to RawEndTagOpen ScriptDataEscaped Escaped),
823 c => match lower_ascii_letter(c) {
824 Some(cl) => go!(self: clear_temp; push_temp cl; emit '<'; emit c;
825 to ScriptDataEscapeStart DoubleEscaped),
826 None => go!(self: emit '<'; reconsume RawData ScriptDataEscaped Escaped),
827 },
828 }
829 },
830
831 //ยง script-data-double-escaped-less-than-sign-state
832 states::RawLessThanSign(ScriptDataEscaped(DoubleEscaped)) => loop {
833 match get_char!(self, input) {
834 '/' => go!(self: clear_temp; emit '/'; to ScriptDataDoubleEscapeEnd),
835 _ => go!(self: reconsume RawData ScriptDataEscaped DoubleEscaped),
836 }
837 },
838
839 //ยง rcdata-less-than-sign-state rawtext-less-than-sign-state script-data-less-than-sign-state
840 // otherwise
841 states::RawLessThanSign(kind) => loop {
842 match get_char!(self, input) {
843 '/' => go!(self: clear_temp; to RawEndTagOpen kind),
844 '!' if kind == ScriptData => {
845 go!(self: emit '<'; emit '!'; to ScriptDataEscapeStart Escaped)
846 },
847 _ => go!(self: emit '<'; reconsume RawData kind),
848 }
849 },
850
851 //ยง rcdata-end-tag-open-state rawtext-end-tag-open-state script-data-end-tag-open-state script-data-escaped-end-tag-open-state
852 states::RawEndTagOpen(kind) => loop {
853 let c = get_char!(self, input);
854 match lower_ascii_letter(c) {
855 Some(cl) => go!(self: create_tag EndTag cl; push_temp c; to RawEndTagName kind),
856 None => go!(self: emit '<'; emit '/'; reconsume RawData kind),
857 }
858 },
859
860 //ยง rcdata-end-tag-name-state rawtext-end-tag-name-state script-data-end-tag-name-state script-data-escaped-end-tag-name-state
861 states::RawEndTagName(kind) => loop {
862 let c = get_char!(self, input);
863 if self.have_appropriate_end_tag() {
864 match c {
865 '\t' | '\n' | '\x0C' | ' ' => go!(self: clear_temp; to BeforeAttributeName),
866 '/' => go!(self: clear_temp; to SelfClosingStartTag),
867 '>' => go!(self: clear_temp; emit_tag Data),
868 _ => (),
869 }
870 }
871
872 match lower_ascii_letter(c) {
873 Some(cl) => go!(self: push_tag cl; push_temp c),
874 None => {
875 go!(self: discard_tag; emit '<'; emit '/'; emit_temp; reconsume RawData kind)
876 },
877 }
878 },
879
880 //ยง script-data-double-escape-start-state
881 states::ScriptDataEscapeStart(DoubleEscaped) => loop {
882 let c = get_char!(self, input);
883 match c {
884 '\t' | '\n' | '\x0C' | ' ' | '/' | '>' => {
885 let esc = if &**self.temp_buf.borrow() == "script" {
886 DoubleEscaped
887 } else {
888 Escaped
889 };
890 go!(self: emit c; to RawData ScriptDataEscaped esc);
891 },
892 _ => match lower_ascii_letter(c) {
893 Some(cl) => go!(self: push_temp cl; emit c),
894 None => go!(self: reconsume RawData ScriptDataEscaped Escaped),
895 },
896 }
897 },
898
899 //ยง script-data-escape-start-state
900 states::ScriptDataEscapeStart(Escaped) => loop {
901 match get_char!(self, input) {
902 '-' => go!(self: emit '-'; to ScriptDataEscapeStartDash),
903 _ => go!(self: reconsume RawData ScriptData),
904 }
905 },
906
907 //ยง script-data-escape-start-dash-state
908 states::ScriptDataEscapeStartDash => loop {
909 match get_char!(self, input) {
910 '-' => go!(self: emit '-'; to ScriptDataEscapedDashDash Escaped),
911 _ => go!(self: reconsume RawData ScriptData),
912 }
913 },
914
915 //ยง script-data-escaped-dash-state script-data-double-escaped-dash-state
916 states::ScriptDataEscapedDash(kind) => loop {
917 match get_char!(self, input) {
918 '-' => go!(self: emit '-'; to ScriptDataEscapedDashDash kind),
919 '<' => {
920 if kind == DoubleEscaped {
921 go!(self: emit '<');
922 }
923 go!(self: to RawLessThanSign ScriptDataEscaped kind);
924 },
925 '\0' => go!(self: error; emit '\u{fffd}'; to RawData ScriptDataEscaped kind),
926 c => go!(self: emit c; to RawData ScriptDataEscaped kind),
927 }
928 },
929
930 //ยง script-data-escaped-dash-dash-state script-data-double-escaped-dash-dash-state
931 states::ScriptDataEscapedDashDash(kind) => loop {
932 match get_char!(self, input) {
933 '-' => go!(self: emit '-'),
934 '<' => {
935 if kind == DoubleEscaped {
936 go!(self: emit '<');
937 }
938 go!(self: to RawLessThanSign ScriptDataEscaped kind);
939 },
940 '>' => go!(self: emit '>'; to RawData ScriptData),
941 '\0' => go!(self: error; emit '\u{fffd}'; to RawData ScriptDataEscaped kind),
942 c => go!(self: emit c; to RawData ScriptDataEscaped kind),
943 }
944 },
945
946 //ยง script-data-double-escape-end-state
947 states::ScriptDataDoubleEscapeEnd => loop {
948 let c = get_char!(self, input);
949 match c {
950 '\t' | '\n' | '\x0C' | ' ' | '/' | '>' => {
951 let esc = if &**self.temp_buf.borrow() == "script" {
952 Escaped
953 } else {
954 DoubleEscaped
955 };
956 go!(self: emit c; to RawData ScriptDataEscaped esc);
957 },
958 _ => match lower_ascii_letter(c) {
959 Some(cl) => go!(self: push_temp cl; emit c),
960 None => go!(self: reconsume RawData ScriptDataEscaped DoubleEscaped),
961 },
962 }
963 },
964
965 //ยง before-attribute-name-state
966 states::BeforeAttributeName => loop {
967 match get_char!(self, input) {
968 '\t' | '\n' | '\x0C' | ' ' => (),
969 '/' => go!(self: to SelfClosingStartTag),
970 '>' => go!(self: emit_tag Data),
971 '\0' => go!(self: error; create_attr '\u{fffd}'; to AttributeName),
972 c => match lower_ascii_letter(c) {
973 Some(cl) => go!(self: create_attr cl; to AttributeName),
974 None => {
975 go_match!(self: c,
976 '"' , '\'' , '<' , '=' => error);
977 go!(self: create_attr c; to AttributeName);
978 },
979 },
980 }
981 },
982
983 //ยง attribute-name-state
984 states::AttributeName => loop {
985 match get_char!(self, input) {
986 '\t' | '\n' | '\x0C' | ' ' => go!(self: to AfterAttributeName),
987 '/' => go!(self: to SelfClosingStartTag),
988 '=' => go!(self: to BeforeAttributeValue),
989 '>' => go!(self: emit_tag Data),
990 '\0' => go!(self: error; push_name '\u{fffd}'),
991 c => match lower_ascii_letter(c) {
992 Some(cl) => go!(self: push_name cl),
993 None => {
994 go_match!(self: c,
995 '"' , '\'' , '<' => error);
996 go!(self: push_name c);
997 },
998 },
999 }
1000 },
1001
1002 //ยง after-attribute-name-state
1003 states::AfterAttributeName => loop {
1004 match get_char!(self, input) {
1005 '\t' | '\n' | '\x0C' | ' ' => (),
1006 '/' => go!(self: to SelfClosingStartTag),
1007 '=' => go!(self: to BeforeAttributeValue),
1008 '>' => go!(self: emit_tag Data),
1009 '\0' => go!(self: error; create_attr '\u{fffd}'; to AttributeName),
1010 c => match lower_ascii_letter(c) {
1011 Some(cl) => go!(self: create_attr cl; to AttributeName),
1012 None => {
1013 go_match!(self: c,
1014 '"' , '\'' , '<' => error);
1015 go!(self: create_attr c; to AttributeName);
1016 },
1017 },
1018 }
1019 },
1020
1021 //ยง before-attribute-value-state
1022 // Use peek so we can handle the first attr character along with the rest,
1023 // hopefully in the same zero-copy buffer.
1024 states::BeforeAttributeValue => loop {
1025 match peek!(self, input) {
1026 '\t' | '\n' | '\r' | '\x0C' | ' ' => go!(self: discard_char input),
1027 '"' => go!(self: discard_char input; to AttributeValue DoubleQuoted),
1028 '\'' => go!(self: discard_char input; to AttributeValue SingleQuoted),
1029 '>' => go!(self: discard_char input; error; emit_tag Data),
1030 _ => go!(self: to AttributeValue Unquoted),
1031 }
1032 },
1033
1034 //ยง attribute-value-(double-quoted)-state
1035 states::AttributeValue(DoubleQuoted) => loop {
1036 match pop_except_from!(self, input, small_char_set!('\r' '"' '&' '\0' '\n')) {
1037 FromSet('"') => go!(self: to AfterAttributeValueQuoted),
1038 FromSet('&') => go!(self: consume_char_ref),
1039 FromSet('\0') => go!(self: error; push_value '\u{fffd}'),
1040 FromSet(c) => go!(self: push_value c),
1041 NotFromSet(ref b) => go!(self: append_value b),
1042 }
1043 },
1044
1045 //ยง attribute-value-(single-quoted)-state
1046 states::AttributeValue(SingleQuoted) => loop {
1047 match pop_except_from!(self, input, small_char_set!('\r' '\'' '&' '\0' '\n')) {
1048 FromSet('\'') => go!(self: to AfterAttributeValueQuoted),
1049 FromSet('&') => go!(self: consume_char_ref),
1050 FromSet('\0') => go!(self: error; push_value '\u{fffd}'),
1051 FromSet(c) => go!(self: push_value c),
1052 NotFromSet(ref b) => go!(self: append_value b),
1053 }
1054 },
1055
1056 //ยง attribute-value-(unquoted)-state
1057 states::AttributeValue(Unquoted) => loop {
1058 match pop_except_from!(
1059 self,
1060 input,
1061 small_char_set!('\r' '\t' '\n' '\x0C' ' ' '&' '>' '\0')
1062 ) {
1063 FromSet('\t') | FromSet('\n') | FromSet('\x0C') | FromSet(' ') => {
1064 go!(self: to BeforeAttributeName)
1065 },
1066 FromSet('&') => go!(self: consume_char_ref),
1067 FromSet('>') => go!(self: emit_tag Data),
1068 FromSet('\0') => go!(self: error; push_value '\u{fffd}'),
1069 FromSet(c) => {
1070 go_match!(self: c,
1071 '"' , '\'' , '<' , '=' , '`' => error);
1072 go!(self: push_value c);
1073 },
1074 NotFromSet(ref b) => go!(self: append_value b),
1075 }
1076 },
1077
1078 //ยง after-attribute-value-(quoted)-state
1079 states::AfterAttributeValueQuoted => loop {
1080 match get_char!(self, input) {
1081 '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeAttributeName),
1082 '/' => go!(self: to SelfClosingStartTag),
1083 '>' => go!(self: emit_tag Data),
1084 _ => go!(self: error; reconsume BeforeAttributeName),
1085 }
1086 },
1087
1088 //ยง self-closing-start-tag-state
1089 states::SelfClosingStartTag => loop {
1090 match get_char!(self, input) {
1091 '>' => {
1092 self.current_tag_self_closing.set(true);
1093 go!(self: emit_tag Data);
1094 },
1095 _ => go!(self: error; reconsume BeforeAttributeName),
1096 }
1097 },
1098
1099 //ยง comment-start-state
1100 states::CommentStart => loop {
1101 match get_char!(self, input) {
1102 '-' => go!(self: to CommentStartDash),
1103 '\0' => go!(self: error; push_comment '\u{fffd}'; to Comment),
1104 '>' => go!(self: error; emit_comment; to Data),
1105 c => go!(self: push_comment c; to Comment),
1106 }
1107 },
1108
1109 //ยง comment-start-dash-state
1110 states::CommentStartDash => loop {
1111 match get_char!(self, input) {
1112 '-' => go!(self: to CommentEnd),
1113 '\0' => go!(self: error; append_comment "-\u{fffd}"; to Comment),
1114 '>' => go!(self: error; emit_comment; to Data),
1115 c => go!(self: push_comment '-'; push_comment c; to Comment),
1116 }
1117 },
1118
1119 //ยง comment-state
1120 states::Comment => loop {
1121 match get_char!(self, input) {
1122 c @ '<' => go!(self: push_comment c; to CommentLessThanSign),
1123 '-' => go!(self: to CommentEndDash),
1124 '\0' => go!(self: error; push_comment '\u{fffd}'),
1125 c => go!(self: push_comment c),
1126 }
1127 },
1128
1129 //ยง comment-less-than-sign-state
1130 states::CommentLessThanSign => loop {
1131 match get_char!(self, input) {
1132 c @ '!' => go!(self: push_comment c; to CommentLessThanSignBang),
1133 c @ '<' => go!(self: push_comment c),
1134 _ => go!(self: reconsume Comment),
1135 }
1136 },
1137
1138 //ยง comment-less-than-sign-bang
1139 states::CommentLessThanSignBang => loop {
1140 match get_char!(self, input) {
1141 '-' => go!(self: to CommentLessThanSignBangDash),
1142 _ => go!(self: reconsume Comment),
1143 }
1144 },
1145
1146 //ยง comment-less-than-sign-bang-dash
1147 states::CommentLessThanSignBangDash => loop {
1148 match get_char!(self, input) {
1149 '-' => go!(self: to CommentLessThanSignBangDashDash),
1150 _ => go!(self: reconsume CommentEndDash),
1151 }
1152 },
1153
1154 //ยง comment-less-than-sign-bang-dash-dash
1155 states::CommentLessThanSignBangDashDash => loop {
1156 match get_char!(self, input) {
1157 '>' => go!(self: reconsume CommentEnd),
1158 _ => go!(self: error; reconsume CommentEnd),
1159 }
1160 },
1161
1162 //ยง comment-end-dash-state
1163 states::CommentEndDash => loop {
1164 match get_char!(self, input) {
1165 '-' => go!(self: to CommentEnd),
1166 '\0' => go!(self: error; append_comment "-\u{fffd}"; to Comment),
1167 c => go!(self: push_comment '-'; push_comment c; to Comment),
1168 }
1169 },
1170
1171 //ยง comment-end-state
1172 states::CommentEnd => loop {
1173 match get_char!(self, input) {
1174 '>' => go!(self: emit_comment; to Data),
1175 '!' => go!(self: to CommentEndBang),
1176 '-' => go!(self: push_comment '-'),
1177 _ => go!(self: append_comment "--"; reconsume Comment),
1178 }
1179 },
1180
1181 //ยง comment-end-bang-state
1182 states::CommentEndBang => loop {
1183 match get_char!(self, input) {
1184 '-' => go!(self: append_comment "--!"; to CommentEndDash),
1185 '>' => go!(self: error; emit_comment; to Data),
1186 '\0' => go!(self: error; append_comment "--!\u{fffd}"; to Comment),
1187 c => go!(self: append_comment "--!"; push_comment c; to Comment),
1188 }
1189 },
1190
1191 //ยง doctype-state
1192 states::Doctype => loop {
1193 match get_char!(self, input) {
1194 '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeDoctypeName),
1195 '>' => go!(self: reconsume BeforeDoctypeName),
1196 _ => go!(self: error; reconsume BeforeDoctypeName),
1197 }
1198 },
1199
1200 //ยง before-doctype-name-state
1201 states::BeforeDoctypeName => loop {
1202 match get_char!(self, input) {
1203 '\t' | '\n' | '\x0C' | ' ' => (),
1204 '\0' => {
1205 go!(self: error; create_doctype; push_doctype_name '\u{fffd}'; to DoctypeName)
1206 },
1207 '>' => go!(self: error; create_doctype; force_quirks; emit_doctype; to Data),
1208 c => go!(self: create_doctype; push_doctype_name (c.to_ascii_lowercase());
1209 to DoctypeName),
1210 }
1211 },
1212
1213 //ยง doctype-name-state
1214 states::DoctypeName => loop {
1215 match get_char!(self, input) {
1216 '\t' | '\n' | '\x0C' | ' ' => go!(self: clear_temp; to AfterDoctypeName),
1217 '>' => go!(self: emit_doctype; to Data),
1218 '\0' => go!(self: error; push_doctype_name '\u{fffd}'),
1219 c => go!(self: push_doctype_name (c.to_ascii_lowercase())),
1220 }
1221 },
1222
1223 //ยง after-doctype-name-state
1224 states::AfterDoctypeName => loop {
1225 if eat!(self, input, "public") {
1226 go!(self: to AfterDoctypeKeyword Public);
1227 } else if eat!(self, input, "system") {
1228 go!(self: to AfterDoctypeKeyword System);
1229 } else {
1230 match get_char!(self, input) {
1231 '\t' | '\n' | '\x0C' | ' ' => (),
1232 '>' => go!(self: emit_doctype; to Data),
1233 _ => go!(self: error; force_quirks; reconsume BogusDoctype),
1234 }
1235 }
1236 },
1237
1238 //ยง after-doctype-public-keyword-state after-doctype-system-keyword-state
1239 states::AfterDoctypeKeyword(kind) => loop {
1240 match get_char!(self, input) {
1241 '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeDoctypeIdentifier kind),
1242 '"' => {
1243 go!(self: error; clear_doctype_id kind; to DoctypeIdentifierDoubleQuoted kind)
1244 },
1245 '\'' => {
1246 go!(self: error; clear_doctype_id kind; to DoctypeIdentifierSingleQuoted kind)
1247 },
1248 '>' => go!(self: error; force_quirks; emit_doctype; to Data),
1249 _ => go!(self: error; force_quirks; reconsume BogusDoctype),
1250 }
1251 },
1252
1253 //ยง before-doctype-public-identifier-state before-doctype-system-identifier-state
1254 states::BeforeDoctypeIdentifier(kind) => loop {
1255 match get_char!(self, input) {
1256 '\t' | '\n' | '\x0C' | ' ' => (),
1257 '"' => go!(self: clear_doctype_id kind; to DoctypeIdentifierDoubleQuoted kind),
1258 '\'' => go!(self: clear_doctype_id kind; to DoctypeIdentifierSingleQuoted kind),
1259 '>' => go!(self: error; force_quirks; emit_doctype; to Data),
1260 _ => go!(self: error; force_quirks; reconsume BogusDoctype),
1261 }
1262 },
1263
1264 //ยง doctype-public-identifier-(double-quoted)-state doctype-system-identifier-(double-quoted)-state
1265 states::DoctypeIdentifierDoubleQuoted(kind) => loop {
1266 match get_char!(self, input) {
1267 '"' => go!(self: to AfterDoctypeIdentifier kind),
1268 '\0' => go!(self: error; push_doctype_id kind '\u{fffd}'),
1269 '>' => go!(self: error; force_quirks; emit_doctype; to Data),
1270 c => go!(self: push_doctype_id kind c),
1271 }
1272 },
1273
1274 //ยง doctype-public-identifier-(single-quoted)-state doctype-system-identifier-(single-quoted)-state
1275 states::DoctypeIdentifierSingleQuoted(kind) => loop {
1276 match get_char!(self, input) {
1277 '\'' => go!(self: to AfterDoctypeIdentifier kind),
1278 '\0' => go!(self: error; push_doctype_id kind '\u{fffd}'),
1279 '>' => go!(self: error; force_quirks; emit_doctype; to Data),
1280 c => go!(self: push_doctype_id kind c),
1281 }
1282 },
1283
1284 //ยง after-doctype-public-identifier-state
1285 states::AfterDoctypeIdentifier(Public) => loop {
1286 match get_char!(self, input) {
1287 '\t' | '\n' | '\x0C' | ' ' => {
1288 go!(self: to BetweenDoctypePublicAndSystemIdentifiers)
1289 },
1290 '>' => go!(self: emit_doctype; to Data),
1291 '"' => {
1292 go!(self: error; clear_doctype_id System; to DoctypeIdentifierDoubleQuoted System)
1293 },
1294 '\'' => {
1295 go!(self: error; clear_doctype_id System; to DoctypeIdentifierSingleQuoted System)
1296 },
1297 _ => go!(self: error; force_quirks; reconsume BogusDoctype),
1298 }
1299 },
1300
1301 //ยง after-doctype-system-identifier-state
1302 states::AfterDoctypeIdentifier(System) => loop {
1303 match get_char!(self, input) {
1304 '\t' | '\n' | '\x0C' | ' ' => (),
1305 '>' => go!(self: emit_doctype; to Data),
1306 _ => go!(self: error; reconsume BogusDoctype),
1307 }
1308 },
1309
1310 //ยง between-doctype-public-and-system-identifiers-state
1311 states::BetweenDoctypePublicAndSystemIdentifiers => loop {
1312 match get_char!(self, input) {
1313 '\t' | '\n' | '\x0C' | ' ' => (),
1314 '>' => go!(self: emit_doctype; to Data),
1315 '"' => {
1316 go!(self: clear_doctype_id System; to DoctypeIdentifierDoubleQuoted System)
1317 },
1318 '\'' => {
1319 go!(self: clear_doctype_id System; to DoctypeIdentifierSingleQuoted System)
1320 },
1321 _ => go!(self: error; force_quirks; reconsume BogusDoctype),
1322 }
1323 },
1324
1325 //ยง bogus-doctype-state
1326 states::BogusDoctype => loop {
1327 match get_char!(self, input) {
1328 '>' => go!(self: emit_doctype; to Data),
1329 '\0' => go!(self: error),
1330 _ => (),
1331 }
1332 },
1333
1334 //ยง bogus-comment-state
1335 states::BogusComment => loop {
1336 match get_char!(self, input) {
1337 '>' => go!(self: emit_comment; to Data),
1338 '\0' => go!(self: error; push_comment '\u{fffd}'),
1339 c => go!(self: push_comment c),
1340 }
1341 },
1342
1343 //ยง markup-declaration-open-state
1344 states::MarkupDeclarationOpen => loop {
1345 if eat_exact!(self, input, "--") {
1346 go!(self: clear_comment; to CommentStart);
1347 } else if eat!(self, input, "doctype") {
1348 go!(self: to Doctype);
1349 } else {
1350 if self
1351 .sink
1352 .adjusted_current_node_present_but_not_in_html_namespace()
1353 && eat_exact!(self, input, "[CDATA[")
1354 {
1355 go!(self: clear_temp; to CdataSection);
1356 }
1357 go!(self: error; clear_comment; to BogusComment);
1358 }
1359 },
1360
1361 //ยง cdata-section-state
1362 states::CdataSection => loop {
1363 match get_char!(self, input) {
1364 ']' => go!(self: to CdataSectionBracket),
1365 '\0' => go!(self: emit_temp; emit '\0'),
1366 c => go!(self: push_temp c),
1367 }
1368 },
1369
1370 //ยง cdata-section-bracket
1371 states::CdataSectionBracket => match get_char!(self, input) {
1372 ']' => go!(self: to CdataSectionEnd),
1373 _ => go!(self: push_temp ']'; reconsume CdataSection),
1374 },
1375
1376 //ยง cdata-section-end
1377 states::CdataSectionEnd => loop {
1378 match get_char!(self, input) {
1379 ']' => go!(self: push_temp ']'),
1380 '>' => go!(self: emit_temp; to Data),
1381 _ => go!(self: push_temp ']'; push_temp ']'; reconsume CdataSection),
1382 }
1383 },
1384 //ยง END
1385 }
1386 }
1387
1388 fn step_char_ref_tokenizer(&self, input: &BufferQueue) -> ProcessResult<Sink::Handle> {
1389 // FIXME HACK: Take and replace the tokenizer so we don't
1390 // double-mut-borrow self. This is why it's boxed.
1391 let mut tok = self.char_ref_tokenizer.take().unwrap();
1392 let outcome = tok.step(self, input);
1393
1394 let progress = match outcome {
1395 char_ref::Done => {
1396 self.process_char_ref(tok.get_result());
1397 return ProcessResult::Continue;
1398 },
1399
1400 char_ref::Stuck => ProcessResult::Suspend,
1401 char_ref::Progress => ProcessResult::Continue,
1402 };
1403
1404 *self.char_ref_tokenizer.borrow_mut() = Some(tok);
1405 progress
1406 }
1407
1408 fn process_char_ref(&self, char_ref: CharRef) {
1409 let CharRef {
1410 mut chars,
1411 mut num_chars,
1412 } = char_ref;
1413
1414 if num_chars == 0 {
1415 chars[0] = '&';
1416 num_chars = 1;
1417 }
1418
1419 for i in 0..num_chars {
1420 let c = chars[i as usize];
1421 match self.state.get() {
1422 states::Data | states::RawData(states::Rcdata) => go!(self: emit c),
1423
1424 states::AttributeValue(_) => go!(self: push_value c),
1425
1426 _ => panic!(
1427 "state {:?} should not be reachable in process_char_ref",
1428 self.state.get()
1429 ),
1430 }
1431 }
1432 }
1433
1434 /// Indicate that we have reached the end of the input.
1435 pub fn end(&self) {
1436 // Handle EOF in the char ref sub-tokenizer, if there is one.
1437 // Do this first because it might un-consume stuff.
1438 let input = BufferQueue::default();
1439 match self.char_ref_tokenizer.take() {
1440 None => (),
1441 Some(mut tok) => {
1442 tok.end_of_file(self, &input);
1443 self.process_char_ref(tok.get_result());
1444 },
1445 }
1446
1447 // Process all remaining buffered input.
1448 // If we're waiting for lookahead, we're not gonna get it.
1449 self.at_eof.set(true);
1450 assert!(matches!(self.run(&input), TokenizerResult::Done));
1451 assert!(input.is_empty());
1452
1453 loop {
1454 match self.eof_step() {
1455 ProcessResult::Continue => (),
1456 ProcessResult::Suspend => break,
1457 ProcessResult::Script(_) => unreachable!(),
1458 }
1459 }
1460
1461 self.sink.end();
1462
1463 if self.opts.profile {
1464 self.dump_profile();
1465 }
1466 }
1467
1468 fn dump_profile(&self) {
1469 let mut results: Vec<(states::State, u64)> = self
1470 .state_profile
1471 .borrow()
1472 .iter()
1473 .map(|(s, t)| (*s, *t))
1474 .collect();
1475 results.sort_by(|&(_, x), &(_, y)| y.cmp(&x));
1476
1477 let total: u64 = results
1478 .iter()
1479 .map(|&(_, t)| t)
1480 .fold(0, ::std::ops::Add::add);
1481 println!("\nTokenizer profile, in nanoseconds");
1482 println!(
1483 "\n{:12} total in token sink",
1484 self.time_in_sink.get()
1485 );
1486 println!("\n{total:12} total in tokenizer");
1487
1488 for (k, v) in results.into_iter() {
1489 let pct = 100.0 * (v as f64) / (total as f64);
1490 println!("{v:12} {pct:4.1}% {k:?}");
1491 }
1492 }
1493
    /// Take a single step of the state machine once the input is exhausted.
    ///
    /// Each arm implements the end-of-file behaviour of the corresponding
    /// tokenizer state: emit whatever partially-built construct is pending
    /// (often alongside a parse error) and head toward the Data state, or
    /// reconsume in another state so that state's EOF rule applies.
    fn eof_step(&self) -> ProcessResult<Sink::Handle> {
        debug!("processing EOF in state {:?}", self.state.get());
        match self.state.get() {
            // Plain text-emitting states: nothing is buffered.
            states::Data
            | states::RawData(Rcdata)
            | states::RawData(Rawtext)
            | states::RawData(ScriptData)
            | states::Plaintext => go!(self: eof),

            // States inside a tag: EOF is a parse error; drop back to Data.
            states::TagName
            | states::RawData(ScriptDataEscaped(_))
            | states::BeforeAttributeName
            | states::AttributeName
            | states::AfterAttributeName
            | states::AttributeValue(_)
            | states::AfterAttributeValueQuoted
            | states::SelfClosingStartTag
            | states::ScriptDataEscapedDash(_)
            | states::ScriptDataEscapedDashDash(_) => go!(self: error_eof; to Data),

            states::BeforeAttributeValue => go!(self: reconsume AttributeValue Unquoted),

            // A lone '<' (and '</') already consumed is emitted as character data.
            states::TagOpen => go!(self: error_eof; emit '<'; to Data),

            states::EndTagOpen => go!(self: error_eof; emit '<'; emit '/'; to Data),

            states::RawLessThanSign(ScriptDataEscaped(DoubleEscaped)) => {
                go!(self: to RawData ScriptDataEscaped DoubleEscaped)
            },

            states::RawLessThanSign(kind) => go!(self: emit '<'; to RawData kind),

            states::RawEndTagOpen(kind) => go!(self: emit '<'; emit '/'; to RawData kind),

            // Flush the partially-collected end-tag name back out as raw text.
            states::RawEndTagName(kind) => {
                go!(self: emit '<'; emit '/'; emit_temp; to RawData kind)
            },

            states::ScriptDataEscapeStart(kind) => go!(self: to RawData ScriptDataEscaped kind),

            states::ScriptDataEscapeStartDash => go!(self: to RawData ScriptData),

            states::ScriptDataDoubleEscapeEnd => {
                go!(self: to RawData ScriptDataEscaped DoubleEscaped)
            },

            // Unterminated comment: report the error and emit what was collected.
            states::CommentStart
            | states::CommentStartDash
            | states::Comment
            | states::CommentEndDash
            | states::CommentEnd
            | states::CommentEndBang => go!(self: error_eof; emit_comment; to Data),

            states::CommentLessThanSign | states::CommentLessThanSignBang => {
                go!(self: reconsume Comment)
            },

            states::CommentLessThanSignBangDash => go!(self: reconsume CommentEndDash),

            states::CommentLessThanSignBangDashDash => go!(self: reconsume CommentEnd),

            // Unterminated doctype: emit it with force-quirks set.
            states::Doctype | states::BeforeDoctypeName => {
                go!(self: error_eof; create_doctype; force_quirks; emit_doctype; to Data)
            },

            states::DoctypeName
            | states::AfterDoctypeName
            | states::AfterDoctypeKeyword(_)
            | states::BeforeDoctypeIdentifier(_)
            | states::DoctypeIdentifierDoubleQuoted(_)
            | states::DoctypeIdentifierSingleQuoted(_)
            | states::AfterDoctypeIdentifier(_)
            | states::BetweenDoctypePublicAndSystemIdentifiers => {
                go!(self: error_eof; force_quirks; emit_doctype; to Data)
            },

            states::BogusDoctype => go!(self: emit_doctype; to Data),

            states::BogusComment => go!(self: emit_comment; to Data),

            states::MarkupDeclarationOpen => go!(self: error; to BogusComment),

            // Unterminated CDATA section: flush buffered characters first.
            states::CdataSection => go!(self: emit_temp; error_eof; to Data),

            states::CdataSectionBracket => go!(self: push_temp ']'; to CdataSection),

            states::CdataSectionEnd => go!(self: push_temp ']'; push_temp ']'; to CdataSection),
        }
    }
1583}
1584
#[cfg(test)]
#[allow(non_snake_case)]
mod test {
    use super::option_push; // private items
    use crate::tendril::{SliceExt, StrTendril};

    use super::{TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts};

    use super::interface::{CharacterTokens, EOFToken, NullCharacterToken, ParseError};
    use super::interface::{EndTag, StartTag, Tag, TagKind};
    use super::interface::{TagToken, Token};

    use markup5ever::buffer_queue::BufferQueue;
    use std::cell::RefCell;

    use crate::LocalName;

    // LinesMatch implements the TokenSink trait. It is used for testing to see
    // if current_line is being updated when process_token is called. The lines
    // vector is a collection of the line numbers that each token is on.
    struct LinesMatch {
        tokens: RefCell<Vec<Token>>,
        current_str: RefCell<StrTendril>,
        lines: RefCell<Vec<(Token, u64)>>,
    }

    impl LinesMatch {
        fn new() -> LinesMatch {
            LinesMatch {
                tokens: RefCell::new(vec![]),
                current_str: RefCell::new(StrTendril::new()),
                lines: RefCell::new(vec![]),
            }
        }

        // Record a finished token together with the line it was seen on.
        fn push(&self, token: Token, line_number: u64) {
            self.finish_str();
            self.lines.borrow_mut().push((token, line_number));
        }

        // Flush accumulated character data as a single CharacterTokens token.
        fn finish_str(&self) {
            if self.current_str.borrow().len() > 0 {
                let s = self.current_str.take();
                self.tokens.borrow_mut().push(CharacterTokens(s));
            }
        }
    }

    impl TokenSink for LinesMatch {
        type Handle = ();

        fn process_token(&self, token: Token, line_number: u64) -> TokenSinkResult<Self::Handle> {
            match token {
                CharacterTokens(b) => {
                    self.current_str.borrow_mut().push_slice(&b);
                },

                NullCharacterToken => {
                    self.current_str.borrow_mut().push_char('\0');
                },

                ParseError(_) => {
                    panic!("unexpected parse error");
                },

                TagToken(mut t) => {
                    // The spec seems to indicate that one can emit
                    // erroneous end tags with attrs, but the test
                    // cases don't contain them.
                    match t.kind {
                        EndTag => {
                            t.self_closing = false;
                            t.attrs = vec![];
                        },
                        _ => t.attrs.sort_by(|a1, a2| a1.name.cmp(&a2.name)),
                    }
                    self.push(TagToken(t), line_number);
                },

                EOFToken => (),

                _ => self.push(token, line_number),
            }
            TokenSinkResult::Continue
        }
    }

    // Take in tokens, process them, and return vector with line
    // numbers that each token is on
    fn tokenize(input: Vec<StrTendril>, opts: TokenizerOpts) -> Vec<(Token, u64)> {
        let sink = LinesMatch::new();
        let tok = Tokenizer::new(sink, opts);
        let buffer = BufferQueue::default();
        for chunk in input.into_iter() {
            buffer.push_back(chunk);
            let _ = tok.feed(&buffer);
        }
        tok.end();
        tok.sink.lines.take()
    }

    // Create a tag token
    fn create_tag(token: StrTendril, tagkind: TagKind) -> Token {
        let name = LocalName::from(&*token);

        TagToken(Tag {
            kind: tagkind,
            name,
            self_closing: false,
            attrs: vec![],
        })
    }

    // Tokenizer options shared by the line-tracking tests below; previously
    // duplicated verbatim in each test.
    fn line_tracking_opts() -> TokenizerOpts {
        TokenizerOpts {
            exact_errors: false,
            discard_bom: true,
            profile: false,
            initial_state: None,
            last_start_tag_name: None,
        }
    }

    // Expected (token, line) pairs for the nested <a><b></b></a> input used
    // by the line-tracking tests; independent of the line ending used.
    fn expected_line_tags() -> Vec<(Token, u64)> {
        vec![
            (create_tag(StrTendril::from("a"), StartTag), 1),
            (create_tag(StrTendril::from("b"), StartTag), 2),
            (create_tag(StrTendril::from("b"), EndTag), 3),
            (create_tag(StrTendril::from("a"), EndTag), 4),
        ]
    }

    #[test]
    fn push_to_None_gives_singleton() {
        let mut s: Option<StrTendril> = None;
        option_push(&mut s, 'x');
        assert_eq!(s, Some("x".to_tendril()));
    }

    #[test]
    fn push_to_empty_appends() {
        let mut s: Option<StrTendril> = Some(StrTendril::new());
        option_push(&mut s, 'x');
        assert_eq!(s, Some("x".to_tendril()));
    }

    #[test]
    fn push_to_nonempty_appends() {
        let mut s: Option<StrTendril> = Some(StrTendril::from_slice("y"));
        option_push(&mut s, 'x');
        assert_eq!(s, Some("yx".to_tendril()));
    }

    // Tokens arriving on LF-terminated lines are tagged with the right line.
    #[test]
    fn check_lines() {
        let vector = vec![
            StrTendril::from("<a>\n"),
            StrTendril::from("<b>\n"),
            StrTendril::from("</b>\n"),
            StrTendril::from("</a>\n"),
        ];
        let results = tokenize(vector, line_tracking_opts());
        assert_eq!(results, expected_line_tags());
    }

    // CRLF line endings must count lines the same way as bare LF.
    #[test]
    fn check_lines_with_new_line() {
        let vector = vec![
            StrTendril::from("<a>\r\n"),
            StrTendril::from("<b>\r\n"),
            StrTendril::from("</b>\r\n"),
            StrTendril::from("</a>\r\n"),
        ];
        let results = tokenize(vector, line_tracking_opts());
        assert_eq!(results, expected_line_tags());
    }
}
1769