1 | // Copyright 2014-2017 The html5ever Project Developers. See the |
2 | // COPYRIGHT file at the top-level directory of this distribution. |
3 | // |
4 | // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or |
5 | // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license |
6 | // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your |
7 | // option. This file may not be copied, modified, or distributed |
8 | // except according to those terms. |
9 | |
10 | //! The HTML5 tokenizer. |
11 | |
12 | pub use self::interface::{CharacterTokens, EOFToken, NullCharacterToken, ParseError}; |
13 | pub use self::interface::{CommentToken, DoctypeToken, TagToken, Token}; |
14 | pub use self::interface::{Doctype, EndTag, StartTag, Tag, TagKind}; |
15 | pub use self::interface::{TokenSink, TokenSinkResult}; |
16 | |
17 | use self::states::{DoctypeIdKind, Public, System}; |
18 | use self::states::{DoubleEscaped, Escaped}; |
19 | use self::states::{DoubleQuoted, SingleQuoted, Unquoted}; |
20 | use self::states::{Rawtext, Rcdata, ScriptData, ScriptDataEscaped}; |
21 | |
22 | use self::char_ref::{CharRef, CharRefTokenizer}; |
23 | |
24 | use crate::util::str::lower_ascii_letter; |
25 | |
26 | use log::{debug, trace}; |
27 | use mac::format_if; |
28 | use markup5ever::{namespace_url, ns, small_char_set}; |
29 | use std::borrow::Cow::{self, Borrowed}; |
30 | use std::cell::{Cell, RefCell, RefMut}; |
31 | use std::collections::BTreeMap; |
32 | use std::mem; |
33 | |
34 | pub use crate::buffer_queue::{BufferQueue, FromSet, NotFromSet, SetResult}; |
35 | use crate::tendril::StrTendril; |
36 | use crate::{Attribute, LocalName, QualName, SmallCharSet}; |
37 | |
38 | mod char_ref; |
39 | mod interface; |
40 | pub mod states; |
41 | |
/// Outcome of one step of the tokenizer state machine.
pub enum ProcessResult<Handle> {
    /// Keep stepping the state machine.
    Continue,
    /// Stop stepping for now (e.g. waiting for more input).
    Suspend,
    /// A script handle was produced by the sink; the caller must deal with
    /// it before tokenization resumes.
    Script(Handle),
}
47 | |
/// Result of feeding input to the tokenizer.
#[must_use]
#[derive(Debug)]
pub enum TokenizerResult<Handle> {
    /// All currently available input has been processed.
    Done,
    /// Tokenization paused because the sink produced a script handle.
    Script(Handle),
}
54 | |
55 | fn option_push(opt_str: &mut Option<StrTendril>, c: char) { |
56 | match *opt_str { |
57 | Some(ref mut s: &mut Tendril) => s.push_char(c), |
58 | None => *opt_str = Some(StrTendril::from_char(c)), |
59 | } |
60 | } |
61 | |
/// Tokenizer options, with an impl for `Default`.
#[derive(Clone)]
pub struct TokenizerOpts {
    /// Report all parse errors described in the spec, at some
    /// performance penalty? Default: false
    pub exact_errors: bool,

    /// Discard a `U+FEFF BYTE ORDER MARK` if we see one at the beginning
    /// of the stream? Default: true
    pub discard_bom: bool,

    /// Keep a record of how long we spent in each state? Printed
    /// when `end()` is called. Default: false
    pub profile: bool,

    /// Initial state override. Only the test runner should use
    /// a non-`None` value!
    pub initial_state: Option<states::State>,

    /// Last start tag. Only the test runner should use a
    /// non-`None` value!
    ///
    /// FIXME: Can't use Tendril because we want TokenizerOpts
    /// to be Send.
    pub last_start_tag_name: Option<String>,
}
88 | |
89 | impl Default for TokenizerOpts { |
90 | fn default() -> TokenizerOpts { |
91 | TokenizerOpts { |
92 | exact_errors: false, |
93 | discard_bom: true, |
94 | profile: false, |
95 | initial_state: None, |
96 | last_start_tag_name: None, |
97 | } |
98 | } |
99 | } |
100 | |
/// The HTML tokenizer.
///
/// Interior mutability (`Cell`/`RefCell`) is used throughout so the state
/// machine can be driven through `&self`.
pub struct Tokenizer<Sink> {
    /// Options controlling the behavior of the tokenizer.
    opts: TokenizerOpts,

    /// Destination for tokens we emit.
    pub sink: Sink,

    /// The abstract machine state as described in the spec.
    state: Cell<states::State>,

    /// Are we at the end of the file, once buffers have been processed
    /// completely? This affects whether we will wait for lookahead or not.
    at_eof: Cell<bool>,

    /// Tokenizer for character references, if we're tokenizing
    /// one at the moment.
    char_ref_tokenizer: RefCell<Option<Box<CharRefTokenizer>>>,

    /// Current input character. Just consumed, may reconsume.
    current_char: Cell<char>,

    /// Should we reconsume the current input character?
    reconsume: Cell<bool>,

    /// Did we just consume \r, translating it to \n? In that case we need
    /// to ignore the next character if it's \n.
    ignore_lf: Cell<bool>,

    /// Discard a U+FEFF BYTE ORDER MARK if we see one? Only done at the
    /// beginning of the stream.
    discard_bom: Cell<bool>,

    /// Current tag kind.
    current_tag_kind: Cell<TagKind>,

    /// Current tag name.
    current_tag_name: RefCell<StrTendril>,

    /// Current tag is self-closing?
    current_tag_self_closing: Cell<bool>,

    /// Current tag attributes.
    current_tag_attrs: RefCell<Vec<Attribute>>,

    /// Current attribute name.
    current_attr_name: RefCell<StrTendril>,

    /// Current attribute value.
    current_attr_value: RefCell<StrTendril>,

    /// Current comment.
    current_comment: RefCell<StrTendril>,

    /// Current doctype token.
    current_doctype: RefCell<Doctype>,

    /// Last start tag name, for use in checking "appropriate end tag".
    last_start_tag_name: RefCell<Option<LocalName>>,

    /// The "temporary buffer" mentioned in the spec.
    temp_buf: RefCell<StrTendril>,

    /// Record of how many ns we spent in each state, if profiling is enabled.
    state_profile: RefCell<BTreeMap<states::State, u64>>,

    /// Record of how many ns we spent in the token sink.
    time_in_sink: Cell<u64>,

    /// Track current line
    current_line: Cell<u64>,
}
173 | |
174 | impl<Sink: TokenSink> Tokenizer<Sink> { |
175 | /// Create a new tokenizer which feeds tokens to a particular `TokenSink`. |
176 | pub fn new(sink: Sink, mut opts: TokenizerOpts) -> Tokenizer<Sink> { |
177 | let start_tag_name = opts |
178 | .last_start_tag_name |
179 | .take() |
180 | .map(|s| LocalName::from(&*s)); |
181 | let state = opts.initial_state.unwrap_or(states::Data); |
182 | let discard_bom = opts.discard_bom; |
183 | Tokenizer { |
184 | opts, |
185 | sink, |
186 | state: Cell::new(state), |
187 | char_ref_tokenizer: RefCell::new(None), |
188 | at_eof: Cell::new(false), |
189 | current_char: Cell::new(' \0' ), |
190 | reconsume: Cell::new(false), |
191 | ignore_lf: Cell::new(false), |
192 | discard_bom: Cell::new(discard_bom), |
193 | current_tag_kind: Cell::new(StartTag), |
194 | current_tag_name: RefCell::new(StrTendril::new()), |
195 | current_tag_self_closing: Cell::new(false), |
196 | current_tag_attrs: RefCell::new(vec![]), |
197 | current_attr_name: RefCell::new(StrTendril::new()), |
198 | current_attr_value: RefCell::new(StrTendril::new()), |
199 | current_comment: RefCell::new(StrTendril::new()), |
200 | current_doctype: RefCell::new(Doctype::default()), |
201 | last_start_tag_name: RefCell::new(start_tag_name), |
202 | temp_buf: RefCell::new(StrTendril::new()), |
203 | state_profile: RefCell::new(BTreeMap::new()), |
204 | time_in_sink: Cell::new(0), |
205 | current_line: Cell::new(1), |
206 | } |
207 | } |
208 | |
209 | /// Feed an input string into the tokenizer. |
210 | pub fn feed(&self, input: &BufferQueue) -> TokenizerResult<Sink::Handle> { |
211 | if input.is_empty() { |
212 | return TokenizerResult::Done; |
213 | } |
214 | |
215 | if self.discard_bom.get() { |
216 | if let Some(c) = input.peek() { |
217 | if c == ' \u{feff}' { |
218 | input.next(); |
219 | } |
220 | } else { |
221 | return TokenizerResult::Done; |
222 | } |
223 | }; |
224 | |
225 | self.run(input) |
226 | } |
227 | |
    /// Switch the tokenizer into the PLAINTEXT state (used by the tree
    /// builder for `<plaintext>` handling).
    pub fn set_plaintext_state(&self) {
        self.state.set(states::Plaintext);
    }
231 | |
    /// Hand a token to the sink, attributing the elapsed time to the sink
    /// when profiling is enabled.
    fn process_token(&self, token: Token) -> TokenSinkResult<Sink::Handle> {
        if self.opts.profile {
            // `time!` yields the sink's result together with the elapsed ns.
            let (ret, dt) = time!(self.sink.process_token(token, self.current_line.get()));
            self.time_in_sink.set(self.time_in_sink.get() + dt);
            ret
        } else {
            self.sink.process_token(token, self.current_line.get())
        }
    }
241 | |
    /// Hand a token to the sink, asserting that the sink asks for nothing
    /// but `Continue` (used for tokens that cannot redirect the tokenizer).
    fn process_token_and_continue(&self, token: Token) {
        assert!(matches!(
            self.process_token(token),
            TokenSinkResult::Continue
        ));
    }
248 | |
249 | //ยง preprocessing-the-input-stream |
250 | // Get the next input character, which might be the character |
251 | // 'c' that we already consumed from the buffers. |
252 | fn get_preprocessed_char(&self, mut c: char, input: &BufferQueue) -> Option<char> { |
253 | if self.ignore_lf.get() { |
254 | self.ignore_lf.set(false); |
255 | if c == ' \n' { |
256 | c = input.next()?; |
257 | } |
258 | } |
259 | |
260 | if c == ' \r' { |
261 | self.ignore_lf.set(true); |
262 | c = ' \n' ; |
263 | } |
264 | |
265 | if c == ' \n' { |
266 | self.current_line.set(self.current_line.get() + 1); |
267 | } |
268 | |
269 | if self.opts.exact_errors |
270 | && match c as u32 { |
271 | 0x01..=0x08 | 0x0B | 0x0E..=0x1F | 0x7F..=0x9F | 0xFDD0..=0xFDEF => true, |
272 | n if (n & 0xFFFE) == 0xFFFE => true, |
273 | _ => false, |
274 | } |
275 | { |
276 | let msg = format!("Bad character {c}" ); |
277 | self.emit_error(Cow::Owned(msg)); |
278 | } |
279 | |
280 | trace!("got character {}" , c); |
281 | self.current_char.set(c); |
282 | Some(c) |
283 | } |
284 | |
285 | //ยง tokenization |
286 | // Get the next input character, if one is available. |
287 | fn get_char(&self, input: &BufferQueue) -> Option<char> { |
288 | if self.reconsume.get() { |
289 | self.reconsume.set(false); |
290 | Some(self.current_char.get()) |
291 | } else { |
292 | input |
293 | .next() |
294 | .and_then(|c| self.get_preprocessed_char(c, input)) |
295 | } |
296 | } |
297 | |
    /// Pop either one character in `set` (`FromSet`) or a run of characters
    /// not in `set` (`NotFromSet`) — the fast path for text-heavy states.
    fn pop_except_from(&self, input: &BufferQueue, set: SmallCharSet) -> Option<SetResult> {
        // Bail to the slow path for various corner cases.
        // This means that `FromSet` can contain characters not in the set!
        // It shouldn't matter because the fallback `FromSet` case should
        // always do the same thing as the `NotFromSet` case.
        if self.opts.exact_errors || self.reconsume.get() || self.ignore_lf.get() {
            return self.get_char(input).map(FromSet);
        }

        let d = input.pop_except_from(set);
        trace!("got characters {:?}", d);
        match d {
            // Single characters still go through newline/error preprocessing.
            Some(FromSet(c)) => self.get_preprocessed_char(c, input).map(FromSet),

            // NB: We don't set self.current_char for a run of characters not
            // in the set. It shouldn't matter for the codepaths that use
            // this.
            _ => d,
        }
    }
318 | |
319 | // Check if the next characters are an ASCII case-insensitive match. See |
320 | // BufferQueue::eat. |
321 | // |
322 | // NB: this doesn't set the current input character. |
323 | fn eat(&self, input: &BufferQueue, pat: &str, eq: fn(&u8, &u8) -> bool) -> Option<bool> { |
324 | if self.ignore_lf.get() { |
325 | self.ignore_lf.set(false); |
326 | if self.peek(input) == Some(' \n' ) { |
327 | self.discard_char(input); |
328 | } |
329 | } |
330 | |
331 | input.push_front(mem::take(&mut self.temp_buf.borrow_mut())); |
332 | match input.eat(pat, eq) { |
333 | None if self.at_eof.get() => Some(false), |
334 | None => { |
335 | while let Some(data) = input.next() { |
336 | self.temp_buf.borrow_mut().push_char(data); |
337 | } |
338 | None |
339 | }, |
340 | Some(matched) => Some(matched), |
341 | } |
342 | } |
343 | |
344 | /// Run the state machine for as long as we can. |
345 | fn run(&self, input: &BufferQueue) -> TokenizerResult<Sink::Handle> { |
346 | if self.opts.profile { |
347 | loop { |
348 | let state = self.state.get(); |
349 | let old_sink = self.time_in_sink.get(); |
350 | let (run, mut dt) = time!(self.step(input)); |
351 | dt -= (self.time_in_sink.get() - old_sink); |
352 | let new = match self.state_profile.borrow_mut().get_mut(&state) { |
353 | Some(x) => { |
354 | *x += dt; |
355 | false |
356 | }, |
357 | None => true, |
358 | }; |
359 | if new { |
360 | // do this here because of borrow shenanigans |
361 | self.state_profile.borrow_mut().insert(state, dt); |
362 | } |
363 | match run { |
364 | ProcessResult::Continue => (), |
365 | ProcessResult::Suspend => break, |
366 | ProcessResult::Script(node) => return TokenizerResult::Script(node), |
367 | } |
368 | } |
369 | } else { |
370 | loop { |
371 | match self.step(input) { |
372 | ProcessResult::Continue => (), |
373 | ProcessResult::Suspend => break, |
374 | ProcessResult::Script(node) => return TokenizerResult::Script(node), |
375 | } |
376 | } |
377 | } |
378 | TokenizerResult::Done |
379 | } |
380 | |
    /// Report an unexpected-character parse error; the detailed message is
    /// only built when `exact_errors` is enabled.
    fn bad_char_error(&self) {
        let msg = format_if!(
            self.opts.exact_errors,
            "Bad character",
            "Saw {} in state {:?}",
            self.current_char.get(),
            self.state.get()
        );
        self.emit_error(msg);
    }
391 | |
    /// Report an unexpected-EOF parse error; the detailed message is only
    /// built when `exact_errors` is enabled.
    fn bad_eof_error(&self) {
        let msg = format_if!(
            self.opts.exact_errors,
            "Unexpected EOF",
            "Saw EOF in state {:?}",
            self.state.get()
        );
        self.emit_error(msg);
    }
401 | |
402 | fn emit_char(&self, c: char) { |
403 | self.process_token_and_continue(match c { |
404 | ' \0' => NullCharacterToken, |
405 | _ => CharacterTokens(StrTendril::from_char(c)), |
406 | }); |
407 | } |
408 | |
    // Emit a run of characters as one token.
    // The string must not contain '\0'!
    fn emit_chars(&self, b: StrTendril) {
        self.process_token_and_continue(CharacterTokens(b));
    }
413 | |
    /// Finish and emit the tag currently being built, then translate the
    /// sink's verdict into the tokenizer's next action (possibly switching
    /// state at the sink's request).
    fn emit_current_tag(&self) -> ProcessResult<Sink::Handle> {
        self.finish_attribute();

        let name = LocalName::from(&**self.current_tag_name.borrow());
        self.current_tag_name.borrow_mut().clear();

        match self.current_tag_kind.get() {
            StartTag => {
                // Remember the name for later "appropriate end tag" checks.
                *self.last_start_tag_name.borrow_mut() = Some(name.clone());
            },
            EndTag => {
                // End tags may not carry attributes or be self-closing.
                if !self.current_tag_attrs.borrow().is_empty() {
                    self.emit_error(Borrowed("Attributes on an end tag"));
                }
                if self.current_tag_self_closing.get() {
                    self.emit_error(Borrowed("Self-closing end tag"));
                }
            },
        }

        let token = TagToken(Tag {
            kind: self.current_tag_kind.get(),
            name,
            self_closing: self.current_tag_self_closing.get(),
            attrs: std::mem::take(&mut self.current_tag_attrs.borrow_mut()),
        });

        match self.process_token(token) {
            TokenSinkResult::Continue => ProcessResult::Continue,
            TokenSinkResult::Plaintext => {
                self.state.set(states::Plaintext);
                ProcessResult::Continue
            },
            TokenSinkResult::Script(node) => {
                self.state.set(states::Data);
                ProcessResult::Script(node)
            },
            TokenSinkResult::RawData(kind) => {
                self.state.set(states::RawData(kind));
                ProcessResult::Continue
            },
        }
    }
457 | |
458 | fn emit_temp_buf(&self) { |
459 | // FIXME: Make sure that clearing on emit is spec-compatible. |
460 | let buf = mem::take(&mut *self.temp_buf.borrow_mut()); |
461 | self.emit_chars(buf); |
462 | } |
463 | |
    /// Empty the temporary buffer in place.
    fn clear_temp_buf(&self) {
        // Do this without a new allocation.
        self.temp_buf.borrow_mut().clear();
    }
468 | |
469 | fn emit_current_comment(&self) { |
470 | let comment = mem::take(&mut *self.current_comment.borrow_mut()); |
471 | self.process_token_and_continue(CommentToken(comment)); |
472 | } |
473 | |
474 | fn discard_tag(&self) { |
475 | self.current_tag_name.borrow_mut().clear(); |
476 | self.current_tag_self_closing.set(false); |
477 | *self.current_tag_attrs.borrow_mut() = vec![]; |
478 | } |
479 | |
480 | fn create_tag(&self, kind: TagKind, c: char) { |
481 | self.discard_tag(); |
482 | self.current_tag_name.borrow_mut().push_char(c); |
483 | self.current_tag_kind.set(kind); |
484 | } |
485 | |
486 | fn have_appropriate_end_tag(&self) -> bool { |
487 | match self.last_start_tag_name.borrow().as_ref() { |
488 | Some(last) => { |
489 | (self.current_tag_kind.get() == EndTag) |
490 | && (**self.current_tag_name.borrow() == **last) |
491 | }, |
492 | None => false, |
493 | } |
494 | } |
495 | |
    /// Finish the attribute in progress (if any) and begin a new one whose
    /// name starts with `c`.
    fn create_attribute(&self, c: char) {
        self.finish_attribute();

        self.current_attr_name.borrow_mut().push_char(c);
    }
501 | |
    /// Commit the attribute currently being built onto the current tag,
    /// discarding it (with a parse error) if the name is a duplicate.
    fn finish_attribute(&self) {
        if self.current_attr_name.borrow().is_empty() {
            return;
        }

        // Check for a duplicate attribute.
        // FIXME: the spec says we should error as soon as the name is finished.
        // NOTE: the borrow of current_attr_name must end before the mutable
        // borrows below, hence the scoped block.
        let dup = {
            let name = &*self.current_attr_name.borrow();
            self.current_tag_attrs
                .borrow()
                .iter()
                .any(|a| *a.name.local == **name)
        };

        if dup {
            self.emit_error(Borrowed("Duplicate attribute"));
            self.current_attr_name.borrow_mut().clear();
            self.current_attr_value.borrow_mut().clear();
        } else {
            let name = LocalName::from(&**self.current_attr_name.borrow());
            self.current_attr_name.borrow_mut().clear();
            self.current_tag_attrs.borrow_mut().push(Attribute {
                // The tree builder will adjust the namespace if necessary.
                // This only happens in foreign elements.
                name: QualName::new(None, ns!(), name),
                value: mem::take(&mut self.current_attr_value.borrow_mut()),
            });
        }
    }
532 | |
533 | fn emit_current_doctype(&self) { |
534 | let doctype = self.current_doctype.take(); |
535 | self.process_token_and_continue(DoctypeToken(doctype)); |
536 | } |
537 | |
    /// Borrow the public or system identifier of the doctype being built,
    /// selected by `kind`.
    fn doctype_id(&self, kind: DoctypeIdKind) -> RefMut<Option<StrTendril>> {
        let current_doctype = self.current_doctype.borrow_mut();
        match kind {
            Public => RefMut::map(current_doctype, |d| &mut d.public_id),
            System => RefMut::map(current_doctype, |d| &mut d.system_id),
        }
    }
545 | |
546 | fn clear_doctype_id(&self, kind: DoctypeIdKind) { |
547 | let mut id = self.doctype_id(kind); |
548 | match *id { |
549 | Some(ref mut s) => s.clear(), |
550 | None => *id = Some(StrTendril::new()), |
551 | } |
552 | } |
553 | |
    /// Begin tokenizing a character reference; the sub-tokenizer is told
    /// whether we are inside an attribute value (which changes its rules).
    fn consume_char_ref(&self) {
        *self.char_ref_tokenizer.borrow_mut() = Some(Box::new(CharRefTokenizer::new(matches!(
            self.state.get(),
            states::AttributeValue(_)
        ))));
    }
560 | |
    /// Emit the end-of-file token to the sink.
    fn emit_eof(&self) {
        self.process_token_and_continue(EOFToken);
    }
564 | |
565 | fn peek(&self, input: &BufferQueue) -> Option<char> { |
566 | if self.reconsume.get() { |
567 | Some(self.current_char.get()) |
568 | } else { |
569 | input.peek() |
570 | } |
571 | } |
572 | |
    /// Throw away one raw input character (or a pending reconsume).
    fn discard_char(&self, input: &BufferQueue) {
        // peek() deals in un-processed characters (no newline normalization), while get_char()
        // does.
        //
        // since discard_char is supposed to be used in combination with peek(), discard_char must
        // discard a single raw input character, not a normalized newline.
        if self.reconsume.get() {
            self.reconsume.set(false);
        } else {
            input.next();
        }
    }
585 | |
    /// Report a parse error to the sink as a `ParseError` token.
    fn emit_error(&self, error: Cow<'static, str>) {
        self.process_token_and_continue(ParseError(error));
    }
589 | } |
//§ END
591 | |
// Shorthand for common state machine behaviors.
// Each arm maps a terse action name (used inside `go!`) to a method call
// or field mutation on the tokenizer `$me`.
macro_rules! shorthand (
    ( $me:ident : emit $c:expr ) => ( $me.emit_char($c) );
    ( $me:ident : create_tag $kind:ident $c:expr ) => ( $me.create_tag($kind, $c) );
    ( $me:ident : push_tag $c:expr ) => ( $me.current_tag_name.borrow_mut().push_char($c) );
    ( $me:ident : discard_tag ) => ( $me.discard_tag() );
    ( $me:ident : discard_char $input:expr ) => ( $me.discard_char($input) );
    ( $me:ident : push_temp $c:expr ) => ( $me.temp_buf.borrow_mut().push_char($c) );
    ( $me:ident : emit_temp ) => ( $me.emit_temp_buf() );
    ( $me:ident : clear_temp ) => ( $me.clear_temp_buf() );
    ( $me:ident : create_attr $c:expr ) => ( $me.create_attribute($c) );
    ( $me:ident : push_name $c:expr ) => ( $me.current_attr_name.borrow_mut().push_char($c) );
    ( $me:ident : push_value $c:expr ) => ( $me.current_attr_value.borrow_mut().push_char($c) );
    ( $me:ident : append_value $c:expr ) => ( $me.current_attr_value.borrow_mut().push_tendril($c));
    ( $me:ident : push_comment $c:expr ) => ( $me.current_comment.borrow_mut().push_char($c) );
    ( $me:ident : append_comment $c:expr ) => ( $me.current_comment.borrow_mut().push_slice($c) );
    ( $me:ident : emit_comment ) => ( $me.emit_current_comment() );
    ( $me:ident : clear_comment ) => ( $me.current_comment.borrow_mut().clear() );
    ( $me:ident : create_doctype ) => ( *$me.current_doctype.borrow_mut() = Doctype::default() );
    ( $me:ident : push_doctype_name $c:expr ) => ( option_push(&mut $me.current_doctype.borrow_mut().name, $c) );
    ( $me:ident : push_doctype_id $k:ident $c:expr ) => ( option_push(&mut $me.doctype_id($k), $c) );
    ( $me:ident : clear_doctype_id $k:ident ) => ( $me.clear_doctype_id($k) );
    ( $me:ident : force_quirks ) => ( $me.current_doctype.borrow_mut().force_quirks = true);
    ( $me:ident : emit_doctype ) => ( $me.emit_current_doctype() );
    ( $me:ident : error ) => ( $me.bad_char_error() );
    ( $me:ident : error_eof ) => ( $me.bad_eof_error() );
);
619 | |
// Tracing of tokenizer actions. This adds significant bloat and compile time,
// so it's behind a cfg flag.
#[cfg(feature = "trace_tokenizer")]
macro_rules! sh_trace ( ( $me:ident : $($cmds:tt)* ) => ({
    trace!(" {:?}", stringify!($($cmds)*));
    shorthand!($me : $($cmds)*);
}));

// Without the feature, sh_trace is a transparent alias for shorthand.
#[cfg(not(feature = "trace_tokenizer"))]
macro_rules! sh_trace ( ( $me:ident : $($cmds:tt)* ) => ( shorthand!($me: $($cmds)*) ) );
630 | |
// A little DSL for sequencing shorthand actions.
macro_rules! go (
    // A pattern like $($cmd:tt)* ; $($rest:tt)* causes parse ambiguity.
    // We have to tell the parser how much lookahead we need.

    ( $me:ident : $a:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a); go!($me: $($rest)*); });
    ( $me:ident : $a:tt $b:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b); go!($me: $($rest)*); });
    ( $me:ident : $a:tt $b:tt $c:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b $c); go!($me: $($rest)*); });
    ( $me:ident : $a:tt $b:tt $c:tt $d:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b $c $d); go!($me: $($rest)*); });

    // These can only come at the end.

    // State transitions: plain, one-argument, and nested-constructor forms.
    ( $me:ident : to $s:ident ) => ({ $me.state.set(states::$s); return ProcessResult::Continue; });
    ( $me:ident : to $s:ident $k1:expr ) => ({ $me.state.set(states::$s($k1)); return ProcessResult::Continue; });
    ( $me:ident : to $s:ident $k1:ident $k2:expr ) => ({ $me.state.set(states::$s($k1($k2))); return ProcessResult::Continue; });

    // Transition while flagging the current character for reconsumption.
    ( $me:ident : reconsume $s:ident ) => ({ $me.reconsume.set(true); go!($me: to $s); });
    ( $me:ident : reconsume $s:ident $k1:expr ) => ({ $me.reconsume.set(true); go!($me: to $s $k1); });
    ( $me:ident : reconsume $s:ident $k1:ident $k2:expr ) => ({ $me.reconsume.set(true); go!($me: to $s $k1 $k2); });

    ( $me:ident : consume_char_ref ) => ({ $me.consume_char_ref(); return ProcessResult::Continue; });

    // We have a default next state after emitting a tag, but the sink can override.
    ( $me:ident : emit_tag $s:ident ) => ({
        $me.state.set(states::$s);
        return $me.emit_current_tag();
    });

    ( $me:ident : eof ) => ({ $me.emit_eof(); return ProcessResult::Suspend; });

    // If nothing else matched, it's a single command
    ( $me:ident : $($cmd:tt)+ ) => ( sh_trace!($me: $($cmd)+) );

    // or nothing.
    ( $me:ident : ) => (());
);
667 | |
// Run `go!` commands only when `$x` matches one of the given patterns.
macro_rules! go_match ( ( $me:ident : $x:expr, $($pats:pat),+ => $($cmds:tt)* ) => (
    match $x {
        $($pats)|+ => go!($me: $($cmds)*),
        _ => (),
    }
));
674 | |
// This is a macro because it can cause early return
// from the function where it is used.
macro_rules! get_char ( ($me:expr, $input:expr) => (
    unwrap_or_return!($me.get_char($input), ProcessResult::Suspend)
));

// Peek at the next character, suspending if none is available yet.
macro_rules! peek ( ($me:expr, $input:expr) => (
    unwrap_or_return!($me.peek($input), ProcessResult::Suspend)
));

// Fast-path pop (see Tokenizer::pop_except_from), suspending on empty input.
macro_rules! pop_except_from ( ($me:expr, $input:expr, $set:expr) => (
    unwrap_or_return!($me.pop_except_from($input, $set), ProcessResult::Suspend)
));

// Case-insensitive lookahead match, suspending if it cannot be decided yet.
macro_rules! eat ( ($me:expr, $input:expr, $pat:expr) => (
    unwrap_or_return!($me.eat($input, $pat, u8::eq_ignore_ascii_case), ProcessResult::Suspend)
));

// Case-sensitive lookahead match, suspending if it cannot be decided yet.
macro_rules! eat_exact ( ($me:expr, $input:expr, $pat:expr) => (
    unwrap_or_return!($me.eat($input, $pat, u8::eq), ProcessResult::Suspend)
));
696 | |
697 | impl<Sink: TokenSink> Tokenizer<Sink> { |
698 | // Run the state machine for a while. |
699 | // Return true if we should be immediately re-invoked |
700 | // (this just simplifies control flow vs. break / continue). |
701 | #[allow (clippy::never_loop)] |
702 | fn step(&self, input: &BufferQueue) -> ProcessResult<Sink::Handle> { |
703 | if self.char_ref_tokenizer.borrow().is_some() { |
704 | return self.step_char_ref_tokenizer(input); |
705 | } |
706 | |
707 | trace!("processing in state {:?}" , self.state); |
708 | match self.state.get() { |
709 | //ยง data-state |
710 | states::Data => loop { |
711 | match pop_except_from!(self, input, small_char_set!(' \r' ' \0' '&' '<' ' \n' )) { |
712 | FromSet(' \0' ) => go!(self: error; emit ' \0' ), |
713 | FromSet('&' ) => go!(self: consume_char_ref), |
714 | FromSet('<' ) => go!(self: to TagOpen), |
715 | FromSet(c) => go!(self: emit c), |
716 | NotFromSet(b) => self.emit_chars(b), |
717 | } |
718 | }, |
719 | |
720 | //ยง rcdata-state |
721 | states::RawData(Rcdata) => loop { |
722 | match pop_except_from!(self, input, small_char_set!(' \r' ' \0' '&' '<' ' \n' )) { |
723 | FromSet(' \0' ) => go!(self: error; emit ' \u{fffd}' ), |
724 | FromSet('&' ) => go!(self: consume_char_ref), |
725 | FromSet('<' ) => go!(self: to RawLessThanSign Rcdata), |
726 | FromSet(c) => go!(self: emit c), |
727 | NotFromSet(b) => self.emit_chars(b), |
728 | } |
729 | }, |
730 | |
731 | //ยง rawtext-state |
732 | states::RawData(Rawtext) => loop { |
733 | match pop_except_from!(self, input, small_char_set!(' \r' ' \0' '<' ' \n' )) { |
734 | FromSet(' \0' ) => go!(self: error; emit ' \u{fffd}' ), |
735 | FromSet('<' ) => go!(self: to RawLessThanSign Rawtext), |
736 | FromSet(c) => go!(self: emit c), |
737 | NotFromSet(b) => self.emit_chars(b), |
738 | } |
739 | }, |
740 | |
741 | //ยง script-data-state |
742 | states::RawData(ScriptData) => loop { |
743 | match pop_except_from!(self, input, small_char_set!(' \r' ' \0' '<' ' \n' )) { |
744 | FromSet(' \0' ) => go!(self: error; emit ' \u{fffd}' ), |
745 | FromSet('<' ) => go!(self: to RawLessThanSign ScriptData), |
746 | FromSet(c) => go!(self: emit c), |
747 | NotFromSet(b) => self.emit_chars(b), |
748 | } |
749 | }, |
750 | |
751 | //ยง script-data-escaped-state |
752 | states::RawData(ScriptDataEscaped(Escaped)) => loop { |
753 | match pop_except_from!(self, input, small_char_set!(' \r' ' \0' '-' '<' ' \n' )) { |
754 | FromSet(' \0' ) => go!(self: error; emit ' \u{fffd}' ), |
755 | FromSet('-' ) => go!(self: emit '-' ; to ScriptDataEscapedDash Escaped), |
756 | FromSet('<' ) => go!(self: to RawLessThanSign ScriptDataEscaped Escaped), |
757 | FromSet(c) => go!(self: emit c), |
758 | NotFromSet(b) => self.emit_chars(b), |
759 | } |
760 | }, |
761 | |
762 | //ยง script-data-double-escaped-state |
763 | states::RawData(ScriptDataEscaped(DoubleEscaped)) => loop { |
764 | match pop_except_from!(self, input, small_char_set!(' \r' ' \0' '-' '<' ' \n' )) { |
765 | FromSet(' \0' ) => go!(self: error; emit ' \u{fffd}' ), |
766 | FromSet('-' ) => go!(self: emit '-' ; to ScriptDataEscapedDash DoubleEscaped), |
767 | FromSet('<' ) => { |
768 | go!(self: emit '<' ; to RawLessThanSign ScriptDataEscaped DoubleEscaped) |
769 | }, |
770 | FromSet(c) => go!(self: emit c), |
771 | NotFromSet(b) => self.emit_chars(b), |
772 | } |
773 | }, |
774 | |
775 | //ยง plaintext-state |
776 | states::Plaintext => loop { |
777 | match pop_except_from!(self, input, small_char_set!(' \r' ' \0' ' \n' )) { |
778 | FromSet(' \0' ) => go!(self: error; emit ' \u{fffd}' ), |
779 | FromSet(c) => go!(self: emit c), |
780 | NotFromSet(b) => self.emit_chars(b), |
781 | } |
782 | }, |
783 | |
784 | //ยง tag-open-state |
785 | states::TagOpen => loop { |
786 | match get_char!(self, input) { |
787 | '!' => go!(self: to MarkupDeclarationOpen), |
788 | '/' => go!(self: to EndTagOpen), |
789 | '?' => go!(self: error; clear_comment; reconsume BogusComment), |
790 | c => match lower_ascii_letter(c) { |
791 | Some(cl) => go!(self: create_tag StartTag cl; to TagName), |
792 | None => go!(self: error; emit '<' ; reconsume Data), |
793 | }, |
794 | } |
795 | }, |
796 | |
797 | //ยง end-tag-open-state |
798 | states::EndTagOpen => loop { |
799 | match get_char!(self, input) { |
800 | '>' => go!(self: error; to Data), |
801 | c => match lower_ascii_letter(c) { |
802 | Some(cl) => go!(self: create_tag EndTag cl; to TagName), |
803 | None => go!(self: error; clear_comment; reconsume BogusComment), |
804 | }, |
805 | } |
806 | }, |
807 | |
808 | //ยง tag-name-state |
809 | states::TagName => loop { |
810 | match get_char!(self, input) { |
811 | ' \t' | ' \n' | ' \x0C' | ' ' => go!(self: to BeforeAttributeName), |
812 | '/' => go!(self: to SelfClosingStartTag), |
813 | '>' => go!(self: emit_tag Data), |
814 | ' \0' => go!(self: error; push_tag ' \u{fffd}' ), |
815 | c => go!(self: push_tag (c.to_ascii_lowercase())), |
816 | } |
817 | }, |
818 | |
819 | //ยง script-data-escaped-less-than-sign-state |
820 | states::RawLessThanSign(ScriptDataEscaped(Escaped)) => loop { |
821 | match get_char!(self, input) { |
822 | '/' => go!(self: clear_temp; to RawEndTagOpen ScriptDataEscaped Escaped), |
823 | c => match lower_ascii_letter(c) { |
824 | Some(cl) => go!(self: clear_temp; push_temp cl; emit '<' ; emit c; |
825 | to ScriptDataEscapeStart DoubleEscaped), |
826 | None => go!(self: emit '<' ; reconsume RawData ScriptDataEscaped Escaped), |
827 | }, |
828 | } |
829 | }, |
830 | |
831 | //ยง script-data-double-escaped-less-than-sign-state |
832 | states::RawLessThanSign(ScriptDataEscaped(DoubleEscaped)) => loop { |
833 | match get_char!(self, input) { |
834 | '/' => go!(self: clear_temp; emit '/' ; to ScriptDataDoubleEscapeEnd), |
835 | _ => go!(self: reconsume RawData ScriptDataEscaped DoubleEscaped), |
836 | } |
837 | }, |
838 | |
839 | //ยง rcdata-less-than-sign-state rawtext-less-than-sign-state script-data-less-than-sign-state |
840 | // otherwise |
841 | states::RawLessThanSign(kind) => loop { |
842 | match get_char!(self, input) { |
843 | '/' => go!(self: clear_temp; to RawEndTagOpen kind), |
844 | '!' if kind == ScriptData => { |
845 | go!(self: emit '<' ; emit '!' ; to ScriptDataEscapeStart Escaped) |
846 | }, |
847 | _ => go!(self: emit '<' ; reconsume RawData kind), |
848 | } |
849 | }, |
850 | |
851 | //ยง rcdata-end-tag-open-state rawtext-end-tag-open-state script-data-end-tag-open-state script-data-escaped-end-tag-open-state |
852 | states::RawEndTagOpen(kind) => loop { |
853 | let c = get_char!(self, input); |
854 | match lower_ascii_letter(c) { |
855 | Some(cl) => go!(self: create_tag EndTag cl; push_temp c; to RawEndTagName kind), |
856 | None => go!(self: emit '<' ; emit '/' ; reconsume RawData kind), |
857 | } |
858 | }, |
859 | |
860 | //ยง rcdata-end-tag-name-state rawtext-end-tag-name-state script-data-end-tag-name-state script-data-escaped-end-tag-name-state |
861 | states::RawEndTagName(kind) => loop { |
862 | let c = get_char!(self, input); |
863 | if self.have_appropriate_end_tag() { |
864 | match c { |
865 | ' \t' | ' \n' | ' \x0C' | ' ' => go!(self: clear_temp; to BeforeAttributeName), |
866 | '/' => go!(self: clear_temp; to SelfClosingStartTag), |
867 | '>' => go!(self: clear_temp; emit_tag Data), |
868 | _ => (), |
869 | } |
870 | } |
871 | |
872 | match lower_ascii_letter(c) { |
873 | Some(cl) => go!(self: push_tag cl; push_temp c), |
874 | None => { |
875 | go!(self: discard_tag; emit '<' ; emit '/' ; emit_temp; reconsume RawData kind) |
876 | }, |
877 | } |
878 | }, |
879 | |
880 | //ยง script-data-double-escape-start-state |
881 | states::ScriptDataEscapeStart(DoubleEscaped) => loop { |
882 | let c = get_char!(self, input); |
883 | match c { |
884 | ' \t' | ' \n' | ' \x0C' | ' ' | '/' | '>' => { |
885 | let esc = if &**self.temp_buf.borrow() == "script" { |
886 | DoubleEscaped |
887 | } else { |
888 | Escaped |
889 | }; |
890 | go!(self: emit c; to RawData ScriptDataEscaped esc); |
891 | }, |
892 | _ => match lower_ascii_letter(c) { |
893 | Some(cl) => go!(self: push_temp cl; emit c), |
894 | None => go!(self: reconsume RawData ScriptDataEscaped Escaped), |
895 | }, |
896 | } |
897 | }, |
898 | |
899 | //ยง script-data-escape-start-state |
900 | states::ScriptDataEscapeStart(Escaped) => loop { |
901 | match get_char!(self, input) { |
902 | '-' => go!(self: emit '-' ; to ScriptDataEscapeStartDash), |
903 | _ => go!(self: reconsume RawData ScriptData), |
904 | } |
905 | }, |
906 | |
907 | //ยง script-data-escape-start-dash-state |
908 | states::ScriptDataEscapeStartDash => loop { |
909 | match get_char!(self, input) { |
910 | '-' => go!(self: emit '-' ; to ScriptDataEscapedDashDash Escaped), |
911 | _ => go!(self: reconsume RawData ScriptData), |
912 | } |
913 | }, |
914 | |
915 | //ยง script-data-escaped-dash-state script-data-double-escaped-dash-state |
916 | states::ScriptDataEscapedDash(kind) => loop { |
917 | match get_char!(self, input) { |
918 | '-' => go!(self: emit '-' ; to ScriptDataEscapedDashDash kind), |
919 | '<' => { |
920 | if kind == DoubleEscaped { |
921 | go!(self: emit '<' ); |
922 | } |
923 | go!(self: to RawLessThanSign ScriptDataEscaped kind); |
924 | }, |
925 | ' \0' => go!(self: error; emit ' \u{fffd}' ; to RawData ScriptDataEscaped kind), |
926 | c => go!(self: emit c; to RawData ScriptDataEscaped kind), |
927 | } |
928 | }, |
929 | |
930 | //ยง script-data-escaped-dash-dash-state script-data-double-escaped-dash-dash-state |
931 | states::ScriptDataEscapedDashDash(kind) => loop { |
932 | match get_char!(self, input) { |
933 | '-' => go!(self: emit '-' ), |
934 | '<' => { |
935 | if kind == DoubleEscaped { |
936 | go!(self: emit '<' ); |
937 | } |
938 | go!(self: to RawLessThanSign ScriptDataEscaped kind); |
939 | }, |
940 | '>' => go!(self: emit '>' ; to RawData ScriptData), |
941 | ' \0' => go!(self: error; emit ' \u{fffd}' ; to RawData ScriptDataEscaped kind), |
942 | c => go!(self: emit c; to RawData ScriptDataEscaped kind), |
943 | } |
944 | }, |
945 | |
946 | //ยง script-data-double-escape-end-state |
947 | states::ScriptDataDoubleEscapeEnd => loop { |
948 | let c = get_char!(self, input); |
949 | match c { |
950 | ' \t' | ' \n' | ' \x0C' | ' ' | '/' | '>' => { |
951 | let esc = if &**self.temp_buf.borrow() == "script" { |
952 | Escaped |
953 | } else { |
954 | DoubleEscaped |
955 | }; |
956 | go!(self: emit c; to RawData ScriptDataEscaped esc); |
957 | }, |
958 | _ => match lower_ascii_letter(c) { |
959 | Some(cl) => go!(self: push_temp cl; emit c), |
960 | None => go!(self: reconsume RawData ScriptDataEscaped DoubleEscaped), |
961 | }, |
962 | } |
963 | }, |
964 | |
965 | //ยง before-attribute-name-state |
966 | states::BeforeAttributeName => loop { |
967 | match get_char!(self, input) { |
968 | ' \t' | ' \n' | ' \x0C' | ' ' => (), |
969 | '/' => go!(self: to SelfClosingStartTag), |
970 | '>' => go!(self: emit_tag Data), |
971 | ' \0' => go!(self: error; create_attr ' \u{fffd}' ; to AttributeName), |
972 | c => match lower_ascii_letter(c) { |
973 | Some(cl) => go!(self: create_attr cl; to AttributeName), |
974 | None => { |
975 | go_match!(self: c, |
976 | '"' , ' \'' , '<' , '=' => error); |
977 | go!(self: create_attr c; to AttributeName); |
978 | }, |
979 | }, |
980 | } |
981 | }, |
982 | |
983 | //ยง attribute-name-state |
984 | states::AttributeName => loop { |
985 | match get_char!(self, input) { |
986 | ' \t' | ' \n' | ' \x0C' | ' ' => go!(self: to AfterAttributeName), |
987 | '/' => go!(self: to SelfClosingStartTag), |
988 | '=' => go!(self: to BeforeAttributeValue), |
989 | '>' => go!(self: emit_tag Data), |
990 | ' \0' => go!(self: error; push_name ' \u{fffd}' ), |
991 | c => match lower_ascii_letter(c) { |
992 | Some(cl) => go!(self: push_name cl), |
993 | None => { |
994 | go_match!(self: c, |
995 | '"' , ' \'' , '<' => error); |
996 | go!(self: push_name c); |
997 | }, |
998 | }, |
999 | } |
1000 | }, |
1001 | |
1002 | //ยง after-attribute-name-state |
1003 | states::AfterAttributeName => loop { |
1004 | match get_char!(self, input) { |
1005 | ' \t' | ' \n' | ' \x0C' | ' ' => (), |
1006 | '/' => go!(self: to SelfClosingStartTag), |
1007 | '=' => go!(self: to BeforeAttributeValue), |
1008 | '>' => go!(self: emit_tag Data), |
1009 | ' \0' => go!(self: error; create_attr ' \u{fffd}' ; to AttributeName), |
1010 | c => match lower_ascii_letter(c) { |
1011 | Some(cl) => go!(self: create_attr cl; to AttributeName), |
1012 | None => { |
1013 | go_match!(self: c, |
1014 | '"' , ' \'' , '<' => error); |
1015 | go!(self: create_attr c; to AttributeName); |
1016 | }, |
1017 | }, |
1018 | } |
1019 | }, |
1020 | |
1021 | //ยง before-attribute-value-state |
1022 | // Use peek so we can handle the first attr character along with the rest, |
1023 | // hopefully in the same zero-copy buffer. |
1024 | states::BeforeAttributeValue => loop { |
1025 | match peek!(self, input) { |
1026 | ' \t' | ' \n' | ' \r' | ' \x0C' | ' ' => go!(self: discard_char input), |
1027 | '"' => go!(self: discard_char input; to AttributeValue DoubleQuoted), |
1028 | ' \'' => go!(self: discard_char input; to AttributeValue SingleQuoted), |
1029 | '>' => go!(self: discard_char input; error; emit_tag Data), |
1030 | _ => go!(self: to AttributeValue Unquoted), |
1031 | } |
1032 | }, |
1033 | |
1034 | //ยง attribute-value-(double-quoted)-state |
1035 | states::AttributeValue(DoubleQuoted) => loop { |
1036 | match pop_except_from!(self, input, small_char_set!(' \r' '"' '&' ' \0' ' \n' )) { |
1037 | FromSet('"' ) => go!(self: to AfterAttributeValueQuoted), |
1038 | FromSet('&' ) => go!(self: consume_char_ref), |
1039 | FromSet(' \0' ) => go!(self: error; push_value ' \u{fffd}' ), |
1040 | FromSet(c) => go!(self: push_value c), |
1041 | NotFromSet(ref b) => go!(self: append_value b), |
1042 | } |
1043 | }, |
1044 | |
1045 | //ยง attribute-value-(single-quoted)-state |
1046 | states::AttributeValue(SingleQuoted) => loop { |
1047 | match pop_except_from!(self, input, small_char_set!(' \r' ' \'' '&' ' \0' ' \n' )) { |
1048 | FromSet(' \'' ) => go!(self: to AfterAttributeValueQuoted), |
1049 | FromSet('&' ) => go!(self: consume_char_ref), |
1050 | FromSet(' \0' ) => go!(self: error; push_value ' \u{fffd}' ), |
1051 | FromSet(c) => go!(self: push_value c), |
1052 | NotFromSet(ref b) => go!(self: append_value b), |
1053 | } |
1054 | }, |
1055 | |
1056 | //ยง attribute-value-(unquoted)-state |
1057 | states::AttributeValue(Unquoted) => loop { |
1058 | match pop_except_from!( |
1059 | self, |
1060 | input, |
1061 | small_char_set!(' \r' ' \t' ' \n' ' \x0C' ' ' '&' '>' ' \0' ) |
1062 | ) { |
1063 | FromSet(' \t' ) | FromSet(' \n' ) | FromSet(' \x0C' ) | FromSet(' ' ) => { |
1064 | go!(self: to BeforeAttributeName) |
1065 | }, |
1066 | FromSet('&' ) => go!(self: consume_char_ref), |
1067 | FromSet('>' ) => go!(self: emit_tag Data), |
1068 | FromSet(' \0' ) => go!(self: error; push_value ' \u{fffd}' ), |
1069 | FromSet(c) => { |
1070 | go_match!(self: c, |
1071 | '"' , ' \'' , '<' , '=' , '`' => error); |
1072 | go!(self: push_value c); |
1073 | }, |
1074 | NotFromSet(ref b) => go!(self: append_value b), |
1075 | } |
1076 | }, |
1077 | |
1078 | //ยง after-attribute-value-(quoted)-state |
1079 | states::AfterAttributeValueQuoted => loop { |
1080 | match get_char!(self, input) { |
1081 | ' \t' | ' \n' | ' \x0C' | ' ' => go!(self: to BeforeAttributeName), |
1082 | '/' => go!(self: to SelfClosingStartTag), |
1083 | '>' => go!(self: emit_tag Data), |
1084 | _ => go!(self: error; reconsume BeforeAttributeName), |
1085 | } |
1086 | }, |
1087 | |
1088 | //ยง self-closing-start-tag-state |
1089 | states::SelfClosingStartTag => loop { |
1090 | match get_char!(self, input) { |
1091 | '>' => { |
1092 | self.current_tag_self_closing.set(true); |
1093 | go!(self: emit_tag Data); |
1094 | }, |
1095 | _ => go!(self: error; reconsume BeforeAttributeName), |
1096 | } |
1097 | }, |
1098 | |
1099 | //ยง comment-start-state |
1100 | states::CommentStart => loop { |
1101 | match get_char!(self, input) { |
1102 | '-' => go!(self: to CommentStartDash), |
1103 | ' \0' => go!(self: error; push_comment ' \u{fffd}' ; to Comment), |
1104 | '>' => go!(self: error; emit_comment; to Data), |
1105 | c => go!(self: push_comment c; to Comment), |
1106 | } |
1107 | }, |
1108 | |
1109 | //ยง comment-start-dash-state |
1110 | states::CommentStartDash => loop { |
1111 | match get_char!(self, input) { |
1112 | '-' => go!(self: to CommentEnd), |
1113 | ' \0' => go!(self: error; append_comment "- \u{fffd}" ; to Comment), |
1114 | '>' => go!(self: error; emit_comment; to Data), |
1115 | c => go!(self: push_comment '-' ; push_comment c; to Comment), |
1116 | } |
1117 | }, |
1118 | |
1119 | //ยง comment-state |
1120 | states::Comment => loop { |
1121 | match get_char!(self, input) { |
1122 | c @ '<' => go!(self: push_comment c; to CommentLessThanSign), |
1123 | '-' => go!(self: to CommentEndDash), |
1124 | ' \0' => go!(self: error; push_comment ' \u{fffd}' ), |
1125 | c => go!(self: push_comment c), |
1126 | } |
1127 | }, |
1128 | |
1129 | //ยง comment-less-than-sign-state |
1130 | states::CommentLessThanSign => loop { |
1131 | match get_char!(self, input) { |
1132 | c @ '!' => go!(self: push_comment c; to CommentLessThanSignBang), |
1133 | c @ '<' => go!(self: push_comment c), |
1134 | _ => go!(self: reconsume Comment), |
1135 | } |
1136 | }, |
1137 | |
1138 | //ยง comment-less-than-sign-bang |
1139 | states::CommentLessThanSignBang => loop { |
1140 | match get_char!(self, input) { |
1141 | '-' => go!(self: to CommentLessThanSignBangDash), |
1142 | _ => go!(self: reconsume Comment), |
1143 | } |
1144 | }, |
1145 | |
1146 | //ยง comment-less-than-sign-bang-dash |
1147 | states::CommentLessThanSignBangDash => loop { |
1148 | match get_char!(self, input) { |
1149 | '-' => go!(self: to CommentLessThanSignBangDashDash), |
1150 | _ => go!(self: reconsume CommentEndDash), |
1151 | } |
1152 | }, |
1153 | |
1154 | //ยง comment-less-than-sign-bang-dash-dash |
1155 | states::CommentLessThanSignBangDashDash => loop { |
1156 | match get_char!(self, input) { |
1157 | '>' => go!(self: reconsume CommentEnd), |
1158 | _ => go!(self: error; reconsume CommentEnd), |
1159 | } |
1160 | }, |
1161 | |
1162 | //ยง comment-end-dash-state |
1163 | states::CommentEndDash => loop { |
1164 | match get_char!(self, input) { |
1165 | '-' => go!(self: to CommentEnd), |
1166 | ' \0' => go!(self: error; append_comment "- \u{fffd}" ; to Comment), |
1167 | c => go!(self: push_comment '-' ; push_comment c; to Comment), |
1168 | } |
1169 | }, |
1170 | |
1171 | //ยง comment-end-state |
1172 | states::CommentEnd => loop { |
1173 | match get_char!(self, input) { |
1174 | '>' => go!(self: emit_comment; to Data), |
1175 | '!' => go!(self: to CommentEndBang), |
1176 | '-' => go!(self: push_comment '-' ), |
1177 | _ => go!(self: append_comment "--" ; reconsume Comment), |
1178 | } |
1179 | }, |
1180 | |
1181 | //ยง comment-end-bang-state |
1182 | states::CommentEndBang => loop { |
1183 | match get_char!(self, input) { |
1184 | '-' => go!(self: append_comment "--!" ; to CommentEndDash), |
1185 | '>' => go!(self: error; emit_comment; to Data), |
1186 | ' \0' => go!(self: error; append_comment "--! \u{fffd}" ; to Comment), |
1187 | c => go!(self: append_comment "--!" ; push_comment c; to Comment), |
1188 | } |
1189 | }, |
1190 | |
1191 | //ยง doctype-state |
1192 | states::Doctype => loop { |
1193 | match get_char!(self, input) { |
1194 | ' \t' | ' \n' | ' \x0C' | ' ' => go!(self: to BeforeDoctypeName), |
1195 | '>' => go!(self: reconsume BeforeDoctypeName), |
1196 | _ => go!(self: error; reconsume BeforeDoctypeName), |
1197 | } |
1198 | }, |
1199 | |
1200 | //ยง before-doctype-name-state |
1201 | states::BeforeDoctypeName => loop { |
1202 | match get_char!(self, input) { |
1203 | ' \t' | ' \n' | ' \x0C' | ' ' => (), |
1204 | ' \0' => { |
1205 | go!(self: error; create_doctype; push_doctype_name ' \u{fffd}' ; to DoctypeName) |
1206 | }, |
1207 | '>' => go!(self: error; create_doctype; force_quirks; emit_doctype; to Data), |
1208 | c => go!(self: create_doctype; push_doctype_name (c.to_ascii_lowercase()); |
1209 | to DoctypeName), |
1210 | } |
1211 | }, |
1212 | |
1213 | //ยง doctype-name-state |
1214 | states::DoctypeName => loop { |
1215 | match get_char!(self, input) { |
1216 | ' \t' | ' \n' | ' \x0C' | ' ' => go!(self: clear_temp; to AfterDoctypeName), |
1217 | '>' => go!(self: emit_doctype; to Data), |
1218 | ' \0' => go!(self: error; push_doctype_name ' \u{fffd}' ), |
1219 | c => go!(self: push_doctype_name (c.to_ascii_lowercase())), |
1220 | } |
1221 | }, |
1222 | |
1223 | //ยง after-doctype-name-state |
1224 | states::AfterDoctypeName => loop { |
1225 | if eat!(self, input, "public" ) { |
1226 | go!(self: to AfterDoctypeKeyword Public); |
1227 | } else if eat!(self, input, "system" ) { |
1228 | go!(self: to AfterDoctypeKeyword System); |
1229 | } else { |
1230 | match get_char!(self, input) { |
1231 | ' \t' | ' \n' | ' \x0C' | ' ' => (), |
1232 | '>' => go!(self: emit_doctype; to Data), |
1233 | _ => go!(self: error; force_quirks; reconsume BogusDoctype), |
1234 | } |
1235 | } |
1236 | }, |
1237 | |
1238 | //ยง after-doctype-public-keyword-state after-doctype-system-keyword-state |
1239 | states::AfterDoctypeKeyword(kind) => loop { |
1240 | match get_char!(self, input) { |
1241 | ' \t' | ' \n' | ' \x0C' | ' ' => go!(self: to BeforeDoctypeIdentifier kind), |
1242 | '"' => { |
1243 | go!(self: error; clear_doctype_id kind; to DoctypeIdentifierDoubleQuoted kind) |
1244 | }, |
1245 | ' \'' => { |
1246 | go!(self: error; clear_doctype_id kind; to DoctypeIdentifierSingleQuoted kind) |
1247 | }, |
1248 | '>' => go!(self: error; force_quirks; emit_doctype; to Data), |
1249 | _ => go!(self: error; force_quirks; reconsume BogusDoctype), |
1250 | } |
1251 | }, |
1252 | |
1253 | //ยง before-doctype-public-identifier-state before-doctype-system-identifier-state |
1254 | states::BeforeDoctypeIdentifier(kind) => loop { |
1255 | match get_char!(self, input) { |
1256 | ' \t' | ' \n' | ' \x0C' | ' ' => (), |
1257 | '"' => go!(self: clear_doctype_id kind; to DoctypeIdentifierDoubleQuoted kind), |
1258 | ' \'' => go!(self: clear_doctype_id kind; to DoctypeIdentifierSingleQuoted kind), |
1259 | '>' => go!(self: error; force_quirks; emit_doctype; to Data), |
1260 | _ => go!(self: error; force_quirks; reconsume BogusDoctype), |
1261 | } |
1262 | }, |
1263 | |
1264 | //ยง doctype-public-identifier-(double-quoted)-state doctype-system-identifier-(double-quoted)-state |
1265 | states::DoctypeIdentifierDoubleQuoted(kind) => loop { |
1266 | match get_char!(self, input) { |
1267 | '"' => go!(self: to AfterDoctypeIdentifier kind), |
1268 | ' \0' => go!(self: error; push_doctype_id kind ' \u{fffd}' ), |
1269 | '>' => go!(self: error; force_quirks; emit_doctype; to Data), |
1270 | c => go!(self: push_doctype_id kind c), |
1271 | } |
1272 | }, |
1273 | |
1274 | //ยง doctype-public-identifier-(single-quoted)-state doctype-system-identifier-(single-quoted)-state |
1275 | states::DoctypeIdentifierSingleQuoted(kind) => loop { |
1276 | match get_char!(self, input) { |
1277 | ' \'' => go!(self: to AfterDoctypeIdentifier kind), |
1278 | ' \0' => go!(self: error; push_doctype_id kind ' \u{fffd}' ), |
1279 | '>' => go!(self: error; force_quirks; emit_doctype; to Data), |
1280 | c => go!(self: push_doctype_id kind c), |
1281 | } |
1282 | }, |
1283 | |
1284 | //ยง after-doctype-public-identifier-state |
1285 | states::AfterDoctypeIdentifier(Public) => loop { |
1286 | match get_char!(self, input) { |
1287 | ' \t' | ' \n' | ' \x0C' | ' ' => { |
1288 | go!(self: to BetweenDoctypePublicAndSystemIdentifiers) |
1289 | }, |
1290 | '>' => go!(self: emit_doctype; to Data), |
1291 | '"' => { |
1292 | go!(self: error; clear_doctype_id System; to DoctypeIdentifierDoubleQuoted System) |
1293 | }, |
1294 | ' \'' => { |
1295 | go!(self: error; clear_doctype_id System; to DoctypeIdentifierSingleQuoted System) |
1296 | }, |
1297 | _ => go!(self: error; force_quirks; reconsume BogusDoctype), |
1298 | } |
1299 | }, |
1300 | |
1301 | //ยง after-doctype-system-identifier-state |
1302 | states::AfterDoctypeIdentifier(System) => loop { |
1303 | match get_char!(self, input) { |
1304 | ' \t' | ' \n' | ' \x0C' | ' ' => (), |
1305 | '>' => go!(self: emit_doctype; to Data), |
1306 | _ => go!(self: error; reconsume BogusDoctype), |
1307 | } |
1308 | }, |
1309 | |
1310 | //ยง between-doctype-public-and-system-identifiers-state |
1311 | states::BetweenDoctypePublicAndSystemIdentifiers => loop { |
1312 | match get_char!(self, input) { |
1313 | ' \t' | ' \n' | ' \x0C' | ' ' => (), |
1314 | '>' => go!(self: emit_doctype; to Data), |
1315 | '"' => { |
1316 | go!(self: clear_doctype_id System; to DoctypeIdentifierDoubleQuoted System) |
1317 | }, |
1318 | ' \'' => { |
1319 | go!(self: clear_doctype_id System; to DoctypeIdentifierSingleQuoted System) |
1320 | }, |
1321 | _ => go!(self: error; force_quirks; reconsume BogusDoctype), |
1322 | } |
1323 | }, |
1324 | |
1325 | //ยง bogus-doctype-state |
1326 | states::BogusDoctype => loop { |
1327 | match get_char!(self, input) { |
1328 | '>' => go!(self: emit_doctype; to Data), |
1329 | ' \0' => go!(self: error), |
1330 | _ => (), |
1331 | } |
1332 | }, |
1333 | |
1334 | //ยง bogus-comment-state |
1335 | states::BogusComment => loop { |
1336 | match get_char!(self, input) { |
1337 | '>' => go!(self: emit_comment; to Data), |
1338 | ' \0' => go!(self: error; push_comment ' \u{fffd}' ), |
1339 | c => go!(self: push_comment c), |
1340 | } |
1341 | }, |
1342 | |
1343 | //ยง markup-declaration-open-state |
1344 | states::MarkupDeclarationOpen => loop { |
1345 | if eat_exact!(self, input, "--" ) { |
1346 | go!(self: clear_comment; to CommentStart); |
1347 | } else if eat!(self, input, "doctype" ) { |
1348 | go!(self: to Doctype); |
1349 | } else { |
1350 | if self |
1351 | .sink |
1352 | .adjusted_current_node_present_but_not_in_html_namespace() |
1353 | && eat_exact!(self, input, "[CDATA[" ) |
1354 | { |
1355 | go!(self: clear_temp; to CdataSection); |
1356 | } |
1357 | go!(self: error; clear_comment; to BogusComment); |
1358 | } |
1359 | }, |
1360 | |
1361 | //ยง cdata-section-state |
1362 | states::CdataSection => loop { |
1363 | match get_char!(self, input) { |
1364 | ']' => go!(self: to CdataSectionBracket), |
1365 | ' \0' => go!(self: emit_temp; emit ' \0' ), |
1366 | c => go!(self: push_temp c), |
1367 | } |
1368 | }, |
1369 | |
1370 | //ยง cdata-section-bracket |
1371 | states::CdataSectionBracket => match get_char!(self, input) { |
1372 | ']' => go!(self: to CdataSectionEnd), |
1373 | _ => go!(self: push_temp ']' ; reconsume CdataSection), |
1374 | }, |
1375 | |
1376 | //ยง cdata-section-end |
1377 | states::CdataSectionEnd => loop { |
1378 | match get_char!(self, input) { |
1379 | ']' => go!(self: push_temp ']' ), |
1380 | '>' => go!(self: emit_temp; to Data), |
1381 | _ => go!(self: push_temp ']' ; push_temp ']' ; reconsume CdataSection), |
1382 | } |
1383 | }, |
1384 | //ยง END |
1385 | } |
1386 | } |
1387 | |
1388 | fn step_char_ref_tokenizer(&self, input: &BufferQueue) -> ProcessResult<Sink::Handle> { |
1389 | // FIXME HACK: Take and replace the tokenizer so we don't |
1390 | // double-mut-borrow self. This is why it's boxed. |
1391 | let mut tok = self.char_ref_tokenizer.take().unwrap(); |
1392 | let outcome = tok.step(self, input); |
1393 | |
1394 | let progress = match outcome { |
1395 | char_ref::Done => { |
1396 | self.process_char_ref(tok.get_result()); |
1397 | return ProcessResult::Continue; |
1398 | }, |
1399 | |
1400 | char_ref::Stuck => ProcessResult::Suspend, |
1401 | char_ref::Progress => ProcessResult::Continue, |
1402 | }; |
1403 | |
1404 | *self.char_ref_tokenizer.borrow_mut() = Some(tok); |
1405 | progress |
1406 | } |
1407 | |
1408 | fn process_char_ref(&self, char_ref: CharRef) { |
1409 | let CharRef { |
1410 | mut chars, |
1411 | mut num_chars, |
1412 | } = char_ref; |
1413 | |
1414 | if num_chars == 0 { |
1415 | chars[0] = '&' ; |
1416 | num_chars = 1; |
1417 | } |
1418 | |
1419 | for i in 0..num_chars { |
1420 | let c = chars[i as usize]; |
1421 | match self.state.get() { |
1422 | states::Data | states::RawData(states::Rcdata) => go!(self: emit c), |
1423 | |
1424 | states::AttributeValue(_) => go!(self: push_value c), |
1425 | |
1426 | _ => panic!( |
1427 | "state {:?} should not be reachable in process_char_ref" , |
1428 | self.state.get() |
1429 | ), |
1430 | } |
1431 | } |
1432 | } |
1433 | |
1434 | /// Indicate that we have reached the end of the input. |
1435 | pub fn end(&self) { |
1436 | // Handle EOF in the char ref sub-tokenizer, if there is one. |
1437 | // Do this first because it might un-consume stuff. |
1438 | let input = BufferQueue::default(); |
1439 | match self.char_ref_tokenizer.take() { |
1440 | None => (), |
1441 | Some(mut tok) => { |
1442 | tok.end_of_file(self, &input); |
1443 | self.process_char_ref(tok.get_result()); |
1444 | }, |
1445 | } |
1446 | |
1447 | // Process all remaining buffered input. |
1448 | // If we're waiting for lookahead, we're not gonna get it. |
1449 | self.at_eof.set(true); |
1450 | assert!(matches!(self.run(&input), TokenizerResult::Done)); |
1451 | assert!(input.is_empty()); |
1452 | |
1453 | loop { |
1454 | match self.eof_step() { |
1455 | ProcessResult::Continue => (), |
1456 | ProcessResult::Suspend => break, |
1457 | ProcessResult::Script(_) => unreachable!(), |
1458 | } |
1459 | } |
1460 | |
1461 | self.sink.end(); |
1462 | |
1463 | if self.opts.profile { |
1464 | self.dump_profile(); |
1465 | } |
1466 | } |
1467 | |
1468 | fn dump_profile(&self) { |
1469 | let mut results: Vec<(states::State, u64)> = self |
1470 | .state_profile |
1471 | .borrow() |
1472 | .iter() |
1473 | .map(|(s, t)| (*s, *t)) |
1474 | .collect(); |
1475 | results.sort_by(|&(_, x), &(_, y)| y.cmp(&x)); |
1476 | |
1477 | let total: u64 = results |
1478 | .iter() |
1479 | .map(|&(_, t)| t) |
1480 | .fold(0, ::std::ops::Add::add); |
1481 | println!(" \nTokenizer profile, in nanoseconds" ); |
1482 | println!( |
1483 | " \n{:12} total in token sink" , |
1484 | self.time_in_sink.get() |
1485 | ); |
1486 | println!(" \n{total:12} total in tokenizer" ); |
1487 | |
1488 | for (k, v) in results.into_iter() { |
1489 | let pct = 100.0 * (v as f64) / (total as f64); |
1490 | println!(" {v:12} {pct:4.1}% {k:?}" ); |
1491 | } |
1492 | } |
1493 | |
    /// Perform one step of the state machine after the input is exhausted.
    ///
    /// Each arm implements the end-of-file entry for the corresponding
    /// tokenizer state: flush buffered text, report a parse error and/or
    /// emit a forced token where required, then transition toward a state
    /// that can finish (ultimately `Data`, whose arm emits EOF).
    /// Control flow (return vs. fall-through) is carried by the `go!`
    /// macro expansions.
    fn eof_step(&self) -> ProcessResult<Sink::Handle> {
        debug!("processing EOF in state {:?}", self.state.get());
        match self.state.get() {
            // States where EOF simply means "emit the EOF token".
            states::Data
            | states::RawData(Rcdata)
            | states::RawData(Rawtext)
            | states::RawData(ScriptData)
            | states::Plaintext => go!(self: eof),

            // Mid-tag states: EOF is a parse error; the partial tag is
            // discarded by returning to Data.
            states::TagName
            | states::RawData(ScriptDataEscaped(_))
            | states::BeforeAttributeName
            | states::AttributeName
            | states::AfterAttributeName
            | states::AttributeValue(_)
            | states::AfterAttributeValueQuoted
            | states::SelfClosingStartTag
            | states::ScriptDataEscapedDash(_)
            | states::ScriptDataEscapedDashDash(_) => go!(self: error_eof; to Data),

            states::BeforeAttributeValue => go!(self: reconsume AttributeValue Unquoted),

            states::TagOpen => go!(self: error_eof; emit '<'; to Data),

            states::EndTagOpen => go!(self: error_eof; emit '<'; emit '/'; to Data),

            // NOTE: this arm must stay before the generic
            // `RawLessThanSign(kind)` arm below — it matches a subset of
            // the same states.
            states::RawLessThanSign(ScriptDataEscaped(DoubleEscaped)) => {
                go!(self: to RawData ScriptDataEscaped DoubleEscaped)
            },

            states::RawLessThanSign(kind) => go!(self: emit '<'; to RawData kind),

            states::RawEndTagOpen(kind) => go!(self: emit '<'; emit '/'; to RawData kind),

            // Flush the buffered (partial) end-tag name as text.
            states::RawEndTagName(kind) => {
                go!(self: emit '<'; emit '/'; emit_temp; to RawData kind)
            },

            states::ScriptDataEscapeStart(kind) => go!(self: to RawData ScriptDataEscaped kind),

            states::ScriptDataEscapeStartDash => go!(self: to RawData ScriptData),

            states::ScriptDataDoubleEscapeEnd => {
                go!(self: to RawData ScriptDataEscaped DoubleEscaped)
            },

            // Unterminated comments: report the error, then emit what
            // was accumulated.
            states::CommentStart
            | states::CommentStartDash
            | states::Comment
            | states::CommentEndDash
            | states::CommentEnd
            | states::CommentEndBang => go!(self: error_eof; emit_comment; to Data),

            states::CommentLessThanSign | states::CommentLessThanSignBang => {
                go!(self: reconsume Comment)
            },

            states::CommentLessThanSignBangDash => go!(self: reconsume CommentEndDash),

            states::CommentLessThanSignBangDashDash => go!(self: reconsume CommentEnd),

            // Unterminated doctypes are emitted with force-quirks set.
            states::Doctype | states::BeforeDoctypeName => {
                go!(self: error_eof; create_doctype; force_quirks; emit_doctype; to Data)
            },

            states::DoctypeName
            | states::AfterDoctypeName
            | states::AfterDoctypeKeyword(_)
            | states::BeforeDoctypeIdentifier(_)
            | states::DoctypeIdentifierDoubleQuoted(_)
            | states::DoctypeIdentifierSingleQuoted(_)
            | states::AfterDoctypeIdentifier(_)
            | states::BetweenDoctypePublicAndSystemIdentifiers => {
                go!(self: error_eof; force_quirks; emit_doctype; to Data)
            },

            states::BogusDoctype => go!(self: emit_doctype; to Data),

            states::BogusComment => go!(self: emit_comment; to Data),

            states::MarkupDeclarationOpen => go!(self: error; to BogusComment),

            // An unterminated CDATA section flushes its buffered text.
            states::CdataSection => go!(self: emit_temp; error_eof; to Data),

            // Pending ']' characters are restored to the CDATA text.
            states::CdataSectionBracket => go!(self: push_temp ']'; to CdataSection),

            states::CdataSectionEnd => go!(self: push_temp ']'; push_temp ']'; to CdataSection),
        }
    }
1583 | } |
1584 | |
#[cfg(test)]
#[allow(non_snake_case)]
mod test {
    use super::option_push; // private items
    use crate::tendril::{SliceExt, StrTendril};

    use super::{TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts};

    use super::interface::{CharacterTokens, EOFToken, NullCharacterToken, ParseError};
    use super::interface::{EndTag, StartTag, Tag, TagKind};
    use super::interface::{TagToken, Token};

    use markup5ever::buffer_queue::BufferQueue;
    use std::cell::RefCell;

    use crate::LocalName;

    // LinesMatch implements the TokenSink trait. It is used for testing to see
    // if current_line is being updated when process_token is called. The lines
    // vector is a collection of the line numbers that each token is on.
    struct LinesMatch {
        tokens: RefCell<Vec<Token>>,
        current_str: RefCell<StrTendril>,
        lines: RefCell<Vec<(Token, u64)>>,
    }

    impl LinesMatch {
        fn new() -> LinesMatch {
            LinesMatch {
                tokens: RefCell::new(vec![]),
                current_str: RefCell::new(StrTendril::new()),
                lines: RefCell::new(vec![]),
            }
        }

        /// Record `token` together with the line it was seen on,
        /// flushing any buffered character data first.
        fn push(&self, token: Token, line_number: u64) {
            self.finish_str();
            self.lines.borrow_mut().push((token, line_number));
        }

        /// Flush accumulated characters into `tokens` as a single
        /// `CharacterTokens`, if any were buffered.
        fn finish_str(&self) {
            if self.current_str.borrow().len() > 0 {
                let s = self.current_str.take();
                self.tokens.borrow_mut().push(CharacterTokens(s));
            }
        }
    }

    impl TokenSink for LinesMatch {
        type Handle = ();

        fn process_token(&self, token: Token, line_number: u64) -> TokenSinkResult<Self::Handle> {
            match token {
                // Character data is buffered so that consecutive chunks
                // coalesce into one token.
                CharacterTokens(b) => {
                    self.current_str.borrow_mut().push_slice(&b);
                },

                NullCharacterToken => {
                    self.current_str.borrow_mut().push_char('\0');
                },

                ParseError(_) => {
                    panic!("unexpected parse error");
                },

                TagToken(mut t) => {
                    // The spec seems to indicate that one can emit
                    // erroneous end tags with attrs, but the test
                    // cases don't contain them.
                    match t.kind {
                        EndTag => {
                            t.self_closing = false;
                            t.attrs = vec![];
                        },
                        // Sort attrs so comparisons are order-independent.
                        _ => t.attrs.sort_by(|a1, a2| a1.name.cmp(&a2.name)),
                    }
                    self.push(TagToken(t), line_number);
                },

                EOFToken => (),

                _ => self.push(token, line_number),
            }
            TokenSinkResult::Continue
        }
    }

    /// Tokenizer options shared by the line-number tests. Factored out
    /// so `check_lines` and `check_lines_with_new_line` cannot drift
    /// apart.
    fn line_check_opts() -> TokenizerOpts {
        TokenizerOpts {
            exact_errors: false,
            discard_bom: true,
            profile: false,
            initial_state: None,
            last_start_tag_name: None,
        }
    }

    // Take in tokens, process them, and return vector with line
    // numbers that each token is on
    fn tokenize(input: Vec<StrTendril>, opts: TokenizerOpts) -> Vec<(Token, u64)> {
        let sink = LinesMatch::new();
        let tok = Tokenizer::new(sink, opts);
        let buffer = BufferQueue::default();
        for chunk in input.into_iter() {
            buffer.push_back(chunk);
            let _ = tok.feed(&buffer);
        }
        tok.end();
        tok.sink.lines.take()
    }

    // Create a tag token
    fn create_tag(token: StrTendril, tagkind: TagKind) -> Token {
        let name = LocalName::from(&*token);

        TagToken(Tag {
            kind: tagkind,
            name,
            self_closing: false,
            attrs: vec![],
        })
    }

    #[test]
    fn push_to_None_gives_singleton() {
        let mut s: Option<StrTendril> = None;
        option_push(&mut s, 'x');
        assert_eq!(s, Some("x".to_tendril()));
    }

    #[test]
    fn push_to_empty_appends() {
        let mut s: Option<StrTendril> = Some(StrTendril::new());
        option_push(&mut s, 'x');
        assert_eq!(s, Some("x".to_tendril()));
    }

    #[test]
    fn push_to_nonempty_appends() {
        let mut s: Option<StrTendril> = Some(StrTendril::from_slice("y"));
        option_push(&mut s, 'x');
        assert_eq!(s, Some("yx".to_tendril()));
    }

    #[test]
    fn check_lines() {
        let vector = vec![
            StrTendril::from("<a>\n"),
            StrTendril::from("<b>\n"),
            StrTendril::from("</b>\n"),
            StrTendril::from("</a>\n"),
        ];
        let expected = vec![
            (create_tag(StrTendril::from("a"), StartTag), 1),
            (create_tag(StrTendril::from("b"), StartTag), 2),
            (create_tag(StrTendril::from("b"), EndTag), 3),
            (create_tag(StrTendril::from("a"), EndTag), 4),
        ];
        let results = tokenize(vector, line_check_opts());
        assert_eq!(results, expected);
    }

    #[test]
    fn check_lines_with_new_line() {
        // Same as check_lines, but with CRLF line endings; the \r must
        // not affect the reported line numbers.
        let vector = vec![
            StrTendril::from("<a>\r\n"),
            StrTendril::from("<b>\r\n"),
            StrTendril::from("</b>\r\n"),
            StrTendril::from("</a>\r\n"),
        ];
        let expected = vec![
            (create_tag(StrTendril::from("a"), StartTag), 1),
            (create_tag(StrTendril::from("b"), StartTag), 2),
            (create_tag(StrTendril::from("b"), EndTag), 3),
            (create_tag(StrTendril::from("a"), EndTag), 4),
        ];
        let results = tokenize(vector, line_check_opts());
        assert_eq!(results, expected);
    }
}
1769 | |