1 | // Copyright 2014-2017 The html5ever Project Developers. See the |
2 | // COPYRIGHT file at the top-level directory of this distribution. |
3 | // |
4 | // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or |
5 | // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license |
6 | // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your |
7 | // option. This file may not be copied, modified, or distributed |
8 | // except according to those terms. |
9 | |
10 | //! The HTML5 tokenizer. |
11 | |
12 | pub use self::interface::{CharacterTokens, EOFToken, NullCharacterToken, ParseError}; |
13 | pub use self::interface::{CommentToken, DoctypeToken, TagToken, Token}; |
14 | pub use self::interface::{Doctype, EndTag, StartTag, Tag, TagKind}; |
15 | pub use self::interface::{TokenSink, TokenSinkResult}; |
16 | |
17 | use self::states::{DoctypeIdKind, Public, System}; |
18 | use self::states::{DoubleEscaped, Escaped}; |
19 | use self::states::{DoubleQuoted, SingleQuoted, Unquoted}; |
20 | use self::states::{Rawtext, Rcdata, ScriptData, ScriptDataEscaped}; |
21 | |
22 | use self::char_ref::{CharRef, CharRefTokenizer}; |
23 | |
24 | use crate::util::str::lower_ascii_letter; |
25 | |
26 | use log::{debug, trace}; |
27 | use mac::{_tt_as_expr_hack, format_if, matches}; |
28 | use markup5ever::{namespace_url, ns, small_char_set}; |
29 | use std::borrow::Cow::{self, Borrowed}; |
30 | use std::collections::BTreeMap; |
31 | use std::default::Default; |
32 | use std::mem::replace; |
33 | |
34 | pub use crate::buffer_queue::{BufferQueue, FromSet, NotFromSet, SetResult}; |
35 | use crate::tendril::StrTendril; |
36 | use crate::{Attribute, LocalName, QualName, SmallCharSet}; |
37 | |
38 | mod char_ref; |
39 | mod interface; |
40 | pub mod states; |
41 | |
/// Result of a single `step` of the tokenizer state machine.
pub enum ProcessResult<Handle> {
    /// Keep driving the state machine.
    Continue,
    /// Stop stepping for now (input exhausted, or EOF emitted).
    Suspend,
    /// The sink encountered a script that must run before tokenization
    /// resumes; `Handle` identifies the script node.
    Script(Handle),
}
47 | |
/// Result of feeding input to the tokenizer: either it consumed everything
/// it could, or it stopped because a script must be executed first.
#[must_use]
pub enum TokenizerResult<Handle> {
    /// All available input was processed.
    Done,
    /// Tokenization is suspended pending execution of this script.
    Script(Handle),
}
53 | |
54 | fn option_push(opt_str: &mut Option<StrTendril>, c: char) { |
55 | match *opt_str { |
56 | Some(ref mut s: &mut Tendril) => s.push_char(c), |
57 | None => *opt_str = Some(StrTendril::from_char(c)), |
58 | } |
59 | } |
60 | |
/// Tokenizer options, with an impl for `Default`.
#[derive(Clone)]
pub struct TokenizerOpts {
    /// Report all parse errors described in the spec, at some
    /// performance penalty? Default: false
    pub exact_errors: bool,

    /// Discard a `U+FEFF BYTE ORDER MARK` if we see one at the beginning
    /// of the stream? Default: true
    pub discard_bom: bool,

    /// Keep a record of how long we spent in each state? Printed
    /// when `end()` is called. Default: false
    pub profile: bool,

    /// Initial state override. Only the test runner should use
    /// a non-`None` value!
    pub initial_state: Option<states::State>,

    /// Last start tag. Only the test runner should use a
    /// non-`None` value!
    ///
    /// FIXME: Can't use Tendril because we want TokenizerOpts
    /// to be Send.
    pub last_start_tag_name: Option<String>,
}
87 | |
88 | impl Default for TokenizerOpts { |
89 | fn default() -> TokenizerOpts { |
90 | TokenizerOpts { |
91 | exact_errors: false, |
92 | discard_bom: true, |
93 | profile: false, |
94 | initial_state: None, |
95 | last_start_tag_name: None, |
96 | } |
97 | } |
98 | } |
99 | |
/// The HTML tokenizer.
pub struct Tokenizer<Sink> {
    /// Options controlling the behavior of the tokenizer.
    opts: TokenizerOpts,

    /// Destination for tokens we emit.
    pub sink: Sink,

    /// The abstract machine state as described in the spec.
    state: states::State,

    /// Are we at the end of the file, once buffers have been processed
    /// completely? This affects whether we will wait for lookahead or not.
    at_eof: bool,

    /// Tokenizer for character references, if we're tokenizing
    /// one at the moment.
    char_ref_tokenizer: Option<Box<CharRefTokenizer>>,

    /// Current input character. Just consumed, may reconsume.
    current_char: char,

    /// Should we reconsume the current input character?
    reconsume: bool,

    /// Did we just consume \r, translating it to \n? In that case we need
    /// to ignore the next character if it's \n.
    ignore_lf: bool,

    /// Discard a U+FEFF BYTE ORDER MARK if we see one? Only done at the
    /// beginning of the stream.
    discard_bom: bool,

    /// Current tag kind (start or end), for the tag being built.
    current_tag_kind: TagKind,

    /// Current tag name.
    current_tag_name: StrTendril,

    /// Current tag is self-closing?
    current_tag_self_closing: bool,

    /// Current tag attributes.
    current_tag_attrs: Vec<Attribute>,

    /// Current attribute name.
    current_attr_name: StrTendril,

    /// Current attribute value.
    current_attr_value: StrTendril,

    /// Current comment.
    current_comment: StrTendril,

    /// Current doctype token.
    current_doctype: Doctype,

    /// Last start tag name, for use in checking "appropriate end tag".
    last_start_tag_name: Option<LocalName>,

    /// The "temporary buffer" mentioned in the spec.
    temp_buf: StrTendril,

    /// Record of how many ns we spent in each state, if profiling is enabled.
    state_profile: BTreeMap<states::State, u64>,

    /// Record of how many ns we spent in the token sink.
    time_in_sink: u64,

    /// Track current line (1-based; reported to the sink with each token).
    current_line: u64,
}
172 | |
impl<Sink: TokenSink> Tokenizer<Sink> {
    /// Create a new tokenizer which feeds tokens to a particular `TokenSink`.
    pub fn new(sink: Sink, mut opts: TokenizerOpts) -> Tokenizer<Sink> {
        // Move the test-only "last start tag" override out of the options,
        // interning it as a LocalName.
        let start_tag_name = opts
            .last_start_tag_name
            .take()
            .map(|s| LocalName::from(&*s));
        let state = opts.initial_state.unwrap_or(states::Data);
        let discard_bom = opts.discard_bom;
        Tokenizer {
            opts,
            sink,
            state,
            char_ref_tokenizer: None,
            at_eof: false,
            current_char: '\0',
            reconsume: false,
            ignore_lf: false,
            discard_bom,
            current_tag_kind: StartTag,
            current_tag_name: StrTendril::new(),
            current_tag_self_closing: false,
            current_tag_attrs: vec![],
            current_attr_name: StrTendril::new(),
            current_attr_value: StrTendril::new(),
            current_comment: StrTendril::new(),
            current_doctype: Doctype::new(),
            last_start_tag_name: start_tag_name,
            temp_buf: StrTendril::new(),
            state_profile: BTreeMap::new(),
            time_in_sink: 0,
            // Line numbers are 1-based.
            current_line: 1,
        }
    }

    /// Feed an input string into the tokenizer.
    ///
    /// Returns `TokenizerResult::Script(..)` if the sink suspended
    /// tokenization (script execution); `Done` otherwise.
    pub fn feed(&mut self, input: &mut BufferQueue) -> TokenizerResult<Sink::Handle> {
        if input.is_empty() {
            return TokenizerResult::Done;
        }

        // Drop a single leading U+FEFF at the start of the stream.
        // NOTE(review): `self.discard_bom` is not cleared after the check, so
        // the peek-for-BOM runs again on later feeds — confirm this matches
        // upstream intent before changing.
        if self.discard_bom {
            if let Some(c) = input.peek() {
                if c == '\u{feff}' {
                    input.next();
                }
            } else {
                return TokenizerResult::Done;
            }
        };

        self.run(input)
    }

    /// Switch to the PLAINTEXT state.
    pub fn set_plaintext_state(&mut self) {
        self.state = states::Plaintext;
    }

    /// Hand one token to the sink, attributing time spent in the sink
    /// separately when profiling is enabled.
    fn process_token(&mut self, token: Token) -> TokenSinkResult<Sink::Handle> {
        if self.opts.profile {
            // `time!` — presumably defined later in this file — yields
            // (result, elapsed ns); verify against the macro definition.
            let (ret, dt) = time!(self.sink.process_token(token, self.current_line));
            self.time_in_sink += dt;
            ret
        } else {
            self.sink.process_token(token, self.current_line)
        }
    }

    /// Like `process_token`, for tokens where the sink is required to
    /// answer `Continue` (anything else is a bug, hence the assert).
    fn process_token_and_continue(&mut self, token: Token) {
        assert!(matches!(
            self.process_token(token),
            TokenSinkResult::Continue
        ));
    }

    //§ preprocessing-the-input-stream
    // Get the next input character, which might be the character
    // 'c' that we already consumed from the buffers.
    fn get_preprocessed_char(&mut self, mut c: char, input: &mut BufferQueue) -> Option<char> {
        // A \n immediately following a \r was already reported (as \n);
        // skip it and fetch the next character instead.
        if self.ignore_lf {
            self.ignore_lf = false;
            if c == '\n' {
                c = unwrap_or_return!(input.next(), None);
            }
        }

        // Normalize \r (and, via the flag above, \r\n) to \n.
        if c == '\r' {
            self.ignore_lf = true;
            c = '\n';
        }

        if c == '\n' {
            self.current_line += 1;
        }

        // Per-character "bad character" errors are gated on exact_errors
        // because this check costs time on every input character.
        if self.opts.exact_errors &&
            match c as u32 {
                // Control characters and the U+FDD0..U+FDEF noncharacters.
                0x01..=0x08 | 0x0B | 0x0E..=0x1F | 0x7F..=0x9F | 0xFDD0..=0xFDEF => true,
                // Codepoints whose low 16 bits are FFFE or FFFF.
                n if (n & 0xFFFE) == 0xFFFE => true,
                _ => false,
            }
        {
            let msg = format!("Bad character {}", c);
            self.emit_error(Cow::Owned(msg));
        }

        trace!("got character {}", c);
        self.current_char = c;
        Some(c)
    }

    //§ tokenization
    // Get the next input character, if one is available.
    fn get_char(&mut self, input: &mut BufferQueue) -> Option<char> {
        if self.reconsume {
            // Re-deliver the character we already consumed.
            self.reconsume = false;
            Some(self.current_char)
        } else {
            input
                .next()
                .and_then(|c| self.get_preprocessed_char(c, input))
        }
    }

    // Pop either one character in `set`, or a run of characters not in it.
    fn pop_except_from(&mut self, input: &mut BufferQueue, set: SmallCharSet) -> Option<SetResult> {
        // Bail to the slow path for various corner cases.
        // This means that `FromSet` can contain characters not in the set!
        // It shouldn't matter because the fallback `FromSet` case should
        // always do the same thing as the `NotFromSet` case.
        if self.opts.exact_errors || self.reconsume || self.ignore_lf {
            return self.get_char(input).map(FromSet);
        }

        let d = input.pop_except_from(set);
        trace!("got characters {:?}", d);
        match d {
            Some(FromSet(c)) => self.get_preprocessed_char(c, input).map(FromSet),

            // NB: We don't set self.current_char for a run of characters not
            // in the set. It shouldn't matter for the codepaths that use
            // this.
            _ => d,
        }
    }

    // Check if the next characters are an ASCII case-insensitive match. See
    // BufferQueue::eat.
    //
    // NB: this doesn't do input stream preprocessing or set the current input
    // character.
    fn eat(
        &mut self,
        input: &mut BufferQueue,
        pat: &str,
        eq: fn(&u8, &u8) -> bool,
    ) -> Option<bool> {
        // Re-prepend anything we buffered on a previous partial attempt.
        input.push_front(replace(&mut self.temp_buf, StrTendril::new()));
        match input.eat(pat, eq) {
            // At EOF a partial match can never complete.
            None if self.at_eof => Some(false),
            None => {
                // Not enough input to decide: stash what's left so the next
                // call can retry with more data.
                while let Some(c) = input.next() {
                    self.temp_buf.push_char(c);
                }
                None
            },
            Some(matched) => Some(matched),
        }
    }

    /// Run the state machine for as long as we can.
    fn run(&mut self, input: &mut BufferQueue) -> TokenizerResult<Sink::Handle> {
        if self.opts.profile {
            loop {
                let state = self.state;
                let old_sink = self.time_in_sink;
                let (run, mut dt) = time!(self.step(input));
                // Don't bill time spent inside the sink to this state.
                dt -= (self.time_in_sink - old_sink);
                let new = match self.state_profile.get_mut(&state) {
                    Some(x) => {
                        *x += dt;
                        false
                    },
                    None => true,
                };
                if new {
                    // do this here because of borrow shenanigans
                    self.state_profile.insert(state, dt);
                }
                match run {
                    ProcessResult::Continue => (),
                    ProcessResult::Suspend => break,
                    ProcessResult::Script(node) => return TokenizerResult::Script(node),
                }
            }
        } else {
            loop {
                match self.step(input) {
                    ProcessResult::Continue => (),
                    ProcessResult::Suspend => break,
                    ProcessResult::Script(node) => return TokenizerResult::Script(node),
                }
            }
        }
        TokenizerResult::Done
    }

    /// Report an unexpected character; detailed message only with exact_errors.
    fn bad_char_error(&mut self) {
        let msg = format_if!(
            self.opts.exact_errors,
            "Bad character",
            "Saw {} in state {:?}",
            self.current_char,
            self.state
        );
        self.emit_error(msg);
    }

    /// Report an unexpected end of file; detailed message only with exact_errors.
    fn bad_eof_error(&mut self) {
        let msg = format_if!(
            self.opts.exact_errors,
            "Unexpected EOF",
            "Saw EOF in state {:?}",
            self.state
        );
        self.emit_error(msg);
    }

    /// Emit a single character token (NUL gets its own token variant).
    fn emit_char(&mut self, c: char) {
        self.process_token_and_continue(match c {
            '\0' => NullCharacterToken,
            _ => CharacterTokens(StrTendril::from_char(c)),
        });
    }

    // The string must not contain '\0'!
    fn emit_chars(&mut self, b: StrTendril) {
        self.process_token_and_continue(CharacterTokens(b));
    }

    /// Finish and emit the tag currently being built; the sink's answer
    /// may redirect the tokenizer (Plaintext / RawData / script suspension).
    fn emit_current_tag(&mut self) -> ProcessResult<Sink::Handle> {
        self.finish_attribute();

        let name = LocalName::from(&*self.current_tag_name);
        self.current_tag_name.clear();

        match self.current_tag_kind {
            StartTag => {
                // Remember for the "appropriate end tag" check.
                self.last_start_tag_name = Some(name.clone());
            },
            EndTag => {
                if !self.current_tag_attrs.is_empty() {
                    self.emit_error(Borrowed("Attributes on an end tag"));
                }
                if self.current_tag_self_closing {
                    self.emit_error(Borrowed("Self-closing end tag"));
                }
            },
        }

        let token = TagToken(Tag {
            kind: self.current_tag_kind,
            name,
            self_closing: self.current_tag_self_closing,
            attrs: replace(&mut self.current_tag_attrs, vec![]),
        });

        match self.process_token(token) {
            TokenSinkResult::Continue => ProcessResult::Continue,
            TokenSinkResult::Plaintext => {
                self.state = states::Plaintext;
                ProcessResult::Continue
            },
            TokenSinkResult::Script(node) => {
                self.state = states::Data;
                ProcessResult::Script(node)
            },
            TokenSinkResult::RawData(kind) => {
                self.state = states::RawData(kind);
                ProcessResult::Continue
            },
        }
    }

    /// Emit the temporary buffer as character tokens, leaving it empty.
    fn emit_temp_buf(&mut self) {
        // FIXME: Make sure that clearing on emit is spec-compatible.
        let buf = replace(&mut self.temp_buf, StrTendril::new());
        self.emit_chars(buf);
    }

    fn clear_temp_buf(&mut self) {
        // Do this without a new allocation.
        self.temp_buf.clear();
    }

    /// Emit the comment currently being built, leaving the buffer empty.
    fn emit_current_comment(&mut self) {
        let comment = replace(&mut self.current_comment, StrTendril::new());
        self.process_token_and_continue(CommentToken(comment));
    }

    /// Reset all in-progress tag state.
    fn discard_tag(&mut self) {
        self.current_tag_name.clear();
        self.current_tag_self_closing = false;
        self.current_tag_attrs = vec![];
    }

    /// Begin a new tag of the given kind whose name starts with `c`.
    fn create_tag(&mut self, kind: TagKind, c: char) {
        self.discard_tag();
        self.current_tag_name.push_char(c);
        self.current_tag_kind = kind;
    }

    /// Spec "appropriate end tag": an end tag whose name matches the most
    /// recently emitted start tag.
    fn have_appropriate_end_tag(&self) -> bool {
        match self.last_start_tag_name.as_ref() {
            Some(last) => (self.current_tag_kind == EndTag) && (*self.current_tag_name == **last),
            None => false,
        }
    }

    /// Finish any pending attribute, then start a new one named `c…`.
    fn create_attribute(&mut self, c: char) {
        self.finish_attribute();

        self.current_attr_name.push_char(c);
    }

    /// Commit the in-progress attribute to the tag, dropping duplicates.
    fn finish_attribute(&mut self) {
        if self.current_attr_name.is_empty() {
            return;
        }

        // Check for a duplicate attribute.
        // FIXME: the spec says we should error as soon as the name is finished.
        // FIXME: linear time search, do we care?
        let dup = {
            let name = &*self.current_attr_name;
            self.current_tag_attrs
                .iter()
                .any(|a| &*a.name.local == name)
        };

        if dup {
            self.emit_error(Borrowed("Duplicate attribute"));
            self.current_attr_name.clear();
            self.current_attr_value.clear();
        } else {
            let name = LocalName::from(&*self.current_attr_name);
            self.current_attr_name.clear();
            self.current_tag_attrs.push(Attribute {
                // The tree builder will adjust the namespace if necessary.
                // This only happens in foreign elements.
                name: QualName::new(None, ns!(), name),
                value: replace(&mut self.current_attr_value, StrTendril::new()),
            });
        }
    }

    /// Emit the doctype currently being built, resetting it to empty.
    fn emit_current_doctype(&mut self) {
        let doctype = replace(&mut self.current_doctype, Doctype::new());
        self.process_token_and_continue(DoctypeToken(doctype));
    }

    /// Select the doctype's public or system identifier field.
    fn doctype_id(&mut self, kind: DoctypeIdKind) -> &mut Option<StrTendril> {
        match kind {
            Public => &mut self.current_doctype.public_id,
            System => &mut self.current_doctype.system_id,
        }
    }

    /// Reset the selected doctype identifier to the empty (but present) string.
    fn clear_doctype_id(&mut self, kind: DoctypeIdKind) {
        let id = self.doctype_id(kind);
        match *id {
            Some(ref mut s) => s.clear(),
            None => *id = Some(StrTendril::new()),
        }
    }

    /// Start a nested character-reference tokenizer.
    fn consume_char_ref(&mut self, addnl_allowed: Option<char>) {
        // NB: The char ref tokenizer assumes we have an additional allowed
        // character iff we're tokenizing in an attribute value.
        self.char_ref_tokenizer = Some(Box::new(CharRefTokenizer::new(addnl_allowed)));
    }

    fn emit_eof(&mut self) {
        self.process_token_and_continue(EOFToken);
    }

    /// Look at the next character without consuming it, honoring reconsume.
    fn peek(&mut self, input: &BufferQueue) -> Option<char> {
        if self.reconsume {
            Some(self.current_char)
        } else {
            input.peek()
        }
    }

    /// Consume and drop one character.
    fn discard_char(&mut self, input: &mut BufferQueue) {
        self.get_char(input);
    }

    /// Report a parse error to the sink.
    fn emit_error(&mut self, error: Cow<'static, str>) {
        self.process_token_and_continue(ParseError(error));
    }
}
574 | //§ END |
575 | |
// Shorthand for common state machine behaviors. Each arm forwards to a
// Tokenizer method or mutates a Tokenizer buffer; `go!` sequences these.
macro_rules! shorthand (
    // Character / buffer emission.
    ( $me:ident : emit $c:expr ) => ( $me.emit_char($c) );
    // Tag construction.
    ( $me:ident : create_tag $kind:ident $c:expr ) => ( $me.create_tag($kind, $c) );
    ( $me:ident : push_tag $c:expr ) => ( $me.current_tag_name.push_char($c) );
    ( $me:ident : discard_tag ) => ( $me.discard_tag() );
    ( $me:ident : discard_char $input:expr ) => ( $me.discard_char($input) );
    // The spec's "temporary buffer".
    ( $me:ident : push_temp $c:expr ) => ( $me.temp_buf.push_char($c) );
    ( $me:ident : emit_temp ) => ( $me.emit_temp_buf() );
    ( $me:ident : clear_temp ) => ( $me.clear_temp_buf() );
    // Attribute construction.
    ( $me:ident : create_attr $c:expr ) => ( $me.create_attribute($c) );
    ( $me:ident : push_name $c:expr ) => ( $me.current_attr_name.push_char($c) );
    ( $me:ident : push_value $c:expr ) => ( $me.current_attr_value.push_char($c) );
    ( $me:ident : append_value $c:expr ) => ( $me.current_attr_value.push_tendril($c) );
    // Comment construction.
    ( $me:ident : push_comment $c:expr ) => ( $me.current_comment.push_char($c) );
    ( $me:ident : append_comment $c:expr ) => ( $me.current_comment.push_slice($c) );
    ( $me:ident : emit_comment ) => ( $me.emit_current_comment() );
    ( $me:ident : clear_comment ) => ( $me.current_comment.clear() );
    // Doctype construction.
    ( $me:ident : create_doctype ) => ( $me.current_doctype = Doctype::new() );
    ( $me:ident : push_doctype_name $c:expr ) => ( option_push(&mut $me.current_doctype.name, $c) );
    ( $me:ident : push_doctype_id $k:ident $c:expr ) => ( option_push($me.doctype_id($k), $c) );
    ( $me:ident : clear_doctype_id $k:ident ) => ( $me.clear_doctype_id($k) );
    ( $me:ident : force_quirks ) => ( $me.current_doctype.force_quirks = true );
    ( $me:ident : emit_doctype ) => ( $me.emit_current_doctype() );
    // Error reporting.
    ( $me:ident : error ) => ( $me.bad_char_error() );
    ( $me:ident : error_eof ) => ( $me.bad_eof_error() );
);
603 | |
// Tracing of tokenizer actions. This adds significant bloat and compile time,
// so it's behind a cfg flag.
//
// Fixes: the transcriber previously wrote `shorthand!($me:expr : ...)` —
// in a macro *expansion* the fragment specifier is not allowed, so this
// emitted stray `:expr` tokens that no `shorthand!` arm could match — and
// `{:s}` is pre-1.0 format syntax rejected by modern rustc. Both errors
// surfaced only when building with `--cfg trace_tokenizer`.
#[cfg(trace_tokenizer)]
macro_rules! sh_trace ( ( $me:ident : $($cmds:tt)* ) => ({
    trace!("  {}", stringify!($($cmds)*));
    shorthand!($me: $($cmds)*);
}));
611 | |
// Non-tracing build: expand straight to the shorthand with no logging.
#[cfg(not(trace_tokenizer))]
macro_rules! sh_trace ( ( $me:ident : $($cmds:tt)* ) => ( shorthand!($me: $($cmds)*) ) );
614 | |
// A little DSL for sequencing shorthand actions.
macro_rules! go (
    // A pattern like $($cmd:tt)* ; $($rest:tt)* causes parse ambiguity.
    // We have to tell the parser how much lookahead we need.
    // Hence one arm per command arity (1 to 4 tokens before the `;`).

    ( $me:ident : $a:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a); go!($me: $($rest)*); });
    ( $me:ident : $a:tt $b:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b); go!($me: $($rest)*); });
    ( $me:ident : $a:tt $b:tt $c:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b $c); go!($me: $($rest)*); });
    ( $me:ident : $a:tt $b:tt $c:tt $d:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b $c $d); go!($me: $($rest)*); });

    // These can only come at the end.

    // State transitions: plain state, one-argument state, nested-kind state.
    ( $me:ident : to $s:ident ) => ({ $me.state = states::$s; return ProcessResult::Continue; });
    ( $me:ident : to $s:ident $k1:expr ) => ({ $me.state = states::$s($k1); return ProcessResult::Continue; });
    ( $me:ident : to $s:ident $k1:ident $k2:expr ) => ({ $me.state = states::$s($k1($k2)); return ProcessResult::Continue; });

    // Same three shapes, but re-deliver the current character first.
    ( $me:ident : reconsume $s:ident ) => ({ $me.reconsume = true; go!($me: to $s); });
    ( $me:ident : reconsume $s:ident $k1:expr ) => ({ $me.reconsume = true; go!($me: to $s $k1); });
    ( $me:ident : reconsume $s:ident $k1:ident $k2:expr ) => ({ $me.reconsume = true; go!($me: to $s $k1 $k2); });

    ( $me:ident : consume_char_ref ) => ({ $me.consume_char_ref(None); return ProcessResult::Continue; });
    ( $me:ident : consume_char_ref $addnl:expr ) => ({ $me.consume_char_ref(Some($addnl)); return ProcessResult::Continue; });

    // We have a default next state after emitting a tag, but the sink can override.
    ( $me:ident : emit_tag $s:ident ) => ({
        $me.state = states::$s;
        return $me.emit_current_tag();
    });

    ( $me:ident : eof ) => ({ $me.emit_eof(); return ProcessResult::Suspend; });

    // If nothing else matched, it's a single command
    ( $me:ident : $($cmd:tt)+ ) => ( sh_trace!($me: $($cmd)+) );

    // or nothing.
    ( $me:ident : ) => (());
);
652 | |
// Run a `go!` command sequence only when `$x` matches one of the patterns;
// anything else is a no-op.
macro_rules! go_match ( ( $me:ident : $x:expr, $($pats:pat),+ => $($cmds:tt)* ) => (
    match $x {
        $($pats)|+ => go!($me: $($cmds)*),
        _ => (),
    }
));
659 | |
// This is a macro because it can cause early return
// from the function where it is used: when no character is available
// the enclosing `step` suspends.
macro_rules! get_char ( ($me:expr, $input:expr) => (
    unwrap_or_return!($me.get_char($input), ProcessResult::Suspend)
));
665 | |
// Like `get_char!`, but does not consume the character; suspends when
// no input is available.
macro_rules! peek ( ($me:expr, $input:expr) => (
    unwrap_or_return!($me.peek($input), ProcessResult::Suspend)
));
669 | |
// Pop one character in `$set` or a run of characters outside it;
// suspends when no input is available.
macro_rules! pop_except_from ( ($me:expr, $input:expr, $set:expr) => (
    unwrap_or_return!($me.pop_except_from($input, $set), ProcessResult::Suspend)
));
673 | |
// ASCII case-insensitive lookahead match against `$pat`; suspends when
// there is not yet enough input to decide.
macro_rules! eat ( ($me:expr, $input:expr, $pat:expr) => (
    unwrap_or_return!($me.eat($input, $pat, u8::eq_ignore_ascii_case), ProcessResult::Suspend)
));
677 | |
// Case-sensitive variant of `eat!`.
macro_rules! eat_exact ( ($me:expr, $input:expr, $pat:expr) => (
    unwrap_or_return!($me.eat($input, $pat, u8::eq), ProcessResult::Suspend)
));
681 | |
682 | impl<Sink: TokenSink> Tokenizer<Sink> { |
683 | // Run the state machine for a while. |
684 | // Return true if we should be immediately re-invoked |
685 | // (this just simplifies control flow vs. break / continue). |
686 | #[allow (clippy::never_loop)] |
687 | fn step(&mut self, input: &mut BufferQueue) -> ProcessResult<Sink::Handle> { |
688 | if self.char_ref_tokenizer.is_some() { |
689 | return self.step_char_ref_tokenizer(input); |
690 | } |
691 | |
692 | trace!("processing in state {:?}" , self.state); |
693 | match self.state { |
694 | //§ data-state |
695 | states::Data => loop { |
696 | match pop_except_from!(self, input, small_char_set!(' \r' ' \0' '&' '<' ' \n' )) { |
697 | FromSet(' \0' ) => go!(self: error; emit ' \0' ), |
698 | FromSet('&' ) => go!(self: consume_char_ref), |
699 | FromSet('<' ) => go!(self: to TagOpen), |
700 | FromSet(c) => go!(self: emit c), |
701 | NotFromSet(b) => self.emit_chars(b), |
702 | } |
703 | }, |
704 | |
705 | //§ rcdata-state |
706 | states::RawData(Rcdata) => loop { |
707 | match pop_except_from!(self, input, small_char_set!(' \r' ' \0' '&' '<' ' \n' )) { |
708 | FromSet(' \0' ) => go!(self: error; emit ' \u{fffd}' ), |
709 | FromSet('&' ) => go!(self: consume_char_ref), |
710 | FromSet('<' ) => go!(self: to RawLessThanSign Rcdata), |
711 | FromSet(c) => go!(self: emit c), |
712 | NotFromSet(b) => self.emit_chars(b), |
713 | } |
714 | }, |
715 | |
716 | //§ rawtext-state |
717 | states::RawData(Rawtext) => loop { |
718 | match pop_except_from!(self, input, small_char_set!(' \r' ' \0' '<' ' \n' )) { |
719 | FromSet(' \0' ) => go!(self: error; emit ' \u{fffd}' ), |
720 | FromSet('<' ) => go!(self: to RawLessThanSign Rawtext), |
721 | FromSet(c) => go!(self: emit c), |
722 | NotFromSet(b) => self.emit_chars(b), |
723 | } |
724 | }, |
725 | |
726 | //§ script-data-state |
727 | states::RawData(ScriptData) => loop { |
728 | match pop_except_from!(self, input, small_char_set!(' \r' ' \0' '<' ' \n' )) { |
729 | FromSet(' \0' ) => go!(self: error; emit ' \u{fffd}' ), |
730 | FromSet('<' ) => go!(self: to RawLessThanSign ScriptData), |
731 | FromSet(c) => go!(self: emit c), |
732 | NotFromSet(b) => self.emit_chars(b), |
733 | } |
734 | }, |
735 | |
736 | //§ script-data-escaped-state |
737 | states::RawData(ScriptDataEscaped(Escaped)) => loop { |
738 | match pop_except_from!(self, input, small_char_set!(' \r' ' \0' '-' '<' ' \n' )) { |
739 | FromSet(' \0' ) => go!(self: error; emit ' \u{fffd}' ), |
740 | FromSet('-' ) => go!(self: emit '-' ; to ScriptDataEscapedDash Escaped), |
741 | FromSet('<' ) => go!(self: to RawLessThanSign ScriptDataEscaped Escaped), |
742 | FromSet(c) => go!(self: emit c), |
743 | NotFromSet(b) => self.emit_chars(b), |
744 | } |
745 | }, |
746 | |
747 | //§ script-data-double-escaped-state |
748 | states::RawData(ScriptDataEscaped(DoubleEscaped)) => loop { |
749 | match pop_except_from!(self, input, small_char_set!(' \r' ' \0' '-' '<' ' \n' )) { |
750 | FromSet(' \0' ) => go!(self: error; emit ' \u{fffd}' ), |
751 | FromSet('-' ) => go!(self: emit '-' ; to ScriptDataEscapedDash DoubleEscaped), |
752 | FromSet('<' ) => { |
753 | go!(self: emit '<' ; to RawLessThanSign ScriptDataEscaped DoubleEscaped) |
754 | }, |
755 | FromSet(c) => go!(self: emit c), |
756 | NotFromSet(b) => self.emit_chars(b), |
757 | } |
758 | }, |
759 | |
760 | //§ plaintext-state |
761 | states::Plaintext => loop { |
762 | match pop_except_from!(self, input, small_char_set!(' \r' ' \0' ' \n' )) { |
763 | FromSet(' \0' ) => go!(self: error; emit ' \u{fffd}' ), |
764 | FromSet(c) => go!(self: emit c), |
765 | NotFromSet(b) => self.emit_chars(b), |
766 | } |
767 | }, |
768 | |
769 | //§ tag-open-state |
770 | states::TagOpen => loop { |
771 | match get_char!(self, input) { |
772 | '!' => go!(self: clear_temp; to MarkupDeclarationOpen), |
773 | '/' => go!(self: to EndTagOpen), |
774 | '?' => go!(self: error; clear_comment; push_comment '?' ; to BogusComment), |
775 | c => match lower_ascii_letter(c) { |
776 | Some(cl) => go!(self: create_tag StartTag cl; to TagName), |
777 | None => go!(self: error; emit '<' ; reconsume Data), |
778 | }, |
779 | } |
780 | }, |
781 | |
782 | //§ end-tag-open-state |
783 | states::EndTagOpen => loop { |
784 | match get_char!(self, input) { |
785 | '>' => go!(self: error; to Data), |
786 | ' \0' => { |
787 | go!(self: error; clear_comment; push_comment ' \u{fffd}' ; to BogusComment) |
788 | }, |
789 | c => match lower_ascii_letter(c) { |
790 | Some(cl) => go!(self: create_tag EndTag cl; to TagName), |
791 | None => go!(self: error; clear_comment; push_comment c; to BogusComment), |
792 | }, |
793 | } |
794 | }, |
795 | |
796 | //§ tag-name-state |
797 | states::TagName => loop { |
798 | match get_char!(self, input) { |
799 | ' \t' | ' \n' | ' \x0C' | ' ' => go!(self: to BeforeAttributeName), |
800 | '/' => go!(self: to SelfClosingStartTag), |
801 | '>' => go!(self: emit_tag Data), |
802 | ' \0' => go!(self: error; push_tag ' \u{fffd}' ), |
803 | c => go!(self: push_tag (c.to_ascii_lowercase())), |
804 | } |
805 | }, |
806 | |
807 | //§ script-data-escaped-less-than-sign-state |
808 | states::RawLessThanSign(ScriptDataEscaped(Escaped)) => loop { |
809 | match get_char!(self, input) { |
810 | '/' => go!(self: clear_temp; to RawEndTagOpen ScriptDataEscaped Escaped), |
811 | c => match lower_ascii_letter(c) { |
812 | Some(cl) => go!(self: clear_temp; push_temp cl; emit '<' ; emit c; |
813 | to ScriptDataEscapeStart DoubleEscaped), |
814 | None => go!(self: emit '<' ; reconsume RawData ScriptDataEscaped Escaped), |
815 | }, |
816 | } |
817 | }, |
818 | |
819 | //§ script-data-double-escaped-less-than-sign-state |
820 | states::RawLessThanSign(ScriptDataEscaped(DoubleEscaped)) => loop { |
821 | match get_char!(self, input) { |
822 | '/' => go!(self: clear_temp; emit '/' ; to ScriptDataDoubleEscapeEnd), |
823 | _ => go!(self: reconsume RawData ScriptDataEscaped DoubleEscaped), |
824 | } |
825 | }, |
826 | |
827 | //§ rcdata-less-than-sign-state rawtext-less-than-sign-state script-data-less-than-sign-state |
828 | // otherwise |
829 | states::RawLessThanSign(kind) => loop { |
830 | match get_char!(self, input) { |
831 | '/' => go!(self: clear_temp; to RawEndTagOpen kind), |
832 | '!' if kind == ScriptData => { |
833 | go!(self: emit '<' ; emit '!' ; to ScriptDataEscapeStart Escaped) |
834 | }, |
835 | _ => go!(self: emit '<' ; reconsume RawData kind), |
836 | } |
837 | }, |
838 | |
839 | //§ rcdata-end-tag-open-state rawtext-end-tag-open-state script-data-end-tag-open-state script-data-escaped-end-tag-open-state |
840 | states::RawEndTagOpen(kind) => loop { |
841 | let c = get_char!(self, input); |
842 | match lower_ascii_letter(c) { |
843 | Some(cl) => go!(self: create_tag EndTag cl; push_temp c; to RawEndTagName kind), |
844 | None => go!(self: emit '<' ; emit '/' ; reconsume RawData kind), |
845 | } |
846 | }, |
847 | |
848 | //§ rcdata-end-tag-name-state rawtext-end-tag-name-state script-data-end-tag-name-state script-data-escaped-end-tag-name-state |
849 | states::RawEndTagName(kind) => loop { |
850 | let c = get_char!(self, input); |
851 | if self.have_appropriate_end_tag() { |
852 | match c { |
853 | ' \t' | ' \n' | ' \x0C' | ' ' => go!(self: to BeforeAttributeName), |
854 | '/' => go!(self: to SelfClosingStartTag), |
855 | '>' => go!(self: emit_tag Data), |
856 | _ => (), |
857 | } |
858 | } |
859 | |
860 | match lower_ascii_letter(c) { |
861 | Some(cl) => go!(self: push_tag cl; push_temp c), |
862 | None => { |
863 | go!(self: discard_tag; emit '<' ; emit '/' ; emit_temp; reconsume RawData kind) |
864 | }, |
865 | } |
866 | }, |
867 | |
868 | //§ script-data-double-escape-start-state |
869 | states::ScriptDataEscapeStart(DoubleEscaped) => loop { |
870 | let c = get_char!(self, input); |
871 | match c { |
872 | ' \t' | ' \n' | ' \x0C' | ' ' | '/' | '>' => { |
873 | let esc = if &*self.temp_buf == "script" { |
874 | DoubleEscaped |
875 | } else { |
876 | Escaped |
877 | }; |
878 | go!(self: emit c; to RawData ScriptDataEscaped esc); |
879 | }, |
880 | _ => match lower_ascii_letter(c) { |
881 | Some(cl) => go!(self: push_temp cl; emit c), |
882 | None => go!(self: reconsume RawData ScriptDataEscaped Escaped), |
883 | }, |
884 | } |
885 | }, |
886 | |
887 | //§ script-data-escape-start-state |
888 | states::ScriptDataEscapeStart(Escaped) => loop { |
889 | match get_char!(self, input) { |
890 | '-' => go!(self: emit '-' ; to ScriptDataEscapeStartDash), |
891 | _ => go!(self: reconsume RawData ScriptData), |
892 | } |
893 | }, |
894 | |
895 | //§ script-data-escape-start-dash-state |
896 | states::ScriptDataEscapeStartDash => loop { |
897 | match get_char!(self, input) { |
898 | '-' => go!(self: emit '-' ; to ScriptDataEscapedDashDash Escaped), |
899 | _ => go!(self: reconsume RawData ScriptData), |
900 | } |
901 | }, |
902 | |
903 | //§ script-data-escaped-dash-state script-data-double-escaped-dash-state |
904 | states::ScriptDataEscapedDash(kind) => loop { |
905 | match get_char!(self, input) { |
906 | '-' => go!(self: emit '-' ; to ScriptDataEscapedDashDash kind), |
907 | '<' => { |
908 | if kind == DoubleEscaped { |
909 | go!(self: emit '<' ); |
910 | } |
911 | go!(self: to RawLessThanSign ScriptDataEscaped kind); |
912 | }, |
913 | ' \0' => go!(self: error; emit ' \u{fffd}' ; to RawData ScriptDataEscaped kind), |
914 | c => go!(self: emit c; to RawData ScriptDataEscaped kind), |
915 | } |
916 | }, |
917 | |
918 | //§ script-data-escaped-dash-dash-state script-data-double-escaped-dash-dash-state |
919 | states::ScriptDataEscapedDashDash(kind) => loop { |
920 | match get_char!(self, input) { |
921 | '-' => go!(self: emit '-' ), |
922 | '<' => { |
923 | if kind == DoubleEscaped { |
924 | go!(self: emit '<' ); |
925 | } |
926 | go!(self: to RawLessThanSign ScriptDataEscaped kind); |
927 | }, |
928 | '>' => go!(self: emit '>' ; to RawData ScriptData), |
929 | ' \0' => go!(self: error; emit ' \u{fffd}' ; to RawData ScriptDataEscaped kind), |
930 | c => go!(self: emit c; to RawData ScriptDataEscaped kind), |
931 | } |
932 | }, |
933 | |
934 | //§ script-data-double-escape-end-state |
935 | states::ScriptDataDoubleEscapeEnd => loop { |
936 | let c = get_char!(self, input); |
937 | match c { |
938 | ' \t' | ' \n' | ' \x0C' | ' ' | '/' | '>' => { |
939 | let esc = if &*self.temp_buf == "script" { |
940 | Escaped |
941 | } else { |
942 | DoubleEscaped |
943 | }; |
944 | go!(self: emit c; to RawData ScriptDataEscaped esc); |
945 | }, |
946 | _ => match lower_ascii_letter(c) { |
947 | Some(cl) => go!(self: push_temp cl; emit c), |
948 | None => go!(self: reconsume RawData ScriptDataEscaped DoubleEscaped), |
949 | }, |
950 | } |
951 | }, |
952 | |
953 | //§ before-attribute-name-state |
954 | states::BeforeAttributeName => loop { |
955 | match get_char!(self, input) { |
956 | ' \t' | ' \n' | ' \x0C' | ' ' => (), |
957 | '/' => go!(self: to SelfClosingStartTag), |
958 | '>' => go!(self: emit_tag Data), |
959 | ' \0' => go!(self: error; create_attr ' \u{fffd}' ; to AttributeName), |
960 | c => match lower_ascii_letter(c) { |
961 | Some(cl) => go!(self: create_attr cl; to AttributeName), |
962 | None => { |
963 | go_match!(self: c, |
964 | '"' , ' \'' , '<' , '=' => error); |
965 | go!(self: create_attr c; to AttributeName); |
966 | }, |
967 | }, |
968 | } |
969 | }, |
970 | |
971 | //§ attribute-name-state |
972 | states::AttributeName => loop { |
973 | match get_char!(self, input) { |
974 | ' \t' | ' \n' | ' \x0C' | ' ' => go!(self: to AfterAttributeName), |
975 | '/' => go!(self: to SelfClosingStartTag), |
976 | '=' => go!(self: to BeforeAttributeValue), |
977 | '>' => go!(self: emit_tag Data), |
978 | ' \0' => go!(self: error; push_name ' \u{fffd}' ), |
979 | c => match lower_ascii_letter(c) { |
980 | Some(cl) => go!(self: push_name cl), |
981 | None => { |
982 | go_match!(self: c, |
983 | '"' , ' \'' , '<' => error); |
984 | go!(self: push_name c); |
985 | }, |
986 | }, |
987 | } |
988 | }, |
989 | |
990 | //§ after-attribute-name-state |
991 | states::AfterAttributeName => loop { |
992 | match get_char!(self, input) { |
993 | ' \t' | ' \n' | ' \x0C' | ' ' => (), |
994 | '/' => go!(self: to SelfClosingStartTag), |
995 | '=' => go!(self: to BeforeAttributeValue), |
996 | '>' => go!(self: emit_tag Data), |
997 | ' \0' => go!(self: error; create_attr ' \u{fffd}' ; to AttributeName), |
998 | c => match lower_ascii_letter(c) { |
999 | Some(cl) => go!(self: create_attr cl; to AttributeName), |
1000 | None => { |
1001 | go_match!(self: c, |
1002 | '"' , ' \'' , '<' => error); |
1003 | go!(self: create_attr c; to AttributeName); |
1004 | }, |
1005 | }, |
1006 | } |
1007 | }, |
1008 | |
1009 | //§ before-attribute-value-state |
1010 | // Use peek so we can handle the first attr character along with the rest, |
1011 | // hopefully in the same zero-copy buffer. |
1012 | states::BeforeAttributeValue => loop { |
1013 | match peek!(self, input) { |
1014 | ' \t' | ' \n' | ' \r' | ' \x0C' | ' ' => go!(self: discard_char input), |
1015 | '"' => go!(self: discard_char input; to AttributeValue DoubleQuoted), |
1016 | ' \'' => go!(self: discard_char input; to AttributeValue SingleQuoted), |
1017 | ' \0' => { |
1018 | go!(self: discard_char input; error; push_value ' \u{fffd}' ; to AttributeValue Unquoted) |
1019 | }, |
1020 | '>' => go!(self: discard_char input; error; emit_tag Data), |
1021 | _ => go!(self: to AttributeValue Unquoted), |
1022 | } |
1023 | }, |
1024 | |
1025 | //§ attribute-value-(double-quoted)-state |
1026 | states::AttributeValue(DoubleQuoted) => loop { |
1027 | match pop_except_from!(self, input, small_char_set!(' \r' '"' '&' ' \0' ' \n' )) { |
1028 | FromSet('"' ) => go!(self: to AfterAttributeValueQuoted), |
1029 | FromSet('&' ) => go!(self: consume_char_ref '"' ), |
1030 | FromSet(' \0' ) => go!(self: error; push_value ' \u{fffd}' ), |
1031 | FromSet(c) => go!(self: push_value c), |
1032 | NotFromSet(ref b) => go!(self: append_value b), |
1033 | } |
1034 | }, |
1035 | |
1036 | //§ attribute-value-(single-quoted)-state |
1037 | states::AttributeValue(SingleQuoted) => loop { |
1038 | match pop_except_from!(self, input, small_char_set!(' \r' ' \'' '&' ' \0' ' \n' )) { |
1039 | FromSet(' \'' ) => go!(self: to AfterAttributeValueQuoted), |
1040 | FromSet('&' ) => go!(self: consume_char_ref ' \'' ), |
1041 | FromSet(' \0' ) => go!(self: error; push_value ' \u{fffd}' ), |
1042 | FromSet(c) => go!(self: push_value c), |
1043 | NotFromSet(ref b) => go!(self: append_value b), |
1044 | } |
1045 | }, |
1046 | |
1047 | //§ attribute-value-(unquoted)-state |
1048 | states::AttributeValue(Unquoted) => loop { |
1049 | match pop_except_from!( |
1050 | self, |
1051 | input, |
1052 | small_char_set!(' \r' ' \t' ' \n' ' \x0C' ' ' '&' '>' ' \0' ) |
1053 | ) { |
1054 | FromSet(' \t' ) | FromSet(' \n' ) | FromSet(' \x0C' ) | FromSet(' ' ) => { |
1055 | go!(self: to BeforeAttributeName) |
1056 | }, |
1057 | FromSet('&' ) => go!(self: consume_char_ref '>' ), |
1058 | FromSet('>' ) => go!(self: emit_tag Data), |
1059 | FromSet(' \0' ) => go!(self: error; push_value ' \u{fffd}' ), |
1060 | FromSet(c) => { |
1061 | go_match!(self: c, |
1062 | '"' , ' \'' , '<' , '=' , '`' => error); |
1063 | go!(self: push_value c); |
1064 | }, |
1065 | NotFromSet(ref b) => go!(self: append_value b), |
1066 | } |
1067 | }, |
1068 | |
1069 | //§ after-attribute-value-(quoted)-state |
1070 | states::AfterAttributeValueQuoted => loop { |
1071 | match get_char!(self, input) { |
1072 | ' \t' | ' \n' | ' \x0C' | ' ' => go!(self: to BeforeAttributeName), |
1073 | '/' => go!(self: to SelfClosingStartTag), |
1074 | '>' => go!(self: emit_tag Data), |
1075 | _ => go!(self: error; reconsume BeforeAttributeName), |
1076 | } |
1077 | }, |
1078 | |
1079 | //§ self-closing-start-tag-state |
1080 | states::SelfClosingStartTag => loop { |
1081 | match get_char!(self, input) { |
1082 | '>' => { |
1083 | self.current_tag_self_closing = true; |
1084 | go!(self: emit_tag Data); |
1085 | }, |
1086 | _ => go!(self: error; reconsume BeforeAttributeName), |
1087 | } |
1088 | }, |
1089 | |
1090 | //§ comment-start-state |
1091 | states::CommentStart => loop { |
1092 | match get_char!(self, input) { |
1093 | '-' => go!(self: to CommentStartDash), |
1094 | ' \0' => go!(self: error; push_comment ' \u{fffd}' ; to Comment), |
1095 | '>' => go!(self: error; emit_comment; to Data), |
1096 | c => go!(self: push_comment c; to Comment), |
1097 | } |
1098 | }, |
1099 | |
1100 | //§ comment-start-dash-state |
1101 | states::CommentStartDash => loop { |
1102 | match get_char!(self, input) { |
1103 | '-' => go!(self: to CommentEnd), |
1104 | ' \0' => go!(self: error; append_comment "- \u{fffd}" ; to Comment), |
1105 | '>' => go!(self: error; emit_comment; to Data), |
1106 | c => go!(self: push_comment '-' ; push_comment c; to Comment), |
1107 | } |
1108 | }, |
1109 | |
1110 | //§ comment-state |
1111 | states::Comment => loop { |
1112 | match get_char!(self, input) { |
1113 | '-' => go!(self: to CommentEndDash), |
1114 | ' \0' => go!(self: error; push_comment ' \u{fffd}' ), |
1115 | c => go!(self: push_comment c), |
1116 | } |
1117 | }, |
1118 | |
1119 | //§ comment-end-dash-state |
1120 | states::CommentEndDash => loop { |
1121 | match get_char!(self, input) { |
1122 | '-' => go!(self: to CommentEnd), |
1123 | ' \0' => go!(self: error; append_comment "- \u{fffd}" ; to Comment), |
1124 | c => go!(self: push_comment '-' ; push_comment c; to Comment), |
1125 | } |
1126 | }, |
1127 | |
1128 | //§ comment-end-state |
1129 | states::CommentEnd => loop { |
1130 | match get_char!(self, input) { |
1131 | '>' => go!(self: emit_comment; to Data), |
1132 | ' \0' => go!(self: error; append_comment "-- \u{fffd}" ; to Comment), |
1133 | '!' => go!(self: error; to CommentEndBang), |
1134 | '-' => go!(self: error; push_comment '-' ), |
1135 | c => go!(self: error; append_comment "--" ; push_comment c; to Comment), |
1136 | } |
1137 | }, |
1138 | |
1139 | //§ comment-end-bang-state |
1140 | states::CommentEndBang => loop { |
1141 | match get_char!(self, input) { |
1142 | '-' => go!(self: append_comment "--!" ; to CommentEndDash), |
1143 | '>' => go!(self: emit_comment; to Data), |
1144 | ' \0' => go!(self: error; append_comment "--! \u{fffd}" ; to Comment), |
1145 | c => go!(self: append_comment "--!" ; push_comment c; to Comment), |
1146 | } |
1147 | }, |
1148 | |
1149 | //§ doctype-state |
1150 | states::Doctype => loop { |
1151 | match get_char!(self, input) { |
1152 | ' \t' | ' \n' | ' \x0C' | ' ' => go!(self: to BeforeDoctypeName), |
1153 | _ => go!(self: error; reconsume BeforeDoctypeName), |
1154 | } |
1155 | }, |
1156 | |
1157 | //§ before-doctype-name-state |
1158 | states::BeforeDoctypeName => loop { |
1159 | match get_char!(self, input) { |
1160 | ' \t' | ' \n' | ' \x0C' | ' ' => (), |
1161 | ' \0' => { |
1162 | go!(self: error; create_doctype; push_doctype_name ' \u{fffd}' ; to DoctypeName) |
1163 | }, |
1164 | '>' => go!(self: error; create_doctype; force_quirks; emit_doctype; to Data), |
1165 | c => go!(self: create_doctype; push_doctype_name (c.to_ascii_lowercase()); |
1166 | to DoctypeName), |
1167 | } |
1168 | }, |
1169 | |
1170 | //§ doctype-name-state |
1171 | states::DoctypeName => loop { |
1172 | match get_char!(self, input) { |
1173 | ' \t' | ' \n' | ' \x0C' | ' ' => go!(self: clear_temp; to AfterDoctypeName), |
1174 | '>' => go!(self: emit_doctype; to Data), |
1175 | ' \0' => go!(self: error; push_doctype_name ' \u{fffd}' ), |
1176 | c => go!(self: push_doctype_name (c.to_ascii_lowercase())), |
1177 | } |
1178 | }, |
1179 | |
1180 | //§ after-doctype-name-state |
1181 | states::AfterDoctypeName => loop { |
1182 | if eat!(self, input, "public" ) { |
1183 | go!(self: to AfterDoctypeKeyword Public); |
1184 | } else if eat!(self, input, "system" ) { |
1185 | go!(self: to AfterDoctypeKeyword System); |
1186 | } else { |
1187 | match get_char!(self, input) { |
1188 | ' \t' | ' \n' | ' \x0C' | ' ' => (), |
1189 | '>' => go!(self: emit_doctype; to Data), |
1190 | _ => go!(self: error; force_quirks; to BogusDoctype), |
1191 | } |
1192 | } |
1193 | }, |
1194 | |
1195 | //§ after-doctype-public-keyword-state after-doctype-system-keyword-state |
1196 | states::AfterDoctypeKeyword(kind) => loop { |
1197 | match get_char!(self, input) { |
1198 | ' \t' | ' \n' | ' \x0C' | ' ' => go!(self: to BeforeDoctypeIdentifier kind), |
1199 | '"' => { |
1200 | go!(self: error; clear_doctype_id kind; to DoctypeIdentifierDoubleQuoted kind) |
1201 | }, |
1202 | ' \'' => { |
1203 | go!(self: error; clear_doctype_id kind; to DoctypeIdentifierSingleQuoted kind) |
1204 | }, |
1205 | '>' => go!(self: error; force_quirks; emit_doctype; to Data), |
1206 | _ => go!(self: error; force_quirks; to BogusDoctype), |
1207 | } |
1208 | }, |
1209 | |
1210 | //§ before-doctype-public-identifier-state before-doctype-system-identifier-state |
1211 | states::BeforeDoctypeIdentifier(kind) => loop { |
1212 | match get_char!(self, input) { |
1213 | ' \t' | ' \n' | ' \x0C' | ' ' => (), |
1214 | '"' => go!(self: clear_doctype_id kind; to DoctypeIdentifierDoubleQuoted kind), |
1215 | ' \'' => go!(self: clear_doctype_id kind; to DoctypeIdentifierSingleQuoted kind), |
1216 | '>' => go!(self: error; force_quirks; emit_doctype; to Data), |
1217 | _ => go!(self: error; force_quirks; to BogusDoctype), |
1218 | } |
1219 | }, |
1220 | |
1221 | //§ doctype-public-identifier-(double-quoted)-state doctype-system-identifier-(double-quoted)-state |
1222 | states::DoctypeIdentifierDoubleQuoted(kind) => loop { |
1223 | match get_char!(self, input) { |
1224 | '"' => go!(self: to AfterDoctypeIdentifier kind), |
1225 | ' \0' => go!(self: error; push_doctype_id kind ' \u{fffd}' ), |
1226 | '>' => go!(self: error; force_quirks; emit_doctype; to Data), |
1227 | c => go!(self: push_doctype_id kind c), |
1228 | } |
1229 | }, |
1230 | |
1231 | //§ doctype-public-identifier-(single-quoted)-state doctype-system-identifier-(single-quoted)-state |
1232 | states::DoctypeIdentifierSingleQuoted(kind) => loop { |
1233 | match get_char!(self, input) { |
1234 | ' \'' => go!(self: to AfterDoctypeIdentifier kind), |
1235 | ' \0' => go!(self: error; push_doctype_id kind ' \u{fffd}' ), |
1236 | '>' => go!(self: error; force_quirks; emit_doctype; to Data), |
1237 | c => go!(self: push_doctype_id kind c), |
1238 | } |
1239 | }, |
1240 | |
1241 | //§ after-doctype-public-identifier-state |
1242 | states::AfterDoctypeIdentifier(Public) => loop { |
1243 | match get_char!(self, input) { |
1244 | ' \t' | ' \n' | ' \x0C' | ' ' => { |
1245 | go!(self: to BetweenDoctypePublicAndSystemIdentifiers) |
1246 | }, |
1247 | '>' => go!(self: emit_doctype; to Data), |
1248 | '"' => { |
1249 | go!(self: error; clear_doctype_id System; to DoctypeIdentifierDoubleQuoted System) |
1250 | }, |
1251 | ' \'' => { |
1252 | go!(self: error; clear_doctype_id System; to DoctypeIdentifierSingleQuoted System) |
1253 | }, |
1254 | _ => go!(self: error; force_quirks; to BogusDoctype), |
1255 | } |
1256 | }, |
1257 | |
1258 | //§ after-doctype-system-identifier-state |
1259 | states::AfterDoctypeIdentifier(System) => loop { |
1260 | match get_char!(self, input) { |
1261 | ' \t' | ' \n' | ' \x0C' | ' ' => (), |
1262 | '>' => go!(self: emit_doctype; to Data), |
1263 | _ => go!(self: error; to BogusDoctype), |
1264 | } |
1265 | }, |
1266 | |
1267 | //§ between-doctype-public-and-system-identifiers-state |
1268 | states::BetweenDoctypePublicAndSystemIdentifiers => loop { |
1269 | match get_char!(self, input) { |
1270 | ' \t' | ' \n' | ' \x0C' | ' ' => (), |
1271 | '>' => go!(self: emit_doctype; to Data), |
1272 | '"' => { |
1273 | go!(self: clear_doctype_id System; to DoctypeIdentifierDoubleQuoted System) |
1274 | }, |
1275 | ' \'' => { |
1276 | go!(self: clear_doctype_id System; to DoctypeIdentifierSingleQuoted System) |
1277 | }, |
1278 | _ => go!(self: error; force_quirks; to BogusDoctype), |
1279 | } |
1280 | }, |
1281 | |
1282 | //§ bogus-doctype-state |
1283 | states::BogusDoctype => loop { |
1284 | match get_char!(self, input) { |
1285 | '>' => go!(self: emit_doctype; to Data), |
1286 | _ => (), |
1287 | } |
1288 | }, |
1289 | |
1290 | //§ bogus-comment-state |
1291 | states::BogusComment => loop { |
1292 | match get_char!(self, input) { |
1293 | '>' => go!(self: emit_comment; to Data), |
1294 | ' \0' => go!(self: push_comment ' \u{fffd}' ), |
1295 | c => go!(self: push_comment c), |
1296 | } |
1297 | }, |
1298 | |
1299 | //§ markup-declaration-open-state |
1300 | states::MarkupDeclarationOpen => loop { |
1301 | if eat_exact!(self, input, "--" ) { |
1302 | go!(self: clear_comment; to CommentStart); |
1303 | } else if eat!(self, input, "doctype" ) { |
1304 | go!(self: to Doctype); |
1305 | } else { |
1306 | if self |
1307 | .sink |
1308 | .adjusted_current_node_present_but_not_in_html_namespace() |
1309 | { |
1310 | if eat_exact!(self, input, "[CDATA[" ) { |
1311 | go!(self: clear_temp; to CdataSection); |
1312 | } |
1313 | } |
1314 | go!(self: error; to BogusComment); |
1315 | } |
1316 | }, |
1317 | |
1318 | //§ cdata-section-state |
1319 | states::CdataSection => loop { |
1320 | match get_char!(self, input) { |
1321 | ']' => go!(self: to CdataSectionBracket), |
1322 | ' \0' => go!(self: emit_temp; emit ' \0' ), |
1323 | c => go!(self: push_temp c), |
1324 | } |
1325 | }, |
1326 | |
1327 | //§ cdata-section-bracket |
1328 | states::CdataSectionBracket => match get_char!(self, input) { |
1329 | ']' => go!(self: to CdataSectionEnd), |
1330 | _ => go!(self: push_temp ']' ; reconsume CdataSection), |
1331 | }, |
1332 | |
1333 | //§ cdata-section-end |
1334 | states::CdataSectionEnd => loop { |
1335 | match get_char!(self, input) { |
1336 | ']' => go!(self: push_temp ']' ), |
1337 | '>' => go!(self: emit_temp; to Data), |
1338 | _ => go!(self: push_temp ']' ; push_temp ']' ; reconsume CdataSection), |
1339 | } |
1340 | }, |
1341 | //§ END |
1342 | } |
1343 | } |
1344 | |
1345 | fn step_char_ref_tokenizer(&mut self, input: &mut BufferQueue) -> ProcessResult<Sink::Handle> { |
1346 | // FIXME HACK: Take and replace the tokenizer so we don't |
1347 | // double-mut-borrow self. This is why it's boxed. |
1348 | let mut tok = self.char_ref_tokenizer.take().unwrap(); |
1349 | let outcome = tok.step(self, input); |
1350 | |
1351 | let progress = match outcome { |
1352 | char_ref::Done => { |
1353 | self.process_char_ref(tok.get_result()); |
1354 | return ProcessResult::Continue; |
1355 | }, |
1356 | |
1357 | char_ref::Stuck => ProcessResult::Suspend, |
1358 | char_ref::Progress => ProcessResult::Continue, |
1359 | }; |
1360 | |
1361 | self.char_ref_tokenizer = Some(tok); |
1362 | progress |
1363 | } |
1364 | |
1365 | fn process_char_ref(&mut self, char_ref: CharRef) { |
1366 | let CharRef { |
1367 | mut chars, |
1368 | mut num_chars, |
1369 | } = char_ref; |
1370 | |
1371 | if num_chars == 0 { |
1372 | chars[0] = '&' ; |
1373 | num_chars = 1; |
1374 | } |
1375 | |
1376 | for i in 0..num_chars { |
1377 | let c = chars[i as usize]; |
1378 | match self.state { |
1379 | states::Data | states::RawData(states::Rcdata) => go!(self: emit c), |
1380 | |
1381 | states::AttributeValue(_) => go!(self: push_value c), |
1382 | |
1383 | _ => panic!( |
1384 | "state {:?} should not be reachable in process_char_ref" , |
1385 | self.state |
1386 | ), |
1387 | } |
1388 | } |
1389 | } |
1390 | |
1391 | /// Indicate that we have reached the end of the input. |
1392 | pub fn end(&mut self) { |
1393 | // Handle EOF in the char ref sub-tokenizer, if there is one. |
1394 | // Do this first because it might un-consume stuff. |
1395 | let mut input = BufferQueue::new(); |
1396 | match self.char_ref_tokenizer.take() { |
1397 | None => (), |
1398 | Some(mut tok) => { |
1399 | tok.end_of_file(self, &mut input); |
1400 | self.process_char_ref(tok.get_result()); |
1401 | }, |
1402 | } |
1403 | |
1404 | // Process all remaining buffered input. |
1405 | // If we're waiting for lookahead, we're not gonna get it. |
1406 | self.at_eof = true; |
1407 | assert!(matches!(self.run(&mut input), TokenizerResult::Done)); |
1408 | assert!(input.is_empty()); |
1409 | |
1410 | loop { |
1411 | match self.eof_step() { |
1412 | ProcessResult::Continue => (), |
1413 | ProcessResult::Suspend => break, |
1414 | ProcessResult::Script(_) => unreachable!(), |
1415 | } |
1416 | } |
1417 | |
1418 | self.sink.end(); |
1419 | |
1420 | if self.opts.profile { |
1421 | self.dump_profile(); |
1422 | } |
1423 | } |
1424 | |
1425 | fn dump_profile(&self) { |
1426 | let mut results: Vec<(states::State, u64)> = |
1427 | self.state_profile.iter().map(|(s, t)| (*s, *t)).collect(); |
1428 | results.sort_by(|&(_, x), &(_, y)| y.cmp(&x)); |
1429 | |
1430 | let total: u64 = results |
1431 | .iter() |
1432 | .map(|&(_, t)| t) |
1433 | .fold(0, ::std::ops::Add::add); |
1434 | println!(" \nTokenizer profile, in nanoseconds" ); |
1435 | println!(" \n{:12} total in token sink" , self.time_in_sink); |
1436 | println!(" \n{:12} total in tokenizer" , total); |
1437 | |
1438 | for (k, v) in results.into_iter() { |
1439 | let pct = 100.0 * (v as f64) / (total as f64); |
1440 | println!(" {:12} {:4.1}% {:?}" , v, pct, k); |
1441 | } |
1442 | } |
1443 | |
    /// Handle end-of-file for the current state.
    ///
    /// Called in a loop once all buffered input has been processed: each
    /// call performs the EOF action for `self.state` (emitting any pending
    /// partial token, reporting errors, switching state) until a state's
    /// action suspends processing.
    fn eof_step(&mut self) -> ProcessResult<Sink::Handle> {
        debug!("processing EOF in state {:?}" , self.state);
        match self.state {
            // Plain character-data states: nothing partial is buffered,
            // so just signal end-of-file.
            states::Data |
            states::RawData(Rcdata) |
            states::RawData(Rawtext) |
            states::RawData(ScriptData) |
            states::Plaintext => go!(self: eof),

            // EOF in the middle of a tag or attribute: parse error; the
            // partial tag is dropped and we return to the data state.
            states::TagName |
            states::RawData(ScriptDataEscaped(_)) |
            states::BeforeAttributeName |
            states::AttributeName |
            states::AfterAttributeName |
            states::BeforeAttributeValue |
            states::AttributeValue(_) |
            states::AfterAttributeValueQuoted |
            states::SelfClosingStartTag |
            states::ScriptDataEscapedDash(_) |
            states::ScriptDataEscapedDashDash(_) => go!(self: error_eof; to Data),

            // A lone '<' (or '<' '/') was consumed; re-emit it as text.
            states::TagOpen => go!(self: error_eof; emit '<' ; to Data),

            states::EndTagOpen => go!(self: error_eof; emit '<' ; emit '/' ; to Data),

            // In double-escaped script data the '<' was already emitted
            // on entry, so only the state changes here.
            states::RawLessThanSign(ScriptDataEscaped(DoubleEscaped)) => {
                go!(self: to RawData ScriptDataEscaped DoubleEscaped)
            },

            states::RawLessThanSign(kind) => go!(self: emit '<' ; to RawData kind),

            states::RawEndTagOpen(kind) => go!(self: emit '<' ; emit '/' ; to RawData kind),

            // Flush the partially matched end-tag name held in temp_buf.
            states::RawEndTagName(kind) => {
                go!(self: emit '<' ; emit '/' ; emit_temp; to RawData kind)
            },

            states::ScriptDataEscapeStart(kind) => go!(self: to RawData ScriptDataEscaped kind),

            states::ScriptDataEscapeStartDash => go!(self: to RawData ScriptData),

            states::ScriptDataDoubleEscapeEnd => {
                go!(self: to RawData ScriptDataEscaped DoubleEscaped)
            },

            // EOF inside a comment: parse error, but the partial comment
            // collected so far is still emitted.
            states::CommentStart |
            states::CommentStartDash |
            states::Comment |
            states::CommentEndDash |
            states::CommentEnd |
            states::CommentEndBang => go!(self: error_eof; emit_comment; to Data),

            // EOF before any doctype name: emit an empty force-quirks doctype.
            states::Doctype | states::BeforeDoctypeName => {
                go!(self: error_eof; create_doctype; force_quirks; emit_doctype; to Data)
            },

            // EOF later in a doctype: force quirks on the doctype already
            // under construction and emit it.
            states::DoctypeName |
            states::AfterDoctypeName |
            states::AfterDoctypeKeyword(_) |
            states::BeforeDoctypeIdentifier(_) |
            states::DoctypeIdentifierDoubleQuoted(_) |
            states::DoctypeIdentifierSingleQuoted(_) |
            states::AfterDoctypeIdentifier(_) |
            states::BetweenDoctypePublicAndSystemIdentifiers => {
                go!(self: error_eof; force_quirks; emit_doctype; to Data)
            },

            // Bogus constructs are emitted as-is, without an EOF error.
            states::BogusDoctype => go!(self: emit_doctype; to Data),

            states::BogusComment => go!(self: emit_comment; to Data),

            states::MarkupDeclarationOpen => go!(self: error; to BogusComment),

            // Flush buffered CDATA text before reporting the EOF error.
            states::CdataSection => go!(self: emit_temp; error_eof; to Data),

            // Partially seen "]]>" terminator: the ']' characters were
            // ordinary content after all, so push them back into temp_buf.
            states::CdataSectionBracket => go!(self: push_temp ']' ; to CdataSection),

            states::CdataSectionEnd => go!(self: push_temp ']' ; push_temp ']' ; to CdataSection),
        }
    }
1524 | } |
1525 | |
#[cfg(test)]
#[allow(non_snake_case)]
mod test {
    use super::option_push; // private items
    use crate::tendril::{SliceExt, StrTendril};

    use super::{TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts};

    use super::interface::{CharacterTokens, EOFToken, NullCharacterToken, ParseError};
    use super::interface::{EndTag, StartTag, Tag, TagKind};
    use super::interface::{TagToken, Token};

    use markup5ever::buffer_queue::BufferQueue;
    use std::mem::replace;

    use crate::LocalName;

    // LinesMatch implements the TokenSink trait. It is used for testing to see
    // if current_line is being updated when process_token is called. The lines
    // vector is a collection of the line numbers that each token is on.
    struct LinesMatch {
        tokens: Vec<Token>,
        current_str: StrTendril,
        lines: Vec<(Token, u64)>,
    }

    impl LinesMatch {
        fn new() -> LinesMatch {
            LinesMatch {
                tokens: vec![],
                current_str: StrTendril::new(),
                lines: vec![],
            }
        }

        // Record a token with the line number it was seen on, flushing any
        // buffered character data first.
        fn push(&mut self, token: Token, line_number: u64) {
            self.finish_str();
            self.lines.push((token, line_number));
        }

        // Collapse buffered characters into a single CharacterTokens token.
        fn finish_str(&mut self) {
            if !self.current_str.is_empty() {
                let s = replace(&mut self.current_str, StrTendril::new());
                self.tokens.push(CharacterTokens(s));
            }
        }
    }

    impl TokenSink for LinesMatch {
        type Handle = ();

        fn process_token(
            &mut self,
            token: Token,
            line_number: u64,
        ) -> TokenSinkResult<Self::Handle> {
            match token {
                CharacterTokens(b) => {
                    self.current_str.push_slice(&b);
                },

                NullCharacterToken => {
                    self.current_str.push_char('\0');
                },

                ParseError(_) => {
                    panic!("unexpected parse error");
                },

                TagToken(mut t) => {
                    // The spec seems to indicate that one can emit
                    // erroneous end tags with attrs, but the test
                    // cases don't contain them.
                    match t.kind {
                        EndTag => {
                            t.self_closing = false;
                            t.attrs = vec![];
                        },
                        _ => t.attrs.sort_by(|a1, a2| a1.name.cmp(&a2.name)),
                    }
                    self.push(TagToken(t), line_number);
                },

                EOFToken => (),

                _ => self.push(token, line_number),
            }
            TokenSinkResult::Continue
        }
    }

    // Take in tokens, process them, and return vector with line
    // numbers that each token is on
    fn tokenize(input: Vec<StrTendril>, opts: TokenizerOpts) -> Vec<(Token, u64)> {
        let sink = LinesMatch::new();
        let mut tok = Tokenizer::new(sink, opts);
        let mut buffer = BufferQueue::new();
        for chunk in input.into_iter() {
            buffer.push_back(chunk);
            let _ = tok.feed(&mut buffer);
        }
        tok.end();
        tok.sink.lines
    }

    // Create a tag token
    fn create_tag(token: StrTendril, tagkind: TagKind) -> Token {
        TagToken(Tag {
            kind: tagkind,
            name: LocalName::from(&*token),
            self_closing: false,
            attrs: vec![],
        })
    }

    // Shared options for the line-tracking tests below.
    fn line_test_opts() -> TokenizerOpts {
        TokenizerOpts {
            exact_errors: false,
            discard_bom: true,
            profile: false,
            initial_state: None,
            last_start_tag_name: None,
        }
    }

    #[test]
    fn push_to_None_gives_singleton() {
        let mut s: Option<StrTendril> = None;
        option_push(&mut s, 'x');
        assert_eq!(s, Some("x".to_tendril()));
    }

    #[test]
    fn push_to_empty_appends() {
        let mut s: Option<StrTendril> = Some(StrTendril::new());
        option_push(&mut s, 'x');
        assert_eq!(s, Some("x".to_tendril()));
    }

    #[test]
    fn push_to_nonempty_appends() {
        let mut s: Option<StrTendril> = Some(StrTendril::from_slice("y"));
        option_push(&mut s, 'x');
        assert_eq!(s, Some("yx".to_tendril()));
    }

    #[test]
    fn check_lines() {
        let vector = vec![
            StrTendril::from("<a>\n"),
            StrTendril::from("<b>\n"),
            StrTendril::from("</b>\n"),
            StrTendril::from("</a>\n"),
        ];
        let expected = vec![
            (create_tag(StrTendril::from("a"), StartTag), 1),
            (create_tag(StrTendril::from("b"), StartTag), 2),
            (create_tag(StrTendril::from("b"), EndTag), 3),
            (create_tag(StrTendril::from("a"), EndTag), 4),
        ];
        let results = tokenize(vector, line_test_opts());
        assert_eq!(results, expected);
    }

    #[test]
    fn check_lines_with_new_line() {
        let vector = vec![
            StrTendril::from("<a>\r\n"),
            StrTendril::from("<b>\r\n"),
            StrTendril::from("</b>\r\n"),
            StrTendril::from("</a>\r\n"),
        ];
        let expected = vec![
            (create_tag(StrTendril::from("a"), StartTag), 1),
            (create_tag(StrTendril::from("b"), StartTag), 2),
            (create_tag(StrTendril::from("b"), EndTag), 3),
            (create_tag(StrTendril::from("a"), EndTag), 4),
        ];
        let results = tokenize(vector, line_test_opts());
        assert_eq!(results, expected);
    }
}
1714 | |