1// Copyright 2014-2017 The html5ever Project Developers. See the
2// COPYRIGHT file at the top-level directory of this distribution.
3//
4// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7// option. This file may not be copied, modified, or distributed
8// except according to those terms.
9
10use super::{TokenSink, Tokenizer};
11use crate::buffer_queue::BufferQueue;
12use crate::data;
13use crate::tendril::StrTendril;
14
15use log::debug;
16use mac::format_if;
17use std::borrow::Cow::Borrowed;
18use std::char::from_u32;
19
20use self::State::*;
21pub(super) use self::Status::*;
22
//§ tokenizing-character-references
/// The decoded result of one character reference: zero, one, or two
/// characters (two are needed for some named entities).
pub(super) struct CharRef {
    /// The resulting character(s)
    pub(super) chars: [char; 2],

    /// How many slots in `chars` are valid?
    pub(super) num_chars: u8,
}
31
/// Outcome of a single `step` of the character-reference tokenizer.
pub(super) enum Status {
    /// More input is needed before any progress can be made.
    Stuck,
    /// Some progress was made; call `step` again.
    Progress,
    /// Tokenizing is complete; fetch the result with `get_result`.
    Done,
}
37
/// Internal state of the character-reference state machine.
#[derive(Debug)]
enum State {
    /// Initial state: decide between a named and a numeric reference.
    Begin,
    /// Consumed `#`; waiting to see whether an `x`/`X` hex marker follows.
    Octothorpe,
    /// Consuming digits of a numeric reference in the given base (10 or 16).
    Numeric(u32), // base
    /// All digits consumed; expecting the terminating semicolon.
    NumericSemicolon,
    /// Accumulating a candidate entity name and matching it against the
    /// named-entities table.
    Named,
    /// The name can no longer match any entity; we keep scanning
    /// alphanumerics only to decide whether to emit a parse error.
    BogusName,
}
47
/// Sub-tokenizer that consumes a single character reference.
pub(super) struct CharRefTokenizer {
    /// Current state of the state machine.
    state: State,
    /// Set exactly once when tokenizing finishes; consumed by `get_result`.
    result: Option<CharRef>,
    /// Whether the reference occurs inside an attribute value; this changes
    /// how a partial named match is resolved (see `finish_named`).
    is_consumed_in_attribute: bool,

    /// Accumulated numeric value, built with wrapping arithmetic
    /// (see `num_too_big`).
    num: u32,
    /// True once `num` has exceeded U+10FFFF at any point, so a possibly
    /// wrapped value is never used.
    num_too_big: bool,
    /// Have we consumed at least one digit?
    seen_digit: bool,
    /// The exact `x`/`X` marker consumed, if any, kept so it can be
    /// pushed back verbatim on un-consume.
    hex_marker: Option<char>,

    /// Characters consumed so far toward a named reference.
    name_buf_opt: Option<StrTendril>,
    /// Longest full entity match so far, as up to two code points
    /// (the second is 0 when unused).
    name_match: Option<(u32, u32)>,
    /// Length of `name_buf` at the time of that longest full match.
    name_len: usize,
}
62
63impl CharRefTokenizer {
64 pub(super) fn new(is_consumed_in_attribute: bool) -> CharRefTokenizer {
65 CharRefTokenizer {
66 is_consumed_in_attribute,
67 state: Begin,
68 result: None,
69 num: 0,
70 num_too_big: false,
71 seen_digit: false,
72 hex_marker: None,
73 name_buf_opt: None,
74 name_match: None,
75 name_len: 0,
76 }
77 }
78
79 // A CharRefTokenizer can only tokenize one character reference,
80 // so this method consumes the tokenizer.
81 pub(super) fn get_result(self) -> CharRef {
82 self.result.expect("get_result called before done")
83 }
84
85 fn name_buf(&self) -> &StrTendril {
86 self.name_buf_opt
87 .as_ref()
88 .expect("name_buf missing in named character reference")
89 }
90
91 fn name_buf_mut(&mut self) -> &mut StrTendril {
92 self.name_buf_opt
93 .as_mut()
94 .expect("name_buf missing in named character reference")
95 }
96
97 fn finish_none(&mut self) -> Status {
98 self.result = Some(CharRef {
99 chars: ['\0', '\0'],
100 num_chars: 0,
101 });
102 Done
103 }
104
105 fn finish_one(&mut self, c: char) -> Status {
106 self.result = Some(CharRef {
107 chars: [c, '\0'],
108 num_chars: 1,
109 });
110 Done
111 }
112}
113
/// State-machine driver: `step` advances the tokenizer one state at a time,
/// and `end_of_file` flushes whatever state remains when input runs out.
impl CharRefTokenizer {
    /// Advance the tokenizer by one step. Returns `Stuck` when more input
    /// is needed, `Progress` when work was done, and `Done` once `result`
    /// is set (subsequent calls are no-ops returning `Done`).
    pub(super) fn step<Sink: TokenSink>(
        &mut self,
        tokenizer: &Tokenizer<Sink>,
        input: &BufferQueue,
    ) -> Status {
        if self.result.is_some() {
            return Done;
        }

        debug!("char ref tokenizer stepping in state {:?}", self.state);
        match self.state {
            Begin => self.do_begin(tokenizer, input),
            Octothorpe => self.do_octothorpe(tokenizer, input),
            Numeric(base) => self.do_numeric(tokenizer, input, base),
            NumericSemicolon => self.do_numeric_semicolon(tokenizer, input),
            Named => self.do_named(tokenizer, input),
            BogusName => self.do_bogus_name(tokenizer, input),
        }
    }

    /// Inspect the first character to choose between a named reference,
    /// a numeric reference (`#`), or no reference at all.
    fn do_begin<Sink: TokenSink>(
        &mut self,
        tokenizer: &Tokenizer<Sink>,
        input: &BufferQueue,
    ) -> Status {
        match unwrap_or_return!(tokenizer.peek(input), Stuck) {
            'a'..='z' | 'A'..='Z' | '0'..='9' => {
                // Alphanumeric start: candidate named reference. The
                // character itself is consumed later in `do_named`.
                self.state = Named;
                self.name_buf_opt = Some(StrTendril::new());
                Progress
            },

            '#' => {
                tokenizer.discard_char(input);
                self.state = Octothorpe;
                Progress
            },
            _ => self.finish_none(),
        }
    }

    /// After `#`: consume an optional `x`/`X` hex marker and pick the base.
    fn do_octothorpe<Sink: TokenSink>(
        &mut self,
        tokenizer: &Tokenizer<Sink>,
        input: &BufferQueue,
    ) -> Status {
        let c = unwrap_or_return!(tokenizer.peek(input), Stuck);
        match c {
            'x' | 'X' => {
                tokenizer.discard_char(input);
                // Remember the exact marker so it can be un-consumed verbatim.
                self.hex_marker = Some(c);
                self.state = Numeric(16);
            },

            _ => {
                self.hex_marker = None;
                self.state = Numeric(10);
            },
        }
        Progress
    }

    /// Consume one digit (in `base`) of a numeric reference, or move on
    /// when a non-digit is seen.
    fn do_numeric<Sink: TokenSink>(
        &mut self,
        tokenizer: &Tokenizer<Sink>,
        input: &BufferQueue,
        base: u32,
    ) -> Status {
        let c = unwrap_or_return!(tokenizer.peek(input), Stuck);
        match c.to_digit(base) {
            Some(n) => {
                tokenizer.discard_char(input);
                self.num = self.num.wrapping_mul(base);
                if self.num > 0x10FFFF {
                    // We might overflow, and the character is definitely invalid.
                    // We still parse digits and semicolon, but don't use the result.
                    self.num_too_big = true;
                }
                self.num = self.num.wrapping_add(n);
                self.seen_digit = true;
                Progress
            },

            // Non-digit with no digits consumed: not a numeric reference
            // after all — put `#` (and the hex marker) back.
            None if !self.seen_digit => self.unconsume_numeric(tokenizer, input),

            None => {
                self.state = NumericSemicolon;
                Progress
            },
        }
    }

    /// Expect the terminating `;` after the digits; a missing semicolon is
    /// a parse error but the reference is still produced.
    fn do_numeric_semicolon<Sink: TokenSink>(
        &mut self,
        tokenizer: &Tokenizer<Sink>,
        input: &BufferQueue,
    ) -> Status {
        match unwrap_or_return!(tokenizer.peek(input), Stuck) {
            ';' => tokenizer.discard_char(input),
            _ => tokenizer.emit_error(Borrowed(
                "Semicolon missing after numeric character reference",
            )),
        };
        self.finish_numeric(tokenizer)
    }

    /// Abort a digit-less numeric reference: push `#` (and any hex marker)
    /// back onto the input, report the error, and produce no characters.
    fn unconsume_numeric<Sink: TokenSink>(
        &mut self,
        tokenizer: &Tokenizer<Sink>,
        input: &BufferQueue,
    ) -> Status {
        let mut unconsume = StrTendril::from_char('#');
        if let Some(c) = self.hex_marker {
            unconsume.push_char(c)
        }

        input.push_front(unconsume);
        tokenizer.emit_error(Borrowed("Numeric character reference without digits"));
        self.finish_none()
    }

    /// Map the accumulated code-point value to the final character,
    /// emitting a parse error for the invalid/disallowed ranges.
    fn finish_numeric<Sink: TokenSink>(&mut self, tokenizer: &Tokenizer<Sink>) -> Status {
        fn conv(n: u32) -> char {
            from_u32(n).expect("invalid char missed by error handling cases")
        }

        let (c, error) = match self.num {
            // Out of Unicode range (or overflowed): replacement character.
            n if (n > 0x10FFFF) || self.num_too_big => ('\u{fffd}', true),
            // NUL and surrogates: replacement character.
            0x00 | 0xD800..=0xDFFF => ('\u{fffd}', true),

            // C1 control range: remapped through the replacement table
            // when an entry exists, kept as-is otherwise; error either way.
            0x80..=0x9F => match data::C1_REPLACEMENTS[(self.num - 0x80) as usize] {
                Some(c) => (c, true),
                None => (conv(self.num), true),
            },

            // Control characters and the U+FDD0..U+FDEF noncharacter block:
            // the character is kept, but it is a parse error.
            0x01..=0x08 | 0x0B | 0x0D..=0x1F | 0x7F | 0xFDD0..=0xFDEF => (conv(self.num), true),

            // Code points ending in FFFE/FFFF on any plane are noncharacters.
            n if (n & 0xFFFE) == 0xFFFE => (conv(n), true),

            n => (conv(n), false),
        };

        if error {
            let msg = format_if!(
                tokenizer.opts.exact_errors,
                "Invalid numeric character reference",
                "Invalid numeric character reference value 0x{:06X}",
                self.num
            );
            tokenizer.emit_error(msg);
        }

        self.finish_one(c)
    }

    /// Consume one more character of a named reference and extend the
    /// match against the entity table.
    fn do_named<Sink: TokenSink>(
        &mut self,
        tokenizer: &Tokenizer<Sink>,
        input: &BufferQueue,
    ) -> Status {
        // peek + discard skips over newline normalization, therefore making it easier to
        // un-consume
        let c = unwrap_or_return!(tokenizer.peek(input), Stuck);
        tokenizer.discard_char(input);
        self.name_buf_mut().push_char(c);
        match data::NAMED_ENTITIES.get(&self.name_buf()[..]) {
            // We have either a full match or a prefix of one.
            Some(&m) => {
                if m.0 != 0 {
                    // We have a full match, but there might be a longer one to come.
                    self.name_match = Some(m);
                    self.name_len = self.name_buf().len();
                }
                // Otherwise we just have a prefix match.
                Progress
            },

            // Can't continue the match.
            None => self.finish_named(tokenizer, input, Some(c)),
        }
    }

    /// Emit the "invalid named reference" parse error for `name_buf`.
    fn emit_name_error<Sink: TokenSink>(&mut self, tokenizer: &Tokenizer<Sink>) {
        let msg = format_if!(
            tokenizer.opts.exact_errors,
            "Invalid character reference",
            "Invalid character reference &{}",
            self.name_buf()
        );
        tokenizer.emit_error(msg);
    }

    /// Push the entire accumulated name back onto the input queue.
    fn unconsume_name(&mut self, input: &BufferQueue) {
        input.push_front(self.name_buf_opt.take().unwrap());
    }

    /// Resolve a named reference once the match can no longer be extended.
    /// `end_char` is the character that ended the match (already pushed
    /// onto `name_buf`), or `None` at end of file.
    fn finish_named<Sink: TokenSink>(
        &mut self,
        tokenizer: &Tokenizer<Sink>,
        input: &BufferQueue,
        end_char: Option<char>,
    ) -> Status {
        match self.name_match {
            None => {
                match end_char {
                    Some(c) if c.is_ascii_alphanumeric() => {
                        // Keep looking for a semicolon, to determine whether
                        // we emit a parse error.
                        self.state = BogusName;
                        return Progress;
                    },

                    // Check length because &; is not a parse error.
                    Some(';') if self.name_buf().len() > 1 => self.emit_name_error(tokenizer),

                    _ => (),
                }
                self.unconsume_name(input);
                self.finish_none()
            },

            Some((c1, c2)) => {
                // We have a complete match, but we may have consumed
                // additional characters into self.name_buf. Usually
                // at least one, but several in cases like
                //
                // &not => match for U+00AC
                // &noti => valid prefix for &notin
                // &notit => can't continue match

                let name_len = self.name_len;
                assert!(name_len > 0);
                let last_matched = self.name_buf()[name_len - 1..].chars().next().unwrap();

                // There might not be a next character after the match, if
                // we had a full match and then hit EOF.
                let next_after = if name_len == self.name_buf().len() {
                    None
                } else {
                    Some(self.name_buf()[name_len..].chars().next().unwrap())
                };

                // If the character reference was consumed as part of an attribute, and the last
                // character matched is not a U+003B SEMICOLON character (;), and the next input
                // character is either a U+003D EQUALS SIGN character (=) or an ASCII alphanumeric,
                // then, for historical reasons, flush code points consumed as a character
                // reference and switch to the return state.

                let unconsume_all = match (self.is_consumed_in_attribute, last_matched, next_after)
                {
                    (_, ';', _) => false,
                    (true, _, Some('=')) => true,
                    (true, _, Some(c)) if c.is_ascii_alphanumeric() => true,
                    _ => {
                        // 1. If the last character matched is not a U+003B SEMICOLON character
                        // (;), then this is a missing-semicolon-after-character-reference parse
                        // error.
                        tokenizer.emit_error(Borrowed(
                            "Character reference does not end with semicolon",
                        ));
                        false
                    },
                };

                if unconsume_all {
                    self.unconsume_name(input);
                    self.finish_none()
                } else {
                    // Push back only the characters consumed past the match.
                    input.push_front(StrTendril::from_slice(&self.name_buf()[name_len..]));
                    tokenizer.ignore_lf.set(false);
                    self.result = Some(CharRef {
                        chars: [from_u32(c1).unwrap(), from_u32(c2).unwrap()],
                        num_chars: if c2 == 0 { 1 } else { 2 },
                    });
                    Done
                }
            },
        }
    }

    /// Scan a run of alphanumerics that cannot match any entity; a final
    /// `;` makes it a parse error, then everything is un-consumed.
    fn do_bogus_name<Sink: TokenSink>(
        &mut self,
        tokenizer: &Tokenizer<Sink>,
        input: &BufferQueue,
    ) -> Status {
        // peek + discard skips over newline normalization, therefore making it easier to
        // un-consume
        let c = unwrap_or_return!(tokenizer.peek(input), Stuck);
        tokenizer.discard_char(input);
        self.name_buf_mut().push_char(c);
        match c {
            _ if c.is_ascii_alphanumeric() => return Progress,
            ';' => self.emit_name_error(tokenizer),
            _ => (),
        }
        self.unconsume_name(input);
        self.finish_none()
    }

    /// Flush the current state at end of input, looping until a result
    /// has been produced. `drop(..)` discards the returned `Status`; the
    /// loop re-checks `self.result` instead.
    pub(super) fn end_of_file<Sink: TokenSink>(
        &mut self,
        tokenizer: &Tokenizer<Sink>,
        input: &BufferQueue,
    ) {
        while self.result.is_none() {
            match self.state {
                Begin => drop(self.finish_none()),

                Numeric(_) if !self.seen_digit => drop(self.unconsume_numeric(tokenizer, input)),

                Numeric(_) | NumericSemicolon => {
                    tokenizer.emit_error(Borrowed("EOF in numeric character reference"));
                    self.finish_numeric(tokenizer);
                },

                Named => drop(self.finish_named(tokenizer, input, None)),

                BogusName => {
                    self.unconsume_name(input);
                    self.finish_none();
                },

                Octothorpe => {
                    input.push_front(StrTendril::from_slice("#"));
                    tokenizer.emit_error(Borrowed("EOF after '#' in character reference"));
                    self.finish_none();
                },
            }
        }
    }
}
446