1// Copyright 2014-2017 The html5ever Project Developers. See the
2// COPYRIGHT file at the top-level directory of this distribution.
3//
4// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7// option. This file may not be copied, modified, or distributed
8// except according to those terms.
9
10use super::{TokenSink, Tokenizer};
11use crate::buffer_queue::BufferQueue;
12use crate::data;
13use crate::tendril::StrTendril;
14
15use log::debug;
16use mac::format_if;
17use std::borrow::Cow::Borrowed;
18use std::char::from_u32;
19
20use self::State::*;
21pub use self::Status::*;
22
//§ tokenizing-character-references
/// The outcome of tokenizing a single character reference.
///
/// A reference resolves to zero, one, or two characters; `num_chars` says how
/// many leading entries of `chars` are meaningful.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct CharRef {
    /// The resulting character(s)
    pub chars: [char; 2],

    /// How many slots in `chars` are valid?
    pub num_chars: u8,
}
31
/// What a single call to `CharRefTokenizer::step` accomplished.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Status {
    /// No input character was available, so the tokenizer could not advance.
    Stuck,
    /// The tokenizer advanced but has not produced a result yet; step again.
    Progress,
    /// A result has been stored and can be retrieved with `get_result`.
    Done,
}
37
/// Internal states of the character reference state machine. Each variant
/// has a matching `do_*` handler that `CharRefTokenizer::step` dispatches to.
#[derive(Debug)]
enum State {
    /// Initial state: no character of the reference has been examined yet.
    Begin,
    /// A `#` has been consumed; the next character decides between a
    /// hexadecimal (`x` / `X`) and a decimal numeric reference.
    Octothorpe,
    /// Reading the digits of a numeric reference. The payload is the radix
    /// used to parse them (10 or 16).
    Numeric(u32), // base
    /// The digits are finished; an optional terminating `;` is expected.
    NumericSemicolon,
    /// Accumulating the name of a named reference and matching it against
    /// the named entity table.
    Named,
    /// The name cannot match any entity; alphanumerics are still consumed
    /// to find out whether a `;` follows (which makes it a parse error).
    BogusName,
}
47
/// State machine that tokenizes exactly one character reference.
///
/// Drive it with `step` until it reports `Done` (or call `end_of_file` when
/// the input is exhausted), then consume it with `get_result`.
pub struct CharRefTokenizer {
    /// Current state of the state machine.
    state: State,
    /// The "additional allowed character", if any. Its presence is also
    /// taken to mean that the reference occurs inside an attribute value.
    addnl_allowed: Option<char>,
    /// The final result; `Some` once tokenization has finished.
    result: Option<CharRef>,

    /// Value accumulated so far from the digits of a numeric reference.
    num: u32,
    /// Set once the accumulated value has exceeded 0x10FFFF; from then on
    /// `num` may have wrapped and must not be used as a code point.
    num_too_big: bool,
    /// Whether at least one digit of a numeric reference has been consumed.
    seen_digit: bool,
    /// The `x` or `X` consumed after `#`, kept so it can be pushed back onto
    /// the input if no digits follow.
    hex_marker: Option<char>,

    /// Characters consumed so far for a named reference. `None` until the
    /// `Named` state is entered and again after the name has been unconsumed.
    name_buf_opt: Option<StrTendril>,
    /// The most recent full match found in the named entity table, as a
    /// pair of code points (the second is 0 for single-character entities).
    name_match: Option<(u32, u32)>,
    /// Byte length of the name buffer at the time `name_match` was recorded.
    name_len: usize,
}
62
63impl CharRefTokenizer {
64 // NB: We assume that we have an additional allowed character iff we're
65 // tokenizing in an attribute value.
66 pub fn new(addnl_allowed: Option<char>) -> CharRefTokenizer {
67 CharRefTokenizer {
68 state: Begin,
69 addnl_allowed,
70 result: None,
71 num: 0,
72 num_too_big: false,
73 seen_digit: false,
74 hex_marker: None,
75 name_buf_opt: None,
76 name_match: None,
77 name_len: 0,
78 }
79 }
80
81 // A CharRefTokenizer can only tokenize one character reference,
82 // so this method consumes the tokenizer.
83 pub fn get_result(self) -> CharRef {
84 self.result.expect("get_result called before done")
85 }
86
87 fn name_buf(&self) -> &StrTendril {
88 self.name_buf_opt
89 .as_ref()
90 .expect("name_buf missing in named character reference")
91 }
92
93 fn name_buf_mut(&mut self) -> &mut StrTendril {
94 self.name_buf_opt
95 .as_mut()
96 .expect("name_buf missing in named character reference")
97 }
98
99 fn finish_none(&mut self) -> Status {
100 self.result = Some(CharRef {
101 chars: ['\0', '\0'],
102 num_chars: 0,
103 });
104 Done
105 }
106
107 fn finish_one(&mut self, c: char) -> Status {
108 self.result = Some(CharRef {
109 chars: [c, '\0'],
110 num_chars: 1,
111 });
112 Done
113 }
114}
115
116impl CharRefTokenizer {
117 pub fn step<Sink: TokenSink>(
118 &mut self,
119 tokenizer: &mut Tokenizer<Sink>,
120 input: &mut BufferQueue,
121 ) -> Status {
122 if self.result.is_some() {
123 return Done;
124 }
125
126 debug!("char ref tokenizer stepping in state {:?}", self.state);
127 match self.state {
128 Begin => self.do_begin(tokenizer, input),
129 Octothorpe => self.do_octothorpe(tokenizer, input),
130 Numeric(base) => self.do_numeric(tokenizer, input, base),
131 NumericSemicolon => self.do_numeric_semicolon(tokenizer, input),
132 Named => self.do_named(tokenizer, input),
133 BogusName => self.do_bogus_name(tokenizer, input),
134 }
135 }
136
137 fn do_begin<Sink: TokenSink>(
138 &mut self,
139 tokenizer: &mut Tokenizer<Sink>,
140 input: &mut BufferQueue,
141 ) -> Status {
142 match unwrap_or_return!(tokenizer.peek(input), Stuck) {
143 '\t' | '\n' | '\x0C' | ' ' | '<' | '&' => self.finish_none(),
144 c if Some(c) == self.addnl_allowed => self.finish_none(),
145
146 '#' => {
147 tokenizer.discard_char(input);
148 self.state = Octothorpe;
149 Progress
150 },
151
152 _ => {
153 self.state = Named;
154 self.name_buf_opt = Some(StrTendril::new());
155 Progress
156 },
157 }
158 }
159
160 fn do_octothorpe<Sink: TokenSink>(
161 &mut self,
162 tokenizer: &mut Tokenizer<Sink>,
163 input: &mut BufferQueue,
164 ) -> Status {
165 let c = unwrap_or_return!(tokenizer.peek(input), Stuck);
166 match c {
167 'x' | 'X' => {
168 tokenizer.discard_char(input);
169 self.hex_marker = Some(c);
170 self.state = Numeric(16);
171 },
172
173 _ => {
174 self.hex_marker = None;
175 self.state = Numeric(10);
176 },
177 }
178 Progress
179 }
180
181 fn do_numeric<Sink: TokenSink>(
182 &mut self,
183 tokenizer: &mut Tokenizer<Sink>,
184 input: &mut BufferQueue,
185 base: u32,
186 ) -> Status {
187 let c = unwrap_or_return!(tokenizer.peek(input), Stuck);
188 match c.to_digit(base) {
189 Some(n) => {
190 tokenizer.discard_char(input);
191 self.num = self.num.wrapping_mul(base);
192 if self.num > 0x10FFFF {
193 // We might overflow, and the character is definitely invalid.
194 // We still parse digits and semicolon, but don't use the result.
195 self.num_too_big = true;
196 }
197 self.num = self.num.wrapping_add(n);
198 self.seen_digit = true;
199 Progress
200 },
201
202 None if !self.seen_digit => self.unconsume_numeric(tokenizer, input),
203
204 None => {
205 self.state = NumericSemicolon;
206 Progress
207 },
208 }
209 }
210
211 fn do_numeric_semicolon<Sink: TokenSink>(
212 &mut self,
213 tokenizer: &mut Tokenizer<Sink>,
214 input: &mut BufferQueue,
215 ) -> Status {
216 match unwrap_or_return!(tokenizer.peek(input), Stuck) {
217 ';' => tokenizer.discard_char(input),
218 _ => tokenizer.emit_error(Borrowed(
219 "Semicolon missing after numeric character reference",
220 )),
221 };
222 self.finish_numeric(tokenizer)
223 }
224
225 fn unconsume_numeric<Sink: TokenSink>(
226 &mut self,
227 tokenizer: &mut Tokenizer<Sink>,
228 input: &mut BufferQueue,
229 ) -> Status {
230 let mut unconsume = StrTendril::from_char('#');
231 match self.hex_marker {
232 Some(c) => unconsume.push_char(c),
233 None => (),
234 }
235
236 input.push_front(unconsume);
237 tokenizer.emit_error(Borrowed("Numeric character reference without digits"));
238 self.finish_none()
239 }
240
241 fn finish_numeric<Sink: TokenSink>(&mut self, tokenizer: &mut Tokenizer<Sink>) -> Status {
242 fn conv(n: u32) -> char {
243 from_u32(n).expect("invalid char missed by error handling cases")
244 }
245
246 let (c, error) = match self.num {
247 n if (n > 0x10FFFF) || self.num_too_big => ('\u{fffd}', true),
248 0x00 | 0xD800..=0xDFFF => ('\u{fffd}', true),
249
250 0x80..=0x9F => match data::C1_REPLACEMENTS[(self.num - 0x80) as usize] {
251 Some(c) => (c, true),
252 None => (conv(self.num), true),
253 },
254
255 0x01..=0x08 | 0x0B | 0x0D..=0x1F | 0x7F | 0xFDD0..=0xFDEF => (conv(self.num), true),
256
257 n if (n & 0xFFFE) == 0xFFFE => (conv(n), true),
258
259 n => (conv(n), false),
260 };
261
262 if error {
263 let msg = format_if!(
264 tokenizer.opts.exact_errors,
265 "Invalid numeric character reference",
266 "Invalid numeric character reference value 0x{:06X}",
267 self.num
268 );
269 tokenizer.emit_error(msg);
270 }
271
272 self.finish_one(c)
273 }
274
275 fn do_named<Sink: TokenSink>(
276 &mut self,
277 tokenizer: &mut Tokenizer<Sink>,
278 input: &mut BufferQueue,
279 ) -> Status {
280 let c = unwrap_or_return!(tokenizer.get_char(input), Stuck);
281 self.name_buf_mut().push_char(c);
282 match data::NAMED_ENTITIES.get(&self.name_buf()[..]) {
283 // We have either a full match or a prefix of one.
284 Some(&m) => {
285 if m.0 != 0 {
286 // We have a full match, but there might be a longer one to come.
287 self.name_match = Some(m);
288 self.name_len = self.name_buf().len();
289 }
290 // Otherwise we just have a prefix match.
291 Progress
292 },
293
294 // Can't continue the match.
295 None => self.finish_named(tokenizer, input, Some(c)),
296 }
297 }
298
299 fn emit_name_error<Sink: TokenSink>(&mut self, tokenizer: &mut Tokenizer<Sink>) {
300 let msg = format_if!(
301 tokenizer.opts.exact_errors,
302 "Invalid character reference",
303 "Invalid character reference &{}",
304 self.name_buf()
305 );
306 tokenizer.emit_error(msg);
307 }
308
309 fn unconsume_name(&mut self, input: &mut BufferQueue) {
310 input.push_front(self.name_buf_opt.take().unwrap());
311 }
312
313 fn finish_named<Sink: TokenSink>(
314 &mut self,
315 tokenizer: &mut Tokenizer<Sink>,
316 input: &mut BufferQueue,
317 end_char: Option<char>,
318 ) -> Status {
319 match self.name_match {
320 None => {
321 match end_char {
322 Some(c) if c.is_ascii_alphanumeric() => {
323 // Keep looking for a semicolon, to determine whether
324 // we emit a parse error.
325 self.state = BogusName;
326 return Progress;
327 },
328
329 // Check length because &; is not a parse error.
330 Some(';') if self.name_buf().len() > 1 => self.emit_name_error(tokenizer),
331
332 _ => (),
333 }
334 self.unconsume_name(input);
335 self.finish_none()
336 },
337
338 Some((c1, c2)) => {
339 // We have a complete match, but we may have consumed
340 // additional characters into self.name_buf. Usually
341 // at least one, but several in cases like
342 //
343 // &not => match for U+00AC
344 // &noti => valid prefix for &notin
345 // &notit => can't continue match
346
347 let name_len = self.name_len;
348 assert!(name_len > 0);
349 let last_matched = self.name_buf()[name_len - 1..].chars().next().unwrap();
350
351 // There might not be a next character after the match, if
352 // we had a full match and then hit EOF.
353 let next_after = if name_len == self.name_buf().len() {
354 None
355 } else {
356 Some(self.name_buf()[name_len..].chars().next().unwrap())
357 };
358
359 // "If the character reference is being consumed as part of an
360 // attribute, and the last character matched is not a U+003B
361 // SEMICOLON character (;), and the next character is either a
362 // U+003D EQUALS SIGN character (=) or an alphanumeric ASCII
363 // character, then, for historical reasons, all the characters
364 // that were matched after the U+0026 AMPERSAND character (&)
365 // must be unconsumed, and nothing is returned. However, if
366 // this next character is in fact a U+003D EQUALS SIGN
367 // character (=), then this is a parse error"
368
369 let unconsume_all = match (self.addnl_allowed, last_matched, next_after) {
370 (_, ';', _) => false,
371 (Some(_), _, Some('=')) => {
372 tokenizer.emit_error(Borrowed(
373 "Equals sign after character reference in attribute",
374 ));
375 true
376 },
377 (Some(_), _, Some(c)) if c.is_ascii_alphanumeric() => true,
378 _ => {
379 tokenizer.emit_error(Borrowed(
380 "Character reference does not end with semicolon",
381 ));
382 false
383 },
384 };
385
386 if unconsume_all {
387 self.unconsume_name(input);
388 self.finish_none()
389 } else {
390 input.push_front(StrTendril::from_slice(&self.name_buf()[name_len..]));
391 self.result = Some(CharRef {
392 chars: [from_u32(c1).unwrap(), from_u32(c2).unwrap()],
393 num_chars: if c2 == 0 { 1 } else { 2 },
394 });
395 Done
396 }
397 },
398 }
399 }
400
401 fn do_bogus_name<Sink: TokenSink>(
402 &mut self,
403 tokenizer: &mut Tokenizer<Sink>,
404 input: &mut BufferQueue,
405 ) -> Status {
406 let c = unwrap_or_return!(tokenizer.get_char(input), Stuck);
407 self.name_buf_mut().push_char(c);
408 match c {
409 _ if c.is_ascii_alphanumeric() => return Progress,
410 ';' => self.emit_name_error(tokenizer),
411 _ => (),
412 }
413 self.unconsume_name(input);
414 self.finish_none()
415 }
416
417 pub fn end_of_file<Sink: TokenSink>(
418 &mut self,
419 tokenizer: &mut Tokenizer<Sink>,
420 input: &mut BufferQueue,
421 ) {
422 while self.result.is_none() {
423 match self.state {
424 Begin => drop(self.finish_none()),
425
426 Numeric(_) if !self.seen_digit => drop(self.unconsume_numeric(tokenizer, input)),
427
428 Numeric(_) | NumericSemicolon => {
429 tokenizer.emit_error(Borrowed("EOF in numeric character reference"));
430 self.finish_numeric(tokenizer);
431 },
432
433 Named => drop(self.finish_named(tokenizer, input, None)),
434
435 BogusName => {
436 self.unconsume_name(input);
437 self.finish_none();
438 },
439
440 Octothorpe => {
441 input.push_front(StrTendril::from_slice("#"));
442 tokenizer.emit_error(Borrowed("EOF after '#' in character reference"));
443 self.finish_none();
444 },
445 }
446 }
447 }
448}
449