// Copyright 2014-2017 The html5ever Project Developers. See the
// COPYRIGHT file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

use super::{TokenSink, Tokenizer};
use crate::buffer_queue::BufferQueue;
use crate::data;
use crate::tendril::StrTendril;

use log::debug;
use mac::format_if;
use std::borrow::Cow::Borrowed;
use std::char::from_u32;

use self::State::*;
pub(super) use self::Status::*;

//§ tokenizing-character-references
pub(super) struct CharRef {
    /// The resulting character(s)
    pub(super) chars: [char; 2],

    /// How many slots in `chars` are valid?
    pub(super) num_chars: u8,
}

pub(super) enum Status {
    /// Waiting for more input; no progress could be made.
    Stuck,
    /// Made progress; `step` should be called again.
    Progress,
    /// Finished; the result is available via `get_result`.
    Done,
}

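// A rough sketch of the transitions implemented by the handlers below:
// `Begin` switches to `Named` (on an alphanumeric) or, after a `#`, to
// `Octothorpe`, which picks `Numeric(16)` or `Numeric(10)`; digits are
// accumulated until a non-digit moves to `NumericSemicolon`. A name that
// stops matching the entity table without a complete match falls through
// to `BogusName`.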
#[derive(Debug)]
enum State {
    Begin,
    Octothorpe,
    Numeric(u32), // base
    NumericSemicolon,
    Named,
    BogusName,
}

pub(super) struct CharRefTokenizer {
    state: State,
    result: Option<CharRef>,
    is_consumed_in_attribute: bool,

    num: u32,
    num_too_big: bool,
    seen_digit: bool,
    hex_marker: Option<char>,

    name_buf_opt: Option<StrTendril>,
    name_match: Option<(u32, u32)>,
    name_len: usize,
}

impl CharRefTokenizer {
    pub(super) fn new(is_consumed_in_attribute: bool) -> CharRefTokenizer {
        CharRefTokenizer {
            is_consumed_in_attribute,
            state: Begin,
            result: None,
            num: 0,
            num_too_big: false,
            seen_digit: false,
            hex_marker: None,
            name_buf_opt: None,
            name_match: None,
            name_len: 0,
        }
    }

    // A CharRefTokenizer can only tokenize one character reference,
    // so this method consumes the tokenizer.
    pub(super) fn get_result(self) -> CharRef {
        self.result.expect("get_result called before done")
    }

    fn name_buf(&self) -> &StrTendril {
        self.name_buf_opt
            .as_ref()
            .expect("name_buf missing in named character reference")
    }

    fn name_buf_mut(&mut self) -> &mut StrTendril {
        self.name_buf_opt
            .as_mut()
            .expect("name_buf missing in named character reference")
    }

    fn finish_none(&mut self) -> Status {
        self.result = Some(CharRef {
            chars: ['\0', '\0'],
            num_chars: 0,
        });
        Done
    }

    fn finish_one(&mut self, c: char) -> Status {
        self.result = Some(CharRef {
            chars: [c, '\0'],
            num_chars: 1,
        });
        Done
    }
}

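// This sub-tokenizer is driven by repeated calls to `step` until it reports
// `Done` (or `Stuck`, meaning more input has to arrive first), after which
// `get_result` yields the parsed reference; `end_of_file` below flushes any
// remaining state when the input ends.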
impl CharRefTokenizer {
    pub(super) fn step<Sink: TokenSink>(
        &mut self,
        tokenizer: &Tokenizer<Sink>,
        input: &BufferQueue,
    ) -> Status {
        if self.result.is_some() {
            return Done;
        }

        debug!("char ref tokenizer stepping in state {:?}", self.state);
        match self.state {
            Begin => self.do_begin(tokenizer, input),
            Octothorpe => self.do_octothorpe(tokenizer, input),
            Numeric(base) => self.do_numeric(tokenizer, input, base),
            NumericSemicolon => self.do_numeric_semicolon(tokenizer, input),
            Named => self.do_named(tokenizer, input),
            BogusName => self.do_bogus_name(tokenizer, input),
        }
    }

    fn do_begin<Sink: TokenSink>(
        &mut self,
        tokenizer: &Tokenizer<Sink>,
        input: &BufferQueue,
    ) -> Status {
        match unwrap_or_return!(tokenizer.peek(input), Stuck) {
            'a'..='z' | 'A'..='Z' | '0'..='9' => {
                self.state = Named;
                self.name_buf_opt = Some(StrTendril::new());
                Progress
            },

            '#' => {
                tokenizer.discard_char(input);
                self.state = Octothorpe;
                Progress
            },
            _ => self.finish_none(),
        }
    }

    fn do_octothorpe<Sink: TokenSink>(
        &mut self,
        tokenizer: &Tokenizer<Sink>,
        input: &BufferQueue,
    ) -> Status {
        let c = unwrap_or_return!(tokenizer.peek(input), Stuck);
        match c {
            'x' | 'X' => {
                tokenizer.discard_char(input);
                self.hex_marker = Some(c);
                self.state = Numeric(16);
            },

            _ => {
                self.hex_marker = None;
                self.state = Numeric(10);
            },
        }
        Progress
    }

    fn do_numeric<Sink: TokenSink>(
        &mut self,
        tokenizer: &Tokenizer<Sink>,
        input: &BufferQueue,
        base: u32,
    ) -> Status {
        let c = unwrap_or_return!(tokenizer.peek(input), Stuck);
        match c.to_digit(base) {
            Some(n) => {
                tokenizer.discard_char(input);
                self.num = self.num.wrapping_mul(base);
                if self.num > 0x10FFFF {
                    // We might overflow, and the character is definitely invalid.
                    // We still parse digits and semicolon, but don't use the result.
                    self.num_too_big = true;
                }
                self.num = self.num.wrapping_add(n);
                self.seen_digit = true;
                Progress
            },

            None if !self.seen_digit => self.unconsume_numeric(tokenizer, input),

            None => {
                self.state = NumericSemicolon;
                Progress
            },
        }
    }

    fn do_numeric_semicolon<Sink: TokenSink>(
        &mut self,
        tokenizer: &Tokenizer<Sink>,
        input: &BufferQueue,
    ) -> Status {
        match unwrap_or_return!(tokenizer.peek(input), Stuck) {
            ';' => tokenizer.discard_char(input),
            _ => tokenizer.emit_error(Borrowed(
                "Semicolon missing after numeric character reference",
            )),
        };
        self.finish_numeric(tokenizer)
    }

    fn unconsume_numeric<Sink: TokenSink>(
        &mut self,
        tokenizer: &Tokenizer<Sink>,
        input: &BufferQueue,
    ) -> Status {
        let mut unconsume = StrTendril::from_char('#');
        if let Some(c) = self.hex_marker {
            unconsume.push_char(c)
        }

        input.push_front(unconsume);
        tokenizer.emit_error(Borrowed("Numeric character reference without digits"));
        self.finish_none()
    }

    fn finish_numeric<Sink: TokenSink>(&mut self, tokenizer: &Tokenizer<Sink>) -> Status {
        fn conv(n: u32) -> char {
            from_u32(n).expect("invalid char missed by error handling cases")
        }

        let (c, error) = match self.num {
            // Out of Unicode range (or the accumulator overflowed).
            n if (n > 0x10FFFF) || self.num_too_big => ('\u{fffd}', true),
            // Null or a surrogate code point.
            0x00 | 0xD800..=0xDFFF => ('\u{fffd}', true),

            // C1 controls, remapped where a replacement is defined.
            0x80..=0x9F => match data::C1_REPLACEMENTS[(self.num - 0x80) as usize] {
                Some(c) => (c, true),
                None => (conv(self.num), true),
            },

            // Other control characters and noncharacters: parse error, but keep the character.
            0x01..=0x08 | 0x0B | 0x0D..=0x1F | 0x7F | 0xFDD0..=0xFDEF => (conv(self.num), true),

            // Code points ending in FFFE or FFFF are also noncharacters.
            n if (n & 0xFFFE) == 0xFFFE => (conv(n), true),

            n => (conv(n), false),
        };

        if error {
            let msg = format_if!(
                tokenizer.opts.exact_errors,
                "Invalid numeric character reference",
                "Invalid numeric character reference value 0x{:06X}",
                self.num
            );
            tokenizer.emit_error(msg);
        }

        self.finish_one(c)
    }

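    // Named references are matched greedily: each consumed character extends
    // `name_buf`, which is looked up in `NAMED_ENTITIES`. A prefix match keeps
    // going, a complete match is recorded in `name_match`/`name_len` (a longer
    // one may still follow), and a failed lookup hands off to `finish_named`,
    // which backtracks to the longest complete match seen, if any.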
    fn do_named<Sink: TokenSink>(
        &mut self,
        tokenizer: &Tokenizer<Sink>,
        input: &BufferQueue,
    ) -> Status {
        // peek + discard skips over newline normalization, therefore making it easier to
        // un-consume
        let c = unwrap_or_return!(tokenizer.peek(input), Stuck);
        tokenizer.discard_char(input);
        self.name_buf_mut().push_char(c);
        match data::NAMED_ENTITIES.get(&self.name_buf()[..]) {
            // We have either a full match or a prefix of one.
            Some(&m) => {
                if m.0 != 0 {
                    // We have a full match, but there might be a longer one to come.
                    self.name_match = Some(m);
                    self.name_len = self.name_buf().len();
                }
                // Otherwise we just have a prefix match.
                Progress
            },

            // Can't continue the match.
            None => self.finish_named(tokenizer, input, Some(c)),
        }
    }

    fn emit_name_error<Sink: TokenSink>(&mut self, tokenizer: &Tokenizer<Sink>) {
        let msg = format_if!(
            tokenizer.opts.exact_errors,
            "Invalid character reference",
            "Invalid character reference &{}",
            self.name_buf()
        );
        tokenizer.emit_error(msg);
    }

    fn unconsume_name(&mut self, input: &BufferQueue) {
        input.push_front(self.name_buf_opt.take().unwrap());
    }

    fn finish_named<Sink: TokenSink>(
        &mut self,
        tokenizer: &Tokenizer<Sink>,
        input: &BufferQueue,
        end_char: Option<char>,
    ) -> Status {
        match self.name_match {
            None => {
                match end_char {
                    Some(c) if c.is_ascii_alphanumeric() => {
                        // Keep looking for a semicolon, to determine whether
                        // we emit a parse error.
                        self.state = BogusName;
                        return Progress;
                    },

                    // Check length because &; is not a parse error.
                    Some(';') if self.name_buf().len() > 1 => self.emit_name_error(tokenizer),

                    _ => (),
                }
                self.unconsume_name(input);
                self.finish_none()
            },

            Some((c1, c2)) => {
                // We have a complete match, but we may have consumed
                // additional characters into self.name_buf. Usually
                // at least one, but several in cases like
                //
                //     &not    => match for U+00AC
                //     &noti   => valid prefix for &notin
                //     &notit  => can't continue match

                let name_len = self.name_len;
                assert!(name_len > 0);
                let last_matched = self.name_buf()[name_len - 1..].chars().next().unwrap();

                // There might not be a next character after the match, if
                // we had a full match and then hit EOF.
                let next_after = if name_len == self.name_buf().len() {
                    None
                } else {
                    Some(self.name_buf()[name_len..].chars().next().unwrap())
                };

                // If the character reference was consumed as part of an attribute, and the last
                // character matched is not a U+003B SEMICOLON character (;), and the next input
                // character is either a U+003D EQUALS SIGN character (=) or an ASCII alphanumeric,
                // then, for historical reasons, flush code points consumed as a character
                // reference and switch to the return state.

                let unconsume_all = match (self.is_consumed_in_attribute, last_matched, next_after)
                {
                    (_, ';', _) => false,
                    (true, _, Some('=')) => true,
                    (true, _, Some(c)) if c.is_ascii_alphanumeric() => true,
                    _ => {
                        // 1. If the last character matched is not a U+003B SEMICOLON character
                        //    (;), then this is a missing-semicolon-after-character-reference parse
                        //    error.
                        tokenizer.emit_error(Borrowed(
                            "Character reference does not end with semicolon",
                        ));
                        false
                    },
                };

                if unconsume_all {
                    self.unconsume_name(input);
                    self.finish_none()
                } else {
                    input.push_front(StrTendril::from_slice(&self.name_buf()[name_len..]));
                    tokenizer.ignore_lf.set(false);
                    self.result = Some(CharRef {
                        chars: [from_u32(c1).unwrap(), from_u32(c2).unwrap()],
                        num_chars: if c2 == 0 { 1 } else { 2 },
                    });
                    Done
                }
            },
        }
    }

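    // After a name that never matched, keep consuming alphanumerics just to
    // decide whether a terminating semicolon follows (which is reported as a
    // parse error); everything buffered is pushed back onto the input either way.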
    fn do_bogus_name<Sink: TokenSink>(
        &mut self,
        tokenizer: &Tokenizer<Sink>,
        input: &BufferQueue,
    ) -> Status {
        // peek + discard skips over newline normalization, therefore making it easier to
        // un-consume
        let c = unwrap_or_return!(tokenizer.peek(input), Stuck);
        tokenizer.discard_char(input);
        self.name_buf_mut().push_char(c);
        match c {
            _ if c.is_ascii_alphanumeric() => return Progress,
            ';' => self.emit_name_error(tokenizer),
            _ => (),
        }
        self.unconsume_name(input);
        self.finish_none()
    }

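    // Invoked when the input ends while a character reference is still being
    // tokenized: loops until a result is produced, so partially consumed text
    // is either resolved or pushed back onto the input.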
    pub(super) fn end_of_file<Sink: TokenSink>(
        &mut self,
        tokenizer: &Tokenizer<Sink>,
        input: &BufferQueue,
    ) {
        while self.result.is_none() {
            match self.state {
                Begin => drop(self.finish_none()),

                Numeric(_) if !self.seen_digit => drop(self.unconsume_numeric(tokenizer, input)),

                Numeric(_) | NumericSemicolon => {
                    tokenizer.emit_error(Borrowed("EOF in numeric character reference"));
                    self.finish_numeric(tokenizer);
                },

                Named => drop(self.finish_named(tokenizer, input, None)),

                BogusName => {
                    self.unconsume_name(input);
                    self.finish_none();
                },

                Octothorpe => {
                    input.push_front(StrTendril::from_slice("#"));
                    tokenizer.emit_error(Borrowed("EOF after '#' in character reference"));
                    self.finish_none();
                },
            }
        }
    }
}