1 | // Copyright 2014-2017 The html5ever Project Developers. See the |
2 | // COPYRIGHT file at the top-level directory of this distribution. |
3 | // |
4 | // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or |
5 | // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license |
6 | // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your |
7 | // option. This file may not be copied, modified, or distributed |
8 | // except according to those terms. |
9 | |
10 | use super::{TokenSink, Tokenizer}; |
11 | use crate::buffer_queue::BufferQueue; |
12 | use crate::data; |
13 | use crate::tendril::StrTendril; |
14 | |
15 | use log::debug; |
16 | use mac::format_if; |
17 | use std::borrow::Cow::Borrowed; |
18 | use std::char::from_u32; |
19 | |
20 | use self::State::*; |
21 | pub use self::Status::*; |
22 | |
//§ tokenizing-character-references
/// The outcome of tokenizing one character reference: zero, one, or two
/// characters.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct CharRef {
    /// The resulting character(s)
    pub chars: [char; 2],

    /// How many slots in `chars` are valid?
    pub num_chars: u8,
}
31 | |
/// What a single tokenizer step (or finishing helper) reports to its caller.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Status {
    /// No input character was available, so no work could be done.
    Stuck,
    /// The tokenizer advanced but has not produced a result yet.
    Progress,
    /// A result has been stored and can be taken with `get_result`.
    Done,
}
37 | |
/// Internal states of the character reference tokenizer.
#[derive(Debug)]
enum State {
    /// Nothing after the `&` has been examined yet.
    Begin,
    /// A `#` was consumed; deciding between decimal and hexadecimal.
    Octothorpe,
    /// Reading digits of a numeric reference; the payload is the base.
    Numeric(u32), // base
    /// Digits have ended; an optional `;` is expected next.
    NumericSemicolon,
    /// Matching the consumed characters against the named entity table.
    Named,
    /// No entity can match; scanning on to decide whether to report an error.
    BogusName,
}
47 | |
48 | pub struct CharRefTokenizer { |
49 | state: State, |
50 | addnl_allowed: Option<char>, |
51 | result: Option<CharRef>, |
52 | |
53 | num: u32, |
54 | num_too_big: bool, |
55 | seen_digit: bool, |
56 | hex_marker: Option<char>, |
57 | |
58 | name_buf_opt: Option<StrTendril>, |
59 | name_match: Option<(u32, u32)>, |
60 | name_len: usize, |
61 | } |
62 | |
63 | impl CharRefTokenizer { |
64 | // NB: We assume that we have an additional allowed character iff we're |
65 | // tokenizing in an attribute value. |
66 | pub fn new(addnl_allowed: Option<char>) -> CharRefTokenizer { |
67 | CharRefTokenizer { |
68 | state: Begin, |
69 | addnl_allowed, |
70 | result: None, |
71 | num: 0, |
72 | num_too_big: false, |
73 | seen_digit: false, |
74 | hex_marker: None, |
75 | name_buf_opt: None, |
76 | name_match: None, |
77 | name_len: 0, |
78 | } |
79 | } |
80 | |
81 | // A CharRefTokenizer can only tokenize one character reference, |
82 | // so this method consumes the tokenizer. |
83 | pub fn get_result(self) -> CharRef { |
84 | self.result.expect("get_result called before done" ) |
85 | } |
86 | |
87 | fn name_buf(&self) -> &StrTendril { |
88 | self.name_buf_opt |
89 | .as_ref() |
90 | .expect("name_buf missing in named character reference" ) |
91 | } |
92 | |
93 | fn name_buf_mut(&mut self) -> &mut StrTendril { |
94 | self.name_buf_opt |
95 | .as_mut() |
96 | .expect("name_buf missing in named character reference" ) |
97 | } |
98 | |
99 | fn finish_none(&mut self) -> Status { |
100 | self.result = Some(CharRef { |
101 | chars: [' \0' , ' \0' ], |
102 | num_chars: 0, |
103 | }); |
104 | Done |
105 | } |
106 | |
107 | fn finish_one(&mut self, c: char) -> Status { |
108 | self.result = Some(CharRef { |
109 | chars: [c, ' \0' ], |
110 | num_chars: 1, |
111 | }); |
112 | Done |
113 | } |
114 | } |
115 | |
116 | impl CharRefTokenizer { |
117 | pub fn step<Sink: TokenSink>( |
118 | &mut self, |
119 | tokenizer: &mut Tokenizer<Sink>, |
120 | input: &mut BufferQueue, |
121 | ) -> Status { |
122 | if self.result.is_some() { |
123 | return Done; |
124 | } |
125 | |
126 | debug!("char ref tokenizer stepping in state {:?}" , self.state); |
127 | match self.state { |
128 | Begin => self.do_begin(tokenizer, input), |
129 | Octothorpe => self.do_octothorpe(tokenizer, input), |
130 | Numeric(base) => self.do_numeric(tokenizer, input, base), |
131 | NumericSemicolon => self.do_numeric_semicolon(tokenizer, input), |
132 | Named => self.do_named(tokenizer, input), |
133 | BogusName => self.do_bogus_name(tokenizer, input), |
134 | } |
135 | } |
136 | |
137 | fn do_begin<Sink: TokenSink>( |
138 | &mut self, |
139 | tokenizer: &mut Tokenizer<Sink>, |
140 | input: &mut BufferQueue, |
141 | ) -> Status { |
142 | match unwrap_or_return!(tokenizer.peek(input), Stuck) { |
143 | ' \t' | ' \n' | ' \x0C' | ' ' | '<' | '&' => self.finish_none(), |
144 | c if Some(c) == self.addnl_allowed => self.finish_none(), |
145 | |
146 | '#' => { |
147 | tokenizer.discard_char(input); |
148 | self.state = Octothorpe; |
149 | Progress |
150 | }, |
151 | |
152 | _ => { |
153 | self.state = Named; |
154 | self.name_buf_opt = Some(StrTendril::new()); |
155 | Progress |
156 | }, |
157 | } |
158 | } |
159 | |
160 | fn do_octothorpe<Sink: TokenSink>( |
161 | &mut self, |
162 | tokenizer: &mut Tokenizer<Sink>, |
163 | input: &mut BufferQueue, |
164 | ) -> Status { |
165 | let c = unwrap_or_return!(tokenizer.peek(input), Stuck); |
166 | match c { |
167 | 'x' | 'X' => { |
168 | tokenizer.discard_char(input); |
169 | self.hex_marker = Some(c); |
170 | self.state = Numeric(16); |
171 | }, |
172 | |
173 | _ => { |
174 | self.hex_marker = None; |
175 | self.state = Numeric(10); |
176 | }, |
177 | } |
178 | Progress |
179 | } |
180 | |
181 | fn do_numeric<Sink: TokenSink>( |
182 | &mut self, |
183 | tokenizer: &mut Tokenizer<Sink>, |
184 | input: &mut BufferQueue, |
185 | base: u32, |
186 | ) -> Status { |
187 | let c = unwrap_or_return!(tokenizer.peek(input), Stuck); |
188 | match c.to_digit(base) { |
189 | Some(n) => { |
190 | tokenizer.discard_char(input); |
191 | self.num = self.num.wrapping_mul(base); |
192 | if self.num > 0x10FFFF { |
193 | // We might overflow, and the character is definitely invalid. |
194 | // We still parse digits and semicolon, but don't use the result. |
195 | self.num_too_big = true; |
196 | } |
197 | self.num = self.num.wrapping_add(n); |
198 | self.seen_digit = true; |
199 | Progress |
200 | }, |
201 | |
202 | None if !self.seen_digit => self.unconsume_numeric(tokenizer, input), |
203 | |
204 | None => { |
205 | self.state = NumericSemicolon; |
206 | Progress |
207 | }, |
208 | } |
209 | } |
210 | |
211 | fn do_numeric_semicolon<Sink: TokenSink>( |
212 | &mut self, |
213 | tokenizer: &mut Tokenizer<Sink>, |
214 | input: &mut BufferQueue, |
215 | ) -> Status { |
216 | match unwrap_or_return!(tokenizer.peek(input), Stuck) { |
217 | ';' => tokenizer.discard_char(input), |
218 | _ => tokenizer.emit_error(Borrowed( |
219 | "Semicolon missing after numeric character reference" , |
220 | )), |
221 | }; |
222 | self.finish_numeric(tokenizer) |
223 | } |
224 | |
225 | fn unconsume_numeric<Sink: TokenSink>( |
226 | &mut self, |
227 | tokenizer: &mut Tokenizer<Sink>, |
228 | input: &mut BufferQueue, |
229 | ) -> Status { |
230 | let mut unconsume = StrTendril::from_char('#' ); |
231 | match self.hex_marker { |
232 | Some(c) => unconsume.push_char(c), |
233 | None => (), |
234 | } |
235 | |
236 | input.push_front(unconsume); |
237 | tokenizer.emit_error(Borrowed("Numeric character reference without digits" )); |
238 | self.finish_none() |
239 | } |
240 | |
241 | fn finish_numeric<Sink: TokenSink>(&mut self, tokenizer: &mut Tokenizer<Sink>) -> Status { |
242 | fn conv(n: u32) -> char { |
243 | from_u32(n).expect("invalid char missed by error handling cases" ) |
244 | } |
245 | |
246 | let (c, error) = match self.num { |
247 | n if (n > 0x10FFFF) || self.num_too_big => (' \u{fffd}' , true), |
248 | 0x00 | 0xD800..=0xDFFF => (' \u{fffd}' , true), |
249 | |
250 | 0x80..=0x9F => match data::C1_REPLACEMENTS[(self.num - 0x80) as usize] { |
251 | Some(c) => (c, true), |
252 | None => (conv(self.num), true), |
253 | }, |
254 | |
255 | 0x01..=0x08 | 0x0B | 0x0D..=0x1F | 0x7F | 0xFDD0..=0xFDEF => (conv(self.num), true), |
256 | |
257 | n if (n & 0xFFFE) == 0xFFFE => (conv(n), true), |
258 | |
259 | n => (conv(n), false), |
260 | }; |
261 | |
262 | if error { |
263 | let msg = format_if!( |
264 | tokenizer.opts.exact_errors, |
265 | "Invalid numeric character reference" , |
266 | "Invalid numeric character reference value 0x {:06X}" , |
267 | self.num |
268 | ); |
269 | tokenizer.emit_error(msg); |
270 | } |
271 | |
272 | self.finish_one(c) |
273 | } |
274 | |
275 | fn do_named<Sink: TokenSink>( |
276 | &mut self, |
277 | tokenizer: &mut Tokenizer<Sink>, |
278 | input: &mut BufferQueue, |
279 | ) -> Status { |
280 | let c = unwrap_or_return!(tokenizer.get_char(input), Stuck); |
281 | self.name_buf_mut().push_char(c); |
282 | match data::NAMED_ENTITIES.get(&self.name_buf()[..]) { |
283 | // We have either a full match or a prefix of one. |
284 | Some(&m) => { |
285 | if m.0 != 0 { |
286 | // We have a full match, but there might be a longer one to come. |
287 | self.name_match = Some(m); |
288 | self.name_len = self.name_buf().len(); |
289 | } |
290 | // Otherwise we just have a prefix match. |
291 | Progress |
292 | }, |
293 | |
294 | // Can't continue the match. |
295 | None => self.finish_named(tokenizer, input, Some(c)), |
296 | } |
297 | } |
298 | |
299 | fn emit_name_error<Sink: TokenSink>(&mut self, tokenizer: &mut Tokenizer<Sink>) { |
300 | let msg = format_if!( |
301 | tokenizer.opts.exact_errors, |
302 | "Invalid character reference" , |
303 | "Invalid character reference & {}" , |
304 | self.name_buf() |
305 | ); |
306 | tokenizer.emit_error(msg); |
307 | } |
308 | |
309 | fn unconsume_name(&mut self, input: &mut BufferQueue) { |
310 | input.push_front(self.name_buf_opt.take().unwrap()); |
311 | } |
312 | |
313 | fn finish_named<Sink: TokenSink>( |
314 | &mut self, |
315 | tokenizer: &mut Tokenizer<Sink>, |
316 | input: &mut BufferQueue, |
317 | end_char: Option<char>, |
318 | ) -> Status { |
319 | match self.name_match { |
320 | None => { |
321 | match end_char { |
322 | Some(c) if c.is_ascii_alphanumeric() => { |
323 | // Keep looking for a semicolon, to determine whether |
324 | // we emit a parse error. |
325 | self.state = BogusName; |
326 | return Progress; |
327 | }, |
328 | |
329 | // Check length because &; is not a parse error. |
330 | Some(';' ) if self.name_buf().len() > 1 => self.emit_name_error(tokenizer), |
331 | |
332 | _ => (), |
333 | } |
334 | self.unconsume_name(input); |
335 | self.finish_none() |
336 | }, |
337 | |
338 | Some((c1, c2)) => { |
339 | // We have a complete match, but we may have consumed |
340 | // additional characters into self.name_buf. Usually |
341 | // at least one, but several in cases like |
342 | // |
343 | // ¬ => match for U+00AC |
344 | // ¬i => valid prefix for ¬in |
345 | // ¬it => can't continue match |
346 | |
347 | let name_len = self.name_len; |
348 | assert!(name_len > 0); |
349 | let last_matched = self.name_buf()[name_len - 1..].chars().next().unwrap(); |
350 | |
351 | // There might not be a next character after the match, if |
352 | // we had a full match and then hit EOF. |
353 | let next_after = if name_len == self.name_buf().len() { |
354 | None |
355 | } else { |
356 | Some(self.name_buf()[name_len..].chars().next().unwrap()) |
357 | }; |
358 | |
359 | // "If the character reference is being consumed as part of an |
360 | // attribute, and the last character matched is not a U+003B |
361 | // SEMICOLON character (;), and the next character is either a |
362 | // U+003D EQUALS SIGN character (=) or an alphanumeric ASCII |
363 | // character, then, for historical reasons, all the characters |
364 | // that were matched after the U+0026 AMPERSAND character (&) |
365 | // must be unconsumed, and nothing is returned. However, if |
366 | // this next character is in fact a U+003D EQUALS SIGN |
367 | // character (=), then this is a parse error" |
368 | |
369 | let unconsume_all = match (self.addnl_allowed, last_matched, next_after) { |
370 | (_, ';' , _) => false, |
371 | (Some(_), _, Some('=' )) => { |
372 | tokenizer.emit_error(Borrowed( |
373 | "Equals sign after character reference in attribute" , |
374 | )); |
375 | true |
376 | }, |
377 | (Some(_), _, Some(c)) if c.is_ascii_alphanumeric() => true, |
378 | _ => { |
379 | tokenizer.emit_error(Borrowed( |
380 | "Character reference does not end with semicolon" , |
381 | )); |
382 | false |
383 | }, |
384 | }; |
385 | |
386 | if unconsume_all { |
387 | self.unconsume_name(input); |
388 | self.finish_none() |
389 | } else { |
390 | input.push_front(StrTendril::from_slice(&self.name_buf()[name_len..])); |
391 | self.result = Some(CharRef { |
392 | chars: [from_u32(c1).unwrap(), from_u32(c2).unwrap()], |
393 | num_chars: if c2 == 0 { 1 } else { 2 }, |
394 | }); |
395 | Done |
396 | } |
397 | }, |
398 | } |
399 | } |
400 | |
401 | fn do_bogus_name<Sink: TokenSink>( |
402 | &mut self, |
403 | tokenizer: &mut Tokenizer<Sink>, |
404 | input: &mut BufferQueue, |
405 | ) -> Status { |
406 | let c = unwrap_or_return!(tokenizer.get_char(input), Stuck); |
407 | self.name_buf_mut().push_char(c); |
408 | match c { |
409 | _ if c.is_ascii_alphanumeric() => return Progress, |
410 | ';' => self.emit_name_error(tokenizer), |
411 | _ => (), |
412 | } |
413 | self.unconsume_name(input); |
414 | self.finish_none() |
415 | } |
416 | |
417 | pub fn end_of_file<Sink: TokenSink>( |
418 | &mut self, |
419 | tokenizer: &mut Tokenizer<Sink>, |
420 | input: &mut BufferQueue, |
421 | ) { |
422 | while self.result.is_none() { |
423 | match self.state { |
424 | Begin => drop(self.finish_none()), |
425 | |
426 | Numeric(_) if !self.seen_digit => drop(self.unconsume_numeric(tokenizer, input)), |
427 | |
428 | Numeric(_) | NumericSemicolon => { |
429 | tokenizer.emit_error(Borrowed("EOF in numeric character reference" )); |
430 | self.finish_numeric(tokenizer); |
431 | }, |
432 | |
433 | Named => drop(self.finish_named(tokenizer, input, None)), |
434 | |
435 | BogusName => { |
436 | self.unconsume_name(input); |
437 | self.finish_none(); |
438 | }, |
439 | |
440 | Octothorpe => { |
441 | input.push_front(StrTendril::from_slice("#" )); |
442 | tokenizer.emit_error(Borrowed("EOF after '#' in character reference" )); |
443 | self.finish_none(); |
444 | }, |
445 | } |
446 | } |
447 | } |
448 | } |
449 | |