// Copyright 2014-2017 The html5ever Project Developers. See the
// COPYRIGHT file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

use super::{TokenSink, Tokenizer};
use crate::buffer_queue::BufferQueue;
use crate::data;
use crate::tendril::StrTendril;

use log::debug;
use mac::format_if;
use std::borrow::Cow::Borrowed;
use std::char::from_u32;

use self::State::*;
pub(super) use self::Status::*;

//§ tokenizing-character-references
pub(super) struct CharRef {
    /// The resulting character(s)
    pub(super) chars: [char; 2],

    /// How many slots in `chars` are valid?
    pub(super) num_chars: u8,
}

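// Outcome of a single `step` of the character reference tokenizer:
// `Stuck` means no more input is available right now, `Progress` means a
// character was consumed and stepping should continue, and `Done` means
// the result is ready to be retrieved with `get_result`.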
pub(super) enum Status {
    Stuck,
    Progress,
    Done,
}

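// Sub-states of character reference tokenization. `Begin` examines the
// character following the `&` (which the caller has already dealt with),
// `Octothorpe` follows `&#`, `Numeric(base)` accumulates digits,
// `NumericSemicolon` expects the terminating `;`, `Named` matches against
// the named entity table, and `BogusName` consumes an alphanumeric run
// that can no longer match any entity.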
#[derive(Debug)]
enum State {
    Begin,
    Octothorpe,
    Numeric(u32), // base
    NumericSemicolon,
    Named,
    BogusName,
}

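// Scratch state for tokenizing a single character reference. The `num*`,
// `seen_digit`, and `hex_marker` fields serve numeric references;
// `name_buf_opt`, `name_match` (the one or two code points of the longest
// full entity match so far, the second being 0 for single-character
// entities), and `name_len` (how much of the buffer that match covers)
// serve named references.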
pub(super) struct CharRefTokenizer {
    state: State,
    result: Option<CharRef>,
    is_consumed_in_attribute: bool,

    num: u32,
    num_too_big: bool,
    seen_digit: bool,
    hex_marker: Option<char>,

    name_buf_opt: Option<StrTendril>,
    name_match: Option<(u32, u32)>,
    name_len: usize,
}

impl CharRefTokenizer {
    pub(super) fn new(is_consumed_in_attribute: bool) -> CharRefTokenizer {
        CharRefTokenizer {
            is_consumed_in_attribute,
            state: Begin,
            result: None,
            num: 0,
            num_too_big: false,
            seen_digit: false,
            hex_marker: None,
            name_buf_opt: None,
            name_match: None,
            name_len: 0,
        }
    }

    // A CharRefTokenizer can only tokenize one character reference,
    // so this method consumes the tokenizer.
    pub(super) fn get_result(self) -> CharRef {
        self.result.expect("get_result called before done")
    }

    fn name_buf(&self) -> &StrTendril {
        self.name_buf_opt
            .as_ref()
            .expect("name_buf missing in named character reference")
    }

    fn name_buf_mut(&mut self) -> &mut StrTendril {
        self.name_buf_opt
            .as_mut()
            .expect("name_buf missing in named character reference")
    }

    fn finish_none(&mut self) -> Status {
        self.result = Some(CharRef {
            chars: ['\0', '\0'],
            num_chars: 0,
        });
        Done
    }

    fn finish_one(&mut self, c: char) -> Status {
        self.result = Some(CharRef {
            chars: [c, '\0'],
            num_chars: 1,
        });
        Done
    }
}

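// A rough sketch of how the parent tokenizer is expected to drive this
// state machine (the actual driver lives in `super`): construct it with
// `new`, call `step` repeatedly until it returns `Done` (re-feeding input
// whenever it returns `Stuck`), call `end_of_file` if input runs out,
// and finally read the outcome with `get_result`.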
impl CharRefTokenizer {
    pub(super) fn step<Sink: TokenSink>(
        &mut self,
        tokenizer: &Tokenizer<Sink>,
        input: &BufferQueue,
    ) -> Status {
        if self.result.is_some() {
            return Done;
        }

        debug!("char ref tokenizer stepping in state {:?}", self.state);
        match self.state {
            Begin => self.do_begin(tokenizer, input),
            Octothorpe => self.do_octothorpe(tokenizer, input),
            Numeric(base) => self.do_numeric(tokenizer, input, base),
            NumericSemicolon => self.do_numeric_semicolon(tokenizer, input),
            Named => self.do_named(tokenizer, input),
            BogusName => self.do_bogus_name(tokenizer, input),
        }
    }

    fn do_begin<Sink: TokenSink>(
        &mut self,
        tokenizer: &Tokenizer<Sink>,
        input: &BufferQueue,
    ) -> Status {
        match unwrap_or_return!(tokenizer.peek(input), Stuck) {
            'a'..='z' | 'A'..='Z' | '0'..='9' => {
                self.state = Named;
                self.name_buf_opt = Some(StrTendril::new());
                Progress
            },

            '#' => {
                tokenizer.discard_char(input);
                self.state = Octothorpe;
                Progress
            },
            _ => self.finish_none(),
        }
    }

    fn do_octothorpe<Sink: TokenSink>(
        &mut self,
        tokenizer: &Tokenizer<Sink>,
        input: &BufferQueue,
    ) -> Status {
        let c = unwrap_or_return!(tokenizer.peek(input), Stuck);
        match c {
            'x' | 'X' => {
                tokenizer.discard_char(input);
                self.hex_marker = Some(c);
                self.state = Numeric(16);
            },

            _ => {
                self.hex_marker = None;
                self.state = Numeric(10);
            },
        }
        Progress
    }

    fn do_numeric<Sink: TokenSink>(
        &mut self,
        tokenizer: &Tokenizer<Sink>,
        input: &BufferQueue,
        base: u32,
    ) -> Status {
        let c = unwrap_or_return!(tokenizer.peek(input), Stuck);
        match c.to_digit(base) {
            Some(n) => {
                tokenizer.discard_char(input);
                self.num = self.num.wrapping_mul(base);
                if self.num > 0x10FFFF {
                    // We might overflow, and the character is definitely invalid.
                    // We still parse digits and semicolon, but don't use the result.
                    self.num_too_big = true;
                }
                self.num = self.num.wrapping_add(n);
                self.seen_digit = true;
                Progress
            },

            None if !self.seen_digit => self.unconsume_numeric(tokenizer, input),

            None => {
                self.state = NumericSemicolon;
                Progress
            },
        }
    }

    fn do_numeric_semicolon<Sink: TokenSink>(
        &mut self,
        tokenizer: &Tokenizer<Sink>,
        input: &BufferQueue,
    ) -> Status {
        match unwrap_or_return!(tokenizer.peek(input), Stuck) {
            ';' => tokenizer.discard_char(input),
            _ => tokenizer.emit_error(Borrowed(
                "Semicolon missing after numeric character reference",
            )),
        };
        self.finish_numeric(tokenizer)
    }

    fn unconsume_numeric<Sink: TokenSink>(
        &mut self,
        tokenizer: &Tokenizer<Sink>,
        input: &BufferQueue,
    ) -> Status {
        let mut unconsume = StrTendril::from_char('#');
        if let Some(c) = self.hex_marker {
            unconsume.push_char(c)
        }

        input.push_front(unconsume);
        tokenizer.emit_error(Borrowed("Numeric character reference without digits"));
        self.finish_none()
    }

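    // Map the accumulated code point to a character, substituting U+FFFD and/or
    // emitting a parse error for the ranges the HTML spec singles out: values
    // above 0x10FFFF, null, surrogates, control characters (with the C1 block
    // remapped via a Windows-1252-style table), and noncharacters.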
    fn finish_numeric<Sink: TokenSink>(&mut self, tokenizer: &Tokenizer<Sink>) -> Status {
        fn conv(n: u32) -> char {
            from_u32(n).expect("invalid char missed by error handling cases")
        }

        let (c, error) = match self.num {
            n if (n > 0x10FFFF) || self.num_too_big => ('\u{fffd}', true),
            0x00 | 0xD800..=0xDFFF => ('\u{fffd}', true),

            0x80..=0x9F => match data::C1_REPLACEMENTS[(self.num - 0x80) as usize] {
                Some(c) => (c, true),
                None => (conv(self.num), true),
            },

            0x01..=0x08 | 0x0B | 0x0D..=0x1F | 0x7F | 0xFDD0..=0xFDEF => (conv(self.num), true),

            n if (n & 0xFFFE) == 0xFFFE => (conv(n), true),

            n => (conv(n), false),
        };

        if error {
            let msg = format_if!(
                tokenizer.opts.exact_errors,
                "Invalid numeric character reference",
                "Invalid numeric character reference value 0x{:06X}",
                self.num
            );
            tokenizer.emit_error(msg);
        }

        self.finish_one(c)
    }

    fn do_named<Sink: TokenSink>(
        &mut self,
        tokenizer: &Tokenizer<Sink>,
        input: &BufferQueue,
    ) -> Status {
        // peek + discard skips over newline normalization, therefore making it easier to
        // un-consume
        let c = unwrap_or_return!(tokenizer.peek(input), Stuck);
        tokenizer.discard_char(input);
        self.name_buf_mut().push_char(c);
        match data::NAMED_ENTITIES.get(&self.name_buf()[..]) {
            // We have either a full match or a prefix of one.
            Some(&m) => {
                if m.0 != 0 {
                    // We have a full match, but there might be a longer one to come.
                    self.name_match = Some(m);
                    self.name_len = self.name_buf().len();
                }
                // Otherwise we just have a prefix match.
                Progress
            },

            // Can't continue the match.
            None => self.finish_named(tokenizer, input, Some(c)),
        }
    }

    fn emit_name_error<Sink: TokenSink>(&mut self, tokenizer: &Tokenizer<Sink>) {
        let msg = format_if!(
            tokenizer.opts.exact_errors,
            "Invalid character reference",
            "Invalid character reference &{}",
            self.name_buf()
        );
        tokenizer.emit_error(msg);
    }

    fn unconsume_name(&mut self, input: &BufferQueue) {
        input.push_front(self.name_buf_opt.take().unwrap());
    }

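    // Resolve a named reference once matching stops: either emit the code
    // point(s) of the longest full match (pushing any over-consumed characters
    // back onto the input), or un-consume the whole buffer when nothing matched.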
    fn finish_named<Sink: TokenSink>(
        &mut self,
        tokenizer: &Tokenizer<Sink>,
        input: &BufferQueue,
        end_char: Option<char>,
    ) -> Status {
        match self.name_match {
            None => {
                match end_char {
                    Some(c) if c.is_ascii_alphanumeric() => {
                        // Keep looking for a semicolon, to determine whether
                        // we emit a parse error.
                        self.state = BogusName;
                        return Progress;
                    },

                    // Check length because &; is not a parse error.
                    Some(';') if self.name_buf().len() > 1 => self.emit_name_error(tokenizer),

                    _ => (),
                }
                self.unconsume_name(input);
                self.finish_none()
            },

            Some((c1, c2)) => {
                // We have a complete match, but we may have consumed
                // additional characters into self.name_buf. Usually
                // at least one, but several in cases like
                //
                //     &not   => match for U+00AC
                //     &noti  => valid prefix for &notin
                //     &notit => can't continue match

                let name_len = self.name_len;
                assert!(name_len > 0);
                let last_matched = self.name_buf()[name_len - 1..].chars().next().unwrap();

                // There might not be a next character after the match, if
                // we had a full match and then hit EOF.
                let next_after = if name_len == self.name_buf().len() {
                    None
                } else {
                    Some(self.name_buf()[name_len..].chars().next().unwrap())
                };

                // If the character reference was consumed as part of an attribute, and the last
                // character matched is not a U+003B SEMICOLON character (;), and the next input
                // character is either a U+003D EQUALS SIGN character (=) or an ASCII alphanumeric,
                // then, for historical reasons, flush code points consumed as a character
                // reference and switch to the return state.

                let unconsume_all = match (self.is_consumed_in_attribute, last_matched, next_after)
                {
                    (_, ';', _) => false,
                    (true, _, Some('=')) => true,
                    (true, _, Some(c)) if c.is_ascii_alphanumeric() => true,
                    _ => {
                        // 1. If the last character matched is not a U+003B SEMICOLON character
                        //    (;), then this is a missing-semicolon-after-character-reference parse
                        //    error.
                        tokenizer.emit_error(Borrowed(
                            "Character reference does not end with semicolon",
                        ));
                        false
                    },
                };

                if unconsume_all {
                    self.unconsume_name(input);
                    self.finish_none()
                } else {
                    input.push_front(StrTendril::from_slice(&self.name_buf()[name_len..]));
                    tokenizer.ignore_lf.set(false);
                    self.result = Some(CharRef {
                        chars: [from_u32(c1).unwrap(), from_u32(c2).unwrap()],
                        num_chars: if c2 == 0 { 1 } else { 2 },
                    });
                    Done
                }
            },
        }
    }

    fn do_bogus_name<Sink: TokenSink>(
        &mut self,
        tokenizer: &Tokenizer<Sink>,
        input: &BufferQueue,
    ) -> Status {
        // peek + discard skips over newline normalization, therefore making it easier to
        // un-consume
        let c = unwrap_or_return!(tokenizer.peek(input), Stuck);
        tokenizer.discard_char(input);
        self.name_buf_mut().push_char(c);
        match c {
            _ if c.is_ascii_alphanumeric() => return Progress,
            ';' => self.emit_name_error(tokenizer),
            _ => (),
        }
        self.unconsume_name(input);
        self.finish_none()
    }

    pub(super) fn end_of_file<Sink: TokenSink>(
        &mut self,
        tokenizer: &Tokenizer<Sink>,
        input: &BufferQueue,
    ) {
        while self.result.is_none() {
            match self.state {
                Begin => drop(self.finish_none()),

                Numeric(_) if !self.seen_digit => drop(self.unconsume_numeric(tokenizer, input)),

                Numeric(_) | NumericSemicolon => {
                    tokenizer.emit_error(Borrowed("EOF in numeric character reference"));
                    self.finish_numeric(tokenizer);
                },

                Named => drop(self.finish_named(tokenizer, input, None)),

                BogusName => {
                    self.unconsume_name(input);
                    self.finish_none();
                },

                Octothorpe => {
                    input.push_front(StrTendril::from_slice("#"));
                    tokenizer.emit_error(Borrowed("EOF after '#' in character reference"));
                    self.finish_none();
                },
            }
        }
    }
}