mod.rs source code [crates/html5ever/src/tokenizer/char_ref/mod.rs]

1	// Copyright 2014-2017 The html5ever Project Developers. See the
2	// COPYRIGHT file at the top-level directory of this distribution.
3	//
4	// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5	// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6	// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7	// option. This file may not be copied, modified, or distributed
8	// except according to those terms.
9
10	use super::{TokenSink, Tokenizer};
11	use crate::buffer_queue::BufferQueue;
12	use crate::data;
13	use crate::tendril::StrTendril;
14
15	use log::debug;
16	use mac::format_if;
17	use std::borrow::Cow::Borrowed;
18	use std::char::from_u32;
19
20	use self::State::*;
21	pub use self::Status::*;
22
23	//§ tokenizing-character-references
/// The character(s) produced by tokenizing a single character reference.
///
/// At most two characters are stored; only the first `num_chars` entries of
/// `chars` are meaningful. A value with `num_chars == 0` means that no
/// character was produced.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct CharRef {
    /// The resulting character(s)
    pub chars: [char; 2],

    /// How many slots in `chars` are valid?
    pub num_chars: u8,
}
31
/// Outcome of driving the character reference tokenizer.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum Status {
    /// No step could be taken; this is the value handed to
    /// `unwrap_or_return!` when fetching the next input character.
    // NOTE(review): presumably this means "input exhausted for now" —
    // confirm against the definition of `unwrap_or_return!`.
    Stuck,
    /// The state machine advanced but has not produced a result yet.
    Progress,
    /// A result has been stored and can be retrieved.
    Done,
}
37
/// Internal states of the character reference state machine.
#[derive(Debug)]
enum State {
    /// Nothing has been examined yet.
    Begin,
    /// A `#` has been consumed; the next character selects the numeric base.
    Octothorpe,
    /// Accumulating digits of a numeric reference; the payload is the base.
    Numeric(u32),
    /// Digits are finished; an optional `;` may follow.
    NumericSemicolon,
    /// Matching the input against the table of named entities.
    Named,
    /// No entity matched; consuming alphanumerics until a terminator to
    /// decide whether a parse error must be reported.
    BogusName,
}
47
48	pub struct CharRefTokenizer {
49	state: State,
50	addnl_allowed: Option<char>,
51	result: Option<CharRef>,
52
53	num: u32,
54	num_too_big: bool,
55	seen_digit: bool,
56	hex_marker: Option<char>,
57
58	name_buf_opt: Option<StrTendril>,
59	name_match: Option<(u32, u32)>,
60	name_len: usize,
61	}
62
63	impl CharRefTokenizer {
64	// NB: We assume that we have an additional allowed character iff we're
65	// tokenizing in an attribute value.
66	pub fn new(addnl_allowed: Option<char>) -> CharRefTokenizer {
67	CharRefTokenizer {
68	state: Begin,
69	addnl_allowed,
70	result: None,
71	num: `0`,
72	num_too_big: `false`,
73	seen_digit: `false`,
74	hex_marker: None,
75	name_buf_opt: None,
76	name_match: None,
77	name_len: `0`,
78	}
79	}
80
81	// A CharRefTokenizer can only tokenize one character reference,
82	// so this method consumes the tokenizer.
83	pub fn get_result(self) -> CharRef {
84	self.result.expect("get_result called before done")
85	}
86
87	fn name_buf(&self) -> &StrTendril {
88	self.name_buf_opt
89	.as_ref()
90	.expect("name_buf missing in named character reference")
91	}
92
93	fn name_buf_mut(&mut self) -> &mut StrTendril {
94	self.name_buf_opt
95	.as_mut()
96	.expect("name_buf missing in named character reference")
97	}
98
99	fn finish_none(&mut self) -> Status {
100	self.result = Some(CharRef {
101	chars: ['`\0`', '`\0`'],
102	num_chars: `0`,
103	});
104	Done
105	}
106
107	fn finish_one(&mut self, c: char) -> Status {
108	self.result = Some(CharRef {
109	chars: [c, '`\0`'],
110	num_chars: `1`,
111	});
112	Done
113	}
114	}
115
116	impl CharRefTokenizer {
117	pub fn step<Sink: TokenSink>(
118	&mut self,
119	tokenizer: &mut Tokenizer<Sink>,
120	input: &mut BufferQueue,
121	) -> Status {
122	if self.result.is_some() {
123	return Done;
124	}
125
126	debug!("char ref tokenizer stepping in state {:?}", self.state);
127	match self.state {
128	Begin => self.do_begin(tokenizer, input),
129	Octothorpe => self.do_octothorpe(tokenizer, input),
130	Numeric(base) => self.do_numeric(tokenizer, input, base),
131	NumericSemicolon => self.do_numeric_semicolon(tokenizer, input),
132	Named => self.do_named(tokenizer, input),
133	BogusName => self.do_bogus_name(tokenizer, input),
134	}
135	}
136
137	fn do_begin<Sink: TokenSink>(
138	&mut self,
139	tokenizer: &mut Tokenizer<Sink>,
140	input: &mut BufferQueue,
141	) -> Status {
142	match unwrap_or_return!(tokenizer.peek(input), Stuck) {
143	'`\t`' \| '`\n`' \| '`\x0C`' \| ' ' \| '<' \| '&' => self.finish_none(),
144	c if Some(c) == self.addnl_allowed => self.finish_none(),
145
146	'#' => {
147	tokenizer.discard_char(input);
148	self.state = Octothorpe;
149	Progress
150	},
151
152	_ => {
153	self.state = Named;
154	self.name_buf_opt = Some(StrTendril::new());
155	Progress
156	},
157	}
158	}
159
160	fn do_octothorpe<Sink: TokenSink>(
161	&mut self,
162	tokenizer: &mut Tokenizer<Sink>,
163	input: &mut BufferQueue,
164	) -> Status {
165	let c = unwrap_or_return!(tokenizer.peek(input), Stuck);
166	match c {
167	'x' \| 'X' => {
168	tokenizer.discard_char(input);
169	self.hex_marker = Some(c);
170	self.state = Numeric(`16`);
171	},
172
173	_ => {
174	self.hex_marker = None;
175	self.state = Numeric(`10`);
176	},
177	}
178	Progress
179	}
180
181	fn do_numeric<Sink: TokenSink>(
182	&mut self,
183	tokenizer: &mut Tokenizer<Sink>,
184	input: &mut BufferQueue,
185	base: u32,
186	) -> Status {
187	let c = unwrap_or_return!(tokenizer.peek(input), Stuck);
188	match c.to_digit(base) {
189	Some(n) => {
190	tokenizer.discard_char(input);
191	self.num = self.num.wrapping_mul(base);
192	if self.num > `0x10FFFF` {
193	// We might overflow, and the character is definitely invalid.
194	// We still parse digits and semicolon, but don't use the result.
195	self.num_too_big = `true`;
196	}
197	self.num = self.num.wrapping_add(n);
198	self.seen_digit = `true`;
199	Progress
200	},
201
202	None if !self.seen_digit => self.unconsume_numeric(tokenizer, input),
203
204	None => {
205	self.state = NumericSemicolon;
206	Progress
207	},
208	}
209	}
210
211	fn do_numeric_semicolon<Sink: TokenSink>(
212	&mut self,
213	tokenizer: &mut Tokenizer<Sink>,
214	input: &mut BufferQueue,
215	) -> Status {
216	match unwrap_or_return!(tokenizer.peek(input), Stuck) {
217	';' => tokenizer.discard_char(input),
218	_ => tokenizer.emit_error(Borrowed(
219	"Semicolon missing after numeric character reference",
220	)),
221	};
222	self.finish_numeric(tokenizer)
223	}
224
225	fn unconsume_numeric<Sink: TokenSink>(
226	&mut self,
227	tokenizer: &mut Tokenizer<Sink>,
228	input: &mut BufferQueue,
229	) -> Status {
230	let mut unconsume = StrTendril::from_char('#');
231	match self.hex_marker {
232	Some(c) => unconsume.push_char(c),
233	None => (),
234	}
235
236	input.push_front(unconsume);
237	tokenizer.emit_error(Borrowed("Numeric character reference without digits"));
238	self.finish_none()
239	}
240
241	fn finish_numeric<Sink: TokenSink>(&mut self, tokenizer: &mut Tokenizer<Sink>) -> Status {
242	fn conv(n: u32) -> char {
243	from_u32(n).expect("invalid char missed by error handling cases")
244	}
245
246	let (c, error) = match self.num {
247	n if (n > `0x10FFFF`) \|\| self.num_too_big => ('`\u{fffd}`', `true`),
248	`0x00` \| `0xD800`..=`0xDFFF` => ('`\u{fffd}`', `true`),
249
250	`0x80`..=`0x9F` => match data::C1_REPLACEMENTS[(self.num - `0x80`) as usize] {
251	Some(c) => (c, `true`),
252	None => (conv(self.num), `true`),
253	},
254
255	`0x01`..=`0x08` \| `0x0B` \| `0x0D`..=`0x1F` \| `0x7F` \| `0xFDD0`..=`0xFDEF` => (conv(self.num), `true`),
256
257	n if (n & `0xFFFE`) == `0xFFFE` => (conv(n), `true`),
258
259	n => (conv(n), `false`),
260	};
261
262	if error {
263	let msg = format_if!(
264	tokenizer.opts.exact_errors,
265	"Invalid numeric character reference",
266	"Invalid numeric character reference value 0x{:`06`X}",
267	self.num
268	);
269	tokenizer.emit_error(msg);
270	}
271
272	self.finish_one(c)
273	}
274
275	fn do_named<Sink: TokenSink>(
276	&mut self,
277	tokenizer: &mut Tokenizer<Sink>,
278	input: &mut BufferQueue,
279	) -> Status {
280	let c = unwrap_or_return!(tokenizer.get_char(input), Stuck);
281	self.name_buf_mut().push_char(c);
282	match data::NAMED_ENTITIES.get(&self.name_buf()[..]) {
283	// We have either a full match or a prefix of one.
284	Some(&m) => {
285	if m.0 != `0` {
286	// We have a full match, but there might be a longer one to come.
287	self.name_match = Some(m);
288	self.name_len = self.name_buf().len();
289	}
290	// Otherwise we just have a prefix match.
291	Progress
292	},
293
294	// Can't continue the match.
295	None => self.finish_named(tokenizer, input, Some(c)),
296	}
297	}
298
299	fn emit_name_error<Sink: TokenSink>(&mut self, tokenizer: &mut Tokenizer<Sink>) {
300	let msg = format_if!(
301	tokenizer.opts.exact_errors,
302	"Invalid character reference",
303	"Invalid character reference &{}",
304	self.name_buf()
305	);
306	tokenizer.emit_error(msg);
307	}
308
309	fn unconsume_name(&mut self, input: &mut BufferQueue) {
310	input.push_front(self.name_buf_opt.take().unwrap());
311	}
312
313	fn finish_named<Sink: TokenSink>(
314	&mut self,
315	tokenizer: &mut Tokenizer<Sink>,
316	input: &mut BufferQueue,
317	end_char: Option<char>,
318	) -> Status {
319	match self.name_match {
320	None => {
321	match end_char {
322	Some(c) if c.is_ascii_alphanumeric() => {
323	// Keep looking for a semicolon, to determine whether
324	// we emit a parse error.
325	self.state = BogusName;
326	return Progress;
327	},
328
329	// Check length because &; is not a parse error.
330	Some(';') if self.name_buf().len() > `1` => self.emit_name_error(tokenizer),
331
332	_ => (),
333	}
334	self.unconsume_name(input);
335	self.finish_none()
336	},
337
338	Some((c1, c2)) => {
339	// We have a complete match, but we may have consumed
340	// additional characters into self.name_buf. Usually
341	// at least one, but several in cases like
342	//
343	// &not => match for U+00AC
344	// &noti => valid prefix for &notin
345	// &notit => can't continue match
346
347	let name_len = self.name_len;
348	assert!(name_len > `0`);
349	let last_matched = self.name_buf()[name_len - `1`..].chars().next().unwrap();
350
351	// There might not be a next character after the match, if
352	// we had a full match and then hit EOF.
353	let next_after = if name_len == self.name_buf().len() {
354	None
355	} else {
356	Some(self.name_buf()[name_len..].chars().next().unwrap())
357	};
358
359	// "If the character reference is being consumed as part of an
360	// attribute, and the last character matched is not a U+003B
361	// SEMICOLON character (;), and the next character is either a
362	// U+003D EQUALS SIGN character (=) or an alphanumeric ASCII
363	// character, then, for historical reasons, all the characters
364	// that were matched after the U+0026 AMPERSAND character (&)
365	// must be unconsumed, and nothing is returned. However, if
366	// this next character is in fact a U+003D EQUALS SIGN
367	// character (=), then this is a parse error"
368
369	let unconsume_all = match (self.addnl_allowed, last_matched, next_after) {
370	(_, ';', _) => `false`,
371	(Some(_), _, Some('=')) => {
372	tokenizer.emit_error(Borrowed(
373	"Equals sign after character reference in attribute",
374	));
375	`true`
376	},
377	(Some(_), _, Some(c)) if c.is_ascii_alphanumeric() => `true`,
378	_ => {
379	tokenizer.emit_error(Borrowed(
380	"Character reference does not end with semicolon",
381	));
382	`false`
383	},
384	};
385
386	if unconsume_all {
387	self.unconsume_name(input);
388	self.finish_none()
389	} else {
390	input.push_front(StrTendril::from_slice(&self.name_buf()[name_len..]));
391	self.result = Some(CharRef {
392	chars: [from_u32(c1).unwrap(), from_u32(c2).unwrap()],
393	num_chars: if c2 == `0` { `1` } else { `2` },
394	});
395	Done
396	}
397	},
398	}
399	}
400
401	fn do_bogus_name<Sink: TokenSink>(
402	&mut self,
403	tokenizer: &mut Tokenizer<Sink>,
404	input: &mut BufferQueue,
405	) -> Status {
406	let c = unwrap_or_return!(tokenizer.get_char(input), Stuck);
407	self.name_buf_mut().push_char(c);
408	match c {
409	_ if c.is_ascii_alphanumeric() => return Progress,
410	';' => self.emit_name_error(tokenizer),
411	_ => (),
412	}
413	self.unconsume_name(input);
414	self.finish_none()
415	}
416
417	pub fn end_of_file<Sink: TokenSink>(
418	&mut self,
419	tokenizer: &mut Tokenizer<Sink>,
420	input: &mut BufferQueue,
421	) {
422	while self.result.is_none() {
423	match self.state {
424	Begin => drop(self.finish_none()),
425
426	Numeric(_) if !self.seen_digit => drop(self.unconsume_numeric(tokenizer, input)),
427
428	Numeric(_) \| NumericSemicolon => {
429	tokenizer.emit_error(Borrowed("EOF in numeric character reference"));
430	self.finish_numeric(tokenizer);
431	},
432
433	Named => drop(self.finish_named(tokenizer, input, None)),
434
435	BogusName => {
436	self.unconsume_name(input);
437	self.finish_none();
438	},
439
440	Octothorpe => {
441	input.push_front(StrTendril::from_slice("#"));
442	tokenizer.emit_error(Borrowed("EOF after '#' in character reference"));
443	self.finish_none();
444	},
445	}
446	}
447	}
448	}
449