1 | mod tables; |
2 | |
3 | pub use tables::*; |
4 | |
5 | use core::convert::TryFrom; |
6 | use core::str::from_utf8_unchecked; |
7 | |
8 | use alloc::borrow::Cow; |
9 | use alloc::string::String; |
10 | use alloc::vec::Vec; |
11 | |
12 | #[cfg (feature = "std" )] |
13 | use std::io::{self, Write}; |
14 | |
15 | use crate::functions::*; |
16 | |
17 | /// Decode html entities in a given string. |
18 | pub fn decode_html_entities<S: ?Sized + AsRef<str>>(text: &S) -> Cow<str> { |
19 | let text = text.as_ref(); |
20 | let text_bytes = text.as_bytes(); |
21 | let text_length = text_bytes.len(); |
22 | |
23 | let mut p = 0; |
24 | let mut ep = 0; |
25 | let mut e; |
26 | |
27 | let mut step = 0; |
28 | |
29 | let (mut v, mut start) = loop { |
30 | if p == text_length { |
31 | return Cow::from(text); |
32 | } |
33 | |
34 | e = text_bytes[p]; |
35 | |
36 | match step { |
37 | 0 => { |
38 | if e == b'&' { |
39 | step = 1; |
40 | ep = p; |
41 | } |
42 | } |
43 | 1 => { |
44 | match e { |
45 | b'#' => { |
46 | step = 3; |
47 | } |
48 | b';' => { |
49 | // incorrect |
50 | step = 0; |
51 | } |
52 | _ => { |
53 | step = 2; |
54 | } |
55 | } |
56 | } |
57 | 2 => { |
58 | if e == b';' { |
59 | // named |
60 | let mut v = Vec::with_capacity(text_length); |
61 | |
62 | v.extend_from_slice(&text_bytes[..ep]); |
63 | |
64 | let name = &text_bytes[(ep + 1)..p]; |
65 | |
66 | match NAMED_ENTITIES.binary_search_by(|(t_name, _)| t_name.cmp(&name)) { |
67 | Ok(index) => { |
68 | v.extend_from_slice(NAMED_ENTITIES[index].1.as_bytes()); |
69 | break (v, p + 1); |
70 | } |
71 | Err(_) => break (v, ep), |
72 | } |
73 | } |
74 | } |
75 | 3 => { |
76 | match e { |
77 | b'x' | b'X' => { |
78 | step = 5; |
79 | } |
80 | b';' => { |
81 | // incorrect |
82 | step = 0; |
83 | } |
84 | _ => step = 4, |
85 | } |
86 | } |
87 | 4 => { |
88 | if e == b';' { |
89 | // numeric |
90 | let mut v = Vec::with_capacity(text_length); |
91 | |
92 | v.extend_from_slice(&text_bytes[..ep]); |
93 | |
94 | let number = unsafe { text.get_unchecked((ep + 2)..p) }; |
95 | |
96 | match number.parse::<u32>() { |
97 | Ok(number) => { |
98 | match char::try_from(number) { |
99 | Ok(c) => { |
100 | write_char_to_vec(c, &mut v); |
101 | break (v, p + 1); |
102 | } |
103 | Err(_) => break (v, ep), |
104 | } |
105 | } |
106 | Err(_) => break (v, ep), |
107 | } |
108 | } |
109 | } |
110 | 5 => { |
111 | match e { |
112 | b';' => { |
113 | // incorrect |
114 | step = 0; |
115 | } |
116 | _ => step = 6, |
117 | } |
118 | } |
119 | 6 => { |
120 | if e == b';' { |
121 | // hex |
122 | let mut v = Vec::with_capacity(text_length); |
123 | |
124 | v.extend_from_slice(&text_bytes[..ep]); |
125 | |
126 | let hex = unsafe { text.get_unchecked((ep + 3)..p) }; |
127 | |
128 | match u32::from_str_radix(hex, 16) { |
129 | Ok(number) => { |
130 | match char::try_from(number) { |
131 | Ok(c) => { |
132 | write_char_to_vec(c, &mut v); |
133 | break (v, p + 1); |
134 | } |
135 | Err(_) => break (v, ep), |
136 | } |
137 | } |
138 | Err(_) => break (v, ep), |
139 | } |
140 | } |
141 | } |
142 | _ => unreachable!(), |
143 | } |
144 | |
145 | p += 1; |
146 | }; |
147 | |
148 | p += 1; |
149 | |
150 | step = 0; |
151 | |
152 | for e in text_bytes[p..].iter().copied() { |
153 | match step { |
154 | 0 => { |
155 | if e == b'&' { |
156 | step = 1; |
157 | ep = p; |
158 | } |
159 | } |
160 | 1 => { |
161 | match e { |
162 | b'#' => { |
163 | step = 3; |
164 | } |
165 | b';' => { |
166 | // incorrect |
167 | step = 0; |
168 | } |
169 | _ => { |
170 | step = 2; |
171 | } |
172 | } |
173 | } |
174 | 2 => { |
175 | if e == b';' { |
176 | // named |
177 | step = 0; |
178 | |
179 | let name = &text_bytes[(ep + 1)..p]; |
180 | |
181 | if let Ok(index) = |
182 | NAMED_ENTITIES.binary_search_by(|(t_name, _)| t_name.cmp(&name)) |
183 | { |
184 | v.extend_from_slice(&text_bytes[start..ep]); |
185 | start = p + 1; |
186 | v.extend_from_slice(NAMED_ENTITIES[index].1.as_bytes()); |
187 | } |
188 | } |
189 | } |
190 | 3 => { |
191 | match e { |
192 | b'x' | b'X' => { |
193 | step = 5; |
194 | } |
195 | b';' => { |
196 | // incorrect |
197 | step = 0; |
198 | } |
199 | _ => step = 4, |
200 | } |
201 | } |
202 | 4 => { |
203 | if e == b';' { |
204 | // numeric |
205 | step = 0; |
206 | |
207 | let number = unsafe { text.get_unchecked((ep + 2)..p) }; |
208 | |
209 | if let Ok(number) = number.parse::<u32>() { |
210 | if let Ok(c) = char::try_from(number) { |
211 | v.extend_from_slice(&text_bytes[start..ep]); |
212 | start = p + 1; |
213 | write_char_to_vec(c, &mut v); |
214 | } |
215 | } |
216 | } |
217 | } |
218 | 5 => { |
219 | match e { |
220 | b';' => { |
221 | // incorrect |
222 | step = 0; |
223 | } |
224 | _ => step = 6, |
225 | } |
226 | } |
227 | 6 => { |
228 | if e == b';' { |
229 | // hex |
230 | step = 0; |
231 | |
232 | let hex = unsafe { text.get_unchecked((ep + 3)..p) }; |
233 | |
234 | if let Ok(number) = u32::from_str_radix(hex, 16) { |
235 | if let Ok(c) = char::try_from(number) { |
236 | v.extend_from_slice(&text_bytes[start..ep]); |
237 | start = p + 1; |
238 | write_char_to_vec(c, &mut v); |
239 | } |
240 | } |
241 | } |
242 | } |
243 | _ => unreachable!(), |
244 | } |
245 | |
246 | p += 1; |
247 | } |
248 | |
249 | v.extend_from_slice(&text_bytes[start..p]); |
250 | |
251 | Cow::from(unsafe { String::from_utf8_unchecked(v) }) |
252 | } |
253 | |
254 | /// Decode html entities in a given string to a mutable `String` reference and return the decoded string slice. |
255 | pub fn decode_html_entities_to_string<S: AsRef<str>>(text: S, output: &mut String) -> &str { |
256 | unsafe { from_utf8_unchecked(decode_html_entities_to_vec(text, output.as_mut_vec())) } |
257 | } |
258 | |
259 | /// Decode html entities in a given string to a mutable `Vec<u8>` reference and return the decoded data slice. |
260 | pub fn decode_html_entities_to_vec<S: AsRef<str>>(text: S, output: &mut Vec<u8>) -> &[u8] { |
261 | let text = text.as_ref(); |
262 | let text_bytes = text.as_bytes(); |
263 | let text_length = text_bytes.len(); |
264 | |
265 | output.reserve(text_length); |
266 | |
267 | let current_length = output.len(); |
268 | |
269 | let mut start = 0; |
270 | let mut end = 0; |
271 | let mut ep = 0; |
272 | |
273 | let mut step = 0; |
274 | |
275 | for e in text_bytes.iter().copied() { |
276 | match step { |
277 | 0 => { |
278 | if e == b'&' { |
279 | step = 1; |
280 | ep = end; |
281 | } |
282 | } |
283 | 1 => { |
284 | match e { |
285 | b'#' => { |
286 | step = 3; |
287 | } |
288 | b';' => { |
289 | // incorrect |
290 | step = 0; |
291 | } |
292 | _ => { |
293 | step = 2; |
294 | } |
295 | } |
296 | } |
297 | 2 => { |
298 | if e == b';' { |
299 | // named |
300 | step = 0; |
301 | |
302 | let name = &text_bytes[(ep + 1)..end]; |
303 | |
304 | if let Ok(index) = |
305 | NAMED_ENTITIES.binary_search_by(|(t_name, _)| t_name.cmp(&name)) |
306 | { |
307 | output.extend_from_slice(&text_bytes[start..ep]); |
308 | start = end + 1; |
309 | output.extend_from_slice(NAMED_ENTITIES[index].1.as_bytes()); |
310 | } |
311 | } |
312 | } |
313 | 3 => { |
314 | match e { |
315 | b'x' | b'X' => { |
316 | step = 5; |
317 | } |
318 | b';' => { |
319 | // incorrect |
320 | step = 0; |
321 | } |
322 | _ => step = 4, |
323 | } |
324 | } |
325 | 4 => { |
326 | if e == b';' { |
327 | // numeric |
328 | step = 0; |
329 | |
330 | let number = unsafe { text.get_unchecked((ep + 2)..end) }; |
331 | |
332 | if let Ok(number) = number.parse::<u32>() { |
333 | if let Ok(c) = char::try_from(number) { |
334 | output.extend_from_slice(&text_bytes[start..ep]); |
335 | start = end + 1; |
336 | write_char_to_vec(c, output); |
337 | } |
338 | } |
339 | } |
340 | } |
341 | 5 => { |
342 | match e { |
343 | b';' => { |
344 | // incorrect |
345 | step = 0; |
346 | } |
347 | _ => step = 6, |
348 | } |
349 | } |
350 | 6 => { |
351 | if e == b';' { |
352 | // hex |
353 | step = 0; |
354 | |
355 | let hex = unsafe { text.get_unchecked((ep + 3)..end) }; |
356 | |
357 | if let Ok(number) = u32::from_str_radix(hex, 16) { |
358 | if let Ok(c) = char::try_from(number) { |
359 | output.extend_from_slice(&text_bytes[start..ep]); |
360 | start = end + 1; |
361 | write_char_to_vec(c, output); |
362 | } |
363 | } |
364 | } |
365 | } |
366 | _ => unreachable!(), |
367 | } |
368 | |
369 | end += 1; |
370 | } |
371 | |
372 | output.extend_from_slice(&text_bytes[start..end]); |
373 | |
374 | &output[current_length..] |
375 | } |
376 | |
/// Decode html entities in a given string to a writer.
///
/// Named references (`&amp;`), decimal references (`&#65;`) and hexadecimal
/// references (`&#x41;` / `&#X41;`) are decoded. A reference runs from an `&`
/// to the next `;`; anything that resolves neither to a known name nor to a
/// valid Unicode scalar value is written through unchanged. An `&` met while a
/// reference is still open starts a new candidate, so a stray ampersand does
/// not hide the entity that follows it. Numeric references must consist of
/// digits only; a signed form such as `&#+65;` is not decoded.
///
/// # Errors
///
/// Returns the first error reported by `output`; text written before the
/// failure stays written.
#[cfg(feature = "std")]
pub fn decode_html_entities_to_writer<S: AsRef<str>, W: Write>(
    text: S,
    output: &mut W,
) -> Result<(), io::Error> {
    /// Replacement produced by a successfully resolved reference.
    enum Entity {
        Named(&'static str),
        Char(char),
    }

    /// Resolves the bytes found between `&` and `;` (both exclusive), or
    /// returns `None` when they do not form a recognised reference.
    fn resolve(body: &[u8]) -> Option<Entity> {
        let (&marker, rest) = body.split_first()?;

        if marker != b'#' {
            return NAMED_ENTITIES
                .binary_search_by(|(t_name, _)| t_name.cmp(&body))
                .ok()
                .map(|index| Entity::Named(NAMED_ENTITIES[index].1));
        }

        let (digits, radix) = match rest.split_first() {
            Some((&b'x', hex)) | Some((&b'X', hex)) => (hex, 16),
            _ => (rest, 10),
        };

        if digits.is_empty() {
            return None;
        }

        // Accumulate digit by digit instead of going through `str::parse`,
        // which would also accept a leading `+`. Overflow means "not a
        // reference", exactly like a parse failure.
        let mut code_point = 0u32;
        for &digit in digits {
            let value = char::from(digit).to_digit(radix)?;
            code_point = code_point.checked_mul(radix)?.checked_add(value)?;
        }

        char::try_from(code_point).ok().map(Entity::Char)
    }

    let bytes = text.as_ref().as_bytes();

    // Everything before this offset has already been written to `output`.
    let mut copied = 0;
    // Offset of the `&` that opened the reference currently being scanned.
    let mut pending: Option<usize> = None;

    for (index, &byte) in bytes.iter().enumerate() {
        match byte {
            b'&' => pending = Some(index),
            b';' => {
                if let Some(amp) = pending.take() {
                    if let Some(entity) = resolve(&bytes[(amp + 1)..index]) {
                        output.write_all(&bytes[copied..amp])?;

                        match entity {
                            Entity::Named(replacement) => {
                                output.write_all(replacement.as_bytes())?
                            }
                            Entity::Char(c) => write_char_to_writer(c, output)?,
                        }

                        copied = index + 1;
                    }
                }
            }
            _ => {}
        }
    }

    output.write_all(&bytes[copied..])
}
491 | |