1 | // Copyright 2014-2017 The html5ever Project Developers. See the |
2 | // COPYRIGHT file at the top-level directory of this distribution. |
3 | // |
4 | // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or |
5 | // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license |
6 | // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your |
7 | // option. This file may not be copied, modified, or distributed |
8 | // except according to those terms. |
9 | |
10 | //! High-level interface to the parser. |
11 | |
12 | use crate::buffer_queue::BufferQueue; |
13 | use crate::tokenizer::{Tokenizer, TokenizerOpts, TokenizerResult}; |
14 | use crate::tree_builder::{create_element, TreeBuilder, TreeBuilderOpts, TreeSink}; |
15 | use crate::{Attribute, QualName}; |
16 | |
17 | use std::borrow::Cow; |
18 | |
19 | use crate::tendril; |
20 | use crate::tendril::stream::{TendrilSink, Utf8LossyDecoder}; |
21 | use crate::tendril::StrTendril; |
22 | |
/// All-encompassing options struct for the parser.
///
/// Bundles the options for both stages of the parsing pipeline;
/// pass to `parse_document` or `parse_fragment`. The derived
/// `Default` yields the default options of both stages.
#[derive (Clone, Default)]
pub struct ParseOpts {
    /// Tokenizer options.
    pub tokenizer: TokenizerOpts,

    /// Tree builder options.
    pub tree_builder: TreeBuilderOpts,
}
32 | |
33 | /// Parse an HTML document |
34 | /// |
35 | /// The returned value implements `tendril::TendrilSink` |
36 | /// so that Unicode input may be provided incrementally, |
37 | /// or all at once with the `one` method. |
38 | /// |
39 | /// If your input is bytes, use `Parser::from_utf8`. |
40 | pub fn parse_document<Sink>(sink: Sink, opts: ParseOpts) -> Parser<Sink> |
41 | where |
42 | Sink: TreeSink, |
43 | { |
44 | let tb: TreeBuilder<::Handle, …> = TreeBuilder::new(sink, opts:opts.tree_builder); |
45 | let tok: Tokenizer::Handle, …>> = Tokenizer::new(sink:tb, opts:opts.tokenizer); |
46 | Parser { |
47 | tokenizer: tok, |
48 | input_buffer: BufferQueue::new(), |
49 | } |
50 | } |
51 | |
52 | /// Parse an HTML fragment |
53 | /// |
54 | /// The returned value implements `tendril::TendrilSink` |
55 | /// so that Unicode input may be provided incrementally, |
56 | /// or all at once with the `one` method. |
57 | /// |
58 | /// If your input is bytes, use `Parser::from_utf8`. |
59 | pub fn parse_fragment<Sink>( |
60 | mut sink: Sink, |
61 | opts: ParseOpts, |
62 | context_name: QualName, |
63 | context_attrs: Vec<Attribute>, |
64 | ) -> Parser<Sink> |
65 | where |
66 | Sink: TreeSink, |
67 | { |
68 | let context_elem: ::Handle = create_element(&mut sink, context_name, context_attrs); |
69 | parse_fragment_for_element(sink, opts, context_element:context_elem, form_element:None) |
70 | } |
71 | |
72 | /// Like `parse_fragment`, but with an existing context element |
73 | /// and optionally a form element. |
74 | pub fn parse_fragment_for_element<Sink>( |
75 | sink: Sink, |
76 | opts: ParseOpts, |
77 | context_element: Sink::Handle, |
78 | form_element: Option<Sink::Handle>, |
79 | ) -> Parser<Sink> |
80 | where |
81 | Sink: TreeSink, |
82 | { |
83 | let tb: TreeBuilder<::Handle, …> = TreeBuilder::new_for_fragment(sink, context_elem:context_element, form_elem:form_element, opts:opts.tree_builder); |
84 | let tok_opts: TokenizerOpts = TokenizerOpts { |
85 | initial_state: Some(tb.tokenizer_state_for_context_elem()), |
86 | ..opts.tokenizer |
87 | }; |
88 | let tok: Tokenizer::Handle, …>> = Tokenizer::new(sink:tb, tok_opts); |
89 | Parser { |
90 | tokenizer: tok, |
91 | input_buffer: BufferQueue::new(), |
92 | } |
93 | } |
94 | |
/// An HTML parser,
/// ready to receive Unicode input through the `tendril::TendrilSink` trait’s methods.
pub struct Parser<Sink>
where
    Sink: TreeSink,
{
    /// The tokenizer, wrapping the tree builder, which in turn wraps the
    /// caller-supplied `Sink`.
    pub tokenizer: Tokenizer<TreeBuilder<Sink::Handle, Sink>>,
    /// Input chunks queued for the tokenizer but not yet consumed.
    pub input_buffer: BufferQueue,
}
104 | |
105 | impl<Sink: TreeSink> TendrilSink<tendril::fmt::UTF8> for Parser<Sink> { |
106 | fn process(&mut self, t: StrTendril) { |
107 | self.input_buffer.push_back(buf:t); |
108 | // FIXME: Properly support </script> somehow. |
109 | while let TokenizerResult::Script(_) = self.tokenizer.feed(&mut self.input_buffer) {} |
110 | } |
111 | |
112 | // FIXME: Is it too noisy to report every character decoding error? |
113 | fn error(&mut self, desc: Cow<'static, str>) { |
114 | self.tokenizer.sink.sink.parse_error(msg:desc) |
115 | } |
116 | |
117 | type Output = Sink::Output; |
118 | |
119 | fn finish(mut self) -> Self::Output { |
120 | // FIXME: Properly support </script> somehow. |
121 | while let TokenizerResult::Script(_) = self.tokenizer.feed(&mut self.input_buffer) {} |
122 | assert!(self.input_buffer.is_empty()); |
123 | self.tokenizer.end(); |
124 | self.tokenizer.sink.sink.finish() |
125 | } |
126 | } |
127 | |
impl<Sink: TreeSink> Parser<Sink> {
    /// Wrap this parser into a `TendrilSink` that accepts UTF-8 bytes.
    ///
    /// Use this when your input is bytes that are known to be in the UTF-8 encoding.
    /// Decoding is lossy, like `String::from_utf8_lossy`.
    // `from_*` conventionally names a constructor taking its source by
    // value; this method instead consumes `self`, hence the suppression.
    #[allow (clippy::wrong_self_convention)]
    pub fn from_utf8(self) -> Utf8LossyDecoder<Self> {
        Utf8LossyDecoder::new(self)
    }
}
138 | |