1// Copyright 2014-2017 The html5ever Project Developers. See the
2// COPYRIGHT file at the top-level directory of this distribution.
3//
4// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7// option. This file may not be copied, modified, or distributed
8// except according to those terms.
9
10//! High-level interface to the parser.
11
12use crate::buffer_queue::BufferQueue;
13use crate::tokenizer::{Tokenizer, TokenizerOpts, TokenizerResult};
14use crate::tree_builder::{create_element, TreeBuilder, TreeBuilderOpts, TreeSink};
15use crate::{Attribute, QualName};
16
17use std::borrow::Cow;
18
19use crate::tendril;
20use crate::tendril::stream::{TendrilSink, Utf8LossyDecoder};
21use crate::tendril::StrTendril;
22
23/// All-encompassing options struct for the parser.
24#[derive(Clone, Default)]
25pub struct ParseOpts {
26 /// Tokenizer options.
27 pub tokenizer: TokenizerOpts,
28
29 /// Tree builder options.
30 pub tree_builder: TreeBuilderOpts,
31}
32
33/// Parse an HTML document
34///
35/// The returned value implements `tendril::TendrilSink`
36/// so that Unicode input may be provided incrementally,
37/// or all at once with the `one` method.
38///
39/// If your input is bytes, use `Parser::from_utf8`.
40pub fn parse_document<Sink>(sink: Sink, opts: ParseOpts) -> Parser<Sink>
41where
42 Sink: TreeSink,
43{
44 let tb: TreeBuilder<::Handle, …> = TreeBuilder::new(sink, opts:opts.tree_builder);
45 let tok: Tokenizer::Handle, …>> = Tokenizer::new(sink:tb, opts:opts.tokenizer);
46 Parser {
47 tokenizer: tok,
48 input_buffer: BufferQueue::new(),
49 }
50}
51
52/// Parse an HTML fragment
53///
54/// The returned value implements `tendril::TendrilSink`
55/// so that Unicode input may be provided incrementally,
56/// or all at once with the `one` method.
57///
58/// If your input is bytes, use `Parser::from_utf8`.
59pub fn parse_fragment<Sink>(
60 mut sink: Sink,
61 opts: ParseOpts,
62 context_name: QualName,
63 context_attrs: Vec<Attribute>,
64) -> Parser<Sink>
65where
66 Sink: TreeSink,
67{
68 let context_elem: ::Handle = create_element(&mut sink, context_name, context_attrs);
69 parse_fragment_for_element(sink, opts, context_element:context_elem, form_element:None)
70}
71
72/// Like `parse_fragment`, but with an existing context element
73/// and optionally a form element.
74pub fn parse_fragment_for_element<Sink>(
75 sink: Sink,
76 opts: ParseOpts,
77 context_element: Sink::Handle,
78 form_element: Option<Sink::Handle>,
79) -> Parser<Sink>
80where
81 Sink: TreeSink,
82{
83 let tb: TreeBuilder<::Handle, …> = TreeBuilder::new_for_fragment(sink, context_elem:context_element, form_elem:form_element, opts:opts.tree_builder);
84 let tok_opts: TokenizerOpts = TokenizerOpts {
85 initial_state: Some(tb.tokenizer_state_for_context_elem()),
86 ..opts.tokenizer
87 };
88 let tok: Tokenizer::Handle, …>> = Tokenizer::new(sink:tb, tok_opts);
89 Parser {
90 tokenizer: tok,
91 input_buffer: BufferQueue::new(),
92 }
93}
94
95/// An HTML parser,
96/// ready to receive Unicode input through the `tendril::TendrilSink` trait’s methods.
97pub struct Parser<Sink>
98where
99 Sink: TreeSink,
100{
101 pub tokenizer: Tokenizer<TreeBuilder<Sink::Handle, Sink>>,
102 pub input_buffer: BufferQueue,
103}
104
105impl<Sink: TreeSink> TendrilSink<tendril::fmt::UTF8> for Parser<Sink> {
106 fn process(&mut self, t: StrTendril) {
107 self.input_buffer.push_back(buf:t);
108 // FIXME: Properly support </script> somehow.
109 while let TokenizerResult::Script(_) = self.tokenizer.feed(&mut self.input_buffer) {}
110 }
111
112 // FIXME: Is it too noisy to report every character decoding error?
113 fn error(&mut self, desc: Cow<'static, str>) {
114 self.tokenizer.sink.sink.parse_error(msg:desc)
115 }
116
117 type Output = Sink::Output;
118
119 fn finish(mut self) -> Self::Output {
120 // FIXME: Properly support </script> somehow.
121 while let TokenizerResult::Script(_) = self.tokenizer.feed(&mut self.input_buffer) {}
122 assert!(self.input_buffer.is_empty());
123 self.tokenizer.end();
124 self.tokenizer.sink.sink.finish()
125 }
126}
127
128impl<Sink: TreeSink> Parser<Sink> {
129 /// Wrap this parser into a `TendrilSink` that accepts UTF-8 bytes.
130 ///
131 /// Use this when your input is bytes that are known to be in the UTF-8 encoding.
132 /// Decoding is lossy, like `String::from_utf8_lossy`.
133 #[allow(clippy::wrong_self_convention)]
134 pub fn from_utf8(self) -> Utf8LossyDecoder<Self> {
135 Utf8LossyDecoder::new(self)
136 }
137}
138