// Copyright 2014-2017 The html5ever Project Developers. See the
// COPYRIGHT file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
| 9 | |
//! High-level interface to the parser.
| 11 | |
| 12 | use crate::buffer_queue::BufferQueue; |
| 13 | use crate::tokenizer::{Tokenizer, TokenizerOpts, TokenizerResult}; |
| 14 | use crate::tree_builder::{create_element, TreeBuilder, TreeBuilderOpts, TreeSink}; |
| 15 | use crate::{Attribute, QualName}; |
| 16 | |
| 17 | use std::borrow::Cow; |
| 18 | |
| 19 | use crate::tendril; |
| 20 | use crate::tendril::stream::{TendrilSink, Utf8LossyDecoder}; |
| 21 | use crate::tendril::StrTendril; |
| 22 | |
| 23 | /// All-encompassing options struct for the parser. |
| 24 | #[derive (Clone, Default)] |
| 25 | pub struct ParseOpts { |
| 26 | /// Tokenizer options. |
| 27 | pub tokenizer: TokenizerOpts, |
| 28 | |
| 29 | /// Tree builder options. |
| 30 | pub tree_builder: TreeBuilderOpts, |
| 31 | } |
| 32 | |
| 33 | /// Parse an HTML document |
| 34 | /// |
| 35 | /// The returned value implements `tendril::TendrilSink` |
| 36 | /// so that Unicode input may be provided incrementally, |
| 37 | /// or all at once with the `one` method. |
| 38 | /// |
| 39 | /// If your input is bytes, use `Parser::from_utf8`. |
| 40 | pub fn parse_document<Sink>(sink: Sink, opts: ParseOpts) -> Parser<Sink> |
| 41 | where |
| 42 | Sink: TreeSink, |
| 43 | { |
| 44 | let tb: TreeBuilder<::Handle, …> = TreeBuilder::new(sink, opts.tree_builder); |
| 45 | let tok: Tokenizer::Handle, …>> = Tokenizer::new(sink:tb, opts.tokenizer); |
| 46 | Parser { |
| 47 | tokenizer: tok, |
| 48 | input_buffer: BufferQueue::default(), |
| 49 | } |
| 50 | } |
| 51 | |
| 52 | /// Parse an HTML fragment |
| 53 | /// |
| 54 | /// The returned value implements `tendril::TendrilSink` |
| 55 | /// so that Unicode input may be provided incrementally, |
| 56 | /// or all at once with the `one` method. |
| 57 | /// |
| 58 | /// If your input is bytes, use `Parser::from_utf8`. |
| 59 | pub fn parse_fragment<Sink>( |
| 60 | sink: Sink, |
| 61 | opts: ParseOpts, |
| 62 | context_name: QualName, |
| 63 | context_attrs: Vec<Attribute>, |
| 64 | ) -> Parser<Sink> |
| 65 | where |
| 66 | Sink: TreeSink, |
| 67 | { |
| 68 | let context_elem: ::Handle = create_element(&sink, context_name, context_attrs); |
| 69 | parse_fragment_for_element(sink, opts, context_element:context_elem, form_element:None) |
| 70 | } |
| 71 | |
| 72 | /// Like `parse_fragment`, but with an existing context element |
| 73 | /// and optionally a form element. |
| 74 | pub fn parse_fragment_for_element<Sink>( |
| 75 | sink: Sink, |
| 76 | opts: ParseOpts, |
| 77 | context_element: Sink::Handle, |
| 78 | form_element: Option<Sink::Handle>, |
| 79 | ) -> Parser<Sink> |
| 80 | where |
| 81 | Sink: TreeSink, |
| 82 | { |
| 83 | let tb: TreeBuilder<::Handle, …> = TreeBuilder::new_for_fragment(sink, context_elem:context_element, form_elem:form_element, opts.tree_builder); |
| 84 | let tok_opts: TokenizerOpts = TokenizerOpts { |
| 85 | initial_state: Some(tb.tokenizer_state_for_context_elem()), |
| 86 | ..opts.tokenizer |
| 87 | }; |
| 88 | let tok: Tokenizer::Handle, …>> = Tokenizer::new(sink:tb, tok_opts); |
| 89 | Parser { |
| 90 | tokenizer: tok, |
| 91 | input_buffer: BufferQueue::default(), |
| 92 | } |
| 93 | } |
| 94 | |
| 95 | /// An HTML parser, |
| 96 | /// ready to receive Unicode input through the `tendril::TendrilSink` trait’s methods. |
| 97 | pub struct Parser<Sink> |
| 98 | where |
| 99 | Sink: TreeSink, |
| 100 | { |
| 101 | pub tokenizer: Tokenizer<TreeBuilder<Sink::Handle, Sink>>, |
| 102 | pub input_buffer: BufferQueue, |
| 103 | } |
| 104 | |
| 105 | impl<Sink: TreeSink> TendrilSink<tendril::fmt::UTF8> for Parser<Sink> { |
| 106 | fn process(&mut self, t: StrTendril) { |
| 107 | self.input_buffer.push_back(buf:t); |
| 108 | // FIXME: Properly support </script> somehow. |
| 109 | while let TokenizerResult::Script(_) = self.tokenizer.feed(&self.input_buffer) {} |
| 110 | } |
| 111 | |
| 112 | // FIXME: Is it too noisy to report every character decoding error? |
| 113 | fn error(&mut self, desc: Cow<'static, str>) { |
| 114 | self.tokenizer.sink.sink.parse_error(msg:desc) |
| 115 | } |
| 116 | |
| 117 | type Output = Sink::Output; |
| 118 | |
| 119 | fn finish(self) -> Self::Output { |
| 120 | // FIXME: Properly support </script> somehow. |
| 121 | while let TokenizerResult::Script(_) = self.tokenizer.feed(&self.input_buffer) {} |
| 122 | assert!(self.input_buffer.is_empty()); |
| 123 | self.tokenizer.end(); |
| 124 | self.tokenizer.sink.sink.finish() |
| 125 | } |
| 126 | } |
| 127 | |
| 128 | impl<Sink: TreeSink> Parser<Sink> { |
| 129 | /// Wrap this parser into a `TendrilSink` that accepts UTF-8 bytes. |
| 130 | /// |
| 131 | /// Use this when your input is bytes that are known to be in the UTF-8 encoding. |
| 132 | /// Decoding is lossy, like `String::from_utf8_lossy`. |
| 133 | #[allow (clippy::wrong_self_convention)] |
| 134 | pub fn from_utf8(self) -> Utf8LossyDecoder<Self> { |
| 135 | Utf8LossyDecoder::new(self) |
| 136 | } |
| 137 | } |
| 138 | |