1 | use std::fmt::Write; |
2 | |
3 | use crate::common::{is_name_char, is_name_start_char, is_whitespace_char}; |
4 | use crate::reader::error::SyntaxError; |
5 | use crate::reader::lexer::Token; |
6 | |
7 | use super::{DoctypeSubstate, PullParser, QuoteToken, Result, State}; |
8 | |
9 | impl PullParser { |
10 | pub fn inside_doctype(&mut self, t: Token, substate: DoctypeSubstate) -> Option<Result> { |
11 | if let Some(ref mut doctype) = self.data.doctype { |
12 | write!(doctype, " {t}" ).ok()?; |
13 | if doctype.len() > self.config.max_data_length { |
14 | return Some(self.error(SyntaxError::ExceededConfiguredLimit)); |
15 | } |
16 | } |
17 | |
18 | match substate { |
19 | DoctypeSubstate::Outside => match t { |
20 | Token::TagEnd => self.into_state_continue(State::OutsideTag), |
21 | Token::MarkupDeclarationStart => { |
22 | self.buf.clear(); |
23 | self.into_state_continue(State::InsideDoctype(DoctypeSubstate::InsideName)) |
24 | }, |
25 | Token::Character('%' ) => { |
26 | self.data.ref_data.clear(); |
27 | self.data.ref_data.push('%' ); |
28 | self.into_state_continue(State::InsideDoctype(DoctypeSubstate::PEReferenceInDtd)) |
29 | }, |
30 | Token::CommentStart => { |
31 | self.into_state_continue(State::InsideDoctype(DoctypeSubstate::Comment)) |
32 | }, |
33 | Token::SingleQuote | Token::DoubleQuote => { |
34 | // just discard string literals |
35 | self.data.quote = super::QuoteToken::from_token(t); |
36 | self.into_state_continue(State::InsideDoctype(DoctypeSubstate::String)) |
37 | }, |
38 | Token::CDataEnd | Token::CDataStart => Some(self.error(SyntaxError::UnexpectedToken(t))), |
39 | // TODO: parse SYSTEM, and [ |
40 | _ => None, |
41 | }, |
42 | DoctypeSubstate::String => match t { |
43 | Token::SingleQuote if self.data.quote != Some(QuoteToken::SingleQuoteToken) => None, |
44 | Token::DoubleQuote if self.data.quote != Some(QuoteToken::DoubleQuoteToken) => None, |
45 | Token::SingleQuote | Token::DoubleQuote => { |
46 | self.data.quote = None; |
47 | self.into_state_continue(State::InsideDoctype(DoctypeSubstate::Outside)) |
48 | }, |
49 | _ => None, |
50 | }, |
51 | DoctypeSubstate::Comment => match t { |
52 | Token::CommentEnd => { |
53 | self.into_state_continue(State::InsideDoctype(DoctypeSubstate::Outside)) |
54 | }, |
55 | _ => None, |
56 | }, |
57 | DoctypeSubstate::InsideName => match t { |
58 | Token::Character(c @ 'A' ..='Z' ) => { |
59 | self.buf.push(c); |
60 | None |
61 | }, |
62 | Token::Character(c) if is_whitespace_char(c) => { |
63 | let buf = self.take_buf(); |
64 | match buf.as_str() { |
65 | "ENTITY" => self.into_state_continue(State::InsideDoctype(DoctypeSubstate::BeforeEntityName)), |
66 | "NOTATION" | "ELEMENT" | "ATTLIST" => self.into_state_continue(State::InsideDoctype(DoctypeSubstate::SkipDeclaration)), |
67 | _ => Some(self.error(SyntaxError::UnknownMarkupDeclaration(buf.into()))), |
68 | } |
69 | }, |
70 | _ => Some(self.error(SyntaxError::UnexpectedToken(t))), |
71 | }, |
72 | DoctypeSubstate::BeforeEntityName => { |
73 | self.data.name.clear(); |
74 | match t { |
75 | Token::Character(c) if is_whitespace_char(c) => None, |
76 | Token::Character('%' ) => { // % is for PEDecl |
77 | self.data.name.push('%' ); |
78 | self.into_state_continue(State::InsideDoctype(DoctypeSubstate::PEReferenceDefinitionStart)) |
79 | }, |
80 | Token::Character(c) if is_name_start_char(c) => { |
81 | if self.data.name.len() > self.config.max_name_length { |
82 | return Some(self.error(SyntaxError::ExceededConfiguredLimit)); |
83 | } |
84 | self.data.name.push(c); |
85 | self.into_state_continue(State::InsideDoctype(DoctypeSubstate::EntityName)) |
86 | }, |
87 | _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))), |
88 | } |
89 | }, |
90 | DoctypeSubstate::EntityName => match t { |
91 | Token::Character(c) if is_whitespace_char(c) => { |
92 | self.into_state_continue(State::InsideDoctype(DoctypeSubstate::BeforeEntityValue)) |
93 | }, |
94 | Token::Character(c) if is_name_char(c) => { |
95 | if self.data.name.len() > self.config.max_name_length { |
96 | return Some(self.error(SyntaxError::ExceededConfiguredLimit)); |
97 | } |
98 | self.data.name.push(c); |
99 | None |
100 | }, |
101 | _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))), |
102 | }, |
103 | DoctypeSubstate::BeforeEntityValue => { |
104 | self.buf.clear(); |
105 | match t { |
106 | Token::Character(c) if is_whitespace_char(c) => None, |
107 | // SYSTEM/PUBLIC not supported |
108 | Token::Character('S' | 'P' ) => { |
109 | let name = self.data.take_name(); |
110 | self.entities.entry(name).or_default(); // Dummy value, but at least the name is recognized |
111 | |
112 | self.into_state_continue(State::InsideDoctype(DoctypeSubstate::SkipDeclaration)) |
113 | }, |
114 | Token::SingleQuote | Token::DoubleQuote => { |
115 | self.data.quote = super::QuoteToken::from_token(t); |
116 | self.into_state_continue(State::InsideDoctype(DoctypeSubstate::EntityValue)) |
117 | }, |
118 | _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))), |
119 | } |
120 | }, |
121 | DoctypeSubstate::EntityValue => match t { |
122 | Token::SingleQuote if self.data.quote != Some(QuoteToken::SingleQuoteToken) => { self.buf.push(' \'' ); None }, |
123 | Token::DoubleQuote if self.data.quote != Some(QuoteToken::DoubleQuoteToken) => { self.buf.push('"' ); None }, |
124 | Token::SingleQuote | Token::DoubleQuote => { |
125 | self.data.quote = None; |
126 | let name = self.data.take_name(); |
127 | let val = self.take_buf(); |
128 | self.entities.entry(name).or_insert(val); // First wins |
129 | self.into_state_continue(State::InsideDoctype(DoctypeSubstate::SkipDeclaration)) // FIXME |
130 | }, |
131 | Token::ReferenceStart | Token::Character('&' ) => { |
132 | self.data.ref_data.clear(); |
133 | self.into_state_continue(State::InsideDoctype(DoctypeSubstate::NumericReferenceStart)) |
134 | }, |
135 | Token::Character('%' ) => { |
136 | self.data.ref_data.clear(); |
137 | self.data.ref_data.push('%' ); // include literal % in the name to distinguish from regular entities |
138 | self.into_state_continue(State::InsideDoctype(DoctypeSubstate::PEReferenceInValue)) |
139 | }, |
140 | Token::Character(c) if !self.is_valid_xml_char(c) => { |
141 | Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32))) |
142 | }, |
143 | Token::Character(c) => { |
144 | self.buf.push(c); |
145 | None |
146 | }, |
147 | _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))), |
148 | }, |
149 | DoctypeSubstate::PEReferenceDefinitionStart => match t { |
150 | Token::Character(c) if is_whitespace_char(c) => None, |
151 | Token::Character(c) if is_name_start_char(c) => { |
152 | debug_assert_eq!(self.data.name, "%" ); |
153 | self.data.name.push(c); |
154 | self.into_state_continue(State::InsideDoctype(DoctypeSubstate::PEReferenceDefinition)) |
155 | }, |
156 | _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))), |
157 | }, |
158 | DoctypeSubstate::PEReferenceDefinition => match t { |
159 | Token::Character(c) if is_name_char(c) => { |
160 | if self.data.name.len() > self.config.max_name_length { |
161 | return Some(self.error(SyntaxError::ExceededConfiguredLimit)); |
162 | } |
163 | self.data.name.push(c); |
164 | None |
165 | }, |
166 | Token::Character(c) if is_whitespace_char(c) => { |
167 | self.into_state_continue(State::InsideDoctype(DoctypeSubstate::BeforeEntityValue)) |
168 | }, |
169 | _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))), |
170 | }, |
171 | DoctypeSubstate::PEReferenceInDtd => match t { |
172 | Token::Character(c) if is_name_char(c) => { |
173 | self.data.ref_data.push(c); |
174 | None |
175 | }, |
176 | Token::ReferenceEnd | Token::Character(';' ) => { |
177 | let name = self.data.take_ref_data(); |
178 | match self.entities.get(&name) { |
179 | Some(ent) => { |
180 | if let Err(e) = self.lexer.reparse(ent) { |
181 | return Some(Err(e)); |
182 | } |
183 | self.into_state_continue(State::InsideDoctype(DoctypeSubstate::Outside)) |
184 | }, |
185 | None => Some(self.error(SyntaxError::UndefinedEntity(name.into()))), |
186 | } |
187 | }, |
188 | _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))), |
189 | }, |
190 | DoctypeSubstate::PEReferenceInValue => match t { |
191 | Token::Character(c) if is_name_char(c) => { |
192 | self.data.ref_data.push(c); |
193 | None |
194 | }, |
195 | Token::ReferenceEnd | Token::Character(';' ) => { |
196 | let name = self.data.take_ref_data(); |
197 | match self.entities.get(&name) { |
198 | Some(ent) => { |
199 | self.buf.push_str(ent); |
200 | self.into_state_continue(State::InsideDoctype(DoctypeSubstate::EntityValue)) |
201 | }, |
202 | None => Some(self.error(SyntaxError::UndefinedEntity(name.into()))), |
203 | } |
204 | }, |
205 | _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))), |
206 | }, |
207 | DoctypeSubstate::NumericReferenceStart => match t { |
208 | Token::Character('#' ) => { |
209 | self.into_state_continue(State::InsideDoctype(DoctypeSubstate::NumericReference)) |
210 | }, |
211 | Token::Character(c) if !self.is_valid_xml_char(c) => { |
212 | Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32))) |
213 | }, |
214 | Token::Character(c) => { |
215 | self.buf.push('&' ); |
216 | self.buf.push(c); |
217 | // named entities are not expanded inside doctype |
218 | self.into_state_continue(State::InsideDoctype(DoctypeSubstate::EntityValue)) |
219 | }, |
220 | _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))), |
221 | }, |
222 | DoctypeSubstate::NumericReference => match t { |
223 | Token::ReferenceEnd | Token::Character(';' ) => { |
224 | let r = self.data.take_ref_data(); |
225 | // https://www.w3.org/TR/xml/#sec-entexpand |
226 | match self.numeric_reference_from_str(&r) { |
227 | Ok(c) => { |
228 | self.buf.push(c); |
229 | self.into_state_continue(State::InsideDoctype(DoctypeSubstate::EntityValue)) |
230 | }, |
231 | Err(e) => Some(self.error(e)), |
232 | } |
233 | }, |
234 | Token::Character(c) if !self.is_valid_xml_char(c) => { |
235 | Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32))) |
236 | }, |
237 | Token::Character(c) => { |
238 | self.data.ref_data.push(c); |
239 | None |
240 | }, |
241 | _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))), |
242 | }, |
243 | DoctypeSubstate::SkipDeclaration => match t { |
244 | Token::TagEnd => { |
245 | self.into_state_continue(State::InsideDoctype(DoctypeSubstate::Outside)) |
246 | }, |
247 | _ => None, |
248 | }, |
249 | } |
250 | } |
251 | } |
252 | |