1 | //! Contains parser configuration structure. |
2 | use std::collections::HashMap; |
3 | use std::io::Read; |
4 | |
5 | use crate::reader::EventReader; |
6 | use crate::util::Encoding; |
7 | |
/// Default cap on the length of the string that custom entities may expand into.
///
/// Together with the depth limit below, this defends against the "billion laughs" attack.
const DEFAULT_MAX_ENTITY_EXPANSION_LENGTH: usize = 1_000_000;
/// Default cap on how many times entities may expand into other entities.
const DEFAULT_MAX_ENTITY_EXPANSION_DEPTH: u8 = 10;
11 | |
/// Parser configuration structure. **There are more config methods than public fields — see methods below**.
///
/// This structure contains various configuration options which affect
/// behavior of the parser.
#[derive (Clone, PartialEq, Eq, Debug)]
pub struct ParserConfig {
    /// Whether whitespace in textual events should be removed. Default is false.
    ///
    /// When true, all standalone whitespace will be removed (this means no
    /// `Whitespace` events will be emitted), and leading and trailing whitespace
    /// from `Characters` events will be deleted. If a `Characters` event is empty
    /// after trimming, it will also be omitted from the output stream. This is
    /// possible, however, only if the `whitespace_to_characters` or
    /// `cdata_to_characters` options are set.
    ///
    /// This option does not affect CDATA events, unless the `cdata_to_characters`
    /// option is also set. In that case CDATA content will also be trimmed.
    pub trim_whitespace: bool,

    /// Whether whitespace should be converted to characters.
    /// Default is false.
    ///
    /// If true, `Characters` events with the same content will be emitted
    /// instead of `Whitespace` events. If `trim_whitespace` is also true, these
    /// events will be trimmed to nothing and, consequently, not emitted.
    pub whitespace_to_characters: bool,

    /// Whether CDATA should be converted to characters.
    /// Default is false.
    ///
    /// If true, `Characters` events with the same content will be emitted
    /// instead of `CData` events. If `trim_whitespace` is also true, these events
    /// will be trimmed. If the corresponding CDATA contained nothing but whitespace,
    /// the event will be omitted from the stream.
    pub cdata_to_characters: bool,

    /// Whether comments should be omitted. Default is true.
    ///
    /// If true, `Comment` events will not be emitted at all.
    pub ignore_comments: bool,

    /// Whether sequential `Characters` events should be merged.
    /// Default is true.
    ///
    /// If true, multiple sequential `Characters` events will be merged into
    /// a single event, that is, their data will be concatenated.
    ///
    /// Multiple sequential `Characters` events are only possible if either
    /// `cdata_to_characters` or `ignore_comments` is set. Otherwise character
    /// events will always be separated by other events.
    pub coalesce_characters: bool,

    /// A map of extra entities recognized by the parser. Default is an empty map.
    ///
    /// By default the XML parser recognizes the entities defined in the XML spec. Sometimes,
    /// however, it is convenient to make the parser recognize additional entities which
    /// are also not available through the DTD definitions (especially given that at the moment
    /// DTD parsing is not supported).
    pub extra_entities: HashMap<String, String>,

    /// Whether the parser should ignore the end of stream. Default is false.
    ///
    /// By default the parser will either error out when it encounters a premature end of
    /// stream or complete normally if the end of stream was expected. If you want to continue
    /// reading from a stream whose input is supplied progressively, you can set this option to true.
    /// In this case the parser will allow you to invoke the `next()` method even if a supposed end
    /// of stream has happened.
    ///
    /// Note that support for this functionality is incomplete; for example, the parser will fail if
    /// the premature end of stream happens inside PCDATA. Therefore, use this option at your own risk.
    pub ignore_end_of_stream: bool,

    /// Whether non-unicode entity references get replaced with the replacement character.
    /// Default is false.
    ///
    /// When true, any decimal or hexadecimal character reference that cannot be converted from a
    /// u32 to a char using [std::char::from_u32](https://doc.rust-lang.org/std/char/fn.from_u32.html)
    /// will be converted into the unicode REPLACEMENT CHARACTER (U+FFFD).
    pub replace_unknown_entity_references: bool,

    /// Whether whitespace at the root level of the document is ignored. Default is true.
    ///
    /// By default any whitespace that is not enclosed within at least one level of elements will be
    /// ignored. Setting this value to false will cause root level whitespace events to be emitted.
    ///
    /// **There are more configuration options – see methods below**
    pub ignore_root_level_whitespace: bool,
}
99 | |
100 | impl ParserConfig { |
101 | /// Returns a new config with default values. |
102 | /// |
103 | /// You can tweak default values using builder-like pattern: |
104 | /// |
105 | /// ```rust |
106 | /// use xml::reader::ParserConfig; |
107 | /// |
108 | /// let config = ParserConfig::new() |
109 | /// .trim_whitespace(true) |
110 | /// .ignore_comments(true) |
111 | /// .coalesce_characters(false); |
112 | /// ``` |
113 | #[must_use ] |
114 | #[inline ] |
115 | pub fn new() -> Self { |
116 | Self { |
117 | trim_whitespace: false, |
118 | whitespace_to_characters: false, |
119 | cdata_to_characters: false, |
120 | ignore_comments: true, |
121 | coalesce_characters: true, |
122 | extra_entities: HashMap::new(), |
123 | ignore_end_of_stream: false, |
124 | replace_unknown_entity_references: false, |
125 | ignore_root_level_whitespace: true, |
126 | } |
127 | } |
128 | |
129 | /// Creates an XML reader with this configuration. The reader should be wrapped in a `BufReader`, otherwise parsing may be very slow. |
130 | /// |
131 | /// This is a convenience method for configuring and creating a reader at the same time: |
132 | /// |
133 | /// ```rust |
134 | /// use xml::reader::ParserConfig; |
135 | /// |
136 | /// let mut source: &[u8] = b"..." ; |
137 | /// |
138 | /// let reader = ParserConfig::new() |
139 | /// .trim_whitespace(true) |
140 | /// .ignore_comments(true) |
141 | /// .coalesce_characters(false) |
142 | /// .create_reader(&mut source); |
143 | /// ``` |
144 | /// |
145 | /// This method is exactly equivalent to calling `EventReader::new_with_config()` with |
146 | /// this configuration object. |
147 | #[inline ] |
148 | pub fn create_reader<R: Read>(self, source: R) -> EventReader<R> { |
149 | EventReader::new_with_config(source, self) |
150 | } |
151 | |
152 | /// Adds a new entity mapping and returns an updated config object. |
153 | /// |
154 | /// This is a convenience method for adding external entities mappings to the XML parser. |
155 | /// An example: |
156 | /// |
157 | /// ```rust |
158 | /// use xml::reader::ParserConfig; |
159 | /// |
160 | /// let mut source: &[u8] = b"..." ; |
161 | /// |
162 | /// let reader = ParserConfig::new() |
163 | /// .add_entity("nbsp" , " " ) |
164 | /// .add_entity("copy" , "©" ) |
165 | /// .add_entity("reg" , "®" ) |
166 | /// .create_reader(&mut source); |
167 | /// ``` |
168 | #[must_use ] |
169 | pub fn add_entity<S: Into<String>, T: Into<String>>(mut self, entity: S, value: T) -> Self { |
170 | self.extra_entities.insert(entity.into(), value.into()); |
171 | self |
172 | } |
173 | } |
174 | |
175 | impl Default for ParserConfig { |
176 | #[inline ] |
177 | fn default() -> Self { |
178 | Self::new() |
179 | } |
180 | } |
181 | |
// Setters for the plain `ParserConfig` fields. `gen_setters!` is defined elsewhere;
// it presumably produces the builder-style methods used in the examples above
// (e.g. `.trim_whitespace(true)`), one per listed field.
gen_setters! { ParserConfig,
    /// Whether whitespace in textual events should be removed. Default is false.
    trim_whitespace: val bool,
    /// Whether whitespace should be converted to characters. Default is false.
    whitespace_to_characters: val bool,
    /// Whether CDATA should be converted to characters. Default is false.
    cdata_to_characters: val bool,
    /// Whether comments should be omitted. Default is true.
    ignore_comments: val bool,
    /// Whether sequential `Characters` events should be merged. Default is true.
    coalesce_characters: val bool,
    /// Whether the parser should ignore the end of stream. Default is false.
    ignore_end_of_stream: val bool,
    /// Whether non-unicode entity references get replaced with the replacement character.
    replace_unknown_entity_references: val bool,
    /// Whether whitespace at the root level of the document is ignored. Default is true.
    ignore_root_level_whitespace: val bool
}
192 | |
/// Backwards-compatible extension of `ParserConfig`, which will eventually be merged into the original `ParserConfig` struct.
///
/// It wraps a `ParserConfig` and adds encoding overrides and parsing limits on top of it.
#[derive (Clone, PartialEq, Eq, Debug)]
#[non_exhaustive ]
pub struct ParserConfig2 {
    /// The original `ParserConfig` options that this struct extends.
    pub(crate) c: ParserConfig,

    /// Use this encoding as the default. Necessary for UTF-16 files without BOM.
    /// Default is `None`.
    pub override_encoding: Option<Encoding>,

    /// Allow `<?xml encoding="…">` to contain unsupported encoding names,
    /// and interpret them as Latin1 instead. This will mangle non-ASCII characters, but usually it won't fail parsing.
    /// Default is false.
    pub ignore_invalid_encoding_declarations: bool,

    /// Whether to accept documents with multiple root elements, even though such
    /// documents are ill-formed. Default is true.
    pub allow_multiple_root_elements: bool,

    /// Abort if custom entities create a string longer than this
    pub max_entity_expansion_length: usize,
    /// Entities can expand into other entities this many times (be careful about exponential cost!)
    pub max_entity_expansion_depth: u8,

    /// Maximum length of tag name or attribute name
    pub max_name_length: usize,

    /// Max number of attributes per element
    pub max_attributes: usize,

    /// Max number of bytes in each attribute
    pub max_attribute_length: usize,

    /// Maximum length of strings representing characters, comments, and processing instructions
    pub max_data_length: usize,
}
226 | |
227 | impl Default for ParserConfig2 { |
228 | fn default() -> Self { |
229 | Self { |
230 | c: ParserConfig::default(), |
231 | override_encoding: None, |
232 | ignore_invalid_encoding_declarations: false, |
233 | allow_multiple_root_elements: true, |
234 | max_entity_expansion_length: DEFAULT_MAX_ENTITY_EXPANSION_LENGTH, |
235 | max_entity_expansion_depth: DEFAULT_MAX_ENTITY_EXPANSION_DEPTH, |
236 | max_attributes: 1 << 16, |
237 | max_attribute_length: 1 << 30, |
238 | max_data_length: 1 << 30, |
239 | max_name_length: 1 << 18, |
240 | } |
241 | } |
242 | } |
243 | |
244 | impl ParserConfig2 { |
245 | /// Create extended configuration struct |
246 | #[inline ] |
247 | #[must_use ] |
248 | pub fn new() -> Self { |
249 | Self::default() |
250 | } |
251 | |
252 | /// Read character encoding from `Content-Type` header. |
253 | /// Set this when parsing XML documents fetched over HTTP. |
254 | /// |
255 | /// `text/*` MIME types do *not* imply latin1. UTF-8 is always the default fallback. |
256 | #[must_use ] pub fn content_type(mut self, mime_type: &str) -> Self { |
257 | let charset = mime_type.split_once(';' ) |
258 | .and_then(|(_, args)| args.split_once("charset" )) |
259 | .and_then(|(_, args)| args.split_once('=' )); |
260 | if let Some((_, charset)) = charset { |
261 | let name = charset.trim().trim_matches('"' ); |
262 | if let Ok(enc) = name.parse() { |
263 | self.override_encoding = Some(enc); |
264 | } |
265 | } |
266 | self |
267 | } |
268 | |
269 | /// Creates an XML reader with this configuration. The reader should be wrapped in a `BufReader`, otherwise parsing may be very slow. |
270 | /// |
271 | /// This is a convenience method for configuring and creating a reader at the same time: |
272 | /// |
273 | /// ```rust |
274 | /// use xml::reader::ParserConfig; |
275 | /// |
276 | /// let mut source: &[u8] = b"..." ; |
277 | /// |
278 | /// let reader = ParserConfig::new() |
279 | /// .trim_whitespace(true) |
280 | /// .ignore_comments(true) |
281 | /// .coalesce_characters(false) |
282 | /// .create_reader(&mut source); |
283 | /// ``` |
284 | /// |
285 | /// This method is exactly equivalent to calling `EventReader::new_with_config()` with |
286 | /// this configuration object. |
287 | #[inline ] |
288 | pub fn create_reader<R: Read>(self, source: R) -> EventReader<R> { |
289 | EventReader::new_with_config(source, self) |
290 | } |
291 | } |
292 | |
293 | impl From<ParserConfig> for ParserConfig2 { |
294 | #[inline ] |
295 | fn from(c: ParserConfig) -> Self { |
296 | Self { c, ..Default::default() } |
297 | } |
298 | } |
299 | |
// Setters for the fields that `ParserConfig2` adds on top of `ParserConfig`.
// `gen_setters!` is defined elsewhere; the test in this file chains these as
// builder-style methods (e.g. `.max_entity_expansion_length(1000)`).
gen_setters! { ParserConfig2,
    /// Set if you got one in the HTTP header
    override_encoding: val Option<Encoding>,
    /// Allows invalid documents. There should be only a single root element in XML.
    allow_multiple_root_elements: val bool,
    /// Abort if custom entities create a string longer than this
    max_entity_expansion_length: val usize,
    /// Entities can expand into other entities this many times (be careful about exponential cost!)
    max_entity_expansion_depth: val u8,
    /// Max number of attributes per element
    max_attributes: val usize,
    /// Maximum length of tag name or attribute name
    max_name_length: val usize,
    /// Max number of bytes in each attribute
    max_attribute_length: val usize,
    /// Maximum length of strings representing characters, comments, and processing instructions
    max_data_length: val usize,
    /// Allow `<?xml encoding="bogus"?>`
    ignore_invalid_encoding_declarations: val bool
}
320 | |
// Extended options exposed on the original `ParserConfig`.
// NOTE(review): the `c2` kind is defined in the `gen_setters!` macro elsewhere; it
// presumably upgrades the config to `ParserConfig2` and forwards to the method of
// the same name there — confirm against the macro definition.
gen_setters! { ParserConfig,
    /// Set if you got one in the HTTP header (see `content_type`)
    override_encoding: c2 Option<Encoding>,
    /// Allow `<?xml encoding="bogus"?>`
    ignore_invalid_encoding_declarations: c2 bool,
    /// Allows invalid documents. There should be only a single root element in XML.
    allow_multiple_root_elements: c2 bool,

    /// Abort if custom entities create a string longer than this
    max_entity_expansion_length: c2 usize,
    /// Entities can expand into other entities this many times (be careful about exponential cost!)
    max_entity_expansion_depth: c2 u8,
    /// Max number of attributes per element
    max_attributes: c2 usize,
    /// Maximum length of tag name or attribute name
    max_name_length: c2 usize,
    /// Max number of bytes in each attribute
    max_attribute_length: c2 usize,
    /// Maximum length of strings representing characters, comments, and processing instructions
    max_data_length: c2 usize,

    /// Set encoding from the MIME type. Important for HTTP compatibility.
    content_type: c2 &str
}
345 | |
// Original `ParserConfig` options exposed on `ParserConfig2`.
// NOTE(review): the `delegate` kind is defined in the `gen_setters!` macro elsewhere;
// it presumably forwards each setter to the wrapped `ParserConfig` — confirm there.
gen_setters! { ParserConfig2,
    /// Whether whitespace in textual events should be removed. Default is false.
    trim_whitespace: delegate bool,
    /// Whether whitespace should be converted to characters. Default is false.
    whitespace_to_characters: delegate bool,
    /// Whether CDATA should be converted to characters. Default is false.
    cdata_to_characters: delegate bool,
    /// Whether comments should be omitted. Default is true.
    ignore_comments: delegate bool,
    /// Whether sequential `Characters` events should be merged. Default is true.
    coalesce_characters: delegate bool,
    /// Whether the parser should ignore the end of stream. Default is false.
    ignore_end_of_stream: delegate bool,
    /// Whether non-unicode entity references get replaced with the replacement character.
    replace_unknown_entity_references: delegate bool,
    /// Whether or not whitespace at the root level of the document is ignored. Default is true.
    ignore_root_level_whitespace: delegate bool
}
357 | |
/// Checks that `content_type` extracts the charset from a MIME type string,
/// whether it is called before or after an unrelated setter.
#[test]
fn mime_parse() {
    // Mixed-case charset name, with another setter applied afterwards.
    let ascii_cfg = ParserConfig2::new()
        .content_type("text/xml;charset=Us-AScii")
        .max_entity_expansion_length(1000);
    assert_eq!(ascii_cfg.override_encoding, Some(Encoding::Ascii));

    // Quoted charset value with spaces around `=`, with another setter applied first.
    let utf16_cfg = ParserConfig2::new()
        .max_entity_expansion_depth(3)
        .content_type("text/xml;charset = \"UTF-16 \"");
    assert_eq!(utf16_cfg.override_encoding, Some(Encoding::Utf16));
}
366 | |