config.rs source code [crates/xml-rs-0.8.26/src/reader/config.rs]

//! Contains the parser configuration structures.
2	use std::collections::HashMap;
3	use std::io::Read;
4
5	use crate::reader::EventReader;
6	use crate::util::Encoding;
7
// Limits that defend against the "billion laughs" entity-expansion attack.

/// Default cap on the length of a string produced by expanding custom entities.
const DEFAULT_MAX_ENTITY_EXPANSION_LENGTH: usize = 1_000_000;
/// Default cap on how many times entities may expand into other entities.
const DEFAULT_MAX_ENTITY_EXPANSION_DEPTH: u8 = 10;
11
/// Parser configuration structure. There are more config methods than public fields — see methods below.
///
/// This structure contains various configuration options which affect
/// behavior of the parser.
#[derive(Clone, PartialEq, Eq, Debug)]
pub struct ParserConfig {
    /// Whether whitespace in textual events should be removed. Default is false.
    ///
    /// When true, all standalone whitespace will be removed (this means no
    /// `Whitespace` events will be emitted), and leading and trailing whitespace
    /// from `Characters` events will be deleted. If after trimming a `Characters`
    /// event becomes empty, it will also be omitted from the output stream. This is
    /// possible, however, only if the `whitespace_to_characters` or
    /// `cdata_to_characters` options are set.
    ///
    /// This option does not affect CDATA events, unless the `cdata_to_characters`
    /// option is also set. In that case CDATA content will also be trimmed.
    pub trim_whitespace: bool,

    /// Whether whitespace should be converted to characters.
    /// Default is false.
    ///
    /// If true, instead of `Whitespace` events `Characters` events with the
    /// same content will be emitted. If `trim_whitespace` is also true, these
    /// events will be trimmed to nothing and, consequently, not emitted.
    pub whitespace_to_characters: bool,

    /// Whether CDATA should be converted to characters.
    /// Default is false.
    ///
    /// If true, instead of `CData` events `Characters` events with the same
    /// content will be emitted. If `trim_whitespace` is also true, these events
    /// will be trimmed. If the corresponding CDATA contained nothing but whitespace,
    /// this event will be omitted from the stream.
    pub cdata_to_characters: bool,

    /// Whether comments should be omitted. Default is true.
    ///
    /// If true, `Comment` events will not be emitted at all.
    pub ignore_comments: bool,

    /// Whether sequential `Characters` events should be merged.
    /// Default is true.
    ///
    /// If true, multiple sequential `Characters` events will be merged into
    /// a single event, that is, their data will be concatenated.
    ///
    /// Multiple sequential `Characters` events are only possible if either
    /// `cdata_to_characters` or `ignore_comments` are set. Otherwise character
    /// events will always be separated by other events.
    pub coalesce_characters: bool,

    /// A map of extra entities recognized by the parser. Default is an empty map.
    ///
    /// By default the XML parser recognizes the entities defined in the XML spec. Sometimes,
    /// however, it is convenient to make the parser recognize additional entities which
    /// are also not available through the DTD definitions (especially given that at the moment
    /// DTD parsing is not supported).
    pub extra_entities: HashMap<String, String>,

    /// Whether the parser should ignore the end of stream. Default is false.
    ///
    /// By default the parser will either error out when it encounters a premature end of
    /// stream or complete normally if the end of stream was expected. If you want to continue
    /// reading from a stream whose input is supplied progressively, you can set this option to true.
    /// In this case the parser will allow you to invoke the `next()` method even if a supposed end
    /// of stream has happened.
    ///
    /// Note that support for this functionality is incomplete; for example, the parser will fail if
    /// the premature end of stream happens inside PCDATA. Therefore, use this option at your own risk.
    pub ignore_end_of_stream: bool,

    /// Whether non-unicode entity references get replaced with the replacement character.
    /// Default is false.
    ///
    /// When true, any decimal or hexadecimal character reference that cannot be converted from a
    /// u32 to a char using [std::char::from_u32](https://doc.rust-lang.org/std/char/fn.from_u32.html)
    /// will be converted into the unicode REPLACEMENT CHARACTER (U+FFFD).
    pub replace_unknown_entity_references: bool,

    /// Whether whitespace at the root level of the document is ignored. Default is true.
    ///
    /// By default any whitespace that is not enclosed within at least one level of elements will be
    /// ignored. Setting this value to false will cause root level whitespace events to be emitted.
    ///
    /// There are more configuration options – see methods below.
    pub ignore_root_level_whitespace: bool,
}
99
100	impl ParserConfig {
101	/// Returns a new config with default values.
102	///
103	/// You can tweak default values using builder-like pattern:
104	///
105	/// ```rust
106	/// use xml::reader::ParserConfig;
107	///
108	/// let config = ParserConfig::new()
109	/// .trim_whitespace(`true`)
110	/// .ignore_comments(`true`)
111	/// .coalesce_characters(`false`);
112	/// ```
113	#[must_use]
114	#[inline]
115	pub fn new() -> Self {
116	Self {
117	trim_whitespace: `false`,
118	whitespace_to_characters: `false`,
119	cdata_to_characters: `false`,
120	ignore_comments: `true`,
121	coalesce_characters: `true`,
122	extra_entities: HashMap::new(),
123	ignore_end_of_stream: `false`,
124	replace_unknown_entity_references: `false`,
125	ignore_root_level_whitespace: `true`,
126	}
127	}
128
129	/// Creates an XML reader with this configuration. The reader should be wrapped in a `BufReader`, otherwise parsing may be very slow.
130	///
131	/// This is a convenience method for configuring and creating a reader at the same time:
132	///
133	/// ```rust
134	/// use xml::reader::ParserConfig;
135	///
136	/// let mut source: &[u8] = b"...";
137	///
138	/// let reader = ParserConfig::new()
139	/// .trim_whitespace(`true`)
140	/// .ignore_comments(`true`)
141	/// .coalesce_characters(`false`)
142	/// .create_reader(&mut source);
143	/// ```
144	///
145	/// This method is exactly equivalent to calling `EventReader::new_with_config()` with
146	/// this configuration object.
147	#[inline]
148	pub fn create_reader<R: Read>(self, source: R) -> EventReader<R> {
149	EventReader::new_with_config(source, self)
150	}
151
152	/// Adds a new entity mapping and returns an updated config object.
153	///
154	/// This is a convenience method for adding external entities mappings to the XML parser.
155	/// An example:
156	///
157	/// ```rust
158	/// use xml::reader::ParserConfig;
159	///
160	/// let mut source: &[u8] = b"...";
161	///
162	/// let reader = ParserConfig::new()
163	/// .add_entity("nbsp", " ")
164	/// .add_entity("copy", "©")
165	/// .add_entity("reg", "®")
166	/// .create_reader(&mut source);
167	/// ```
168	#[must_use]
169	pub fn add_entity<S: Into<String>, T: Into<String>>(mut self, entity: S, value: T) -> Self {
170	self.extra_entities.insert(entity.into(), value.into());
171	self
172	}
173	}
174
175	impl Default for ParserConfig {
176	#[inline]
177	fn default() -> Self {
178	Self::new()
179	}
180	}
181
182	gen_setters! { ParserConfig,
183	trim_whitespace: val bool,
184	whitespace_to_characters: val bool,
185	cdata_to_characters: val bool,
186	ignore_comments: val bool,
187	coalesce_characters: val bool,
188	ignore_end_of_stream: val bool,
189	replace_unknown_entity_references: val bool,
190	ignore_root_level_whitespace: val bool
191	}
192
193	/// Backwards-compatible extension of `ParserConfig`, which will eventually be merged into the original `ParserConfig` struct
194	#[derive(Clone, PartialEq, Eq, Debug)]
195	#[non_exhaustive]
196	pub struct ParserConfig2 {
197	pub(crate) c: ParserConfig,
198
199	/// Use this encoding as the default. Necessary for UTF-16 files without BOM.
200	pub override_encoding: Option<Encoding>,
201
202	/// Allow `<?xml encoding="…">` to contain unsupported encoding names,
203	/// and interpret them as Latin1 instead. This will mangle non-ASCII characters, but usually it won't fail parsing.
204	pub ignore_invalid_encoding_declarations: bool,
205
206	/// Documents with multiple root elements are ill-formed
207	pub allow_multiple_root_elements: bool,
208
209	/// Abort if custom entities create a string longer than this
210	pub max_entity_expansion_length: usize,
211	/// Entities can expand into other entities this many times (be careful about exponential cost!)
212	pub max_entity_expansion_depth: u8,
213
214	/// Maximum length of tag name or attribute name
215	pub max_name_length: usize,
216
217	/// Max number of attributes per element
218	pub max_attributes: usize,
219
220	/// Max number of bytes in each attribute
221	pub max_attribute_length: usize,
222
223	/// Maximum length of strings reprsenting characters, comments, and processing instructions
224	pub max_data_length: usize,
225	}
226
227	impl Default for ParserConfig2 {
228	fn default() -> Self {
229	Self {
230	c: ParserConfig::default(),
231	override_encoding: None,
232	ignore_invalid_encoding_declarations: `false`,
233	allow_multiple_root_elements: `true`,
234	max_entity_expansion_length: DEFAULT_MAX_ENTITY_EXPANSION_LENGTH,
235	max_entity_expansion_depth: DEFAULT_MAX_ENTITY_EXPANSION_DEPTH,
236	max_attributes: `1` << `16`,
237	max_attribute_length: `1` << `30`,
238	max_data_length: `1` << `30`,
239	max_name_length: `1` << `18`,
240	}
241	}
242	}
243
244	impl ParserConfig2 {
245	/// Create extended configuration struct
246	#[inline]
247	#[must_use]
248	pub fn new() -> Self {
249	Self::default()
250	}
251
252	/// Read character encoding from `Content-Type` header.
253	/// Set this when parsing XML documents fetched over HTTP.
254	///
255	/// `text/` MIME types do not imply latin1. UTF-8 is always the default fallback.*
256	#[must_use] pub fn content_type(mut self, mime_type: &str) -> Self {
257	let charset = mime_type.split_once(';')
258	.and_then(\|(_, args)\| args.split_once("charset"))
259	.and_then(\|(_, args)\| args.split_once('='));
260	if let Some((_, charset)) = charset {
261	let name = charset.trim().trim_matches('"');
262	if let Ok(enc) = name.parse() {
263	self.override_encoding = Some(enc);
264	}
265	}
266	self
267	}
268
269	/// Creates an XML reader with this configuration. The reader should be wrapped in a `BufReader`, otherwise parsing may be very slow.
270	///
271	/// This is a convenience method for configuring and creating a reader at the same time:
272	///
273	/// ```rust
274	/// use xml::reader::ParserConfig;
275	///
276	/// let mut source: &[u8] = b"...";
277	///
278	/// let reader = ParserConfig::new()
279	/// .trim_whitespace(`true`)
280	/// .ignore_comments(`true`)
281	/// .coalesce_characters(`false`)
282	/// .create_reader(&mut source);
283	/// ```
284	///
285	/// This method is exactly equivalent to calling `EventReader::new_with_config()` with
286	/// this configuration object.
287	#[inline]
288	pub fn create_reader<R: Read>(self, source: R) -> EventReader<R> {
289	EventReader::new_with_config(source, self)
290	}
291	}
292
293	impl From<ParserConfig> for ParserConfig2 {
294	#[inline]
295	fn from(c: ParserConfig) -> Self {
296	Self { c, ..Default::default() }
297	}
298	}
299
300	gen_setters! { ParserConfig2,
301	/// Set if you got one in the HTTP header
302	override_encoding: val Option<Encoding>,
303	/// Allows invalid documents. There should be only a single root element in XML.
304	allow_multiple_root_elements: val bool,
305	/// Abort if custom entities create a string longer than this
306	max_entity_expansion_length: val usize,
307	/// Entities can expand into other entities this many times (be careful about exponential cost!)
308	max_entity_expansion_depth: val u8,
309	/// Max number of attributes per element
310	max_attributes: val usize,
311	/// Maximum length of tag name or attribute name
312	max_name_length: val usize,
313	/// Max number of bytes in each attribute
314	max_attribute_length: val usize,
315	/// Maximum length of strings reprsenting characters, comments, and processing instructions
316	max_data_length: val usize,
317	/// Allow `<?xml encoding="bogus"?>`
318	ignore_invalid_encoding_declarations: val bool
319	}
320
321	gen_setters! { ParserConfig,
322	/// Set if you got one in the HTTP header (see `content_type`)
323	override_encoding: c2 Option<Encoding>,
324	/// Allow `<?xml encoding="bogus"?>`
325	ignore_invalid_encoding_declarations: c2 bool,
326	/// Allows invalid documents. There should be only a single root element in XML.
327	allow_multiple_root_elements: c2 bool,
328
329	/// Abort if custom entities create a string longer than this
330	max_entity_expansion_length: c2 usize,
331	/// Entities can expand into other entities this many times (be careful about exponential cost!)
332	max_entity_expansion_depth: c2 u8,
333	/// Max number of attributes per element
334	max_attributes: c2 usize,
335	/// Maximum length of tag name or attribute name
336	max_name_length: c2 usize,
337	/// Max number of bytes in each attribute
338	max_attribute_length: c2 usize,
339	/// Maximum length of strings reprsenting characters, comments, and processing instructions
340	max_data_length: c2 usize,
341
342	/// Set encoding from the MIME type. Important for HTTP compatibility.
343	content_type: c2 &str
344	}
345
346	gen_setters! { ParserConfig2,
347	trim_whitespace: delegate bool,
348	whitespace_to_characters: delegate bool,
349	cdata_to_characters: delegate bool,
350	ignore_comments: delegate bool,
351	coalesce_characters: delegate bool,
352	ignore_end_of_stream: delegate bool,
353	replace_unknown_entity_references: delegate bool,
354	/// Whether or not whitespace at the root level of the document is ignored. Default is true.
355	ignore_root_level_whitespace: delegate bool
356	}
357
// Checks that `content_type` picks the charset out of a MIME type string regardless of
// the encoding name's letter case, of spaces around `=`, and of double quotes around the
// value, and that it can be chained with other setters in either order.
#[test]
fn mime_parse() {
    let c = ParserConfig2::new().content_type("text/xml;charset=Us-AScii").max_entity_expansion_length(1000);
    assert_eq!(c.override_encoding, Some(Encoding::Ascii));

    let c = ParserConfig2::new().max_entity_expansion_depth(3).content_type("text/xml;charset = \"UTF-16\"");
    assert_eq!(c.override_encoding, Some(Encoding::Utf16));
}
366