| 1 | //! Contains parser configuration structure. |
| 2 | use std::collections::HashMap; |
| 3 | use std::io::Read; |
| 4 | |
| 5 | use crate::reader::EventReader; |
| 6 | use crate::util::Encoding; |
| 7 | |
/// Default limit on the length of a string produced by expanding entities.
///
/// Limits to defend from billion laughs attack.
const DEFAULT_MAX_ENTITY_EXPANSION_LENGTH: usize = 1_000_000;
/// Default limit on how many times entities may expand into other entities.
///
/// Like the length limit, this exists to defend from billion laughs attack.
const DEFAULT_MAX_ENTITY_EXPANSION_DEPTH: u8 = 10;
| 11 | |
/// Parser configuration structure. **There are more config methods than public fields — see methods below**.
///
/// This structure contains various configuration options which affect
/// behavior of the parser.
#[derive(Clone, PartialEq, Eq, Debug)]
pub struct ParserConfig {
    /// Whether or not whitespace in textual events should be removed. Default is false.
    ///
    /// When true, all standalone whitespace will be removed (this means no
    /// `Whitespace` events will be emitted), and leading and trailing whitespace
    /// from `Characters` events will be deleted. If a `Characters` event is empty
    /// after trimming, it will also be omitted from the output stream. This is
    /// possible, however, only if the `whitespace_to_characters` or
    /// `cdata_to_characters` options are set.
    ///
    /// This option does not affect CDATA events, unless the `cdata_to_characters`
    /// option is also set. In that case CDATA content will also be trimmed.
    pub trim_whitespace: bool,

    /// Whether or not whitespace should be converted to characters.
    /// Default is false.
    ///
    /// If true, `Characters` events with the same content will be emitted
    /// instead of `Whitespace` events. If `trim_whitespace` is also true, these
    /// events will be trimmed to nothing and, consequently, not emitted.
    pub whitespace_to_characters: bool,

    /// Whether or not CDATA should be converted to characters.
    /// Default is false.
    ///
    /// If true, `Characters` events with the same content will be emitted
    /// instead of `CData` events. If `trim_whitespace` is also true, these events
    /// will be trimmed. If the corresponding CDATA contained nothing but whitespace,
    /// the event will be omitted from the stream.
    pub cdata_to_characters: bool,

    /// Whether or not comments should be omitted. Default is true.
    ///
    /// If true, `Comment` events will not be emitted at all.
    pub ignore_comments: bool,

    /// Whether or not sequential `Characters` events should be merged.
    /// Default is true.
    ///
    /// If true, multiple sequential `Characters` events will be merged into
    /// a single event, that is, their data will be concatenated.
    ///
    /// Multiple sequential `Characters` events are only possible if either
    /// `cdata_to_characters` or `ignore_comments` is set. Otherwise character
    /// events will always be separated by other events.
    pub coalesce_characters: bool,

    /// A map of extra entities recognized by the parser. Default is an empty map.
    ///
    /// By default the XML parser recognizes the entities defined in the XML spec. Sometimes,
    /// however, it is convenient to make the parser recognize additional entities which
    /// are also not available through the DTD definitions (especially given that at the moment
    /// DTD parsing is not supported).
    pub extra_entities: HashMap<String, String>,

    /// Whether or not the parser should ignore the end of stream. Default is false.
    ///
    /// By default the parser will either error out when it encounters a premature end of
    /// stream or complete normally if the end of stream was expected. If you want to continue
    /// reading from a stream whose input is supplied progressively, you can set this option to true.
    /// In this case the parser will allow you to invoke the `next()` method even if a supposed end
    /// of stream has happened.
    ///
    /// Note that support for this functionality is incomplete; for example, the parser will fail if
    /// the premature end of stream happens inside PCDATA. Therefore, use this option at your own risk.
    pub ignore_end_of_stream: bool,

    /// Whether or not non-unicode entity references get replaced with the replacement character.
    /// Default is false.
    ///
    /// When true, any decimal or hexadecimal character reference that cannot be converted from a
    /// u32 to a char using [std::char::from_u32](https://doc.rust-lang.org/std/char/fn.from_u32.html)
    /// will be converted into the unicode REPLACEMENT CHARACTER (U+FFFD).
    pub replace_unknown_entity_references: bool,

    /// Whether or not whitespace at the root level of the document is ignored. Default is true.
    ///
    /// By default any whitespace that is not enclosed within at least one level of elements will be
    /// ignored. Setting this value to false will cause root level whitespace events to be emitted.
    ///
    /// **There are more configuration options – see methods below**
    pub ignore_root_level_whitespace: bool,
}
| 99 | |
| 100 | impl ParserConfig { |
| 101 | /// Returns a new config with default values. |
| 102 | /// |
| 103 | /// You can tweak default values using builder-like pattern: |
| 104 | /// |
| 105 | /// ```rust |
| 106 | /// use xml::reader::ParserConfig; |
| 107 | /// |
| 108 | /// let config = ParserConfig::new() |
| 109 | /// .trim_whitespace(true) |
| 110 | /// .ignore_comments(true) |
| 111 | /// .coalesce_characters(false); |
| 112 | /// ``` |
| 113 | #[must_use ] |
| 114 | #[inline ] |
| 115 | pub fn new() -> Self { |
| 116 | Self { |
| 117 | trim_whitespace: false, |
| 118 | whitespace_to_characters: false, |
| 119 | cdata_to_characters: false, |
| 120 | ignore_comments: true, |
| 121 | coalesce_characters: true, |
| 122 | extra_entities: HashMap::new(), |
| 123 | ignore_end_of_stream: false, |
| 124 | replace_unknown_entity_references: false, |
| 125 | ignore_root_level_whitespace: true, |
| 126 | } |
| 127 | } |
| 128 | |
| 129 | /// Creates an XML reader with this configuration. The reader should be wrapped in a `BufReader`, otherwise parsing may be very slow. |
| 130 | /// |
| 131 | /// This is a convenience method for configuring and creating a reader at the same time: |
| 132 | /// |
| 133 | /// ```rust |
| 134 | /// use xml::reader::ParserConfig; |
| 135 | /// |
| 136 | /// let mut source: &[u8] = b"..." ; |
| 137 | /// |
| 138 | /// let reader = ParserConfig::new() |
| 139 | /// .trim_whitespace(true) |
| 140 | /// .ignore_comments(true) |
| 141 | /// .coalesce_characters(false) |
| 142 | /// .create_reader(&mut source); |
| 143 | /// ``` |
| 144 | /// |
| 145 | /// This method is exactly equivalent to calling `EventReader::new_with_config()` with |
| 146 | /// this configuration object. |
| 147 | #[inline ] |
| 148 | pub fn create_reader<R: Read>(self, source: R) -> EventReader<R> { |
| 149 | EventReader::new_with_config(source, self) |
| 150 | } |
| 151 | |
| 152 | /// Adds a new entity mapping and returns an updated config object. |
| 153 | /// |
| 154 | /// This is a convenience method for adding external entities mappings to the XML parser. |
| 155 | /// An example: |
| 156 | /// |
| 157 | /// ```rust |
| 158 | /// use xml::reader::ParserConfig; |
| 159 | /// |
| 160 | /// let mut source: &[u8] = b"..." ; |
| 161 | /// |
| 162 | /// let reader = ParserConfig::new() |
| 163 | /// .add_entity("nbsp" , " " ) |
| 164 | /// .add_entity("copy" , "©" ) |
| 165 | /// .add_entity("reg" , "®" ) |
| 166 | /// .create_reader(&mut source); |
| 167 | /// ``` |
| 168 | #[must_use ] |
| 169 | pub fn add_entity<S: Into<String>, T: Into<String>>(mut self, entity: S, value: T) -> Self { |
| 170 | self.extra_entities.insert(entity.into(), value.into()); |
| 171 | self |
| 172 | } |
| 173 | } |
| 174 | |
| 175 | impl Default for ParserConfig { |
| 176 | #[inline ] |
| 177 | fn default() -> Self { |
| 178 | Self::new() |
| 179 | } |
| 180 | } |
| 181 | |
// Setter declarations for the plain `bool` options of `ParserConfig`.
// NOTE(review): `gen_setters!` is defined elsewhere; presumably each `val` entry
// expands to the builder-style setter of the same name that the doc examples
// use (e.g. `.trim_whitespace(true)`) — confirm against the macro definition.
gen_setters! { ParserConfig,
    trim_whitespace: val bool,
    whitespace_to_characters: val bool,
    cdata_to_characters: val bool,
    ignore_comments: val bool,
    coalesce_characters: val bool,
    ignore_end_of_stream: val bool,
    replace_unknown_entity_references: val bool,
    ignore_root_level_whitespace: val bool
}
| 192 | |
/// Backwards-compatible extension of `ParserConfig`, which will eventually be merged into the original `ParserConfig` struct
#[derive(Clone, PartialEq, Eq, Debug)]
#[non_exhaustive]
pub struct ParserConfig2 {
    /// The original `ParserConfig` options that this struct extends.
    pub(crate) c: ParserConfig,

    /// Use this encoding as the default. Necessary for UTF-16 files without BOM.
    /// Default is `None`.
    pub override_encoding: Option<Encoding>,

    /// Allow `<?xml encoding="…">` to contain unsupported encoding names,
    /// and interpret them as Latin1 instead. This will mangle non-ASCII characters, but usually it won't fail parsing.
    /// Default is false.
    pub ignore_invalid_encoding_declarations: bool,

    /// Whether documents with multiple root elements are accepted.
    /// Documents with multiple root elements are ill-formed. Default is true.
    pub allow_multiple_root_elements: bool,

    /// Abort if custom entities create a string longer than this
    pub max_entity_expansion_length: usize,
    /// Entities can expand into other entities this many times (be careful about exponential cost!)
    pub max_entity_expansion_depth: u8,

    /// Maximum length of tag name or attribute name. Default is `1 << 18`.
    pub max_name_length: usize,

    /// Max number of attributes per element. Default is `1 << 16`.
    pub max_attributes: usize,

    /// Max number of bytes in each attribute. Default is `1 << 30`.
    pub max_attribute_length: usize,

    /// Maximum length of strings representing characters, comments, and processing instructions.
    /// Default is `1 << 30`.
    pub max_data_length: usize,
}
| 226 | |
| 227 | impl Default for ParserConfig2 { |
| 228 | fn default() -> Self { |
| 229 | Self { |
| 230 | c: ParserConfig::default(), |
| 231 | override_encoding: None, |
| 232 | ignore_invalid_encoding_declarations: false, |
| 233 | allow_multiple_root_elements: true, |
| 234 | max_entity_expansion_length: DEFAULT_MAX_ENTITY_EXPANSION_LENGTH, |
| 235 | max_entity_expansion_depth: DEFAULT_MAX_ENTITY_EXPANSION_DEPTH, |
| 236 | max_attributes: 1 << 16, |
| 237 | max_attribute_length: 1 << 30, |
| 238 | max_data_length: 1 << 30, |
| 239 | max_name_length: 1 << 18, |
| 240 | } |
| 241 | } |
| 242 | } |
| 243 | |
| 244 | impl ParserConfig2 { |
| 245 | /// Create extended configuration struct |
| 246 | #[inline ] |
| 247 | #[must_use ] |
| 248 | pub fn new() -> Self { |
| 249 | Self::default() |
| 250 | } |
| 251 | |
| 252 | /// Read character encoding from `Content-Type` header. |
| 253 | /// Set this when parsing XML documents fetched over HTTP. |
| 254 | /// |
| 255 | /// `text/*` MIME types do *not* imply latin1. UTF-8 is always the default fallback. |
| 256 | #[must_use ] pub fn content_type(mut self, mime_type: &str) -> Self { |
| 257 | let charset = mime_type.split_once(';' ) |
| 258 | .and_then(|(_, args)| args.split_once("charset" )) |
| 259 | .and_then(|(_, args)| args.split_once('=' )); |
| 260 | if let Some((_, charset)) = charset { |
| 261 | let name = charset.trim().trim_matches('"' ); |
| 262 | if let Ok(enc) = name.parse() { |
| 263 | self.override_encoding = Some(enc); |
| 264 | } |
| 265 | } |
| 266 | self |
| 267 | } |
| 268 | |
| 269 | /// Creates an XML reader with this configuration. The reader should be wrapped in a `BufReader`, otherwise parsing may be very slow. |
| 270 | /// |
| 271 | /// This is a convenience method for configuring and creating a reader at the same time: |
| 272 | /// |
| 273 | /// ```rust |
| 274 | /// use xml::reader::ParserConfig; |
| 275 | /// |
| 276 | /// let mut source: &[u8] = b"..." ; |
| 277 | /// |
| 278 | /// let reader = ParserConfig::new() |
| 279 | /// .trim_whitespace(true) |
| 280 | /// .ignore_comments(true) |
| 281 | /// .coalesce_characters(false) |
| 282 | /// .create_reader(&mut source); |
| 283 | /// ``` |
| 284 | /// |
| 285 | /// This method is exactly equivalent to calling `EventReader::new_with_config()` with |
| 286 | /// this configuration object. |
| 287 | #[inline ] |
| 288 | pub fn create_reader<R: Read>(self, source: R) -> EventReader<R> { |
| 289 | EventReader::new_with_config(source, self) |
| 290 | } |
| 291 | } |
| 292 | |
| 293 | impl From<ParserConfig> for ParserConfig2 { |
| 294 | #[inline ] |
| 295 | fn from(c: ParserConfig) -> Self { |
| 296 | Self { c, ..Default::default() } |
| 297 | } |
| 298 | } |
| 299 | |
// Setter declarations for the fields that exist only on `ParserConfig2`.
// NOTE(review): `gen_setters!` is defined elsewhere; presumably each `val` entry
// expands to a builder-style setter carrying the doc comment written above it —
// confirm against the macro definition.
gen_setters! { ParserConfig2,
    /// Set if you got one in the HTTP header
    override_encoding: val Option<Encoding>,
    /// Allows invalid documents. There should be only a single root element in XML.
    allow_multiple_root_elements: val bool,
    /// Abort if custom entities create a string longer than this
    max_entity_expansion_length: val usize,
    /// Entities can expand into other entities this many times (be careful about exponential cost!)
    max_entity_expansion_depth: val u8,
    /// Max number of attributes per element
    max_attributes: val usize,
    /// Maximum length of tag name or attribute name
    max_name_length: val usize,
    /// Max number of bytes in each attribute
    max_attribute_length: val usize,
    /// Maximum length of strings representing characters, comments, and processing instructions
    max_data_length: val usize,
    /// Allow `<?xml encoding="bogus"?>`
    ignore_invalid_encoding_declarations: val bool
}
| 320 | |
// Declarations that expose the `ParserConfig2`-only options on `ParserConfig` as well.
// NOTE(review): `gen_setters!` is defined elsewhere; the `c2` kind presumably
// upgrades the `ParserConfig` into a `ParserConfig2` and forwards to the
// same-named method there — confirm against the macro definition.
gen_setters! { ParserConfig,
    /// Set if you got one in the HTTP header (see `content_type`)
    override_encoding: c2 Option<Encoding>,
    /// Allow `<?xml encoding="bogus"?>`
    ignore_invalid_encoding_declarations: c2 bool,
    /// Allows invalid documents. There should be only a single root element in XML.
    allow_multiple_root_elements: c2 bool,

    /// Abort if custom entities create a string longer than this
    max_entity_expansion_length: c2 usize,
    /// Entities can expand into other entities this many times (be careful about exponential cost!)
    max_entity_expansion_depth: c2 u8,
    /// Max number of attributes per element
    max_attributes: c2 usize,
    /// Maximum length of tag name or attribute name
    max_name_length: c2 usize,
    /// Max number of bytes in each attribute
    max_attribute_length: c2 usize,
    /// Maximum length of strings representing characters, comments, and processing instructions
    max_data_length: c2 usize,

    /// Set encoding from the MIME type. Important for HTTP compatibility.
    content_type: c2 &str
}
| 345 | |
// Declarations that make the original `ParserConfig` options settable on `ParserConfig2`.
// NOTE(review): `gen_setters!` is defined elsewhere; the `delegate` kind presumably
// forwards each setter to the wrapped `ParserConfig` held in the `c` field —
// confirm against the macro definition.
gen_setters! { ParserConfig2,
    trim_whitespace: delegate bool,
    whitespace_to_characters: delegate bool,
    cdata_to_characters: delegate bool,
    ignore_comments: delegate bool,
    coalesce_characters: delegate bool,
    ignore_end_of_stream: delegate bool,
    replace_unknown_entity_references: delegate bool,
    /// Whether or not whitespace at the root level of the document is ignored. Default is true.
    ignore_root_level_whitespace: delegate bool
}
| 357 | |
/// Checks that `content_type` extracts the charset from a MIME type regardless of
/// letter case, padding around `=` and quoting, and that it composes with other
/// setters in either order.
#[test]
fn mime_parse() {
    // Mixed-case charset name, `content_type` called before another setter.
    let ascii = ParserConfig2::new()
        .content_type("text/xml;charset=Us-AScii")
        .max_entity_expansion_length(1000);
    assert_eq!(ascii.override_encoding, Some(Encoding::Ascii));

    // Quoted value with spaces around `=`, `content_type` called after another setter.
    let utf16 = ParserConfig2::new()
        .max_entity_expansion_depth(3)
        .content_type("text/xml;charset = \"UTF-16 \"");
    assert_eq!(utf16.override_encoding, Some(Encoding::Utf16));
}
| 366 | |