| 1 | use crate::common::is_whitespace_char; |
| 2 | use crate::reader::error::SyntaxError; |
| 3 | use crate::reader::events::XmlEvent; |
| 4 | use crate::reader::lexer::Token; |
| 5 | |
| 6 | use super::{ |
| 7 | ClosingTagSubstate, DoctypeSubstate, Encountered, OpeningTagSubstate, |
| 8 | ProcessingInstructionSubstate, PullParser, Result, State, |
| 9 | }; |
| 10 | |
| 11 | impl PullParser { |
| 12 | pub fn outside_tag(&mut self, t: Token) -> Option<Result> { |
| 13 | match t { |
| 14 | Token::Character(c) => { |
| 15 | if is_whitespace_char(c) { |
| 16 | // skip whitespace outside of the root element |
| 17 | if (self.config.c.trim_whitespace && self.buf.is_empty()) || |
| 18 | (self.depth() == 0 && self.config.c.ignore_root_level_whitespace) { |
| 19 | return None; |
| 20 | } |
| 21 | } else { |
| 22 | self.inside_whitespace = false; |
| 23 | if self.depth() == 0 { |
| 24 | return Some(self.error(SyntaxError::UnexpectedTokenOutsideRoot(t))); |
| 25 | } |
| 26 | } |
| 27 | |
| 28 | if !self.is_valid_xml_char_not_restricted(c) { |
| 29 | return Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32))); |
| 30 | } |
| 31 | |
| 32 | if self.buf.is_empty() { |
| 33 | self.push_pos(); |
| 34 | } else if self.buf.len() > self.config.max_data_length { |
| 35 | return Some(self.error(SyntaxError::ExceededConfiguredLimit)); |
| 36 | } |
| 37 | self.buf.push(c); |
| 38 | None |
| 39 | }, |
| 40 | |
| 41 | Token::CommentEnd | Token::TagEnd | Token::EqualsSign | |
| 42 | Token::DoubleQuote | Token::SingleQuote | |
| 43 | Token::ProcessingInstructionEnd | Token::EmptyTagEnd => { |
| 44 | if self.depth() == 0 { |
| 45 | return Some(self.error(SyntaxError::UnexpectedTokenOutsideRoot(t))); |
| 46 | } |
| 47 | self.inside_whitespace = false; |
| 48 | |
| 49 | if let Some(s) = t.as_static_str() { |
| 50 | if self.buf.is_empty() { |
| 51 | self.push_pos(); |
| 52 | } else if self.buf.len() > self.config.max_data_length { |
| 53 | return Some(self.error(SyntaxError::ExceededConfiguredLimit)); |
| 54 | } |
| 55 | |
| 56 | self.buf.push_str(s); |
| 57 | } |
| 58 | None |
| 59 | }, |
| 60 | |
| 61 | Token::ReferenceStart if self.depth() > 0 => { |
| 62 | self.state_after_reference = State::OutsideTag; |
| 63 | self.into_state_continue(State::InsideReference) |
| 64 | }, |
| 65 | |
| 66 | Token::ReferenceEnd if self.depth() > 0 => { // Semi-colon in a text outside an entity |
| 67 | self.inside_whitespace = false; |
| 68 | if self.buf.len() > self.config.max_data_length { |
| 69 | return Some(self.error(SyntaxError::ExceededConfiguredLimit)); |
| 70 | } |
| 71 | Token::ReferenceEnd.push_to_string(&mut self.buf); |
| 72 | None |
| 73 | }, |
| 74 | |
| 75 | Token::CommentStart if self.config.c.coalesce_characters && self.config.c.ignore_comments => { |
| 76 | let next_event = self.set_encountered(Encountered::Comment); |
| 77 | // We need to switch the lexer into a comment mode inside comments |
| 78 | self.into_state(State::InsideComment, next_event) |
| 79 | } |
| 80 | |
| 81 | Token::CDataStart if self.depth() > 0 && self.config.c.coalesce_characters && self.config.c.cdata_to_characters => { |
| 82 | if self.buf.is_empty() { |
| 83 | self.push_pos(); // CDataEnd will pop pos if the buffer remains empty |
| 84 | } |
| 85 | // if coalescing chars, continue without event |
| 86 | self.into_state_continue(State::InsideCData) |
| 87 | }, |
| 88 | |
| 89 | _ => { |
| 90 | // Encountered some markup event, flush the buffer as characters |
| 91 | // or a whitespace |
| 92 | let mut next_event = if self.buf_has_data() { |
| 93 | let buf = self.take_buf(); |
| 94 | if self.inside_whitespace && self.config.c.trim_whitespace { |
| 95 | // there will be no event emitted for this, but start of buffering has pushed a pos |
| 96 | self.next_pos(); |
| 97 | None |
| 98 | } else if self.inside_whitespace && !self.config.c.whitespace_to_characters { |
| 99 | debug_assert!(buf.chars().all(|ch| ch.is_whitespace()), "ws= {buf:?}" ); |
| 100 | Some(Ok(XmlEvent::Whitespace(buf))) |
| 101 | } else if self.config.c.trim_whitespace { |
| 102 | Some(Ok(XmlEvent::Characters(buf.trim_matches(is_whitespace_char).into()))) |
| 103 | } else { |
| 104 | Some(Ok(XmlEvent::Characters(buf))) |
| 105 | } |
| 106 | } else { None }; |
| 107 | self.inside_whitespace = true; // Reset inside_whitespace flag |
| 108 | |
| 109 | // pos is popped whenever an event is emitted, so pushes must happen only if there will be an event to balance it |
| 110 | // and ignored comments don't pop |
| 111 | if t != Token::CommentStart || !self.config.c.ignore_comments { |
| 112 | self.push_pos(); |
| 113 | } |
| 114 | match t { |
| 115 | Token::OpeningTagStart if self.depth() > 0 || self.encountered < Encountered::Element || self.config.allow_multiple_root_elements => { |
| 116 | if let Some(e) = self.set_encountered(Encountered::Element) { |
| 117 | next_event = Some(e); |
| 118 | } |
| 119 | self.nst.push_empty(); |
| 120 | self.into_state(State::InsideOpeningTag(OpeningTagSubstate::InsideName), next_event) |
| 121 | }, |
| 122 | |
| 123 | Token::ClosingTagStart if self.depth() > 0 => |
| 124 | self.into_state(State::InsideClosingTag(ClosingTagSubstate::CTInsideName), next_event), |
| 125 | |
| 126 | Token::CommentStart => { |
| 127 | if let Some(e) = self.set_encountered(Encountered::Comment) { |
| 128 | next_event = Some(e); |
| 129 | } |
| 130 | // We need to switch the lexer into a comment mode inside comments |
| 131 | self.into_state(State::InsideComment, next_event) |
| 132 | }, |
| 133 | |
| 134 | Token::DoctypeStart if self.encountered < Encountered::Doctype => { |
| 135 | if let Some(e) = self.set_encountered(Encountered::Doctype) { |
| 136 | next_event = Some(e); |
| 137 | } |
| 138 | self.data.doctype = Some(Token::DoctypeStart.to_string()); |
| 139 | |
| 140 | // We don't have a doctype event so skip this position |
| 141 | // FIXME: update when we have a doctype event |
| 142 | self.next_pos(); |
| 143 | self.into_state(State::InsideDoctype(DoctypeSubstate::Outside), next_event) |
| 144 | }, |
| 145 | |
| 146 | Token::ProcessingInstructionStart => |
| 147 | self.into_state(State::InsideProcessingInstruction(ProcessingInstructionSubstate::PIInsideName), next_event), |
| 148 | |
| 149 | Token::CDataStart if self.depth() > 0 => { |
| 150 | self.into_state(State::InsideCData, next_event) |
| 151 | }, |
| 152 | |
| 153 | _ => Some(self.error(SyntaxError::UnexpectedToken(t))), |
| 154 | } |
| 155 | }, |
| 156 | } |
| 157 | } |
| 158 | |
| 159 | pub fn document_start(&mut self, t: Token) -> Option<Result> { |
| 160 | debug_assert!(self.encountered < Encountered::Declaration); |
| 161 | |
| 162 | match t { |
| 163 | Token::Character(c) => { |
| 164 | let next_event = self.set_encountered(Encountered::AnyChars); |
| 165 | |
| 166 | if !is_whitespace_char(c) { |
| 167 | return Some(self.error(SyntaxError::UnexpectedTokenOutsideRoot(t))); |
| 168 | } |
| 169 | self.inside_whitespace = true; |
| 170 | |
| 171 | // skip whitespace outside of the root element |
| 172 | if (self.config.c.trim_whitespace && self.buf.is_empty()) || |
| 173 | (self.depth() == 0 && self.config.c.ignore_root_level_whitespace) { |
| 174 | return self.into_state(State::OutsideTag, next_event); |
| 175 | } |
| 176 | |
| 177 | self.push_pos(); |
| 178 | self.buf.push(c); |
| 179 | self.into_state(State::OutsideTag, next_event) |
| 180 | }, |
| 181 | |
| 182 | Token::CommentStart => { |
| 183 | let next_event = self.set_encountered(Encountered::Comment); |
| 184 | self.into_state(State::InsideComment, next_event) |
| 185 | }, |
| 186 | |
| 187 | Token::OpeningTagStart => { |
| 188 | let next_event = self.set_encountered(Encountered::Element); |
| 189 | self.nst.push_empty(); |
| 190 | self.into_state(State::InsideOpeningTag(OpeningTagSubstate::InsideName), next_event) |
| 191 | }, |
| 192 | |
| 193 | Token::DoctypeStart => { |
| 194 | let next_event = self.set_encountered(Encountered::Doctype); |
| 195 | self.data.doctype = Some(Token::DoctypeStart.to_string()); |
| 196 | |
| 197 | // We don't have a doctype event so skip this position |
| 198 | // FIXME: update when we have a doctype event |
| 199 | self.next_pos(); |
| 200 | self.into_state(State::InsideDoctype(DoctypeSubstate::Outside), next_event) |
| 201 | }, |
| 202 | |
| 203 | Token::ProcessingInstructionStart => { |
| 204 | self.push_pos(); |
| 205 | self.into_state_continue(State::InsideProcessingInstruction(ProcessingInstructionSubstate::PIInsideName)) |
| 206 | }, |
| 207 | |
| 208 | _ => Some(self.error(SyntaxError::UnexpectedToken(t))), |
| 209 | } |
| 210 | } |
| 211 | } |
| 212 | |