1// Copyright 2013-2016 The rust-url developers.
2//
3// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
4// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
5// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
6// option. This file may not be copied, modified, or distributed
7// except according to those terms.
8
9use std::error::Error;
10use std::fmt::{self, Formatter, Write};
11use std::str;
12
13use crate::host::{Host, HostInternal};
14use crate::Url;
15use form_urlencoded::EncodingOverride;
16use percent_encoding::{percent_encode, utf8_percent_encode, AsciiSet, CONTROLS};
17
18/// https://url.spec.whatwg.org/#fragment-percent-encode-set
19const FRAGMENT: &AsciiSet = &CONTROLS.add(b' ').add(b'"').add(b'<').add(b'>').add(byte:b'`');
20
21/// https://url.spec.whatwg.org/#path-percent-encode-set
22const PATH: &AsciiSet = &FRAGMENT.add(b'#').add(b'?').add(b'{').add(byte:b'}');
23
24/// https://url.spec.whatwg.org/#userinfo-percent-encode-set
25pub(crate) const USERINFO: &AsciiSet = &PATH
26 .add(b'/')
27 .add(b':')
28 .add(b';')
29 .add(b'=')
30 .add(b'@')
31 .add(b'[')
32 .add(b'\\')
33 .add(b']')
34 .add(b'^')
35 .add(byte:b'|');
36
37pub(crate) const PATH_SEGMENT: &AsciiSet = &PATH.add(b'/').add(byte:b'%');
38
39// The backslash (\) character is treated as a path separator in special URLs
40// so it needs to be additionally escaped in that case.
41pub(crate) const SPECIAL_PATH_SEGMENT: &AsciiSet = &PATH_SEGMENT.add(byte:b'\\');
42
43// https://url.spec.whatwg.org/#query-state
44const QUERY: &AsciiSet = &CONTROLS.add(b' ').add(b'"').add(b'#').add(b'<').add(byte:b'>');
45const SPECIAL_QUERY: &AsciiSet = &QUERY.add(byte:b'\'');
46
/// Shorthand for a `Result` whose error type is [`ParseError`].
pub type ParseResult<T> = Result<T, ParseError>;
48
// Generates the `ParseError` enum from a list of `Variant => "message",`
// pairs, together with a `Display` impl that prints the paired message.
macro_rules! simple_enum_error {
    ($($name: ident => $description: expr,)+) => {
        /// Errors that can occur during parsing.
        ///
        /// The enum is marked `#[non_exhaustive]`: more variants may be added
        /// in the future, so matches on it should keep a wildcard arm.
        #[derive(PartialEq, Eq, Clone, Copy, Debug)]
        #[non_exhaustive]
        pub enum ParseError {
            $(
                $name,
            )+
        }

        impl fmt::Display for ParseError {
            fn fmt(&self, fmt: &mut Formatter<'_>) -> fmt::Result {
                // Pick the message first, then emit it with a single write.
                let message = match *self {
                    $(
                        ParseError::$name => $description,
                    )+
                };
                fmt.write_str(message)
            }
        }
    }
}
74
// No methods are overridden: the `Debug` and `Display` impls that `Error`
// requires are generated by `simple_enum_error!`.
impl Error for ParseError {}
76
// The fatal parse errors. Each line declares one `ParseError` variant and the
// message its `Display` impl writes.
simple_enum_error! {
    EmptyHost => "empty host",
    IdnaError => "invalid international domain name",
    InvalidPort => "invalid port number",
    InvalidIpv4Address => "invalid IPv4 address",
    InvalidIpv6Address => "invalid IPv6 address",
    InvalidDomainCharacter => "invalid domain character",
    RelativeUrlWithoutBase => "relative URL without a base",
    RelativeUrlWithCannotBeABaseBase => "relative URL with a cannot-be-a-base base",
    SetHostOnCannotBeABaseUrl => "a cannot-be-a-base URL doesn’t have a host to set",
    // `to_u32` is not visible in this part of the file — presumably it raises
    // this error when an offset does not fit in a `u32`; confirm there.
    Overflow => "URLs more than 4 GB are not supported",
}
89
90impl From<::idna::Errors> for ParseError {
91 fn from(_: ::idna::Errors) -> ParseError {
92 ParseError::IdnaError
93 }
94}
95
// Generates the `SyntaxViolation` enum from a list of `Variant => "message",`
// pairs, together with a `description()` accessor returning the message.
macro_rules! syntax_violation_enum {
    ($($name: ident => $description: expr,)+) => {
        /// Non-fatal syntax violations that can occur during parsing.
        ///
        /// The enum is marked `#[non_exhaustive]`: more variants may be added
        /// in the future, so matches on it should keep a wildcard arm.
        #[derive(PartialEq, Eq, Clone, Copy, Debug)]
        #[non_exhaustive]
        pub enum SyntaxViolation {
            $(
                $name,
            )+
        }

        impl SyntaxViolation {
            /// Returns the fixed, human-readable text paired with this variant.
            pub fn description(&self) -> &'static str {
                match self {
                    $(
                        Self::$name => $description,
                    )+
                }
            }
        }
    }
}
121
// The non-fatal violations. Each entry declares one `SyntaxViolation` variant
// and the text returned by `SyntaxViolation::description`.
syntax_violation_enum! {
    Backslash => "backslash",
    C0SpaceIgnored =>
        "leading or trailing control or space character are ignored in URLs",
    EmbeddedCredentials =>
        "embedding authentication information (username or password) \
         in an URL is not recommended",
    ExpectedDoubleSlash => "expected //",
    ExpectedFileDoubleSlash => "expected // after file:",
    FileWithHostAndWindowsDrive => "file: with host and Windows drive letter",
    NonUrlCodePoint => "non-URL code point",
    NullInFragment => "NULL characters are ignored in URL fragment identifiers",
    PercentDecode => "expected 2 hex digits after %",
    TabOrNewlineIgnored => "tabs or newlines are ignored in URLs",
    UnencodedAtSign => "unencoded @ sign in username or password",
}
138
139impl fmt::Display for SyntaxViolation {
140 fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
141 fmt::Display::fmt(self.description(), f)
142 }
143}
144
/// Coarse classification of a URL scheme used throughout the parser.
#[derive(Copy, Clone, PartialEq, Eq)]
pub enum SchemeType {
    /// The `file` scheme.
    File,
    /// A special scheme other than `file`.
    SpecialNotFile,
    /// Any scheme that is not special.
    NotSpecial,
}

impl SchemeType {
    /// Returns `true` for every variant except [`SchemeType::NotSpecial`].
    pub fn is_special(&self) -> bool {
        match self {
            SchemeType::File | SchemeType::SpecialNotFile => true,
            SchemeType::NotSpecial => false,
        }
    }

    /// Returns `true` only for [`SchemeType::File`].
    pub fn is_file(&self) -> bool {
        *self == SchemeType::File
    }
}
161
162impl<T: AsRef<str>> From<T> for SchemeType {
163 fn from(s: T) -> Self {
164 match s.as_ref() {
165 "http" | "https" | "ws" | "wss" | "ftp" => SchemeType::SpecialNotFile,
166 "file" => SchemeType::File,
167 _ => SchemeType::NotSpecial,
168 }
169 }
170}
171
/// Returns the default port of `scheme`, or `None` when the scheme has none.
///
/// Only `http`/`ws` (80), `https`/`wss` (443) and `ftp` (21) are known; the
/// comparison is exact.
pub fn default_port(scheme: &str) -> Option<u16> {
    let port = match scheme {
        "ftp" => 21,
        "http" | "ws" => 80,
        "https" | "wss" => 443,
        _ => return None,
    };
    Some(port)
}
180
/// A cheaply clonable cursor over the characters of the string being parsed.
///
/// Its `Iterator` impl skips ASCII tab, line feed and carriage return, so
/// parser code never sees those characters.
#[derive(Clone, Debug)]
pub struct Input<'i> {
    // Remaining, not yet consumed characters (ignored characters included).
    chars: str::Chars<'i>,
}
185
186impl<'i> Input<'i> {
187 pub fn new(input: &'i str) -> Self {
188 Input::with_log(input, None)
189 }
190
191 pub fn no_trim(input: &'i str) -> Self {
192 Input {
193 chars: input.chars(),
194 }
195 }
196
197 pub fn trim_tab_and_newlines(
198 original_input: &'i str,
199 vfn: Option<&dyn Fn(SyntaxViolation)>,
200 ) -> Self {
201 let input = original_input.trim_matches(ascii_tab_or_new_line);
202 if let Some(vfn) = vfn {
203 if input.len() < original_input.len() {
204 vfn(SyntaxViolation::C0SpaceIgnored)
205 }
206 if input.chars().any(|c| matches!(c, '\t' | '\n' | '\r')) {
207 vfn(SyntaxViolation::TabOrNewlineIgnored)
208 }
209 }
210 Input {
211 chars: input.chars(),
212 }
213 }
214
215 pub fn with_log(original_input: &'i str, vfn: Option<&dyn Fn(SyntaxViolation)>) -> Self {
216 let input = original_input.trim_matches(c0_control_or_space);
217 if let Some(vfn) = vfn {
218 if input.len() < original_input.len() {
219 vfn(SyntaxViolation::C0SpaceIgnored)
220 }
221 if input.chars().any(|c| matches!(c, '\t' | '\n' | '\r')) {
222 vfn(SyntaxViolation::TabOrNewlineIgnored)
223 }
224 }
225 Input {
226 chars: input.chars(),
227 }
228 }
229
230 #[inline]
231 pub fn is_empty(&self) -> bool {
232 self.clone().next().is_none()
233 }
234
235 #[inline]
236 fn starts_with<P: Pattern>(&self, p: P) -> bool {
237 p.split_prefix(&mut self.clone())
238 }
239
240 #[inline]
241 pub fn split_prefix<P: Pattern>(&self, p: P) -> Option<Self> {
242 let mut remaining = self.clone();
243 if p.split_prefix(&mut remaining) {
244 Some(remaining)
245 } else {
246 None
247 }
248 }
249
250 #[inline]
251 fn split_first(&self) -> (Option<char>, Self) {
252 let mut remaining = self.clone();
253 (remaining.next(), remaining)
254 }
255
256 #[inline]
257 fn count_matching<F: Fn(char) -> bool>(&self, f: F) -> (u32, Self) {
258 let mut count = 0;
259 let mut remaining = self.clone();
260 loop {
261 let mut input = remaining.clone();
262 if matches!(input.next(), Some(c) if f(c)) {
263 remaining = input;
264 count += 1;
265 } else {
266 return (count, remaining);
267 }
268 }
269 }
270
271 #[inline]
272 fn next_utf8(&mut self) -> Option<(char, &'i str)> {
273 loop {
274 let utf8 = self.chars.as_str();
275 match self.chars.next() {
276 Some(c) => {
277 if !matches!(c, '\t' | '\n' | '\r') {
278 return Some((c, &utf8[..c.len_utf8()]));
279 }
280 }
281 None => return None,
282 }
283 }
284 }
285}
286
/// Something that can be matched against the front of an [`Input`].
pub trait Pattern {
    /// Tries to consume this pattern from the front of `input` and reports
    /// whether it matched. `input` may have been advanced even when the
    /// result is `false`, so callers that need the original position should
    /// pass a clone.
    fn split_prefix(self, input: &mut Input) -> bool;
}
290
291impl Pattern for char {
292 fn split_prefix(self, input: &mut Input) -> bool {
293 input.next() == Some(self)
294 }
295}
296
297impl<'a> Pattern for &'a str {
298 fn split_prefix(self, input: &mut Input) -> bool {
299 for c: char in self.chars() {
300 if input.next() != Some(c) {
301 return false;
302 }
303 }
304 true
305 }
306}
307
308impl<F: FnMut(char) -> bool> Pattern for F {
309 fn split_prefix(self, input: &mut Input) -> bool {
310 input.next().map_or(default:false, self)
311 }
312}
313
314impl<'i> Iterator for Input<'i> {
315 type Item = char;
316 fn next(&mut self) -> Option<char> {
317 self.chars
318 .by_ref()
319 .find(|&c: char| !matches!(c, '\t' | '\n' | '\r'))
320 }
321}
322
/// State of one URL parse. The parsing methods consume the parser and move
/// `serialization` into the resulting `Url`.
pub struct Parser<'a> {
    /// The URL serialization built so far; parsing steps append to it.
    pub serialization: String,
    /// Base URL used when the input has no scheme of its own.
    pub base_url: Option<&'a Url>,
    /// NOTE(review): not read in this part of the file — presumably the
    /// encoding applied when serializing the query; confirm at its use site.
    pub query_encoding_override: EncodingOverride<'a>,
    /// Optional callback invoked for each non-fatal `SyntaxViolation`.
    pub violation_fn: Option<&'a dyn Fn(SyntaxViolation)>,
    /// Which kind of caller is driving the parser; see [`Context`].
    pub context: Context,
}
330
/// Identifies who is driving the parser; a few steps behave differently
/// depending on it.
#[derive(PartialEq, Eq, Copy, Clone)]
pub enum Context {
    /// Regular URL parsing. `parse_port` rejects any non-digit other than
    /// `/`, `\`, `?` or `#` only in this context.
    UrlParser,
    /// Set by `Parser::for_setter`. `parse_scheme` accepts input that ends
    /// without a `:` only in this context.
    Setter,
    /// NOTE(review): not referenced in this part of the file — presumably
    /// used when setting individual path segments; confirm at its use site.
    PathSegmentSetter,
}
337
338impl<'a> Parser<'a> {
339 fn log_violation(&self, v: SyntaxViolation) {
340 if let Some(f) = self.violation_fn {
341 f(v)
342 }
343 }
344
345 fn log_violation_if(&self, v: SyntaxViolation, test: impl FnOnce() -> bool) {
346 if let Some(f) = self.violation_fn {
347 if test() {
348 f(v)
349 }
350 }
351 }
352
353 pub fn for_setter(serialization: String) -> Parser<'a> {
354 Parser {
355 serialization,
356 base_url: None,
357 query_encoding_override: None,
358 violation_fn: None,
359 context: Context::Setter,
360 }
361 }
362
363 /// https://url.spec.whatwg.org/#concept-basic-url-parser
364 pub fn parse_url(mut self, input: &str) -> ParseResult<Url> {
365 let input = Input::with_log(input, self.violation_fn);
366 if let Ok(remaining) = self.parse_scheme(input.clone()) {
367 return self.parse_with_scheme(remaining);
368 }
369
370 // No-scheme state
371 if let Some(base_url) = self.base_url {
372 if input.starts_with('#') {
373 self.fragment_only(base_url, input)
374 } else if base_url.cannot_be_a_base() {
375 Err(ParseError::RelativeUrlWithCannotBeABaseBase)
376 } else {
377 let scheme_type = SchemeType::from(base_url.scheme());
378 if scheme_type.is_file() {
379 self.parse_file(input, scheme_type, Some(base_url))
380 } else {
381 self.parse_relative(input, scheme_type, base_url)
382 }
383 }
384 } else {
385 Err(ParseError::RelativeUrlWithoutBase)
386 }
387 }
388
389 pub fn parse_scheme<'i>(&mut self, mut input: Input<'i>) -> Result<Input<'i>, ()> {
390 if input.is_empty() || !input.starts_with(ascii_alpha) {
391 return Err(());
392 }
393 debug_assert!(self.serialization.is_empty());
394 while let Some(c) = input.next() {
395 match c {
396 'a'..='z' | 'A'..='Z' | '0'..='9' | '+' | '-' | '.' => {
397 self.serialization.push(c.to_ascii_lowercase())
398 }
399 ':' => return Ok(input),
400 _ => {
401 self.serialization.clear();
402 return Err(());
403 }
404 }
405 }
406 // EOF before ':'
407 if self.context == Context::Setter {
408 Ok(input)
409 } else {
410 self.serialization.clear();
411 Err(())
412 }
413 }
414
415 fn parse_with_scheme(mut self, input: Input<'_>) -> ParseResult<Url> {
416 use crate::SyntaxViolation::{ExpectedDoubleSlash, ExpectedFileDoubleSlash};
417 let scheme_end = to_u32(self.serialization.len())?;
418 let scheme_type = SchemeType::from(&self.serialization);
419 self.serialization.push(':');
420 match scheme_type {
421 SchemeType::File => {
422 self.log_violation_if(ExpectedFileDoubleSlash, || !input.starts_with("//"));
423 let base_file_url = self.base_url.and_then(|base| {
424 if base.scheme() == "file" {
425 Some(base)
426 } else {
427 None
428 }
429 });
430 self.serialization.clear();
431 self.parse_file(input, scheme_type, base_file_url)
432 }
433 SchemeType::SpecialNotFile => {
434 // special relative or authority state
435 let (slashes_count, remaining) = input.count_matching(|c| matches!(c, '/' | '\\'));
436 if let Some(base_url) = self.base_url {
437 if slashes_count < 2
438 && base_url.scheme() == &self.serialization[..scheme_end as usize]
439 {
440 // "Cannot-be-a-base" URLs only happen with "not special" schemes.
441 debug_assert!(!base_url.cannot_be_a_base());
442 self.serialization.clear();
443 return self.parse_relative(input, scheme_type, base_url);
444 }
445 }
446 // special authority slashes state
447 self.log_violation_if(ExpectedDoubleSlash, || {
448 input
449 .clone()
450 .take_while(|&c| matches!(c, '/' | '\\'))
451 .collect::<String>()
452 != "//"
453 });
454 self.after_double_slash(remaining, scheme_type, scheme_end)
455 }
456 SchemeType::NotSpecial => self.parse_non_special(input, scheme_type, scheme_end),
457 }
458 }
459
460 /// Scheme other than file, http, https, ws, ws, ftp.
461 fn parse_non_special(
462 mut self,
463 input: Input<'_>,
464 scheme_type: SchemeType,
465 scheme_end: u32,
466 ) -> ParseResult<Url> {
467 // path or authority state (
468 if let Some(input) = input.split_prefix("//") {
469 return self.after_double_slash(input, scheme_type, scheme_end);
470 }
471 // Anarchist URL (no authority)
472 let path_start = to_u32(self.serialization.len())?;
473 let username_end = path_start;
474 let host_start = path_start;
475 let host_end = path_start;
476 let host = HostInternal::None;
477 let port = None;
478 let remaining = if let Some(input) = input.split_prefix('/') {
479 self.serialization.push('/');
480 self.parse_path(scheme_type, &mut false, path_start as usize, input)
481 } else {
482 self.parse_cannot_be_a_base_path(input)
483 };
484 self.with_query_and_fragment(
485 scheme_type,
486 scheme_end,
487 username_end,
488 host_start,
489 host_end,
490 host,
491 port,
492 path_start,
493 remaining,
494 )
495 }
496
    /// Parses what follows the scheme of a `file:` URL, or a scheme-less
    /// input resolved against a `file:` base.
    ///
    /// Expects `self.serialization` to be empty and writes the whole
    /// serialization, starting with `file://`, itself. `base_file_url`, when
    /// present, supplies the parts the input does not provide.
    fn parse_file(
        mut self,
        input: Input<'_>,
        scheme_type: SchemeType,
        base_file_url: Option<&Url>,
    ) -> ParseResult<Url> {
        use crate::SyntaxViolation::Backslash;
        // file state
        debug_assert!(self.serialization.is_empty());
        let (first_char, input_after_first_char) = input.split_first();
        if matches!(first_char, Some('/') | Some('\\')) {
            self.log_violation_if(SyntaxViolation::Backslash, || first_char == Some('\\'));
            // file slash state
            let (next_char, input_after_next_char) = input_after_first_char.split_first();
            if matches!(next_char, Some('/') | Some('\\')) {
                self.log_violation_if(Backslash, || next_char == Some('\\'));
                // file host state
                self.serialization.push_str("file://");
                let scheme_end = "file".len() as u32;
                let host_start = "file://".len() as u32;
                // Despite its name, `path_start` is the boolean returned by
                // `parse_file_host`: true when a host was written.
                let (path_start, mut host, remaining) =
                    self.parse_file_host(input_after_next_char)?;
                let mut host_end = to_u32(self.serialization.len())?;
                let mut has_host = !matches!(host, HostInternal::None);
                let remaining = if path_start {
                    self.parse_path_start(SchemeType::File, &mut has_host, remaining)
                } else {
                    let path_start = self.serialization.len();
                    self.serialization.push('/');
                    self.parse_path(SchemeType::File, &mut has_host, path_start, remaining)
                };

                // For file URLs that have a host and whose path starts
                // with the windows drive letter we just remove the host.
                if !has_host {
                    self.serialization
                        .drain(host_start as usize..host_end as usize);
                    host_end = host_start;
                    host = HostInternal::None;
                }
                let (query_start, fragment_start) =
                    self.parse_query_and_fragment(scheme_type, scheme_end, remaining)?;
                return Ok(Url {
                    serialization: self.serialization,
                    scheme_end,
                    username_end: host_start,
                    host_start,
                    host_end,
                    host,
                    port: None,
                    path_start: host_end,
                    query_start,
                    fragment_start,
                });
            } else {
                // Exactly one leading slash: a path, possibly borrowing the
                // drive letter or the host of the base URL.
                self.serialization.push_str("file://");
                let scheme_end = "file".len() as u32;
                let host_start = "file://".len();
                let mut host_end = host_start;
                let mut host = HostInternal::None;
                if !starts_with_windows_drive_letter_segment(&input_after_first_char) {
                    if let Some(base_url) = base_file_url {
                        let first_segment = base_url.path_segments().unwrap().next().unwrap();
                        if is_normalized_windows_drive_letter(first_segment) {
                            self.serialization.push('/');
                            self.serialization.push_str(first_segment);
                        } else if let Some(host_str) = base_url.host_str() {
                            self.serialization.push_str(host_str);
                            host_end = self.serialization.len();
                            host = base_url.host;
                        }
                    }
                }
                // If c is the EOF code point, U+002F (/), U+005C (\), U+003F (?), or U+0023 (#), then decrease pointer by one
                let parse_path_input = if let Some(c) = first_char {
                    if c == '/' || c == '\\' || c == '?' || c == '#' {
                        input
                    } else {
                        input_after_first_char
                    }
                } else {
                    input_after_first_char
                };

                let remaining =
                    self.parse_path(SchemeType::File, &mut false, host_end, parse_path_input);

                let host_start = host_start as u32;

                let (query_start, fragment_start) =
                    self.parse_query_and_fragment(scheme_type, scheme_end, remaining)?;

                let host_end = host_end as u32;
                return Ok(Url {
                    serialization: self.serialization,
                    scheme_end,
                    username_end: host_start,
                    host_start,
                    host_end,
                    host,
                    port: None,
                    path_start: host_end,
                    query_start,
                    fragment_start,
                });
            }
        }
        // The input does not start with a slash or backslash.
        if let Some(base_url) = base_file_url {
            match first_char {
                None => {
                    // Copy everything except the fragment
                    let before_fragment = match base_url.fragment_start {
                        Some(i) => &base_url.serialization[..i as usize],
                        None => &*base_url.serialization,
                    };
                    self.serialization.push_str(before_fragment);
                    Ok(Url {
                        serialization: self.serialization,
                        fragment_start: None,
                        ..*base_url
                    })
                }
                Some('?') => {
                    // Copy everything up to the query string
                    let before_query = match (base_url.query_start, base_url.fragment_start) {
                        (None, None) => &*base_url.serialization,
                        (Some(i), _) | (None, Some(i)) => base_url.slice(..i),
                    };
                    self.serialization.push_str(before_query);
                    let (query_start, fragment_start) =
                        self.parse_query_and_fragment(scheme_type, base_url.scheme_end, input)?;
                    Ok(Url {
                        serialization: self.serialization,
                        query_start,
                        fragment_start,
                        ..*base_url
                    })
                }
                Some('#') => self.fragment_only(base_url, input),
                _ => {
                    if !starts_with_windows_drive_letter_segment(&input) {
                        // Relative path: start from the base up to its query
                        // or fragment, shorten that path, then parse the input.
                        let before_query = match (base_url.query_start, base_url.fragment_start) {
                            (None, None) => &*base_url.serialization,
                            (Some(i), _) | (None, Some(i)) => base_url.slice(..i),
                        };
                        self.serialization.push_str(before_query);
                        self.shorten_path(SchemeType::File, base_url.path_start as usize);
                        let remaining = self.parse_path(
                            SchemeType::File,
                            &mut true,
                            base_url.path_start as usize,
                            input,
                        );
                        self.with_query_and_fragment(
                            SchemeType::File,
                            base_url.scheme_end,
                            base_url.username_end,
                            base_url.host_start,
                            base_url.host_end,
                            base_url.host,
                            base_url.port,
                            base_url.path_start,
                            remaining,
                        )
                    } else {
                        // The input starts with a Windows drive letter: the
                        // base is ignored and the URL has no host.
                        self.serialization.push_str("file:///");
                        let scheme_end = "file".len() as u32;
                        let path_start = "file://".len();
                        let remaining =
                            self.parse_path(SchemeType::File, &mut false, path_start, input);
                        let (query_start, fragment_start) =
                            self.parse_query_and_fragment(SchemeType::File, scheme_end, remaining)?;
                        let path_start = path_start as u32;
                        Ok(Url {
                            serialization: self.serialization,
                            scheme_end,
                            username_end: path_start,
                            host_start: path_start,
                            host_end: path_start,
                            host: HostInternal::None,
                            port: None,
                            path_start,
                            query_start,
                            fragment_start,
                        })
                    }
                }
            }
        } else {
            // No base URL: the input is a path under an empty host.
            self.serialization.push_str("file:///");
            let scheme_end = "file".len() as u32;
            let path_start = "file://".len();
            let remaining = self.parse_path(SchemeType::File, &mut false, path_start, input);
            let (query_start, fragment_start) =
                self.parse_query_and_fragment(SchemeType::File, scheme_end, remaining)?;
            let path_start = path_start as u32;
            Ok(Url {
                serialization: self.serialization,
                scheme_end,
                username_end: path_start,
                host_start: path_start,
                host_end: path_start,
                host: HostInternal::None,
                port: None,
                path_start,
                query_start,
                fragment_start,
            })
        }
    }
707
    /// Resolves a relative reference against `base_url`.
    ///
    /// Expects `self.serialization` to be empty. Depending on the first
    /// character of `input`, the result reuses the base up to its fragment,
    /// query or path start, or only its scheme.
    fn parse_relative(
        mut self,
        input: Input<'_>,
        scheme_type: SchemeType,
        base_url: &Url,
    ) -> ParseResult<Url> {
        // relative state
        debug_assert!(self.serialization.is_empty());
        let (first_char, input_after_first_char) = input.split_first();
        match first_char {
            None => {
                // Copy everything except the fragment
                let before_fragment = match base_url.fragment_start {
                    Some(i) => &base_url.serialization[..i as usize],
                    None => &*base_url.serialization,
                };
                self.serialization.push_str(before_fragment);
                Ok(Url {
                    serialization: self.serialization,
                    fragment_start: None,
                    ..*base_url
                })
            }
            Some('?') => {
                // Copy everything up to the query string
                let before_query = match (base_url.query_start, base_url.fragment_start) {
                    (None, None) => &*base_url.serialization,
                    (Some(i), _) | (None, Some(i)) => base_url.slice(..i),
                };
                self.serialization.push_str(before_query);
                let (query_start, fragment_start) =
                    self.parse_query_and_fragment(scheme_type, base_url.scheme_end, input)?;
                Ok(Url {
                    serialization: self.serialization,
                    query_start,
                    fragment_start,
                    ..*base_url
                })
            }
            Some('#') => self.fragment_only(base_url, input),
            Some('/') | Some('\\') => {
                let (slashes_count, remaining) = input.count_matching(|c| matches!(c, '/' | '\\'));
                if slashes_count >= 2 {
                    // Scheme-relative reference: keep only the base scheme and
                    // parse a fresh authority.
                    self.log_violation_if(SyntaxViolation::ExpectedDoubleSlash, || {
                        input
                            .clone()
                            .take_while(|&c| matches!(c, '/' | '\\'))
                            .collect::<String>()
                            != "//"
                    });
                    let scheme_end = base_url.scheme_end;
                    debug_assert!(base_url.byte_at(scheme_end) == b':');
                    self.serialization
                        .push_str(base_url.slice(..scheme_end + 1));
                    // An exact `//` prefix skips just those two characters;
                    // otherwise every leading slash and backslash is skipped.
                    if let Some(after_prefix) = input.split_prefix("//") {
                        return self.after_double_slash(after_prefix, scheme_type, scheme_end);
                    }
                    return self.after_double_slash(remaining, scheme_type, scheme_end);
                }
                // Single leading slash: keep the base up to its path and
                // parse the input as a new absolute path.
                let path_start = base_url.path_start;
                self.serialization.push_str(base_url.slice(..path_start));
                self.serialization.push('/');
                let remaining = self.parse_path(
                    scheme_type,
                    &mut true,
                    path_start as usize,
                    input_after_first_char,
                );
                self.with_query_and_fragment(
                    scheme_type,
                    base_url.scheme_end,
                    base_url.username_end,
                    base_url.host_start,
                    base_url.host_end,
                    base_url.host,
                    base_url.port,
                    base_url.path_start,
                    remaining,
                )
            }
            _ => {
                // Relative path: start from the base up to its query or
                // fragment and pop from that path before parsing the input.
                let before_query = match (base_url.query_start, base_url.fragment_start) {
                    (None, None) => &*base_url.serialization,
                    (Some(i), _) | (None, Some(i)) => base_url.slice(..i),
                };
                self.serialization.push_str(before_query);
                // FIXME spec says just "remove last entry", not the "pop" algorithm
                self.pop_path(scheme_type, base_url.path_start as usize);
                // A special url always has a path.
                // A path always starts with '/'
                if self.serialization.len() == base_url.path_start as usize
                    && (SchemeType::from(base_url.scheme()).is_special() || !input.is_empty())
                {
                    self.serialization.push('/');
                }
                let remaining = match input.split_first() {
                    (Some('/'), remaining) => self.parse_path(
                        scheme_type,
                        &mut true,
                        base_url.path_start as usize,
                        remaining,
                    ),
                    _ => {
                        self.parse_path(scheme_type, &mut true, base_url.path_start as usize, input)
                    }
                };
                self.with_query_and_fragment(
                    scheme_type,
                    base_url.scheme_end,
                    base_url.username_end,
                    base_url.host_start,
                    base_url.host_end,
                    base_url.host,
                    base_url.port,
                    base_url.path_start,
                    remaining,
                )
            }
        }
    }
828
829 fn after_double_slash(
830 mut self,
831 input: Input<'_>,
832 scheme_type: SchemeType,
833 scheme_end: u32,
834 ) -> ParseResult<Url> {
835 self.serialization.push('/');
836 self.serialization.push('/');
837 // authority state
838 let before_authority = self.serialization.len();
839 let (username_end, remaining) = self.parse_userinfo(input, scheme_type)?;
840 let has_authority = before_authority != self.serialization.len();
841 // host state
842 let host_start = to_u32(self.serialization.len())?;
843 let (host_end, host, port, remaining) =
844 self.parse_host_and_port(remaining, scheme_end, scheme_type)?;
845 if host == HostInternal::None && has_authority {
846 return Err(ParseError::EmptyHost);
847 }
848 // path state
849 let path_start = to_u32(self.serialization.len())?;
850 let remaining = self.parse_path_start(scheme_type, &mut true, remaining);
851 self.with_query_and_fragment(
852 scheme_type,
853 scheme_end,
854 username_end,
855 host_start,
856 host_end,
857 host,
858 port,
859 path_start,
860 remaining,
861 )
862 }
863
    /// Parses the optional `username[:password]@` part of an authority and
    /// appends it, percent-encoded with `USERINFO`, to the serialization.
    ///
    /// Return (username_end, remaining)
    ///
    /// `username_end` is an offset into the serialization; `remaining` is the
    /// input after the last `@`, or the untouched input when there is none.
    fn parse_userinfo<'i>(
        &mut self,
        mut input: Input<'i>,
        scheme_type: SchemeType,
    ) -> ParseResult<(u32, Input<'i>)> {
        // First pass: find the last '@' before the end of the authority,
        // recording how many characters precede it and the input after it.
        let mut last_at = None;
        let mut remaining = input.clone();
        let mut char_count = 0;
        while let Some(c) = remaining.next() {
            match c {
                '@' => {
                    if last_at.is_some() {
                        self.log_violation(SyntaxViolation::UnencodedAtSign)
                    } else {
                        self.log_violation(SyntaxViolation::EmbeddedCredentials)
                    }
                    last_at = Some((char_count, remaining.clone()))
                }
                '/' | '?' | '#' => break,
                '\\' if scheme_type.is_special() => break,
                _ => (),
            }
            char_count += 1;
        }
        let (mut userinfo_char_count, remaining) = match last_at {
            // No '@': there is no userinfo and nothing is consumed.
            None => return Ok((to_u32(self.serialization.len())?, input)),
            // '@' is the very first character: the userinfo is empty.
            Some((0, remaining)) => {
                // Otherwise, if one of the following is true
                // c is the EOF code point, U+002F (/), U+003F (?), or U+0023 (#)
                // url is special and c is U+005C (\)
                // If @ flag is set and buffer is the empty string, validation error, return failure.
                if let (Some(c), _) = remaining.split_first() {
                    if c == '/' || c == '?' || c == '#' || (scheme_type.is_special() && c == '\\') {
                        return Err(ParseError::EmptyHost);
                    }
                }
                return Ok((to_u32(self.serialization.len())?, remaining));
            }
            Some(x) => x,
        };

        // Second pass: consume exactly the characters before the last '@'.
        let mut username_end = None;
        let mut has_password = false;
        let mut has_username = false;
        while userinfo_char_count > 0 {
            let (c, utf8_c) = input.next_utf8().unwrap();
            userinfo_char_count -= 1;
            // Only the first ':' separates username from password.
            if c == ':' && username_end.is_none() {
                // Start parsing password
                username_end = Some(to_u32(self.serialization.len())?);
                // We don't add a colon if the password is empty
                if userinfo_char_count > 0 {
                    self.serialization.push(':');
                    has_password = true;
                }
            } else {
                if !has_password {
                    has_username = true;
                }
                self.check_url_code_point(c, &input);
                self.serialization
                    .extend(utf8_percent_encode(utf8_c, USERINFO));
            }
        }
        let username_end = match username_end {
            Some(i) => i,
            None => to_u32(self.serialization.len())?,
        };
        // The '@' is written only when there are credentials to separate.
        if has_username || has_password {
            self.serialization.push('@');
        }
        Ok((username_end, remaining))
    }
938
939 fn parse_host_and_port<'i>(
940 &mut self,
941 input: Input<'i>,
942 scheme_end: u32,
943 scheme_type: SchemeType,
944 ) -> ParseResult<(u32, HostInternal, Option<u16>, Input<'i>)> {
945 let (host, remaining) = Parser::parse_host(input, scheme_type)?;
946 write!(&mut self.serialization, "{}", host).unwrap();
947 let host_end = to_u32(self.serialization.len())?;
948 if let Host::Domain(h) = &host {
949 if h.is_empty() {
950 // Port with an empty host
951 if remaining.starts_with(":") {
952 return Err(ParseError::EmptyHost);
953 }
954 if scheme_type.is_special() {
955 return Err(ParseError::EmptyHost);
956 }
957 }
958 };
959
960 let (port, remaining) = if let Some(remaining) = remaining.split_prefix(':') {
961 let scheme = || default_port(&self.serialization[..scheme_end as usize]);
962 Parser::parse_port(remaining, scheme, self.context)?
963 } else {
964 (None, remaining)
965 };
966 if let Some(port) = port {
967 write!(&mut self.serialization, ":{}", port).unwrap()
968 }
969 Ok((host_end, host.into(), port, remaining))
970 }
971
    /// Parses a host from the front of `input` and returns it with the input
    /// that follows.
    ///
    /// `file` schemes are delegated to `get_file_host`. Otherwise the host
    /// ends at the first `:` outside square brackets, at `/`, `?` or `#`, or
    /// at `\` for special schemes. An empty host is an `EmptyHost` error for
    /// `SpecialNotFile`; non-special schemes get an opaque host.
    pub fn parse_host(
        mut input: Input<'_>,
        scheme_type: SchemeType,
    ) -> ParseResult<(Host<String>, Input<'_>)> {
        if scheme_type.is_file() {
            return Parser::get_file_host(input);
        }
        // Undo the Input abstraction here to avoid allocating in the common case
        // where the host part of the input does not contain any tab or newline
        let input_str = input.chars.as_str();
        let mut inside_square_brackets = false;
        let mut has_ignored_chars = false;
        // Characters the `Input` iterator would yield, and raw bytes scanned.
        let mut non_ignored_chars = 0;
        let mut bytes = 0;
        for c in input_str.chars() {
            match c {
                ':' if !inside_square_brackets => break,
                '\\' if scheme_type.is_special() => break,
                '/' | '?' | '#' => break,
                '\t' | '\n' | '\r' => {
                    has_ignored_chars = true;
                }
                '[' => {
                    inside_square_brackets = true;
                    non_ignored_chars += 1
                }
                ']' => {
                    inside_square_brackets = false;
                    non_ignored_chars += 1
                }
                _ => non_ignored_chars += 1,
            }
            bytes += c.len_utf8();
        }
        let replaced: String;
        let host_str;
        {
            // Advance `input` past the host. The filtered characters are
            // collected only when something was skipped; otherwise the raw
            // slice is borrowed.
            let host_input = input.by_ref().take(non_ignored_chars);
            if has_ignored_chars {
                replaced = host_input.collect();
                host_str = &*replaced
            } else {
                for _ in host_input {}
                host_str = &input_str[..bytes]
            }
        }
        if scheme_type == SchemeType::SpecialNotFile && host_str.is_empty() {
            return Err(ParseError::EmptyHost);
        }
        if !scheme_type.is_special() {
            let host = Host::parse_opaque(host_str)?;
            return Ok((host, input));
        }
        let host = Host::parse(host_str)?;
        Ok((host, input))
    }
1028
1029 fn get_file_host(input: Input<'_>) -> ParseResult<(Host<String>, Input<'_>)> {
1030 let (_, host_str, remaining) = Parser::file_host(input)?;
1031 let host = match Host::parse(&host_str)? {
1032 Host::Domain(ref d) if d == "localhost" => Host::Domain("".to_string()),
1033 host => host,
1034 };
1035 Ok((host, remaining))
1036 }
1037
1038 fn parse_file_host<'i>(
1039 &mut self,
1040 input: Input<'i>,
1041 ) -> ParseResult<(bool, HostInternal, Input<'i>)> {
1042 let has_host;
1043 let (_, host_str, remaining) = Parser::file_host(input)?;
1044 let host = if host_str.is_empty() {
1045 has_host = false;
1046 HostInternal::None
1047 } else {
1048 match Host::parse(&host_str)? {
1049 Host::Domain(ref d) if d == "localhost" => {
1050 has_host = false;
1051 HostInternal::None
1052 }
1053 host => {
1054 write!(&mut self.serialization, "{}", host).unwrap();
1055 has_host = true;
1056 host.into()
1057 }
1058 }
1059 };
1060 Ok((has_host, host, remaining))
1061 }
1062
1063 pub fn file_host(input: Input) -> ParseResult<(bool, String, Input)> {
1064 // Undo the Input abstraction here to avoid allocating in the common case
1065 // where the host part of the input does not contain any tab or newline
1066 let input_str = input.chars.as_str();
1067 let mut has_ignored_chars = false;
1068 let mut non_ignored_chars = 0;
1069 let mut bytes = 0;
1070 for c in input_str.chars() {
1071 match c {
1072 '/' | '\\' | '?' | '#' => break,
1073 '\t' | '\n' | '\r' => has_ignored_chars = true,
1074 _ => non_ignored_chars += 1,
1075 }
1076 bytes += c.len_utf8();
1077 }
1078 let replaced: String;
1079 let host_str;
1080 let mut remaining = input.clone();
1081 {
1082 let host_input = remaining.by_ref().take(non_ignored_chars);
1083 if has_ignored_chars {
1084 replaced = host_input.collect();
1085 host_str = &*replaced
1086 } else {
1087 for _ in host_input {}
1088 host_str = &input_str[..bytes]
1089 }
1090 }
1091 if is_windows_drive_letter(host_str) {
1092 return Ok((false, "".to_string(), input));
1093 }
1094 Ok((true, host_str.to_string(), remaining))
1095 }
1096
1097 pub fn parse_port<P>(
1098 mut input: Input<'_>,
1099 default_port: P,
1100 context: Context,
1101 ) -> ParseResult<(Option<u16>, Input<'_>)>
1102 where
1103 P: Fn() -> Option<u16>,
1104 {
1105 let mut port: u32 = 0;
1106 let mut has_any_digit = false;
1107 while let (Some(c), remaining) = input.split_first() {
1108 if let Some(digit) = c.to_digit(10) {
1109 port = port * 10 + digit;
1110 if port > ::std::u16::MAX as u32 {
1111 return Err(ParseError::InvalidPort);
1112 }
1113 has_any_digit = true;
1114 } else if context == Context::UrlParser && !matches!(c, '/' | '\\' | '?' | '#') {
1115 return Err(ParseError::InvalidPort);
1116 } else {
1117 break;
1118 }
1119 input = remaining;
1120 }
1121 let mut opt_port = Some(port as u16);
1122 if !has_any_digit || opt_port == default_port() {
1123 opt_port = None;
1124 }
1125 Ok((opt_port, input))
1126 }
1127
1128 pub fn parse_path_start<'i>(
1129 &mut self,
1130 scheme_type: SchemeType,
1131 has_host: &mut bool,
1132 input: Input<'i>,
1133 ) -> Input<'i> {
1134 let path_start = self.serialization.len();
1135 let (maybe_c, remaining) = input.split_first();
1136 // If url is special, then:
1137 if scheme_type.is_special() {
1138 if maybe_c == Some('\\') {
1139 // If c is U+005C (\), validation error.
1140 self.log_violation(SyntaxViolation::Backslash);
1141 }
1142 // A special URL always has a non-empty path.
1143 if !self.serialization.ends_with('/') {
1144 self.serialization.push('/');
1145 // We have already made sure the forward slash is present.
1146 if maybe_c == Some('/') || maybe_c == Some('\\') {
1147 return self.parse_path(scheme_type, has_host, path_start, remaining);
1148 }
1149 }
1150 return self.parse_path(scheme_type, has_host, path_start, input);
1151 } else if maybe_c == Some('?') || maybe_c == Some('#') {
1152 // Otherwise, if state override is not given and c is U+003F (?),
1153 // set url’s query to the empty string and state to query state.
1154 // Otherwise, if state override is not given and c is U+0023 (#),
1155 // set url’s fragment to the empty string and state to fragment state.
1156 // The query and path states will be handled by the caller.
1157 return input;
1158 }
1159
1160 if maybe_c.is_some() && maybe_c != Some('/') {
1161 self.serialization.push('/');
1162 }
1163 // Otherwise, if c is not the EOF code point:
1164 self.parse_path(scheme_type, has_host, path_start, input)
1165 }
1166
/// Parses path segments from `input`, appending them percent-encoded to
/// `self.serialization`, and returns the unconsumed input.
///
/// `path_start` is the index in `self.serialization` where the path begins.
/// `has_host` is set to `false` when a `file` URL's first path segment turns
/// out to be a Windows drive letter while a host was present.
///
/// Segment termination:
/// - `/` ends a segment, except in `Context::PathSegmentSetter`;
/// - `\` ends a segment for special schemes (same context exception); it is
///   logged as a violation and written as `/`;
/// - `?` or `#` stops parsing in `Context::UrlParser` and is left unconsumed.
///
/// After each segment, single-dot and double-dot segments (including their
/// `%2e` spellings) are resolved in place.
pub fn parse_path<'i>(
    &mut self,
    scheme_type: SchemeType,
    has_host: &mut bool,
    path_start: usize,
    mut input: Input<'i>,
) -> Input<'i> {
    // Relative path state
    // Outer loop: one iteration per path segment.
    loop {
        let mut segment_start = self.serialization.len();
        let mut ends_with_slash = false;
        // Inner loop: consume the characters of the current segment.
        loop {
            // Snapshot so that a terminating `?` / `#` can be un-read.
            let input_before_c = input.clone();
            let (c, utf8_c) = if let Some(x) = input.next_utf8() {
                x
            } else {
                break;
            };
            match c {
                '/' if self.context != Context::PathSegmentSetter => {
                    self.serialization.push(c);
                    ends_with_slash = true;
                    break;
                }
                '\\' if self.context != Context::PathSegmentSetter
                    && scheme_type.is_special() =>
                {
                    self.log_violation(SyntaxViolation::Backslash);
                    self.serialization.push('/');
                    ends_with_slash = true;
                    break;
                }
                '?' | '#' if self.context == Context::UrlParser => {
                    // Rewind so the returned input still starts at `?` / `#`.
                    input = input_before_c;
                    break;
                }
                _ => {
                    self.check_url_code_point(c, &input);
                    // For `file` URLs: if everything after the leading slash is
                    // exactly a normalized drive letter, insert a `/` before
                    // this character and start the segment after it.
                    if scheme_type.is_file()
                        && is_normalized_windows_drive_letter(
                            &self.serialization[path_start + 1..],
                        )
                    {
                        self.serialization.push('/');
                        segment_start += 1;
                    }
                    // The path-segment setter uses a stricter encode set than
                    // regular parsing.
                    if self.context == Context::PathSegmentSetter {
                        if scheme_type.is_special() {
                            self.serialization
                                .extend(utf8_percent_encode(utf8_c, SPECIAL_PATH_SEGMENT));
                        } else {
                            self.serialization
                                .extend(utf8_percent_encode(utf8_c, PATH_SEGMENT));
                        }
                    } else {
                        self.serialization.extend(utf8_percent_encode(utf8_c, PATH));
                    }
                }
            }
        }
        // The segment just written, without its trailing slash (if any).
        let segment_before_slash = if ends_with_slash {
            &self.serialization[segment_start..self.serialization.len() - 1]
        } else {
            &self.serialization[segment_start..self.serialization.len()]
        };
        match segment_before_slash {
            // If buffer is a double-dot path segment, shorten url’s path,
            ".." | "%2e%2e" | "%2e%2E" | "%2E%2e" | "%2E%2E" | "%2e." | "%2E." | ".%2e"
            | ".%2E" => {
                debug_assert!(self.serialization.as_bytes()[segment_start - 1] == b'/');
                // Drop the dot segment itself.
                self.serialization.truncate(segment_start);
                // Drop the slash before it when allowed, so that shortening
                // removes the previous segment.
                if self.serialization.ends_with('/')
                    && Parser::last_slash_can_be_removed(&self.serialization, path_start)
                {
                    self.serialization.pop();
                }
                self.shorten_path(scheme_type, path_start);

                // and then if neither c is U+002F (/), nor url is special and c is U+005C (\), append the empty string to url’s path.
                if ends_with_slash && !self.serialization.ends_with('/') {
                    self.serialization.push('/');
                }
            }
            // Otherwise, if buffer is a single-dot path segment and if neither c is U+002F (/),
            // nor url is special and c is U+005C (\), append the empty string to url’s path.
            "." | "%2e" | "%2E" => {
                self.serialization.truncate(segment_start);
                if !self.serialization.ends_with('/') {
                    self.serialization.push('/');
                }
            }
            _ => {
                // If url’s scheme is "file", url’s path is empty, and buffer is a Windows drive letter, then
                if scheme_type.is_file()
                    && segment_start == path_start + 1
                    && is_windows_drive_letter(segment_before_slash)
                {
                    // Replace the second code point in buffer with U+003A (:).
                    if let Some(c) = segment_before_slash.chars().next() {
                        self.serialization.truncate(segment_start);
                        self.serialization.push(c);
                        self.serialization.push(':');
                        if ends_with_slash {
                            self.serialization.push('/');
                        }
                    }
                    // If url’s host is neither the empty string nor null,
                    // validation error, set url’s host to the empty string.
                    if *has_host {
                        self.log_violation(SyntaxViolation::FileWithHostAndWindowsDrive);
                        *has_host = false; // FIXME account for this in callers
                    }
                }
            }
        }
        // A segment that did not end with a slash is the last one.
        if !ends_with_slash {
            break;
        }
    }
    if scheme_type.is_file() {
        // while url’s path’s size is greater than 1
        // and url’s path[0] is the empty string,
        // validation error, remove the first item from url’s path.
        //FIXME: log violation
        // Implemented by collapsing all leading slashes of the path into one.
        let path = self.serialization.split_off(path_start);
        self.serialization.push('/');
        self.serialization.push_str(path.trim_start_matches('/'));
    }

    input
}
1298
1299 fn last_slash_can_be_removed(serialization: &str, path_start: usize) -> bool {
1300 let url_before_segment = &serialization[..serialization.len() - 1];
1301 if let Some(segment_before_start) = url_before_segment.rfind('/') {
1302 // Do not remove the root slash
1303 segment_before_start >= path_start
1304 // Or a windows drive letter slash
1305 && !path_starts_with_windows_drive_letter(&serialization[segment_before_start..])
1306 } else {
1307 false
1308 }
1309 }
1310
1311 /// https://url.spec.whatwg.org/#shorten-a-urls-path
1312 fn shorten_path(&mut self, scheme_type: SchemeType, path_start: usize) {
1313 // If path is empty, then return.
1314 if self.serialization.len() == path_start {
1315 return;
1316 }
1317 // If url’s scheme is "file", path’s size is 1, and path[0] is a normalized Windows drive letter, then return.
1318 if scheme_type.is_file()
1319 && is_normalized_windows_drive_letter(&self.serialization[path_start..])
1320 {
1321 return;
1322 }
1323 // Remove path’s last item.
1324 self.pop_path(scheme_type, path_start);
1325 }
1326
1327 /// https://url.spec.whatwg.org/#pop-a-urls-path
1328 fn pop_path(&mut self, scheme_type: SchemeType, path_start: usize) {
1329 if self.serialization.len() > path_start {
1330 let slash_position = self.serialization[path_start..].rfind('/').unwrap();
1331 // + 1 since rfind returns the position before the slash.
1332 let segment_start = path_start + slash_position + 1;
1333 // Don’t pop a Windows drive letter
1334 if !(scheme_type.is_file()
1335 && is_normalized_windows_drive_letter(&self.serialization[segment_start..]))
1336 {
1337 self.serialization.truncate(segment_start);
1338 }
1339 }
1340 }
1341
1342 pub fn parse_cannot_be_a_base_path<'i>(&mut self, mut input: Input<'i>) -> Input<'i> {
1343 loop {
1344 let input_before_c = input.clone();
1345 match input.next_utf8() {
1346 Some(('?', _)) | Some(('#', _)) if self.context == Context::UrlParser => {
1347 return input_before_c
1348 }
1349 Some((c, utf8_c)) => {
1350 self.check_url_code_point(c, &input);
1351 self.serialization
1352 .extend(utf8_percent_encode(utf8_c, CONTROLS));
1353 }
1354 None => return input,
1355 }
1356 }
1357 }
1358
1359 #[allow(clippy::too_many_arguments)]
1360 fn with_query_and_fragment(
1361 mut self,
1362 scheme_type: SchemeType,
1363 scheme_end: u32,
1364 username_end: u32,
1365 host_start: u32,
1366 host_end: u32,
1367 host: HostInternal,
1368 port: Option<u16>,
1369 mut path_start: u32,
1370 remaining: Input<'_>,
1371 ) -> ParseResult<Url> {
1372 // Special case for anarchist URL's with a leading empty path segment
1373 // This prevents web+demo:/.//not-a-host/ or web+demo:/path/..//not-a-host/,
1374 // when parsed and then serialized, from ending up as web+demo://not-a-host/
1375 // (they end up as web+demo:/.//not-a-host/).
1376 //
1377 // If url’s host is null, url does not have an opaque path,
1378 // url’s path’s size is greater than 1, and url’s path[0] is the empty string,
1379 // then append U+002F (/) followed by U+002E (.) to output.
1380 let scheme_end_as_usize = scheme_end as usize;
1381 let path_start_as_usize = path_start as usize;
1382 if path_start_as_usize == scheme_end_as_usize + 1 {
1383 // Anarchist URL
1384 if self.serialization[path_start_as_usize..].starts_with("//") {
1385 // Case 1: The base URL did not have an empty path segment, but the resulting one does
1386 // Insert the "/." prefix
1387 self.serialization.insert_str(path_start_as_usize, "/.");
1388 path_start += 2;
1389 }
1390 assert!(!self.serialization[scheme_end_as_usize..].starts_with("://"));
1391 } else if path_start_as_usize == scheme_end_as_usize + 3
1392 && &self.serialization[scheme_end_as_usize..path_start_as_usize] == ":/."
1393 {
1394 // Anarchist URL with leading empty path segment
1395 // The base URL has a "/." between the host and the path
1396 assert_eq!(self.serialization.as_bytes()[path_start_as_usize], b'/');
1397 if self
1398 .serialization
1399 .as_bytes()
1400 .get(path_start_as_usize + 1)
1401 .copied()
1402 != Some(b'/')
1403 {
1404 // Case 2: The base URL had an empty path segment, but the resulting one does not
1405 // Remove the "/." prefix
1406 self.serialization
1407 .replace_range(scheme_end_as_usize..path_start_as_usize, ":");
1408 path_start -= 2;
1409 }
1410 assert!(!self.serialization[scheme_end_as_usize..].starts_with("://"));
1411 }
1412
1413 let (query_start, fragment_start) =
1414 self.parse_query_and_fragment(scheme_type, scheme_end, remaining)?;
1415 Ok(Url {
1416 serialization: self.serialization,
1417 scheme_end,
1418 username_end,
1419 host_start,
1420 host_end,
1421 host,
1422 port,
1423 path_start,
1424 query_start,
1425 fragment_start,
1426 })
1427 }
1428
1429 /// Return (query_start, fragment_start)
1430 fn parse_query_and_fragment(
1431 &mut self,
1432 scheme_type: SchemeType,
1433 scheme_end: u32,
1434 mut input: Input<'_>,
1435 ) -> ParseResult<(Option<u32>, Option<u32>)> {
1436 let mut query_start = None;
1437 match input.next() {
1438 Some('#') => {}
1439 Some('?') => {
1440 query_start = Some(to_u32(self.serialization.len())?);
1441 self.serialization.push('?');
1442 let remaining = self.parse_query(scheme_type, scheme_end, input);
1443 if let Some(remaining) = remaining {
1444 input = remaining
1445 } else {
1446 return Ok((query_start, None));
1447 }
1448 }
1449 None => return Ok((None, None)),
1450 _ => panic!("Programming error. parse_query_and_fragment() called without ? or #"),
1451 }
1452
1453 let fragment_start = to_u32(self.serialization.len())?;
1454 self.serialization.push('#');
1455 self.parse_fragment(input);
1456 Ok((query_start, Some(fragment_start)))
1457 }
1458
1459 pub fn parse_query<'i>(
1460 &mut self,
1461 scheme_type: SchemeType,
1462 scheme_end: u32,
1463 mut input: Input<'i>,
1464 ) -> Option<Input<'i>> {
1465 let len = input.chars.as_str().len();
1466 let mut query = String::with_capacity(len); // FIXME: use a streaming decoder instead
1467 let mut remaining = None;
1468 while let Some(c) = input.next() {
1469 if c == '#' && self.context == Context::UrlParser {
1470 remaining = Some(input);
1471 break;
1472 } else {
1473 self.check_url_code_point(c, &input);
1474 query.push(c);
1475 }
1476 }
1477
1478 let encoding = match &self.serialization[..scheme_end as usize] {
1479 "http" | "https" | "file" | "ftp" => self.query_encoding_override,
1480 _ => None,
1481 };
1482 let query_bytes = if let Some(o) = encoding {
1483 o(&query)
1484 } else {
1485 query.as_bytes().into()
1486 };
1487 let set = if scheme_type.is_special() {
1488 SPECIAL_QUERY
1489 } else {
1490 QUERY
1491 };
1492 self.serialization.extend(percent_encode(&query_bytes, set));
1493 remaining
1494 }
1495
1496 fn fragment_only(mut self, base_url: &Url, mut input: Input<'_>) -> ParseResult<Url> {
1497 let before_fragment = match base_url.fragment_start {
1498 Some(i) => base_url.slice(..i),
1499 None => &*base_url.serialization,
1500 };
1501 debug_assert!(self.serialization.is_empty());
1502 self.serialization
1503 .reserve(before_fragment.len() + input.chars.as_str().len());
1504 self.serialization.push_str(before_fragment);
1505 self.serialization.push('#');
1506 let next = input.next();
1507 debug_assert!(next == Some('#'));
1508 self.parse_fragment(input);
1509 Ok(Url {
1510 serialization: self.serialization,
1511 fragment_start: Some(to_u32(before_fragment.len())?),
1512 ..*base_url
1513 })
1514 }
1515
1516 pub fn parse_fragment(&mut self, mut input: Input<'_>) {
1517 while let Some((c, utf8_c)) = input.next_utf8() {
1518 if c == '\0' {
1519 self.log_violation(SyntaxViolation::NullInFragment)
1520 } else {
1521 self.check_url_code_point(c, &input);
1522 }
1523 self.serialization
1524 .extend(utf8_percent_encode(utf8_c, FRAGMENT));
1525 }
1526 }
1527
1528 fn check_url_code_point(&self, c: char, input: &Input<'_>) {
1529 if let Some(vfn) = self.violation_fn {
1530 if c == '%' {
1531 let mut input = input.clone();
1532 if !matches!((input.next(), input.next()), (Some(a), Some(b))
1533 if a.is_ascii_hexdigit() && b.is_ascii_hexdigit())
1534 {
1535 vfn(SyntaxViolation::PercentDecode)
1536 }
1537 } else if !is_url_code_point(c) {
1538 vfn(SyntaxViolation::NonUrlCodePoint)
1539 }
1540 }
1541 }
1542}
1543
/// Whether `c` is a URL code point: an ASCII alphanumeric, one of
/// `! $ & ' ( ) * + , - . / : ; = ? @ _ ~`, or a code point from U+00A0 to
/// U+10FFFD that is neither a surrogate nor a noncharacter.
///
/// https://url.spec.whatwg.org/#url-code-points
///
/// Non URL code points:
/// U+0000 to U+0020 (space)
/// " # % < > [ \ ] ^ ` { | }
/// U+007F to U+009F
/// surrogates
/// U+FDD0 to U+FDEF
/// Last two of each plane: U+__FFFE to U+__FFFF for __ in 00 to 10 hex
#[inline]
fn is_url_code_point(c: char) -> bool {
    // Plane 14 starts at U+E0000 like every other plane: U+E0000..=U+E0FFF
    // are not noncharacters, so they are URL code points too.
    matches!(c,
        'a'..='z' |
        'A'..='Z' |
        '0'..='9' |
        '!' | '$' | '&' | '\'' | '(' | ')' | '*' | '+' | ',' | '-' |
        '.' | '/' | ':' | ';' | '=' | '?' | '@' | '_' | '~' |
        '\u{A0}'..='\u{D7FF}' | '\u{E000}'..='\u{FDCF}' | '\u{FDF0}'..='\u{FFFD}' |
        '\u{10000}'..='\u{1FFFD}' | '\u{20000}'..='\u{2FFFD}' |
        '\u{30000}'..='\u{3FFFD}' | '\u{40000}'..='\u{4FFFD}' |
        '\u{50000}'..='\u{5FFFD}' | '\u{60000}'..='\u{6FFFD}' |
        '\u{70000}'..='\u{7FFFD}' | '\u{80000}'..='\u{8FFFD}' |
        '\u{90000}'..='\u{9FFFD}' | '\u{A0000}'..='\u{AFFFD}' |
        '\u{B0000}'..='\u{BFFFD}' | '\u{C0000}'..='\u{CFFFD}' |
        '\u{D0000}'..='\u{DFFFD}' | '\u{E0000}'..='\u{EFFFD}' |
        '\u{F0000}'..='\u{FFFFD}' | '\u{100000}'..='\u{10FFFD}')
}
1569
/// Whether `ch` is a C0 control or the space character (U+0000 to U+0020).
///
/// https://url.spec.whatwg.org/#c0-controls-and-space
#[inline]
fn c0_control_or_space(ch: char) -> bool {
    matches!(ch, '\u{0}'..='\u{20}')
}
1575
/// Whether `ch` is an ASCII tab (U+0009), line feed (U+000A) or carriage
/// return (U+000D).
///
/// https://infra.spec.whatwg.org/#ascii-tab-or-newline
#[inline]
fn ascii_tab_or_new_line(ch: char) -> bool {
    ch == '\t' || ch == '\n' || ch == '\r'
}
1581
/// Whether `ch` is an ASCII letter (`A`–`Z` or `a`–`z`).
///
/// https://url.spec.whatwg.org/#ascii-alpha
#[inline]
pub fn ascii_alpha(ch: char) -> bool {
    matches!(ch, 'A'..='Z' | 'a'..='z')
}
1587
1588#[inline]
1589pub fn to_u32(i: usize) -> ParseResult<u32> {
1590 if i <= ::std::u32::MAX as usize {
1591 Ok(i as u32)
1592 } else {
1593 Err(ParseError::Overflow)
1594 }
1595}
1596
1597fn is_normalized_windows_drive_letter(segment: &str) -> bool {
1598 is_windows_drive_letter(segment) && segment.as_bytes()[1] == b':'
1599}
1600
1601/// Whether the scheme is file:, the path has a single segment, and that segment
1602/// is a Windows drive letter
1603#[inline]
1604pub fn is_windows_drive_letter(segment: &str) -> bool {
1605 segment.len() == 2 && starts_with_windows_drive_letter(segment)
1606}
1607
1608/// Whether path starts with a root slash
1609/// and a windows drive letter eg: "/c:" or "/a:/"
1610fn path_starts_with_windows_drive_letter(s: &str) -> bool {
1611 if let Some(c: &u8) = s.as_bytes().first() {
1612 matches!(c, b'/' | b'\\' | b'?' | b'#') && starts_with_windows_drive_letter(&s[1..])
1613 } else {
1614 false
1615 }
1616}
1617
/// Whether `s` starts with a Windows drive letter.
///
/// That is: an ASCII letter, then `:` or `|`, and then either the end of the
/// string or one of `/`, `\`, `?`, `#`. The check works on bytes, so a
/// non-ASCII first character never matches.
fn starts_with_windows_drive_letter(s: &str) -> bool {
    let bytes = s.as_bytes();
    bytes.len() >= 2
        && bytes[0].is_ascii_alphabetic()
        && matches!(bytes[1], b':' | b'|')
        && (bytes.len() == 2 || matches!(bytes[2], b'/' | b'\\' | b'?' | b'#'))
}
1624
1625/// https://url.spec.whatwg.org/#start-with-a-windows-drive-letter
1626fn starts_with_windows_drive_letter_segment(input: &Input<'_>) -> bool {
1627 let mut input: Input<'_> = input.clone();
1628 match (input.next(), input.next(), input.next()) {
1629 // its first two code points are a Windows drive letter
1630 // its third code point is U+002F (/), U+005C (\), U+003F (?), or U+0023 (#).
1631 (Some(a: char), Some(b: char), Some(c: char))
1632 if ascii_alpha(ch:a) && matches!(b, ':' | '|') && matches!(c, '/' | '\\' | '?' | '#') =>
1633 {
1634 true
1635 }
1636 // its first two code points are a Windows drive letter
1637 // its length is 2
1638 (Some(a: char), Some(b: char), None) if ascii_alpha(ch:a) && matches!(b, ':' | '|') => true,
1639 _ => false,
1640 }
1641}
1642