| 1 | use crate::{ | 
| 2 | error::{ParseError, Reason}, | 
|---|
| 3 | ExceptionId, LicenseId, | 
|---|
| 4 | }; | 
|---|
| 5 |  | 
|---|
/// Parsing configuration for SPDX expressions.
///
/// Each field switches on one relaxation of the SPDX grammar. The derived
/// `Default` leaves every relaxation disabled.
#[derive(Debug, Default, Copy, Clone, PartialEq, Eq)]
pub struct ParseMode {
    /// The `AND`, `OR`, and `WITH` operators are required to be uppercase in
    /// the SPDX spec, but enabling this option allows them to be lowercased
    pub allow_lower_case_operators: bool,
    /// Allows the use of `/` as a synonym for the `OR` operator.
    ///
    /// This also allows for not having whitespace between the `/` and the terms
    /// on either side
    pub allow_slash_as_or_operator: bool,
    /// Allows some invalid/imprecise identifiers as synonyms for an actual
    /// license identifier.
    ///
    /// See [`IMPRECISE_NAMES`](crate::identifiers::IMPRECISE_NAMES) for a list
    /// of the current synonyms. Note that this list is not comprehensive but
    /// can be expanded upon when invalid identifiers are found in the wild.
    pub allow_imprecise_license_names: bool,
    /// The various GPL licenses diverge from every other license in the SPDX
    /// license list by having an `-or-later` variant that is used as a suffix
    /// on a base license (eg. `GPL-3.0-or-later`) rather than the canonical
    /// `GPL-3.0+`.
    ///
    /// This option just allows GPL licenses to be treated similarly to all of
    /// the other SPDX licenses.
    pub allow_postfix_plus_on_gpl: bool,
}
|---|
| 33 |  | 
|---|
| 34 | impl ParseMode { | 
|---|
| 35 | /// Strict, specification compliant SPDX parsing. | 
|---|
| 36 | /// | 
|---|
| 37 | /// 1. Only license identifiers in the SPDX license list, or | 
|---|
| 38 | ///     Document/LicenseRef, are allowed. The license identifiers are also | 
|---|
| 39 | ///     case-sensitive. | 
|---|
| 40 | /// 1. `WITH`, `AND`, and `OR` are the only valid operators | 
|---|
| 41 | pub const STRICT: Self = Self { | 
|---|
| 42 | allow_lower_case_operators: false, | 
|---|
| 43 | allow_slash_as_or_operator: false, | 
|---|
| 44 | allow_imprecise_license_names: false, | 
|---|
| 45 | allow_postfix_plus_on_gpl: false, | 
|---|
| 46 | }; | 
|---|
| 47 |  | 
|---|
| 48 | /// Allow non-conforming syntax for crates-io compatibility | 
|---|
| 49 | /// | 
|---|
| 50 | /// 1. Additional, invalid, identifiers are accepted and mapped to a correct | 
|---|
| 51 | ///     SPDX license identifier. | 
|---|
| 52 | ///     See [`IMPRECISE_NAMES`](crate::identifiers::IMPRECISE_NAMES) for the | 
|---|
| 53 | ///     list of additionally accepted identifiers and the license they | 
|---|
| 54 | ///     correspond to. | 
|---|
| 55 | /// 1. `/` can by used as a synonym for `OR`, and doesn't need to be | 
|---|
| 56 | ///     separated by whitespace from the terms it combines | 
|---|
| 57 | pub const LAX: Self = Self { | 
|---|
| 58 | allow_lower_case_operators: true, | 
|---|
| 59 | allow_slash_as_or_operator: true, | 
|---|
| 60 | allow_imprecise_license_names: true, | 
|---|
| 61 | allow_postfix_plus_on_gpl: true, | 
|---|
| 62 | }; | 
|---|
| 63 | } | 
|---|
| 64 |  | 
|---|
| 65 | /// A single token in an SPDX license expression | 
|---|
| 66 | #[ derive(Clone, Debug, PartialEq, Eq)] | 
|---|
| 67 | pub enum Token<'a> { | 
|---|
| 68 | /// A recognized SPDX license id | 
|---|
| 69 | Spdx(LicenseId), | 
|---|
| 70 | /// A `LicenseRef-` prefixed id, with an optional `DocumentRef-` | 
|---|
| 71 | LicenseRef { | 
|---|
| 72 | doc_ref: Option<&'a str>, | 
|---|
| 73 | lic_ref: &'a str, | 
|---|
| 74 | }, | 
|---|
| 75 | /// A recognized SPDX exception id | 
|---|
| 76 | Exception(ExceptionId), | 
|---|
| 77 | /// A postfix `+` indicating "or later" for a particular SPDX license id | 
|---|
| 78 | Plus, | 
|---|
| 79 | /// A `(` for starting a group | 
|---|
| 80 | OpenParen, | 
|---|
| 81 | /// A `)` for ending a group | 
|---|
| 82 | CloseParen, | 
|---|
| 83 | /// A `WITH` operator | 
|---|
| 84 | With, | 
|---|
| 85 | /// An `AND` operator | 
|---|
| 86 | And, | 
|---|
| 87 | /// An `OR` operator | 
|---|
| 88 | Or, | 
|---|
| 89 | } | 
|---|
| 90 |  | 
|---|
| 91 | impl std::fmt::Display for Token<'_> { | 
|---|
| 92 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { | 
|---|
| 93 | std::fmt::Debug::fmt(self, f) | 
|---|
| 94 | } | 
|---|
| 95 | } | 
|---|
| 96 |  | 
|---|
| 97 | impl Token<'_> { | 
|---|
| 98 | fn len(&self) -> usize { | 
|---|
| 99 | match self { | 
|---|
| 100 | Token::Spdx(id: &LicenseId) => id.name.len(), | 
|---|
| 101 | Token::Exception(e: &ExceptionId) => e.name.len(), | 
|---|
| 102 | Token::With => 4, | 
|---|
| 103 | Token::And => 3, | 
|---|
| 104 | Token::Or => 2, | 
|---|
| 105 | Token::Plus | Token::OpenParen | Token::CloseParen => 1, | 
|---|
| 106 | Token::LicenseRef { doc_ref: &Option<&str>, lic_ref: &&str } => { | 
|---|
| 107 | doc_ref.map_or(default:0, |d: &str| { | 
|---|
| 108 | // +1 is for the `:` | 
|---|
| 109 | "DocumentRef-".len() + d.len() + 1 | 
|---|
| 110 | }) + "LicenseRef-".len() | 
|---|
| 111 | + lic_ref.len() | 
|---|
| 112 | } | 
|---|
| 113 | } | 
|---|
| 114 | } | 
|---|
| 115 | } | 
|---|
| 116 |  | 
|---|
| 117 | /// Allows iteration through an SPDX license expression, yielding | 
|---|
| 118 | /// a token or a `ParseError`. | 
|---|
| 119 | /// | 
|---|
| 120 | /// Prefer to use `Expression::parse` or `Licensee::parse` rather | 
|---|
| 121 | /// than directly using the lexer | 
|---|
| 122 | pub struct Lexer<'a> { | 
|---|
| 123 | inner: &'a str, | 
|---|
| 124 | original: &'a str, | 
|---|
| 125 | offset: usize, | 
|---|
| 126 | mode: ParseMode, | 
|---|
| 127 | } | 
|---|
| 128 |  | 
|---|
| 129 | impl<'a> Lexer<'a> { | 
|---|
| 130 | /// Creates a Lexer over a license expression | 
|---|
| 131 | #[ must_use] | 
|---|
| 132 | pub fn new(text: &'a str) -> Self { | 
|---|
| 133 | Self { | 
|---|
| 134 | inner: text, | 
|---|
| 135 | original: text, | 
|---|
| 136 | offset: 0, | 
|---|
| 137 | mode: ParseMode::STRICT, | 
|---|
| 138 | } | 
|---|
| 139 | } | 
|---|
| 140 |  | 
|---|
| 141 | /// Creates a Lexer over a license expression | 
|---|
| 142 | /// | 
|---|
| 143 | /// With `ParseMode::Lax` it allows non-conforming syntax | 
|---|
| 144 | /// used in crates-io crates. | 
|---|
| 145 | #[ must_use] | 
|---|
| 146 | pub fn new_mode(text: &'a str, mode: ParseMode) -> Self { | 
|---|
| 147 | Self { | 
|---|
| 148 | inner: text, | 
|---|
| 149 | original: text, | 
|---|
| 150 | offset: 0, | 
|---|
| 151 | mode, | 
|---|
| 152 | } | 
|---|
| 153 | } | 
|---|
| 154 |  | 
|---|
| 155 | #[ inline] | 
|---|
| 156 | fn is_ref_char(c: &char) -> bool { | 
|---|
| 157 | c.is_ascii_alphanumeric() || *c == '-'|| *c == '.' | 
|---|
| 158 | } | 
|---|
| 159 |  | 
|---|
| 160 | /// Return a matching text token if found - equivalent to the regex `^[-a-zA-Z0-9.:]+` | 
|---|
| 161 | fn find_text_token(text: &'a str) -> Option<&'a str> { | 
|---|
| 162 | let is_token_char = |c: &char| Self::is_ref_char(c) || *c == ':'; | 
|---|
| 163 | match text.chars().take_while(is_token_char).count() { | 
|---|
| 164 | index if index > 0 => Some(&text[..index]), | 
|---|
| 165 | _ => None, | 
|---|
| 166 | } | 
|---|
| 167 | } | 
|---|
| 168 |  | 
|---|
| 169 | /// Extract the text after `prefix` if made up of valid ref characters | 
|---|
| 170 | fn find_ref(prefix: &str, text: &'a str) -> Option<&'a str> { | 
|---|
| 171 | text.strip_prefix(prefix).map(|value| { | 
|---|
| 172 | let end = value.chars().take_while(Self::is_ref_char).count(); | 
|---|
| 173 | &text[prefix.len()..prefix.len() + end] | 
|---|
| 174 | }) | 
|---|
| 175 | } | 
|---|
| 176 |  | 
|---|
| 177 | /// Return a license ref if found - equivalent to the regex `^LicenseRef-([-a-zA-Z0-9.]+)` | 
|---|
| 178 | #[ inline] | 
|---|
| 179 | fn find_license_ref(text: &'a str) -> Option<&'a str> { | 
|---|
| 180 | Self::find_ref( "LicenseRef-", text) | 
|---|
| 181 | } | 
|---|
| 182 |  | 
|---|
| 183 | /// Return a document ref and license ref if found, | 
|---|
| 184 | /// equivalent to the regex `^DocumentRef-([-a-zA-Z0-9.]+):LicenseRef-([-a-zA-Z0-9.]+)` | 
|---|
| 185 | fn find_document_and_license_ref(text: &'a str) -> Option<(&'a str, &'a str)> { | 
|---|
| 186 | let split = text.split_once( ':'); | 
|---|
| 187 | let doc_ref = split.and_then(|(doc, _)| Self::find_ref( "DocumentRef-", doc)); | 
|---|
| 188 | let lic_ref = split.and_then(|(_, lic)| Self::find_license_ref(lic)); | 
|---|
| 189 | Option::zip(doc_ref, lic_ref) | 
|---|
| 190 | } | 
|---|
| 191 | } | 
|---|
| 192 |  | 
|---|
| 193 | /// A wrapper around a particular token that includes the span of the characters | 
|---|
| 194 | /// in the original string, for diagnostic purposes | 
|---|
| 195 | #[ derive(Debug)] | 
|---|
| 196 | pub struct LexerToken<'a> { | 
|---|
| 197 | /// The token that was lexed | 
|---|
| 198 | pub token: Token<'a>, | 
|---|
| 199 | /// The range of the token characters in the original license expression | 
|---|
| 200 | pub span: std::ops::Range<usize>, | 
|---|
| 201 | } | 
|---|
| 202 |  | 
|---|
| 203 | impl<'a> Iterator for Lexer<'a> { | 
|---|
| 204 | type Item = Result<LexerToken<'a>, ParseError>; | 
|---|
| 205 |  | 
|---|
| 206 | fn next(&mut self) -> Option<Self::Item> { | 
|---|
| 207 | #[ allow(clippy::unnecessary_wraps)] | 
|---|
| 208 | fn ok_token(token: Token<'_>) -> Option<Result<(Token<'_>, usize), ParseError>> { | 
|---|
| 209 | let len = token.len(); | 
|---|
| 210 | Some(Ok((token, len))) | 
|---|
| 211 | } | 
|---|
| 212 |  | 
|---|
| 213 | // Jump over any whitespace, updating `self.inner` and `self.offset` appropriately | 
|---|
| 214 | let non_whitespace_index = match self.inner.find(|c: char| !c.is_whitespace()) { | 
|---|
| 215 | Some(idx) => idx, | 
|---|
| 216 | None => self.inner.len(), | 
|---|
| 217 | }; | 
|---|
| 218 | self.inner = &self.inner[non_whitespace_index..]; | 
|---|
| 219 | self.offset += non_whitespace_index; | 
|---|
| 220 |  | 
|---|
| 221 | match self.inner.chars().next() { | 
|---|
| 222 | None => None, | 
|---|
| 223 | // From SPDX 2.1 spec | 
|---|
| 224 | // There MUST NOT be whitespace between a license-id and any following "+". | 
|---|
| 225 | Some( '+') => { | 
|---|
| 226 | if non_whitespace_index == 0 { | 
|---|
| 227 | ok_token(Token::Plus) | 
|---|
| 228 | } else { | 
|---|
| 229 | Some(Err(ParseError { | 
|---|
| 230 | original: self.original.to_owned(), | 
|---|
| 231 | span: self.offset - non_whitespace_index..self.offset, | 
|---|
| 232 | reason: Reason::SeparatedPlus, | 
|---|
| 233 | })) | 
|---|
| 234 | } | 
|---|
| 235 | } | 
|---|
| 236 | Some( '(') => ok_token(Token::OpenParen), | 
|---|
| 237 | Some( ')') => ok_token(Token::CloseParen), | 
|---|
| 238 | Some( '/') if self.mode.allow_slash_as_or_operator => Some(Ok((Token::Or, 1))), | 
|---|
| 239 | Some(_) => match Lexer::find_text_token(self.inner) { | 
|---|
| 240 | None => Some(Err(ParseError { | 
|---|
| 241 | original: self.original.to_owned(), | 
|---|
| 242 | span: self.offset..self.offset + self.inner.len(), | 
|---|
| 243 | reason: Reason::InvalidCharacters, | 
|---|
| 244 | })), | 
|---|
| 245 | Some(m) => { | 
|---|
| 246 | if m == "WITH"{ | 
|---|
| 247 | ok_token(Token::With) | 
|---|
| 248 | } else if m == "AND"{ | 
|---|
| 249 | ok_token(Token::And) | 
|---|
| 250 | } else if m == "OR"{ | 
|---|
| 251 | ok_token(Token::Or) | 
|---|
| 252 | } else if self.mode.allow_lower_case_operators && m == "and"{ | 
|---|
| 253 | ok_token(Token::And) | 
|---|
| 254 | } else if self.mode.allow_lower_case_operators && m == "or"{ | 
|---|
| 255 | ok_token(Token::Or) | 
|---|
| 256 | } else if self.mode.allow_lower_case_operators && m == "with"{ | 
|---|
| 257 | ok_token(Token::With) | 
|---|
| 258 | } else if let Some(lic_id) = crate::license_id(m) { | 
|---|
| 259 | ok_token(Token::Spdx(lic_id)) | 
|---|
| 260 | } else if let Some(exc_id) = crate::exception_id(m) { | 
|---|
| 261 | ok_token(Token::Exception(exc_id)) | 
|---|
| 262 | } else if let Some((doc_ref, lic_ref)) = Lexer::find_document_and_license_ref(m) | 
|---|
| 263 | { | 
|---|
| 264 | ok_token(Token::LicenseRef { | 
|---|
| 265 | doc_ref: Some(doc_ref), | 
|---|
| 266 | lic_ref, | 
|---|
| 267 | }) | 
|---|
| 268 | } else if let Some(lic_ref) = Lexer::find_license_ref(m) { | 
|---|
| 269 | ok_token(Token::LicenseRef { | 
|---|
| 270 | doc_ref: None, | 
|---|
| 271 | lic_ref, | 
|---|
| 272 | }) | 
|---|
| 273 | } else if let Some((lic_id, token_len)) = | 
|---|
| 274 | if self.mode.allow_imprecise_license_names { | 
|---|
| 275 | crate::imprecise_license_id(self.inner) | 
|---|
| 276 | } else { | 
|---|
| 277 | None | 
|---|
| 278 | } | 
|---|
| 279 | { | 
|---|
| 280 | Some(Ok((Token::Spdx(lic_id), token_len))) | 
|---|
| 281 | } else { | 
|---|
| 282 | Some(Err(ParseError { | 
|---|
| 283 | original: self.original.to_owned(), | 
|---|
| 284 | span: self.offset..self.offset + m.len(), | 
|---|
| 285 | reason: Reason::UnknownTerm, | 
|---|
| 286 | })) | 
|---|
| 287 | } | 
|---|
| 288 | } | 
|---|
| 289 | }, | 
|---|
| 290 | } | 
|---|
| 291 | .map(|res| { | 
|---|
| 292 | res.map(|(tok, len)| { | 
|---|
| 293 | let start = self.offset; | 
|---|
| 294 | self.inner = &self.inner[len..]; | 
|---|
| 295 | self.offset += len; | 
|---|
| 296 |  | 
|---|
| 297 | LexerToken { | 
|---|
| 298 | token: tok, | 
|---|
| 299 | span: start..self.offset, | 
|---|
| 300 | } | 
|---|
| 301 | }) | 
|---|
| 302 | }) | 
|---|
| 303 | } | 
|---|
| 304 | } | 
|---|
| 305 |  | 
|---|