1use crate::{
2 error::{ParseError, Reason},
3 ExceptionId, LicenseId,
4};
5
/// Parsing configuration for SPDX expressions.
///
/// Every option is a simple on/off switch that relaxes one aspect of the
/// lexer. The derived [`Default`] leaves all of them off.
#[derive(Debug, Default, Copy, Clone, PartialEq, Eq)]
pub struct ParseMode {
    /// The `AND`, `OR`, and `WITH` operators are required to be uppercase in
    /// the SPDX spec, but enabling this option allows them to be lowercased.
    pub allow_lower_case_operators: bool,
    /// Allows the use of `/` as a synonym for the `OR` operator.
    ///
    /// This also allows for not having whitespace between the `/` and the
    /// terms on either side.
    pub allow_slash_as_or_operator: bool,
    /// Allows some invalid/imprecise identifiers as synonyms for an actual
    /// license identifier.
    ///
    /// See [`IMPRECISE_NAMES`](crate::identifiers::IMPRECISE_NAMES) for a list
    /// of the current synonyms. Note that this list is not comprehensive but
    /// can be expanded upon when invalid identifiers are found in the wild.
    pub allow_imprecise_license_names: bool,
    /// The various GPL licenses diverge from every other license in the SPDX
    /// license list by having an `-or-later` variant that is used as a suffix
    /// on a base license (eg. `GPL-3.0-or-later`) rather than the canonical
    /// `GPL-3.0+`.
    ///
    /// Enabling this option allows GPL licenses to be treated the same way as
    /// all of the other SPDX licenses.
    pub allow_postfix_plus_on_gpl: bool,
}
33
34impl ParseMode {
35 /// Strict, specification compliant SPDX parsing.
36 ///
37 /// 1. Only license identifiers in the SPDX license list, or
38 /// Document/LicenseRef, are allowed. The license identifiers are also
39 /// case-sensitive.
40 /// 1. `WITH`, `AND`, and `OR` are the only valid operators
41 pub const STRICT: Self = Self {
42 allow_lower_case_operators: false,
43 allow_slash_as_or_operator: false,
44 allow_imprecise_license_names: false,
45 allow_postfix_plus_on_gpl: false,
46 };
47
48 /// Allow non-conforming syntax for crates-io compatibility
49 ///
50 /// 1. Additional, invalid, identifiers are accepted and mapped to a correct
51 /// SPDX license identifier.
52 /// See [`IMPRECISE_NAMES`](crate::identifiers::IMPRECISE_NAMES) for the
53 /// list of additionally accepted identifiers and the license they
54 /// correspond to.
55 /// 1. `/` can by used as a synonym for `OR`, and doesn't need to be
56 /// separated by whitespace from the terms it combines
57 pub const LAX: Self = Self {
58 allow_lower_case_operators: true,
59 allow_slash_as_or_operator: true,
60 allow_imprecise_license_names: true,
61 allow_postfix_plus_on_gpl: true,
62 };
63}
64
65/// A single token in an SPDX license expression
66#[derive(Clone, Debug, PartialEq, Eq)]
67pub enum Token<'a> {
68 /// A recognized SPDX license id
69 Spdx(LicenseId),
70 /// A `LicenseRef-` prefixed id, with an optional `DocumentRef-`
71 LicenseRef {
72 doc_ref: Option<&'a str>,
73 lic_ref: &'a str,
74 },
75 /// A recognized SPDX exception id
76 Exception(ExceptionId),
77 /// A postfix `+` indicating "or later" for a particular SPDX license id
78 Plus,
79 /// A `(` for starting a group
80 OpenParen,
81 /// A `)` for ending a group
82 CloseParen,
83 /// A `WITH` operator
84 With,
85 /// An `AND` operator
86 And,
87 /// An `OR` operator
88 Or,
89}
90
91impl std::fmt::Display for Token<'_> {
92 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
93 std::fmt::Debug::fmt(self, f)
94 }
95}
96
97impl Token<'_> {
98 fn len(&self) -> usize {
99 match self {
100 Token::Spdx(id: &LicenseId) => id.name.len(),
101 Token::Exception(e: &ExceptionId) => e.name.len(),
102 Token::With => 4,
103 Token::And => 3,
104 Token::Or => 2,
105 Token::Plus | Token::OpenParen | Token::CloseParen => 1,
106 Token::LicenseRef { doc_ref: &Option<&str>, lic_ref: &&str } => {
107 doc_ref.map_or(default:0, |d: &str| {
108 // +1 is for the `:`
109 "DocumentRef-".len() + d.len() + 1
110 }) + "LicenseRef-".len()
111 + lic_ref.len()
112 }
113 }
114 }
115}
116
117/// Allows iteration through an SPDX license expression, yielding
118/// a token or a `ParseError`.
119///
120/// Prefer to use `Expression::parse` or `Licensee::parse` rather
121/// than directly using the lexer
122pub struct Lexer<'a> {
123 inner: &'a str,
124 original: &'a str,
125 offset: usize,
126 mode: ParseMode,
127}
128
129impl<'a> Lexer<'a> {
130 /// Creates a Lexer over a license expression
131 #[must_use]
132 pub fn new(text: &'a str) -> Self {
133 Self {
134 inner: text,
135 original: text,
136 offset: 0,
137 mode: ParseMode::STRICT,
138 }
139 }
140
141 /// Creates a Lexer over a license expression
142 ///
143 /// With `ParseMode::Lax` it allows non-conforming syntax
144 /// used in crates-io crates.
145 #[must_use]
146 pub fn new_mode(text: &'a str, mode: ParseMode) -> Self {
147 Self {
148 inner: text,
149 original: text,
150 offset: 0,
151 mode,
152 }
153 }
154
155 #[inline]
156 fn is_ref_char(c: &char) -> bool {
157 c.is_ascii_alphanumeric() || *c == '-' || *c == '.'
158 }
159
160 /// Return a matching text token if found - equivalent to the regex `^[-a-zA-Z0-9.:]+`
161 fn find_text_token(text: &'a str) -> Option<&'a str> {
162 let is_token_char = |c: &char| Self::is_ref_char(c) || *c == ':';
163 match text.chars().take_while(is_token_char).count() {
164 index if index > 0 => Some(&text[..index]),
165 _ => None,
166 }
167 }
168
169 /// Extract the text after `prefix` if made up of valid ref characters
170 fn find_ref(prefix: &str, text: &'a str) -> Option<&'a str> {
171 text.strip_prefix(prefix).map(|value| {
172 let end = value.chars().take_while(Self::is_ref_char).count();
173 &text[prefix.len()..prefix.len() + end]
174 })
175 }
176
177 /// Return a license ref if found - equivalent to the regex `^LicenseRef-([-a-zA-Z0-9.]+)`
178 #[inline]
179 fn find_license_ref(text: &'a str) -> Option<&'a str> {
180 Self::find_ref("LicenseRef-", text)
181 }
182
183 /// Return a document ref and license ref if found,
184 /// equivalent to the regex `^DocumentRef-([-a-zA-Z0-9.]+):LicenseRef-([-a-zA-Z0-9.]+)`
185 fn find_document_and_license_ref(text: &'a str) -> Option<(&'a str, &'a str)> {
186 let split = text.split_once(':');
187 let doc_ref = split.and_then(|(doc, _)| Self::find_ref("DocumentRef-", doc));
188 let lic_ref = split.and_then(|(_, lic)| Self::find_license_ref(lic));
189 Option::zip(doc_ref, lic_ref)
190 }
191}
192
193/// A wrapper around a particular token that includes the span of the characters
194/// in the original string, for diagnostic purposes
195#[derive(Debug)]
196pub struct LexerToken<'a> {
197 /// The token that was lexed
198 pub token: Token<'a>,
199 /// The range of the token characters in the original license expression
200 pub span: std::ops::Range<usize>,
201}
202
203impl<'a> Iterator for Lexer<'a> {
204 type Item = Result<LexerToken<'a>, ParseError>;
205
206 fn next(&mut self) -> Option<Self::Item> {
207 #[allow(clippy::unnecessary_wraps)]
208 fn ok_token(token: Token<'_>) -> Option<Result<(Token<'_>, usize), ParseError>> {
209 let len = token.len();
210 Some(Ok((token, len)))
211 }
212
213 // Jump over any whitespace, updating `self.inner` and `self.offset` appropriately
214 let non_whitespace_index = match self.inner.find(|c: char| !c.is_whitespace()) {
215 Some(idx) => idx,
216 None => self.inner.len(),
217 };
218 self.inner = &self.inner[non_whitespace_index..];
219 self.offset += non_whitespace_index;
220
221 match self.inner.chars().next() {
222 None => None,
223 // From SPDX 2.1 spec
224 // There MUST NOT be whitespace between a license-id and any following "+".
225 Some('+') => {
226 if non_whitespace_index == 0 {
227 ok_token(Token::Plus)
228 } else {
229 Some(Err(ParseError {
230 original: self.original.to_owned(),
231 span: self.offset - non_whitespace_index..self.offset,
232 reason: Reason::SeparatedPlus,
233 }))
234 }
235 }
236 Some('(') => ok_token(Token::OpenParen),
237 Some(')') => ok_token(Token::CloseParen),
238 Some('/') if self.mode.allow_slash_as_or_operator => Some(Ok((Token::Or, 1))),
239 Some(_) => match Lexer::find_text_token(self.inner) {
240 None => Some(Err(ParseError {
241 original: self.original.to_owned(),
242 span: self.offset..self.offset + self.inner.len(),
243 reason: Reason::InvalidCharacters,
244 })),
245 Some(m) => {
246 if m == "WITH" {
247 ok_token(Token::With)
248 } else if m == "AND" {
249 ok_token(Token::And)
250 } else if m == "OR" {
251 ok_token(Token::Or)
252 } else if self.mode.allow_lower_case_operators && m == "and" {
253 ok_token(Token::And)
254 } else if self.mode.allow_lower_case_operators && m == "or" {
255 ok_token(Token::Or)
256 } else if self.mode.allow_lower_case_operators && m == "with" {
257 ok_token(Token::With)
258 } else if let Some(lic_id) = crate::license_id(m) {
259 ok_token(Token::Spdx(lic_id))
260 } else if let Some(exc_id) = crate::exception_id(m) {
261 ok_token(Token::Exception(exc_id))
262 } else if let Some((doc_ref, lic_ref)) = Lexer::find_document_and_license_ref(m)
263 {
264 ok_token(Token::LicenseRef {
265 doc_ref: Some(doc_ref),
266 lic_ref,
267 })
268 } else if let Some(lic_ref) = Lexer::find_license_ref(m) {
269 ok_token(Token::LicenseRef {
270 doc_ref: None,
271 lic_ref,
272 })
273 } else if let Some((lic_id, token_len)) =
274 if self.mode.allow_imprecise_license_names {
275 crate::imprecise_license_id(self.inner)
276 } else {
277 None
278 }
279 {
280 Some(Ok((Token::Spdx(lic_id), token_len)))
281 } else {
282 Some(Err(ParseError {
283 original: self.original.to_owned(),
284 span: self.offset..self.offset + m.len(),
285 reason: Reason::UnknownTerm,
286 }))
287 }
288 }
289 },
290 }
291 .map(|res| {
292 res.map(|(tok, len)| {
293 let start = self.offset;
294 self.inner = &self.inner[len..];
295 self.offset += len;
296
297 LexerToken {
298 token: tok,
299 span: start..self.offset,
300 }
301 })
302 })
303 }
304}
305