lexer.rs source code [crates/spdx/src/lexer.rs]

1	use crate::{
2	error::{ParseError, Reason},
3	ExceptionId, LicenseId,
4	};
5
/// Parsing configuration for SPDX expressions
///
/// All options default to `false` (the derived `Default`).
#[derive(Debug, Default, Copy, Clone, PartialEq, Eq)]
pub struct ParseMode {
    /// The `AND`, `OR`, and `WITH` operators are required to be uppercase in
    /// the SPDX spec, but enabling this option allows them to be lowercased
    pub allow_lower_case_operators: bool,
    /// Allows the use of `/` as a synonym for the `OR` operator.
    ///
    /// This also allows for not having whitespace between the `/` and the terms
    /// on either side
    pub allow_slash_as_or_operator: bool,
    /// Allows some invalid/imprecise identifiers as synonyms for an actual
    /// license identifier.
    ///
    /// See [`IMPRECISE_NAMES`](crate::identifiers::IMPRECISE_NAMES) for a list
    /// of the current synonyms. Note that this list is not comprehensive but
    /// can be expanded upon when invalid identifiers are found in the wild.
    pub allow_imprecise_license_names: bool,
    /// The various GPL licenses diverge from every other license in the SPDX
    /// license list by having an `-or-later` variant that is used as a suffix
    /// on a base license (eg. `GPL-3.0-or-later`) rather than the canonical
    /// `GPL-3.0+`.
    ///
    /// This option just allows GPL licenses to be treated similarly to all of
    /// the other SPDX licenses.
    pub allow_postfix_plus_on_gpl: bool,
}
33
34	impl ParseMode {
35	/// Strict, specification compliant SPDX parsing.
36	///
37	/// 1. Only license identifiers in the SPDX license list, or
38	/// Document/LicenseRef, are allowed. The license identifiers are also
39	/// case-sensitive.
40	/// 1. `WITH`, `AND`, and `OR` are the only valid operators
41	pub const STRICT: Self = Self {
42	allow_lower_case_operators: `false`,
43	allow_slash_as_or_operator: `false`,
44	allow_imprecise_license_names: `false`,
45	allow_postfix_plus_on_gpl: `false`,
46	};
47
48	/// Allow non-conforming syntax for crates-io compatibility
49	///
50	/// 1. Additional, invalid, identifiers are accepted and mapped to a correct
51	/// SPDX license identifier.
52	/// See [`IMPRECISE_NAMES`](crate::identifiers::IMPRECISE_NAMES) for the
53	/// list of additionally accepted identifiers and the license they
54	/// correspond to.
55	/// 1. `/` can by used as a synonym for `OR`, and doesn't need to be
56	/// separated by whitespace from the terms it combines
57	pub const LAX: Self = Self {
58	allow_lower_case_operators: `true`,
59	allow_slash_as_or_operator: `true`,
60	allow_imprecise_license_names: `true`,
61	allow_postfix_plus_on_gpl: `true`,
62	};
63	}
64
65	/// A single token in an SPDX license expression
66	#[derive(Clone, Debug, PartialEq, Eq)]
67	pub enum Token<'a> {
68	/// A recognized SPDX license id
69	Spdx(LicenseId),
70	/// A `LicenseRef-` prefixed id, with an optional `DocumentRef-`
71	LicenseRef {
72	doc_ref: Option<&'a str>,
73	lic_ref: &'a str,
74	},
75	/// A recognized SPDX exception id
76	Exception(ExceptionId),
77	/// A postfix `+` indicating "or later" for a particular SPDX license id
78	Plus,
79	/// A `(` for starting a group
80	OpenParen,
81	/// A `)` for ending a group
82	CloseParen,
83	/// A `WITH` operator
84	With,
85	/// An `AND` operator
86	And,
87	/// An `OR` operator
88	Or,
89	}
90
91	impl std::fmt::Display for Token<'_> {
92	fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
93	std::fmt::Debug::fmt(self, f)
94	}
95	}
96
97	impl Token<'_> {
98	fn len(&self) -> usize {
99	match self {
100	Token::Spdx(id: &LicenseId) => id.name.len(),
101	Token::Exception(e: &ExceptionId) => e.name.len(),
102	Token::With => `4`,
103	Token::And => `3`,
104	Token::Or => `2`,
105	Token::Plus \| Token::OpenParen \| Token::CloseParen => `1`,
106	Token::LicenseRef { doc_ref: &Option<&str>, lic_ref: &&str } => {
107	doc_ref.map_or(default:`0`, \|d: &str\| {
108	// +1 is for the `:`
109	"DocumentRef-".len() + d.len() + `1`
110	}) + "LicenseRef-".len()
111	+ lic_ref.len()
112	}
113	}
114	}
115	}
116
117	/// Allows iteration through an SPDX license expression, yielding
118	/// a token or a `ParseError`.
119	///
120	/// Prefer to use `Expression::parse` or `Licensee::parse` rather
121	/// than directly using the lexer
122	pub struct Lexer<'a> {
123	inner: &'a str,
124	original: &'a str,
125	offset: usize,
126	mode: ParseMode,
127	}
128
129	impl<'a> Lexer<'a> {
130	/// Creates a Lexer over a license expression
131	#[must_use]
132	pub fn new(text: &'a str) -> Self {
133	Self {
134	inner: text,
135	original: text,
136	offset: `0`,
137	mode: ParseMode::STRICT,
138	}
139	}
140
141	/// Creates a Lexer over a license expression
142	///
143	/// With `ParseMode::Lax` it allows non-conforming syntax
144	/// used in crates-io crates.
145	#[must_use]
146	pub fn new_mode(text: &'a str, mode: ParseMode) -> Self {
147	Self {
148	inner: text,
149	original: text,
150	offset: `0`,
151	mode,
152	}
153	}
154
155	#[inline]
156	fn is_ref_char(c: &char) -> bool {
157	c.is_ascii_alphanumeric() \|\| c == '-' \|\| c == '.'
158	}
159
160	/// Return a matching text token if found - equivalent to the regex `^[-a-zA-Z0-9.:]+`
161	fn find_text_token(text: &'a str) -> Option<&'a str> {
162	let is_token_char = \|c: &char\| Self::is_ref_char(c) \|\| *c == ':';
163	match text.chars().take_while(is_token_char).count() {
164	index if index > `0` => Some(&text[..index]),
165	_ => None,
166	}
167	}
168
169	/// Extract the text after `prefix` if made up of valid ref characters
170	fn find_ref(prefix: &str, text: &'a str) -> Option<&'a str> {
171	text.strip_prefix(prefix).map(\|value\| {
172	let end = value.chars().take_while(Self::is_ref_char).count();
173	&text[prefix.len()..prefix.len() + end]
174	})
175	}
176
177	/// Return a license ref if found - equivalent to the regex `^LicenseRef-([-a-zA-Z0-9.]+)`
178	#[inline]
179	fn find_license_ref(text: &'a str) -> Option<&'a str> {
180	Self::find_ref("LicenseRef-", text)
181	}
182
183	/// Return a document ref and license ref if found,
184	/// equivalent to the regex `^DocumentRef-([-a-zA-Z0-9.]+):LicenseRef-([-a-zA-Z0-9.]+)`
185	fn find_document_and_license_ref(text: &'a str) -> Option<(&'a str, &'a str)> {
186	let split = text.split_once(':');
187	let doc_ref = split.and_then(\|(doc, _)\| Self::find_ref("DocumentRef-", doc));
188	let lic_ref = split.and_then(\|(_, lic)\| Self::find_license_ref(lic));
189	Option::zip(doc_ref, lic_ref)
190	}
191	}
192
193	/// A wrapper around a particular token that includes the span of the characters
194	/// in the original string, for diagnostic purposes
195	#[derive(Debug)]
196	pub struct LexerToken<'a> {
197	/// The token that was lexed
198	pub token: Token<'a>,
199	/// The range of the token characters in the original license expression
200	pub span: std::ops::Range<usize>,
201	}
202
203	impl<'a> Iterator for Lexer<'a> {
204	type Item = Result<LexerToken<'a>, ParseError>;
205
206	fn next(&mut self) -> Option<Self::Item> {
207	#[allow(clippy::unnecessary_wraps)]
208	fn ok_token(token: Token<'_>) -> Option<Result<(Token<'_>, usize), ParseError>> {
209	let len = token.len();
210	Some(Ok((token, len)))
211	}
212
213	// Jump over any whitespace, updating `self.inner` and `self.offset` appropriately
214	let non_whitespace_index = match self.inner.find(\|c: char\| !c.is_whitespace()) {
215	Some(idx) => idx,
216	None => self.inner.len(),
217	};
218	self.inner = &self.inner[non_whitespace_index..];
219	self.offset += non_whitespace_index;
220
221	match self.inner.chars().next() {
222	None => None,
223	// From SPDX 2.1 spec
224	// There MUST NOT be whitespace between a license-id and any following "+".
225	Some('+') => {
226	if non_whitespace_index == `0` {
227	ok_token(Token::Plus)
228	} else {
229	Some(Err(ParseError {
230	original: self.original.to_owned(),
231	span: self.offset - non_whitespace_index..self.offset,
232	reason: Reason::SeparatedPlus,
233	}))
234	}
235	}
236	Some('(') => ok_token(Token::OpenParen),
237	Some(')') => ok_token(Token::CloseParen),
238	Some('/') if self.mode.allow_slash_as_or_operator => Some(Ok((Token::Or, `1`))),
239	Some(_) => match Lexer::find_text_token(self.inner) {
240	None => Some(Err(ParseError {
241	original: self.original.to_owned(),
242	span: self.offset..self.offset + self.inner.len(),
243	reason: Reason::InvalidCharacters,
244	})),
245	Some(m) => {
246	if m == "WITH" {
247	ok_token(Token::With)
248	} else if m == "AND" {
249	ok_token(Token::And)
250	} else if m == "OR" {
251	ok_token(Token::Or)
252	} else if self.mode.allow_lower_case_operators && m == "and" {
253	ok_token(Token::And)
254	} else if self.mode.allow_lower_case_operators && m == "or" {
255	ok_token(Token::Or)
256	} else if self.mode.allow_lower_case_operators && m == "with" {
257	ok_token(Token::With)
258	} else if let Some(lic_id) = crate::license_id(m) {
259	ok_token(Token::Spdx(lic_id))
260	} else if let Some(exc_id) = crate::exception_id(m) {
261	ok_token(Token::Exception(exc_id))
262	} else if let Some((doc_ref, lic_ref)) = Lexer::find_document_and_license_ref(m)
263	{
264	ok_token(Token::LicenseRef {
265	doc_ref: Some(doc_ref),
266	lic_ref,
267	})
268	} else if let Some(lic_ref) = Lexer::find_license_ref(m) {
269	ok_token(Token::LicenseRef {
270	doc_ref: None,
271	lic_ref,
272	})
273	} else if let Some((lic_id, token_len)) =
274	if self.mode.allow_imprecise_license_names {
275	crate::imprecise_license_id(self.inner)
276	} else {
277	None
278	}
279	{
280	Some(Ok((Token::Spdx(lic_id), token_len)))
281	} else {
282	Some(Err(ParseError {
283	original: self.original.to_owned(),
284	span: self.offset..self.offset + m.len(),
285	reason: Reason::UnknownTerm,
286	}))
287	}
288	}
289	},
290	}
291	.map(\|res\| {
292	res.map(\|(tok, len)\| {
293	let start = self.offset;
294	self.inner = &self.inner[len..];
295	self.offset += len;
296
297	LexerToken {
298	token: tok,
299	span: start..self.offset,
300	}
301	})
302	})
303	}
304	}
305