| 1 | // Copyright © SixtyFPS GmbH <info@slint.dev> |
| 2 | // SPDX-License-Identifier: GPL-3.0-only OR LicenseRef-Slint-Royalty-free-2.0 OR LicenseRef-Slint-Software-3.0 |
| 3 | |
//! This module contains the code for the lexer.
//!
//! It is partly shared with parser.rs, which implements `lex_next_token` based on the
//! `macro_rules` that declares the tokens.
| 8 | |
| 9 | use crate::parser::SyntaxKind; |
| 10 | |
/// Mutable state that the lexing rules carry from one token to the next.
#[derive(Default)]
pub struct LexState {
    /// One entry per template string (`"...\{`) that is currently open.
    ///
    /// The entry on top of the stack counts the `{` that were opened inside the innermost
    /// interpolation and are not closed yet: that many `}` still have to be lexed before
    /// a `}` pops the stack and brings the lexer back into string mode.
    template_string_stack: Vec<u32>,
}
| 17 | |
| 18 | /// This trait is used by the `crate::parser::lex_next_token` function and is implemented |
| 19 | /// for rule passed to the macro which can be either a string literal, or a function |
| 20 | pub trait LexingRule { |
| 21 | /// Return the size of the match for this rule, or 0 if there is no match |
| 22 | fn lex(&self, text: &str, state: &mut LexState) -> usize; |
| 23 | } |
| 24 | |
| 25 | impl LexingRule for &str { |
| 26 | #[inline ] |
| 27 | fn lex(&self, text: &str, _: &mut LexState) -> usize { |
| 28 | if text.starts_with(*self) { |
| 29 | self.len() |
| 30 | } else { |
| 31 | 0 |
| 32 | } |
| 33 | } |
| 34 | } |
| 35 | |
| 36 | impl<F: Fn(&str, &mut LexState) -> usize> LexingRule for F { |
| 37 | #[inline ] |
| 38 | fn lex(&self, text: &str, state: &mut LexState) -> usize { |
| 39 | (self)(text, state) |
| 40 | } |
| 41 | } |
| 42 | |
| 43 | pub fn lex_whitespace(text: &str, _: &mut LexState) -> usize { |
| 44 | let mut len: usize = 0; |
| 45 | let chars: Chars<'_> = text.chars(); |
| 46 | for c: char in chars { |
| 47 | if !c.is_whitespace() && ![' \u{0002}' , ' \u{0003}' ].contains(&c) { |
| 48 | break; |
| 49 | } |
| 50 | len += c.len_utf8(); |
| 51 | } |
| 52 | len |
| 53 | } |
| 54 | |
| 55 | pub fn lex_comment(text: &str, _: &mut LexState) -> usize { |
| 56 | // FIXME: could report proper error if not properly terminated |
| 57 | if text.starts_with("//" ) { |
| 58 | return text.find(&[' \n' , ' \r' ] as &[_]).unwrap_or(text.len()); |
| 59 | } |
| 60 | if text.starts_with("/*" ) { |
| 61 | let mut nested = 0; |
| 62 | let mut offset = 2; |
| 63 | let bytes = text.as_bytes(); |
| 64 | while offset < bytes.len() { |
| 65 | if let Some(star) = bytes[offset..].iter().position(|c| *c == b'*' ) { |
| 66 | let star = star + offset; |
| 67 | if star > offset && bytes[star - 1] == b'/' { |
| 68 | nested += 1; |
| 69 | offset = star + 1; |
| 70 | } else if star < bytes.len() - 1 && bytes[star + 1] == b'/' { |
| 71 | if nested == 0 { |
| 72 | return star + 2; |
| 73 | } |
| 74 | nested -= 1; |
| 75 | offset = star + 2; |
| 76 | } else { |
| 77 | offset = star + 1; |
| 78 | } |
| 79 | } else { |
| 80 | // Unterminated |
| 81 | return 0; |
| 82 | } |
| 83 | } |
| 84 | // Unterminated |
| 85 | return 0; |
| 86 | } |
| 87 | |
| 88 | 0 |
| 89 | } |
| 90 | |
| 91 | pub fn lex_string(text: &str, state: &mut LexState) -> usize { |
| 92 | if let Some(brace_level) = state.template_string_stack.last_mut() { |
| 93 | if text.starts_with('{' ) { |
| 94 | *brace_level += 1; |
| 95 | return 0; |
| 96 | } else if text.starts_with('}' ) { |
| 97 | if *brace_level > 0 { |
| 98 | *brace_level -= 1; |
| 99 | return 0; |
| 100 | } else { |
| 101 | state.template_string_stack.pop(); |
| 102 | } |
| 103 | } else if !text.starts_with('"' ) { |
| 104 | return 0; |
| 105 | } |
| 106 | } else if !text.starts_with('"' ) { |
| 107 | return 0; |
| 108 | } |
| 109 | let text_len = text.as_bytes().len(); |
| 110 | let mut end = 1; // skip the '"' |
| 111 | loop { |
| 112 | let stop = match text[end..].find(&['"' , ' \\' ][..]) { |
| 113 | Some(stop) => end + stop, |
| 114 | // FIXME: report an error for unterminated string |
| 115 | None => return 0, |
| 116 | }; |
| 117 | match text.as_bytes()[stop] { |
| 118 | b'"' => { |
| 119 | return stop + 1; |
| 120 | } |
| 121 | b' \\' => { |
| 122 | if text_len <= stop + 1 { |
| 123 | // FIXME: report an error for unterminated string |
| 124 | return 0; |
| 125 | } |
| 126 | if text.as_bytes()[stop + 1] == b'{' { |
| 127 | state.template_string_stack.push(0); |
| 128 | return stop + 2; |
| 129 | } |
| 130 | end = stop + 1 + text[stop + 1..].chars().next().map_or(0, |c| c.len_utf8()) |
| 131 | } |
| 132 | _ => unreachable!(), |
| 133 | } |
| 134 | } |
| 135 | } |
| 136 | |
| 137 | pub fn lex_number(text: &str, _: &mut LexState) -> usize { |
| 138 | let mut len = 0; |
| 139 | let mut chars = text.chars(); |
| 140 | let mut had_period = false; |
| 141 | while let Some(c) = chars.next() { |
| 142 | if !c.is_ascii_digit() { |
| 143 | if !had_period && c == '.' && len > 0 { |
| 144 | had_period = true; |
| 145 | } else { |
| 146 | if len > 0 { |
| 147 | if c == '%' { |
| 148 | return len + 1; |
| 149 | } |
| 150 | if c.is_ascii_alphabetic() { |
| 151 | len += c.len_utf8(); |
| 152 | // The unit |
| 153 | for c in chars { |
| 154 | if !c.is_ascii_alphabetic() { |
| 155 | return len; |
| 156 | } |
| 157 | len += c.len_utf8(); |
| 158 | } |
| 159 | } |
| 160 | } |
| 161 | break; |
| 162 | } |
| 163 | } |
| 164 | len += c.len_utf8(); |
| 165 | } |
| 166 | len |
| 167 | } |
| 168 | |
| 169 | pub fn lex_color(text: &str, _: &mut LexState) -> usize { |
| 170 | if !text.starts_with('#' ) { |
| 171 | return 0; |
| 172 | } |
| 173 | let mut len: usize = 1; |
| 174 | let chars: Chars<'_> = text[1..].chars(); |
| 175 | for c: char in chars { |
| 176 | if !c.is_ascii_alphanumeric() { |
| 177 | break; |
| 178 | } |
| 179 | len += c.len_utf8(); |
| 180 | } |
| 181 | len |
| 182 | } |
| 183 | |
| 184 | pub fn lex_identifier(text: &str, _: &mut LexState) -> usize { |
| 185 | let mut len: usize = 0; |
| 186 | let chars: Chars<'_> = text.chars(); |
| 187 | for c: char in chars { |
| 188 | if !c.is_alphanumeric() && c != '_' && (c != '-' || len == 0) { |
| 189 | break; |
| 190 | } |
| 191 | len += c.len_utf8(); |
| 192 | } |
| 193 | len |
| 194 | } |
| 195 | |
| 196 | #[allow (clippy::needless_update)] // Token may have extra fields depending on selected features |
| 197 | pub fn lex(mut source: &str) -> Vec<crate::parser::Token> { |
| 198 | let mut result = vec![]; |
| 199 | let mut offset = 0; |
| 200 | let mut state = LexState::default(); |
| 201 | if source.starts_with(" \u{FEFF}" ) { |
| 202 | // Skip BOM |
| 203 | result.push(crate::parser::Token { |
| 204 | kind: SyntaxKind::Whitespace, |
| 205 | text: source[..3].into(), |
| 206 | offset: 0, |
| 207 | ..Default::default() |
| 208 | }); |
| 209 | source = &source[3..]; |
| 210 | offset += 3; |
| 211 | } |
| 212 | while !source.is_empty() { |
| 213 | if let Some((len, kind)) = crate::parser::lex_next_token(source, &mut state) { |
| 214 | result.push(crate::parser::Token { |
| 215 | kind, |
| 216 | text: source[..len].into(), |
| 217 | offset, |
| 218 | ..Default::default() |
| 219 | }); |
| 220 | offset += len; |
| 221 | source = &source[len..]; |
| 222 | } else { |
| 223 | // FIXME: recover |
| 224 | result.push(crate::parser::Token { |
| 225 | kind: SyntaxKind::Error, |
| 226 | text: source.into(), |
| 227 | offset, |
| 228 | ..Default::default() |
| 229 | }); |
| 230 | //offset += source.len(); |
| 231 | break; |
| 232 | } |
| 233 | } |
| 234 | result |
| 235 | } |
| 236 | |
/// Check the kind and the text of the tokens produced by [`lex`] for a few small inputs.
#[test]
fn basic_lexer_test() {
    fn compare(source: &str, expected: &[(SyntaxKind, &str)]) {
        let actual = lex(source);
        let actual =
            actual.iter().map(|token| (token.kind, token.text.as_str())).collect::<Vec<_>>();
        assert_eq!(actual.as_slice(), expected);
    }

    compare(
        r#"45 /*hi/*_*/ho*/ "string""#,
        &[
            (SyntaxKind::NumberLiteral, "45"),
            (SyntaxKind::Whitespace, " "),
            (SyntaxKind::Comment, "/*hi/*_*/ho*/"),
            (SyntaxKind::Whitespace, " "),
            (SyntaxKind::StringLiteral, r#""string""#),
        ],
    );

    compare(
        r#"12px+5.2+=0.7%"#,
        &[
            (SyntaxKind::NumberLiteral, "12px"),
            (SyntaxKind::Plus, "+"),
            (SyntaxKind::NumberLiteral, "5.2"),
            (SyntaxKind::PlusEqual, "+="),
            (SyntaxKind::NumberLiteral, "0.7%"),
        ],
    );
    compare(
        r#"aa_a.b1,c"#,
        &[
            (SyntaxKind::Identifier, "aa_a"),
            (SyntaxKind::Dot, "."),
            (SyntaxKind::Identifier, "b1"),
            (SyntaxKind::Comma, ","),
            (SyntaxKind::Identifier, "c"),
        ],
    );
    compare(
        r#"/*/**/*//**/*"#,
        &[
            (SyntaxKind::Comment, "/*/**/*/"),
            (SyntaxKind::Comment, "/**/"),
            (SyntaxKind::Star, "*"),
        ],
    );
    // A line comment stops right before the line break, whether it is `\n` or `\r\n`
    compare(
        "a//x\nb//y\r\nc//z",
        &[
            (SyntaxKind::Identifier, "a"),
            (SyntaxKind::Comment, "//x"),
            (SyntaxKind::Whitespace, "\n"),
            (SyntaxKind::Identifier, "b"),
            (SyntaxKind::Comment, "//y"),
            (SyntaxKind::Whitespace, "\r\n"),
            (SyntaxKind::Identifier, "c"),
            (SyntaxKind::Comment, "//z"),
        ],
    );
    compare(r#""x""#, &[(SyntaxKind::StringLiteral, r#""x""#)]);
    compare(
        r#"a"\"\\"x"#,
        &[
            (SyntaxKind::Identifier, "a"),
            (SyntaxKind::StringLiteral, r#""\"\\""#),
            (SyntaxKind::Identifier, "x"),
        ],
    );
    // Template strings, with nested braces and a nested template string
    compare(
        r#""a\{b{c}d"e\{f}g"h}i"j"#,
        &[
            (SyntaxKind::StringLiteral, r#""a\{"#),
            (SyntaxKind::Identifier, "b"),
            (SyntaxKind::LBrace, "{"),
            (SyntaxKind::Identifier, "c"),
            (SyntaxKind::RBrace, "}"),
            (SyntaxKind::Identifier, "d"),
            (SyntaxKind::StringLiteral, r#""e\{"#),
            (SyntaxKind::Identifier, "f"),
            (SyntaxKind::StringLiteral, r#"}g""#),
            (SyntaxKind::Identifier, "h"),
            (SyntaxKind::StringLiteral, r#"}i""#),
            (SyntaxKind::Identifier, "j"),
        ],
    );

    // Fuzzer tests:
    compare(r#"/**"#, &[(SyntaxKind::Div, "/"), (SyntaxKind::Star, "*"), (SyntaxKind::Star, "*")]);
    compare(r#""\"#, &[(SyntaxKind::Error, "\"\\")]);
    compare(r#""\ޱ"#, &[(SyntaxKind::Error, "\"\\ޱ")]);
}
| 330 | |
| 331 | /// Given the source of a rust file, find the occurrence of each `slint!(...)`macro. |
| 332 | /// Return an iterator with the range of the location of the macro in the original source |
| 333 | pub fn locate_slint_macro(rust_source: &str) -> impl Iterator<Item = core::ops::Range<usize>> + '_ { |
| 334 | let mut begin = 0; |
| 335 | std::iter::from_fn(move || { |
| 336 | let (open, close) = loop { |
| 337 | if let Some(m) = rust_source[begin..].find("slint" ) { |
| 338 | // heuristics to find if we are not in a comment or a string literal. Not perfect, but should work in most cases |
| 339 | if let Some(x) = rust_source[begin..(begin + m)].rfind([' \\' , ' \n' , '/' , ' \"' ]) { |
| 340 | if rust_source.as_bytes()[begin + x] != b' \n' { |
| 341 | begin += m + 5; |
| 342 | begin += rust_source[begin..].find([' \n' ]).unwrap_or(0); |
| 343 | continue; |
| 344 | } |
| 345 | } |
| 346 | begin += m + 5; |
| 347 | while rust_source[begin..].starts_with(' ' ) { |
| 348 | begin += 1; |
| 349 | } |
| 350 | if !rust_source[begin..].starts_with('!' ) { |
| 351 | continue; |
| 352 | } |
| 353 | begin += 1; |
| 354 | while rust_source[begin..].starts_with(' ' ) { |
| 355 | begin += 1; |
| 356 | } |
| 357 | let Some(open) = rust_source.as_bytes().get(begin) else { continue }; |
| 358 | match open { |
| 359 | b'{' => break (SyntaxKind::LBrace, SyntaxKind::RBrace), |
| 360 | b'[' => break (SyntaxKind::LBracket, SyntaxKind::RBracket), |
| 361 | b'(' => break (SyntaxKind::LParent, SyntaxKind::RParent), |
| 362 | _ => continue, |
| 363 | } |
| 364 | } else { |
| 365 | // No macro found, just return |
| 366 | return None; |
| 367 | } |
| 368 | }; |
| 369 | |
| 370 | begin += 1; |
| 371 | |
| 372 | // Now find the matching closing delimiter |
| 373 | // Technically, we should be lexing rust, not slint |
| 374 | let mut state = LexState::default(); |
| 375 | let start = begin; |
| 376 | let mut end = begin; |
| 377 | let mut level = 0; |
| 378 | while !rust_source[end..].is_empty() { |
| 379 | let len = match crate::parser::lex_next_token(&rust_source[end..], &mut state) { |
| 380 | Some((len, x)) if x == open => { |
| 381 | level += 1; |
| 382 | len |
| 383 | } |
| 384 | Some((_, x)) if x == close && level == 0 => { |
| 385 | break; |
| 386 | } |
| 387 | Some((len, x)) if x == close => { |
| 388 | level -= 1; |
| 389 | len |
| 390 | } |
| 391 | Some((len, _)) => len, |
| 392 | None => { |
| 393 | // Lex error |
| 394 | break; |
| 395 | } |
| 396 | }; |
| 397 | if len == 0 { |
| 398 | break; // Shouldn't happen |
| 399 | } |
| 400 | end += len; |
| 401 | } |
| 402 | begin = end; |
| 403 | Some(start..end) |
| 404 | }) |
| 405 | } |
| 406 | |
/// Check which `slint!` macros are found by [`locate_slint_macro`], and what they contain.
#[test]
fn test_locate_rust_macro() {
    #[track_caller]
    fn do_test(source: &str, captures: &[&str]) {
        let result = locate_slint_macro(source).map(|r| &source[r]).collect::<Vec<_>>();
        assert_eq!(&result, captures);
    }

    // Not a macro invocation: there is no `!` right after `slint`
    do_test("\nslint{!{}}", &[]);
    // The occurrences within comments are ignored; all three kinds of delimiters are accepted
    do_test(
        "//slint!(123)\nslint!(456)\nslint ![789]\n/*slint!{abc}*/\nslint! {def}",
        &["456", "789", "def"],
    );
    // A nested macro is part of the contents of the outer one
    do_test("slint!(slint!(abc))slint!()", &["slint!(abc)", ""]);
}
| 422 | |
| 423 | /// Given a Rust source file contents, return a string containing the contents of the first `slint!` macro |
| 424 | /// |
| 425 | /// All the other bytes which are not newlines are replaced by space. This allow offsets in the resulting |
| 426 | /// string to preserve line and column number. |
| 427 | /// |
| 428 | /// The last byte before the Slint area will be \u{2} (ASCII Start-of-Text), the first byte after |
| 429 | /// the slint code will be \u{3} (ASCII End-of-Text), so that programs can find the area of slint code |
| 430 | /// within the program. |
| 431 | /// |
| 432 | /// Note that the slint compiler considers Start-of-Text and End-of-Text as whitespace and will treat them |
| 433 | /// accordingly. |
| 434 | pub fn extract_rust_macro(rust_source: String) -> Option<String> { |
| 435 | let core::ops::Range { start: usize, end: usize } = locate_slint_macro(&rust_source).next()?; |
| 436 | let mut bytes: Vec = rust_source.into_bytes(); |
| 437 | for c: &mut u8 in &mut bytes[..start] { |
| 438 | if *c != b' \n' { |
| 439 | *c = b' ' |
| 440 | } |
| 441 | } |
| 442 | |
| 443 | if start > 0 { |
| 444 | bytes[start - 1] = 2; |
| 445 | } |
| 446 | if end < bytes.len() { |
| 447 | bytes[end] = 3; |
| 448 | |
| 449 | for c: &mut u8 in &mut bytes[end + 1..] { |
| 450 | if *c != b' \n' { |
| 451 | *c = b' ' |
| 452 | } |
| 453 | } |
| 454 | } |
| 455 | Some(String::from_utf8(bytes).expect(msg:"We just added spaces" )) |
| 456 | } |
| 457 | |
/// Check the output of [`extract_rust_macro`].
///
/// The expected string is derived from the input instead of being spelled out: this keeps
/// the byte-for-byte alignment between the two independent of how the literals are formatted.
#[test]
fn test_extract_rust_macro() {
    /// Replace by a space every byte of `s` that is not a newline
    fn blank(s: &str) -> String {
        s.bytes().map(|b| if b == b'\n' { '\n' } else { ' ' }).collect()
    }

    /// Check the extraction from the source made of `prefix`, `body` and `suffix`.
    ///
    /// `prefix` must end with the opening delimiter of the macro, `body` is the expected
    /// Slint code, and `suffix` must either start with the closing delimiter or be empty
    /// (unterminated macro).
    #[track_caller]
    fn check_found(prefix: &str, body: &str, suffix: &str) {
        // The opening delimiter is replaced by the Start-of-Text marker
        let mut expected = blank(&prefix[..prefix.len() - 1]);
        expected.push('\u{2}');
        expected.push_str(body);
        if !suffix.is_empty() {
            // The closing delimiter is replaced by the End-of-Text marker
            expected.push('\u{3}');
            expected.push_str(&blank(&suffix[1..]));
        }
        assert_eq!(extract_rust_macro(format!("{prefix}{body}{suffix}")), Some(expected));
    }

    #[track_caller]
    fn check_none(source: &str) {
        assert_eq!(extract_rust_macro(source.into()), None);
    }

    check_none("\nslint{!{}}");
    // Closing braces within a string literal or nested braces don't terminate the macro
    check_found(
        "abc\n€\nslint ! {",
        "x \" \\\" }🦀\" { () {}\n {} }xx =",
        "}- ;}\n xxx \n yyy {}\n",
    );

    // Unterminated macros extend to the end of the source
    check_found("xx\nabcd::slint!{", "abc{}efg", "");
    check_found("slint!\nnot.\nslint!{", "\nunterminated\nxxx", "");

    // Macros within comments or string literals are ignored
    check_none("foo\n/* slint! { hello }\n");
    check_none("foo\n/* slint::slint! { hello }\n");
    check_found("foo\n// slint! { hello }\nslint!{", "world", "}\na");
    check_none("foo\n\" slint! { hello }\"\n");

    // A closing parenthesis within a comment doesn't terminate the macro
    check_found(
        "abc\n€\nslint ! (",
        "x /* \\\" )🦀*/ { () {}\n {} }xx =",
        ")- ;}\n xxx \n yyy {}\n",
    );
    check_found("abc slint![", "x slint!() [{[]}] s", "] abc");
}
| 498 | |