1 | //===--- CommentLexer.cpp -------------------------------------------------===// |
---|---|
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | |
9 | #include "clang/AST/CommentLexer.h" |
10 | #include "clang/AST/CommentCommandTraits.h" |
11 | #include "clang/Basic/CharInfo.h" |
12 | #include "clang/Basic/DiagnosticComment.h" |
13 | #include "llvm/ADT/StringExtras.h" |
14 | #include "llvm/ADT/StringSwitch.h" |
15 | #include "llvm/Support/ConvertUTF.h" |
16 | #include "llvm/Support/ErrorHandling.h" |
17 | |
18 | namespace clang { |
19 | namespace comments { |
20 | |
21 | void Token::dump(const Lexer &L, const SourceManager &SM) const { |
22 | llvm::errs() << "comments::Token Kind="<< Kind << " "; |
23 | Loc.print(OS&: llvm::errs(), SM); |
24 | llvm::errs() << " "<< Length << " \""<< L.getSpelling(Tok: *this, SourceMgr: SM) << "\"\n"; |
25 | } |
26 | |
27 | static inline bool isHTMLNamedCharacterReferenceCharacter(char C) { |
28 | return isLetter(c: C); |
29 | } |
30 | |
31 | static inline bool isHTMLDecimalCharacterReferenceCharacter(char C) { |
32 | return isDigit(c: C); |
33 | } |
34 | |
35 | static inline bool isHTMLHexCharacterReferenceCharacter(char C) { |
36 | return isHexDigit(c: C); |
37 | } |
38 | |
39 | static inline StringRef convertCodePointToUTF8( |
40 | llvm::BumpPtrAllocator &Allocator, |
41 | unsigned CodePoint) { |
42 | char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT); |
43 | char *ResolvedPtr = Resolved; |
44 | if (llvm::ConvertCodePointToUTF8(Source: CodePoint, ResultPtr&: ResolvedPtr)) |
45 | return StringRef(Resolved, ResolvedPtr - Resolved); |
46 | else |
47 | return StringRef(); |
48 | } |
49 | |
50 | namespace { |
51 | |
52 | #include "clang/AST/CommentHTMLTags.inc" |
53 | #include "clang/AST/CommentHTMLNamedCharacterReferences.inc" |
54 | |
55 | } // end anonymous namespace |
56 | |
57 | StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const { |
58 | // Fast path, first check a few most widely used named character references. |
59 | return llvm::StringSwitch<StringRef>(Name) |
60 | .Case("amp", "&") |
61 | .Case("lt", "<") |
62 | .Case("gt", ">") |
63 | .Case("quot", "\"") |
64 | .Case("apos", "\'") |
65 | // Slow path. |
66 | .Default(translateHTMLNamedCharacterReferenceToUTF8(Name)); |
67 | } |
68 | |
69 | StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const { |
70 | unsigned CodePoint = 0; |
71 | for (unsigned i = 0, e = Name.size(); i != e; ++i) { |
72 | assert(isHTMLDecimalCharacterReferenceCharacter(Name[i])); |
73 | CodePoint *= 10; |
74 | CodePoint += Name[i] - '0'; |
75 | } |
76 | return convertCodePointToUTF8(Allocator, CodePoint); |
77 | } |
78 | |
79 | StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const { |
80 | unsigned CodePoint = 0; |
81 | for (unsigned i = 0, e = Name.size(); i != e; ++i) { |
82 | CodePoint *= 16; |
83 | const char C = Name[i]; |
84 | assert(isHTMLHexCharacterReferenceCharacter(C)); |
85 | CodePoint += llvm::hexDigitValue(C); |
86 | } |
87 | return convertCodePointToUTF8(Allocator, CodePoint); |
88 | } |
89 | |
90 | void Lexer::skipLineStartingDecorations() { |
91 | // This function should be called only for C comments |
92 | assert(CommentState == LCS_InsideCComment); |
93 | |
94 | if (BufferPtr == CommentEnd) |
95 | return; |
96 | |
97 | const char *NewBufferPtr = BufferPtr; |
98 | while (isHorizontalWhitespace(c: *NewBufferPtr)) |
99 | if (++NewBufferPtr == CommentEnd) |
100 | return; |
101 | if (*NewBufferPtr == '*') |
102 | BufferPtr = NewBufferPtr + 1; |
103 | } |
104 | |
105 | namespace { |
106 | /// Returns pointer to the first newline character in the string. |
107 | const char *findNewline(const char *BufferPtr, const char *BufferEnd) { |
108 | for ( ; BufferPtr != BufferEnd; ++BufferPtr) { |
109 | if (isVerticalWhitespace(c: *BufferPtr)) |
110 | return BufferPtr; |
111 | } |
112 | return BufferEnd; |
113 | } |
114 | |
/// Skips one line terminator ("\n", "\r" or "\r\n") starting at BufferPtr.
///
/// \returns the pointer just past the terminator; BufferPtr itself when the
/// range is empty. A non-empty range must start with '\n' or '\r'.
const char *skipNewline(const char *BufferPtr, const char *BufferEnd) {
  if (BufferPtr == BufferEnd)
    return BufferPtr;

  if (*BufferPtr == '\n')
    return BufferPtr + 1;

  assert(*BufferPtr == '\r');
  ++BufferPtr;
  // Treat a "\r\n" pair as a single terminator.
  const bool FollowedByLF = BufferPtr != BufferEnd && *BufferPtr == '\n';
  return FollowedByLF ? BufferPtr + 1 : BufferPtr;
}
129 | |
130 | const char *skipNamedCharacterReference(const char *BufferPtr, |
131 | const char *BufferEnd) { |
132 | for ( ; BufferPtr != BufferEnd; ++BufferPtr) { |
133 | if (!isHTMLNamedCharacterReferenceCharacter(C: *BufferPtr)) |
134 | return BufferPtr; |
135 | } |
136 | return BufferEnd; |
137 | } |
138 | |
139 | const char *skipDecimalCharacterReference(const char *BufferPtr, |
140 | const char *BufferEnd) { |
141 | for ( ; BufferPtr != BufferEnd; ++BufferPtr) { |
142 | if (!isHTMLDecimalCharacterReferenceCharacter(C: *BufferPtr)) |
143 | return BufferPtr; |
144 | } |
145 | return BufferEnd; |
146 | } |
147 | |
148 | const char *skipHexCharacterReference(const char *BufferPtr, |
149 | const char *BufferEnd) { |
150 | for ( ; BufferPtr != BufferEnd; ++BufferPtr) { |
151 | if (!isHTMLHexCharacterReferenceCharacter(C: *BufferPtr)) |
152 | return BufferPtr; |
153 | } |
154 | return BufferEnd; |
155 | } |
156 | |
157 | bool isHTMLIdentifierStartingCharacter(char C) { |
158 | return isLetter(c: C); |
159 | } |
160 | |
161 | bool isHTMLIdentifierCharacter(char C) { |
162 | return isAlphanumeric(c: C); |
163 | } |
164 | |
165 | const char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) { |
166 | for ( ; BufferPtr != BufferEnd; ++BufferPtr) { |
167 | if (!isHTMLIdentifierCharacter(C: *BufferPtr)) |
168 | return BufferPtr; |
169 | } |
170 | return BufferEnd; |
171 | } |
172 | |
/// Skips an HTML string quoted with single or double quotes. BufferPtr must
/// point at the opening quote. A quote character immediately preceded by a
/// backslash is not treated as the closing quote.
///
/// \returns a pointer to the closing quote, or BufferEnd if the string is not
/// terminated within the range.
const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd)
{
  const char Delimiter = *BufferPtr;
  assert(Delimiter == '\"' || Delimiter == '\'');

  const char *Cursor = BufferPtr + 1;
  while (Cursor != BufferEnd) {
    if (*Cursor == Delimiter && Cursor[-1] != '\\')
      return Cursor;
    ++Cursor;
  }
  return BufferEnd;
}
190 | |
191 | const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) { |
192 | for ( ; BufferPtr != BufferEnd; ++BufferPtr) { |
193 | if (!isWhitespace(c: *BufferPtr)) |
194 | return BufferPtr; |
195 | } |
196 | return BufferEnd; |
197 | } |
198 | |
199 | const char *skipHorizontalWhitespace(const char *BufferPtr, |
200 | const char *BufferEnd) { |
201 | for (; BufferPtr != BufferEnd; ++BufferPtr) { |
202 | if (!isHorizontalWhitespace(c: *BufferPtr)) |
203 | return BufferPtr; |
204 | } |
205 | return BufferEnd; |
206 | } |
207 | |
208 | bool isWhitespace(const char *BufferPtr, const char *BufferEnd) { |
209 | return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd; |
210 | } |
211 | |
212 | bool isCommandNameStartCharacter(char C) { |
213 | return isLetter(c: C); |
214 | } |
215 | |
216 | bool isCommandNameCharacter(char C) { |
217 | return isAlphanumeric(c: C); |
218 | } |
219 | |
220 | const char *skipCommandName(const char *BufferPtr, const char *BufferEnd) { |
221 | for ( ; BufferPtr != BufferEnd; ++BufferPtr) { |
222 | if (!isCommandNameCharacter(C: *BufferPtr)) |
223 | return BufferPtr; |
224 | } |
225 | return BufferEnd; |
226 | } |
227 | |
228 | /// Return the one past end pointer for BCPL comments. |
229 | /// Handles newlines escaped with backslash or trigraph for backslahs. |
230 | const char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) { |
231 | const char *CurPtr = BufferPtr; |
232 | while (CurPtr != BufferEnd) { |
233 | while (!isVerticalWhitespace(c: *CurPtr)) { |
234 | CurPtr++; |
235 | if (CurPtr == BufferEnd) |
236 | return BufferEnd; |
237 | } |
238 | // We found a newline, check if it is escaped. |
239 | const char *EscapePtr = CurPtr - 1; |
240 | while(isHorizontalWhitespace(c: *EscapePtr)) |
241 | EscapePtr--; |
242 | |
243 | if (*EscapePtr == '\\' || |
244 | (EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' && |
245 | EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) { |
246 | // We found an escaped newline. |
247 | CurPtr = skipNewline(BufferPtr: CurPtr, BufferEnd); |
248 | } else |
249 | return CurPtr; // Not an escaped newline. |
250 | } |
251 | return BufferEnd; |
252 | } |
253 | |
254 | /// Return the one past end pointer for C comments. |
255 | /// Very dumb, does not handle escaped newlines or trigraphs. |
256 | const char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) { |
257 | for ( ; BufferPtr != BufferEnd; ++BufferPtr) { |
258 | if (*BufferPtr == '*') { |
259 | assert(BufferPtr + 1 != BufferEnd); |
260 | if (*(BufferPtr + 1) == '/') |
261 | return BufferPtr; |
262 | } |
263 | } |
264 | llvm_unreachable("buffer end hit before '*/' was seen"); |
265 | } |
266 | |
267 | } // end anonymous namespace |
268 | |
269 | void Lexer::formTokenWithChars(Token &Result, const char *TokEnd, |
270 | tok::TokenKind Kind) { |
271 | const unsigned TokLen = TokEnd - BufferPtr; |
272 | Result.setLocation(getSourceLocation(Loc: BufferPtr)); |
273 | Result.setKind(Kind); |
274 | Result.setLength(TokLen); |
275 | #ifndef NDEBUG |
276 | Result.TextPtr = "<UNSET>"; |
277 | Result.IntVal = 7; |
278 | #endif |
279 | BufferPtr = TokEnd; |
280 | } |
281 | |
282 | const char *Lexer::skipTextToken() { |
283 | const char *TokenPtr = BufferPtr; |
284 | assert(TokenPtr < CommentEnd); |
285 | StringRef TokStartSymbols = ParseCommands ? "\n\r\\@\"&<": "\n\r"; |
286 | |
287 | again: |
288 | size_t End = |
289 | StringRef(TokenPtr, CommentEnd - TokenPtr).find_first_of(Chars: TokStartSymbols); |
290 | if (End == StringRef::npos) |
291 | return CommentEnd; |
292 | |
293 | // Doxygen doesn't recognize any commands in a one-line double quotation. |
294 | // If we don't find an ending quotation mark, we pretend it never began. |
295 | if (*(TokenPtr + End) == '\"') { |
296 | TokenPtr += End + 1; |
297 | End = StringRef(TokenPtr, CommentEnd - TokenPtr).find_first_of(Chars: "\n\r\""); |
298 | if (End != StringRef::npos && *(TokenPtr + End) == '\"') |
299 | TokenPtr += End + 1; |
300 | goto again; |
301 | } |
302 | return TokenPtr + End; |
303 | } |
304 | |
305 | void Lexer::lexCommentText(Token &T) { |
306 | assert(CommentState == LCS_InsideBCPLComment || |
307 | CommentState == LCS_InsideCComment); |
308 | |
309 | // Handles lexing non-command text, i.e. text and newline. |
310 | auto HandleNonCommandToken = [&]() -> void { |
311 | assert(State == LS_Normal); |
312 | |
313 | const char *TokenPtr = BufferPtr; |
314 | assert(TokenPtr < CommentEnd); |
315 | switch (*TokenPtr) { |
316 | case '\n': |
317 | case '\r': |
318 | TokenPtr = skipNewline(BufferPtr: TokenPtr, BufferEnd: CommentEnd); |
319 | formTokenWithChars(Result&: T, TokEnd: TokenPtr, Kind: tok::newline); |
320 | |
321 | if (CommentState == LCS_InsideCComment) |
322 | skipLineStartingDecorations(); |
323 | return; |
324 | |
325 | default: |
326 | return formTextToken(Result&: T, TokEnd: skipTextToken()); |
327 | } |
328 | }; |
329 | |
330 | if (!ParseCommands) |
331 | return HandleNonCommandToken(); |
332 | |
333 | switch (State) { |
334 | case LS_Normal: |
335 | break; |
336 | case LS_VerbatimBlockFirstLine: |
337 | lexVerbatimBlockFirstLine(T); |
338 | return; |
339 | case LS_VerbatimBlockBody: |
340 | lexVerbatimBlockBody(T); |
341 | return; |
342 | case LS_VerbatimLineText: |
343 | lexVerbatimLineText(T); |
344 | return; |
345 | case LS_HTMLStartTag: |
346 | lexHTMLStartTag(T); |
347 | return; |
348 | case LS_HTMLEndTag: |
349 | lexHTMLEndTag(T); |
350 | return; |
351 | } |
352 | |
353 | assert(State == LS_Normal); |
354 | const char *TokenPtr = BufferPtr; |
355 | assert(TokenPtr < CommentEnd); |
356 | switch(*TokenPtr) { |
357 | case '\\': |
358 | case '@': { |
359 | // Commands that start with a backslash and commands that start with |
360 | // 'at' have equivalent semantics. But we keep information about the |
361 | // exact syntax in AST for comments. |
362 | tok::TokenKind CommandKind = |
363 | (*TokenPtr == '@') ? tok::at_command : tok::backslash_command; |
364 | TokenPtr++; |
365 | if (TokenPtr == CommentEnd) { |
366 | formTextToken(Result&: T, TokEnd: TokenPtr); |
367 | return; |
368 | } |
369 | char C = *TokenPtr; |
370 | switch (C) { |
371 | default: |
372 | break; |
373 | |
374 | case '\\': case '@': case '&': case '$': |
375 | case '#': case '<': case '>': case '%': |
376 | case '\"': case '.': case ':': |
377 | // This is one of \\ \@ \& \$ etc escape sequences. |
378 | TokenPtr++; |
379 | if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') { |
380 | // This is the \:: escape sequence. |
381 | TokenPtr++; |
382 | } |
383 | StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1)); |
384 | formTokenWithChars(Result&: T, TokEnd: TokenPtr, Kind: tok::text); |
385 | T.setText(UnescapedText); |
386 | return; |
387 | } |
388 | |
389 | // Don't make zero-length commands. |
390 | if (!isCommandNameStartCharacter(C: *TokenPtr)) { |
391 | formTextToken(Result&: T, TokEnd: TokenPtr); |
392 | return; |
393 | } |
394 | |
395 | TokenPtr = skipCommandName(BufferPtr: TokenPtr, BufferEnd: CommentEnd); |
396 | unsigned Length = TokenPtr - (BufferPtr + 1); |
397 | |
398 | // Hardcoded support for lexing LaTeX formula commands |
399 | // \f$ \f( \f) \f[ \f] \f{ \f} as a single command. |
400 | if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) { |
401 | C = *TokenPtr; |
402 | if (C == '$' || C == '(' || C == ')' || C == '[' || C == ']' || |
403 | C == '{' || C == '}') { |
404 | TokenPtr++; |
405 | Length++; |
406 | } |
407 | } |
408 | |
409 | StringRef CommandName(BufferPtr + 1, Length); |
410 | |
411 | const CommandInfo *Info = Traits.getCommandInfoOrNULL(Name: CommandName); |
412 | if (!Info) { |
413 | if ((Info = Traits.getTypoCorrectCommandInfo(Typo: CommandName))) { |
414 | StringRef CorrectedName = Info->Name; |
415 | SourceLocation Loc = getSourceLocation(Loc: BufferPtr); |
416 | SourceLocation EndLoc = getSourceLocation(Loc: TokenPtr); |
417 | SourceRange FullRange = SourceRange(Loc, EndLoc); |
418 | SourceRange CommandRange(Loc.getLocWithOffset(Offset: 1), EndLoc); |
419 | Diag(Loc, diag::warn_correct_comment_command_name) |
420 | << FullRange << CommandName << CorrectedName |
421 | << FixItHint::CreateReplacement(CommandRange, CorrectedName); |
422 | } else { |
423 | formTokenWithChars(Result&: T, TokEnd: TokenPtr, Kind: tok::unknown_command); |
424 | T.setUnknownCommandName(CommandName); |
425 | Diag(T.getLocation(), diag::warn_unknown_comment_command_name) |
426 | << SourceRange(T.getLocation(), T.getEndLocation()); |
427 | return; |
428 | } |
429 | } |
430 | if (Info->IsVerbatimBlockCommand) { |
431 | setupAndLexVerbatimBlock(T, TextBegin: TokenPtr, Marker: *BufferPtr, Info); |
432 | return; |
433 | } |
434 | if (Info->IsVerbatimLineCommand) { |
435 | setupAndLexVerbatimLine(T, TextBegin: TokenPtr, Info); |
436 | return; |
437 | } |
438 | formTokenWithChars(Result&: T, TokEnd: TokenPtr, Kind: CommandKind); |
439 | T.setCommandID(Info->getID()); |
440 | return; |
441 | } |
442 | |
443 | case '&': |
444 | lexHTMLCharacterReference(T); |
445 | return; |
446 | |
447 | case '<': { |
448 | TokenPtr++; |
449 | if (TokenPtr == CommentEnd) { |
450 | formTextToken(Result&: T, TokEnd: TokenPtr); |
451 | return; |
452 | } |
453 | const char C = *TokenPtr; |
454 | if (isHTMLIdentifierStartingCharacter(C)) |
455 | setupAndLexHTMLStartTag(T); |
456 | else if (C == '/') |
457 | setupAndLexHTMLEndTag(T); |
458 | else |
459 | formTextToken(Result&: T, TokEnd: TokenPtr); |
460 | return; |
461 | } |
462 | |
463 | default: |
464 | return HandleNonCommandToken(); |
465 | } |
466 | } |
467 | |
468 | void Lexer::setupAndLexVerbatimBlock(Token &T, |
469 | const char *TextBegin, |
470 | char Marker, const CommandInfo *Info) { |
471 | assert(Info->IsVerbatimBlockCommand); |
472 | |
473 | VerbatimBlockEndCommandName.clear(); |
474 | VerbatimBlockEndCommandName.append(RHS: Marker == '\\' ? "\\": "@"); |
475 | VerbatimBlockEndCommandName.append(RHS: Info->EndCommandName); |
476 | |
477 | formTokenWithChars(Result&: T, TokEnd: TextBegin, Kind: tok::verbatim_block_begin); |
478 | T.setVerbatimBlockID(Info->getID()); |
479 | |
480 | // If there is a newline following the verbatim opening command, skip the |
481 | // newline so that we don't create an tok::verbatim_block_line with empty |
482 | // text content. |
483 | if (BufferPtr != CommentEnd && |
484 | isVerticalWhitespace(c: *BufferPtr)) { |
485 | BufferPtr = skipNewline(BufferPtr, BufferEnd: CommentEnd); |
486 | State = LS_VerbatimBlockBody; |
487 | return; |
488 | } |
489 | |
490 | State = LS_VerbatimBlockFirstLine; |
491 | } |
492 | |
493 | void Lexer::lexVerbatimBlockFirstLine(Token &T) { |
494 | again: |
495 | assert(BufferPtr < CommentEnd); |
496 | |
497 | // FIXME: It would be better to scan the text once, finding either the block |
498 | // end command or newline. |
499 | // |
500 | // Extract current line. |
501 | const char *Newline = findNewline(BufferPtr, BufferEnd: CommentEnd); |
502 | StringRef Line(BufferPtr, Newline - BufferPtr); |
503 | |
504 | // Look for end command in current line. |
505 | size_t Pos = Line.find(Str: VerbatimBlockEndCommandName); |
506 | const char *TextEnd; |
507 | const char *NextLine; |
508 | if (Pos == StringRef::npos) { |
509 | // Current line is completely verbatim. |
510 | TextEnd = Newline; |
511 | NextLine = skipNewline(BufferPtr: Newline, BufferEnd: CommentEnd); |
512 | } else if (Pos == 0) { |
513 | // Current line contains just an end command. |
514 | const char *End = BufferPtr + VerbatimBlockEndCommandName.size(); |
515 | StringRef Name(BufferPtr + 1, End - (BufferPtr + 1)); |
516 | formTokenWithChars(Result&: T, TokEnd: End, Kind: tok::verbatim_block_end); |
517 | T.setVerbatimBlockID(Traits.getCommandInfo(Name)->getID()); |
518 | State = LS_Normal; |
519 | return; |
520 | } else { |
521 | // There is some text, followed by end command. Extract text first. |
522 | TextEnd = BufferPtr + Pos; |
523 | NextLine = TextEnd; |
524 | // If there is only whitespace before end command, skip whitespace. |
525 | if (isWhitespace(BufferPtr, BufferEnd: TextEnd)) { |
526 | BufferPtr = TextEnd; |
527 | goto again; |
528 | } |
529 | } |
530 | |
531 | StringRef Text(BufferPtr, TextEnd - BufferPtr); |
532 | formTokenWithChars(Result&: T, TokEnd: NextLine, Kind: tok::verbatim_block_line); |
533 | T.setVerbatimBlockText(Text); |
534 | |
535 | State = LS_VerbatimBlockBody; |
536 | } |
537 | |
538 | void Lexer::lexVerbatimBlockBody(Token &T) { |
539 | assert(State == LS_VerbatimBlockBody); |
540 | |
541 | if (CommentState == LCS_InsideCComment) |
542 | skipLineStartingDecorations(); |
543 | |
544 | if (BufferPtr == CommentEnd) { |
545 | formTokenWithChars(Result&: T, TokEnd: BufferPtr, Kind: tok::verbatim_block_line); |
546 | T.setVerbatimBlockText(""); |
547 | return; |
548 | } |
549 | |
550 | lexVerbatimBlockFirstLine(T); |
551 | } |
552 | |
553 | void Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin, |
554 | const CommandInfo *Info) { |
555 | assert(Info->IsVerbatimLineCommand); |
556 | formTokenWithChars(Result&: T, TokEnd: TextBegin, Kind: tok::verbatim_line_name); |
557 | T.setVerbatimLineID(Info->getID()); |
558 | |
559 | State = LS_VerbatimLineText; |
560 | } |
561 | |
562 | void Lexer::lexVerbatimLineText(Token &T) { |
563 | assert(State == LS_VerbatimLineText); |
564 | |
565 | // Extract current line. |
566 | const char *Newline = findNewline(BufferPtr, BufferEnd: CommentEnd); |
567 | StringRef Text(BufferPtr, Newline - BufferPtr); |
568 | formTokenWithChars(Result&: T, TokEnd: Newline, Kind: tok::verbatim_line_text); |
569 | T.setVerbatimLineText(Text); |
570 | |
571 | State = LS_Normal; |
572 | } |
573 | |
574 | void Lexer::lexHTMLCharacterReference(Token &T) { |
575 | const char *TokenPtr = BufferPtr; |
576 | assert(*TokenPtr == '&'); |
577 | TokenPtr++; |
578 | if (TokenPtr == CommentEnd) { |
579 | formTextToken(Result&: T, TokEnd: TokenPtr); |
580 | return; |
581 | } |
582 | const char *NamePtr; |
583 | bool isNamed = false; |
584 | bool isDecimal = false; |
585 | char C = *TokenPtr; |
586 | if (isHTMLNamedCharacterReferenceCharacter(C)) { |
587 | NamePtr = TokenPtr; |
588 | TokenPtr = skipNamedCharacterReference(BufferPtr: TokenPtr, BufferEnd: CommentEnd); |
589 | isNamed = true; |
590 | } else if (C == '#') { |
591 | TokenPtr++; |
592 | if (TokenPtr == CommentEnd) { |
593 | formTextToken(Result&: T, TokEnd: TokenPtr); |
594 | return; |
595 | } |
596 | C = *TokenPtr; |
597 | if (isHTMLDecimalCharacterReferenceCharacter(C)) { |
598 | NamePtr = TokenPtr; |
599 | TokenPtr = skipDecimalCharacterReference(BufferPtr: TokenPtr, BufferEnd: CommentEnd); |
600 | isDecimal = true; |
601 | } else if (C == 'x' || C == 'X') { |
602 | TokenPtr++; |
603 | NamePtr = TokenPtr; |
604 | TokenPtr = skipHexCharacterReference(BufferPtr: TokenPtr, BufferEnd: CommentEnd); |
605 | } else { |
606 | formTextToken(Result&: T, TokEnd: TokenPtr); |
607 | return; |
608 | } |
609 | } else { |
610 | formTextToken(Result&: T, TokEnd: TokenPtr); |
611 | return; |
612 | } |
613 | if (NamePtr == TokenPtr || TokenPtr == CommentEnd || |
614 | *TokenPtr != ';') { |
615 | formTextToken(Result&: T, TokEnd: TokenPtr); |
616 | return; |
617 | } |
618 | StringRef Name(NamePtr, TokenPtr - NamePtr); |
619 | TokenPtr++; // Skip semicolon. |
620 | StringRef Resolved; |
621 | if (isNamed) |
622 | Resolved = resolveHTMLNamedCharacterReference(Name); |
623 | else if (isDecimal) |
624 | Resolved = resolveHTMLDecimalCharacterReference(Name); |
625 | else |
626 | Resolved = resolveHTMLHexCharacterReference(Name); |
627 | |
628 | if (Resolved.empty()) { |
629 | formTextToken(Result&: T, TokEnd: TokenPtr); |
630 | return; |
631 | } |
632 | formTokenWithChars(Result&: T, TokEnd: TokenPtr, Kind: tok::text); |
633 | T.setText(Resolved); |
634 | } |
635 | |
636 | void Lexer::setupAndLexHTMLStartTag(Token &T) { |
637 | assert(BufferPtr[0] == '<' && |
638 | isHTMLIdentifierStartingCharacter(BufferPtr[1])); |
639 | const char *TagNameEnd = skipHTMLIdentifier(BufferPtr: BufferPtr + 2, BufferEnd: CommentEnd); |
640 | StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1)); |
641 | if (!isHTMLTagName(Name)) { |
642 | formTextToken(Result&: T, TokEnd: TagNameEnd); |
643 | return; |
644 | } |
645 | |
646 | formTokenWithChars(Result&: T, TokEnd: TagNameEnd, Kind: tok::html_start_tag); |
647 | T.setHTMLTagStartName(Name); |
648 | |
649 | BufferPtr = skipHorizontalWhitespace(BufferPtr, BufferEnd: CommentEnd); |
650 | if (BufferPtr == CommentEnd) { // in BCPL comments |
651 | State = LS_HTMLStartTag; |
652 | return; |
653 | } |
654 | |
655 | const char C = *BufferPtr; |
656 | if (BufferPtr != CommentEnd && |
657 | (C == '>' || C == '/' || isVerticalWhitespace(c: C) || |
658 | isHTMLIdentifierStartingCharacter(C))) |
659 | State = LS_HTMLStartTag; |
660 | } |
661 | |
662 | void Lexer::lexHTMLStartTag(Token &T) { |
663 | assert(State == LS_HTMLStartTag); |
664 | |
665 | // Skip leading whitespace and comment decorations |
666 | while (isVerticalWhitespace(c: *BufferPtr)) { |
667 | BufferPtr = skipNewline(BufferPtr, BufferEnd: CommentEnd); |
668 | |
669 | if (CommentState == LCS_InsideCComment) |
670 | skipLineStartingDecorations(); |
671 | |
672 | BufferPtr = skipHorizontalWhitespace(BufferPtr, BufferEnd: CommentEnd); |
673 | if (BufferPtr == CommentEnd) { |
674 | // HTML starting tags must be defined in a single comment block. |
675 | // It's likely a user-error where they forgot to terminate the comment. |
676 | State = LS_Normal; |
677 | // Since at least one newline was skipped and one token needs to be lexed, |
678 | // return a newline. |
679 | formTokenWithChars(Result&: T, TokEnd: BufferPtr, Kind: tok::newline); |
680 | return; |
681 | } |
682 | } |
683 | |
684 | const char *TokenPtr = BufferPtr; |
685 | char C = *TokenPtr; |
686 | if (isHTMLIdentifierCharacter(C)) { |
687 | TokenPtr = skipHTMLIdentifier(BufferPtr: TokenPtr, BufferEnd: CommentEnd); |
688 | StringRef Ident(BufferPtr, TokenPtr - BufferPtr); |
689 | formTokenWithChars(Result&: T, TokEnd: TokenPtr, Kind: tok::html_ident); |
690 | T.setHTMLIdent(Ident); |
691 | } else { |
692 | switch (C) { |
693 | case '=': |
694 | TokenPtr++; |
695 | formTokenWithChars(Result&: T, TokEnd: TokenPtr, Kind: tok::html_equals); |
696 | break; |
697 | case '\"': |
698 | case '\'': { |
699 | const char *OpenQuote = TokenPtr; |
700 | TokenPtr = skipHTMLQuotedString(BufferPtr: TokenPtr, BufferEnd: CommentEnd); |
701 | const char *ClosingQuote = TokenPtr; |
702 | if (TokenPtr != CommentEnd) // Skip closing quote. |
703 | TokenPtr++; |
704 | formTokenWithChars(Result&: T, TokEnd: TokenPtr, Kind: tok::html_quoted_string); |
705 | T.setHTMLQuotedString(StringRef(OpenQuote + 1, |
706 | ClosingQuote - (OpenQuote + 1))); |
707 | break; |
708 | } |
709 | case '>': |
710 | TokenPtr++; |
711 | formTokenWithChars(Result&: T, TokEnd: TokenPtr, Kind: tok::html_greater); |
712 | State = LS_Normal; |
713 | return; |
714 | case '/': |
715 | TokenPtr++; |
716 | if (TokenPtr != CommentEnd && *TokenPtr == '>') { |
717 | TokenPtr++; |
718 | formTokenWithChars(Result&: T, TokEnd: TokenPtr, Kind: tok::html_slash_greater); |
719 | } else |
720 | formTextToken(Result&: T, TokEnd: TokenPtr); |
721 | |
722 | State = LS_Normal; |
723 | return; |
724 | } |
725 | } |
726 | |
727 | // Now look ahead and return to normal state if we don't see any HTML tokens |
728 | // ahead. |
729 | BufferPtr = skipHorizontalWhitespace(BufferPtr, BufferEnd: CommentEnd); |
730 | if (BufferPtr == CommentEnd) { |
731 | return; |
732 | } |
733 | |
734 | C = *BufferPtr; |
735 | if (!isHTMLIdentifierStartingCharacter(C) && !isVerticalWhitespace(c: C) && |
736 | C != '=' && C != '\"' && C != '\'' && C != '>' && C != '/') { |
737 | State = LS_Normal; |
738 | return; |
739 | } |
740 | } |
741 | |
742 | void Lexer::setupAndLexHTMLEndTag(Token &T) { |
743 | assert(BufferPtr[0] == '<' && BufferPtr[1] == '/'); |
744 | |
745 | const char *TagNameBegin = skipWhitespace(BufferPtr: BufferPtr + 2, BufferEnd: CommentEnd); |
746 | const char *TagNameEnd = skipHTMLIdentifier(BufferPtr: TagNameBegin, BufferEnd: CommentEnd); |
747 | StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin); |
748 | if (!isHTMLTagName(Name)) { |
749 | formTextToken(Result&: T, TokEnd: TagNameEnd); |
750 | return; |
751 | } |
752 | |
753 | const char *End = skipWhitespace(BufferPtr: TagNameEnd, BufferEnd: CommentEnd); |
754 | |
755 | formTokenWithChars(Result&: T, TokEnd: End, Kind: tok::html_end_tag); |
756 | T.setHTMLTagEndName(Name); |
757 | |
758 | if (BufferPtr != CommentEnd && *BufferPtr == '>') |
759 | State = LS_HTMLEndTag; |
760 | } |
761 | |
762 | void Lexer::lexHTMLEndTag(Token &T) { |
763 | assert(BufferPtr != CommentEnd && *BufferPtr == '>'); |
764 | |
765 | formTokenWithChars(Result&: T, TokEnd: BufferPtr + 1, Kind: tok::html_greater); |
766 | State = LS_Normal; |
767 | } |
768 | |
769 | Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags, |
770 | const CommandTraits &Traits, SourceLocation FileLoc, |
771 | const char *BufferStart, const char *BufferEnd, bool ParseCommands) |
772 | : Allocator(Allocator), Diags(Diags), Traits(Traits), |
773 | BufferStart(BufferStart), BufferEnd(BufferEnd), BufferPtr(BufferStart), |
774 | FileLoc(FileLoc), ParseCommands(ParseCommands), |
775 | CommentState(LCS_BeforeComment), State(LS_Normal) {} |
776 | |
777 | void Lexer::lex(Token &T) { |
778 | again: |
779 | switch (CommentState) { |
780 | case LCS_BeforeComment: |
781 | if (BufferPtr == BufferEnd) { |
782 | formTokenWithChars(Result&: T, TokEnd: BufferPtr, Kind: tok::eof); |
783 | return; |
784 | } |
785 | |
786 | assert(*BufferPtr == '/'); |
787 | BufferPtr++; // Skip first slash. |
788 | switch(*BufferPtr) { |
789 | case '/': { // BCPL comment. |
790 | BufferPtr++; // Skip second slash. |
791 | |
792 | if (BufferPtr != BufferEnd) { |
793 | // Skip Doxygen magic marker, if it is present. |
794 | // It might be missing because of a typo //< or /*<, or because we |
795 | // merged this non-Doxygen comment into a bunch of Doxygen comments |
796 | // around it: /** ... */ /* ... */ /** ... */ |
797 | const char C = *BufferPtr; |
798 | if (C == '/' || C == '!') |
799 | BufferPtr++; |
800 | } |
801 | |
802 | // Skip less-than symbol that marks trailing comments. |
803 | // Skip it even if the comment is not a Doxygen one, because //< and /*< |
804 | // are frequent typos. |
805 | if (BufferPtr != BufferEnd && *BufferPtr == '<') |
806 | BufferPtr++; |
807 | |
808 | CommentState = LCS_InsideBCPLComment; |
809 | switch (State) { |
810 | case LS_VerbatimBlockFirstLine: |
811 | case LS_VerbatimBlockBody: |
812 | break; |
813 | case LS_HTMLStartTag: |
814 | BufferPtr = skipHorizontalWhitespace(BufferPtr, BufferEnd); |
815 | break; |
816 | default: |
817 | State = LS_Normal; |
818 | break; |
819 | } |
820 | CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd); |
821 | goto again; |
822 | } |
823 | case '*': { // C comment. |
824 | BufferPtr++; // Skip star. |
825 | |
826 | // Skip Doxygen magic marker. |
827 | const char C = *BufferPtr; |
828 | if ((C == '*' && *(BufferPtr + 1) != '/') || C == '!') |
829 | BufferPtr++; |
830 | |
831 | // Skip less-than symbol that marks trailing comments. |
832 | if (BufferPtr != BufferEnd && *BufferPtr == '<') |
833 | BufferPtr++; |
834 | |
835 | CommentState = LCS_InsideCComment; |
836 | State = LS_Normal; |
837 | CommentEnd = findCCommentEnd(BufferPtr, BufferEnd); |
838 | goto again; |
839 | } |
840 | default: |
841 | llvm_unreachable("second character of comment should be '/' or '*'"); |
842 | } |
843 | |
844 | case LCS_BetweenComments: { |
845 | // Consecutive comments are extracted only if there is only whitespace |
846 | // between them. So we can search for the start of the next comment. |
847 | const char *EndWhitespace = BufferPtr; |
848 | while(EndWhitespace != BufferEnd && *EndWhitespace != '/') |
849 | EndWhitespace++; |
850 | |
851 | // When lexing the start of an HTML tag (i.e. going through the attributes) |
852 | // there won't be any newlines generated. |
853 | if (State == LS_HTMLStartTag && EndWhitespace != BufferEnd) { |
854 | CommentState = LCS_BeforeComment; |
855 | BufferPtr = EndWhitespace; |
856 | goto again; |
857 | } |
858 | |
859 | // Turn any whitespace between comments (and there is only whitespace |
860 | // between them -- guaranteed by comment extraction) into a newline. We |
861 | // have two newlines between C comments in total (first one was synthesized |
862 | // after a comment). |
863 | formTokenWithChars(Result&: T, TokEnd: EndWhitespace, Kind: tok::newline); |
864 | |
865 | CommentState = LCS_BeforeComment; |
866 | break; |
867 | } |
868 | |
869 | case LCS_InsideBCPLComment: |
870 | case LCS_InsideCComment: |
871 | if (BufferPtr != CommentEnd) { |
872 | lexCommentText(T); |
873 | break; |
874 | } else { |
875 | // Skip C comment closing sequence. |
876 | if (CommentState == LCS_InsideCComment) { |
877 | assert(BufferPtr[0] == '*' && BufferPtr[1] == '/'); |
878 | BufferPtr += 2; |
879 | assert(BufferPtr <= BufferEnd); |
880 | |
881 | // When lexing the start of an HTML tag (i.e. going through the |
882 | // attributes) there won't be any newlines generated - whitespace still |
883 | // needs to be skipped. |
884 | if (State == LS_HTMLStartTag && BufferPtr != BufferEnd) { |
885 | CommentState = LCS_BetweenComments; |
886 | goto again; |
887 | } |
888 | |
889 | // Synthenize newline just after the C comment, regardless if there is |
890 | // actually a newline. |
891 | formTokenWithChars(Result&: T, TokEnd: BufferPtr, Kind: tok::newline); |
892 | |
893 | CommentState = LCS_BetweenComments; |
894 | break; |
895 | } else { |
896 | // Don't synthesized a newline after BCPL comment. |
897 | CommentState = LCS_BetweenComments; |
898 | goto again; |
899 | } |
900 | } |
901 | } |
902 | } |
903 | |
904 | StringRef Lexer::getSpelling(const Token &Tok, |
905 | const SourceManager &SourceMgr) const { |
906 | SourceLocation Loc = Tok.getLocation(); |
907 | std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc); |
908 | |
909 | bool InvalidTemp = false; |
910 | StringRef File = SourceMgr.getBufferData(FID: LocInfo.first, Invalid: &InvalidTemp); |
911 | if (InvalidTemp) |
912 | return StringRef(); |
913 | |
914 | const char *Begin = File.data() + LocInfo.second; |
915 | return StringRef(Begin, Tok.getLength()); |
916 | } |
917 | |
918 | } // end namespace comments |
919 | } // end namespace clang |
920 |
Definitions
- dump
- isHTMLNamedCharacterReferenceCharacter
- isHTMLDecimalCharacterReferenceCharacter
- isHTMLHexCharacterReferenceCharacter
- convertCodePointToUTF8
- resolveHTMLNamedCharacterReference
- resolveHTMLDecimalCharacterReference
- resolveHTMLHexCharacterReference
- skipLineStartingDecorations
- findNewline
- skipNewline
- skipNamedCharacterReference
- skipDecimalCharacterReference
- skipHexCharacterReference
- isHTMLIdentifierStartingCharacter
- isHTMLIdentifierCharacter
- skipHTMLIdentifier
- skipHTMLQuotedString
- skipWhitespace
- skipHorizontalWhitespace
- isWhitespace
- isCommandNameStartCharacter
- isCommandNameCharacter
- skipCommandName
- findBCPLCommentEnd
- findCCommentEnd
- formTokenWithChars
- skipTextToken
- lexCommentText
- setupAndLexVerbatimBlock
- lexVerbatimBlockFirstLine
- lexVerbatimBlockBody
- setupAndLexVerbatimLine
- lexVerbatimLineText
- lexHTMLCharacterReference
- setupAndLexHTMLStartTag
- lexHTMLStartTag
- setupAndLexHTMLEndTag
- lexHTMLEndTag
- Lexer
- lex
Update your C++ knowledge – Modern C++11/14/17 Training
Find out more