1//===--- CommentLexer.cpp -------------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#include "clang/AST/CommentLexer.h"
10#include "clang/AST/CommentCommandTraits.h"
11#include "clang/Basic/CharInfo.h"
12#include "clang/Basic/DiagnosticComment.h"
13#include "llvm/ADT/StringExtras.h"
14#include "llvm/ADT/StringSwitch.h"
15#include "llvm/Support/ConvertUTF.h"
16#include "llvm/Support/ErrorHandling.h"
17
18namespace clang {
19namespace comments {
20
21void Token::dump(const Lexer &L, const SourceManager &SM) const {
22 llvm::errs() << "comments::Token Kind=" << Kind << " ";
23 Loc.print(OS&: llvm::errs(), SM);
24 llvm::errs() << " " << Length << " \"" << L.getSpelling(Tok: *this, SourceMgr: SM) << "\"\n";
25}
26
27static inline bool isHTMLNamedCharacterReferenceCharacter(char C) {
28 return isLetter(c: C);
29}
30
31static inline bool isHTMLDecimalCharacterReferenceCharacter(char C) {
32 return isDigit(c: C);
33}
34
35static inline bool isHTMLHexCharacterReferenceCharacter(char C) {
36 return isHexDigit(c: C);
37}
38
39static inline StringRef convertCodePointToUTF8(
40 llvm::BumpPtrAllocator &Allocator,
41 unsigned CodePoint) {
42 char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
43 char *ResolvedPtr = Resolved;
44 if (llvm::ConvertCodePointToUTF8(Source: CodePoint, ResultPtr&: ResolvedPtr))
45 return StringRef(Resolved, ResolvedPtr - Resolved);
46 else
47 return StringRef();
48}
49
50namespace {
51
52#include "clang/AST/CommentHTMLTags.inc"
53#include "clang/AST/CommentHTMLNamedCharacterReferences.inc"
54
55} // end anonymous namespace
56
57StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
58 // Fast path, first check a few most widely used named character references.
59 return llvm::StringSwitch<StringRef>(Name)
60 .Case("amp", "&")
61 .Case("lt", "<")
62 .Case("gt", ">")
63 .Case("quot", "\"")
64 .Case("apos", "\'")
65 // Slow path.
66 .Default(translateHTMLNamedCharacterReferenceToUTF8(Name));
67}
68
69StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
70 unsigned CodePoint = 0;
71 for (unsigned i = 0, e = Name.size(); i != e; ++i) {
72 assert(isHTMLDecimalCharacterReferenceCharacter(Name[i]));
73 CodePoint *= 10;
74 CodePoint += Name[i] - '0';
75 }
76 return convertCodePointToUTF8(Allocator, CodePoint);
77}
78
79StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
80 unsigned CodePoint = 0;
81 for (unsigned i = 0, e = Name.size(); i != e; ++i) {
82 CodePoint *= 16;
83 const char C = Name[i];
84 assert(isHTMLHexCharacterReferenceCharacter(C));
85 CodePoint += llvm::hexDigitValue(C);
86 }
87 return convertCodePointToUTF8(Allocator, CodePoint);
88}
89
90void Lexer::skipLineStartingDecorations() {
91 // This function should be called only for C comments
92 assert(CommentState == LCS_InsideCComment);
93
94 if (BufferPtr == CommentEnd)
95 return;
96
97 const char *NewBufferPtr = BufferPtr;
98 while (isHorizontalWhitespace(c: *NewBufferPtr))
99 if (++NewBufferPtr == CommentEnd)
100 return;
101 if (*NewBufferPtr == '*')
102 BufferPtr = NewBufferPtr + 1;
103}
104
105namespace {
106/// Returns pointer to the first newline character in the string.
107const char *findNewline(const char *BufferPtr, const char *BufferEnd) {
108 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
109 if (isVerticalWhitespace(c: *BufferPtr))
110 return BufferPtr;
111 }
112 return BufferEnd;
113}
114
/// Skips one newline sequence starting at \p BufferPtr: "\n", "\r" or "\r\n".
///
/// \returns the pointer just past the newline, or \p BufferPtr unchanged when
/// the range is empty. The caller must point at '\n' or '\r' otherwise.
const char *skipNewline(const char *BufferPtr, const char *BufferEnd) {
  if (BufferPtr == BufferEnd)
    return BufferPtr;

  const char First = *BufferPtr;
  ++BufferPtr;
  if (First == '\n')
    return BufferPtr;

  assert(First == '\r');
  // A '\n' directly after '\r' belongs to the same line break.
  if (BufferPtr != BufferEnd && *BufferPtr == '\n')
    ++BufferPtr;
  return BufferPtr;
}
129
130const char *skipNamedCharacterReference(const char *BufferPtr,
131 const char *BufferEnd) {
132 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
133 if (!isHTMLNamedCharacterReferenceCharacter(C: *BufferPtr))
134 return BufferPtr;
135 }
136 return BufferEnd;
137}
138
139const char *skipDecimalCharacterReference(const char *BufferPtr,
140 const char *BufferEnd) {
141 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
142 if (!isHTMLDecimalCharacterReferenceCharacter(C: *BufferPtr))
143 return BufferPtr;
144 }
145 return BufferEnd;
146}
147
148const char *skipHexCharacterReference(const char *BufferPtr,
149 const char *BufferEnd) {
150 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
151 if (!isHTMLHexCharacterReferenceCharacter(C: *BufferPtr))
152 return BufferPtr;
153 }
154 return BufferEnd;
155}
156
157bool isHTMLIdentifierStartingCharacter(char C) {
158 return isLetter(c: C);
159}
160
161bool isHTMLIdentifierCharacter(char C) {
162 return isAlphanumeric(c: C);
163}
164
165const char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) {
166 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
167 if (!isHTMLIdentifierCharacter(C: *BufferPtr))
168 return BufferPtr;
169 }
170 return BufferEnd;
171}
172
/// Skips an HTML string quoted in single or double quotes. \p BufferPtr must
/// point at the opening quote. A quote character directly preceded by a
/// backslash does not terminate the string.
///
/// \returns a pointer to the closing quote, or \p BufferEnd if the string is
/// not terminated.
const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd)
{
  const char Quote = *BufferPtr;
  assert(Quote == '\"' || Quote == '\'');

  const char *Cursor = BufferPtr + 1;
  while (Cursor != BufferEnd) {
    const bool IsEscaped = Cursor[-1] == '\\';
    if (*Cursor == Quote && !IsEscaped)
      return Cursor;
    ++Cursor;
  }
  return BufferEnd;
}
190
191const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) {
192 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
193 if (!isWhitespace(c: *BufferPtr))
194 return BufferPtr;
195 }
196 return BufferEnd;
197}
198
199const char *skipHorizontalWhitespace(const char *BufferPtr,
200 const char *BufferEnd) {
201 for (; BufferPtr != BufferEnd; ++BufferPtr) {
202 if (!isHorizontalWhitespace(c: *BufferPtr))
203 return BufferPtr;
204 }
205 return BufferEnd;
206}
207
208bool isWhitespace(const char *BufferPtr, const char *BufferEnd) {
209 return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd;
210}
211
212bool isCommandNameStartCharacter(char C) {
213 return isLetter(c: C);
214}
215
216bool isCommandNameCharacter(char C) {
217 return isAlphanumeric(c: C);
218}
219
220const char *skipCommandName(const char *BufferPtr, const char *BufferEnd) {
221 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
222 if (!isCommandNameCharacter(C: *BufferPtr))
223 return BufferPtr;
224 }
225 return BufferEnd;
226}
227
228/// Return the one past end pointer for BCPL comments.
229/// Handles newlines escaped with backslash or trigraph for backslahs.
230const char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) {
231 const char *CurPtr = BufferPtr;
232 while (CurPtr != BufferEnd) {
233 while (!isVerticalWhitespace(c: *CurPtr)) {
234 CurPtr++;
235 if (CurPtr == BufferEnd)
236 return BufferEnd;
237 }
238 // We found a newline, check if it is escaped.
239 const char *EscapePtr = CurPtr - 1;
240 while(isHorizontalWhitespace(c: *EscapePtr))
241 EscapePtr--;
242
243 if (*EscapePtr == '\\' ||
244 (EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' &&
245 EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) {
246 // We found an escaped newline.
247 CurPtr = skipNewline(BufferPtr: CurPtr, BufferEnd);
248 } else
249 return CurPtr; // Not an escaped newline.
250 }
251 return BufferEnd;
252}
253
254/// Return the one past end pointer for C comments.
255/// Very dumb, does not handle escaped newlines or trigraphs.
256const char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) {
257 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
258 if (*BufferPtr == '*') {
259 assert(BufferPtr + 1 != BufferEnd);
260 if (*(BufferPtr + 1) == '/')
261 return BufferPtr;
262 }
263 }
264 llvm_unreachable("buffer end hit before '*/' was seen");
265}
266
267} // end anonymous namespace
268
269void Lexer::formTokenWithChars(Token &Result, const char *TokEnd,
270 tok::TokenKind Kind) {
271 const unsigned TokLen = TokEnd - BufferPtr;
272 Result.setLocation(getSourceLocation(Loc: BufferPtr));
273 Result.setKind(Kind);
274 Result.setLength(TokLen);
275#ifndef NDEBUG
276 Result.TextPtr = "<UNSET>";
277 Result.IntVal = 7;
278#endif
279 BufferPtr = TokEnd;
280}
281
282const char *Lexer::skipTextToken() {
283 const char *TokenPtr = BufferPtr;
284 assert(TokenPtr < CommentEnd);
285 StringRef TokStartSymbols = ParseCommands ? "\n\r\\@\"&<" : "\n\r";
286
287again:
288 size_t End =
289 StringRef(TokenPtr, CommentEnd - TokenPtr).find_first_of(Chars: TokStartSymbols);
290 if (End == StringRef::npos)
291 return CommentEnd;
292
293 // Doxygen doesn't recognize any commands in a one-line double quotation.
294 // If we don't find an ending quotation mark, we pretend it never began.
295 if (*(TokenPtr + End) == '\"') {
296 TokenPtr += End + 1;
297 End = StringRef(TokenPtr, CommentEnd - TokenPtr).find_first_of(Chars: "\n\r\"");
298 if (End != StringRef::npos && *(TokenPtr + End) == '\"')
299 TokenPtr += End + 1;
300 goto again;
301 }
302 return TokenPtr + End;
303}
304
/// Lexes one token from the body of the current comment into \p T.
///
/// Must be called while inside a BCPL or C comment with BufferPtr before
/// CommentEnd. When commands are not parsed, only text and newline tokens are
/// produced. Otherwise the call is forwarded to the sub-lexer for the current
/// lexer state, or, in the normal state, dispatched on the first character:
/// '\\' and '@' start commands or escape sequences, '&' starts an HTML
/// character reference, '<' may start an HTML tag, and anything else is text
/// or a newline.
305void Lexer::lexCommentText(Token &T) {
306 assert(CommentState == LCS_InsideBCPLComment ||
307 CommentState == LCS_InsideCComment);
308
309 // Handles lexing non-command text, i.e. text and newline.
310 auto HandleNonCommandToken = [&]() -> void {
311 assert(State == LS_Normal);
312
313 const char *TokenPtr = BufferPtr;
314 assert(TokenPtr < CommentEnd);
315 switch (*TokenPtr) {
316 case '\n':
317 case '\r':
318 TokenPtr = skipNewline(BufferPtr: TokenPtr, BufferEnd: CommentEnd);
319 formTokenWithChars(Result&: T, TokEnd: TokenPtr, Kind: tok::newline);
320
 // In C comments the next line may begin with a decoration such as " *".
321 if (CommentState == LCS_InsideCComment)
322 skipLineStartingDecorations();
323 return;
324
325 default:
326 return formTextToken(Result&: T, TokEnd: skipTextToken());
327 }
328 };
329
330 if (!ParseCommands)
331 return HandleNonCommandToken();
332
 // Any state other than LS_Normal has a dedicated sub-lexer.
333 switch (State) {
334 case LS_Normal:
335 break;
336 case LS_VerbatimBlockFirstLine:
337 lexVerbatimBlockFirstLine(T);
338 return;
339 case LS_VerbatimBlockBody:
340 lexVerbatimBlockBody(T);
341 return;
342 case LS_VerbatimLineText:
343 lexVerbatimLineText(T);
344 return;
345 case LS_HTMLStartTag:
346 lexHTMLStartTag(T);
347 return;
348 case LS_HTMLEndTag:
349 lexHTMLEndTag(T);
350 return;
351 }
352
353 assert(State == LS_Normal);
354 const char *TokenPtr = BufferPtr;
355 assert(TokenPtr < CommentEnd);
356 switch(*TokenPtr) {
357 case '\\':
358 case '@': {
359 // Commands that start with a backslash and commands that start with
360 // 'at' have equivalent semantics. But we keep information about the
361 // exact syntax in AST for comments.
362 tok::TokenKind CommandKind =
363 (*TokenPtr == '@') ? tok::at_command : tok::backslash_command;
364 TokenPtr++;
 // A lone marker at the very end of the comment is plain text.
365 if (TokenPtr == CommentEnd) {
366 formTextToken(Result&: T, TokEnd: TokenPtr);
367 return;
368 }
369 char C = *TokenPtr;
370 switch (C) {
371 default:
372 break;
373
374 case '\\': case '@': case '&': case '$':
375 case '#': case '<': case '>': case '%':
376 case '\"': case '.': case ':':
377 // This is one of \\ \@ \& \$ etc escape sequences.
378 TokenPtr++;
379 if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
380 // This is the \:: escape sequence.
381 TokenPtr++;
382 }
 // The token spans the whole escape sequence; its text omits the marker.
383 StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
384 formTokenWithChars(Result&: T, TokEnd: TokenPtr, Kind: tok::text);
385 T.setText(UnescapedText);
386 return;
387 }
388
389 // Don't make zero-length commands.
390 if (!isCommandNameStartCharacter(C: *TokenPtr)) {
391 formTextToken(Result&: T, TokEnd: TokenPtr);
392 return;
393 }
394
395 TokenPtr = skipCommandName(BufferPtr: TokenPtr, BufferEnd: CommentEnd);
396 unsigned Length = TokenPtr - (BufferPtr + 1);
397
398 // Hardcoded support for lexing LaTeX formula commands
399 // \f$ \f( \f) \f[ \f] \f{ \f} as a single command.
400 if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
401 C = *TokenPtr;
402 if (C == '$' || C == '(' || C == ')' || C == '[' || C == ']' ||
403 C == '{' || C == '}') {
404 TokenPtr++;
405 Length++;
406 }
407 }
408
409 StringRef CommandName(BufferPtr + 1, Length);
410
411 const CommandInfo *Info = Traits.getCommandInfoOrNULL(Name: CommandName);
412 if (!Info) {
 // Unknown name: if a typo correction exists, warn with a fix-it and
 // continue with the corrected command; otherwise emit an unknown-command
 // token and warn.
413 if ((Info = Traits.getTypoCorrectCommandInfo(Typo: CommandName))) {
414 StringRef CorrectedName = Info->Name;
415 SourceLocation Loc = getSourceLocation(Loc: BufferPtr);
416 SourceLocation EndLoc = getSourceLocation(Loc: TokenPtr);
417 SourceRange FullRange = SourceRange(Loc, EndLoc);
418 SourceRange CommandRange(Loc.getLocWithOffset(Offset: 1), EndLoc);
419 Diag(Loc, diag::warn_correct_comment_command_name)
420 << FullRange << CommandName << CorrectedName
421 << FixItHint::CreateReplacement(CommandRange, CorrectedName);
422 } else {
423 formTokenWithChars(Result&: T, TokEnd: TokenPtr, Kind: tok::unknown_command);
424 T.setUnknownCommandName(CommandName);
425 Diag(T.getLocation(), diag::warn_unknown_comment_command_name)
426 << SourceRange(T.getLocation(), T.getEndLocation());
427 return;
428 }
429 }
 // Verbatim commands switch the lexer into a dedicated state.
430 if (Info->IsVerbatimBlockCommand) {
431 setupAndLexVerbatimBlock(T, TextBegin: TokenPtr, Marker: *BufferPtr, Info);
432 return;
433 }
434 if (Info->IsVerbatimLineCommand) {
435 setupAndLexVerbatimLine(T, TextBegin: TokenPtr, Info);
436 return;
437 }
438 formTokenWithChars(Result&: T, TokEnd: TokenPtr, Kind: CommandKind);
439 T.setCommandID(Info->getID());
440 return;
441 }
442
443 case '&':
444 lexHTMLCharacterReference(T);
445 return;
446
447 case '<': {
448 TokenPtr++;
449 if (TokenPtr == CommentEnd) {
450 formTextToken(Result&: T, TokEnd: TokenPtr);
451 return;
452 }
 // "<letter" may start an HTML start tag, "</" an end tag; any other '<'
 // is plain text.
453 const char C = *TokenPtr;
454 if (isHTMLIdentifierStartingCharacter(C))
455 setupAndLexHTMLStartTag(T);
456 else if (C == '/')
457 setupAndLexHTMLEndTag(T);
458 else
459 formTextToken(Result&: T, TokEnd: TokenPtr);
460 return;
461 }
462
463 default:
464 return HandleNonCommandToken();
465 }
466}
467
468void Lexer::setupAndLexVerbatimBlock(Token &T,
469 const char *TextBegin,
470 char Marker, const CommandInfo *Info) {
471 assert(Info->IsVerbatimBlockCommand);
472
473 VerbatimBlockEndCommandName.clear();
474 VerbatimBlockEndCommandName.append(RHS: Marker == '\\' ? "\\" : "@");
475 VerbatimBlockEndCommandName.append(RHS: Info->EndCommandName);
476
477 formTokenWithChars(Result&: T, TokEnd: TextBegin, Kind: tok::verbatim_block_begin);
478 T.setVerbatimBlockID(Info->getID());
479
480 // If there is a newline following the verbatim opening command, skip the
481 // newline so that we don't create an tok::verbatim_block_line with empty
482 // text content.
483 if (BufferPtr != CommentEnd &&
484 isVerticalWhitespace(c: *BufferPtr)) {
485 BufferPtr = skipNewline(BufferPtr, BufferEnd: CommentEnd);
486 State = LS_VerbatimBlockBody;
487 return;
488 }
489
490 State = LS_VerbatimBlockFirstLine;
491}
492
493void Lexer::lexVerbatimBlockFirstLine(Token &T) {
494again:
495 assert(BufferPtr < CommentEnd);
496
497 // FIXME: It would be better to scan the text once, finding either the block
498 // end command or newline.
499 //
500 // Extract current line.
501 const char *Newline = findNewline(BufferPtr, BufferEnd: CommentEnd);
502 StringRef Line(BufferPtr, Newline - BufferPtr);
503
504 // Look for end command in current line.
505 size_t Pos = Line.find(Str: VerbatimBlockEndCommandName);
506 const char *TextEnd;
507 const char *NextLine;
508 if (Pos == StringRef::npos) {
509 // Current line is completely verbatim.
510 TextEnd = Newline;
511 NextLine = skipNewline(BufferPtr: Newline, BufferEnd: CommentEnd);
512 } else if (Pos == 0) {
513 // Current line contains just an end command.
514 const char *End = BufferPtr + VerbatimBlockEndCommandName.size();
515 StringRef Name(BufferPtr + 1, End - (BufferPtr + 1));
516 formTokenWithChars(Result&: T, TokEnd: End, Kind: tok::verbatim_block_end);
517 T.setVerbatimBlockID(Traits.getCommandInfo(Name)->getID());
518 State = LS_Normal;
519 return;
520 } else {
521 // There is some text, followed by end command. Extract text first.
522 TextEnd = BufferPtr + Pos;
523 NextLine = TextEnd;
524 // If there is only whitespace before end command, skip whitespace.
525 if (isWhitespace(BufferPtr, BufferEnd: TextEnd)) {
526 BufferPtr = TextEnd;
527 goto again;
528 }
529 }
530
531 StringRef Text(BufferPtr, TextEnd - BufferPtr);
532 formTokenWithChars(Result&: T, TokEnd: NextLine, Kind: tok::verbatim_block_line);
533 T.setVerbatimBlockText(Text);
534
535 State = LS_VerbatimBlockBody;
536}
537
538void Lexer::lexVerbatimBlockBody(Token &T) {
539 assert(State == LS_VerbatimBlockBody);
540
541 if (CommentState == LCS_InsideCComment)
542 skipLineStartingDecorations();
543
544 if (BufferPtr == CommentEnd) {
545 formTokenWithChars(Result&: T, TokEnd: BufferPtr, Kind: tok::verbatim_block_line);
546 T.setVerbatimBlockText("");
547 return;
548 }
549
550 lexVerbatimBlockFirstLine(T);
551}
552
553void Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin,
554 const CommandInfo *Info) {
555 assert(Info->IsVerbatimLineCommand);
556 formTokenWithChars(Result&: T, TokEnd: TextBegin, Kind: tok::verbatim_line_name);
557 T.setVerbatimLineID(Info->getID());
558
559 State = LS_VerbatimLineText;
560}
561
562void Lexer::lexVerbatimLineText(Token &T) {
563 assert(State == LS_VerbatimLineText);
564
565 // Extract current line.
566 const char *Newline = findNewline(BufferPtr, BufferEnd: CommentEnd);
567 StringRef Text(BufferPtr, Newline - BufferPtr);
568 formTokenWithChars(Result&: T, TokEnd: Newline, Kind: tok::verbatim_line_text);
569 T.setVerbatimLineText(Text);
570
571 State = LS_Normal;
572}
573
574void Lexer::lexHTMLCharacterReference(Token &T) {
575 const char *TokenPtr = BufferPtr;
576 assert(*TokenPtr == '&');
577 TokenPtr++;
578 if (TokenPtr == CommentEnd) {
579 formTextToken(Result&: T, TokEnd: TokenPtr);
580 return;
581 }
582 const char *NamePtr;
583 bool isNamed = false;
584 bool isDecimal = false;
585 char C = *TokenPtr;
586 if (isHTMLNamedCharacterReferenceCharacter(C)) {
587 NamePtr = TokenPtr;
588 TokenPtr = skipNamedCharacterReference(BufferPtr: TokenPtr, BufferEnd: CommentEnd);
589 isNamed = true;
590 } else if (C == '#') {
591 TokenPtr++;
592 if (TokenPtr == CommentEnd) {
593 formTextToken(Result&: T, TokEnd: TokenPtr);
594 return;
595 }
596 C = *TokenPtr;
597 if (isHTMLDecimalCharacterReferenceCharacter(C)) {
598 NamePtr = TokenPtr;
599 TokenPtr = skipDecimalCharacterReference(BufferPtr: TokenPtr, BufferEnd: CommentEnd);
600 isDecimal = true;
601 } else if (C == 'x' || C == 'X') {
602 TokenPtr++;
603 NamePtr = TokenPtr;
604 TokenPtr = skipHexCharacterReference(BufferPtr: TokenPtr, BufferEnd: CommentEnd);
605 } else {
606 formTextToken(Result&: T, TokEnd: TokenPtr);
607 return;
608 }
609 } else {
610 formTextToken(Result&: T, TokEnd: TokenPtr);
611 return;
612 }
613 if (NamePtr == TokenPtr || TokenPtr == CommentEnd ||
614 *TokenPtr != ';') {
615 formTextToken(Result&: T, TokEnd: TokenPtr);
616 return;
617 }
618 StringRef Name(NamePtr, TokenPtr - NamePtr);
619 TokenPtr++; // Skip semicolon.
620 StringRef Resolved;
621 if (isNamed)
622 Resolved = resolveHTMLNamedCharacterReference(Name);
623 else if (isDecimal)
624 Resolved = resolveHTMLDecimalCharacterReference(Name);
625 else
626 Resolved = resolveHTMLHexCharacterReference(Name);
627
628 if (Resolved.empty()) {
629 formTextToken(Result&: T, TokEnd: TokenPtr);
630 return;
631 }
632 formTokenWithChars(Result&: T, TokEnd: TokenPtr, Kind: tok::text);
633 T.setText(Resolved);
634}
635
636void Lexer::setupAndLexHTMLStartTag(Token &T) {
637 assert(BufferPtr[0] == '<' &&
638 isHTMLIdentifierStartingCharacter(BufferPtr[1]));
639 const char *TagNameEnd = skipHTMLIdentifier(BufferPtr: BufferPtr + 2, BufferEnd: CommentEnd);
640 StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1));
641 if (!isHTMLTagName(Name)) {
642 formTextToken(Result&: T, TokEnd: TagNameEnd);
643 return;
644 }
645
646 formTokenWithChars(Result&: T, TokEnd: TagNameEnd, Kind: tok::html_start_tag);
647 T.setHTMLTagStartName(Name);
648
649 BufferPtr = skipHorizontalWhitespace(BufferPtr, BufferEnd: CommentEnd);
650 if (BufferPtr == CommentEnd) { // in BCPL comments
651 State = LS_HTMLStartTag;
652 return;
653 }
654
655 const char C = *BufferPtr;
656 if (BufferPtr != CommentEnd &&
657 (C == '>' || C == '/' || isVerticalWhitespace(c: C) ||
658 isHTMLIdentifierStartingCharacter(C)))
659 State = LS_HTMLStartTag;
660}
661
662void Lexer::lexHTMLStartTag(Token &T) {
663 assert(State == LS_HTMLStartTag);
664
665 // Skip leading whitespace and comment decorations
666 while (isVerticalWhitespace(c: *BufferPtr)) {
667 BufferPtr = skipNewline(BufferPtr, BufferEnd: CommentEnd);
668
669 if (CommentState == LCS_InsideCComment)
670 skipLineStartingDecorations();
671
672 BufferPtr = skipHorizontalWhitespace(BufferPtr, BufferEnd: CommentEnd);
673 if (BufferPtr == CommentEnd) {
674 // HTML starting tags must be defined in a single comment block.
675 // It's likely a user-error where they forgot to terminate the comment.
676 State = LS_Normal;
677 // Since at least one newline was skipped and one token needs to be lexed,
678 // return a newline.
679 formTokenWithChars(Result&: T, TokEnd: BufferPtr, Kind: tok::newline);
680 return;
681 }
682 }
683
684 const char *TokenPtr = BufferPtr;
685 char C = *TokenPtr;
686 if (isHTMLIdentifierCharacter(C)) {
687 TokenPtr = skipHTMLIdentifier(BufferPtr: TokenPtr, BufferEnd: CommentEnd);
688 StringRef Ident(BufferPtr, TokenPtr - BufferPtr);
689 formTokenWithChars(Result&: T, TokEnd: TokenPtr, Kind: tok::html_ident);
690 T.setHTMLIdent(Ident);
691 } else {
692 switch (C) {
693 case '=':
694 TokenPtr++;
695 formTokenWithChars(Result&: T, TokEnd: TokenPtr, Kind: tok::html_equals);
696 break;
697 case '\"':
698 case '\'': {
699 const char *OpenQuote = TokenPtr;
700 TokenPtr = skipHTMLQuotedString(BufferPtr: TokenPtr, BufferEnd: CommentEnd);
701 const char *ClosingQuote = TokenPtr;
702 if (TokenPtr != CommentEnd) // Skip closing quote.
703 TokenPtr++;
704 formTokenWithChars(Result&: T, TokEnd: TokenPtr, Kind: tok::html_quoted_string);
705 T.setHTMLQuotedString(StringRef(OpenQuote + 1,
706 ClosingQuote - (OpenQuote + 1)));
707 break;
708 }
709 case '>':
710 TokenPtr++;
711 formTokenWithChars(Result&: T, TokEnd: TokenPtr, Kind: tok::html_greater);
712 State = LS_Normal;
713 return;
714 case '/':
715 TokenPtr++;
716 if (TokenPtr != CommentEnd && *TokenPtr == '>') {
717 TokenPtr++;
718 formTokenWithChars(Result&: T, TokEnd: TokenPtr, Kind: tok::html_slash_greater);
719 } else
720 formTextToken(Result&: T, TokEnd: TokenPtr);
721
722 State = LS_Normal;
723 return;
724 }
725 }
726
727 // Now look ahead and return to normal state if we don't see any HTML tokens
728 // ahead.
729 BufferPtr = skipHorizontalWhitespace(BufferPtr, BufferEnd: CommentEnd);
730 if (BufferPtr == CommentEnd) {
731 return;
732 }
733
734 C = *BufferPtr;
735 if (!isHTMLIdentifierStartingCharacter(C) && !isVerticalWhitespace(c: C) &&
736 C != '=' && C != '\"' && C != '\'' && C != '>' && C != '/') {
737 State = LS_Normal;
738 return;
739 }
740}
741
742void Lexer::setupAndLexHTMLEndTag(Token &T) {
743 assert(BufferPtr[0] == '<' && BufferPtr[1] == '/');
744
745 const char *TagNameBegin = skipWhitespace(BufferPtr: BufferPtr + 2, BufferEnd: CommentEnd);
746 const char *TagNameEnd = skipHTMLIdentifier(BufferPtr: TagNameBegin, BufferEnd: CommentEnd);
747 StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin);
748 if (!isHTMLTagName(Name)) {
749 formTextToken(Result&: T, TokEnd: TagNameEnd);
750 return;
751 }
752
753 const char *End = skipWhitespace(BufferPtr: TagNameEnd, BufferEnd: CommentEnd);
754
755 formTokenWithChars(Result&: T, TokEnd: End, Kind: tok::html_end_tag);
756 T.setHTMLTagEndName(Name);
757
758 if (BufferPtr != CommentEnd && *BufferPtr == '>')
759 State = LS_HTMLEndTag;
760}
761
762void Lexer::lexHTMLEndTag(Token &T) {
763 assert(BufferPtr != CommentEnd && *BufferPtr == '>');
764
765 formTokenWithChars(Result&: T, TokEnd: BufferPtr + 1, Kind: tok::html_greater);
766 State = LS_Normal;
767}
768
769Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
770 const CommandTraits &Traits, SourceLocation FileLoc,
771 const char *BufferStart, const char *BufferEnd, bool ParseCommands)
772 : Allocator(Allocator), Diags(Diags), Traits(Traits),
773 BufferStart(BufferStart), BufferEnd(BufferEnd), BufferPtr(BufferStart),
774 FileLoc(FileLoc), ParseCommands(ParseCommands),
775 CommentState(LCS_BeforeComment), State(LS_Normal) {}
776
/// Lexes the next token of the comment buffer into \p T.
///
/// Driven by CommentState: before a comment the opening "//" or "/*" (plus an
/// optional Doxygen marker and '<') is skipped and the comment end is located;
/// inside a comment the work is delegated to lexCommentText; between comments
/// the gap is turned into a newline token. An eof token is produced once the
/// buffer is exhausted. While an HTML start tag is being lexed
/// (LS_HTMLStartTag) and more input follows, the transitions between comments
/// produce no newline tokens.
777void Lexer::lex(Token &T) {
778again:
779 switch (CommentState) {
780 case LCS_BeforeComment:
781 if (BufferPtr == BufferEnd) {
782 formTokenWithChars(Result&: T, TokEnd: BufferPtr, Kind: tok::eof);
783 return;
784 }
785
786 assert(*BufferPtr == '/');
787 BufferPtr++; // Skip first slash.
788 switch(*BufferPtr) {
789 case '/': { // BCPL comment.
790 BufferPtr++; // Skip second slash.
791
792 if (BufferPtr != BufferEnd) {
793 // Skip Doxygen magic marker, if it is present.
794 // It might be missing because of a typo //< or /*<, or because we
795 // merged this non-Doxygen comment into a bunch of Doxygen comments
796 // around it: /** ... */ /* ... */ /** ... */
797 const char C = *BufferPtr;
798 if (C == '/' || C == '!')
799 BufferPtr++;
800 }
801
802 // Skip less-than symbol that marks trailing comments.
803 // Skip it even if the comment is not a Doxygen one, because //< and /*<
804 // are frequent typos.
805 if (BufferPtr != BufferEnd && *BufferPtr == '<')
806 BufferPtr++;
807
808 CommentState = LCS_InsideBCPLComment;
 // Verbatim blocks and HTML start tags carry over into the next BCPL
 // comment; every other state is reset to normal.
809 switch (State) {
810 case LS_VerbatimBlockFirstLine:
811 case LS_VerbatimBlockBody:
812 break;
813 case LS_HTMLStartTag:
814 BufferPtr = skipHorizontalWhitespace(BufferPtr, BufferEnd);
815 break;
816 default:
817 State = LS_Normal;
818 break;
819 }
820 CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
821 goto again;
822 }
823 case '*': { // C comment.
824 BufferPtr++; // Skip star.
825
826 // Skip Doxygen magic marker.
 // A '*' directly followed by '/' is the comment terminator, not a marker.
827 const char C = *BufferPtr;
828 if ((C == '*' && *(BufferPtr + 1) != '/') || C == '!')
829 BufferPtr++;
830
831 // Skip less-than symbol that marks trailing comments.
832 if (BufferPtr != BufferEnd && *BufferPtr == '<')
833 BufferPtr++;
834
835 CommentState = LCS_InsideCComment;
836 State = LS_Normal;
837 CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
838 goto again;
839 }
840 default:
841 llvm_unreachable("second character of comment should be '/' or '*'");
842 }
843
844 case LCS_BetweenComments: {
845 // Consecutive comments are extracted only if there is only whitespace
846 // between them. So we can search for the start of the next comment.
847 const char *EndWhitespace = BufferPtr;
848 while(EndWhitespace != BufferEnd && *EndWhitespace != '/')
849 EndWhitespace++;
850
851 // When lexing the start of an HTML tag (i.e. going through the attributes)
852 // there won't be any newlines generated.
853 if (State == LS_HTMLStartTag && EndWhitespace != BufferEnd) {
854 CommentState = LCS_BeforeComment;
855 BufferPtr = EndWhitespace;
856 goto again;
857 }
858
859 // Turn any whitespace between comments (and there is only whitespace
860 // between them -- guaranteed by comment extraction) into a newline. We
861 // have two newlines between C comments in total (first one was synthesized
862 // after a comment).
863 formTokenWithChars(Result&: T, TokEnd: EndWhitespace, Kind: tok::newline);
864
865 CommentState = LCS_BeforeComment;
866 break;
867 }
868
869 case LCS_InsideBCPLComment:
870 case LCS_InsideCComment:
871 if (BufferPtr != CommentEnd) {
872 lexCommentText(T);
873 break;
874 } else {
875 // Skip C comment closing sequence.
876 if (CommentState == LCS_InsideCComment) {
877 assert(BufferPtr[0] == '*' && BufferPtr[1] == '/');
878 BufferPtr += 2;
879 assert(BufferPtr <= BufferEnd);
880
881 // When lexing the start of an HTML tag (i.e. going through the
882 // attributes) there won't be any newlines generated - whitespace still
883 // needs to be skipped.
884 if (State == LS_HTMLStartTag && BufferPtr != BufferEnd) {
885 CommentState = LCS_BetweenComments;
886 goto again;
887 }
888
889 // Synthesize a newline just after the C comment, regardless of whether
890 // there is actually a newline.
891 formTokenWithChars(Result&: T, TokEnd: BufferPtr, Kind: tok::newline);
892
893 CommentState = LCS_BetweenComments;
894 break;
895 } else {
896 // Don't synthesize a newline after a BCPL comment.
897 CommentState = LCS_BetweenComments;
898 goto again;
899 }
900 }
901 }
902}
903
904StringRef Lexer::getSpelling(const Token &Tok,
905 const SourceManager &SourceMgr) const {
906 SourceLocation Loc = Tok.getLocation();
907 std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc);
908
909 bool InvalidTemp = false;
910 StringRef File = SourceMgr.getBufferData(FID: LocInfo.first, Invalid: &InvalidTemp);
911 if (InvalidTemp)
912 return StringRef();
913
914 const char *Begin = File.data() + LocInfo.second;
915 return StringRef(Begin, Tok.getLength());
916}
917
918} // end namespace comments
919} // end namespace clang
920

Provided by KDAB

Privacy Policy
Update your C++ knowledge – Modern C++11/14/17 Training
Find out more

source code of clang/lib/AST/CommentLexer.cpp