//===--- CommentLexer.cpp -------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "clang/AST/CommentLexer.h"
#include "clang/AST/CommentCommandTraits.h"
#include "clang/AST/CommentDiagnostic.h"
#include "clang/Basic/CharInfo.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Support/ConvertUTF.h"
#include "llvm/Support/ErrorHandling.h"

namespace clang {
namespace comments {

void Token::dump(const Lexer &L, const SourceManager &SM) const {
  llvm::errs() << "comments::Token Kind=" << Kind << " ";
  Loc.print(llvm::errs(), SM);
  llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n";
}

static inline bool isHTMLNamedCharacterReferenceCharacter(char C) {
  return isLetter(C);
}

static inline bool isHTMLDecimalCharacterReferenceCharacter(char C) {
  return isDigit(C);
}

static inline bool isHTMLHexCharacterReferenceCharacter(char C) {
  return isHexDigit(C);
}

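// Convert the given code point to UTF-8 using memory from Allocator.
// Returns an empty StringRef if the conversion fails.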
static inline StringRef convertCodePointToUTF8(
    llvm::BumpPtrAllocator &Allocator,
    unsigned CodePoint) {
  char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
  char *ResolvedPtr = Resolved;
  if (llvm::ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
    return StringRef(Resolved, ResolvedPtr - Resolved);
  else
    return StringRef();
}

namespace {

#include "clang/AST/CommentHTMLTags.inc"
#include "clang/AST/CommentHTMLNamedCharacterReferences.inc"

} // end anonymous namespace

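// Resolve a named character reference (e.g. "amp") to its UTF-8 expansion,
// checking a few common entities inline before falling back to the generated
// lookup table.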
StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
  // Fast path, first check a few most widely used named character references.
  return llvm::StringSwitch<StringRef>(Name)
      .Case("amp", "&")
      .Case("lt", "<")
      .Case("gt", ">")
      .Case("quot", "\"")
      .Case("apos", "\'")
      // Slow path.
      .Default(translateHTMLNamedCharacterReferenceToUTF8(Name));
}

StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
  unsigned CodePoint = 0;
  for (unsigned i = 0, e = Name.size(); i != e; ++i) {
    assert(isHTMLDecimalCharacterReferenceCharacter(Name[i]));
    CodePoint *= 10;
    CodePoint += Name[i] - '0';
  }
  return convertCodePointToUTF8(Allocator, CodePoint);
}

StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
  unsigned CodePoint = 0;
  for (unsigned i = 0, e = Name.size(); i != e; ++i) {
    CodePoint *= 16;
    const char C = Name[i];
    assert(isHTMLHexCharacterReferenceCharacter(C));
    CodePoint += llvm::hexDigitValue(C);
  }
  return convertCodePointToUTF8(Allocator, CodePoint);
}

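// Skip the decoration at the start of a line inside a C comment: horizontal
// whitespace followed by a single '*'. BufferPtr is only advanced past the
// decoration if a '*' is actually found.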
void Lexer::skipLineStartingDecorations() {
  // This function should be called only for C comments.
  assert(CommentState == LCS_InsideCComment);

  if (BufferPtr == CommentEnd)
    return;

  const char *NewBufferPtr = BufferPtr;
  while (isHorizontalWhitespace(*NewBufferPtr))
    if (++NewBufferPtr == CommentEnd)
      return;
  if (*NewBufferPtr == '*')
    BufferPtr = NewBufferPtr + 1;
}

namespace {
/// Returns pointer to the first newline character in the string.
const char *findNewline(const char *BufferPtr, const char *BufferEnd) {
  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
    if (isVerticalWhitespace(*BufferPtr))
      return BufferPtr;
  }
  return BufferEnd;
}

const char *skipNewline(const char *BufferPtr, const char *BufferEnd) {
  if (BufferPtr == BufferEnd)
    return BufferPtr;

  if (*BufferPtr == '\n')
    BufferPtr++;
  else {
    assert(*BufferPtr == '\r');
    BufferPtr++;
    if (BufferPtr != BufferEnd && *BufferPtr == '\n')
      BufferPtr++;
  }
  return BufferPtr;
}

const char *skipNamedCharacterReference(const char *BufferPtr,
                                        const char *BufferEnd) {
  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
    if (!isHTMLNamedCharacterReferenceCharacter(*BufferPtr))
      return BufferPtr;
  }
  return BufferEnd;
}

const char *skipDecimalCharacterReference(const char *BufferPtr,
                                          const char *BufferEnd) {
  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
    if (!isHTMLDecimalCharacterReferenceCharacter(*BufferPtr))
      return BufferPtr;
  }
  return BufferEnd;
}

const char *skipHexCharacterReference(const char *BufferPtr,
                                      const char *BufferEnd) {
  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
    if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr))
      return BufferPtr;
  }
  return BufferEnd;
}
bool isHTMLIdentifierStartingCharacter(char C) {
  return isLetter(C);
}

bool isHTMLIdentifierCharacter(char C) {
  return isAlphanumeric(C);
}

const char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) {
  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
    if (!isHTMLIdentifierCharacter(*BufferPtr))
      return BufferPtr;
  }
  return BufferEnd;
}

/// Skip HTML string quoted in single or double quotes.  Escaping quotes inside
/// the string is allowed.
///
/// Returns pointer to closing quote.
const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd)
{
  const char Quote = *BufferPtr;
  assert(Quote == '\"' || Quote == '\'');

  BufferPtr++;
  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
    const char C = *BufferPtr;
    if (C == Quote && BufferPtr[-1] != '\\')
      return BufferPtr;
  }
  return BufferEnd;
}

const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) {
  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
    if (!isWhitespace(*BufferPtr))
      return BufferPtr;
  }
  return BufferEnd;
}

bool isWhitespace(const char *BufferPtr, const char *BufferEnd) {
  return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd;
}

bool isCommandNameStartCharacter(char C) {
  return isLetter(C);
}

bool isCommandNameCharacter(char C) {
  return isAlphanumeric(C);
}

const char *skipCommandName(const char *BufferPtr, const char *BufferEnd) {
  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
    if (!isCommandNameCharacter(*BufferPtr))
      return BufferPtr;
  }
  return BufferEnd;
}

/// Return the one past end pointer for BCPL comments.
/// Handles newlines escaped with a backslash or with the trigraph for
/// backslash.
const char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) {
  const char *CurPtr = BufferPtr;
  while (CurPtr != BufferEnd) {
    while (!isVerticalWhitespace(*CurPtr)) {
      CurPtr++;
      if (CurPtr == BufferEnd)
        return BufferEnd;
    }
    // We found a newline, check if it is escaped.
    const char *EscapePtr = CurPtr - 1;
    while (isHorizontalWhitespace(*EscapePtr))
      EscapePtr--;

    if (*EscapePtr == '\\' ||
        (EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' &&
         EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) {
      // We found an escaped newline.
      CurPtr = skipNewline(CurPtr, BufferEnd);
    } else
      return CurPtr; // Not an escaped newline.
  }
  return BufferEnd;
}

/// Return the one past end pointer for C comments.
/// Very dumb, does not handle escaped newlines or trigraphs.
const char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) {
  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
    if (*BufferPtr == '*') {
      assert(BufferPtr + 1 != BufferEnd);
      if (*(BufferPtr + 1) == '/')
        return BufferPtr;
    }
  }
  llvm_unreachable("buffer end hit before '*/' was seen");
}

} // end anonymous namespace

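// Form a token of kind Kind covering the characters [BufferPtr, TokEnd) and
// advance BufferPtr past them.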
void Lexer::formTokenWithChars(Token &Result, const char *TokEnd,
                               tok::TokenKind Kind) {
  const unsigned TokLen = TokEnd - BufferPtr;
  Result.setLocation(getSourceLocation(BufferPtr));
  Result.setKind(Kind);
  Result.setLength(TokLen);
#ifndef NDEBUG
  Result.TextPtr = "<UNSET>";
  Result.IntVal = 7;
#endif
  BufferPtr = TokEnd;
}

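// Return a pointer to the end of the plain text that starts at BufferPtr:
// the next character that may begin a command, an HTML construct, or a
// newline. One-line double-quoted strings are skipped as a whole because
// Doxygen does not recognize commands inside them.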
const char *Lexer::skipTextToken() {
  const char *TokenPtr = BufferPtr;
  assert(TokenPtr < CommentEnd);
  StringRef TokStartSymbols = ParseCommands ? "\n\r\\@\"&<" : "\n\r";

again:
  size_t End =
      StringRef(TokenPtr, CommentEnd - TokenPtr).find_first_of(TokStartSymbols);
  if (End == StringRef::npos)
    return CommentEnd;

  // Doxygen doesn't recognize any commands in a one-line double quotation.
  // If we don't find an ending quotation mark, we pretend it never began.
  if (*(TokenPtr + End) == '\"') {
    TokenPtr += End + 1;
    End = StringRef(TokenPtr, CommentEnd - TokenPtr).find_first_of("\n\r\"");
    if (End != StringRef::npos && *(TokenPtr + End) == '\"')
      TokenPtr += End + 1;
    goto again;
  }
  return TokenPtr + End;
}

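// Lex one token inside a comment. Dispatches to the verbatim-block,
// verbatim-line, and HTML sub-lexers according to the current State; in the
// normal state it recognizes commands, escape sequences, HTML character
// references and tags, plain text, and newlines.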
void Lexer::lexCommentText(Token &T) {
  assert(CommentState == LCS_InsideBCPLComment ||
         CommentState == LCS_InsideCComment);

  // Handles lexing non-command text, i.e. text and newline.
  auto HandleNonCommandToken = [&]() -> void {
    assert(State == LS_Normal);

    const char *TokenPtr = BufferPtr;
    assert(TokenPtr < CommentEnd);
    switch (*TokenPtr) {
    case '\n':
    case '\r':
      TokenPtr = skipNewline(TokenPtr, CommentEnd);
      formTokenWithChars(T, TokenPtr, tok::newline);

      if (CommentState == LCS_InsideCComment)
        skipLineStartingDecorations();
      return;

    default:
      return formTextToken(T, skipTextToken());
    }
  };

  if (!ParseCommands)
    return HandleNonCommandToken();

  switch (State) {
  case LS_Normal:
    break;
  case LS_VerbatimBlockFirstLine:
    lexVerbatimBlockFirstLine(T);
    return;
  case LS_VerbatimBlockBody:
    lexVerbatimBlockBody(T);
    return;
  case LS_VerbatimLineText:
    lexVerbatimLineText(T);
    return;
  case LS_HTMLStartTag:
    lexHTMLStartTag(T);
    return;
  case LS_HTMLEndTag:
    lexHTMLEndTag(T);
    return;
  }

  assert(State == LS_Normal);
  const char *TokenPtr = BufferPtr;
  assert(TokenPtr < CommentEnd);
  switch(*TokenPtr) {
    case '\\':
    case '@': {
      // Commands that start with a backslash and commands that start with
      // 'at' have equivalent semantics.  But we keep information about the
      // exact syntax in AST for comments.
      tok::TokenKind CommandKind =
          (*TokenPtr == '@') ? tok::at_command : tok::backslash_command;
      TokenPtr++;
      if (TokenPtr == CommentEnd) {
        formTextToken(T, TokenPtr);
        return;
      }
      char C = *TokenPtr;
      switch (C) {
      default:
        break;

      case '\\': case '@': case '&': case '$':
      case '#':  case '<': case '>': case '%':
      case '\"': case '.': case ':':
        // This is one of \\ \@ \& \$ etc escape sequences.
        TokenPtr++;
        if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
          // This is the \:: escape sequence.
          TokenPtr++;
        }
        StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
        formTokenWithChars(T, TokenPtr, tok::text);
        T.setText(UnescapedText);
        return;
      }

      // Don't make zero-length commands.
      if (!isCommandNameStartCharacter(*TokenPtr)) {
        formTextToken(T, TokenPtr);
        return;
      }

      TokenPtr = skipCommandName(TokenPtr, CommentEnd);
      unsigned Length = TokenPtr - (BufferPtr + 1);

      // Hardcoded support for lexing LaTeX formula commands
      // \f$ \f( \f) \f[ \f] \f{ \f} as a single command.
      if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
        C = *TokenPtr;
        if (C == '$' || C == '(' || C == ')' || C == '[' || C == ']' ||
            C == '{' || C == '}') {
          TokenPtr++;
          Length++;
        }
      }

      StringRef CommandName(BufferPtr + 1, Length);

      const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName);
      if (!Info) {
        if ((Info = Traits.getTypoCorrectCommandInfo(CommandName))) {
          StringRef CorrectedName = Info->Name;
          SourceLocation Loc = getSourceLocation(BufferPtr);
          SourceLocation EndLoc = getSourceLocation(TokenPtr);
          SourceRange FullRange = SourceRange(Loc, EndLoc);
          SourceRange CommandRange(Loc.getLocWithOffset(1), EndLoc);
          Diag(Loc, diag::warn_correct_comment_command_name)
              << FullRange << CommandName << CorrectedName
              << FixItHint::CreateReplacement(CommandRange, CorrectedName);
        } else {
          formTokenWithChars(T, TokenPtr, tok::unknown_command);
          T.setUnknownCommandName(CommandName);
          Diag(T.getLocation(), diag::warn_unknown_comment_command_name)
              << SourceRange(T.getLocation(), T.getEndLocation());
          return;
        }
      }
      if (Info->IsVerbatimBlockCommand) {
        setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info);
        return;
      }
      if (Info->IsVerbatimLineCommand) {
        setupAndLexVerbatimLine(T, TokenPtr, Info);
        return;
      }
      formTokenWithChars(T, TokenPtr, CommandKind);
      T.setCommandID(Info->getID());
      return;
    }

    case '&':
      lexHTMLCharacterReference(T);
      return;

    case '<': {
      TokenPtr++;
      if (TokenPtr == CommentEnd) {
        formTextToken(T, TokenPtr);
        return;
      }
      const char C = *TokenPtr;
      if (isHTMLIdentifierStartingCharacter(C))
        setupAndLexHTMLStartTag(T);
      else if (C == '/')
        setupAndLexHTMLEndTag(T);
      else
        formTextToken(T, TokenPtr);
      return;
    }

    default:
      return HandleNonCommandToken();
  }
}

void Lexer::setupAndLexVerbatimBlock(Token &T,
                                     const char *TextBegin,
                                     char Marker, const CommandInfo *Info) {
  assert(Info->IsVerbatimBlockCommand);

  VerbatimBlockEndCommandName.clear();
  VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@");
  VerbatimBlockEndCommandName.append(Info->EndCommandName);

  formTokenWithChars(T, TextBegin, tok::verbatim_block_begin);
  T.setVerbatimBlockID(Info->getID());

  // If there is a newline following the verbatim opening command, skip the
  // newline so that we don't create a tok::verbatim_block_line with empty
  // text content.
  if (BufferPtr != CommentEnd &&
      isVerticalWhitespace(*BufferPtr)) {
    BufferPtr = skipNewline(BufferPtr, CommentEnd);
    State = LS_VerbatimBlockBody;
    return;
  }

  State = LS_VerbatimBlockFirstLine;
}

void Lexer::lexVerbatimBlockFirstLine(Token &T) {
again:
  assert(BufferPtr < CommentEnd);

  // FIXME: It would be better to scan the text once, finding either the block
  // end command or newline.
  //
  // Extract current line.
  const char *Newline = findNewline(BufferPtr, CommentEnd);
  StringRef Line(BufferPtr, Newline - BufferPtr);

  // Look for end command in current line.
  size_t Pos = Line.find(VerbatimBlockEndCommandName);
  const char *TextEnd;
  const char *NextLine;
  if (Pos == StringRef::npos) {
    // Current line is completely verbatim.
    TextEnd = Newline;
    NextLine = skipNewline(Newline, CommentEnd);
  } else if (Pos == 0) {
    // Current line contains just an end command.
    const char *End = BufferPtr + VerbatimBlockEndCommandName.size();
    StringRef Name(BufferPtr + 1, End - (BufferPtr + 1));
    formTokenWithChars(T, End, tok::verbatim_block_end);
    T.setVerbatimBlockID(Traits.getCommandInfo(Name)->getID());
    State = LS_Normal;
    return;
  } else {
    // There is some text, followed by end command.  Extract text first.
    TextEnd = BufferPtr + Pos;
    NextLine = TextEnd;
    // If there is only whitespace before end command, skip whitespace.
    if (isWhitespace(BufferPtr, TextEnd)) {
      BufferPtr = TextEnd;
      goto again;
    }
  }

  StringRef Text(BufferPtr, TextEnd - BufferPtr);
  formTokenWithChars(T, NextLine, tok::verbatim_block_line);
  T.setVerbatimBlockText(Text);

  State = LS_VerbatimBlockBody;
}

void Lexer::lexVerbatimBlockBody(Token &T) {
  assert(State == LS_VerbatimBlockBody);

  if (CommentState == LCS_InsideCComment)
    skipLineStartingDecorations();

  if (BufferPtr == CommentEnd) {
    formTokenWithChars(T, BufferPtr, tok::verbatim_block_line);
    T.setVerbatimBlockText("");
    return;
  }

  lexVerbatimBlockFirstLine(T);
}

void Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin,
                                    const CommandInfo *Info) {
  assert(Info->IsVerbatimLineCommand);
  formTokenWithChars(T, TextBegin, tok::verbatim_line_name);
  T.setVerbatimLineID(Info->getID());

  State = LS_VerbatimLineText;
}

void Lexer::lexVerbatimLineText(Token &T) {
  assert(State == LS_VerbatimLineText);

  // Extract current line.
  const char *Newline = findNewline(BufferPtr, CommentEnd);
  StringRef Text(BufferPtr, Newline - BufferPtr);
  formTokenWithChars(T, Newline, tok::verbatim_line_text);
  T.setVerbatimLineText(Text);

  State = LS_Normal;
}

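// Lex an HTML character reference: &name;, &#NNN;, or &#xHHH;. Malformed or
// unresolvable references are emitted as plain text tokens instead.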
void Lexer::lexHTMLCharacterReference(Token &T) {
  const char *TokenPtr = BufferPtr;
  assert(*TokenPtr == '&');
  TokenPtr++;
  if (TokenPtr == CommentEnd) {
    formTextToken(T, TokenPtr);
    return;
  }
  const char *NamePtr;
  bool isNamed = false;
  bool isDecimal = false;
  char C = *TokenPtr;
  if (isHTMLNamedCharacterReferenceCharacter(C)) {
    NamePtr = TokenPtr;
    TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd);
    isNamed = true;
  } else if (C == '#') {
    TokenPtr++;
    if (TokenPtr == CommentEnd) {
      formTextToken(T, TokenPtr);
      return;
    }
    C = *TokenPtr;
    if (isHTMLDecimalCharacterReferenceCharacter(C)) {
      NamePtr = TokenPtr;
      TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd);
      isDecimal = true;
    } else if (C == 'x' || C == 'X') {
      TokenPtr++;
      NamePtr = TokenPtr;
      TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd);
    } else {
      formTextToken(T, TokenPtr);
      return;
    }
  } else {
    formTextToken(T, TokenPtr);
    return;
  }
  if (NamePtr == TokenPtr || TokenPtr == CommentEnd ||
      *TokenPtr != ';') {
    formTextToken(T, TokenPtr);
    return;
  }
  StringRef Name(NamePtr, TokenPtr - NamePtr);
  TokenPtr++; // Skip semicolon.
  StringRef Resolved;
  if (isNamed)
    Resolved = resolveHTMLNamedCharacterReference(Name);
  else if (isDecimal)
    Resolved = resolveHTMLDecimalCharacterReference(Name);
  else
    Resolved = resolveHTMLHexCharacterReference(Name);

  if (Resolved.empty()) {
    formTextToken(T, TokenPtr);
    return;
  }
  formTokenWithChars(T, TokenPtr, tok::text);
  T.setText(Resolved);
}

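// Lex the '<tag' part of an HTML start tag. Unknown tag names are emitted as
// plain text; otherwise an html_start_tag token is formed and, if an
// attribute, '/' or '>' follows, the lexer switches to LS_HTMLStartTag.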
void Lexer::setupAndLexHTMLStartTag(Token &T) {
  assert(BufferPtr[0] == '<' &&
         isHTMLIdentifierStartingCharacter(BufferPtr[1]));
  const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd);
  StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1));
  if (!isHTMLTagName(Name)) {
    formTextToken(T, TagNameEnd);
    return;
  }

  formTokenWithChars(T, TagNameEnd, tok::html_start_tag);
  T.setHTMLTagStartName(Name);

  BufferPtr = skipWhitespace(BufferPtr, CommentEnd);

  const char C = *BufferPtr;
  if (BufferPtr != CommentEnd &&
      (C == '>' || C == '/' || isHTMLIdentifierStartingCharacter(C)))
    State = LS_HTMLStartTag;
}

void Lexer::lexHTMLStartTag(Token &T) {
  assert(State == LS_HTMLStartTag);

  const char *TokenPtr = BufferPtr;
  char C = *TokenPtr;
  if (isHTMLIdentifierCharacter(C)) {
    TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd);
    StringRef Ident(BufferPtr, TokenPtr - BufferPtr);
    formTokenWithChars(T, TokenPtr, tok::html_ident);
    T.setHTMLIdent(Ident);
  } else {
    switch (C) {
    case '=':
      TokenPtr++;
      formTokenWithChars(T, TokenPtr, tok::html_equals);
      break;
    case '\"':
    case '\'': {
      const char *OpenQuote = TokenPtr;
      TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd);
      const char *ClosingQuote = TokenPtr;
      if (TokenPtr != CommentEnd) // Skip closing quote.
        TokenPtr++;
      formTokenWithChars(T, TokenPtr, tok::html_quoted_string);
      T.setHTMLQuotedString(StringRef(OpenQuote + 1,
                                      ClosingQuote - (OpenQuote + 1)));
      break;
    }
    case '>':
      TokenPtr++;
      formTokenWithChars(T, TokenPtr, tok::html_greater);
      State = LS_Normal;
      return;
    case '/':
      TokenPtr++;
      if (TokenPtr != CommentEnd && *TokenPtr == '>') {
        TokenPtr++;
        formTokenWithChars(T, TokenPtr, tok::html_slash_greater);
      } else
        formTextToken(T, TokenPtr);

      State = LS_Normal;
      return;
    }
  }

  // Now look ahead and return to normal state if we don't see any HTML tokens
  // ahead.
  BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
  if (BufferPtr == CommentEnd) {
    State = LS_Normal;
    return;
  }

  C = *BufferPtr;
  if (!isHTMLIdentifierStartingCharacter(C) &&
      C != '=' && C != '\"' && C != '\'' && C != '>' && C != '/') {
    State = LS_Normal;
    return;
  }
}

void Lexer::setupAndLexHTMLEndTag(Token &T) {
  assert(BufferPtr[0] == '<' && BufferPtr[1] == '/');

  const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd);
  const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd);
  StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin);
  if (!isHTMLTagName(Name)) {
    formTextToken(T, TagNameEnd);
    return;
  }

  const char *End = skipWhitespace(TagNameEnd, CommentEnd);

  formTokenWithChars(T, End, tok::html_end_tag);
  T.setHTMLTagEndName(Name);

  if (BufferPtr != CommentEnd && *BufferPtr == '>')
    State = LS_HTMLEndTag;
}

void Lexer::lexHTMLEndTag(Token &T) {
  assert(BufferPtr != CommentEnd && *BufferPtr == '>');

  formTokenWithChars(T, BufferPtr + 1, tok::html_greater);
  State = LS_Normal;
}

Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
             const CommandTraits &Traits, SourceLocation FileLoc,
             const char *BufferStart, const char *BufferEnd, bool ParseCommands)
    : Allocator(Allocator), Diags(Diags), Traits(Traits),
      BufferStart(BufferStart), BufferEnd(BufferEnd), BufferPtr(BufferStart),
      FileLoc(FileLoc), ParseCommands(ParseCommands),
      CommentState(LCS_BeforeComment), State(LS_Normal) {}

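// Lex a single token, handling transitions into and out of comments: comment
// introducers and Doxygen magic markers are skipped here, a newline is
// synthesized after each C comment, and text inside a comment is delegated to
// lexCommentText().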
void Lexer::lex(Token &T) {
again:
  switch (CommentState) {
  case LCS_BeforeComment:
    if (BufferPtr == BufferEnd) {
      formTokenWithChars(T, BufferPtr, tok::eof);
      return;
    }

    assert(*BufferPtr == '/');
    BufferPtr++; // Skip first slash.
    switch(*BufferPtr) {
    case '/': { // BCPL comment.
      BufferPtr++; // Skip second slash.

      if (BufferPtr != BufferEnd) {
        // Skip Doxygen magic marker, if it is present.
        // It might be missing because of a typo //< or /*<, or because we
        // merged this non-Doxygen comment into a bunch of Doxygen comments
        // around it: /** ... */ /* ... */ /** ... */
        const char C = *BufferPtr;
        if (C == '/' || C == '!')
          BufferPtr++;
      }

      // Skip less-than symbol that marks trailing comments.
      // Skip it even if the comment is not a Doxygen one, because //< and /*<
      // are frequent typos.
      if (BufferPtr != BufferEnd && *BufferPtr == '<')
        BufferPtr++;

      CommentState = LCS_InsideBCPLComment;
      if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine)
        State = LS_Normal;
      CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
      goto again;
    }
    case '*': { // C comment.
      BufferPtr++; // Skip star.

      // Skip Doxygen magic marker.
      const char C = *BufferPtr;
      if ((C == '*' && *(BufferPtr + 1) != '/') || C == '!')
        BufferPtr++;

      // Skip less-than symbol that marks trailing comments.
      if (BufferPtr != BufferEnd && *BufferPtr == '<')
        BufferPtr++;

      CommentState = LCS_InsideCComment;
      State = LS_Normal;
      CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
      goto again;
    }
    default:
      llvm_unreachable("second character of comment should be '/' or '*'");
    }

  case LCS_BetweenComments: {
    // Consecutive comments are extracted only if there is only whitespace
    // between them.  So we can search for the start of the next comment.
    const char *EndWhitespace = BufferPtr;
    while (EndWhitespace != BufferEnd && *EndWhitespace != '/')
      EndWhitespace++;

    // Turn any whitespace between comments (and there is only whitespace
    // between them -- guaranteed by comment extraction) into a newline.  We
    // have two newlines between C comments in total (first one was synthesized
    // after a comment).
    formTokenWithChars(T, EndWhitespace, tok::newline);

    CommentState = LCS_BeforeComment;
    break;
  }

  case LCS_InsideBCPLComment:
  case LCS_InsideCComment:
    if (BufferPtr != CommentEnd) {
      lexCommentText(T);
      break;
    } else {
      // Skip C comment closing sequence.
      if (CommentState == LCS_InsideCComment) {
        assert(BufferPtr[0] == '*' && BufferPtr[1] == '/');
        BufferPtr += 2;
        assert(BufferPtr <= BufferEnd);

        // Synthesize a newline just after the C comment, regardless of whether
        // there is actually a newline.
        formTokenWithChars(T, BufferPtr, tok::newline);

        CommentState = LCS_BetweenComments;
        break;
      } else {
        // Don't synthesize a newline after a BCPL comment.
        CommentState = LCS_BetweenComments;
        goto again;
      }
    }
  }
}

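// Return the token text exactly as written in the source buffer.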
StringRef Lexer::getSpelling(const Token &Tok,
                             const SourceManager &SourceMgr) const {
  SourceLocation Loc = Tok.getLocation();
  std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc);

  bool InvalidTemp = false;
  StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp);
  if (InvalidTemp)
    return StringRef();

  const char *Begin = File.data() + LocInfo.second;
  return StringRef(Begin, Tok.getLength());
}

} // end namespace comments
} // end namespace clang