CommentLexer.cpp source code [clang/lib/AST/CommentLexer.cpp]

1	//===--- CommentLexer.cpp -------------------------------------------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8
9	#include "clang/AST/CommentLexer.h"
10	#include "clang/AST/CommentCommandTraits.h"
11	#include "clang/AST/CommentDiagnostic.h"
12	#include "clang/Basic/CharInfo.h"
13	#include "llvm/ADT/StringExtras.h"
14	#include "llvm/ADT/StringSwitch.h"
15	#include "llvm/Support/ConvertUTF.h"
16	#include "llvm/Support/ErrorHandling.h"
17
18	namespace clang {
19	namespace comments {
20
21	void Token::dump(const Lexer &L, const SourceManager &SM) const {
22	llvm::errs() << "comments::Token Kind=" << Kind << " ";
23	Loc.print(OS&: llvm::errs(), SM);
24	llvm::errs() << " " << Length << " \"" << L.getSpelling(Tok: *this, SourceMgr: SM) << "\"\n";
25	}
26
27	static inline bool isHTMLNamedCharacterReferenceCharacter(char C) {
28	return isLetter(c: C);
29	}
30
31	static inline bool isHTMLDecimalCharacterReferenceCharacter(char C) {
32	return isDigit(c: C);
33	}
34
35	static inline bool isHTMLHexCharacterReferenceCharacter(char C) {
36	return isHexDigit(c: C);
37	}
38
39	static inline StringRef convertCodePointToUTF8(
40	llvm::BumpPtrAllocator &Allocator,
41	unsigned CodePoint) {
42	char Resolved = Allocator.Allocate<char*>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
43	char *ResolvedPtr = Resolved;
44	if (llvm::ConvertCodePointToUTF8(Source: CodePoint, ResultPtr&: ResolvedPtr))
45	return StringRef(Resolved, ResolvedPtr - Resolved);
46	else
47	return StringRef();
48	}
49
50	namespace {
51
52	#include "clang/AST/CommentHTMLTags.inc"
53	#include "clang/AST/CommentHTMLNamedCharacterReferences.inc"
54
55	} // end anonymous namespace
56
57	StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
58	// Fast path, first check a few most widely used named character references.
59	return llvm::StringSwitch<StringRef>(Name)
60	.Case("amp", "&")
61	.Case("lt", "<")
62	.Case("gt", ">")
63	.Case("quot", "\"")
64	.Case("apos", "\'")
65	// Slow path.
66	.Default(translateHTMLNamedCharacterReferenceToUTF8(Name));
67	}
68
69	StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
70	unsigned CodePoint = `0`;
71	for (unsigned i = `0`, e = Name.size(); i != e; ++i) {
72	assert(isHTMLDecimalCharacterReferenceCharacter(Name[i]));
73	CodePoint *= `10`;
74	CodePoint += Name [i] - `'0'`;
75	}
76	return convertCodePointToUTF8(Allocator, CodePoint);
77	}
78
79	StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
80	unsigned CodePoint = `0`;
81	for (unsigned i = `0`, e = Name.size(); i != e; ++i) {
82	CodePoint *= `16`;
83	const char C = Name [i];
84	assert(isHTMLHexCharacterReferenceCharacter(C));
85	CodePoint += llvm::hexDigitValue(C);
86	}
87	return convertCodePointToUTF8(Allocator, CodePoint);
88	}
89
90	void Lexer::skipLineStartingDecorations() {
91	// This function should be called only for C comments
92	assert(CommentState == LCS_InsideCComment);
93
94	if (BufferPtr == CommentEnd)
95	return;
96
97	const char *NewBufferPtr = BufferPtr;
98	while (isHorizontalWhitespace(c: *NewBufferPtr))
99	if (++NewBufferPtr == CommentEnd)
100	return;
101	if (NewBufferPtr == `''`)
102	BufferPtr = NewBufferPtr + `1`;
103	}
104
105	namespace {
106	/// Returns pointer to the first newline character in the string.
107	const char findNewline(const* char BufferPtr, const* char *BufferEnd) {
108	for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
109	if (isVerticalWhitespace(c: *BufferPtr))
110	return BufferPtr;
111	}
112	return BufferEnd;
113	}
114
/// Skips one line terminator at \p BufferPtr: "\n", "\r" or "\r\n".
///
/// \param BufferPtr Position of the terminator; must point at '\n' or '\r'
///        unless it equals \p BufferEnd.
/// \param BufferEnd One-past-the-end of the readable range.
/// \returns pointer just past the terminator, or \p BufferPtr unchanged when
/// it already equals \p BufferEnd.
const char *skipNewline(const char *BufferPtr, const char *BufferEnd) {
  if (BufferPtr == BufferEnd)
    return BufferPtr;

  if (*BufferPtr == '\n')
    BufferPtr++;
  else {
    assert(*BufferPtr == '\r');
    BufferPtr++;
    // Treat "\r\n" as a single terminator.
    if (BufferPtr != BufferEnd && *BufferPtr == '\n')
      BufferPtr++;
  }
  return BufferPtr;
}
129
130	const char skipNamedCharacterReference(const* char *BufferPtr,
131	const char *BufferEnd) {
132	for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
133	if (!isHTMLNamedCharacterReferenceCharacter(C: *BufferPtr))
134	return BufferPtr;
135	}
136	return BufferEnd;
137	}
138
139	const char skipDecimalCharacterReference(const* char *BufferPtr,
140	const char *BufferEnd) {
141	for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
142	if (!isHTMLDecimalCharacterReferenceCharacter(C: *BufferPtr))
143	return BufferPtr;
144	}
145	return BufferEnd;
146	}
147
148	const char skipHexCharacterReference(const* char *BufferPtr,
149	const char *BufferEnd) {
150	for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
151	if (!isHTMLHexCharacterReferenceCharacter(C: *BufferPtr))
152	return BufferPtr;
153	}
154	return BufferEnd;
155	}
156
157	bool isHTMLIdentifierStartingCharacter(char C) {
158	return isLetter(c: C);
159	}
160
161	bool isHTMLIdentifierCharacter(char C) {
162	return isAlphanumeric(c: C);
163	}
164
165	const char skipHTMLIdentifier(const* char BufferPtr, const* char *BufferEnd) {
166	for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
167	if (!isHTMLIdentifierCharacter(C: *BufferPtr))
168	return BufferPtr;
169	}
170	return BufferEnd;
171	}
172
/// Skip HTML string quoted in single or double quotes.  A quote character
/// immediately preceded by a backslash does not terminate the string.
///
/// \param BufferPtr Must point at the opening quote (' or ").
/// \param BufferEnd One-past-the-end of the readable range.
/// \returns pointer to the closing quote, or \p BufferEnd if the string is
/// not terminated.
const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd)
{
  const char Quote = *BufferPtr;
  assert(Quote == '\"' || Quote == '\'');

  BufferPtr++;
  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
    const char C = *BufferPtr;
    // BufferPtr[-1] is always readable here: at worst it is the opening quote.
    if (C == Quote && BufferPtr[-1] != '\\')
      return BufferPtr;
  }
  return BufferEnd;
}
190
191	const char skipWhitespace(const* char BufferPtr, const* char *BufferEnd) {
192	for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
193	if (!isWhitespace(c: *BufferPtr))
194	return BufferPtr;
195	}
196	return BufferEnd;
197	}
198
199	bool isWhitespace(const char BufferPtr, const* char *BufferEnd) {
200	return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd;
201	}
202
203	bool isCommandNameStartCharacter(char C) {
204	return isLetter(c: C);
205	}
206
207	bool isCommandNameCharacter(char C) {
208	return isAlphanumeric(c: C);
209	}
210
211	const char skipCommandName(const* char BufferPtr, const* char *BufferEnd) {
212	for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
213	if (!isCommandNameCharacter(C: *BufferPtr))
214	return BufferPtr;
215	}
216	return BufferEnd;
217	}
218
219	/// Return the one past end pointer for BCPL comments.
220	/// Handles newlines escaped with backslash or trigraph for backslahs.
221	const char findBCPLCommentEnd(const* char BufferPtr, const* char *BufferEnd) {
222	const char *CurPtr = BufferPtr;
223	while (CurPtr != BufferEnd) {
224	while (!isVerticalWhitespace(c: *CurPtr)) {
225	CurPtr++;
226	if (CurPtr == BufferEnd)
227	return BufferEnd;
228	}
229	// We found a newline, check if it is escaped.
230	const char *EscapePtr = CurPtr - `1`;
231	while(isHorizontalWhitespace(c: *EscapePtr))
232	EscapePtr--;
233
234	if (*EscapePtr == `'\\'` \|\|
235	(EscapePtr - `2` >= BufferPtr && EscapePtr[`0`] == `'/'` &&
236	EscapePtr[-`1`] == `'?'` && EscapePtr[-`2`] == `'?'`)) {
237	// We found an escaped newline.
238	CurPtr = skipNewline(BufferPtr: CurPtr, BufferEnd);
239	} else
240	return CurPtr; // Not an escaped newline.
241	}
242	return BufferEnd;
243	}
244
245	/// Return the one past end pointer for C comments.
246	/// Very dumb, does not handle escaped newlines or trigraphs.
247	const char findCCommentEnd(const* char BufferPtr, const* char *BufferEnd) {
248	for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
249	if (BufferPtr == `''`) {
250	assert(BufferPtr + `1` != BufferEnd);
251	if (*(BufferPtr + `1`) == `'/'`)
252	return BufferPtr;
253	}
254	}
255	llvm_unreachable("buffer end hit before '*/' was seen");
256	}
257
258	} // end anonymous namespace
259
260	void Lexer::formTokenWithChars(Token &Result, const char *TokEnd,
261	tok::TokenKind Kind) {
262	const unsigned TokLen = TokEnd - BufferPtr;
263	Result.setLocation(getSourceLocation(Loc: BufferPtr));
264	Result.setKind(Kind);
265	Result.setLength(TokLen);
266	#ifndef NDEBUG
267	Result.TextPtr = "<UNSET>";
268	Result.IntVal = `7`;
269	#endif
270	BufferPtr = TokEnd;
271	}
272
273	const char *Lexer::skipTextToken() {
274	const char *TokenPtr = BufferPtr;
275	assert(TokenPtr < CommentEnd);
276	StringRef TokStartSymbols = ParseCommands ? "\n\r\\@\"&<" : "\n\r";
277
278	again:
279	size_t End =
280	StringRef(TokenPtr, CommentEnd - TokenPtr).find_first_of(Chars: TokStartSymbols);
281	if (End == StringRef::npos)
282	return CommentEnd;
283
284	// Doxygen doesn't recognize any commands in a one-line double quotation.
285	// If we don't find an ending quotation mark, we pretend it never began.
286	if (*(TokenPtr + End) == `'\"'`) {
287	TokenPtr += End + `1`;
288	End = StringRef(TokenPtr, CommentEnd - TokenPtr).find_first_of(Chars: "\n\r\"");
289	if (End != StringRef::npos && *(TokenPtr + End) == `'\"'`)
290	TokenPtr += End + `1`;
291	goto again;
292	}
293	return TokenPtr + End;
294	}
295
296	void Lexer::lexCommentText(Token &T) {
297	assert(CommentState == LCS_InsideBCPLComment \|\|
298	CommentState == LCS_InsideCComment);
299
300	// Handles lexing non-command text, i.e. text and newline.
301	auto HandleNonCommandToken = [&]() -> void {
302	assert(State == LS_Normal);
303
304	const char *TokenPtr = BufferPtr;
305	assert(TokenPtr < CommentEnd);
306	switch (*TokenPtr) {
307	case `'\n'`:
308	case `'\r'`:
309	TokenPtr = skipNewline(BufferPtr: TokenPtr, BufferEnd: CommentEnd);
310	formTokenWithChars(Result&: T, TokEnd: TokenPtr, Kind: tok::newline);
311
312	if (CommentState == LCS_InsideCComment)
313	skipLineStartingDecorations();
314	return;
315
316	default:
317	return formTextToken(Result&: T, TokEnd: skipTextToken());
318	}
319	};
320
321	if (!ParseCommands)
322	return HandleNonCommandToken ();
323
324	switch (State) {
325	case LS_Normal:
326	break;
327	case LS_VerbatimBlockFirstLine:
328	lexVerbatimBlockFirstLine(T);
329	return;
330	case LS_VerbatimBlockBody:
331	lexVerbatimBlockBody(T);
332	return;
333	case LS_VerbatimLineText:
334	lexVerbatimLineText(T);
335	return;
336	case LS_HTMLStartTag:
337	lexHTMLStartTag(T);
338	return;
339	case LS_HTMLEndTag:
340	lexHTMLEndTag(T);
341	return;
342	}
343
344	assert(State == LS_Normal);
345	const char *TokenPtr = BufferPtr;
346	assert(TokenPtr < CommentEnd);
347	switch(*TokenPtr) {
348	case `'\\'`:
349	case `'@'`: {
350	// Commands that start with a backslash and commands that start with
351	// 'at' have equivalent semantics. But we keep information about the
352	// exact syntax in AST for comments.
353	tok::TokenKind CommandKind =
354	(*TokenPtr == `'@'`) ? tok::at_command : tok::backslash_command;
355	TokenPtr++;
356	if (TokenPtr == CommentEnd) {
357	formTextToken(Result&: T, TokEnd: TokenPtr);
358	return;
359	}
360	char C = *TokenPtr;
361	switch (C) {
362	default:
363	break;
364
365	case `'\\'`: case `'@'`: case `'&'`: case `'$'`:
366	case `'#'`: case `'<'`: case `'>'`: case `'%'`:
367	case `'\"'`: case `'.'`: case `':'`:
368	// This is one of \\ \@ \& \$ etc escape sequences.
369	TokenPtr++;
370	if (C == `':'` && TokenPtr != CommentEnd && *TokenPtr == `':'`) {
371	// This is the \:: escape sequence.
372	TokenPtr++;
373	}
374	StringRef UnescapedText(BufferPtr + `1`, TokenPtr - (BufferPtr + `1`));
375	formTokenWithChars(Result&: T, TokEnd: TokenPtr, Kind: tok::text);
376	T.setText(UnescapedText);
377	return;
378	}
379
380	// Don't make zero-length commands.
381	if (!isCommandNameStartCharacter(C: *TokenPtr)) {
382	formTextToken(Result&: T, TokEnd: TokenPtr);
383	return;
384	}
385
386	TokenPtr = skipCommandName(BufferPtr: TokenPtr, BufferEnd: CommentEnd);
387	unsigned Length = TokenPtr - (BufferPtr + `1`);
388
389	// Hardcoded support for lexing LaTeX formula commands
390	// \f$ \f( \f) \f[ \f] \f{ \f} as a single command.
391	if (Length == `1` && TokenPtr[-`1`] == `'f'` && TokenPtr != CommentEnd) {
392	C = *TokenPtr;
393	if (C == `'$'` \|\| C == `'('` \|\| C == `')'` \|\| C == `'['` \|\| C == `']'` \|\|
394	C == `'{'` \|\| C == `'}'`) {
395	TokenPtr++;
396	Length++;
397	}
398	}
399
400	StringRef CommandName(BufferPtr + `1`, Length);
401
402	const CommandInfo *Info = Traits.getCommandInfoOrNULL(Name: CommandName);
403	if (!Info) {
404	if ((Info = Traits.getTypoCorrectCommandInfo(Typo: CommandName))) {
405	StringRef CorrectedName = Info->Name;
406	SourceLocation Loc = getSourceLocation(Loc: BufferPtr);
407	SourceLocation EndLoc = getSourceLocation(Loc: TokenPtr);
408	SourceRange FullRange = SourceRange (Loc, EndLoc);
409	SourceRange CommandRange(Loc.getLocWithOffset(Offset: `1`), EndLoc);
410	Diag(Loc, diag::warn_correct_comment_command_name)
411	<< FullRange << CommandName << CorrectedName
412	<< FixItHint::CreateReplacement(CommandRange, CorrectedName);
413	} else {
414	formTokenWithChars(Result&: T, TokEnd: TokenPtr, Kind: tok::unknown_command);
415	T.setUnknownCommandName(CommandName);
416	Diag(T.getLocation(), diag::warn_unknown_comment_command_name)
417	<< SourceRange(T.getLocation(), T.getEndLocation());
418	return;
419	}
420	}
421	if (Info->IsVerbatimBlockCommand) {
422	setupAndLexVerbatimBlock(T, TextBegin: TokenPtr, Marker: *BufferPtr, Info);
423	return;
424	}
425	if (Info->IsVerbatimLineCommand) {
426	setupAndLexVerbatimLine(T, TextBegin: TokenPtr, Info);
427	return;
428	}
429	formTokenWithChars(Result&: T, TokEnd: TokenPtr, Kind: CommandKind);
430	T.setCommandID(Info->getID());
431	return;
432	}
433
434	case `'&'`:
435	lexHTMLCharacterReference(T);
436	return;
437
438	case `'<'`: {
439	TokenPtr++;
440	if (TokenPtr == CommentEnd) {
441	formTextToken(Result&: T, TokEnd: TokenPtr);
442	return;
443	}
444	const char C = *TokenPtr;
445	if (isHTMLIdentifierStartingCharacter(C))
446	setupAndLexHTMLStartTag(T);
447	else if (C == `'/'`)
448	setupAndLexHTMLEndTag(T);
449	else
450	formTextToken(Result&: T, TokEnd: TokenPtr);
451	return;
452	}
453
454	default:
455	return HandleNonCommandToken ();
456	}
457	}
458
459	void Lexer::setupAndLexVerbatimBlock(Token &T,
460	const char *TextBegin,
461	char Marker, const CommandInfo *Info) {
462	assert(Info->IsVerbatimBlockCommand);
463
464	VerbatimBlockEndCommandName.clear();
465	VerbatimBlockEndCommandName.append(RHS: Marker == `'\\'` ? "\\" : "@");
466	VerbatimBlockEndCommandName.append(RHS: Info->EndCommandName);
467
468	formTokenWithChars(Result&: T, TokEnd: TextBegin, Kind: tok::verbatim_block_begin);
469	T.setVerbatimBlockID(Info->getID());
470
471	// If there is a newline following the verbatim opening command, skip the
472	// newline so that we don't create an tok::verbatim_block_line with empty
473	// text content.
474	if (BufferPtr != CommentEnd &&
475	isVerticalWhitespace(c: *BufferPtr)) {
476	BufferPtr = skipNewline(BufferPtr, BufferEnd: CommentEnd);
477	State = LS_VerbatimBlockBody;
478	return;
479	}
480
481	State = LS_VerbatimBlockFirstLine;
482	}
483
484	void Lexer::lexVerbatimBlockFirstLine(Token &T) {
485	again:
486	assert(BufferPtr < CommentEnd);
487
488	// FIXME: It would be better to scan the text once, finding either the block
489	// end command or newline.
490	//
491	// Extract current line.
492	const char *Newline = findNewline(BufferPtr, BufferEnd: CommentEnd);
493	StringRef Line(BufferPtr, Newline - BufferPtr);
494
495	// Look for end command in current line.
496	size_t Pos = Line.find(Str: VerbatimBlockEndCommandName);
497	const char *TextEnd;
498	const char *NextLine;
499	if (Pos == StringRef::npos) {
500	// Current line is completely verbatim.
501	TextEnd = Newline;
502	NextLine = skipNewline(BufferPtr: Newline, BufferEnd: CommentEnd);
503	} else if (Pos == `0`) {
504	// Current line contains just an end command.
505	const char *End = BufferPtr + VerbatimBlockEndCommandName.size();
506	StringRef Name(BufferPtr + `1`, End - (BufferPtr + `1`));
507	formTokenWithChars(Result&: T, TokEnd: End, Kind: tok::verbatim_block_end);
508	T.setVerbatimBlockID(Traits.getCommandInfo(Name)->getID());
509	State = LS_Normal;
510	return;
511	} else {
512	// There is some text, followed by end command. Extract text first.
513	TextEnd = BufferPtr + Pos;
514	NextLine = TextEnd;
515	// If there is only whitespace before end command, skip whitespace.
516	if (isWhitespace(BufferPtr, BufferEnd: TextEnd)) {
517	BufferPtr = TextEnd;
518	goto again;
519	}
520	}
521
522	StringRef Text(BufferPtr, TextEnd - BufferPtr);
523	formTokenWithChars(Result&: T, TokEnd: NextLine, Kind: tok::verbatim_block_line);
524	T.setVerbatimBlockText(Text);
525
526	State = LS_VerbatimBlockBody;
527	}
528
529	void Lexer::lexVerbatimBlockBody(Token &T) {
530	assert(State == LS_VerbatimBlockBody);
531
532	if (CommentState == LCS_InsideCComment)
533	skipLineStartingDecorations();
534
535	if (BufferPtr == CommentEnd) {
536	formTokenWithChars(Result&: T, TokEnd: BufferPtr, Kind: tok::verbatim_block_line);
537	T.setVerbatimBlockText("");
538	return;
539	}
540
541	lexVerbatimBlockFirstLine(T);
542	}
543
544	void Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin,
545	const CommandInfo *Info) {
546	assert(Info->IsVerbatimLineCommand);
547	formTokenWithChars(Result&: T, TokEnd: TextBegin, Kind: tok::verbatim_line_name);
548	T.setVerbatimLineID(Info->getID());
549
550	State = LS_VerbatimLineText;
551	}
552
553	void Lexer::lexVerbatimLineText(Token &T) {
554	assert(State == LS_VerbatimLineText);
555
556	// Extract current line.
557	const char *Newline = findNewline(BufferPtr, BufferEnd: CommentEnd);
558	StringRef Text(BufferPtr, Newline - BufferPtr);
559	formTokenWithChars(Result&: T, TokEnd: Newline, Kind: tok::verbatim_line_text);
560	T.setVerbatimLineText(Text);
561
562	State = LS_Normal;
563	}
564
565	void Lexer::lexHTMLCharacterReference(Token &T) {
566	const char *TokenPtr = BufferPtr;
567	assert(*TokenPtr == `'&'`);
568	TokenPtr++;
569	if (TokenPtr == CommentEnd) {
570	formTextToken(Result&: T, TokEnd: TokenPtr);
571	return;
572	}
573	const char *NamePtr;
574	bool isNamed = false;
575	bool isDecimal = false;
576	char C = *TokenPtr;
577	if (isHTMLNamedCharacterReferenceCharacter(C)) {
578	NamePtr = TokenPtr;
579	TokenPtr = skipNamedCharacterReference(BufferPtr: TokenPtr, BufferEnd: CommentEnd);
580	isNamed = true;
581	} else if (C == `'#'`) {
582	TokenPtr++;
583	if (TokenPtr == CommentEnd) {
584	formTextToken(Result&: T, TokEnd: TokenPtr);
585	return;
586	}
587	C = *TokenPtr;
588	if (isHTMLDecimalCharacterReferenceCharacter(C)) {
589	NamePtr = TokenPtr;
590	TokenPtr = skipDecimalCharacterReference(BufferPtr: TokenPtr, BufferEnd: CommentEnd);
591	isDecimal = true;
592	} else if (C == `'x'` \|\| C == `'X'`) {
593	TokenPtr++;
594	NamePtr = TokenPtr;
595	TokenPtr = skipHexCharacterReference(BufferPtr: TokenPtr, BufferEnd: CommentEnd);
596	} else {
597	formTextToken(Result&: T, TokEnd: TokenPtr);
598	return;
599	}
600	} else {
601	formTextToken(Result&: T, TokEnd: TokenPtr);
602	return;
603	}
604	if (NamePtr == TokenPtr \|\| TokenPtr == CommentEnd \|\|
605	*TokenPtr != `';'`) {
606	formTextToken(Result&: T, TokEnd: TokenPtr);
607	return;
608	}
609	StringRef Name(NamePtr, TokenPtr - NamePtr);
610	TokenPtr++; // Skip semicolon.
611	StringRef Resolved;
612	if (isNamed)
613	Resolved = resolveHTMLNamedCharacterReference(Name);
614	else if (isDecimal)
615	Resolved = resolveHTMLDecimalCharacterReference(Name);
616	else
617	Resolved = resolveHTMLHexCharacterReference(Name);
618
619	if (Resolved.empty()) {
620	formTextToken(Result&: T, TokEnd: TokenPtr);
621	return;
622	}
623	formTokenWithChars(Result&: T, TokEnd: TokenPtr, Kind: tok::text);
624	T.setText(Resolved);
625	}
626
627	void Lexer::setupAndLexHTMLStartTag(Token &T) {
628	assert(BufferPtr[`0`] == `'<'` &&
629	isHTMLIdentifierStartingCharacter(BufferPtr[`1`]));
630	const char *TagNameEnd = skipHTMLIdentifier(BufferPtr: BufferPtr + `2`, BufferEnd: CommentEnd);
631	StringRef Name(BufferPtr + `1`, TagNameEnd - (BufferPtr + `1`));
632	if (!isHTMLTagName(Name)) {
633	formTextToken(Result&: T, TokEnd: TagNameEnd);
634	return;
635	}
636
637	formTokenWithChars(Result&: T, TokEnd: TagNameEnd, Kind: tok::html_start_tag);
638	T.setHTMLTagStartName(Name);
639
640	BufferPtr = skipWhitespace(BufferPtr, BufferEnd: CommentEnd);
641
642	const char C = *BufferPtr;
643	if (BufferPtr != CommentEnd &&
644	(C == `'>'` \|\| C == `'/'` \|\| isHTMLIdentifierStartingCharacter(C)))
645	State = LS_HTMLStartTag;
646	}
647
648	void Lexer::lexHTMLStartTag(Token &T) {
649	assert(State == LS_HTMLStartTag);
650
651	const char *TokenPtr = BufferPtr;
652	char C = *TokenPtr;
653	if (isHTMLIdentifierCharacter(C)) {
654	TokenPtr = skipHTMLIdentifier(BufferPtr: TokenPtr, BufferEnd: CommentEnd);
655	StringRef Ident(BufferPtr, TokenPtr - BufferPtr);
656	formTokenWithChars(Result&: T, TokEnd: TokenPtr, Kind: tok::html_ident);
657	T.setHTMLIdent(Ident);
658	} else {
659	switch (C) {
660	case `'='`:
661	TokenPtr++;
662	formTokenWithChars(Result&: T, TokEnd: TokenPtr, Kind: tok::html_equals);
663	break;
664	case `'\"'`:
665	case `'\''`: {
666	const char *OpenQuote = TokenPtr;
667	TokenPtr = skipHTMLQuotedString(BufferPtr: TokenPtr, BufferEnd: CommentEnd);
668	const char *ClosingQuote = TokenPtr;
669	if (TokenPtr != CommentEnd) // Skip closing quote.
670	TokenPtr++;
671	formTokenWithChars(Result&: T, TokEnd: TokenPtr, Kind: tok::html_quoted_string);
672	T.setHTMLQuotedString(StringRef(OpenQuote + `1`,
673	ClosingQuote - (OpenQuote + `1`)));
674	break;
675	}
676	case `'>'`:
677	TokenPtr++;
678	formTokenWithChars(Result&: T, TokEnd: TokenPtr, Kind: tok::html_greater);
679	State = LS_Normal;
680	return;
681	case `'/'`:
682	TokenPtr++;
683	if (TokenPtr != CommentEnd && *TokenPtr == `'>'`) {
684	TokenPtr++;
685	formTokenWithChars(Result&: T, TokEnd: TokenPtr, Kind: tok::html_slash_greater);
686	} else
687	formTextToken(Result&: T, TokEnd: TokenPtr);
688
689	State = LS_Normal;
690	return;
691	}
692	}
693
694	// Now look ahead and return to normal state if we don't see any HTML tokens
695	// ahead.
696	BufferPtr = skipWhitespace(BufferPtr, BufferEnd: CommentEnd);
697	if (BufferPtr == CommentEnd) {
698	State = LS_Normal;
699	return;
700	}
701
702	C = *BufferPtr;
703	if (!isHTMLIdentifierStartingCharacter(C) &&
704	C != `'='` && C != `'\"'` && C != `'\''` && C != `'>'` && C != `'/'`) {
705	State = LS_Normal;
706	return;
707	}
708	}
709
710	void Lexer::setupAndLexHTMLEndTag(Token &T) {
711	assert(BufferPtr[`0`] == `'<'` && BufferPtr[`1`] == `'/'`);
712
713	const char *TagNameBegin = skipWhitespace(BufferPtr: BufferPtr + `2`, BufferEnd: CommentEnd);
714	const char *TagNameEnd = skipHTMLIdentifier(BufferPtr: TagNameBegin, BufferEnd: CommentEnd);
715	StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin);
716	if (!isHTMLTagName(Name)) {
717	formTextToken(Result&: T, TokEnd: TagNameEnd);
718	return;
719	}
720
721	const char *End = skipWhitespace(BufferPtr: TagNameEnd, BufferEnd: CommentEnd);
722
723	formTokenWithChars(Result&: T, TokEnd: End, Kind: tok::html_end_tag);
724	T.setHTMLTagEndName(Name);
725
726	if (BufferPtr != CommentEnd && *BufferPtr == `'>'`)
727	State = LS_HTMLEndTag;
728	}
729
730	void Lexer::lexHTMLEndTag(Token &T) {
731	assert(BufferPtr != CommentEnd && *BufferPtr == `'>'`);
732
733	formTokenWithChars(Result&: T, TokEnd: BufferPtr + `1`, Kind: tok::html_greater);
734	State = LS_Normal;
735	}
736
737	Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
738	const CommandTraits &Traits, SourceLocation FileLoc,
739	const char BufferStart, const* char BufferEnd, bool* ParseCommands)
740	: Allocator(Allocator), Diags(Diags), Traits(Traits),
741	BufferStart(BufferStart), BufferEnd(BufferEnd), BufferPtr(BufferStart),
742	FileLoc (FileLoc), ParseCommands(ParseCommands),
743	CommentState(LCS_BeforeComment), State(LS_Normal) {}
744
745	void Lexer::lex(Token &T) {
746	again:
747	switch (CommentState) {
748	case LCS_BeforeComment:
749	if (BufferPtr == BufferEnd) {
750	formTokenWithChars(Result&: T, TokEnd: BufferPtr, Kind: tok::eof);
751	return;
752	}
753
754	assert(*BufferPtr == `'/'`);
755	BufferPtr++; // Skip first slash.
756	switch(*BufferPtr) {
757	case `'/'`: { // BCPL comment.
758	BufferPtr++; // Skip second slash.
759
760	if (BufferPtr != BufferEnd) {
761	// Skip Doxygen magic marker, if it is present.
762	// It might be missing because of a typo //< or /<, or because we*
763	// merged this non-Doxygen comment into a bunch of Doxygen comments
764	// around it: /* ... / / ... / /* ... /
765	const char C = *BufferPtr;
766	if (C == `'/'` \|\| C == `'!'`)
767	BufferPtr++;
768	}
769
770	// Skip less-than symbol that marks trailing comments.
771	// Skip it even if the comment is not a Doxygen one, because //< and /<*
772	// are frequent typos.
773	if (BufferPtr != BufferEnd && *BufferPtr == `'<'`)
774	BufferPtr++;
775
776	CommentState = LCS_InsideBCPLComment;
777	if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine)
778	State = LS_Normal;
779	CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
780	goto again;
781	}
782	case `''`: { // C comment.*
783	BufferPtr++; // Skip star.
784
785	// Skip Doxygen magic marker.
786	const char C = *BufferPtr;
787	if ((C == `''` && (BufferPtr + `1`) != `'/'`) \|\| C == `'!'`)
788	BufferPtr++;
789
790	// Skip less-than symbol that marks trailing comments.
791	if (BufferPtr != BufferEnd && *BufferPtr == `'<'`)
792	BufferPtr++;
793
794	CommentState = LCS_InsideCComment;
795	State = LS_Normal;
796	CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
797	goto again;
798	}
799	default:
800	llvm_unreachable("second character of comment should be '/' or '*'");
801	}
802
803	case LCS_BetweenComments: {
804	// Consecutive comments are extracted only if there is only whitespace
805	// between them. So we can search for the start of the next comment.
806	const char *EndWhitespace = BufferPtr;
807	while(EndWhitespace != BufferEnd && *EndWhitespace != `'/'`)
808	EndWhitespace++;
809
810	// Turn any whitespace between comments (and there is only whitespace
811	// between them -- guaranteed by comment extraction) into a newline. We
812	// have two newlines between C comments in total (first one was synthesized
813	// after a comment).
814	formTokenWithChars(Result&: T, TokEnd: EndWhitespace, Kind: tok::newline);
815
816	CommentState = LCS_BeforeComment;
817	break;
818	}
819
820	case LCS_InsideBCPLComment:
821	case LCS_InsideCComment:
822	if (BufferPtr != CommentEnd) {
823	lexCommentText(T);
824	break;
825	} else {
826	// Skip C comment closing sequence.
827	if (CommentState == LCS_InsideCComment) {
828	assert(BufferPtr[`0`] == `'*'` && BufferPtr[`1`] == `'/'`);
829	BufferPtr += `2`;
830	assert(BufferPtr <= BufferEnd);
831
832	// Synthenize newline just after the C comment, regardless if there is
833	// actually a newline.
834	formTokenWithChars(Result&: T, TokEnd: BufferPtr, Kind: tok::newline);
835
836	CommentState = LCS_BetweenComments;
837	break;
838	} else {
839	// Don't synthesized a newline after BCPL comment.
840	CommentState = LCS_BetweenComments;
841	goto again;
842	}
843	}
844	}
845	}
846
847	StringRef Lexer::getSpelling(const Token &Tok,
848	const SourceManager &SourceMgr) const {
849	SourceLocation Loc = Tok.getLocation();
850	std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc);
851
852	bool InvalidTemp = false;
853	StringRef File = SourceMgr.getBufferData(FID: LocInfo.first, Invalid: &InvalidTemp);
854	if (InvalidTemp)
855	return StringRef();
856
857	const char *Begin = File.data() + LocInfo.second;
858	return StringRef(Begin, Tok.getLength());
859	}
860
861	} // end namespace comments
862	} // end namespace clang
863

source code of clang/lib/AST/CommentLexer.cpp