1 | //===--- SourceCode.h - Manipulating source code as strings -----*- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // Various code that examines C++ source code without using heavy AST machinery |
10 | // (and often not even the lexer). To be used sparingly! |
11 | // |
12 | //===----------------------------------------------------------------------===// |
13 | #ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_SOURCECODE_H |
14 | #define |
15 | |
16 | #include "Protocol.h" |
17 | #include "support/Context.h" |
18 | #include "support/ThreadsafeFS.h" |
19 | #include "clang/Basic/CharInfo.h" |
20 | #include "clang/Basic/Diagnostic.h" |
21 | #include "clang/Basic/LangOptions.h" |
22 | #include "clang/Basic/SourceLocation.h" |
23 | #include "clang/Basic/SourceManager.h" |
24 | #include "clang/Format/Format.h" |
25 | #include "clang/Lex/HeaderSearch.h" |
26 | #include "clang/Tooling/Core/Replacement.h" |
27 | #include "clang/Tooling/Syntax/Tokens.h" |
28 | #include "llvm/ADT/StringRef.h" |
29 | #include "llvm/ADT/StringSet.h" |
30 | #include "llvm/Support/Error.h" |
31 | #include <optional> |
32 | #include <string> |
33 | |
34 | namespace clang { |
35 | class SourceManager; |
36 | |
37 | namespace clangd { |
38 | |
39 | // We tend to generate digests for source codes in a lot of different places. |
40 | // This represents the type for those digests to prevent us hard coding details |
41 | // of hashing function at every place that needs to store this information. |
42 | using FileDigest = std::array<uint8_t, 8>; |
43 | FileDigest digest(StringRef Content); |
44 | std::optional<FileDigest> digestFile(const SourceManager &SM, FileID FID); |
45 | |
46 | // This context variable controls the behavior of functions in this file |
47 | // that convert between LSP offsets and native clang byte offsets. |
48 | // If not set, defaults to UTF-16 for backwards-compatibility. |
49 | extern Key<OffsetEncoding> kCurrentOffsetEncoding; |
50 | |
51 | // Counts the number of UTF-16 code units needed to represent a string (LSP |
52 | // specifies string lengths in UTF-16 code units). |
53 | // Use of UTF-16 may be overridden by kCurrentOffsetEncoding. |
54 | size_t lspLength(StringRef Code); |
55 | |
56 | /// Turn a [line, column] pair into an offset in Code. |
57 | /// |
58 | /// If P.character exceeds the line length, returns the offset at end-of-line. |
59 | /// (If !AllowColumnsBeyondLineLength, then returns an error instead). |
60 | /// If the line number is out of range, returns an error. |
61 | /// |
62 | /// The returned value is in the range [0, Code.size()]. |
63 | llvm::Expected<size_t> |
64 | positionToOffset(llvm::StringRef Code, Position P, |
65 | bool AllowColumnsBeyondLineLength = true); |
66 | |
67 | /// Turn an offset in Code into a [line, column] pair. |
68 | /// The offset must be in range [0, Code.size()]. |
69 | Position offsetToPosition(llvm::StringRef Code, size_t Offset); |
70 | |
71 | /// Turn a SourceLocation into a [line, column] pair. |
72 | /// FIXME: This should return an error if the location is invalid. |
73 | Position sourceLocToPosition(const SourceManager &SM, SourceLocation Loc); |
74 | |
75 | /// Return the file location, corresponding to \p P. Note that one should take |
76 | /// care to avoid comparing the result with expansion locations. |
77 | llvm::Expected<SourceLocation> sourceLocationInMainFile(const SourceManager &SM, |
78 | Position P); |
79 | |
80 | /// Returns true iff \p Loc is inside the main file. This function handles |
81 | /// file & macro locations. For macro locations, returns iff the macro is being |
82 | /// expanded inside the main file. |
83 | /// |
84 | /// The function is usually used to check whether a declaration is inside the |
85 | /// the main file. |
86 | bool isInsideMainFile(SourceLocation Loc, const SourceManager &SM); |
87 | |
88 | /// Returns the #include location through which IncludedFIle was loaded. |
89 | /// Where SM.getIncludeLoc() returns the location of the *filename*, which may |
90 | /// be in a macro, includeHashLoc() returns the location of the #. |
91 | SourceLocation includeHashLoc(FileID IncludedFile, const SourceManager &SM); |
92 | |
93 | /// Returns true if the token at Loc is spelled in the source code. |
94 | /// This is not the case for: |
95 | /// * symbols formed via macro concatenation, the spelling location will |
96 | /// be "<scratch space>" |
97 | /// * symbols controlled and defined by a compile command-line option |
98 | /// `-DName=foo`, the spelling location will be "<command line>". |
99 | bool isSpelledInSource(SourceLocation Loc, const SourceManager &SM); |
100 | |
101 | /// Turns a token range into a half-open range and checks its correctness. |
102 | /// The resulting range will have only valid source location on both sides, both |
103 | /// of which are file locations. |
104 | /// |
105 | /// File locations always point to a particular offset in a file, i.e. they |
106 | /// never refer to a location inside a macro expansion. Turning locations from |
107 | /// macro expansions into file locations is ambiguous - one can use |
108 | /// SourceManager::{getExpansion|getFile|getSpelling}Loc. This function |
109 | /// calls SourceManager::getFileLoc on both ends of \p R to do the conversion. |
110 | /// |
111 | /// User input (e.g. cursor position) is expressed as a file location, so this |
112 | /// function can be viewed as a way to normalize the ranges used in the clang |
113 | /// AST so that they are comparable with ranges coming from the user input. |
114 | std::optional<SourceRange> toHalfOpenFileRange(const SourceManager &Mgr, |
115 | const LangOptions &LangOpts, |
116 | SourceRange R); |
117 | |
118 | /// Returns true iff all of the following conditions hold: |
119 | /// - start and end locations are valid, |
120 | /// - start and end locations are file locations from the same file |
121 | /// (i.e. expansion locations are not taken into account). |
122 | /// - start offset <= end offset. |
123 | /// FIXME: introduce a type for source range with this invariant. |
124 | bool isValidFileRange(const SourceManager &Mgr, SourceRange R); |
125 | |
126 | /// Returns the source code covered by the source range. |
127 | /// EXPECTS: isValidFileRange(R) == true. |
128 | llvm::StringRef toSourceCode(const SourceManager &SM, SourceRange R); |
129 | |
130 | // Converts a half-open clang source range to an LSP range. |
131 | // Note that clang also uses closed source ranges, which this can't handle! |
132 | Range halfOpenToRange(const SourceManager &SM, CharSourceRange R); |
133 | |
134 | // Expand range `A` to also contain `B`. |
135 | void unionRanges(Range &A, Range B); |
136 | |
137 | // Converts an offset to a clang line/column (1-based, columns are bytes). |
138 | // The offset must be in range [0, Code.size()]. |
139 | // Prefer to use SourceManager if one is available. |
140 | std::pair<size_t, size_t> offsetToClangLineColumn(llvm::StringRef Code, |
141 | size_t Offset); |
142 | |
143 | /// From "a::b::c", return {"a::b::", "c"}. Scope is empty if there's no |
144 | /// qualifier. |
145 | std::pair<llvm::StringRef, llvm::StringRef> |
146 | splitQualifiedName(llvm::StringRef QName); |
147 | |
148 | TextEdit replacementToEdit(StringRef Code, const tooling::Replacement &R); |
149 | |
150 | std::vector<TextEdit> replacementsToEdits(StringRef Code, |
151 | const tooling::Replacements &Repls); |
152 | |
153 | TextEdit toTextEdit(const FixItHint &FixIt, const SourceManager &M, |
154 | const LangOptions &L); |
155 | |
156 | /// Get the canonical path of \p F. This means: |
157 | /// |
158 | /// - Absolute path |
159 | /// - Symlinks resolved |
160 | /// - No "." or ".." component |
161 | /// - No duplicate or trailing directory separator |
162 | /// |
163 | /// This function should be used when paths needs to be used outside the |
164 | /// component that generate it, so that paths are normalized as much as |
165 | /// possible. |
166 | std::optional<std::string> getCanonicalPath(const FileEntryRef F, |
167 | FileManager &FileMgr); |
168 | |
169 | /// Choose the clang-format style we should apply to a certain file. |
170 | /// This will usually use FS to look for .clang-format directories. |
171 | /// FIXME: should we be caching the .clang-format file search? |
172 | /// This uses format::DefaultFormatStyle and format::DefaultFallbackStyle, |
173 | /// though the latter may have been overridden in main()! |
174 | /// \p FormatFile indicates whether the returned FormatStyle is used |
175 | /// to format the entire main file (or a range selected by the user |
176 | /// which can be arbitrarily long). |
177 | format::FormatStyle getFormatStyleForFile(llvm::StringRef File, |
178 | llvm::StringRef Content, |
179 | const ThreadsafeFS &TFS, |
180 | bool FormatFile); |
181 | |
182 | /// Cleanup and format the given replacements. |
183 | llvm::Expected<tooling::Replacements> |
184 | cleanupAndFormat(StringRef Code, const tooling::Replacements &Replaces, |
185 | const format::FormatStyle &Style); |
186 | |
187 | /// A set of edits generated for a single file. Can verify whether it is safe to |
188 | /// apply these edits to a code block. |
189 | struct Edit { |
190 | tooling::Replacements Replacements; |
191 | std::string InitialCode; |
192 | |
193 | Edit() = default; |
194 | |
195 | Edit(llvm::StringRef Code, tooling::Replacements Reps) |
196 | : Replacements(std::move(Reps)), InitialCode(Code) {} |
197 | |
198 | /// Returns the file contents after changes are applied. |
199 | llvm::Expected<std::string> apply() const; |
200 | |
201 | /// Represents Replacements as TextEdits that are available for use in LSP. |
202 | std::vector<TextEdit> asTextEdits() const; |
203 | |
204 | /// Checks whether the Replacements are applicable to given Code. |
205 | bool canApplyTo(llvm::StringRef Code) const; |
206 | }; |
207 | /// A mapping from absolute file path (the one used for accessing the underlying |
208 | /// VFS) to edits. |
209 | using FileEdits = llvm::StringMap<Edit>; |
210 | |
211 | /// Formats the edits and code around it according to Style. Changes |
212 | /// Replacements to formatted ones if succeeds. |
213 | llvm::Error reformatEdit(Edit &E, const format::FormatStyle &Style); |
214 | |
215 | /// Apply an incremental update to a text document. |
216 | llvm::Error applyChange(std::string &Contents, |
217 | const TextDocumentContentChangeEvent &Change); |
218 | |
219 | /// Collects identifiers with counts in the source code. |
220 | llvm::StringMap<unsigned> collectIdentifiers(llvm::StringRef Content, |
221 | const format::FormatStyle &Style); |
222 | |
223 | /// Collects all ranges of the given identifier in the source code. |
224 | std::vector<Range> collectIdentifierRanges(llvm::StringRef Identifier, |
225 | llvm::StringRef Content, |
226 | const LangOptions &LangOpts); |
227 | |
228 | /// Collects words from the source code. |
229 | /// Unlike collectIdentifiers: |
230 | /// - also finds text in comments: |
231 | /// - splits text into words |
232 | /// - drops stopwords like "get" and "for" |
233 | llvm::StringSet<> collectWords(llvm::StringRef Content); |
234 | |
235 | // Something that looks like a word in the source code. |
236 | // Could be a "real" token that's "live" in the AST, a spelled token consumed by |
237 | // the preprocessor, or part of a spelled token (e.g. word in a comment). |
238 | struct SpelledWord { |
239 | // (Spelling) location of the start of the word. |
240 | SourceLocation Location; |
241 | // The range of the word itself, excluding any quotes. |
242 | // This is a subrange of the file buffer. |
243 | llvm::StringRef Text; |
244 | // Whether this word is likely to refer to an identifier. True if: |
245 | // - the word is a spelled identifier token |
246 | // - Text is identifier-like (e.g. "foo_bar") |
247 | // - Text is surrounded by backticks (e.g. Foo in "// returns `Foo`") |
248 | bool LikelyIdentifier = false; |
249 | // Set if the word is contained in a token spelled in the file. |
250 | // (This should always be true, but comments aren't retained by TokenBuffer). |
251 | const syntax::Token *PartOfSpelledToken = nullptr; |
252 | // Set if the word is exactly a token spelled in the file. |
253 | const syntax::Token *SpelledToken = nullptr; |
254 | // Set if the word is a token spelled in the file, and that token survives |
255 | // preprocessing to emit an expanded token spelled the same way. |
256 | const syntax::Token *ExpandedToken = nullptr; |
257 | |
258 | // Find the unique word that contains SpelledLoc or starts/ends there. |
259 | static std::optional<SpelledWord> touching(SourceLocation SpelledLoc, |
260 | const syntax::TokenBuffer &TB, |
261 | const LangOptions &LangOpts); |
262 | }; |
263 | |
264 | /// Return true if the \p TokenName is in the list of reversed keywords of the |
265 | /// language. |
266 | bool isKeyword(llvm::StringRef TokenName, const LangOptions &LangOpts); |
267 | |
268 | /// Heuristically determine namespaces visible at a point, without parsing Code. |
269 | /// This considers using-directives and enclosing namespace-declarations that |
270 | /// are visible (and not obfuscated) in the file itself (not headers). |
271 | /// Code should be truncated at the point of interest. |
272 | /// |
273 | /// The returned vector is always non-empty. |
274 | /// - The first element is the namespace that encloses the point: a declaration |
275 | /// near the point would be within this namespace. |
276 | /// - The elements are the namespaces in scope at the point: an unqualified |
277 | /// lookup would search within these namespaces. |
278 | /// |
279 | /// Using directives are resolved against all enclosing scopes, but no other |
280 | /// namespace directives. |
281 | /// |
282 | /// example: |
283 | /// using namespace a; |
284 | /// namespace foo { |
285 | /// using namespace b; |
286 | /// |
287 | /// visibleNamespaces are {"foo::", "", "a::", "b::", "foo::b::"}, not "a::b::". |
288 | std::vector<std::string> visibleNamespaces(llvm::StringRef Code, |
289 | const LangOptions &LangOpts); |
290 | |
291 | /// Represents locations that can accept a definition. |
292 | struct EligibleRegion { |
293 | /// Namespace that owns all of the EligiblePoints, e.g. |
294 | /// namespace a{ namespace b {^ void foo();^} } |
295 | /// It will be “a::b” for both carrot locations. |
296 | std::string EnclosingNamespace; |
297 | /// Offsets into the code marking eligible points to insert a function |
298 | /// definition. |
299 | std::vector<Position> EligiblePoints; |
300 | }; |
301 | |
302 | /// Returns most eligible region to insert a definition for \p |
303 | /// FullyQualifiedName in the \p Code. |
304 | /// Pseudo parses \pCode under the hood to determine namespace decls and |
305 | /// possible insertion points. Choses the region that matches the longest prefix |
306 | /// of \p FullyQualifiedName. Returns EOF if there are no shared namespaces. |
307 | /// \p FullyQualifiedName should not contain anonymous namespaces. |
308 | EligibleRegion getEligiblePoints(llvm::StringRef Code, |
309 | llvm::StringRef FullyQualifiedName, |
310 | const LangOptions &LangOpts); |
311 | |
312 | struct DefinedMacro { |
313 | llvm::StringRef Name; |
314 | const MacroInfo *Info; |
315 | /// Location of the identifier that names the macro. |
316 | /// Unlike Info->Location, this translates preamble-patch locations to |
317 | /// main-file locations. |
318 | SourceLocation NameLoc; |
319 | }; |
320 | /// Gets the macro referenced by \p SpelledTok. It must be a spelled token |
321 | /// aligned to the beginning of an identifier. |
322 | std::optional<DefinedMacro> locateMacroAt(const syntax::Token &SpelledTok, |
323 | Preprocessor &PP); |
324 | |
325 | /// Infers whether this is a header from the FileName and LangOpts (if |
326 | /// presents). |
327 | bool (llvm::StringRef FileName, |
328 | std::optional<LangOptions> LangOpts = std::nullopt); |
329 | |
330 | /// Returns true if the given location is in a generated protobuf file. |
331 | bool isProtoFile(SourceLocation Loc, const SourceManager &SourceMgr); |
332 | |
333 | /// Returns true if Name is reserved, like _Foo or __Vector_base. |
334 | inline bool isReservedName(llvm::StringRef Name) { |
335 | // This doesn't catch all cases, but the most common. |
336 | return Name.size() >= 2 && Name[0] == '_' && |
337 | (isUppercase(c: Name[1]) || Name[1] == '_'); |
338 | } |
339 | |
340 | /// Translates locations inside preamble patch to their main-file equivalent |
341 | /// using presumed locations. Returns \p Loc if it isn't inside preamble patch. |
342 | SourceLocation translatePreamblePatchLocation(SourceLocation Loc, |
343 | const SourceManager &SM); |
344 | |
345 | /// Returns the range starting at offset and spanning the whole line. Escaped |
346 | /// newlines are not handled. |
347 | clangd::Range rangeTillEOL(llvm::StringRef Code, unsigned HashOffset); |
348 | } // namespace clangd |
349 | } // namespace clang |
350 | #endif |
351 | |