1 | //===--- Macros.h - Format C++ code -----------------------------*- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | /// |
9 | /// \file |
10 | /// This file contains the main building blocks of macro support in |
11 | /// clang-format. |
12 | /// |
13 | /// In order to not violate the requirement that clang-format can format files |
14 | /// in isolation, clang-format's macro support uses expansions users provide |
15 | /// as part of clang-format's style configuration. |
16 | /// |
17 | /// Macro definitions are of the form "MACRO(p1, p2)=p1 + p2", but only support |
18 | /// one level of expansion (\see MacroExpander for a full description of what |
19 | /// is supported). |
20 | /// |
21 | /// As part of parsing, clang-format uses the MacroExpander to expand the |
22 | /// spelled token streams into expanded token streams when it encounters a |
23 | /// macro call. The UnwrappedLineParser continues to parse UnwrappedLines |
24 | /// from the expanded token stream. |
25 | /// After the expanded unwrapped lines are parsed, the MacroCallReconstructor |
26 | /// matches the spelled token stream into unwrapped lines that best resemble the |
27 | /// structure of the expanded unwrapped lines. These reconstructed unwrapped |
28 | /// lines are aliasing the tokens in the expanded token stream, so that token |
29 | /// annotations will be reused when formatting the spelled macro calls. |
30 | /// |
31 | /// When formatting, clang-format annotates and formats the expanded unwrapped |
32 | /// lines first, determining the token types. Next, it formats the spelled |
33 | /// unwrapped lines, keeping the token types fixed, while allowing other |
34 | /// formatting decisions to change. |
35 | /// |
36 | //===----------------------------------------------------------------------===// |
37 | |
38 | #ifndef CLANG_LIB_FORMAT_MACROS_H |
39 | #define CLANG_LIB_FORMAT_MACROS_H |
40 | |
41 | #include <list> |
42 | |
43 | #include "FormatToken.h" |
44 | #include "llvm/ADT/DenseMap.h" |
45 | |
46 | namespace clang { |
47 | namespace format { |
48 | |
49 | struct UnwrappedLine; |
50 | struct UnwrappedLineNode; |
51 | |
52 | /// Takes a set of macro definitions as strings and allows expanding calls to |
53 | /// those macros. |
54 | /// |
55 | /// For example: |
56 | /// Definition: A(x, y)=x + y |
57 | /// Call : A(int a = 1, 2) |
58 | /// Expansion : int a = 1 + 2 |
59 | /// |
60 | /// Expansion does not check arity of the definition. |
61 | /// If fewer arguments than expected are provided, the remaining parameters |
62 | /// are considered empty: |
63 | /// Call : A(a) |
64 | /// Expansion: a + |
65 | /// If more arguments than expected are provided, they will be discarded. |
66 | /// |
67 | /// The expander does not support: |
68 | /// - recursive expansion |
69 | /// - stringification |
70 | /// - concatenation |
71 | /// - variadic macros |
72 | /// |
73 | /// Furthermore, only a single expansion of each macro argument is supported, |
74 | /// so that we cannot get conflicting formatting decisions from different |
75 | /// expansions; later uses of the same parameter are left unexpanded: |
76 | /// Definition: A(x)=x+x |
77 | /// Call : A(id) |
78 | /// Expansion : id+x |
79 | /// |
80 | class MacroExpander { |
81 | public: |
82 | using ArgsList = llvm::ArrayRef<llvm::SmallVector<FormatToken *, 8>>; |
83 | |
84 | /// Construct a macro expander from a set of macro definitions. |
85 | /// Macro definitions must be encoded as UTF-8. |
86 | /// |
87 | /// Each entry in \p Macros must conform to the following simple |
88 | /// macro-definition language: |
89 | /// <definition> ::= <id> <expansion> | <id> "(" <params> ")" <expansion> |
90 | /// <params> ::= <id-list> | "" |
91 | /// <id-list> ::= <id> | <id> "," <params> |
92 | /// <expansion> ::= "=" <tail> | <eof> |
93 | /// <tail> ::= <tok> <tail> | <eof> |
94 | /// |
95 | /// Macros that cannot be parsed will be silently discarded. |
96 | /// |
97 | MacroExpander(const std::vector<std::string> &Macros, |
98 | clang::SourceManager &SourceMgr, const FormatStyle &Style, |
99 | llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator, |
100 | IdentifierTable &IdentTable); |
101 | ~MacroExpander(); |
102 | |
103 | /// Returns whether any macro \p Name is defined, regardless of overloads. |
104 | bool defined(llvm::StringRef Name) const; |
105 | |
106 | /// Returns whether macro \p Name has an object-like overload, i.e. where the macro |
107 | /// has no arguments and should not consume subsequent parentheses. |
108 | bool objectLike(llvm::StringRef Name) const; |
109 | |
110 | /// Returns whether macro \p Name provides an overload with the given arity. |
111 | bool hasArity(llvm::StringRef Name, unsigned Arity) const; |
112 | |
113 | /// Returns the expanded stream of format tokens for \p ID, where |
114 | /// each element in \p OptionalArgs is a positional argument to the macro call. |
115 | /// If \p OptionalArgs is not set, the object-like overload is used. |
116 | /// If \p OptionalArgs is set, the overload with the arity equal to |
117 | /// \c OptionalArgs->size() is used. |
118 | llvm::SmallVector<FormatToken *, 8> |
119 | expand(FormatToken *ID, std::optional<ArgsList> OptionalArgs) const; |
120 | |
121 | private: |
122 | struct Definition; |
123 | class DefinitionParser; |
124 | |
125 | void parseDefinition(const std::string &Macro); |
126 | |
127 | clang::SourceManager &SourceMgr; |
128 | const FormatStyle &Style; |
129 | llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator; |
130 | IdentifierTable &IdentTable; |
131 | SmallVector<std::unique_ptr<llvm::MemoryBuffer>> Buffers; |
132 | llvm::StringMap<llvm::DenseMap<int, Definition>> FunctionLike; |
133 | llvm::StringMap<Definition> ObjectLike; |
134 | }; |
135 | |
136 | /// Converts a sequence of UnwrappedLines containing expanded macros into a |
137 | /// single UnwrappedLine containing the macro calls. This UnwrappedLine may be |
138 | /// broken into child lines, in a way that best conveys the structure of the |
139 | /// expanded code. |
140 | /// |
141 | /// In the simplest case, a spelled UnwrappedLine contains one macro, and after |
142 | /// expanding it we have one expanded UnwrappedLine. In general, macro |
143 | /// expansions can span UnwrappedLines, and multiple macros can contribute |
144 | /// tokens to the same line. We keep consuming expanded lines until: |
145 | /// * all expansions that started have finished (we're not chopping any macros |
146 | /// in half) |
147 | /// * *and* we've reached the end of a *spelled* unwrapped line. |
148 | /// |
149 | /// A single UnwrappedLine represents this chunk of code. |
150 | /// |
151 | /// After this point, the state of the spelled/expanded stream is "in sync" |
152 | /// (both at the start of an UnwrappedLine, with no macros open), so the |
153 | /// Reconstructor can be thrown away and parsing can continue. |
154 | /// |
155 | /// Given a mapping from the macro name identifier token in the macro call |
156 | /// to the tokens of the macro call, for example: |
157 | /// CLASSA -> CLASSA({public: void x();}) |
158 | /// |
159 | /// When getting the formatted lines of the expansion via the \c addLine method |
160 | /// (each '->' specifies a call to \c addLine ): |
161 | /// -> class A { |
162 | /// -> public: |
163 | /// -> void x(); |
164 | /// -> }; |
165 | /// |
166 | /// Creates the tree of unwrapped lines containing the macro call tokens so that |
167 | /// the macro call tokens fit the semantic structure of the expanded formatted |
168 | /// lines: |
169 | /// -> CLASSA({ |
170 | /// -> public: |
171 | /// -> void x(); |
172 | /// -> }) |
173 | class MacroCallReconstructor { |
174 | public: |
175 | /// Create a Reconstructor whose resulting \p UnwrappedLine will start at |
176 | /// \p Level, using the map from name identifier token to the corresponding |
177 | /// tokens of the spelled macro call. |
178 | MacroCallReconstructor( |
179 | unsigned Level, |
180 | const llvm::DenseMap<FormatToken *, std::unique_ptr<UnwrappedLine>> |
181 | &ActiveExpansions); |
182 | |
183 | /// For the given \p Line, match all occurrences of tokens expanded from a |
184 | /// macro to unwrapped lines in the spelled macro call so that the resulting |
185 | /// tree of unwrapped lines best resembles the structure of unwrapped lines |
186 | /// passed in via \c addLine. |
187 | void addLine(const UnwrappedLine &Line); |
188 | |
189 | /// Check whether at the current state there is no open macro expansion |
190 | /// that needs to be processed to finish a macro call. |
191 | /// Only when \c finished() is true, \c takeResult() can be called to retrieve |
192 | /// the resulting \c UnwrappedLine. |
193 | /// If there are multiple subsequent macro calls within an unwrapped line in |
194 | /// the spelled token stream, the calling code may also continue to call |
195 | /// \c addLine() when \c finished() is true. |
196 | bool finished() const { return ActiveExpansions.empty(); } |
197 | |
198 | /// Retrieve the formatted \c UnwrappedLine containing the original |
199 | /// macro calls, formatted according to the expanded token stream received |
200 | /// via \c addLine(). |
201 | /// Generally, this line tries to have the same structure as the expanded, |
202 | /// formatted unwrapped lines handed in via \c addLine(), with the exception |
203 | /// that for multiple top-level lines, each subsequent line will be the |
204 | /// child of the last token in its predecessor. This representation is chosen |
205 | /// because it is a precondition to the formatter that we get what looks like |
206 | /// a single statement in a single \c UnwrappedLine (i.e. matching parens). |
207 | /// |
208 | /// If a token in a macro argument is a child of a token in the expansion, |
209 | /// the parent will be the corresponding token in the macro call. |
210 | /// For example: |
211 | /// #define C(a, b) class C { a b |
212 | /// C(int x;, int y;) |
213 | /// would expand to |
214 | /// class C { int x; int y; |
215 | /// where in a formatted line "int x;" and "int y;" would both be new separate |
216 | /// lines. |
217 | /// |
218 | /// In the result, "int x;" will be a child of the opening parenthesis in "C(" |
219 | /// and "int y;" will be a child of the "," token: |
220 | /// C ( |
221 | /// \- int x; |
222 | /// , |
223 | /// \- int y; |
224 | /// ) |
225 | UnwrappedLine takeResult() &&; |
226 | |
227 | private: |
228 | void add(FormatToken *Token, FormatToken *ExpandedParent, bool First, |
229 | unsigned Level); |
230 | void prepareParent(FormatToken *ExpandedParent, bool First, unsigned Level); |
231 | FormatToken *getParentInResult(FormatToken *Parent); |
232 | void reconstruct(FormatToken *Token); |
233 | void startReconstruction(FormatToken *Token); |
234 | bool reconstructActiveCallUntil(FormatToken *Token); |
235 | void endReconstruction(FormatToken *Token); |
236 | bool processNextReconstructed(); |
237 | void finalize(); |
238 | |
239 | struct ReconstructedLine; |
240 | |
241 | void appendToken(FormatToken *Token, ReconstructedLine *L = nullptr); |
242 | UnwrappedLine createUnwrappedLine(const ReconstructedLine &Line, int Level); |
243 | void debug(const ReconstructedLine &Line, int Level); |
244 | ReconstructedLine &parentLine(); |
245 | ReconstructedLine *currentLine(); |
246 | void debugParentMap() const; |
247 | |
248 | #ifndef NDEBUG |
249 | enum ReconstructorState { |
250 | Start, // No macro expansion was found in the input yet. |
251 | InProgress, // During a macro reconstruction. |
252 | Finalized, // Past macro reconstruction, the result is finalized. |
253 | }; |
254 | ReconstructorState State = Start; |
255 | #endif |
256 | |
257 | // Node in which we build up the resulting unwrapped line; this type is |
258 | // analogous to UnwrappedLineNode. |
259 | struct LineNode { |
260 | LineNode() = default; |
261 | LineNode(FormatToken *Tok) : Tok(Tok) {} |
262 | FormatToken *Tok = nullptr; |
263 | llvm::SmallVector<std::unique_ptr<ReconstructedLine>> Children; |
264 | }; |
265 | |
266 | // Line in which we build up the resulting unwrapped line. |
267 | // FIXME: Investigate changing UnwrappedLine to a pointer type and using it |
268 | // instead of rolling our own type. |
269 | struct ReconstructedLine { |
270 | explicit ReconstructedLine(unsigned Level) : Level(Level) {} |
271 | unsigned Level; |
272 | llvm::SmallVector<std::unique_ptr<LineNode>> Tokens; |
273 | }; |
274 | |
275 | // The line in which we collect the resulting reconstructed output. |
276 | // To reduce special cases in the algorithm, the first level of the line |
277 | // contains a single null token that has the reconstructed incoming |
278 | // lines as children. |
279 | // In the end, we stitch the lines together so that each subsequent line |
280 | // is a child of the last token of the previous line. This is necessary |
281 | // in order to format the overall expression as a single logical line - |
282 | // if we created separate lines, we'd format them with their own top-level |
283 | // indent depending on the semantic structure, which is not desired. |
284 | ReconstructedLine Result; |
285 | |
286 | // Stack of currently "open" lines, where each line's predecessor's last |
287 | // token is the parent token for that line. |
288 | llvm::SmallVector<ReconstructedLine *> ActiveReconstructedLines; |
289 | |
290 | // Maps from the expanded token to the token that takes its place in the |
291 | // reconstructed token stream in terms of parent-child relationships. |
292 | // Note that it might take multiple steps to arrive at the correct |
293 | // parent in the output. |
294 | // Given: #define C(a, b) []() { a; b; } |
295 | // And a call: C(f(), g()) |
296 | // The structure in the incoming formatted unwrapped line will be: |
297 | // []() { |
298 | // |- f(); |
299 | // \- g(); |
300 | // } |
301 | // with f and g being children of the opening brace. |
302 | // In the reconstructed call: |
303 | // C(f(), g()) |
304 | // \- f() |
305 | // \- g() |
306 | // We want f to be a child of the opening parenthesis and g to be a child |
307 | // of the comma token in the macro call. |
308 | // Thus, we map |
309 | // { -> ( |
310 | // and add |
311 | // ( -> , |
312 | // once we're past the comma in the reconstruction. |
313 | llvm::DenseMap<FormatToken *, FormatToken *> |
314 | SpelledParentToReconstructedParent; |
315 | |
316 | // Keeps track of a single expansion while we're reconstructing tokens it |
317 | // generated. |
318 | struct Expansion { |
319 | // The identifier token of the macro call. |
320 | FormatToken *ID; |
321 | // Our current position in the reconstruction. |
322 | std::list<UnwrappedLineNode>::iterator SpelledI; |
323 | // The end of the reconstructed token sequence. |
324 | std::list<UnwrappedLineNode>::iterator SpelledE; |
325 | }; |
326 | |
327 | // Stack of macro calls for which we're in the middle of an expansion. |
328 | llvm::SmallVector<Expansion> ActiveExpansions; |
329 | |
330 | struct MacroCallState { |
331 | MacroCallState(ReconstructedLine *Line, FormatToken *ParentLastToken, |
332 | FormatToken *MacroCallLParen); |
333 | |
334 | ReconstructedLine *Line; |
335 | |
336 | // The last token in the parent line or expansion, or nullptr if the macro |
337 | // expansion is on a top-level line. |
338 | // |
339 | // For example, in the macro call: |
340 | // auto f = []() { ID(1); }; |
341 | // The MacroCallState for ID will have '{' as ParentLastToken. |
342 | // |
343 | // In the macro call: |
344 | // ID(ID(void f())); |
345 | // The MacroCallState of the outer ID will have nullptr as ParentLastToken, |
346 | // while the MacroCallState for the inner ID will have the '(' of the outer |
347 | // ID as ParentLastToken. |
348 | // |
349 | // In the macro call: |
350 | // ID2(a, ID(b)); |
351 | // The MacroCallState of ID will have ',' as ParentLastToken. |
352 | FormatToken *ParentLastToken; |
353 | |
354 | // The l_paren of this MacroCallState's macro call. |
355 | FormatToken *MacroCallLParen; |
356 | }; |
357 | |
358 | // Keeps track of the lines into which the opening brace/parenthesis & |
359 | // argument separating commas for each level in the macro call go in order to |
360 | // put the corresponding closing brace/parenthesis into the same line in the |
361 | // output and keep track of which parents in the expanded token stream map to |
362 | // which tokens in the reconstructed stream. |
363 | // When an opening brace/parenthesis has children, we want the structure of |
364 | // the output line to be: |
365 | // |- MACRO |
366 | // |- ( |
367 | // | \- <argument> |
368 | // |- , |
369 | // | \- <argument> |
370 | // \- ) |
371 | llvm::SmallVector<MacroCallState> MacroCallStructure; |
372 | |
373 | // Maps from identifier of the macro call to an unwrapped line containing |
374 | // all tokens of the macro call. |
375 | const llvm::DenseMap<FormatToken *, std::unique_ptr<UnwrappedLine>> |
376 | &IdToReconstructed; |
377 | }; |
378 | |
379 | } // namespace format |
380 | } // namespace clang |
381 | |
382 | #endif |
383 | |