1 | //===- FormatGen.h - Utilities for custom assembly formats ------*- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This file contains common classes for building custom assembly format parsers |
10 | // and generators. |
11 | // |
12 | //===----------------------------------------------------------------------===// |
13 | |
14 | #ifndef MLIR_TOOLS_MLIRTBLGEN_FORMATGEN_H_ |
15 | #define MLIR_TOOLS_MLIRTBLGEN_FORMATGEN_H_ |
16 | |
17 | #include "mlir/Support/LLVM.h" |
18 | #include "mlir/Support/LogicalResult.h" |
19 | #include "llvm/ADT/StringRef.h" |
20 | #include "llvm/ADT/StringSet.h" |
21 | #include "llvm/Support/Allocator.h" |
22 | #include "llvm/Support/CommandLine.h" |
23 | #include "llvm/Support/SMLoc.h" |
24 | #include <vector> |
25 | |
// Forward declaration only: this header refers to the source manager purely by
// reference, so its full definition is not required here.
namespace llvm {
class SourceMgr;
} // namespace llvm
29 | |
30 | namespace mlir { |
31 | namespace tblgen { |
32 | |
33 | //===----------------------------------------------------------------------===// |
34 | // FormatToken |
35 | //===----------------------------------------------------------------------===// |
36 | |
37 | /// This class represents a specific token in the input format. |
38 | class FormatToken { |
39 | public: |
40 | /// Basic token kinds. |
41 | enum Kind { |
42 | // Markers. |
43 | eof, |
44 | error, |
45 | |
46 | // Tokens with no info. |
47 | l_paren, |
48 | r_paren, |
49 | caret, |
50 | colon, |
51 | comma, |
52 | equal, |
53 | less, |
54 | greater, |
55 | question, |
56 | star, |
57 | pipe, |
58 | |
59 | // Keywords. |
60 | keyword_start, |
61 | kw_attr_dict, |
62 | kw_attr_dict_w_keyword, |
63 | kw_prop_dict, |
64 | kw_custom, |
65 | kw_functional_type, |
66 | kw_oilist, |
67 | kw_operands, |
68 | kw_params, |
69 | kw_qualified, |
70 | kw_ref, |
71 | kw_regions, |
72 | kw_results, |
73 | kw_struct, |
74 | kw_successors, |
75 | kw_type, |
76 | keyword_end, |
77 | |
78 | // String valued tokens. |
79 | identifier, |
80 | literal, |
81 | variable, |
82 | string, |
83 | }; |
84 | |
85 | FormatToken(Kind kind, StringRef spelling) : kind(kind), spelling(spelling) {} |
86 | |
87 | /// Return the bytes that make up this token. |
88 | StringRef getSpelling() const { return spelling; } |
89 | |
90 | /// Return the kind of this token. |
91 | Kind getKind() const { return kind; } |
92 | |
93 | /// Return a location for this token. |
94 | SMLoc getLoc() const; |
95 | |
96 | /// Returns true if the token is of the given kind. |
97 | bool is(Kind kind) { return getKind() == kind; } |
98 | |
99 | /// Return if this token is a keyword. |
100 | bool isKeyword() const { |
101 | return getKind() > Kind::keyword_start && getKind() < Kind::keyword_end; |
102 | } |
103 | |
104 | private: |
105 | /// Discriminator that indicates the kind of token this is. |
106 | Kind kind; |
107 | |
108 | /// A reference to the entire token contents; this is always a pointer into |
109 | /// a memory buffer owned by the source manager. |
110 | StringRef spelling; |
111 | }; |
112 | |
113 | //===----------------------------------------------------------------------===// |
114 | // FormatLexer |
115 | //===----------------------------------------------------------------------===// |
116 | |
117 | /// This class implements a simple lexer for operation assembly format strings. |
118 | class FormatLexer { |
119 | public: |
120 | FormatLexer(llvm::SourceMgr &mgr, SMLoc loc); |
121 | |
122 | /// Lex the next token and return it. |
123 | FormatToken lexToken(); |
124 | |
125 | /// Emit an error to the lexer with the given location and message. |
126 | FormatToken emitError(SMLoc loc, const Twine &msg); |
127 | FormatToken emitError(const char *loc, const Twine &msg); |
128 | |
129 | FormatToken emitErrorAndNote(SMLoc loc, const Twine &msg, const Twine ¬e); |
130 | |
131 | private: |
132 | /// Return the next character in the stream. |
133 | int getNextChar(); |
134 | |
135 | /// Lex an identifier, literal, variable, or string. |
136 | FormatToken lexIdentifier(const char *tokStart); |
137 | FormatToken lexLiteral(const char *tokStart); |
138 | FormatToken lexVariable(const char *tokStart); |
139 | FormatToken lexString(const char *tokStart); |
140 | |
141 | /// Create a token with the current pointer and a start pointer. |
142 | FormatToken formToken(FormatToken::Kind kind, const char *tokStart) { |
143 | return FormatToken(kind, StringRef(tokStart, curPtr - tokStart)); |
144 | } |
145 | |
146 | /// The source manager containing the format string. |
147 | llvm::SourceMgr &mgr; |
148 | /// Location of the format string. |
149 | SMLoc loc; |
150 | /// Buffer containing the format string. |
151 | StringRef curBuffer; |
152 | /// Current pointer in the buffer. |
153 | const char *curPtr; |
154 | }; |
155 | |
156 | //===----------------------------------------------------------------------===// |
157 | // FormatElement |
158 | //===----------------------------------------------------------------------===// |
159 | |
/// Root of the format element hierarchy: every node of a parsed assembly
/// format is a `FormatElement`.
///
/// If you squint and take a close look, you can see the outline of a `Format`
/// dialect.
class FormatElement {
public:
  // The top-level kinds of format elements.
  enum Kind {
    Literal,
    String,
    Variable,
    Whitespace,
    Directive,
    Optional
  };

  virtual ~FormatElement();

  /// Return the top-level kind this element was constructed with.
  Kind getKind() const { return kind; }

  /// LLVM-style RTTI hook. Any element is trivially a `FormatElement`, so the
  /// check always succeeds.
  static bool classof(const FormatElement *el) { return true; }

protected:
  /// Only subclasses may construct an element; they supply their own kind.
  FormatElement(Kind kind) : kind(kind) {}

private:
  /// Discriminator recording which top-level kind this element is.
  Kind kind;
};
185 | |
186 | /// The base class for all format elements. This class implements common methods |
187 | /// for LLVM-style RTTI. |
188 | template <FormatElement::Kind ElementKind> |
189 | class FormatElementBase : public FormatElement { |
190 | public: |
191 | /// Support LLVM-style RTTI. |
192 | static bool classof(const FormatElement *el) { |
193 | return ElementKind == el->getKind(); |
194 | } |
195 | |
196 | protected: |
197 | /// Create a format element with the given kind. |
198 | FormatElementBase() : FormatElement(ElementKind) {} |
199 | }; |
200 | |
201 | /// This class represents a literal element. A literal is either one of the |
202 | /// supported punctuation characters (e.g. `(` or `,`) or a string literal (e.g. |
203 | /// `literal`). |
204 | class LiteralElement : public FormatElementBase<FormatElement::Literal> { |
205 | public: |
206 | /// Create a literal element with the given spelling. |
207 | explicit LiteralElement(StringRef spelling) : spelling(spelling) {} |
208 | |
209 | /// Get the spelling of the literal. |
210 | StringRef getSpelling() const { return spelling; } |
211 | |
212 | private: |
213 | /// The spelling of the variable, i.e. the string contained within the |
214 | /// backticks. |
215 | StringRef spelling; |
216 | }; |
217 | |
218 | /// This class represents a raw string that can contain arbitrary C++ code. |
219 | class StringElement : public FormatElementBase<FormatElement::String> { |
220 | public: |
221 | /// Create a string element with the given contents. |
222 | explicit StringElement(std::string value) : value(std::move(value)) {} |
223 | |
224 | /// Get the value of the string element. |
225 | StringRef getValue() const { return value; } |
226 | |
227 | private: |
228 | /// The contents of the string. |
229 | std::string value; |
230 | }; |
231 | |
232 | /// This class represents a variable element. A variable refers to some part of |
233 | /// the object being parsed, e.g. an attribute or operand on an operation or a |
234 | /// parameter on an attribute. |
235 | class VariableElement : public FormatElementBase<FormatElement::Variable> { |
236 | public: |
237 | /// These are the kinds of variables. |
238 | enum Kind { |
239 | Attribute, |
240 | Operand, |
241 | Region, |
242 | Result, |
243 | Successor, |
244 | Parameter, |
245 | Property |
246 | }; |
247 | |
248 | /// Get the kind of variable. |
249 | Kind getKind() const { return kind; } |
250 | |
251 | protected: |
252 | /// Create a variable with a kind. |
253 | VariableElement(Kind kind) : kind(kind) {} |
254 | |
255 | private: |
256 | /// The kind of variable. |
257 | Kind kind; |
258 | }; |
259 | |
260 | /// Base class for variable elements. This class implements common methods for |
261 | /// LLVM-style RTTI. |
262 | template <VariableElement::Kind VariableKind> |
263 | class VariableElementBase : public VariableElement { |
264 | public: |
265 | /// An element is of this class if it is a variable and has the same variable |
266 | /// type. |
267 | static bool classof(const FormatElement *el) { |
268 | if (auto *varEl = dyn_cast<VariableElement>(Val: el)) |
269 | return VariableKind == varEl->getKind(); |
270 | return false; |
271 | } |
272 | |
273 | protected: |
274 | /// Create a variable element with the given variable kind. |
275 | VariableElementBase() : VariableElement(VariableKind) {} |
276 | }; |
277 | |
278 | /// This class represents a whitespace element, e.g. a newline or space. It is a |
279 | /// literal that is printed but never parsed. When the value is empty, i.e. ``, |
280 | /// a space is elided where one would have been printed automatically. |
281 | class WhitespaceElement : public FormatElementBase<FormatElement::Whitespace> { |
282 | public: |
283 | /// Create a whitespace element. |
284 | explicit WhitespaceElement(StringRef value) : value(value) {} |
285 | |
286 | /// Get the whitespace value. |
287 | StringRef getValue() const { return value; } |
288 | |
289 | private: |
290 | /// The value of the whitespace element. Can be empty. |
291 | StringRef value; |
292 | }; |
293 | |
294 | class DirectiveElement : public FormatElementBase<FormatElement::Directive> { |
295 | public: |
296 | /// These are the kinds of directives. |
297 | enum Kind { |
298 | AttrDict, |
299 | PropDict, |
300 | Custom, |
301 | FunctionalType, |
302 | OIList, |
303 | Operands, |
304 | Ref, |
305 | Regions, |
306 | Results, |
307 | Successors, |
308 | Type, |
309 | Params, |
310 | Struct |
311 | }; |
312 | |
313 | /// Get the directive kind. |
314 | Kind getKind() const { return kind; } |
315 | |
316 | protected: |
317 | /// Create a directive element with a kind. |
318 | DirectiveElement(Kind kind) : kind(kind) {} |
319 | |
320 | private: |
321 | /// The directive kind. |
322 | Kind kind; |
323 | }; |
324 | |
325 | /// Base class for directive elements. This class implements common methods for |
326 | /// LLVM-style RTTI. |
327 | template <DirectiveElement::Kind DirectiveKind> |
328 | class DirectiveElementBase : public DirectiveElement { |
329 | public: |
330 | /// Create a directive element with the specified kind. |
331 | DirectiveElementBase() : DirectiveElement(DirectiveKind) {} |
332 | |
333 | /// A format element is of this class if it is a directive element and has the |
334 | /// same kind. |
335 | static bool classof(const FormatElement *el) { |
336 | if (auto *directiveEl = dyn_cast<DirectiveElement>(Val: el)) |
337 | return DirectiveKind == directiveEl->getKind(); |
338 | return false; |
339 | } |
340 | }; |
341 | |
342 | /// This class represents a custom format directive that is implemented by the |
343 | /// user in C++. The directive accepts a list of arguments that is passed to the |
344 | /// C++ function. |
345 | class CustomDirective : public DirectiveElementBase<DirectiveElement::Custom> { |
346 | public: |
347 | /// Create a custom directive with a name and list of arguments. |
348 | CustomDirective(StringRef name, std::vector<FormatElement *> &&arguments) |
349 | : name(name), arguments(std::move(arguments)) {} |
350 | |
351 | /// Get the custom directive name. |
352 | StringRef getName() const { return name; } |
353 | |
354 | /// Get the arguments to the custom directive. |
355 | ArrayRef<FormatElement *> getArguments() const { return arguments; } |
356 | |
357 | private: |
358 | /// The name of the custom directive. The name is used to call two C++ |
359 | /// methods: `parse{name}` and `print{name}` with the given arguments. |
360 | StringRef name; |
361 | /// The arguments with which to call the custom functions. These are either |
362 | /// variables (for which the functions are responsible for populating) or |
363 | /// references to variables. |
364 | std::vector<FormatElement *> arguments; |
365 | }; |
366 | |
367 | /// This class represents a reference directive. This directive can be used to |
368 | /// reference but not bind a previously bound variable or format object. Its |
369 | /// current only use is to pass variables as arguments to the custom directive. |
370 | class RefDirective : public DirectiveElementBase<DirectiveElement::Ref> { |
371 | public: |
372 | /// Create a reference directive with the single referenced child. |
373 | RefDirective(FormatElement *arg) : arg(arg) {} |
374 | |
375 | /// Get the reference argument. |
376 | FormatElement *getArg() const { return arg; } |
377 | |
378 | private: |
379 | /// The referenced argument. |
380 | FormatElement *arg; |
381 | }; |
382 | |
383 | /// This class represents a group of elements that are optionally emitted based |
384 | /// on an optional variable "anchor" and a group of elements that are emitted |
385 | /// when the anchor element is not present. |
386 | class OptionalElement : public FormatElementBase<FormatElement::Optional> { |
387 | public: |
388 | /// Create an optional group with the given child elements. |
389 | OptionalElement(std::vector<FormatElement *> &&thenElements, |
390 | std::vector<FormatElement *> &&elseElements, |
391 | unsigned thenParseStart, unsigned elseParseStart, |
392 | FormatElement *anchor, bool inverted) |
393 | : thenElements(std::move(thenElements)), |
394 | elseElements(std::move(elseElements)), thenParseStart(thenParseStart), |
395 | elseParseStart(elseParseStart), anchor(anchor), inverted(inverted) {} |
396 | |
397 | /// Return the `then` elements of the optional group. Drops the first |
398 | /// `thenParseStart` whitespace elements if `parseable` is true. |
399 | ArrayRef<FormatElement *> getThenElements(bool parseable = false) const { |
400 | return llvm::ArrayRef(thenElements) |
401 | .drop_front(N: parseable ? thenParseStart : 0); |
402 | } |
403 | |
404 | /// Return the `else` elements of the optional group. Drops the first |
405 | /// `elseParseStart` whitespace elements if `parseable` is true. |
406 | ArrayRef<FormatElement *> getElseElements(bool parseable = false) const { |
407 | return llvm::ArrayRef(elseElements) |
408 | .drop_front(N: parseable ? elseParseStart : 0); |
409 | } |
410 | |
411 | /// Return the anchor of the optional group. |
412 | FormatElement *getAnchor() const { return anchor; } |
413 | |
414 | /// Return true if the optional group is inverted. |
415 | bool isInverted() const { return inverted; } |
416 | |
417 | private: |
418 | /// The child elements emitted when the anchor is present. |
419 | std::vector<FormatElement *> thenElements; |
420 | /// The child elements emitted when the anchor is not present. |
421 | std::vector<FormatElement *> elseElements; |
422 | /// The index of the first element that is parsed in `thenElements`. That is, |
423 | /// the first non-whitespace element. |
424 | unsigned thenParseStart; |
425 | /// The index of the first element that is parsed in `elseElements`. That is, |
426 | /// the first non-whitespace element. |
427 | unsigned elseParseStart; |
428 | /// The anchor element of the optional group. |
429 | FormatElement *anchor; |
430 | /// Whether the optional group condition is inverted and the anchor element is |
431 | /// in the else group. |
432 | bool inverted; |
433 | }; |
434 | |
435 | //===----------------------------------------------------------------------===// |
// FormatParser
437 | //===----------------------------------------------------------------------===// |
438 | |
439 | /// Base class for a parser that implements an assembly format. This class |
440 | /// defines a common assembly format syntax and the creation of format elements. |
441 | /// Subclasses will need to implement parsing for the format elements they |
442 | /// support. |
443 | class FormatParser { |
444 | public: |
445 | /// Vtable anchor. |
446 | virtual ~FormatParser(); |
447 | |
448 | /// Parse the assembly format. |
449 | FailureOr<std::vector<FormatElement *>> parse(); |
450 | |
451 | protected: |
452 | /// The current context of the parser when parsing an element. |
453 | enum Context { |
454 | /// The element is being parsed in a "top-level" context, i.e. at the top of |
455 | /// the format or in an optional group. |
456 | TopLevelContext, |
457 | /// The element is being parsed as a custom directive child. |
458 | CustomDirectiveContext, |
459 | /// The element is being parsed as a type directive child. |
460 | TypeDirectiveContext, |
461 | /// The element is being parsed as a reference directive child. |
462 | RefDirectiveContext, |
463 | /// The element is being parsed as a struct directive child. |
464 | StructDirectiveContext |
465 | }; |
466 | |
467 | /// Create a format parser with the given source manager and a location. |
468 | explicit FormatParser(llvm::SourceMgr &mgr, llvm::SMLoc loc) |
469 | : lexer(mgr, loc), curToken(lexer.lexToken()) {} |
470 | |
471 | /// Allocate and construct a format element. |
472 | template <typename FormatElementT, typename... Args> |
473 | FormatElementT *create(Args &&...args) { |
474 | // FormatElementT *ptr = allocator.Allocate<FormatElementT>(); |
475 | // ::new (ptr) FormatElementT(std::forward<Args>(args)...); |
476 | // return ptr; |
477 | auto mem = std::make_unique<FormatElementT>(std::forward<Args>(args)...); |
478 | FormatElementT *ptr = mem.get(); |
479 | allocator.push_back(std::move(mem)); |
480 | return ptr; |
481 | } |
482 | |
483 | //===--------------------------------------------------------------------===// |
484 | // Element Parsing |
485 | |
486 | /// Parse a single element of any kind. |
487 | FailureOr<FormatElement *> parseElement(Context ctx); |
488 | /// Parse a literal. |
489 | FailureOr<FormatElement *> parseLiteral(Context ctx); |
490 | /// Parse a string. |
491 | FailureOr<FormatElement *> parseString(Context ctx); |
492 | /// Parse a variable. |
493 | FailureOr<FormatElement *> parseVariable(Context ctx); |
494 | /// Parse a directive. |
495 | FailureOr<FormatElement *> parseDirective(Context ctx); |
496 | /// Parse an optional group. |
497 | FailureOr<FormatElement *> parseOptionalGroup(Context ctx); |
498 | |
499 | /// Parse a custom directive. |
500 | FailureOr<FormatElement *> parseCustomDirective(llvm::SMLoc loc, Context ctx); |
501 | |
502 | /// Parse a format-specific variable kind. |
503 | virtual FailureOr<FormatElement *> |
504 | parseVariableImpl(llvm::SMLoc loc, StringRef name, Context ctx) = 0; |
505 | /// Parse a format-specific directive kind. |
506 | virtual FailureOr<FormatElement *> |
507 | parseDirectiveImpl(llvm::SMLoc loc, FormatToken::Kind kind, Context ctx) = 0; |
508 | |
509 | //===--------------------------------------------------------------------===// |
510 | // Format Verification |
511 | |
512 | /// Verify that the format is well-formed. |
513 | virtual LogicalResult verify(llvm::SMLoc loc, |
514 | ArrayRef<FormatElement *> elements) = 0; |
515 | /// Verify the arguments to a custom directive. |
516 | virtual LogicalResult |
517 | verifyCustomDirectiveArguments(llvm::SMLoc loc, |
518 | ArrayRef<FormatElement *> arguments) = 0; |
519 | /// Verify the elements of an optional group. |
520 | virtual LogicalResult |
521 | verifyOptionalGroupElements(llvm::SMLoc loc, |
522 | ArrayRef<FormatElement *> elements, |
523 | FormatElement *anchor) = 0; |
524 | |
525 | //===--------------------------------------------------------------------===// |
526 | // Lexer Utilities |
527 | |
528 | /// Emit an error at the given location. |
529 | LogicalResult emitError(llvm::SMLoc loc, const Twine &msg) { |
530 | lexer.emitError(loc, msg); |
531 | return failure(); |
532 | } |
533 | |
534 | /// Emit an error and a note at the given notation. |
535 | LogicalResult emitErrorAndNote(llvm::SMLoc loc, const Twine &msg, |
536 | const Twine ¬e) { |
537 | lexer.emitErrorAndNote(loc, msg, note); |
538 | return failure(); |
539 | } |
540 | |
541 | /// Parse a single token of the expected kind. |
542 | FailureOr<FormatToken> parseToken(FormatToken::Kind kind, const Twine &msg) { |
543 | if (!curToken.is(kind)) |
544 | return emitError(loc: curToken.getLoc(), msg); |
545 | FormatToken tok = curToken; |
546 | consumeToken(); |
547 | return tok; |
548 | } |
549 | |
550 | /// Advance the lexer to the next token. |
551 | void consumeToken() { |
552 | assert(!curToken.is(FormatToken::eof) && !curToken.is(FormatToken::error) && |
553 | "shouldn't advance past EOF or errors" ); |
554 | curToken = lexer.lexToken(); |
555 | } |
556 | |
557 | /// Get the current token. |
558 | FormatToken peekToken() { return curToken; } |
559 | |
560 | private: |
561 | /// The format parser retains ownership of the format elements in a bump |
562 | /// pointer allocator. |
563 | // FIXME: FormatElement with `std::vector` need to be converted to use |
564 | // trailing objects. |
565 | // llvm::BumpPtrAllocator allocator; |
566 | std::vector<std::unique_ptr<FormatElement>> allocator; |
567 | /// The format lexer to use. |
568 | FormatLexer lexer; |
569 | /// The current token in the lexer. |
570 | FormatToken curToken; |
571 | }; |
572 | |
573 | //===----------------------------------------------------------------------===// |
574 | // Utility Functions |
575 | //===----------------------------------------------------------------------===// |
576 | |
577 | /// Whether a space needs to be emitted before a literal. E.g., two keywords |
578 | /// back-to-back require a space separator, but a keyword followed by '<' does |
579 | /// not require a space. |
580 | bool shouldEmitSpaceBefore(StringRef value, bool lastWasPunctuation); |
581 | |
582 | /// Returns true if the given string can be formatted as a keyword. |
583 | bool canFormatStringAsKeyword(StringRef value, |
584 | function_ref<void(Twine)> emitError = nullptr); |
585 | |
586 | /// Returns true if the given string is valid format literal element. |
587 | /// If `emitError` is provided, it is invoked with the reason for the failure. |
588 | bool isValidLiteral(StringRef value, |
589 | function_ref<void(Twine)> emitError = nullptr); |
590 | |
591 | /// Whether a failure in parsing the assembly format should be a fatal error. |
592 | extern llvm::cl::opt<bool> formatErrorIsFatal; |
593 | |
594 | } // namespace tblgen |
595 | } // namespace mlir |
596 | |
597 | #endif // MLIR_TOOLS_MLIRTBLGEN_FORMATGEN_H_ |
598 | |