1 | //===-- ClangPseudo.cpp - Clang pseudoparser tool -------------------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | |
9 | #include "clang-pseudo/Bracket.h" |
10 | #include "clang-pseudo/DirectiveTree.h" |
11 | #include "clang-pseudo/Disambiguate.h" |
12 | #include "clang-pseudo/Forest.h" |
13 | #include "clang-pseudo/GLR.h" |
14 | #include "clang-pseudo/Language.h" |
15 | #include "clang-pseudo/Token.h" |
16 | #include "clang-pseudo/cli/CLI.h" |
17 | #include "clang-pseudo/grammar/Grammar.h" |
18 | #include "clang-pseudo/grammar/LRGraph.h" |
19 | #include "clang-pseudo/grammar/LRTable.h" |
20 | #include "clang/Basic/LangOptions.h" |
21 | #include "llvm/ADT/STLExtras.h" |
22 | #include "llvm/ADT/STLFunctionalExtras.h" |
23 | #include "llvm/Support/CommandLine.h" |
24 | #include "llvm/Support/FormatVariadic.h" |
25 | #include "llvm/Support/MemoryBuffer.h" |
26 | #include "llvm/Support/Signals.h" |
27 | #include <optional> |
28 | |
29 | using clang::pseudo::ForestNode; |
30 | using clang::pseudo::Token; |
31 | using clang::pseudo::TokenStream; |
32 | using llvm::cl::desc; |
33 | using llvm::cl::init; |
34 | using llvm::cl::opt; |
35 | |
36 | static opt<bool> PrintGrammar("print-grammar" , desc("Print the grammar" )); |
37 | static opt<bool> PrintGraph("print-graph" , |
38 | desc("Print the LR graph for the grammar" )); |
39 | static opt<bool> PrintTable("print-table" , |
40 | desc("Print the LR table for the grammar" )); |
41 | static opt<std::string> Source("source" , desc("Source file" )); |
42 | static opt<bool> PrintSource("print-source" , desc("Print token stream" )); |
43 | static opt<bool> PrintTokens("print-tokens" , desc("Print detailed token info" )); |
44 | static opt<bool> |
45 | PrintDirectiveTree("print-directive-tree" , |
46 | desc("Print directive structure of source code" )); |
47 | static opt<bool> |
48 | StripDirectives("strip-directives" , |
49 | desc("Strip directives and select conditional sections" )); |
50 | static opt<bool> Disambiguate("disambiguate" , |
51 | desc("Choose best tree from parse forest" )); |
52 | static opt<bool> PrintStatistics("print-statistics" , desc("Print GLR parser statistics" )); |
53 | static opt<bool> PrintForest("print-forest" , desc("Print parse forest" )); |
54 | static opt<bool> ForestAbbrev("forest-abbrev" , desc("Abbreviate parse forest" ), |
55 | init(Val: true)); |
56 | static opt<std::string> HTMLForest("html-forest" , |
57 | desc("output file for HTML forest" )); |
58 | static opt<std::string> StartSymbol("start-symbol" , |
59 | desc("Specify the start symbol to parse" ), |
60 | init(Val: "translation-unit" )); |
61 | |
62 | static std::string readOrDie(llvm::StringRef Path) { |
63 | llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> Text = |
64 | llvm::MemoryBuffer::getFile(Filename: Path); |
65 | if (std::error_code EC = Text.getError()) { |
66 | llvm::errs() << "Error: can't read file '" << Path |
67 | << "': " << EC.message() << "\n" ; |
68 | ::exit(status: 1); |
69 | } |
70 | return Text.get()->getBuffer().str(); |
71 | } |
72 | |
73 | namespace clang { |
74 | namespace pseudo { |
75 | // Defined in HTMLForest.cpp |
76 | void writeHTMLForest(llvm::raw_ostream &OS, const Grammar &, |
77 | const ForestNode &Root, const Disambiguation &, |
78 | const TokenStream &); |
79 | namespace { |
80 | |
81 | struct NodeStats { |
82 | unsigned Total = 0; |
83 | std::vector<std::pair<SymbolID, unsigned>> BySymbol; |
84 | |
85 | NodeStats(const ForestNode &Root, |
86 | llvm::function_ref<bool(const ForestNode &)> Filter) { |
87 | llvm::DenseMap<SymbolID, unsigned> Map; |
88 | for (const ForestNode &N : Root.descendants()) |
89 | if (Filter(N)) { |
90 | ++Total; |
91 | ++Map[N.symbol()]; |
92 | } |
93 | BySymbol = {Map.begin(), Map.end()}; |
94 | // Sort by count descending, then symbol ascending. |
95 | llvm::sort(C&: BySymbol, Comp: [](const auto &L, const auto &R) { |
96 | return std::tie(R.second, L.first) < std::tie(L.second, R.first); |
97 | }); |
98 | } |
99 | }; |
100 | |
101 | } // namespace |
102 | } // namespace pseudo |
103 | } // namespace clang |
104 | |
105 | int main(int argc, char *argv[]) { |
106 | llvm::cl::ParseCommandLineOptions(argc, argv, Overview: "" ); |
107 | llvm::sys::PrintStackTraceOnErrorSignal(Argv0: argv[0]); |
108 | |
109 | clang::LangOptions LangOpts = clang::pseudo::genericLangOpts(); |
110 | std::string SourceText; |
111 | std::optional<clang::pseudo::TokenStream> RawStream; |
112 | std::optional<TokenStream> PreprocessedStream; |
113 | std::optional<clang::pseudo::TokenStream> ParseableStream; |
114 | if (Source.getNumOccurrences()) { |
115 | SourceText = readOrDie(Path: Source); |
116 | RawStream = clang::pseudo::lex(SourceText, LangOpts); |
117 | TokenStream *Stream = &*RawStream; |
118 | |
119 | auto DirectiveStructure = clang::pseudo::DirectiveTree::parse(*RawStream); |
120 | clang::pseudo::chooseConditionalBranches(DirectiveStructure, Code: *RawStream); |
121 | |
122 | std::optional<TokenStream> Preprocessed; |
123 | if (StripDirectives) { |
124 | Preprocessed = DirectiveStructure.stripDirectives(*Stream); |
125 | Stream = &*Preprocessed; |
126 | } |
127 | |
128 | if (PrintSource) |
129 | Stream->print(llvm::outs()); |
130 | if (PrintTokens) |
131 | llvm::outs() << *Stream; |
132 | if (PrintDirectiveTree) |
133 | llvm::outs() << DirectiveStructure; |
134 | |
135 | ParseableStream = clang::pseudo::stripComments(cook(*Stream, LangOpts)); |
136 | pairBrackets(*ParseableStream); |
137 | } |
138 | |
139 | const auto &Lang = clang::pseudo::getLanguageFromFlags(); |
140 | if (PrintGrammar) |
141 | llvm::outs() << Lang.G.dump(); |
142 | if (PrintGraph) |
143 | llvm::outs() << clang::pseudo::LRGraph::buildLR0(Lang.G).dumpForTests( |
144 | Lang.G); |
145 | |
146 | if (PrintTable) |
147 | llvm::outs() << Lang.Table.dumpForTests(G: Lang.G); |
148 | if (PrintStatistics) |
149 | llvm::outs() << Lang.Table.dumpStatistics(); |
150 | |
151 | if (ParseableStream) { |
152 | clang::pseudo::ForestArena Arena; |
153 | clang::pseudo::GSS GSS; |
154 | std::optional<clang::pseudo::SymbolID> StartSymID = |
155 | Lang.G.findNonterminal(Name: StartSymbol); |
156 | if (!StartSymID) { |
157 | llvm::errs() << llvm::formatv( |
158 | Fmt: "The start symbol {0} doesn't exit in the grammar!\n" , Vals&: StartSymbol); |
159 | return 2; |
160 | } |
161 | auto &Root = |
162 | glrParse(Params: clang::pseudo::ParseParams{.Code: *ParseableStream, .Forest: Arena, .GSStack: GSS}, |
163 | StartSymbol: *StartSymID, Lang); |
164 | // If we're disambiguating, we'll print at the end instead. |
165 | if (PrintForest && !Disambiguate) |
166 | llvm::outs() << Root.dumpRecursive(Lang.G, /*Abbreviated=*/ForestAbbrev); |
167 | clang::pseudo::Disambiguation Disambig; |
168 | if (Disambiguate) |
169 | Disambig = clang::pseudo::disambiguate(Root: &Root, Params: {}); |
170 | |
171 | if (HTMLForest.getNumOccurrences()) { |
172 | std::error_code EC; |
173 | llvm::raw_fd_ostream HTMLOut(HTMLForest, EC); |
174 | if (EC) { |
175 | llvm::errs() << "Couldn't write " << HTMLForest << ": " << EC.message() |
176 | << "\n" ; |
177 | return 2; |
178 | } |
179 | clang::pseudo::writeHTMLForest(OS&: HTMLOut, Lang.G, Root, Disambig, |
180 | *ParseableStream); |
181 | } |
182 | |
183 | if (PrintStatistics) { |
184 | llvm::outs() << "Forest bytes: " << Arena.bytes() |
185 | << " nodes: " << Arena.nodeCount() << "\n" ; |
186 | llvm::outs() << "GSS bytes: " << GSS.bytes() |
187 | << " nodes: " << GSS.nodesCreated() << "\n" ; |
188 | |
189 | for (auto &P : {std::make_pair(x: "Ambiguous" , y: ForestNode::Ambiguous), |
190 | std::make_pair(x: "Opaque" , y: ForestNode::Opaque)}) { |
191 | clang::pseudo::NodeStats Stats( |
192 | Root, [&](const auto &N) { return N.kind() == P.second; }); |
193 | llvm::outs() << "\n" << Stats.Total << " " << P.first << " nodes:\n" ; |
194 | for (const auto &S : Stats.BySymbol) |
195 | llvm::outs() << llvm::formatv(Fmt: " {0,3} {1}\n" , Vals: S.second, |
196 | Vals: Lang.G.symbolName(S.first)); |
197 | } |
198 | |
199 | // Metrics for how imprecise parsing was. |
200 | // These are rough but aim to be: |
201 | // - linear: if we eliminate half the errors the metric should halve |
202 | // - length-independent |
203 | unsigned UnparsedTokens = 0; // Tokens covered by Opaque. (not unique) |
204 | unsigned Misparses = 0; // Sum of alternatives-1 |
205 | llvm::DenseSet<const ForestNode *> Visited; |
206 | auto DFS = [&](const ForestNode &N, Token::Index End, auto &DFS) -> void { |
207 | if (N.kind() == ForestNode::Opaque) { |
208 | UnparsedTokens += End - N.startTokenIndex(); |
209 | } else if (N.kind() == ForestNode::Ambiguous) { |
210 | Misparses += N.alternatives().size() - 1; |
211 | for (const auto *C : N.alternatives()) |
212 | if (Visited.insert(V: C).second) |
213 | DFS(*C, End, DFS); |
214 | } else if (N.kind() == ForestNode::Sequence) { |
215 | for (unsigned I = 0, E = N.children().size(); I < E; ++I) |
216 | if (Visited.insert(V: N.children()[I]).second) |
217 | DFS(*N.children()[I], |
218 | I + 1 == N.children().size() |
219 | ? End |
220 | : N.children()[I + 1]->startTokenIndex(), |
221 | DFS); |
222 | } |
223 | }; |
224 | unsigned Len = ParseableStream->tokens().size(); |
225 | DFS(Root, Len, DFS); |
226 | llvm::outs() << "\n" ; |
227 | llvm::outs() << llvm::formatv(Fmt: "Ambiguity: {0} misparses/token\n" , |
228 | Vals: double(Misparses) / Len); |
229 | llvm::outs() << llvm::formatv(Fmt: "Unparsed: {0}%\n" , |
230 | Vals: 100.0 * UnparsedTokens / Len); |
231 | } |
232 | |
233 | if (Disambiguate && PrintForest) { |
234 | ForestNode *DisambigRoot = &Root; |
235 | removeAmbiguities(Root&: DisambigRoot, Disambig); |
236 | llvm::outs() << "Disambiguated tree:\n" ; |
237 | llvm::outs() << DisambigRoot->dumpRecursive(Lang.G, |
238 | /*Abbreviated=*/ForestAbbrev); |
239 | } |
240 | } |
241 | |
242 | return 0; |
243 | } |
244 | |