1//===-- ClangPseudo.cpp - Clang pseudoparser tool -------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#include "clang-pseudo/Bracket.h"
10#include "clang-pseudo/DirectiveTree.h"
11#include "clang-pseudo/Disambiguate.h"
12#include "clang-pseudo/Forest.h"
13#include "clang-pseudo/GLR.h"
14#include "clang-pseudo/Language.h"
15#include "clang-pseudo/Token.h"
16#include "clang-pseudo/cli/CLI.h"
17#include "clang-pseudo/grammar/Grammar.h"
18#include "clang-pseudo/grammar/LRGraph.h"
19#include "clang-pseudo/grammar/LRTable.h"
20#include "clang/Basic/LangOptions.h"
21#include "llvm/ADT/STLExtras.h"
22#include "llvm/ADT/STLFunctionalExtras.h"
23#include "llvm/Support/CommandLine.h"
24#include "llvm/Support/FormatVariadic.h"
25#include "llvm/Support/MemoryBuffer.h"
26#include "llvm/Support/Signals.h"
27#include <optional>
28
29using clang::pseudo::ForestNode;
30using clang::pseudo::Token;
31using clang::pseudo::TokenStream;
32using llvm::cl::desc;
33using llvm::cl::init;
34using llvm::cl::opt;
35
36static opt<bool> PrintGrammar("print-grammar", desc("Print the grammar"));
37static opt<bool> PrintGraph("print-graph",
38 desc("Print the LR graph for the grammar"));
39static opt<bool> PrintTable("print-table",
40 desc("Print the LR table for the grammar"));
41static opt<std::string> Source("source", desc("Source file"));
42static opt<bool> PrintSource("print-source", desc("Print token stream"));
43static opt<bool> PrintTokens("print-tokens", desc("Print detailed token info"));
44static opt<bool>
45 PrintDirectiveTree("print-directive-tree",
46 desc("Print directive structure of source code"));
47static opt<bool>
48 StripDirectives("strip-directives",
49 desc("Strip directives and select conditional sections"));
50static opt<bool> Disambiguate("disambiguate",
51 desc("Choose best tree from parse forest"));
52static opt<bool> PrintStatistics("print-statistics", desc("Print GLR parser statistics"));
53static opt<bool> PrintForest("print-forest", desc("Print parse forest"));
54static opt<bool> ForestAbbrev("forest-abbrev", desc("Abbreviate parse forest"),
55 init(Val: true));
56static opt<std::string> HTMLForest("html-forest",
57 desc("output file for HTML forest"));
58static opt<std::string> StartSymbol("start-symbol",
59 desc("Specify the start symbol to parse"),
60 init(Val: "translation-unit"));
61
62static std::string readOrDie(llvm::StringRef Path) {
63 llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> Text =
64 llvm::MemoryBuffer::getFile(Filename: Path);
65 if (std::error_code EC = Text.getError()) {
66 llvm::errs() << "Error: can't read file '" << Path
67 << "': " << EC.message() << "\n";
68 ::exit(status: 1);
69 }
70 return Text.get()->getBuffer().str();
71}
72
73namespace clang {
74namespace pseudo {
75// Defined in HTMLForest.cpp
76void writeHTMLForest(llvm::raw_ostream &OS, const Grammar &,
77 const ForestNode &Root, const Disambiguation &,
78 const TokenStream &);
79namespace {
80
81struct NodeStats {
82 unsigned Total = 0;
83 std::vector<std::pair<SymbolID, unsigned>> BySymbol;
84
85 NodeStats(const ForestNode &Root,
86 llvm::function_ref<bool(const ForestNode &)> Filter) {
87 llvm::DenseMap<SymbolID, unsigned> Map;
88 for (const ForestNode &N : Root.descendants())
89 if (Filter(N)) {
90 ++Total;
91 ++Map[N.symbol()];
92 }
93 BySymbol = {Map.begin(), Map.end()};
94 // Sort by count descending, then symbol ascending.
95 llvm::sort(C&: BySymbol, Comp: [](const auto &L, const auto &R) {
96 return std::tie(R.second, L.first) < std::tie(L.second, R.first);
97 });
98 }
99};
100
101} // namespace
102} // namespace pseudo
103} // namespace clang
104
105int main(int argc, char *argv[]) {
106 llvm::cl::ParseCommandLineOptions(argc, argv, Overview: "");
107 llvm::sys::PrintStackTraceOnErrorSignal(Argv0: argv[0]);
108
109 clang::LangOptions LangOpts = clang::pseudo::genericLangOpts();
110 std::string SourceText;
111 std::optional<clang::pseudo::TokenStream> RawStream;
112 std::optional<TokenStream> PreprocessedStream;
113 std::optional<clang::pseudo::TokenStream> ParseableStream;
114 if (Source.getNumOccurrences()) {
115 SourceText = readOrDie(Path: Source);
116 RawStream = clang::pseudo::lex(SourceText, LangOpts);
117 TokenStream *Stream = &*RawStream;
118
119 auto DirectiveStructure = clang::pseudo::DirectiveTree::parse(*RawStream);
120 clang::pseudo::chooseConditionalBranches(DirectiveStructure, Code: *RawStream);
121
122 std::optional<TokenStream> Preprocessed;
123 if (StripDirectives) {
124 Preprocessed = DirectiveStructure.stripDirectives(*Stream);
125 Stream = &*Preprocessed;
126 }
127
128 if (PrintSource)
129 Stream->print(llvm::outs());
130 if (PrintTokens)
131 llvm::outs() << *Stream;
132 if (PrintDirectiveTree)
133 llvm::outs() << DirectiveStructure;
134
135 ParseableStream = clang::pseudo::stripComments(cook(*Stream, LangOpts));
136 pairBrackets(*ParseableStream);
137 }
138
139 const auto &Lang = clang::pseudo::getLanguageFromFlags();
140 if (PrintGrammar)
141 llvm::outs() << Lang.G.dump();
142 if (PrintGraph)
143 llvm::outs() << clang::pseudo::LRGraph::buildLR0(Lang.G).dumpForTests(
144 Lang.G);
145
146 if (PrintTable)
147 llvm::outs() << Lang.Table.dumpForTests(G: Lang.G);
148 if (PrintStatistics)
149 llvm::outs() << Lang.Table.dumpStatistics();
150
151 if (ParseableStream) {
152 clang::pseudo::ForestArena Arena;
153 clang::pseudo::GSS GSS;
154 std::optional<clang::pseudo::SymbolID> StartSymID =
155 Lang.G.findNonterminal(Name: StartSymbol);
156 if (!StartSymID) {
157 llvm::errs() << llvm::formatv(
158 Fmt: "The start symbol {0} doesn't exit in the grammar!\n", Vals&: StartSymbol);
159 return 2;
160 }
161 auto &Root =
162 glrParse(Params: clang::pseudo::ParseParams{.Code: *ParseableStream, .Forest: Arena, .GSStack: GSS},
163 StartSymbol: *StartSymID, Lang);
164 // If we're disambiguating, we'll print at the end instead.
165 if (PrintForest && !Disambiguate)
166 llvm::outs() << Root.dumpRecursive(Lang.G, /*Abbreviated=*/ForestAbbrev);
167 clang::pseudo::Disambiguation Disambig;
168 if (Disambiguate)
169 Disambig = clang::pseudo::disambiguate(Root: &Root, Params: {});
170
171 if (HTMLForest.getNumOccurrences()) {
172 std::error_code EC;
173 llvm::raw_fd_ostream HTMLOut(HTMLForest, EC);
174 if (EC) {
175 llvm::errs() << "Couldn't write " << HTMLForest << ": " << EC.message()
176 << "\n";
177 return 2;
178 }
179 clang::pseudo::writeHTMLForest(OS&: HTMLOut, Lang.G, Root, Disambig,
180 *ParseableStream);
181 }
182
183 if (PrintStatistics) {
184 llvm::outs() << "Forest bytes: " << Arena.bytes()
185 << " nodes: " << Arena.nodeCount() << "\n";
186 llvm::outs() << "GSS bytes: " << GSS.bytes()
187 << " nodes: " << GSS.nodesCreated() << "\n";
188
189 for (auto &P : {std::make_pair(x: "Ambiguous", y: ForestNode::Ambiguous),
190 std::make_pair(x: "Opaque", y: ForestNode::Opaque)}) {
191 clang::pseudo::NodeStats Stats(
192 Root, [&](const auto &N) { return N.kind() == P.second; });
193 llvm::outs() << "\n" << Stats.Total << " " << P.first << " nodes:\n";
194 for (const auto &S : Stats.BySymbol)
195 llvm::outs() << llvm::formatv(Fmt: " {0,3} {1}\n", Vals: S.second,
196 Vals: Lang.G.symbolName(S.first));
197 }
198
199 // Metrics for how imprecise parsing was.
200 // These are rough but aim to be:
201 // - linear: if we eliminate half the errors the metric should halve
202 // - length-independent
203 unsigned UnparsedTokens = 0; // Tokens covered by Opaque. (not unique)
204 unsigned Misparses = 0; // Sum of alternatives-1
205 llvm::DenseSet<const ForestNode *> Visited;
206 auto DFS = [&](const ForestNode &N, Token::Index End, auto &DFS) -> void {
207 if (N.kind() == ForestNode::Opaque) {
208 UnparsedTokens += End - N.startTokenIndex();
209 } else if (N.kind() == ForestNode::Ambiguous) {
210 Misparses += N.alternatives().size() - 1;
211 for (const auto *C : N.alternatives())
212 if (Visited.insert(V: C).second)
213 DFS(*C, End, DFS);
214 } else if (N.kind() == ForestNode::Sequence) {
215 for (unsigned I = 0, E = N.children().size(); I < E; ++I)
216 if (Visited.insert(V: N.children()[I]).second)
217 DFS(*N.children()[I],
218 I + 1 == N.children().size()
219 ? End
220 : N.children()[I + 1]->startTokenIndex(),
221 DFS);
222 }
223 };
224 unsigned Len = ParseableStream->tokens().size();
225 DFS(Root, Len, DFS);
226 llvm::outs() << "\n";
227 llvm::outs() << llvm::formatv(Fmt: "Ambiguity: {0} misparses/token\n",
228 Vals: double(Misparses) / Len);
229 llvm::outs() << llvm::formatv(Fmt: "Unparsed: {0}%\n",
230 Vals: 100.0 * UnparsedTokens / Len);
231 }
232
233 if (Disambiguate && PrintForest) {
234 ForestNode *DisambigRoot = &Root;
235 removeAmbiguities(Root&: DisambigRoot, Disambig);
236 llvm::outs() << "Disambiguated tree:\n";
237 llvm::outs() << DisambigRoot->dumpRecursive(Lang.G,
238 /*Abbreviated=*/ForestAbbrev);
239 }
240 }
241
242 return 0;
243}
244

// Source: clang-tools-extra/pseudo/tool/ClangPseudo.cpp