1//===--- Markup.cpp -----------------------------------------*- C++-*------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8#include "support/Markup.h"
9#include "llvm/ADT/ArrayRef.h"
10#include "llvm/ADT/STLExtras.h"
11#include "llvm/ADT/SmallVector.h"
12#include "llvm/ADT/StringExtras.h"
13#include "llvm/ADT/StringRef.h"
14#include "llvm/Support/Compiler.h"
15#include "llvm/Support/raw_ostream.h"
16#include <cstddef>
17#include <iterator>
18#include <memory>
19#include <string>
20#include <vector>
21
22namespace clang {
23namespace clangd {
24namespace markup {
25namespace {
26
27// Is <contents a plausible start to an HTML tag?
28// Contents may not be the rest of the line, but it's the rest of the plain
29// text, so we expect to see at least the tag name.
30bool looksLikeTag(llvm::StringRef Contents) {
31 if (Contents.empty())
32 return false;
33 if (Contents.front() == '!' || Contents.front() == '?' ||
34 Contents.front() == '/')
35 return true;
36 // Check the start of the tag name.
37 if (!llvm::isAlpha(C: Contents.front()))
38 return false;
39 // Drop rest of the tag name, and following whitespace.
40 Contents = Contents
41 .drop_while(F: [](char C) {
42 return llvm::isAlnum(C) || C == '-' || C == '_' || C == ':';
43 })
44 .drop_while(F: llvm::isSpace);
45 // The rest of the tag consists of attributes, which have restrictive names.
46 // If we hit '=', all bets are off (attribute values can contain anything).
47 for (; !Contents.empty(); Contents = Contents.drop_front()) {
48 if (llvm::isAlnum(C: Contents.front()) || llvm::isSpace(C: Contents.front()))
49 continue;
50 if (Contents.front() == '>' || Contents.starts_with(Prefix: "/>"))
51 return true; // May close the tag.
52 if (Contents.front() == '=')
53 return true; // Don't try to parse attribute values.
54 return false; // Random punctuation means this isn't a tag.
55 }
56 return true; // Potentially incomplete tag.
57}
58
59// Tests whether C should be backslash-escaped in markdown.
60// The string being escaped is Before + C + After. This is part of a paragraph.
61// StartsLine indicates whether `Before` is the start of the line.
62// After may not be everything until the end of the line.
63//
64// It's always safe to escape punctuation, but want minimal escaping.
65// The strategy is to escape the first character of anything that might start
66// a markdown grammar construct.
67bool needsLeadingEscape(char C, llvm::StringRef Before, llvm::StringRef After,
68 bool StartsLine) {
69 assert(Before.take_while(llvm::isSpace).empty());
70 auto RulerLength = [&]() -> /*Length*/ unsigned {
71 if (!StartsLine || !Before.empty())
72 return false;
73 llvm::StringRef A = After.rtrim();
74 return llvm::all_of(Range&: A, P: [C](char D) { return C == D; }) ? 1 + A.size() : 0;
75 };
76 auto IsBullet = [&]() {
77 return StartsLine && Before.empty() &&
78 (After.empty() || After.starts_with(Prefix: " "));
79 };
80 auto SpaceSurrounds = [&]() {
81 return (After.empty() || llvm::isSpace(C: After.front())) &&
82 (Before.empty() || llvm::isSpace(C: Before.back()));
83 };
84 auto WordSurrounds = [&]() {
85 return (!After.empty() && llvm::isAlnum(C: After.front())) &&
86 (!Before.empty() && llvm::isAlnum(C: Before.back()));
87 };
88
89 switch (C) {
90 case '\\': // Escaped character.
91 return true;
92 case '`': // Code block or inline code
93 // Any number of backticks can delimit an inline code block that can end
94 // anywhere (including on another line). We must escape them all.
95 return true;
96 case '~': // Code block
97 return StartsLine && Before.empty() && After.starts_with(Prefix: "~~");
98 case '#': { // ATX heading.
99 if (!StartsLine || !Before.empty())
100 return false;
101 llvm::StringRef Rest = After.ltrim(Char: C);
102 return Rest.empty() || Rest.starts_with(Prefix: " ");
103 }
104 case ']': // Link or link reference.
105 // We escape ] rather than [ here, because it's more constrained:
106 // ](...) is an in-line link
107 // ]: is a link reference
108 // The following are only links if the link reference exists:
109 // ] by itself is a shortcut link
110 // ][...] is an out-of-line link
111 // Because we never emit link references, we don't need to handle these.
112 return After.starts_with(Prefix: ":") || After.starts_with(Prefix: "(");
113 case '=': // Setex heading.
114 return RulerLength() > 0;
115 case '_': // Horizontal ruler or matched delimiter.
116 if (RulerLength() >= 3)
117 return true;
118 // Not a delimiter if surrounded by space, or inside a word.
119 // (The rules at word boundaries are subtle).
120 return !(SpaceSurrounds() || WordSurrounds());
121 case '-': // Setex heading, horizontal ruler, or bullet.
122 if (RulerLength() > 0)
123 return true;
124 return IsBullet();
125 case '+': // Bullet list.
126 return IsBullet();
127 case '*': // Bullet list, horizontal ruler, or delimiter.
128 return IsBullet() || RulerLength() >= 3 || !SpaceSurrounds();
129 case '<': // HTML tag (or autolink, which we choose not to escape)
130 return looksLikeTag(Contents: After);
131 case '>': // Quote marker. Needs escaping at start of line.
132 return StartsLine && Before.empty();
133 case '&': { // HTML entity reference
134 auto End = After.find(C: ';');
135 if (End == llvm::StringRef::npos)
136 return false;
137 llvm::StringRef Content = After.substr(Start: 0, N: End);
138 if (Content.consume_front(Prefix: "#")) {
139 if (Content.consume_front(Prefix: "x") || Content.consume_front(Prefix: "X"))
140 return llvm::all_of(Range&: Content, P: llvm::isHexDigit);
141 return llvm::all_of(Range&: Content, P: llvm::isDigit);
142 }
143 return llvm::all_of(Range&: Content, P: llvm::isAlpha);
144 }
145 case '.': // Numbered list indicator. Escape 12. -> 12\. at start of line.
146 case ')':
147 return StartsLine && !Before.empty() &&
148 llvm::all_of(Range&: Before, P: llvm::isDigit) && After.starts_with(Prefix: " ");
149 default:
150 return false;
151 }
152}
153
154/// Escape a markdown text block. Ensures the punctuation will not introduce
155/// any of the markdown constructs.
156std::string renderText(llvm::StringRef Input, bool StartsLine) {
157 std::string R;
158 for (unsigned I = 0; I < Input.size(); ++I) {
159 if (needsLeadingEscape(C: Input[I], Before: Input.substr(Start: 0, N: I), After: Input.substr(Start: I + 1),
160 StartsLine))
161 R.push_back(c: '\\');
162 R.push_back(c: Input[I]);
163 }
164 return R;
165}
166
167/// Renders \p Input as an inline block of code in markdown. The returned value
168/// is surrounded by backticks and the inner contents are properly escaped.
169std::string renderInlineBlock(llvm::StringRef Input) {
170 std::string R;
171 // Double all backticks to make sure we don't close the inline block early.
172 for (size_t From = 0; From < Input.size();) {
173 size_t Next = Input.find(Str: "`", From);
174 R += Input.substr(Start: From, N: Next - From);
175 if (Next == llvm::StringRef::npos)
176 break;
177 R += "``"; // double the found backtick.
178
179 From = Next + 1;
180 }
181 // If results starts with a backtick, add spaces on both sides. The spaces
182 // are ignored by markdown renderers.
183 if (llvm::StringRef(R).starts_with(Prefix: "`") || llvm::StringRef(R).ends_with(Suffix: "`"))
184 return "` " + std::move(R) + " `";
185 // Markdown render should ignore first and last space if both are there. We
186 // add an extra pair of spaces in that case to make sure we render what the
187 // user intended.
188 if (llvm::StringRef(R).starts_with(Prefix: " ") && llvm::StringRef(R).ends_with(Suffix: " "))
189 return "` " + std::move(R) + " `";
190 return "`" + std::move(R) + "`";
191}
192
193/// Get marker required for \p Input to represent a markdown codeblock. It
194/// consists of at least 3 backticks(`). Although markdown also allows to use
195/// tilde(~) for code blocks, they are never used.
196std::string getMarkerForCodeBlock(llvm::StringRef Input) {
197 // Count the maximum number of consecutive backticks in \p Input. We need to
198 // start and end the code block with more.
199 unsigned MaxBackticks = 0;
200 unsigned Backticks = 0;
201 for (char C : Input) {
202 if (C == '`') {
203 ++Backticks;
204 continue;
205 }
206 MaxBackticks = std::max(a: MaxBackticks, b: Backticks);
207 Backticks = 0;
208 }
209 MaxBackticks = std::max(a: Backticks, b: MaxBackticks);
210 // Use the corresponding number of backticks to start and end a code block.
211 return std::string(/*Repeat=*/std::max(a: 3u, b: MaxBackticks + 1), '`');
212}
213
214// Trims the input and concatenates whitespace blocks into a single ` `.
215std::string canonicalizeSpaces(llvm::StringRef Input) {
216 llvm::SmallVector<llvm::StringRef> Words;
217 llvm::SplitString(Source: Input, OutFragments&: Words);
218 return llvm::join(R&: Words, Separator: " ");
219}
220
221std::string renderBlocks(llvm::ArrayRef<std::unique_ptr<Block>> Children,
222 void (Block::*RenderFunc)(llvm::raw_ostream &) const) {
223 std::string R;
224 llvm::raw_string_ostream OS(R);
225
226 // Trim rulers.
227 Children = Children.drop_while(
228 Pred: [](const std::unique_ptr<Block> &C) { return C->isRuler(); });
229 auto Last = llvm::find_if(
230 Range: llvm::reverse(C&: Children),
231 P: [](const std::unique_ptr<Block> &C) { return !C->isRuler(); });
232 Children = Children.drop_back(N: Children.end() - Last.base());
233
234 bool LastBlockWasRuler = true;
235 for (const auto &C : Children) {
236 if (C->isRuler() && LastBlockWasRuler)
237 continue;
238 LastBlockWasRuler = C->isRuler();
239 ((*C).*RenderFunc)(OS);
240 }
241
242 // Get rid of redundant empty lines introduced in plaintext while imitating
243 // padding in markdown.
244 std::string AdjustedResult;
245 llvm::StringRef TrimmedText(OS.str());
246 TrimmedText = TrimmedText.trim();
247
248 llvm::copy_if(Range&: TrimmedText, Out: std::back_inserter(x&: AdjustedResult),
249 P: [&TrimmedText](const char &C) {
250 return !llvm::StringRef(TrimmedText.data(),
251 &C - TrimmedText.data() + 1)
252 // We allow at most two newlines.
253 .ends_with(Suffix: "\n\n\n");
254 });
255
256 return AdjustedResult;
257}
258
259// Separates two blocks with extra spacing. Note that it might render strangely
260// in vscode if the trailing block is a codeblock, see
261// https://github.com/microsoft/vscode/issues/88416 for details.
262class Ruler : public Block {
263public:
264 void renderMarkdown(llvm::raw_ostream &OS) const override {
265 // Note that we need an extra new line before the ruler, otherwise we might
266 // make previous block a title instead of introducing a ruler.
267 OS << "\n---\n";
268 }
269 void renderPlainText(llvm::raw_ostream &OS) const override { OS << '\n'; }
270 std::unique_ptr<Block> clone() const override {
271 return std::make_unique<Ruler>(args: *this);
272 }
273 bool isRuler() const override { return true; }
274};
275
276class CodeBlock : public Block {
277public:
278 void renderMarkdown(llvm::raw_ostream &OS) const override {
279 std::string Marker = getMarkerForCodeBlock(Input: Contents);
280 // No need to pad from previous blocks, as they should end with a new line.
281 OS << Marker << Language << '\n' << Contents << '\n' << Marker << '\n';
282 }
283
284 void renderPlainText(llvm::raw_ostream &OS) const override {
285 // In plaintext we want one empty line before and after codeblocks.
286 OS << '\n' << Contents << "\n\n";
287 }
288
289 std::unique_ptr<Block> clone() const override {
290 return std::make_unique<CodeBlock>(args: *this);
291 }
292
293 CodeBlock(std::string Contents, std::string Language)
294 : Contents(std::move(Contents)), Language(std::move(Language)) {}
295
296private:
297 std::string Contents;
298 std::string Language;
299};
300
301// Inserts two spaces after each `\n` to indent each line. First line is not
302// indented.
303std::string indentLines(llvm::StringRef Input) {
304 assert(!Input.ends_with("\n") && "Input should've been trimmed.");
305 std::string IndentedR;
306 // We'll add 2 spaces after each new line.
307 IndentedR.reserve(res: Input.size() + Input.count(C: '\n') * 2);
308 for (char C : Input) {
309 IndentedR += C;
310 if (C == '\n')
311 IndentedR.append(s: " ");
312 }
313 return IndentedR;
314}
315
316class Heading : public Paragraph {
317public:
318 Heading(size_t Level) : Level(Level) {}
319 void renderMarkdown(llvm::raw_ostream &OS) const override {
320 OS << std::string(Level, '#') << ' ';
321 Paragraph::renderMarkdown(OS);
322 }
323
324private:
325 size_t Level;
326};
327
328} // namespace
329
330std::string Block::asMarkdown() const {
331 std::string R;
332 llvm::raw_string_ostream OS(R);
333 renderMarkdown(OS);
334 return llvm::StringRef(OS.str()).trim().str();
335}
336
337std::string Block::asPlainText() const {
338 std::string R;
339 llvm::raw_string_ostream OS(R);
340 renderPlainText(OS);
341 return llvm::StringRef(OS.str()).trim().str();
342}
343
344void Paragraph::renderMarkdown(llvm::raw_ostream &OS) const {
345 bool NeedsSpace = false;
346 bool HasChunks = false;
347 for (auto &C : Chunks) {
348 if (C.SpaceBefore || NeedsSpace)
349 OS << " ";
350 switch (C.Kind) {
351 case Chunk::PlainText:
352 OS << renderText(Input: C.Contents, StartsLine: !HasChunks);
353 break;
354 case Chunk::InlineCode:
355 OS << renderInlineBlock(Input: C.Contents);
356 break;
357 }
358 HasChunks = true;
359 NeedsSpace = C.SpaceAfter;
360 }
361 // Paragraphs are translated into markdown lines, not markdown paragraphs.
362 // Therefore it only has a single linebreak afterwards.
363 // VSCode requires two spaces at the end of line to start a new one.
364 OS << " \n";
365}
366
367std::unique_ptr<Block> Paragraph::clone() const {
368 return std::make_unique<Paragraph>(args: *this);
369}
370
371/// Choose a marker to delimit `Text` from a prioritized list of options.
372/// This is more readable than escaping for plain-text.
373llvm::StringRef chooseMarker(llvm::ArrayRef<llvm::StringRef> Options,
374 llvm::StringRef Text) {
375 // Prefer a delimiter whose characters don't appear in the text.
376 for (llvm::StringRef S : Options)
377 if (Text.find_first_of(Chars: S) == llvm::StringRef::npos)
378 return S;
379 return Options.front();
380}
381
382void Paragraph::renderPlainText(llvm::raw_ostream &OS) const {
383 bool NeedsSpace = false;
384 for (auto &C : Chunks) {
385 if (C.SpaceBefore || NeedsSpace)
386 OS << " ";
387 llvm::StringRef Marker = "";
388 if (C.Preserve && C.Kind == Chunk::InlineCode)
389 Marker = chooseMarker(Options: {"`", "'", "\""}, Text: C.Contents);
390 OS << Marker << C.Contents << Marker;
391 NeedsSpace = C.SpaceAfter;
392 }
393 OS << '\n';
394}
395
396BulletList::BulletList() = default;
397BulletList::~BulletList() = default;
398
399void BulletList::renderMarkdown(llvm::raw_ostream &OS) const {
400 for (auto &D : Items) {
401 // Instead of doing this we might prefer passing Indent to children to get
402 // rid of the copies, if it turns out to be a bottleneck.
403 OS << "- " << indentLines(Input: D.asMarkdown()) << '\n';
404 }
405 // We need a new line after list to terminate it in markdown.
406 OS << '\n';
407}
408
409void BulletList::renderPlainText(llvm::raw_ostream &OS) const {
410 for (auto &D : Items) {
411 // Instead of doing this we might prefer passing Indent to children to get
412 // rid of the copies, if it turns out to be a bottleneck.
413 OS << "- " << indentLines(Input: D.asPlainText()) << '\n';
414 }
415}
416
417Paragraph &Paragraph::appendSpace() {
418 if (!Chunks.empty())
419 Chunks.back().SpaceAfter = true;
420 return *this;
421}
422
423Paragraph &Paragraph::appendText(llvm::StringRef Text) {
424 std::string Norm = canonicalizeSpaces(Input: Text);
425 if (Norm.empty())
426 return *this;
427 Chunks.emplace_back();
428 Chunk &C = Chunks.back();
429 C.Contents = std::move(Norm);
430 C.Kind = Chunk::PlainText;
431 C.SpaceBefore = llvm::isSpace(C: Text.front());
432 C.SpaceAfter = llvm::isSpace(C: Text.back());
433 return *this;
434}
435
436Paragraph &Paragraph::appendCode(llvm::StringRef Code, bool Preserve) {
437 bool AdjacentCode =
438 !Chunks.empty() && Chunks.back().Kind == Chunk::InlineCode;
439 std::string Norm = canonicalizeSpaces(Input: std::move(Code));
440 if (Norm.empty())
441 return *this;
442 Chunks.emplace_back();
443 Chunk &C = Chunks.back();
444 C.Contents = std::move(Norm);
445 C.Kind = Chunk::InlineCode;
446 C.Preserve = Preserve;
447 // Disallow adjacent code spans without spaces, markdown can't render them.
448 C.SpaceBefore = AdjacentCode;
449 return *this;
450}
451
452std::unique_ptr<Block> BulletList::clone() const {
453 return std::make_unique<BulletList>(args: *this);
454}
455
456class Document &BulletList::addItem() {
457 Items.emplace_back();
458 return Items.back();
459}
460
461Document &Document::operator=(const Document &Other) {
462 Children.clear();
463 for (const auto &C : Other.Children)
464 Children.push_back(x: C->clone());
465 return *this;
466}
467
468void Document::append(Document Other) {
469 std::move(first: Other.Children.begin(), last: Other.Children.end(),
470 result: std::back_inserter(x&: Children));
471}
472
473Paragraph &Document::addParagraph() {
474 Children.push_back(x: std::make_unique<Paragraph>());
475 return *static_cast<Paragraph *>(Children.back().get());
476}
477
478void Document::addRuler() { Children.push_back(x: std::make_unique<Ruler>()); }
479
480void Document::addCodeBlock(std::string Code, std::string Language) {
481 Children.emplace_back(
482 args: std::make_unique<CodeBlock>(args: std::move(Code), args: std::move(Language)));
483}
484
485std::string Document::asMarkdown() const {
486 return renderBlocks(Children, RenderFunc: &Block::renderMarkdown);
487}
488
489std::string Document::asPlainText() const {
490 return renderBlocks(Children, RenderFunc: &Block::renderPlainText);
491}
492
493BulletList &Document::addBulletList() {
494 Children.emplace_back(args: std::make_unique<BulletList>());
495 return *static_cast<BulletList *>(Children.back().get());
496}
497
498Paragraph &Document::addHeading(size_t Level) {
499 assert(Level > 0);
500 Children.emplace_back(args: std::make_unique<Heading>(args&: Level));
501 return *static_cast<Paragraph *>(Children.back().get());
502}
503} // namespace markup
504} // namespace clangd
505} // namespace clang
506

// Source: clang-tools-extra/clangd/support/Markup.cpp