1 | //===--- Markup.cpp -----------------------------------------*- C++-*------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | #include "support/Markup.h" |
9 | #include "llvm/ADT/ArrayRef.h" |
10 | #include "llvm/ADT/STLExtras.h" |
11 | #include "llvm/ADT/SmallVector.h" |
12 | #include "llvm/ADT/StringExtras.h" |
13 | #include "llvm/ADT/StringRef.h" |
14 | #include "llvm/Support/Compiler.h" |
15 | #include "llvm/Support/raw_ostream.h" |
16 | #include <cstddef> |
17 | #include <iterator> |
18 | #include <memory> |
19 | #include <string> |
20 | #include <vector> |
21 | |
22 | namespace clang { |
23 | namespace clangd { |
24 | namespace markup { |
25 | namespace { |
26 | |
27 | // Is <contents a plausible start to an HTML tag? |
28 | // Contents may not be the rest of the line, but it's the rest of the plain |
29 | // text, so we expect to see at least the tag name. |
30 | bool looksLikeTag(llvm::StringRef Contents) { |
31 | if (Contents.empty()) |
32 | return false; |
33 | if (Contents.front() == '!' || Contents.front() == '?' || |
34 | Contents.front() == '/') |
35 | return true; |
36 | // Check the start of the tag name. |
37 | if (!llvm::isAlpha(C: Contents.front())) |
38 | return false; |
39 | // Drop rest of the tag name, and following whitespace. |
40 | Contents = Contents |
41 | .drop_while(F: [](char C) { |
42 | return llvm::isAlnum(C) || C == '-' || C == '_' || C == ':'; |
43 | }) |
44 | .drop_while(F: llvm::isSpace); |
45 | // The rest of the tag consists of attributes, which have restrictive names. |
46 | // If we hit '=', all bets are off (attribute values can contain anything). |
47 | for (; !Contents.empty(); Contents = Contents.drop_front()) { |
48 | if (llvm::isAlnum(C: Contents.front()) || llvm::isSpace(C: Contents.front())) |
49 | continue; |
50 | if (Contents.front() == '>' || Contents.starts_with(Prefix: "/>" )) |
51 | return true; // May close the tag. |
52 | if (Contents.front() == '=') |
53 | return true; // Don't try to parse attribute values. |
54 | return false; // Random punctuation means this isn't a tag. |
55 | } |
56 | return true; // Potentially incomplete tag. |
57 | } |
58 | |
59 | // Tests whether C should be backslash-escaped in markdown. |
60 | // The string being escaped is Before + C + After. This is part of a paragraph. |
61 | // StartsLine indicates whether `Before` is the start of the line. |
62 | // After may not be everything until the end of the line. |
63 | // |
64 | // It's always safe to escape punctuation, but want minimal escaping. |
65 | // The strategy is to escape the first character of anything that might start |
66 | // a markdown grammar construct. |
67 | bool needsLeadingEscape(char C, llvm::StringRef Before, llvm::StringRef After, |
68 | bool StartsLine) { |
69 | assert(Before.take_while(llvm::isSpace).empty()); |
70 | auto RulerLength = [&]() -> /*Length*/ unsigned { |
71 | if (!StartsLine || !Before.empty()) |
72 | return false; |
73 | llvm::StringRef A = After.rtrim(); |
74 | return llvm::all_of(Range&: A, P: [C](char D) { return C == D; }) ? 1 + A.size() : 0; |
75 | }; |
76 | auto IsBullet = [&]() { |
77 | return StartsLine && Before.empty() && |
78 | (After.empty() || After.starts_with(Prefix: " " )); |
79 | }; |
80 | auto SpaceSurrounds = [&]() { |
81 | return (After.empty() || llvm::isSpace(C: After.front())) && |
82 | (Before.empty() || llvm::isSpace(C: Before.back())); |
83 | }; |
84 | auto WordSurrounds = [&]() { |
85 | return (!After.empty() && llvm::isAlnum(C: After.front())) && |
86 | (!Before.empty() && llvm::isAlnum(C: Before.back())); |
87 | }; |
88 | |
89 | switch (C) { |
90 | case '\\': // Escaped character. |
91 | return true; |
92 | case '`': // Code block or inline code |
93 | // Any number of backticks can delimit an inline code block that can end |
94 | // anywhere (including on another line). We must escape them all. |
95 | return true; |
96 | case '~': // Code block |
97 | return StartsLine && Before.empty() && After.starts_with(Prefix: "~~" ); |
98 | case '#': { // ATX heading. |
99 | if (!StartsLine || !Before.empty()) |
100 | return false; |
101 | llvm::StringRef Rest = After.ltrim(Char: C); |
102 | return Rest.empty() || Rest.starts_with(Prefix: " " ); |
103 | } |
104 | case ']': // Link or link reference. |
105 | // We escape ] rather than [ here, because it's more constrained: |
106 | // ](...) is an in-line link |
107 | // ]: is a link reference |
108 | // The following are only links if the link reference exists: |
109 | // ] by itself is a shortcut link |
110 | // ][...] is an out-of-line link |
111 | // Because we never emit link references, we don't need to handle these. |
112 | return After.starts_with(Prefix: ":" ) || After.starts_with(Prefix: "(" ); |
113 | case '=': // Setex heading. |
114 | return RulerLength() > 0; |
115 | case '_': // Horizontal ruler or matched delimiter. |
116 | if (RulerLength() >= 3) |
117 | return true; |
118 | // Not a delimiter if surrounded by space, or inside a word. |
119 | // (The rules at word boundaries are subtle). |
120 | return !(SpaceSurrounds() || WordSurrounds()); |
121 | case '-': // Setex heading, horizontal ruler, or bullet. |
122 | if (RulerLength() > 0) |
123 | return true; |
124 | return IsBullet(); |
125 | case '+': // Bullet list. |
126 | return IsBullet(); |
127 | case '*': // Bullet list, horizontal ruler, or delimiter. |
128 | return IsBullet() || RulerLength() >= 3 || !SpaceSurrounds(); |
129 | case '<': // HTML tag (or autolink, which we choose not to escape) |
130 | return looksLikeTag(Contents: After); |
131 | case '>': // Quote marker. Needs escaping at start of line. |
132 | return StartsLine && Before.empty(); |
133 | case '&': { // HTML entity reference |
134 | auto End = After.find(C: ';'); |
135 | if (End == llvm::StringRef::npos) |
136 | return false; |
137 | llvm::StringRef Content = After.substr(Start: 0, N: End); |
138 | if (Content.consume_front(Prefix: "#" )) { |
139 | if (Content.consume_front(Prefix: "x" ) || Content.consume_front(Prefix: "X" )) |
140 | return llvm::all_of(Range&: Content, P: llvm::isHexDigit); |
141 | return llvm::all_of(Range&: Content, P: llvm::isDigit); |
142 | } |
143 | return llvm::all_of(Range&: Content, P: llvm::isAlpha); |
144 | } |
145 | case '.': // Numbered list indicator. Escape 12. -> 12\. at start of line. |
146 | case ')': |
147 | return StartsLine && !Before.empty() && |
148 | llvm::all_of(Range&: Before, P: llvm::isDigit) && After.starts_with(Prefix: " " ); |
149 | default: |
150 | return false; |
151 | } |
152 | } |
153 | |
154 | /// Escape a markdown text block. Ensures the punctuation will not introduce |
155 | /// any of the markdown constructs. |
156 | std::string renderText(llvm::StringRef Input, bool StartsLine) { |
157 | std::string R; |
158 | for (unsigned I = 0; I < Input.size(); ++I) { |
159 | if (needsLeadingEscape(C: Input[I], Before: Input.substr(Start: 0, N: I), After: Input.substr(Start: I + 1), |
160 | StartsLine)) |
161 | R.push_back(c: '\\'); |
162 | R.push_back(c: Input[I]); |
163 | } |
164 | return R; |
165 | } |
166 | |
167 | /// Renders \p Input as an inline block of code in markdown. The returned value |
168 | /// is surrounded by backticks and the inner contents are properly escaped. |
169 | std::string renderInlineBlock(llvm::StringRef Input) { |
170 | std::string R; |
171 | // Double all backticks to make sure we don't close the inline block early. |
172 | for (size_t From = 0; From < Input.size();) { |
173 | size_t Next = Input.find(Str: "`" , From); |
174 | R += Input.substr(Start: From, N: Next - From); |
175 | if (Next == llvm::StringRef::npos) |
176 | break; |
177 | R += "``" ; // double the found backtick. |
178 | |
179 | From = Next + 1; |
180 | } |
181 | // If results starts with a backtick, add spaces on both sides. The spaces |
182 | // are ignored by markdown renderers. |
183 | if (llvm::StringRef(R).starts_with(Prefix: "`" ) || llvm::StringRef(R).ends_with(Suffix: "`" )) |
184 | return "` " + std::move(R) + " `" ; |
185 | // Markdown render should ignore first and last space if both are there. We |
186 | // add an extra pair of spaces in that case to make sure we render what the |
187 | // user intended. |
188 | if (llvm::StringRef(R).starts_with(Prefix: " " ) && llvm::StringRef(R).ends_with(Suffix: " " )) |
189 | return "` " + std::move(R) + " `" ; |
190 | return "`" + std::move(R) + "`" ; |
191 | } |
192 | |
193 | /// Get marker required for \p Input to represent a markdown codeblock. It |
194 | /// consists of at least 3 backticks(`). Although markdown also allows to use |
195 | /// tilde(~) for code blocks, they are never used. |
196 | std::string getMarkerForCodeBlock(llvm::StringRef Input) { |
197 | // Count the maximum number of consecutive backticks in \p Input. We need to |
198 | // start and end the code block with more. |
199 | unsigned MaxBackticks = 0; |
200 | unsigned Backticks = 0; |
201 | for (char C : Input) { |
202 | if (C == '`') { |
203 | ++Backticks; |
204 | continue; |
205 | } |
206 | MaxBackticks = std::max(a: MaxBackticks, b: Backticks); |
207 | Backticks = 0; |
208 | } |
209 | MaxBackticks = std::max(a: Backticks, b: MaxBackticks); |
210 | // Use the corresponding number of backticks to start and end a code block. |
211 | return std::string(/*Repeat=*/std::max(a: 3u, b: MaxBackticks + 1), '`'); |
212 | } |
213 | |
214 | // Trims the input and concatenates whitespace blocks into a single ` `. |
215 | std::string canonicalizeSpaces(llvm::StringRef Input) { |
216 | llvm::SmallVector<llvm::StringRef> Words; |
217 | llvm::SplitString(Source: Input, OutFragments&: Words); |
218 | return llvm::join(R&: Words, Separator: " " ); |
219 | } |
220 | |
221 | std::string renderBlocks(llvm::ArrayRef<std::unique_ptr<Block>> Children, |
222 | void (Block::*RenderFunc)(llvm::raw_ostream &) const) { |
223 | std::string R; |
224 | llvm::raw_string_ostream OS(R); |
225 | |
226 | // Trim rulers. |
227 | Children = Children.drop_while( |
228 | Pred: [](const std::unique_ptr<Block> &C) { return C->isRuler(); }); |
229 | auto Last = llvm::find_if( |
230 | Range: llvm::reverse(C&: Children), |
231 | P: [](const std::unique_ptr<Block> &C) { return !C->isRuler(); }); |
232 | Children = Children.drop_back(N: Children.end() - Last.base()); |
233 | |
234 | bool LastBlockWasRuler = true; |
235 | for (const auto &C : Children) { |
236 | if (C->isRuler() && LastBlockWasRuler) |
237 | continue; |
238 | LastBlockWasRuler = C->isRuler(); |
239 | ((*C).*RenderFunc)(OS); |
240 | } |
241 | |
242 | // Get rid of redundant empty lines introduced in plaintext while imitating |
243 | // padding in markdown. |
244 | std::string AdjustedResult; |
245 | llvm::StringRef TrimmedText(OS.str()); |
246 | TrimmedText = TrimmedText.trim(); |
247 | |
248 | llvm::copy_if(Range&: TrimmedText, Out: std::back_inserter(x&: AdjustedResult), |
249 | P: [&TrimmedText](const char &C) { |
250 | return !llvm::StringRef(TrimmedText.data(), |
251 | &C - TrimmedText.data() + 1) |
252 | // We allow at most two newlines. |
253 | .ends_with(Suffix: "\n\n\n" ); |
254 | }); |
255 | |
256 | return AdjustedResult; |
257 | } |
258 | |
259 | // Separates two blocks with extra spacing. Note that it might render strangely |
260 | // in vscode if the trailing block is a codeblock, see |
261 | // https://github.com/microsoft/vscode/issues/88416 for details. |
262 | class Ruler : public Block { |
263 | public: |
264 | void renderMarkdown(llvm::raw_ostream &OS) const override { |
265 | // Note that we need an extra new line before the ruler, otherwise we might |
266 | // make previous block a title instead of introducing a ruler. |
267 | OS << "\n---\n" ; |
268 | } |
269 | void renderPlainText(llvm::raw_ostream &OS) const override { OS << '\n'; } |
270 | std::unique_ptr<Block> clone() const override { |
271 | return std::make_unique<Ruler>(args: *this); |
272 | } |
273 | bool isRuler() const override { return true; } |
274 | }; |
275 | |
276 | class CodeBlock : public Block { |
277 | public: |
278 | void renderMarkdown(llvm::raw_ostream &OS) const override { |
279 | std::string Marker = getMarkerForCodeBlock(Input: Contents); |
280 | // No need to pad from previous blocks, as they should end with a new line. |
281 | OS << Marker << Language << '\n' << Contents << '\n' << Marker << '\n'; |
282 | } |
283 | |
284 | void renderPlainText(llvm::raw_ostream &OS) const override { |
285 | // In plaintext we want one empty line before and after codeblocks. |
286 | OS << '\n' << Contents << "\n\n" ; |
287 | } |
288 | |
289 | std::unique_ptr<Block> clone() const override { |
290 | return std::make_unique<CodeBlock>(args: *this); |
291 | } |
292 | |
293 | CodeBlock(std::string Contents, std::string Language) |
294 | : Contents(std::move(Contents)), Language(std::move(Language)) {} |
295 | |
296 | private: |
297 | std::string Contents; |
298 | std::string Language; |
299 | }; |
300 | |
301 | // Inserts two spaces after each `\n` to indent each line. First line is not |
302 | // indented. |
303 | std::string indentLines(llvm::StringRef Input) { |
304 | assert(!Input.ends_with("\n" ) && "Input should've been trimmed." ); |
305 | std::string IndentedR; |
306 | // We'll add 2 spaces after each new line. |
307 | IndentedR.reserve(res: Input.size() + Input.count(C: '\n') * 2); |
308 | for (char C : Input) { |
309 | IndentedR += C; |
310 | if (C == '\n') |
311 | IndentedR.append(s: " " ); |
312 | } |
313 | return IndentedR; |
314 | } |
315 | |
316 | class Heading : public Paragraph { |
317 | public: |
318 | Heading(size_t Level) : Level(Level) {} |
319 | void renderMarkdown(llvm::raw_ostream &OS) const override { |
320 | OS << std::string(Level, '#') << ' '; |
321 | Paragraph::renderMarkdown(OS); |
322 | } |
323 | |
324 | private: |
325 | size_t Level; |
326 | }; |
327 | |
328 | } // namespace |
329 | |
330 | std::string Block::asMarkdown() const { |
331 | std::string R; |
332 | llvm::raw_string_ostream OS(R); |
333 | renderMarkdown(OS); |
334 | return llvm::StringRef(OS.str()).trim().str(); |
335 | } |
336 | |
337 | std::string Block::asPlainText() const { |
338 | std::string R; |
339 | llvm::raw_string_ostream OS(R); |
340 | renderPlainText(OS); |
341 | return llvm::StringRef(OS.str()).trim().str(); |
342 | } |
343 | |
344 | void Paragraph::renderMarkdown(llvm::raw_ostream &OS) const { |
345 | bool NeedsSpace = false; |
346 | bool HasChunks = false; |
347 | for (auto &C : Chunks) { |
348 | if (C.SpaceBefore || NeedsSpace) |
349 | OS << " " ; |
350 | switch (C.Kind) { |
351 | case Chunk::PlainText: |
352 | OS << renderText(Input: C.Contents, StartsLine: !HasChunks); |
353 | break; |
354 | case Chunk::InlineCode: |
355 | OS << renderInlineBlock(Input: C.Contents); |
356 | break; |
357 | } |
358 | HasChunks = true; |
359 | NeedsSpace = C.SpaceAfter; |
360 | } |
361 | // Paragraphs are translated into markdown lines, not markdown paragraphs. |
362 | // Therefore it only has a single linebreak afterwards. |
363 | // VSCode requires two spaces at the end of line to start a new one. |
364 | OS << " \n" ; |
365 | } |
366 | |
367 | std::unique_ptr<Block> Paragraph::clone() const { |
368 | return std::make_unique<Paragraph>(args: *this); |
369 | } |
370 | |
371 | /// Choose a marker to delimit `Text` from a prioritized list of options. |
372 | /// This is more readable than escaping for plain-text. |
373 | llvm::StringRef chooseMarker(llvm::ArrayRef<llvm::StringRef> Options, |
374 | llvm::StringRef Text) { |
375 | // Prefer a delimiter whose characters don't appear in the text. |
376 | for (llvm::StringRef S : Options) |
377 | if (Text.find_first_of(Chars: S) == llvm::StringRef::npos) |
378 | return S; |
379 | return Options.front(); |
380 | } |
381 | |
382 | void Paragraph::renderPlainText(llvm::raw_ostream &OS) const { |
383 | bool NeedsSpace = false; |
384 | for (auto &C : Chunks) { |
385 | if (C.SpaceBefore || NeedsSpace) |
386 | OS << " " ; |
387 | llvm::StringRef Marker = "" ; |
388 | if (C.Preserve && C.Kind == Chunk::InlineCode) |
389 | Marker = chooseMarker(Options: {"`" , "'" , "\"" }, Text: C.Contents); |
390 | OS << Marker << C.Contents << Marker; |
391 | NeedsSpace = C.SpaceAfter; |
392 | } |
393 | OS << '\n'; |
394 | } |
395 | |
// The constructor and destructor are defaulted here rather than in the class
// definition. NOTE(review): presumably the item type is still incomplete where
// BulletList is declared in the header — confirm against Markup.h.
BulletList::BulletList() = default;
BulletList::~BulletList() = default;
398 | |
399 | void BulletList::renderMarkdown(llvm::raw_ostream &OS) const { |
400 | for (auto &D : Items) { |
401 | // Instead of doing this we might prefer passing Indent to children to get |
402 | // rid of the copies, if it turns out to be a bottleneck. |
403 | OS << "- " << indentLines(Input: D.asMarkdown()) << '\n'; |
404 | } |
405 | // We need a new line after list to terminate it in markdown. |
406 | OS << '\n'; |
407 | } |
408 | |
409 | void BulletList::renderPlainText(llvm::raw_ostream &OS) const { |
410 | for (auto &D : Items) { |
411 | // Instead of doing this we might prefer passing Indent to children to get |
412 | // rid of the copies, if it turns out to be a bottleneck. |
413 | OS << "- " << indentLines(Input: D.asPlainText()) << '\n'; |
414 | } |
415 | } |
416 | |
417 | Paragraph &Paragraph::appendSpace() { |
418 | if (!Chunks.empty()) |
419 | Chunks.back().SpaceAfter = true; |
420 | return *this; |
421 | } |
422 | |
423 | Paragraph &Paragraph::appendText(llvm::StringRef Text) { |
424 | std::string Norm = canonicalizeSpaces(Input: Text); |
425 | if (Norm.empty()) |
426 | return *this; |
427 | Chunks.emplace_back(); |
428 | Chunk &C = Chunks.back(); |
429 | C.Contents = std::move(Norm); |
430 | C.Kind = Chunk::PlainText; |
431 | C.SpaceBefore = llvm::isSpace(C: Text.front()); |
432 | C.SpaceAfter = llvm::isSpace(C: Text.back()); |
433 | return *this; |
434 | } |
435 | |
436 | Paragraph &Paragraph::appendCode(llvm::StringRef Code, bool Preserve) { |
437 | bool AdjacentCode = |
438 | !Chunks.empty() && Chunks.back().Kind == Chunk::InlineCode; |
439 | std::string Norm = canonicalizeSpaces(Input: std::move(Code)); |
440 | if (Norm.empty()) |
441 | return *this; |
442 | Chunks.emplace_back(); |
443 | Chunk &C = Chunks.back(); |
444 | C.Contents = std::move(Norm); |
445 | C.Kind = Chunk::InlineCode; |
446 | C.Preserve = Preserve; |
447 | // Disallow adjacent code spans without spaces, markdown can't render them. |
448 | C.SpaceBefore = AdjacentCode; |
449 | return *this; |
450 | } |
451 | |
452 | std::unique_ptr<Block> BulletList::clone() const { |
453 | return std::make_unique<BulletList>(args: *this); |
454 | } |
455 | |
456 | class Document &BulletList::addItem() { |
457 | Items.emplace_back(); |
458 | return Items.back(); |
459 | } |
460 | |
461 | Document &Document::operator=(const Document &Other) { |
462 | Children.clear(); |
463 | for (const auto &C : Other.Children) |
464 | Children.push_back(x: C->clone()); |
465 | return *this; |
466 | } |
467 | |
468 | void Document::append(Document Other) { |
469 | std::move(first: Other.Children.begin(), last: Other.Children.end(), |
470 | result: std::back_inserter(x&: Children)); |
471 | } |
472 | |
473 | Paragraph &Document::addParagraph() { |
474 | Children.push_back(x: std::make_unique<Paragraph>()); |
475 | return *static_cast<Paragraph *>(Children.back().get()); |
476 | } |
477 | |
478 | void Document::addRuler() { Children.push_back(x: std::make_unique<Ruler>()); } |
479 | |
480 | void Document::addCodeBlock(std::string Code, std::string Language) { |
481 | Children.emplace_back( |
482 | args: std::make_unique<CodeBlock>(args: std::move(Code), args: std::move(Language))); |
483 | } |
484 | |
485 | std::string Document::asMarkdown() const { |
486 | return renderBlocks(Children, RenderFunc: &Block::renderMarkdown); |
487 | } |
488 | |
489 | std::string Document::asPlainText() const { |
490 | return renderBlocks(Children, RenderFunc: &Block::renderPlainText); |
491 | } |
492 | |
493 | BulletList &Document::addBulletList() { |
494 | Children.emplace_back(args: std::make_unique<BulletList>()); |
495 | return *static_cast<BulletList *>(Children.back().get()); |
496 | } |
497 | |
498 | Paragraph &Document::addHeading(size_t Level) { |
499 | assert(Level > 0); |
500 | Children.emplace_back(args: std::make_unique<Heading>(args&: Level)); |
501 | return *static_cast<Paragraph *>(Children.back().get()); |
502 | } |
503 | } // namespace markup |
504 | } // namespace clangd |
505 | } // namespace clang |
506 | |