Markup.cpp source code [clang-tools-extra/clangd/support/Markup.cpp]

1	//===--- Markup.cpp ------------------------------------------ C++-------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	#include "support/Markup.h"
9	#include "llvm/ADT/ArrayRef.h"
10	#include "llvm/ADT/STLExtras.h"
11	#include "llvm/ADT/SmallVector.h"
12	#include "llvm/ADT/StringExtras.h"
13	#include "llvm/ADT/StringRef.h"
14	#include "llvm/Support/Compiler.h"
15	#include "llvm/Support/raw_ostream.h"
16	#include <cstddef>
17	#include <iterator>
18	#include <memory>
19	#include <string>
20	#include <vector>
21
22	namespace clang {
23	namespace clangd {
24	namespace markup {
25	namespace {
26
27	// Is <contents a plausible start to an HTML tag?
28	// Contents may not be the rest of the line, but it's the rest of the plain
29	// text, so we expect to see at least the tag name.
30	bool looksLikeTag(llvm::StringRef Contents) {
31	if (Contents.empty())
32	return false;
33	if (Contents.front() == `'!'` \|\| Contents.front() == `'?'` \|\|
34	Contents.front() == `'/'`)
35	return true;
36	// Check the start of the tag name.
37	if (!llvm::isAlpha(C: Contents.front()))
38	return false;
39	// Drop rest of the tag name, and following whitespace.
40	Contents = Contents
41	.drop_while(F: [](char C) {
42	return llvm::isAlnum(C) \|\| C == `'-'` \|\| C == `'_'` \|\| C == `':'`;
43	})
44	.drop_while(F: llvm::isSpace);
45	// The rest of the tag consists of attributes, which have restrictive names.
46	// If we hit '=', all bets are off (attribute values can contain anything).
47	for (; !Contents.empty(); Contents = Contents.drop_front()) {
48	if (llvm::isAlnum(C: Contents.front()) \|\| llvm::isSpace(C: Contents.front()))
49	continue;
50	if (Contents.front() == `'>'` \|\| Contents.starts_with(Prefix: "/>"))
51	return true; // May close the tag.
52	if (Contents.front() == `'='`)
53	return true; // Don't try to parse attribute values.
54	return false; // Random punctuation means this isn't a tag.
55	}
56	return true; // Potentially incomplete tag.
57	}
58
59	// Tests whether C should be backslash-escaped in markdown.
60	// The string being escaped is Before + C + After. This is part of a paragraph.
61	// StartsLine indicates whether `Before` is the start of the line.
62	// After may not be everything until the end of the line.
63	//
64	// It's always safe to escape punctuation, but want minimal escaping.
65	// The strategy is to escape the first character of anything that might start
66	// a markdown grammar construct.
67	bool needsLeadingEscape(char C, llvm::StringRef Before, llvm::StringRef After,
68	bool StartsLine) {
69	assert(Before.take_while(llvm::isSpace).empty());
70	auto RulerLength = [&]() -> /Length/ unsigned {
71	if (!StartsLine \|\| !Before.empty())
72	return false;
73	llvm::StringRef A = After.rtrim();
74	return llvm::all_of(Range&: A, P: [C](char D) { return C == D; }) ? `1` + A.size() : `0`;
75	};
76	auto IsBullet = [&]() {
77	return StartsLine && Before.empty() &&
78	(After.empty() \|\| After.starts_with(Prefix: " "));
79	};
80	auto SpaceSurrounds = [&]() {
81	return (After.empty() \|\| llvm::isSpace(C: After.front())) &&
82	(Before.empty() \|\| llvm::isSpace(C: Before.back()));
83	};
84	auto WordSurrounds = [&]() {
85	return (!After.empty() && llvm::isAlnum(C: After.front())) &&
86	(!Before.empty() && llvm::isAlnum(C: Before.back()));
87	};
88
89	switch (C) {
90	case `'\\'`: // Escaped character.
91	return true;
92	case '`': // Code block or inline code
93	// Any number of backticks can delimit an inline code block that can end
94	// anywhere (including on another line). We must escape them all.
95	return true;
96	case `'~'`: // Code block
97	return StartsLine && Before.empty() && After.starts_with(Prefix: "~~");
98	case `'#'`: { // ATX heading.
99	if (!StartsLine \|\| !Before.empty())
100	return false;
101	llvm::StringRef Rest = After.ltrim(Char: C);
102	return Rest.empty() \|\| Rest.starts_with(Prefix: " ");
103	}
104	case `']'`: // Link or link reference.
105	// We escape ] rather than [ here, because it's more constrained:
106	// ](...) is an in-line link
107	// ]: is a link reference
108	// The following are only links if the link reference exists:
109	// ] by itself is a shortcut link
110	// ][...] is an out-of-line link
111	// Because we never emit link references, we don't need to handle these.
112	return After.starts_with(Prefix: ":") \|\| After.starts_with(Prefix: "(");
113	case `'='`: // Setex heading.
114	return RulerLength () > `0`;
115	case `'_'`: // Horizontal ruler or matched delimiter.
116	if (RulerLength () >= `3`)
117	return true;
118	// Not a delimiter if surrounded by space, or inside a word.
119	// (The rules at word boundaries are subtle).
120	return !(SpaceSurrounds () \|\| WordSurrounds ());
121	case `'-'`: // Setex heading, horizontal ruler, or bullet.
122	if (RulerLength () > `0`)
123	return true;
124	return IsBullet ();
125	case `'+'`: // Bullet list.
126	return IsBullet ();
127	case `''`: // Bullet list, horizontal ruler, or delimiter.*
128	return IsBullet () \|\| RulerLength () >= `3` \|\| !SpaceSurrounds ();
129	case `'<'`: // HTML tag (or autolink, which we choose not to escape)
130	return looksLikeTag(Contents: After);
131	case `'>'`: // Quote marker. Needs escaping at start of line.
132	return StartsLine && Before.empty();
133	case `'&'`: { // HTML entity reference
134	auto End = After.find(C: `';'`);
135	if (End == llvm::StringRef::npos)
136	return false;
137	llvm::StringRef Content = After.substr(Start: `0`, N: End);
138	if (Content.consume_front(Prefix: "#")) {
139	if (Content.consume_front(Prefix: "x") \|\| Content.consume_front(Prefix: "X"))
140	return llvm::all_of(Range&: Content, P: llvm::isHexDigit);
141	return llvm::all_of(Range&: Content, P: llvm::isDigit);
142	}
143	return llvm::all_of(Range&: Content, P: llvm::isAlpha);
144	}
145	case `'.'`: // Numbered list indicator. Escape 12. -> 12\. at start of line.
146	case `')'`:
147	return StartsLine && !Before.empty() &&
148	llvm::all_of(Range&: Before, P: llvm::isDigit) && After.starts_with(Prefix: " ");
149	default:
150	return false;
151	}
152	}
153
154	/// Escape a markdown text block. Ensures the punctuation will not introduce
155	/// any of the markdown constructs.
156	std::string renderText(llvm::StringRef Input, bool StartsLine) {
157	std::string R;
158	for (unsigned I = `0`; I < Input.size(); ++I) {
159	if (needsLeadingEscape(C: Input [I], Before: Input.substr(Start: `0`, N: I), After: Input.substr(Start: I + `1`),
160	StartsLine))
161	R.push_back(c: `'\\'`);
162	R.push_back(c: Input [I]);
163	}
164	return R;
165	}
166
167	/// Renders \p Input as an inline block of code in markdown. The returned value
168	/// is surrounded by backticks and the inner contents are properly escaped.
169	std::string renderInlineBlock(llvm::StringRef Input) {
170	std::string R;
171	// Double all backticks to make sure we don't close the inline block early.
172	for (size_t From = `0`; From < Input.size();) {
173	size_t Next = Input.find(Str: "`", From);
174	R += Input.substr(Start: From, N: Next - From);
175	if (Next == llvm::StringRef::npos)
176	break;
177	R += "``"; // double the found backtick.
178
179	From = Next + `1`;
180	}
181	// If results starts with a backtick, add spaces on both sides. The spaces
182	// are ignored by markdown renderers.
183	if (llvm::StringRef (R).starts_with(Prefix: "`") \|\| llvm::StringRef (R).ends_with(Suffix: "`"))
184	return "` " + std::move(R) + " `";
185	// Markdown render should ignore first and last space if both are there. We
186	// add an extra pair of spaces in that case to make sure we render what the
187	// user intended.
188	if (llvm::StringRef (R).starts_with(Prefix: " ") && llvm::StringRef (R).ends_with(Suffix: " "))
189	return "` " + std::move(R) + " `";
190	return "`" + std::move(R) + "`";
191	}
192
193	/// Get marker required for \p Input to represent a markdown codeblock. It
194	/// consists of at least 3 backticks(`). Although markdown also allows to use
195	/// tilde(~) for code blocks, they are never used.
196	std::string getMarkerForCodeBlock(llvm::StringRef Input) {
197	// Count the maximum number of consecutive backticks in \p Input. We need to
198	// start and end the code block with more.
199	unsigned MaxBackticks = `0`;
200	unsigned Backticks = `0`;
201	for (char C : Input) {
202	if (C == '`') {
203	++Backticks;
204	continue;
205	}
206	MaxBackticks = std::max(a: MaxBackticks, b: Backticks);
207	Backticks = `0`;
208	}
209	MaxBackticks = std::max(a: Backticks, b: MaxBackticks);
210	// Use the corresponding number of backticks to start and end a code block.
211	return std::string (/Repeat=/std::max(a: `3u`, b: MaxBackticks + `1`), '`');
212	}
213
214	// Trims the input and concatenates whitespace blocks into a single ` `.
215	std::string canonicalizeSpaces(llvm::StringRef Input) {
216	llvm::SmallVector<llvm::StringRef> Words;
217	llvm::SplitString(Source: Input, OutFragments&: Words);
218	return llvm::join(R&: Words, Separator: " ");
219	}
220
221	std::string renderBlocks(llvm::ArrayRef<std::unique_ptr<Block>> Children,
222	void (Block::RenderFunc)(llvm::raw_ostream &) const*) {
223	std::string R;
224	llvm::raw_string_ostream OS(R);
225
226	// Trim rulers.
227	Children = Children.drop_while(
228	Pred: [](const std::unique_ptr<Block> &C) { return C ->isRuler(); });
229	auto Last = llvm::find_if(
230	Range: llvm::reverse(C&: Children),
231	P: [](const std::unique_ptr<Block> &C) { return !C ->isRuler(); });
232	Children = Children.drop_back(N: Children.end() - Last.base());
233
234	bool LastBlockWasRuler = true;
235	for (const auto &C : Children) {
236	if (C ->isRuler() && LastBlockWasRuler)
237	continue;
238	LastBlockWasRuler = C ->isRuler();
239	((C).RenderFunc)(OS);
240	}
241
242	// Get rid of redundant empty lines introduced in plaintext while imitating
243	// padding in markdown.
244	std::string AdjustedResult;
245	llvm::StringRef TrimmedText(OS.str());
246	TrimmedText = TrimmedText.trim();
247
248	llvm::copy_if(Range&: TrimmedText, Out: std::back_inserter(x&: AdjustedResult),
249	P: [&TrimmedText](const char &C) {
250	return !llvm::StringRef (TrimmedText.data(),
251	&C - TrimmedText.data() + `1`)
252	// We allow at most two newlines.
253	.ends_with(Suffix: "\n\n\n");
254	});
255
256	return AdjustedResult;
257	}
258
259	// Separates two blocks with extra spacing. Note that it might render strangely
260	// in vscode if the trailing block is a codeblock, see
261	// https://github.com/microsoft/vscode/issues/88416 for details.
262	class Ruler : public Block {
263	public:
264	void renderMarkdown(llvm::raw_ostream &OS) const override {
265	// Note that we need an extra new line before the ruler, otherwise we might
266	// make previous block a title instead of introducing a ruler.
267	OS << "\n---\n";
268	}
269	void renderPlainText(llvm::raw_ostream &OS) const override { OS << `'\n'`; }
270	std::unique_ptr<Block> clone() const override {
271	return std::make_unique<Ruler>(args: *this);
272	}
273	bool isRuler() const override { return true; }
274	};
275
276	class CodeBlock : public Block {
277	public:
278	void renderMarkdown(llvm::raw_ostream &OS) const override {
279	std::string Marker = getMarkerForCodeBlock(Input: Contents);
280	// No need to pad from previous blocks, as they should end with a new line.
281	OS << Marker << Language << `'\n'` << Contents << `'\n'` << Marker << `'\n'`;
282	}
283
284	void renderPlainText(llvm::raw_ostream &OS) const override {
285	// In plaintext we want one empty line before and after codeblocks.
286	OS << `'\n'` << Contents << "\n\n";
287	}
288
289	std::unique_ptr<Block> clone() const override {
290	return std::make_unique<CodeBlock>(args: *this);
291	}
292
293	CodeBlock(std::string Contents, std::string Language)
294	: Contents (std::move(Contents)), Language (std::move(Language)) {}
295
296	private:
297	std::string Contents;
298	std::string Language;
299	};
300
301	// Inserts two spaces after each `\n` to indent each line. First line is not
302	// indented.
303	std::string indentLines(llvm::StringRef Input) {
304	assert(!Input.ends_with("\n") && "Input should've been trimmed.");
305	std::string IndentedR;
306	// We'll add 2 spaces after each new line.
307	IndentedR.reserve(res: Input.size() + Input.count(C: `'\n'`) * `2`);
308	for (char C : Input) {
309	IndentedR += C;
310	if (C == `'\n'`)
311	IndentedR.append(s: " ");
312	}
313	return IndentedR;
314	}
315
316	class Heading : public Paragraph {
317	public:
318	Heading(size_t Level) : Level(Level) {}
319	void renderMarkdown(llvm::raw_ostream &OS) const override {
320	OS << std::string (Level, `'#'`) << `' '`;
321	Paragraph::renderMarkdown(OS);
322	}
323
324	private:
325	size_t Level;
326	};
327
328	} // namespace
329
330	std::string Block::asMarkdown() const {
331	std::string R;
332	llvm::raw_string_ostream OS(R);
333	renderMarkdown(OS);
334	return llvm::StringRef (OS.str()).trim().str();
335	}
336
337	std::string Block::asPlainText() const {
338	std::string R;
339	llvm::raw_string_ostream OS(R);
340	renderPlainText(OS);
341	return llvm::StringRef (OS.str()).trim().str();
342	}
343
344	void Paragraph::renderMarkdown(llvm::raw_ostream &OS) const {
345	bool NeedsSpace = false;
346	bool HasChunks = false;
347	for (auto &C : Chunks) {
348	if (C.SpaceBefore \|\| NeedsSpace)
349	OS << " ";
350	switch (C.Kind) {
351	case Chunk::PlainText:
352	OS << renderText(Input: C.Contents, StartsLine: !HasChunks);
353	break;
354	case Chunk::InlineCode:
355	OS << renderInlineBlock(Input: C.Contents);
356	break;
357	}
358	HasChunks = true;
359	NeedsSpace = C.SpaceAfter;
360	}
361	// Paragraphs are translated into markdown lines, not markdown paragraphs.
362	// Therefore it only has a single linebreak afterwards.
363	// VSCode requires two spaces at the end of line to start a new one.
364	OS << " \n";
365	}
366
367	std::unique_ptr<Block> Paragraph::clone() const {
368	return std::make_unique<Paragraph>(args: *this);
369	}
370
371	/// Choose a marker to delimit `Text` from a prioritized list of options.
372	/// This is more readable than escaping for plain-text.
373	llvm::StringRef chooseMarker(llvm::ArrayRef<llvm::StringRef> Options,
374	llvm::StringRef Text) {
375	// Prefer a delimiter whose characters don't appear in the text.
376	for (llvm::StringRef S : Options)
377	if (Text.find_first_of(Chars: S) == llvm::StringRef::npos)
378	return S;
379	return Options.front();
380	}
381
382	void Paragraph::renderPlainText(llvm::raw_ostream &OS) const {
383	bool NeedsSpace = false;
384	for (auto &C : Chunks) {
385	if (C.SpaceBefore \|\| NeedsSpace)
386	OS << " ";
387	llvm::StringRef Marker = "";
388	if (C.Preserve && C.Kind == Chunk::InlineCode)
389	Marker = chooseMarker(Options: {"`", "'", "\""}, Text: C.Contents);
390	OS << Marker << C.Contents << Marker;
391	NeedsSpace = C.SpaceAfter;
392	}
393	OS << `'\n'`;
394	}
395
396	BulletList::BulletList() = default;
397	BulletList::~BulletList() = default;
398
399	void BulletList::renderMarkdown(llvm::raw_ostream &OS) const {
400	for (auto &D : Items) {
401	// Instead of doing this we might prefer passing Indent to children to get
402	// rid of the copies, if it turns out to be a bottleneck.
403	OS << "- " << indentLines(Input: D.asMarkdown()) << `'\n'`;
404	}
405	// We need a new line after list to terminate it in markdown.
406	OS << `'\n'`;
407	}
408
409	void BulletList::renderPlainText(llvm::raw_ostream &OS) const {
410	for (auto &D : Items) {
411	// Instead of doing this we might prefer passing Indent to children to get
412	// rid of the copies, if it turns out to be a bottleneck.
413	OS << "- " << indentLines(Input: D.asPlainText()) << `'\n'`;
414	}
415	}
416
417	Paragraph &Paragraph::appendSpace() {
418	if (!Chunks.empty())
419	Chunks.back().SpaceAfter = true;
420	return *this;
421	}
422
423	Paragraph &Paragraph::appendText(llvm::StringRef Text) {
424	std::string Norm = canonicalizeSpaces(Input: Text);
425	if (Norm.empty())
426	return *this;
427	Chunks.emplace_back();
428	Chunk &C = Chunks.back();
429	C.Contents = std::move(Norm);
430	C.Kind = Chunk::PlainText;
431	C.SpaceBefore = llvm::isSpace(C: Text.front());
432	C.SpaceAfter = llvm::isSpace(C: Text.back());
433	return *this;
434	}
435
436	Paragraph &Paragraph::appendCode(llvm::StringRef Code, bool Preserve) {
437	bool AdjacentCode =
438	!Chunks.empty() && Chunks.back().Kind == Chunk::InlineCode;
439	std::string Norm = canonicalizeSpaces(Input: std::move(Code));
440	if (Norm.empty())
441	return *this;
442	Chunks.emplace_back();
443	Chunk &C = Chunks.back();
444	C.Contents = std::move(Norm);
445	C.Kind = Chunk::InlineCode;
446	C.Preserve = Preserve;
447	// Disallow adjacent code spans without spaces, markdown can't render them.
448	C.SpaceBefore = AdjacentCode;
449	return *this;
450	}
451
452	std::unique_ptr<Block> BulletList::clone() const {
453	return std::make_unique<BulletList>(args: *this);
454	}
455
456	class Document &BulletList::addItem() {
457	Items.emplace_back();
458	return Items.back();
459	}
460
461	Document &Document::operator=(const Document &Other) {
462	Children.clear();
463	for (const auto &C : Other.Children)
464	Children.push_back(x: C ->clone());
465	return *this;
466	}
467
468	void Document::append(Document Other) {
469	std::move(first: Other.Children.begin(), last: Other.Children.end(),
470	result: std::back_inserter(x&: Children));
471	}
472
473	Paragraph &Document::addParagraph() {
474	Children.push_back(x: std::make_unique<Paragraph>());
475	return *static_cast<Paragraph *>(Children.back().get());
476	}
477
478	void Document::addRuler() { Children.push_back(x: std::make_unique<Ruler>()); }
479
480	void Document::addCodeBlock(std::string Code, std::string Language) {
481	Children.emplace_back(
482	args: std::make_unique<CodeBlock>(args: std::move(Code), args: std::move(Language)));
483	}
484
485	std::string Document::asMarkdown() const {
486	return renderBlocks(Children, RenderFunc: &Block::renderMarkdown);
487	}
488
489	std::string Document::asPlainText() const {
490	return renderBlocks(Children, RenderFunc: &Block::renderPlainText);
491	}
492
493	BulletList &Document::addBulletList() {
494	Children.emplace_back(args: std::make_unique<BulletList>());
495	return *static_cast<BulletList *>(Children.back().get());
496	}
497
498	Paragraph &Document::addHeading(size_t Level) {
499	assert(Level > `0`);
500	Children.emplace_back(args: std::make_unique<Heading>(args&: Level));
501	return *static_cast<Paragraph *>(Children.back().get());
502	}
503	} // namespace markup
504	} // namespace clangd
505	} // namespace clang
506

source code of clang-tools-extra/clangd/support/Markup.cpp