1 | //===--- BuildConfusableTable.cpp - clang-tidy---------------------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | |
9 | #include "llvm/ADT/STLExtras.h" |
10 | #include "llvm/ADT/StringExtras.h" |
11 | #include "llvm/Support/ConvertUTF.h" |
12 | #include "llvm/Support/MemoryBuffer.h" |
13 | #include "llvm/Support/raw_ostream.h" |
14 | |
15 | using namespace llvm; |
16 | |
17 | int main(int argc, char *argv[]) { |
18 | auto ErrorOrBuffer = MemoryBuffer::getFile(Filename: argv[1], IsText: true); |
19 | if (!ErrorOrBuffer) |
20 | return 1; |
21 | std::unique_ptr<MemoryBuffer> Buffer = std::move(ErrorOrBuffer.get()); |
22 | StringRef Content = Buffer->getBuffer(); |
23 | Content = Content.drop_until(F: [](char c) { return c == '#'; }); |
24 | SmallVector<StringRef> Lines; |
25 | SplitString(Source: Content, OutFragments&: Lines, Delimiters: "\r\n" ); |
26 | |
27 | std::vector<std::pair<llvm::UTF32, SmallVector<llvm::UTF32>>> Entries; |
28 | SmallVector<StringRef> Values; |
29 | for (StringRef Line : Lines) { |
30 | if (Line.starts_with(Prefix: "#" )) |
31 | continue; |
32 | |
33 | Values.clear(); |
34 | Line.split(A&: Values, Separator: ';'); |
35 | if (Values.size() < 2) { |
36 | errs() << "Failed to parse: " << Line << "\n" ; |
37 | return 2; |
38 | } |
39 | |
40 | llvm::StringRef From = Values[0].trim(); |
41 | llvm::UTF32 CodePoint = 0; |
42 | From.getAsInteger(Radix: 16, Result&: CodePoint); |
43 | |
44 | SmallVector<llvm::UTF32> To; |
45 | SmallVector<StringRef> ToN; |
46 | Values[1].split(A&: ToN, Separator: ' ', MaxSplit: -1, KeepEmpty: false); |
47 | for (StringRef To_ : ToN) { |
48 | llvm::UTF32 ToCodePoint = 0; |
49 | To_.trim().getAsInteger(Radix: 16, Result&: ToCodePoint); |
50 | To.push_back(Elt: ToCodePoint); |
51 | } |
52 | // Sentinel |
53 | To.push_back(Elt: 0); |
54 | |
55 | Entries.emplace_back(args&: CodePoint, args&: To); |
56 | } |
57 | llvm::sort(C&: Entries); |
58 | |
59 | unsigned LargestValue = |
60 | std::max_element(first: Entries.begin(), last: Entries.end(), |
61 | comp: [](const auto &Entry0, const auto &Entry1) { |
62 | return Entry0.second.size() < Entry1.second.size(); |
63 | }) |
64 | ->second.size(); |
65 | |
66 | std::error_code ec; |
67 | llvm::raw_fd_ostream os(argv[2], ec); |
68 | |
69 | // FIXME: If memory consumption and/or lookup time becomes a constraint, it |
70 | // maybe worth using a more elaborate data structure. |
71 | os << "struct {llvm::UTF32 codepoint; llvm::UTF32 values[" << LargestValue |
72 | << "];} " |
73 | "ConfusableEntries[] = {\n" ; |
74 | for (const auto &Values : Entries) { |
75 | os << " { " ; |
76 | os << Values.first; |
77 | os << ", {" ; |
78 | for (auto CP : Values.second) |
79 | os << CP << ", " ; |
80 | |
81 | os << "}},\n" ; |
82 | } |
83 | os << "};\n" ; |
84 | return 0; |
85 | } |
86 | |