1 | //===--- clang/Basic/CharInfo.h - Classifying ASCII Characters --*- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | |
9 | #ifndef LLVM_CLANG_BASIC_CHARINFO_H |
10 | #define LLVM_CLANG_BASIC_CHARINFO_H |
11 | |
12 | #include "clang/Basic/LLVM.h" |
13 | #include "llvm/ADT/StringRef.h" |
14 | #include "llvm/Support/Compiler.h" |
15 | #include "llvm/Support/DataTypes.h" |
16 | |
17 | namespace clang { |
namespace charinfo {
  /// Classification table indexed by the unsigned value of a byte. The
  /// predicates in this header test entries against the CHAR_* flags below.
  /// This is only a declaration; the table is not defined in this header.
  extern const uint16_t InfoTable[256];

  /// Primary character classes. Every class owns exactly one bit so that
  /// several classes can be tested with a single mask.
  enum {
    CHAR_HORZ_WS = 1 << 0,  // '\t', '\f', '\v'.  Note, no '\0'
    CHAR_VERT_WS = 1 << 1,  // '\r', '\n'
    CHAR_SPACE   = 1 << 2,  // ' '
    CHAR_DIGIT   = 1 << 3,  // 0-9
    CHAR_XLETTER = 1 << 4,  // a-f,A-F
    CHAR_UPPER   = 1 << 5,  // A-Z
    CHAR_LOWER   = 1 << 6,  // a-z
    CHAR_UNDER   = 1 << 7,  // _
    CHAR_PERIOD  = 1 << 8,  // .
    CHAR_RAWDEL  = 1 << 9,  // {}[]#<>%:;?*+-/^&|~!=,"'
    CHAR_PUNCT   = 1 << 10  // `$@()
  };

  /// Combined classes for letters that are also hexadecimal digits.
  enum {
    CHAR_XUPPER = CHAR_XLETTER | CHAR_UPPER, // hex letter, uppercase
    CHAR_XLOWER = CHAR_XLETTER | CHAR_LOWER  // hex letter, lowercase
  };
} // end namespace charinfo
40 | |
/// Returns true if a byte is an ASCII character, i.e. its value read as an
/// unsigned byte is at most 127.
LLVM_READNONE inline bool isASCII(char c) {
  // Compare as unsigned char so the answer does not depend on whether plain
  // char is signed.
  const unsigned char Byte = static_cast<unsigned char>(c);
  return Byte < 128;
}
45 | |
/// Returns true if an unsigned byte is an ASCII character (value <= 127).
LLVM_READNONE inline bool isASCII(unsigned char c) {
  return c < 0x80;
}
47 | |
/// Returns true if a codepoint is an ASCII character (value <= 127).
LLVM_READNONE inline bool isASCII(uint32_t c) {
  return c < 0x80;
}
/// Returns true if a signed 64-bit value lies in the ASCII range [0, 127].
/// Negative values are never ASCII.
LLVM_READNONE inline bool isASCII(int64_t c) {
  return c >= 0 && c < 128;
}
51 | |
52 | /// Returns true if this is a valid first character of a C identifier, |
53 | /// which is [a-zA-Z_]. |
54 | LLVM_READONLY inline bool isAsciiIdentifierStart(unsigned char c, |
55 | bool AllowDollar = false) { |
56 | using namespace charinfo; |
57 | if (InfoTable[c] & (CHAR_UPPER|CHAR_LOWER|CHAR_UNDER)) |
58 | return true; |
59 | return AllowDollar && c == '$'; |
60 | } |
61 | |
/// Returns true if this is a body character of a C identifier,
/// which is [a-zA-Z0-9_].
///
/// Uses a dedicated 0/1 lookup table rather than masking the shared
/// classification table.
LLVM_READONLY inline bool isAsciiIdentifierContinue(unsigned char c) {
  // Precomputed CHAR_UPPER|CHAR_LOWER|CHAR_DIGIT|CHAR_UNDER, 16 entries per
  // row. Only the ASCII half is spelled out; entries 0x80-0xFF are
  // zero-initialized because they have no explicit initializer.
  static constexpr unsigned char IsIDContinue[256] = {
      /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      /* 0x20 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      /* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, // 0-9
      /* 0x40 */ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // A-O
      /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, // P-Z, _
      /* 0x60 */ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // a-o
      /* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, // p-z
  };
  return IsIDContinue[c] != 0;
}
78 | |
79 | /// Returns true if this is a body character of a C identifier, |
80 | /// which is [a-zA-Z0-9_]. |
81 | LLVM_READONLY inline bool isAsciiIdentifierContinue(unsigned char c, |
82 | bool AllowDollar) { |
83 | if (isAsciiIdentifierContinue(c)) |
84 | return true; |
85 | return AllowDollar && c == '$'; |
86 | } |
87 | |
88 | /// Returns true if this character is horizontal ASCII whitespace: |
89 | /// ' ', '\\t', '\\f', '\\v'. |
90 | /// |
91 | /// Note that this returns false for '\\0'. |
92 | LLVM_READONLY inline bool isHorizontalWhitespace(unsigned char c) { |
93 | using namespace charinfo; |
94 | return (InfoTable[c] & (CHAR_HORZ_WS|CHAR_SPACE)) != 0; |
95 | } |
96 | |
97 | /// Returns true if this character is vertical ASCII whitespace: '\\n', '\\r'. |
98 | /// |
99 | /// Note that this returns false for '\\0'. |
100 | LLVM_READONLY inline bool isVerticalWhitespace(unsigned char c) { |
101 | using namespace charinfo; |
102 | return (InfoTable[c] & CHAR_VERT_WS) != 0; |
103 | } |
104 | |
105 | /// Return true if this character is horizontal or vertical ASCII whitespace: |
106 | /// ' ', '\\t', '\\f', '\\v', '\\n', '\\r'. |
107 | /// |
108 | /// Note that this returns false for '\\0'. |
109 | LLVM_READONLY inline bool isWhitespace(unsigned char c) { |
110 | using namespace charinfo; |
111 | return (InfoTable[c] & (CHAR_HORZ_WS|CHAR_VERT_WS|CHAR_SPACE)) != 0; |
112 | } |
113 | |
114 | /// Return true if this character is an ASCII digit: [0-9] |
115 | LLVM_READONLY inline bool isDigit(unsigned char c) { |
116 | using namespace charinfo; |
117 | return (InfoTable[c] & CHAR_DIGIT) != 0; |
118 | } |
119 | |
120 | /// Return true if this character is a lowercase ASCII letter: [a-z] |
121 | LLVM_READONLY inline bool isLowercase(unsigned char c) { |
122 | using namespace charinfo; |
123 | return (InfoTable[c] & CHAR_LOWER) != 0; |
124 | } |
125 | |
126 | /// Return true if this character is an uppercase ASCII letter: [A-Z] |
127 | LLVM_READONLY inline bool isUppercase(unsigned char c) { |
128 | using namespace charinfo; |
129 | return (InfoTable[c] & CHAR_UPPER) != 0; |
130 | } |
131 | |
132 | /// Return true if this character is an ASCII letter: [a-zA-Z] |
133 | LLVM_READONLY inline bool isLetter(unsigned char c) { |
134 | using namespace charinfo; |
135 | return (InfoTable[c] & (CHAR_UPPER|CHAR_LOWER)) != 0; |
136 | } |
137 | |
138 | /// Return true if this character is an ASCII letter or digit: [a-zA-Z0-9] |
139 | LLVM_READONLY inline bool isAlphanumeric(unsigned char c) { |
140 | using namespace charinfo; |
141 | return (InfoTable[c] & (CHAR_DIGIT|CHAR_UPPER|CHAR_LOWER)) != 0; |
142 | } |
143 | |
144 | /// Return true if this character is an ASCII hex digit: [0-9a-fA-F] |
145 | LLVM_READONLY inline bool isHexDigit(unsigned char c) { |
146 | using namespace charinfo; |
147 | return (InfoTable[c] & (CHAR_DIGIT|CHAR_XLETTER)) != 0; |
148 | } |
149 | |
150 | /// Return true if this character is an ASCII punctuation character. |
151 | /// |
152 | /// Note that '_' is both a punctuation character and an identifier character! |
153 | LLVM_READONLY inline bool isPunctuation(unsigned char c) { |
154 | using namespace charinfo; |
155 | return (InfoTable[c] & (CHAR_UNDER|CHAR_PERIOD|CHAR_RAWDEL|CHAR_PUNCT)) != 0; |
156 | } |
157 | |
158 | /// Return true if this character is an ASCII printable character; that is, a |
159 | /// character that should take exactly one column to print in a fixed-width |
160 | /// terminal. |
161 | LLVM_READONLY inline bool isPrintable(unsigned char c) { |
162 | using namespace charinfo; |
163 | return (InfoTable[c] & (CHAR_UPPER|CHAR_LOWER|CHAR_PERIOD|CHAR_PUNCT| |
164 | CHAR_DIGIT|CHAR_UNDER|CHAR_RAWDEL|CHAR_SPACE)) != 0; |
165 | } |
166 | |
167 | /// Return true if this is the body character of a C preprocessing number, |
168 | /// which is [a-zA-Z0-9_.]. |
169 | LLVM_READONLY inline bool isPreprocessingNumberBody(unsigned char c) { |
170 | using namespace charinfo; |
171 | return (InfoTable[c] & |
172 | (CHAR_UPPER|CHAR_LOWER|CHAR_DIGIT|CHAR_UNDER|CHAR_PERIOD)) != 0; |
173 | } |
174 | |
175 | /// Return true if this is the body character of a C++ raw string delimiter. |
176 | LLVM_READONLY inline bool isRawStringDelimBody(unsigned char c) { |
177 | using namespace charinfo; |
178 | return (InfoTable[c] & (CHAR_UPPER|CHAR_LOWER|CHAR_PERIOD| |
179 | CHAR_DIGIT|CHAR_UNDER|CHAR_RAWDEL)) != 0; |
180 | } |
181 | |
/// Selects which quote characters escapeCStyle() should escape. The values
/// are bit flags so that both kinds can be requested at once.
enum class EscapeChar {
  Single = 1 << 0, ///< Escape the single quote (').
  Double = 1 << 1, ///< Escape the double quote (").
  // Inside the enumerator list the earlier enumerators still have the
  // underlying integer type, so they can be combined without casts.
  SingleAndDouble = Single | Double,
};
187 | |
188 | /// Return C-style escaped string for special characters, or an empty string if |
189 | /// there is no such mapping. |
190 | template <EscapeChar Opt, class CharT> |
191 | LLVM_READONLY inline auto escapeCStyle(CharT Ch) -> StringRef { |
192 | switch (Ch) { |
193 | case '\\': |
194 | return "\\\\" ; |
195 | case '\'': |
196 | if ((static_cast<int>(Opt) & static_cast<int>(EscapeChar::Single)) == 0) |
197 | break; |
198 | return "\\'" ; |
199 | case '"': |
200 | if ((static_cast<int>(Opt) & static_cast<int>(EscapeChar::Double)) == 0) |
201 | break; |
202 | return "\\\"" ; |
203 | case '\a': |
204 | return "\\a" ; |
205 | case '\b': |
206 | return "\\b" ; |
207 | case '\f': |
208 | return "\\f" ; |
209 | case '\n': |
210 | return "\\n" ; |
211 | case '\r': |
212 | return "\\r" ; |
213 | case '\t': |
214 | return "\\t" ; |
215 | case '\v': |
216 | return "\\v" ; |
217 | } |
218 | return {}; |
219 | } |
220 | |
221 | /// Converts the given ASCII character to its lowercase equivalent. |
222 | /// |
223 | /// If the character is not an uppercase character, it is returned as is. |
224 | LLVM_READONLY inline char toLowercase(char c) { |
225 | if (isUppercase(c)) |
226 | return c + 'a' - 'A'; |
227 | return c; |
228 | } |
229 | |
230 | /// Converts the given ASCII character to its uppercase equivalent. |
231 | /// |
232 | /// If the character is not a lowercase character, it is returned as is. |
233 | LLVM_READONLY inline char toUppercase(char c) { |
234 | if (isLowercase(c)) |
235 | return c + 'A' - 'a'; |
236 | return c; |
237 | } |
238 | |
239 | |
240 | /// Return true if this is a valid ASCII identifier. |
241 | /// |
242 | /// Note that this is a very simple check; it does not accept UCNs as valid |
243 | /// identifier characters. |
244 | LLVM_READONLY inline bool isValidAsciiIdentifier(StringRef S, |
245 | bool AllowDollar = false) { |
246 | if (S.empty() || !isAsciiIdentifierStart(c: S[0], AllowDollar)) |
247 | return false; |
248 | |
249 | for (StringRef::iterator I = S.begin(), E = S.end(); I != E; ++I) |
250 | if (!isAsciiIdentifierContinue(c: *I, AllowDollar)) |
251 | return false; |
252 | |
253 | return true; |
254 | } |
255 | |
256 | } // end namespace clang |
257 | |
258 | #endif |
259 | |