1 | // Copyright 2009-2021 Intel Corporation |
2 | // SPDX-License-Identifier: Apache-2.0 |
3 | |
4 | #include "tokenstream.h" |
5 | #include "../math/math.h" |
6 | |
7 | namespace embree |
8 | { |
9 | /* shorthands for common sets of characters */ |
10 | const std::string TokenStream::alpha = "abcdefghijklmnopqrstuvwxyz" ; |
11 | const std::string TokenStream::ALPHA = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" ; |
12 | const std::string TokenStream::numbers = "0123456789" ; |
13 | const std::string TokenStream::separators = "\n\t\r " ; |
14 | const std::string TokenStream::stringChars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 _.,+-=:/*\\" ; |
15 | |
/* Builds a 256 entry membership table for a set of characters.
   Every entry is first reset to false; afterwards the entry indexed by the
   unsigned byte value of each character in chrs is set to true. */
static void createCharMap(bool map[256], const std::string& chrs) {
  for (size_t slot = 0; slot != 256; ++slot)
    map[slot] = false;
  for (size_t k = 0; k != chrs.size(); ++k) {
    const uint8_t code = uint8_t(chrs[k]);  // unsigned so bytes >= 0x80 index into [128,255]
    map[code] = true;
  }
}
21 | |
22 | /* build full tokenizer that takes list of valid characters and keywords */ |
23 | TokenStream::TokenStream(const Ref<Stream<int> >& cin, //< stream to read from |
24 | const std::string& alpha, //< valid characters for identifiers |
25 | const std::string& seps, //< characters that act as separators |
26 | const std::vector<std::string>& symbols) //< symbols |
27 | : cin(cin), symbols(symbols) |
28 | { |
29 | createCharMap(map: isAlphaMap,chrs: alpha); |
30 | createCharMap(map: isSepMap,chrs: seps); |
31 | createCharMap(map: isStringCharMap,chrs: stringChars); |
32 | } |
33 | |
34 | bool TokenStream::decDigits(std::string& str_o) |
35 | { |
36 | bool ok = false; |
37 | std::string str; |
38 | if (cin->peek() == '+' || cin->peek() == '-') str += (char)cin->get(); |
39 | while (isDigit(c: cin->peek())) { ok = true; str += (char)cin->get(); } |
40 | if (ok) str_o += str; |
41 | else cin->unget(n: str.size()); |
42 | return ok; |
43 | } |
44 | |
45 | bool TokenStream::decDigits1(std::string& str_o) |
46 | { |
47 | bool ok = false; |
48 | std::string str; |
49 | while (isDigit(c: cin->peek())) { ok = true; str += (char)cin->get(); } |
50 | if (ok) str_o += str; else cin->unget(n: str.size()); |
51 | return ok; |
52 | } |
53 | |
54 | bool TokenStream::trySymbol(const std::string& symbol) |
55 | { |
56 | size_t pos = 0; |
57 | while (pos < symbol.size()) { |
58 | if (symbol[pos] != cin->peek()) { cin->unget(n: pos); return false; } |
59 | cin->drop(); pos++; |
60 | } |
61 | return true; |
62 | } |
63 | |
64 | bool TokenStream::trySymbols(Token& token, const ParseLocation& loc) |
65 | { |
66 | for (size_t i=0; i<symbols.size(); i++) { |
67 | if (!trySymbol(symbol: symbols[i])) continue; |
68 | token = Token(symbols[i],Token::TY_SYMBOL,loc); |
69 | return true; |
70 | } |
71 | return false; |
72 | } |
73 | |
74 | bool TokenStream::tryFloat(Token& token, const ParseLocation& loc) |
75 | { |
76 | bool ok = false; |
77 | std::string str; |
78 | if (trySymbol(symbol: "nan" )) { |
79 | token = Token(float(nan)); |
80 | return true; |
81 | } |
82 | if (trySymbol(symbol: "+inf" )) { |
83 | token = Token(float(pos_inf)); |
84 | return true; |
85 | } |
86 | if (trySymbol(symbol: "-inf" )) { |
87 | token = Token(float(neg_inf)); |
88 | return true; |
89 | } |
90 | |
91 | if (decDigits(str_o&: str)) |
92 | { |
93 | if (cin->peek() == '.') { |
94 | str += (char)cin->get(); |
95 | decDigits(str_o&: str); |
96 | if (cin->peek() == 'e' || cin->peek() == 'E') { |
97 | str += (char)cin->get(); |
98 | if (decDigits(str_o&: str)) ok = true; // 1.[2]E2 |
99 | } |
100 | else ok = true; // 1.[2] |
101 | } |
102 | else if (cin->peek() == 'e' || cin->peek() == 'E') { |
103 | str += (char)cin->get(); |
104 | if (decDigits(str_o&: str)) ok = true; // 1E2 |
105 | } |
106 | } |
107 | else |
108 | { |
109 | if (cin->peek() == '.') { |
110 | str += (char)cin->get(); |
111 | if (decDigits(str_o&: str)) { |
112 | if (cin->peek() == 'e' || cin->peek() == 'E') { |
113 | str += (char)cin->get(); |
114 | if (decDigits(str_o&: str)) ok = true; // .3E2 |
115 | } |
116 | else ok = true; // .3 |
117 | } |
118 | } |
119 | } |
120 | if (ok) { |
121 | token = Token((float)atof(nptr: str.c_str()),loc); |
122 | } |
123 | else cin->unget(n: str.size()); |
124 | return ok; |
125 | } |
126 | |
127 | bool TokenStream::tryInt(Token& token, const ParseLocation& loc) { |
128 | std::string str; |
129 | if (decDigits(str_o&: str)) { |
130 | token = Token(atoi(nptr: str.c_str()),loc); |
131 | return true; |
132 | } |
133 | return false; |
134 | } |
135 | |
136 | bool TokenStream::tryString(Token& token, const ParseLocation& loc) |
137 | { |
138 | std::string str; |
139 | if (cin->peek() != '\"') return false; |
140 | cin->drop(); |
141 | while (cin->peek() != '\"') { |
142 | const int c = cin->get(); |
143 | if (!isStringChar(c)) THROW_RUNTIME_ERROR("invalid string character " +std::string(1,c)+" at " +loc.str()); |
144 | str += (char)c; |
145 | } |
146 | cin->drop(); |
147 | token = Token(str,Token::TY_STRING,loc); |
148 | return true; |
149 | } |
150 | |
151 | bool TokenStream::tryIdentifier(Token& token, const ParseLocation& loc) |
152 | { |
153 | std::string str; |
154 | if (!isAlpha(c: cin->peek())) return false; |
155 | str += (char)cin->get(); |
156 | while (isAlphaNum(c: cin->peek())) str += (char)cin->get(); |
157 | token = Token(str,Token::TY_IDENTIFIER,loc); |
158 | return true; |
159 | } |
160 | |
161 | void TokenStream::skipSeparators() |
162 | { |
163 | /* skip separators */ |
164 | while (cin->peek() != EOF && isSeparator(c: cin->peek())) |
165 | cin->drop(); |
166 | } |
167 | |
168 | Token TokenStream::next() |
169 | { |
170 | Token token; |
171 | skipSeparators(); |
172 | ParseLocation loc = cin->loc(); |
173 | if (trySymbols (token,loc)) return token; /**< try to parse a symbol */ |
174 | if (tryFloat (token,loc)) return token; /**< try to parse float */ |
175 | if (tryInt (token,loc)) return token; /**< try to parse integer */ |
176 | if (tryString (token,loc)) return token; /**< try to parse string */ |
177 | if (tryIdentifier(token,loc)) return token; /**< try to parse identifier */ |
178 | if (cin->peek() == EOF ) return Token(loc); /**< return EOF token */ |
179 | return Token((char)cin->get(),loc); /**< return invalid character token */ |
180 | } |
181 | } |
182 | |