1 | // CODYlib -*- mode:c++ -*- |
2 | // Copyright (C) 2020 Nathan Sidwell, nathan@acm.org |
3 | // License: Apache v2.0 |
4 | |
5 | // Cody |
6 | #include "internal.hh" |
7 | // C++ |
8 | #include <algorithm> |
9 | // C |
10 | #include <cstring> |
11 | // OS |
12 | #include <unistd.h> |
13 | #include <cerrno> |
14 | |
15 | // MessageBuffer code |
16 | |
17 | // Lines consist of words and end with a NEWLINE (0xa) char |
18 | // Whitespace characters are TAB (0x9) and SPACE (0x20) |
19 | // Words consist of non-whitespace chars separated by whitespace. |
20 | // Multiple lines in one transaction are indicated by ending non-final |
21 | // lines with a SEMICOLON (0x3b) word, immediately before the NEWLINE |
22 | // Continuations with ; preceding it |
23 | // Words matching regexp [-+_/%.a-zA-Z0-9]+ need no quoting. |
24 | // Quoting with '...' |
25 | // Anything outside of [-+_/%.a-zA-Z0-9] needs quoting |
// Inside quotes, control chars (< SPACE), DEL, \' and \\ need escaping.
27 | // Escapes are \\, \', \n, \t, \_, everything else as \<hex><hex>? |
28 | // Spaces separate words, UTF8 encoding for non-ascii chars |
29 | |
30 | namespace Cody { |
31 | namespace Detail { |
32 | |
33 | static const char CONTINUE = S2C(s: u8";" ); |
34 | |
35 | void MessageBuffer::BeginLine () |
36 | { |
37 | if (!buffer.empty ()) |
38 | { |
39 | // Terminate the previous line with a continuation |
40 | buffer.reserve (n: buffer.size () + 3); |
41 | buffer.push_back (x: S2C(s: u8" " )); |
42 | buffer.push_back (x: CONTINUE); |
43 | buffer.push_back (x: S2C(s: u8"\n" )); |
44 | } |
45 | lastBol = buffer.size (); |
46 | } |
47 | |
48 | // QUOTE means 'maybe quote', we search it for quote-needing chars |
49 | |
50 | void MessageBuffer::Append (char const *str, bool quote, size_t len) |
51 | { |
52 | if (len == ~size_t (0)) |
53 | len = strlen (s: str); |
54 | |
55 | if (!len && !quote) |
56 | return; |
57 | |
58 | // We want to quote characters outside of [-+_A-Za-z0-9/%.], anything |
59 | // that could remotely be shell-active. UTF8 encoding for non-ascii. |
60 | if (quote && len) |
61 | { |
62 | quote = false; |
63 | // Scan looking for quote-needing characters. We could just |
64 | // append until we find one, but that's probably confusing |
65 | for (size_t ix = len; ix--;) |
66 | { |
67 | unsigned char c = (unsigned char)str[ix]; |
68 | if (!((c >= S2C(s: u8"a" ) && c <= S2C(s: u8"z" )) |
69 | || (c >= S2C(s: u8"A" ) && c <= S2C(s: u8"Z" )) |
70 | || (c >= S2C(s: u8"0" ) && c <= S2C(s: u8"9" )) |
71 | || c == S2C(s: u8"-" ) || c == S2C(s: u8"+" ) || c == S2C(s: u8"_" ) |
72 | || c == S2C(s: u8"/" ) || c == S2C(s: u8"%" ) || c == S2C(s: u8"." ))) |
73 | { |
74 | quote = true; |
75 | break; |
76 | } |
77 | } |
78 | } |
79 | |
80 | // Maximal length of appended string |
81 | buffer.reserve (n: buffer.size () + len * (quote ? 3 : 1) + 2); |
82 | |
83 | if (quote) |
84 | buffer.push_back (x: S2C(s: u8"'" )); |
85 | |
86 | for (auto *end = str + len; str != end;) |
87 | { |
88 | auto *e = end; |
89 | |
90 | if (quote) |
91 | // Look for next escape-needing char. More relaxed than |
92 | // the earlier needs-quoting check. |
93 | for (e = str; e != end; ++e) |
94 | { |
95 | unsigned char c = (unsigned char)*e; |
96 | if (c < S2C(s: u8" " ) || c == 0x7f |
97 | || c == S2C(s: u8"\\" ) || c == S2C(s: u8"'" )) |
98 | break; |
99 | } |
100 | buffer.insert (position: buffer.end (), first: str, last: e); |
101 | str = e; |
102 | |
103 | if (str == end) |
104 | break; |
105 | |
106 | buffer.push_back (x: S2C(s: u8"\\" )); |
107 | switch (unsigned char c = (unsigned char)*str++) |
108 | { |
109 | case S2C(s: u8"\t" ): |
110 | c = S2C(s: u8"t" ); |
111 | goto append; |
112 | |
113 | case S2C(s: u8"\n" ): |
114 | c = S2C(s: u8"n" ); |
115 | goto append; |
116 | |
117 | case S2C(s: u8"'" ): |
118 | case S2C(s: u8"\\" ): |
119 | append: |
120 | buffer.push_back (x: c); |
121 | break; |
122 | |
123 | default: |
124 | // Full-on escape. Use 2 lower-case hex chars |
125 | for (unsigned shift = 8; shift;) |
126 | { |
127 | shift -= 4; |
128 | |
129 | char nibble = (c >> shift) & 0xf; |
130 | nibble += S2C(s: u8"0" ); |
131 | if (nibble > S2C(s: u8"9" )) |
132 | nibble += S2C(s: u8"a" ) - (S2C(s: u8"9" ) + 1); |
133 | buffer.push_back (x: nibble); |
134 | } |
135 | } |
136 | } |
137 | |
138 | if (quote) |
139 | buffer.push_back (x: S2C(s: u8"'" )); |
140 | } |
141 | |
142 | void MessageBuffer::Append (char c) |
143 | { |
144 | buffer.push_back (x: c); |
145 | } |
146 | |
147 | void MessageBuffer::AppendInteger (unsigned u) |
148 | { |
149 | // Sigh, even though std::to_string is C++11, we support building on |
150 | // gcc 4.8, which is a C++11 compiler lacking std::to_string. so |
151 | // have something horrible. |
152 | std::string v (20, 0); |
153 | size_t len = snprintf (s: const_cast<char *> (v.data ()), maxlen: v.size (), format: "%u" , u); |
154 | v.erase (pos: len); |
155 | |
156 | AppendWord (str: v); |
157 | } |
158 | |
159 | int MessageBuffer::Write (int fd) noexcept |
160 | { |
161 | size_t limit = buffer.size () - lastBol; |
162 | ssize_t count = write (fd: fd, buf: &buffer.data ()[lastBol], n: limit); |
163 | |
164 | int err = 0; |
165 | if (count < 0) |
166 | err = errno; |
167 | else |
168 | { |
169 | lastBol += count; |
170 | if (size_t (count) != limit) |
171 | err = EAGAIN; |
172 | } |
173 | |
174 | if (err != EAGAIN && err != EINTR) |
175 | { |
176 | // Reset for next message |
177 | buffer.clear (); |
178 | lastBol = 0; |
179 | } |
180 | |
181 | return err; |
182 | } |
183 | |
184 | int MessageBuffer::Read (int fd) noexcept |
185 | { |
186 | constexpr size_t blockSize = 200; |
187 | |
188 | size_t lwm = buffer.size (); |
189 | size_t hwm = buffer.capacity (); |
190 | if (hwm - lwm < blockSize / 2) |
191 | hwm += blockSize; |
192 | buffer.resize (new_size: hwm); |
193 | |
194 | auto iter = buffer.begin () + lwm; |
195 | ssize_t count = read (fd: fd, buf: &*iter, nbytes: hwm - lwm); |
196 | buffer.resize (new_size: lwm + (count >= 0 ? count : 0)); |
197 | |
198 | if (count < 0) |
199 | return errno; |
200 | |
201 | if (!count) |
202 | // End of file |
203 | return -1; |
204 | |
205 | bool more = true; |
206 | for (;;) |
207 | { |
208 | auto newline = std::find (first: iter, last: buffer.end (), val: S2C(s: u8"\n" )); |
209 | if (newline == buffer.end ()) |
210 | break; |
211 | more = newline != buffer.begin () && newline[-1] == CONTINUE; |
212 | iter = newline + 1; |
213 | |
214 | if (iter == buffer.end ()) |
215 | break; |
216 | |
217 | if (!more) |
218 | { |
219 | // There is no continuation, but there are chars after the |
220 | // newline. Truncate the buffer and return an error |
221 | buffer.resize (new_size: iter - buffer.begin ()); |
222 | return EINVAL; |
223 | } |
224 | } |
225 | |
226 | return more ? EAGAIN : 0; |
227 | } |
228 | |
229 | int MessageBuffer::Lex (std::vector<std::string> &result) |
230 | { |
231 | result.clear (); |
232 | |
233 | if (IsAtEnd ()) |
234 | return ENOENT; |
235 | |
236 | Assert (buffer.back () == S2C(u8"\n" )); |
237 | |
238 | auto iter = buffer.begin () + lastBol; |
239 | |
240 | for (std::string *word = nullptr;;) |
241 | { |
242 | char c = *iter; |
243 | |
244 | ++iter; |
245 | if (c == S2C(s: u8" " ) || c == S2C(s: u8"\t" )) |
246 | { |
247 | word = nullptr; |
248 | continue; |
249 | } |
250 | |
251 | if (c == S2C(s: u8"\n" )) |
252 | break; |
253 | |
254 | if (c == CONTINUE) |
255 | { |
256 | // Line continuation |
257 | if (word || *iter != S2C(s: u8"\n" )) |
258 | goto malformed; |
259 | ++iter; |
260 | break; |
261 | } |
262 | |
263 | if (c <= S2C(s: u8" " ) || c >= 0x7f) |
264 | goto malformed; |
265 | |
266 | if (!word) |
267 | { |
268 | result.emplace_back (); |
269 | word = &result.back (); |
270 | } |
271 | |
272 | if (c == S2C(s: u8"'" )) |
273 | { |
274 | // Quoted word |
275 | for (;;) |
276 | { |
277 | c = *iter; |
278 | |
279 | if (c == S2C(s: u8"\n" )) |
280 | { |
281 | malformed:; |
282 | result.clear (); |
283 | iter = std::find (first: iter, last: buffer.end (), val: S2C(s: u8"\n" )); |
284 | auto back = iter; |
285 | if (back[-1] == CONTINUE && back[-2] == S2C(s: u8" " )) |
286 | // Smells like a line continuation |
287 | back -= 2; |
288 | result.emplace_back (args: &buffer[lastBol], |
289 | args: back - buffer.begin () - lastBol); |
290 | ++iter; |
291 | lastBol = iter - buffer.begin (); |
292 | return EINVAL; |
293 | } |
294 | |
295 | if (c < S2C(s: u8" " ) || c >= 0x7f) |
296 | goto malformed; |
297 | |
298 | ++iter; |
299 | if (c == S2C(s: u8"'" )) |
300 | break; |
301 | |
302 | if (c == S2C(s: u8"\\" )) |
303 | // escape |
304 | switch (c = *iter) |
305 | { |
306 | case S2C(s: u8"\\" ): |
307 | case S2C(s: u8"'" ): |
308 | ++iter; |
309 | break; |
310 | |
311 | case S2C(s: u8"n" ): |
312 | c = S2C(s: u8"\n" ); |
313 | ++iter; |
314 | break; |
315 | |
316 | case S2C(s: u8"_" ): |
317 | // We used to escape SPACE as \_, so accept that |
318 | c = S2C(s: u8" " ); |
319 | ++iter; |
320 | break; |
321 | |
322 | case S2C(s: u8"t" ): |
323 | c = S2C(s: u8"\t" ); |
324 | ++iter; |
325 | break; |
326 | |
327 | default: |
328 | { |
329 | unsigned v = 0; |
330 | for (unsigned nibble = 0; nibble != 2; nibble++) |
331 | { |
332 | c = *iter; |
333 | if (c < S2C(s: u8"0" )) |
334 | { |
335 | if (!nibble) |
336 | goto malformed; |
337 | break; |
338 | } |
339 | else if (c <= S2C(s: u8"9" )) |
340 | c -= S2C(s: u8"0" ); |
341 | else if (c < S2C(s: u8"a" )) |
342 | { |
343 | if (!nibble) |
344 | goto malformed; |
345 | break; |
346 | } |
347 | else if (c <= S2C(s: u8"f" )) |
348 | c -= S2C(s: u8"a" ) - 10; |
349 | else |
350 | { |
351 | if (!nibble) |
352 | goto malformed; |
353 | break; |
354 | } |
355 | ++iter; |
356 | v = (v << 4) | c; |
357 | } |
358 | c = v; |
359 | } |
360 | } |
361 | word->push_back (c: c); |
362 | } |
363 | } |
364 | else |
365 | // Unquoted character |
366 | word->push_back (c: c); |
367 | } |
368 | lastBol = iter - buffer.begin (); |
369 | if (result.empty ()) |
370 | return ENOENT; |
371 | |
372 | return 0; |
373 | } |
374 | |
375 | void MessageBuffer::LexedLine (std::string &str) |
376 | { |
377 | if (lastBol) |
378 | { |
379 | size_t pos = lastBol - 1; |
380 | for (; pos; pos--) |
381 | if (buffer[pos-1] == S2C(s: u8"\n" )) |
382 | break; |
383 | |
384 | size_t end = lastBol - 1; |
385 | if (buffer[end-1] == CONTINUE && buffer[end-2] == S2C(s: u8" " )) |
386 | // Strip line continuation |
387 | end -= 2; |
388 | str.append (s: &buffer[pos], n: end - pos); |
389 | } |
390 | } |
391 | } // Detail |
392 | } // Cody |
393 | |