1 | //======================================================================== |
2 | // |
3 | // UTF.cc |
4 | // |
5 | // Copyright 2001-2003 Glyph & Cog, LLC |
6 | // |
7 | //======================================================================== |
8 | |
9 | //======================================================================== |
10 | // |
11 | // Modified under the Poppler project - http://poppler.freedesktop.org |
12 | // |
13 | // All changes made under the Poppler project to this file are licensed |
14 | // under GPL version 2 or later |
15 | // |
16 | // Copyright (C) 2008 Koji Otani <sho@bbr.jp> |
17 | // Copyright (C) 2012, 2017, 2021, 2023 Adrian Johnson <ajohnson@redneon.com> |
18 | // Copyright (C) 2012 Hib Eris <hib@hiberis.nl> |
19 | // Copyright (C) 2016, 2018-2022, 2024 Albert Astals Cid <aacid@kde.org> |
20 | // Copyright (C) 2016 Jason Crain <jason@aquaticape.us> |
21 | // Copyright (C) 2018 Klarälvdalens Datakonsult AB, a KDAB Group company, <info@kdab.com>. Work sponsored by the LiMux project of the city of Munich |
22 | // Copyright (C) 2018, 2020 Nelson Benítez León <nbenitezl@gmail.com> |
23 | // Copyright (C) 2021 Georgiy Sgibnev <georgiy@sgibnev.com>. Work sponsored by lab50.net. |
24 | // Copyright (C) 2023, 2024 g10 Code GmbH, Author: Sune Stolborg Vuorela <sune@vuorela.dk> |
25 | // Copyright (C) 2023 Even Rouault <even.rouault@spatialys.com> |
26 | // Copyright (C) 2023, 2024 Oliver Sander <oliver.sander@tu-dresden.de> |
27 | // |
28 | // To see a description of the changes please see the Changelog file that |
29 | // came with your tarball or type make ChangeLog if you are building from git |
30 | // |
31 | //======================================================================== |
32 | |
33 | #include "goo/gmem.h" |
34 | #include "PDFDocEncoding.h" |
35 | #include "GlobalParams.h" |
36 | #include "UnicodeMap.h" |
37 | #include "UTF.h" |
38 | #include "UnicodeMapFuncs.h" |
39 | #include <algorithm> |
40 | |
41 | #include <config.h> |
42 | |
43 | std::vector<Unicode> UTF16toUCS4(const Unicode *utf16, int utf16Len) |
44 | { |
45 | // count characters |
46 | int len = 0; |
47 | for (int i = 0; i < utf16Len; i++) { |
48 | if (utf16[i] >= 0xd800 && utf16[i] < 0xdc00 && i + 1 < utf16Len && utf16[i + 1] >= 0xdc00 && utf16[i + 1] < 0xe000) { |
49 | i++; /* surrogate pair */ |
50 | } |
51 | len++; |
52 | } |
53 | std::vector<Unicode> u; |
54 | u.reserve(n: len); |
55 | // convert string |
56 | for (int i = 0; i < utf16Len; i++) { |
57 | if (utf16[i] >= 0xd800 && utf16[i] < 0xdc00) { /* surrogate pair */ |
58 | if (i + 1 < utf16Len && utf16[i + 1] >= 0xdc00 && utf16[i + 1] < 0xe000) { |
59 | /* next code is a low surrogate */ |
60 | u.push_back(x: (((utf16[i] & 0x3ff) << 10) | (utf16[i + 1] & 0x3ff)) + 0x10000); |
61 | ++i; |
62 | } else { |
63 | /* missing low surrogate |
64 | replace it with REPLACEMENT CHARACTER (U+FFFD) */ |
65 | u.push_back(x: 0xfffd); |
66 | } |
67 | } else if (utf16[i] >= 0xdc00 && utf16[i] < 0xe000) { |
68 | /* invalid low surrogate |
69 | replace it with REPLACEMENT CHARACTER (U+FFFD) */ |
70 | u.push_back(x: 0xfffd); |
71 | } else { |
72 | u.push_back(x: utf16[i]); |
73 | } |
74 | if (!UnicodeIsValid(ucs4: u.back())) { |
75 | u.back() = 0xfffd; |
76 | } |
77 | } |
78 | return u; |
79 | } |
80 | |
81 | std::vector<Unicode> TextStringToUCS4(const std::string &textStr) |
82 | { |
83 | bool isUnicode, isUnicodeLE; |
84 | |
85 | int len = textStr.size(); |
86 | const std::string &s = textStr; |
87 | if (len == 0) { |
88 | return {}; |
89 | } |
90 | |
91 | if (hasUnicodeByteOrderMark(s: textStr)) { |
92 | isUnicode = true; |
93 | isUnicodeLE = false; |
94 | } else if (hasUnicodeByteOrderMarkLE(s: textStr)) { |
95 | isUnicode = false; |
96 | isUnicodeLE = true; |
97 | } else { |
98 | isUnicode = false; |
99 | isUnicodeLE = false; |
100 | } |
101 | |
102 | if (isUnicode || isUnicodeLE) { |
103 | len = len / 2 - 1; |
104 | if (len > 0) { |
105 | std::vector<Unicode> utf16; |
106 | utf16.reserve(n: len); |
107 | for (int i = 0; i < len; i++) { |
108 | if (isUnicode) { |
109 | utf16.push_back(x: (s[2 + i * 2] & 0xff) << 8 | (s[3 + i * 2] & 0xff)); |
110 | } else { // UnicodeLE |
111 | utf16.push_back(x: (s[3 + i * 2] & 0xff) << 8 | (s[2 + i * 2] & 0xff)); |
112 | } |
113 | } |
114 | return UTF16toUCS4(utf16: utf16.data(), utf16Len: utf16.size()); |
115 | |
116 | } else { |
117 | return {}; |
118 | } |
119 | } else { |
120 | std::vector<Unicode> u; |
121 | u.reserve(n: len); |
122 | for (int i = 0; i < len; i++) { |
123 | u.push_back(x: pdfDocEncoding[s[i] & 0xff]); |
124 | } |
125 | return u; |
126 | } |
127 | } |
128 | |
129 | bool UnicodeIsWhitespace(Unicode ucs4) |
130 | { |
131 | static Unicode const spaces[] = { 0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x0020, 0x0085, 0x00A0, 0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, 0x2006, 0x2007, 0x2008, 0x2009, 0x200A, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000 }; |
132 | Unicode const *end = spaces + sizeof(spaces) / sizeof(spaces[0]); |
133 | Unicode const *i = std::lower_bound(first: spaces, last: end, val: ucs4); |
134 | return (i != end && *i == ucs4); |
135 | } |
136 | |
137 | // |
138 | // decodeUtf8() and decodeUtf8Table are: |
139 | // |
140 | // Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de> |
141 | // |
142 | // Permission is hereby granted, free of charge, to any person |
143 | // obtaining a copy of this software and associated documentation |
144 | // files (the "Software"), to deal in the Software without |
145 | // restriction, including without limitation the rights to use, copy, |
146 | // modify, merge, publish, distribute, sublicense, and/or sell copies |
147 | // of the Software, and to permit persons to whom the Software is |
148 | // furnished to do so, subject to the following conditions: |
149 | |
150 | // The above copyright notice and this permission notice shall be |
151 | // included in all copies or substantial portions of the Software. |
152 | // |
153 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, |
154 | // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF |
155 | // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND |
156 | // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS |
157 | // BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN |
158 | // ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN |
159 | // CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
160 | // SOFTWARE. |
161 | // |
162 | // See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. |
163 | // |
164 | static const uint32_t UTF8_ACCEPT = 0; |
165 | static const uint32_t UTF8_REJECT = 12; |
166 | static const uint32_t UCS4_MAX = 0x10FFFF; |
167 | static const Unicode REPLACEMENT_CHAR = 0xFFFD; |
168 | |
169 | // clang-format off |
170 | static const uint8_t decodeUtf8Table[] = { |
171 | // The first part of the table maps bytes to character classes |
172 | // to reduce the size of the transition table and create bitmasks. |
173 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f |
174 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f |
175 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f |
176 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f |
177 | 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f |
178 | 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf |
179 | 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df |
180 | 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, // e0..ff |
181 | |
182 | // The second part is a transition table that maps a combination |
183 | // of a state of the automaton and a character class to a state. |
184 | 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12, |
185 | 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12, |
186 | 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12, |
187 | 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12, |
188 | 12,36,12,12,12,12,12,12,12,12,12,12, |
189 | }; |
190 | // clang-format on |
191 | |
192 | // Decode utf8 state machine for fast UTF-8 decoding. Initialise state |
193 | // to 0 and call decodeUtf8() for each byte of UTF-8. Return value |
194 | // (and state) is UTF8_ACCEPT when it has found a valid codepoint |
195 | // (codepoint returned in codep), UTF8_REJECT when the byte is not |
196 | // allowed to occur at its position, and some other positive value if |
197 | // more bytes have to be read. Reset state to 0 to recover from |
198 | // errors. |
199 | inline uint32_t decodeUtf8(uint32_t *state, uint32_t *codep, char byte) |
200 | { |
201 | uint32_t b = (unsigned char)byte; |
202 | uint32_t type = decodeUtf8Table[b]; |
203 | |
204 | *codep = (*state != UTF8_ACCEPT) ? (b & 0x3fu) | (*codep << 6) : (0xff >> type) & (b); |
205 | |
206 | *state = decodeUtf8Table[256 + *state + type]; |
207 | return *state; |
208 | } |
209 | |
210 | int utf8CountUCS4(const char *utf8) |
211 | { |
212 | uint32_t codepoint; |
213 | uint32_t state = 0; |
214 | int count = 0; |
215 | |
216 | while (*utf8) { |
217 | decodeUtf8(state: &state, codep: &codepoint, byte: *utf8); |
218 | if (state == UTF8_ACCEPT) { |
219 | count++; |
220 | } else if (state == UTF8_REJECT) { |
221 | count++; // replace with REPLACEMENT_CHAR |
222 | state = 0; |
223 | } |
224 | utf8++; |
225 | } |
226 | if (state != UTF8_ACCEPT && state != UTF8_REJECT) { |
227 | count++; // replace with REPLACEMENT_CHAR |
228 | } |
229 | |
230 | return count; |
231 | } |
232 | |
233 | int utf8ToUCS4(const char *utf8, Unicode **ucs4_out) |
234 | { |
235 | int len = utf8CountUCS4(utf8); |
236 | Unicode *u = (Unicode *)gmallocn(count: len, size: sizeof(Unicode)); |
237 | int n = 0; |
238 | uint32_t codepoint; |
239 | uint32_t state = 0; |
240 | |
241 | while (*utf8 && n < len) { |
242 | decodeUtf8(state: &state, codep: &codepoint, byte: *utf8); |
243 | if (state == UTF8_ACCEPT) { |
244 | u[n++] = codepoint; |
245 | } else if (state == UTF8_REJECT) { |
246 | u[n++] = REPLACEMENT_CHAR; // invalid byte for this position |
247 | state = 0; |
248 | } |
249 | utf8++; |
250 | } |
251 | if (state != UTF8_ACCEPT && state != UTF8_REJECT) { |
252 | u[n] = REPLACEMENT_CHAR; // invalid byte for this position |
253 | } |
254 | |
255 | *ucs4_out = u; |
256 | return len; |
257 | } |
258 | |
259 | // Count number of UTF-16 code units required to convert a UTF-8 string |
260 | // (excluding terminating NULL). Each invalid byte is counted as a |
261 | // code point since the UTF-8 conversion functions will replace it with |
262 | // REPLACEMENT_CHAR. |
263 | int utf8CountUtf16CodeUnits(const char *utf8) |
264 | { |
265 | uint32_t codepoint; |
266 | uint32_t state = 0; |
267 | int count = 0; |
268 | |
269 | while (*utf8) { |
270 | decodeUtf8(state: &state, codep: &codepoint, byte: *utf8); |
271 | if (state == UTF8_ACCEPT) { |
272 | if (codepoint < 0x10000) { |
273 | count++; |
274 | } else if (codepoint <= UCS4_MAX) { |
275 | count += 2; |
276 | } else { |
277 | count++; // replace with REPLACEMENT_CHAR |
278 | } |
279 | } else if (state == UTF8_REJECT) { |
280 | count++; // replace with REPLACEMENT_CHAR |
281 | state = 0; |
282 | } |
283 | utf8++; |
284 | } |
285 | if (state != UTF8_ACCEPT && state != UTF8_REJECT) { |
286 | count++; // replace with REPLACEMENT_CHAR |
287 | } |
288 | |
289 | return count; |
290 | } |
291 | |
292 | // Convert UTF-8 to UTF-16 |
293 | // utf8- UTF-8 string to convert. If not null terminated, set maxUtf8 to num |
294 | // bytes to convert |
295 | // utf16 - output buffer to write UTF-16 to. Output will always be null terminated. |
296 | // maxUtf16 - maximum size of output buffer including space for null. |
297 | // maxUtf8 - maximum number of UTF-8 bytes to convert. Conversion stops when |
298 | // either this count is reached or a null is encountered. |
299 | // Returns number of UTF-16 code units written (excluding NULL). |
300 | int utf8ToUtf16(const char *utf8, uint16_t *utf16, int maxUtf16, int maxUtf8) |
301 | { |
302 | uint16_t *p = utf16; |
303 | uint32_t codepoint; |
304 | uint32_t state = 0; |
305 | int nIn = 0; |
306 | int nOut = 0; |
307 | while (*utf8 && nIn < maxUtf8 && nOut < maxUtf16 - 1) { |
308 | decodeUtf8(state: &state, codep: &codepoint, byte: *utf8); |
309 | if (state == UTF8_ACCEPT) { |
310 | if (codepoint < 0x10000) { |
311 | *p++ = (uint16_t)codepoint; |
312 | nOut++; |
313 | } else if (codepoint <= UCS4_MAX) { |
314 | *p++ = (uint16_t)(0xD7C0 + (codepoint >> 10)); |
315 | *p++ = (uint16_t)(0xDC00 + (codepoint & 0x3FF)); |
316 | nOut += 2; |
317 | } else { |
318 | *p++ = REPLACEMENT_CHAR; |
319 | nOut++; |
320 | state = 0; |
321 | } |
322 | } else if (state == UTF8_REJECT) { |
323 | *p++ = REPLACEMENT_CHAR; // invalid byte for this position |
324 | nOut++; |
325 | } |
326 | utf8++; |
327 | nIn++; |
328 | } |
329 | // replace any trailing bytes too short for a valid UTF-8 with a replacement char |
330 | if (state != UTF8_ACCEPT && state != UTF8_REJECT && nOut < maxUtf16 - 1) { |
331 | *p++ = REPLACEMENT_CHAR; |
332 | nOut++; |
333 | } |
334 | if (nOut > maxUtf16 - 1) { |
335 | nOut = maxUtf16 - 1; |
336 | } |
337 | utf16[nOut] = 0; |
338 | return nOut; |
339 | } |
340 | |
341 | // Allocate utf16 string and convert utf8 into it. |
342 | uint16_t *utf8ToUtf16(const char *utf8, int *len) |
343 | { |
344 | if (isUtf8WithBom(str: utf8)) { |
345 | utf8 += 3; |
346 | } |
347 | int n = utf8CountUtf16CodeUnits(utf8); |
348 | if (len) { |
349 | *len = n; |
350 | } |
351 | uint16_t *utf16 = (uint16_t *)gmallocn(count: n + 1, size: sizeof(uint16_t)); |
352 | utf8ToUtf16(utf8, utf16, maxUtf16: n + 1, INT_MAX); |
353 | return utf16; |
354 | } |
355 | |
356 | std::string utf8ToUtf16WithBom(const std::string &utf8) |
357 | { |
358 | if (utf8.empty()) { |
359 | return {}; |
360 | } |
361 | int tmp_length; // Number of UTF-16 symbols. |
362 | char *tmp_str = (char *)utf8ToUtf16(utf8: utf8.c_str(), len: &tmp_length); |
363 | #ifndef WORDS_BIGENDIAN |
364 | for (int i = 0; i < tmp_length; i++) { |
365 | std::swap(a&: tmp_str[i * 2], b&: tmp_str[i * 2 + 1]); |
366 | } |
367 | #endif |
368 | |
369 | std::string result(unicodeByteOrderMark); |
370 | result.append(s: tmp_str, n: tmp_length * 2); |
371 | gfree(p: tmp_str); |
372 | return result; |
373 | } |
374 | |
// decodeUtf16() state reached when a complete code point has been decoded;
// also the state the decoder starts in.
static const uint32_t UTF16_ACCEPT = 0;
// decodeUtf16() state reached when a code unit is not valid at its position.
static const uint32_t UTF16_REJECT = 0xffffffff;

// One step of the UTF-16 decoder.
//  state     - decoder state, updated in place; initialise to 0
//  codePoint - receives the decoded code point when one is completed
//  codeUnit  - next input code unit
// Returns the new state, which is also stored in *state (callers may test
// either one):
//  UTF16_ACCEPT - *codePoint holds a valid code point
//  UTF16_REJECT - codeUnit is invalid for the current state (a low surrogate
//                 with no high surrogate before it, or a high surrogate not
//                 followed by a low one); reset *state to 0 before feeding
//                 further input. The rejected code unit is consumed.
//  any other    - a high surrogate has been stored and another code unit
//                 needs to be read
inline uint32_t decodeUtf16(uint32_t *state, uint32_t *codePoint, uint16_t codeUnit)
{
    const bool isHighSurrogate = codeUnit >= 0xd800 && codeUnit < 0xdc00;
    const bool isLowSurrogate = codeUnit >= 0xdc00 && codeUnit < 0xe000;

    if (*state == UTF16_ACCEPT) {
        if (isHighSurrogate) {
            // first half of a surrogate pair: keep it until the second arrives
            *state = codeUnit;
        } else if (isLowSurrogate) {
            // invalid low surrogate
            *state = UTF16_REJECT;
        } else {
            *codePoint = codeUnit;
        }
    } else if (isLowSurrogate) {
        // *state holds the pending high surrogate
        *codePoint = (((*state & 0x3ff) << 10) | (codeUnit & 0x3ff)) + 0x10000;
        *state = UTF16_ACCEPT;
    } else {
        // invalid high surrogate
        *state = UTF16_REJECT;
    }
    return *state;
}
405 | |
406 | // Count number of UTF-8 bytes required to convert a UTF-16 string to |
407 | // UTF-8 (excluding terminating NULL). |
408 | int utf16CountUtf8Bytes(const uint16_t *utf16) |
409 | { |
410 | uint32_t codepoint = 0; |
411 | uint32_t state = 0; |
412 | int count = 0; |
413 | |
414 | while (*utf16) { |
415 | decodeUtf16(state: &state, codePoint: &codepoint, codeUnit: *utf16); |
416 | if (state == UTF16_ACCEPT) { |
417 | if (codepoint < 0x80) { |
418 | count++; |
419 | } else if (codepoint < 0x800) { |
420 | count += 2; |
421 | } else if (codepoint < 0x10000) { |
422 | count += 3; |
423 | } else if (codepoint <= UCS4_MAX) { |
424 | count += 4; |
425 | } else { |
426 | count += 3; // replace with REPLACEMENT_CHAR |
427 | } |
428 | } else if (state == UTF16_REJECT) { |
429 | count += 3; // replace with REPLACEMENT_CHAR |
430 | state = 0; |
431 | } |
432 | utf16++; |
433 | } |
434 | if (state != UTF8_ACCEPT && state != UTF8_REJECT) { |
435 | count++; // replace with REPLACEMENT_CHAR |
436 | } |
437 | |
438 | return count; |
439 | } |
440 | |
441 | // Convert UTF-16 to UTF-8 |
442 | // utf16- UTF-16 string to convert. If not null terminated, set maxUtf16 to num |
443 | // code units to convert |
444 | // utf8 - output buffer to write UTF-8 to. Output will always be null terminated. |
445 | // maxUtf8 - maximum size of output buffer including space for null. |
446 | // maxUtf16 - maximum number of UTF-16 code units to convert. Conversion stops when |
447 | // either this count is reached or a null is encountered. |
448 | // Returns number of UTF-8 bytes written (excluding NULL). |
449 | int utf16ToUtf8(const uint16_t *utf16, char *utf8, int maxUtf8, int maxUtf16) |
450 | { |
451 | uint32_t codepoint = 0; |
452 | uint32_t state = 0; |
453 | int nIn = 0; |
454 | int nOut = 0; |
455 | char *p = utf8; |
456 | while (*utf16 && nIn < maxUtf16 && nOut < maxUtf8 - 1) { |
457 | decodeUtf16(state: &state, codePoint: &codepoint, codeUnit: *utf16); |
458 | if (state == UTF16_ACCEPT || state == UTF16_REJECT) { |
459 | if (state == UTF16_REJECT || codepoint > UCS4_MAX) { |
460 | codepoint = REPLACEMENT_CHAR; |
461 | state = 0; |
462 | } |
463 | |
464 | int bufSize = maxUtf8 - nOut; |
465 | int count = mapUTF8(u: codepoint, buf: p, bufSize); |
466 | p += count; |
467 | nOut += count; |
468 | } |
469 | utf16++; |
470 | nIn++; |
471 | } |
472 | // replace any trailing bytes too short for a valid UTF-8 with a replacement char |
473 | if (state != UTF16_ACCEPT && state != UTF16_REJECT && nOut < maxUtf8 - 1) { |
474 | int bufSize = maxUtf8 - nOut; |
475 | int count = mapUTF8(u: REPLACEMENT_CHAR, buf: p, bufSize); |
476 | p += count; |
477 | nOut += count; |
478 | nOut++; |
479 | } |
480 | if (nOut > maxUtf8 - 1) { |
481 | nOut = maxUtf8 - 1; |
482 | } |
483 | utf8[nOut] = 0; |
484 | return nOut; |
485 | } |
486 | |
487 | // Allocate utf8 string and convert utf16 into it. |
488 | char *utf16ToUtf8(const uint16_t *utf16, int *len) |
489 | { |
490 | int n = utf16CountUtf8Bytes(utf16); |
491 | if (len) { |
492 | *len = n; |
493 | } |
494 | char *utf8 = (char *)gmalloc(size: n + 1); |
495 | utf16ToUtf8(utf16, utf8); |
496 | return utf8; |
497 | } |
498 | |
499 | void unicodeToAscii7(const Unicode *in, int len, Unicode **ucs4_out, int *out_len, const int *in_idx, int **indices) |
500 | { |
501 | const UnicodeMap *uMap = globalParams->getUnicodeMap(encodingName: "ASCII7" ); |
502 | int *idx = nullptr; |
503 | |
504 | if (!len) { |
505 | *ucs4_out = nullptr; |
506 | *out_len = 0; |
507 | return; |
508 | } |
509 | |
510 | if (indices) { |
511 | if (!in_idx) { |
512 | indices = nullptr; |
513 | } else { |
514 | idx = (int *)gmallocn(count: len * 8 + 1, size: sizeof(int)); |
515 | } |
516 | } |
517 | |
518 | std::string str; |
519 | |
520 | char buf[8]; // 8 is enough for mapping an unicode char to a string |
521 | int i, n, k; |
522 | |
523 | for (i = k = 0; i < len; ++i) { |
524 | n = uMap->mapUnicode(u: in[i], buf, bufSize: sizeof(buf)); |
525 | if (!n) { |
526 | // the Unicode char could not be converted to ascii7 counterpart |
527 | // so just fill with a non-printable ascii char |
528 | buf[0] = 31; |
529 | n = 1; |
530 | } |
531 | str.append(s: buf, n: n); |
532 | if (indices) { |
533 | for (; n > 0; n--) { |
534 | idx[k++] = in_idx[i]; |
535 | } |
536 | } |
537 | } |
538 | |
539 | std::vector<Unicode> ucs4 = TextStringToUCS4(textStr: str); |
540 | *out_len = ucs4.size(); |
541 | *ucs4_out = (Unicode *)gmallocn(count: ucs4.size(), size: sizeof(Unicode)); |
542 | memcpy(dest: *ucs4_out, src: ucs4.data(), n: ucs4.size() * sizeof(Unicode)); |
543 | |
544 | if (indices) { |
545 | idx[k] = in_idx[len]; |
546 | *indices = idx; |
547 | } |
548 | } |
549 | |
550 | // Convert a PDF Text String to UTF-8 |
551 | // textStr - PDF text string |
552 | // returns UTF-8 string. |
553 | std::string TextStringToUtf8(const std::string &textStr) |
554 | { |
555 | int i, len; |
556 | const char *s; |
557 | char *utf8; |
558 | |
559 | len = textStr.size(); |
560 | s = textStr.c_str(); |
561 | if (hasUnicodeByteOrderMark(s: textStr)) { |
562 | uint16_t *utf16; |
563 | len = len / 2 - 1; |
564 | utf16 = new uint16_t[len + 1]; |
565 | for (i = 0; i < len; i++) { |
566 | utf16[i] = (s[2 + i * 2] & 0xff) << 8 | (s[3 + i * 2] & 0xff); |
567 | } |
568 | utf16[i] = 0; |
569 | utf8 = utf16ToUtf8(utf16); |
570 | delete[] utf16; |
571 | } else { |
572 | utf8 = (char *)gmalloc(size: len + 1); |
573 | for (i = 0; i < len; i++) { |
574 | utf8[i] = pdfDocEncoding[s[i] & 0xff]; |
575 | } |
576 | utf8[i] = 0; |
577 | } |
578 | std::string utf8_string(utf8); |
579 | gfree(p: utf8); |
580 | return utf8_string; |
581 | } |
582 | |