1//========================================================================
2//
3// UTF.cc
4//
5// Copyright 2001-2003 Glyph & Cog, LLC
6//
7//========================================================================
8
9//========================================================================
10//
11// Modified under the Poppler project - http://poppler.freedesktop.org
12//
13// All changes made under the Poppler project to this file are licensed
14// under GPL version 2 or later
15//
16// Copyright (C) 2008 Koji Otani <sho@bbr.jp>
17// Copyright (C) 2012, 2017, 2021, 2023 Adrian Johnson <ajohnson@redneon.com>
18// Copyright (C) 2012 Hib Eris <hib@hiberis.nl>
19// Copyright (C) 2016, 2018-2022, 2024 Albert Astals Cid <aacid@kde.org>
20// Copyright (C) 2016 Jason Crain <jason@aquaticape.us>
21// Copyright (C) 2018 Klarälvdalens Datakonsult AB, a KDAB Group company, <info@kdab.com>. Work sponsored by the LiMux project of the city of Munich
22// Copyright (C) 2018, 2020 Nelson Benítez León <nbenitezl@gmail.com>
23// Copyright (C) 2021 Georgiy Sgibnev <georgiy@sgibnev.com>. Work sponsored by lab50.net.
24// Copyright (C) 2023, 2024 g10 Code GmbH, Author: Sune Stolborg Vuorela <sune@vuorela.dk>
25// Copyright (C) 2023 Even Rouault <even.rouault@spatialys.com>
26// Copyright (C) 2023, 2024 Oliver Sander <oliver.sander@tu-dresden.de>
27//
28// To see a description of the changes please see the Changelog file that
29// came with your tarball or type make ChangeLog if you are building from git
30//
31//========================================================================
32
33#include "goo/gmem.h"
34#include "PDFDocEncoding.h"
35#include "GlobalParams.h"
36#include "UnicodeMap.h"
37#include "UTF.h"
38#include "UnicodeMapFuncs.h"
39#include <algorithm>
40
41#include <config.h>
42
43std::vector<Unicode> UTF16toUCS4(const Unicode *utf16, int utf16Len)
44{
45 // count characters
46 int len = 0;
47 for (int i = 0; i < utf16Len; i++) {
48 if (utf16[i] >= 0xd800 && utf16[i] < 0xdc00 && i + 1 < utf16Len && utf16[i + 1] >= 0xdc00 && utf16[i + 1] < 0xe000) {
49 i++; /* surrogate pair */
50 }
51 len++;
52 }
53 std::vector<Unicode> u;
54 u.reserve(n: len);
55 // convert string
56 for (int i = 0; i < utf16Len; i++) {
57 if (utf16[i] >= 0xd800 && utf16[i] < 0xdc00) { /* surrogate pair */
58 if (i + 1 < utf16Len && utf16[i + 1] >= 0xdc00 && utf16[i + 1] < 0xe000) {
59 /* next code is a low surrogate */
60 u.push_back(x: (((utf16[i] & 0x3ff) << 10) | (utf16[i + 1] & 0x3ff)) + 0x10000);
61 ++i;
62 } else {
63 /* missing low surrogate
64 replace it with REPLACEMENT CHARACTER (U+FFFD) */
65 u.push_back(x: 0xfffd);
66 }
67 } else if (utf16[i] >= 0xdc00 && utf16[i] < 0xe000) {
68 /* invalid low surrogate
69 replace it with REPLACEMENT CHARACTER (U+FFFD) */
70 u.push_back(x: 0xfffd);
71 } else {
72 u.push_back(x: utf16[i]);
73 }
74 if (!UnicodeIsValid(ucs4: u.back())) {
75 u.back() = 0xfffd;
76 }
77 }
78 return u;
79}
80
81std::vector<Unicode> TextStringToUCS4(const std::string &textStr)
82{
83 bool isUnicode, isUnicodeLE;
84
85 int len = textStr.size();
86 const std::string &s = textStr;
87 if (len == 0) {
88 return {};
89 }
90
91 if (hasUnicodeByteOrderMark(s: textStr)) {
92 isUnicode = true;
93 isUnicodeLE = false;
94 } else if (hasUnicodeByteOrderMarkLE(s: textStr)) {
95 isUnicode = false;
96 isUnicodeLE = true;
97 } else {
98 isUnicode = false;
99 isUnicodeLE = false;
100 }
101
102 if (isUnicode || isUnicodeLE) {
103 len = len / 2 - 1;
104 if (len > 0) {
105 std::vector<Unicode> utf16;
106 utf16.reserve(n: len);
107 for (int i = 0; i < len; i++) {
108 if (isUnicode) {
109 utf16.push_back(x: (s[2 + i * 2] & 0xff) << 8 | (s[3 + i * 2] & 0xff));
110 } else { // UnicodeLE
111 utf16.push_back(x: (s[3 + i * 2] & 0xff) << 8 | (s[2 + i * 2] & 0xff));
112 }
113 }
114 return UTF16toUCS4(utf16: utf16.data(), utf16Len: utf16.size());
115
116 } else {
117 return {};
118 }
119 } else {
120 std::vector<Unicode> u;
121 u.reserve(n: len);
122 for (int i = 0; i < len; i++) {
123 u.push_back(x: pdfDocEncoding[s[i] & 0xff]);
124 }
125 return u;
126 }
127}
128
129bool UnicodeIsWhitespace(Unicode ucs4)
130{
131 static Unicode const spaces[] = { 0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x0020, 0x0085, 0x00A0, 0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, 0x2006, 0x2007, 0x2008, 0x2009, 0x200A, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000 };
132 Unicode const *end = spaces + sizeof(spaces) / sizeof(spaces[0]);
133 Unicode const *i = std::lower_bound(first: spaces, last: end, val: ucs4);
134 return (i != end && *i == ucs4);
135}
136
137//
138// decodeUtf8() and decodeUtf8Table are:
139//
140// Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
141//
142// Permission is hereby granted, free of charge, to any person
143// obtaining a copy of this software and associated documentation
144// files (the "Software"), to deal in the Software without
145// restriction, including without limitation the rights to use, copy,
146// modify, merge, publish, distribute, sublicense, and/or sell copies
147// of the Software, and to permit persons to whom the Software is
148// furnished to do so, subject to the following conditions:
149
150// The above copyright notice and this permission notice shall be
151// included in all copies or substantial portions of the Software.
152//
153// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
154// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
155// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
156// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
157// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
158// ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
159// CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
160// SOFTWARE.
161//
162// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
163//
// Decoder state meaning "a complete code point has just been decoded";
// also the state a decoder must start in.
static const uint32_t UTF8_ACCEPT = 0;
// Decoder state meaning "this byte is not allowed at this position".
// Every transition out of this state leads back to it, so callers must
// reset the state to UTF8_ACCEPT themselves to resume decoding.
static const uint32_t UTF8_REJECT = 12;
// Upper bound used by the converters in this file when deciding whether a
// decoded code point is emitted or replaced.
static const uint32_t UCS4_MAX = 0x10FFFF;
// U+FFFD REPLACEMENT CHARACTER, emitted in place of undecodable input.
static const Unicode REPLACEMENT_CHAR = 0xFFFD;

// Lookup data for decodeUtf8().  Entries 0..255 map a byte value to a
// character class (0..11); the remaining entries form the transition
// table, indexed as 256 + state + class.  States are therefore multiples
// of 12, one row of 12 classes per state.
// clang-format off
static const uint8_t decodeUtf8Table[] = {
  // The first part of the table maps bytes to character classes
  // to reduce the size of the transition table and create bitmasks.
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
   7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
   8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
  10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, // e0..ff

  // The second part is a transition table that maps a combination
  // of a state of the automaton and a character class to a state.
   0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
  12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
  12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
  12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
  12,36,12,12,12,12,12,12,12,12,12,12,
};
// clang-format on
191
192// Decode utf8 state machine for fast UTF-8 decoding. Initialise state
193// to 0 and call decodeUtf8() for each byte of UTF-8. Return value
194// (and state) is UTF8_ACCEPT when it has found a valid codepoint
195// (codepoint returned in codep), UTF8_REJECT when the byte is not
196// allowed to occur at its position, and some other positive value if
197// more bytes have to be read. Reset state to 0 to recover from
198// errors.
199inline uint32_t decodeUtf8(uint32_t *state, uint32_t *codep, char byte)
200{
201 uint32_t b = (unsigned char)byte;
202 uint32_t type = decodeUtf8Table[b];
203
204 *codep = (*state != UTF8_ACCEPT) ? (b & 0x3fu) | (*codep << 6) : (0xff >> type) & (b);
205
206 *state = decodeUtf8Table[256 + *state + type];
207 return *state;
208}
209
210int utf8CountUCS4(const char *utf8)
211{
212 uint32_t codepoint;
213 uint32_t state = 0;
214 int count = 0;
215
216 while (*utf8) {
217 decodeUtf8(state: &state, codep: &codepoint, byte: *utf8);
218 if (state == UTF8_ACCEPT) {
219 count++;
220 } else if (state == UTF8_REJECT) {
221 count++; // replace with REPLACEMENT_CHAR
222 state = 0;
223 }
224 utf8++;
225 }
226 if (state != UTF8_ACCEPT && state != UTF8_REJECT) {
227 count++; // replace with REPLACEMENT_CHAR
228 }
229
230 return count;
231}
232
233int utf8ToUCS4(const char *utf8, Unicode **ucs4_out)
234{
235 int len = utf8CountUCS4(utf8);
236 Unicode *u = (Unicode *)gmallocn(count: len, size: sizeof(Unicode));
237 int n = 0;
238 uint32_t codepoint;
239 uint32_t state = 0;
240
241 while (*utf8 && n < len) {
242 decodeUtf8(state: &state, codep: &codepoint, byte: *utf8);
243 if (state == UTF8_ACCEPT) {
244 u[n++] = codepoint;
245 } else if (state == UTF8_REJECT) {
246 u[n++] = REPLACEMENT_CHAR; // invalid byte for this position
247 state = 0;
248 }
249 utf8++;
250 }
251 if (state != UTF8_ACCEPT && state != UTF8_REJECT) {
252 u[n] = REPLACEMENT_CHAR; // invalid byte for this position
253 }
254
255 *ucs4_out = u;
256 return len;
257}
258
259// Count number of UTF-16 code units required to convert a UTF-8 string
260// (excluding terminating NULL). Each invalid byte is counted as a
261// code point since the UTF-8 conversion functions will replace it with
262// REPLACEMENT_CHAR.
263int utf8CountUtf16CodeUnits(const char *utf8)
264{
265 uint32_t codepoint;
266 uint32_t state = 0;
267 int count = 0;
268
269 while (*utf8) {
270 decodeUtf8(state: &state, codep: &codepoint, byte: *utf8);
271 if (state == UTF8_ACCEPT) {
272 if (codepoint < 0x10000) {
273 count++;
274 } else if (codepoint <= UCS4_MAX) {
275 count += 2;
276 } else {
277 count++; // replace with REPLACEMENT_CHAR
278 }
279 } else if (state == UTF8_REJECT) {
280 count++; // replace with REPLACEMENT_CHAR
281 state = 0;
282 }
283 utf8++;
284 }
285 if (state != UTF8_ACCEPT && state != UTF8_REJECT) {
286 count++; // replace with REPLACEMENT_CHAR
287 }
288
289 return count;
290}
291
292// Convert UTF-8 to UTF-16
293// utf8- UTF-8 string to convert. If not null terminated, set maxUtf8 to num
294// bytes to convert
295// utf16 - output buffer to write UTF-16 to. Output will always be null terminated.
296// maxUtf16 - maximum size of output buffer including space for null.
297// maxUtf8 - maximum number of UTF-8 bytes to convert. Conversion stops when
298// either this count is reached or a null is encountered.
299// Returns number of UTF-16 code units written (excluding NULL).
300int utf8ToUtf16(const char *utf8, uint16_t *utf16, int maxUtf16, int maxUtf8)
301{
302 uint16_t *p = utf16;
303 uint32_t codepoint;
304 uint32_t state = 0;
305 int nIn = 0;
306 int nOut = 0;
307 while (*utf8 && nIn < maxUtf8 && nOut < maxUtf16 - 1) {
308 decodeUtf8(state: &state, codep: &codepoint, byte: *utf8);
309 if (state == UTF8_ACCEPT) {
310 if (codepoint < 0x10000) {
311 *p++ = (uint16_t)codepoint;
312 nOut++;
313 } else if (codepoint <= UCS4_MAX) {
314 *p++ = (uint16_t)(0xD7C0 + (codepoint >> 10));
315 *p++ = (uint16_t)(0xDC00 + (codepoint & 0x3FF));
316 nOut += 2;
317 } else {
318 *p++ = REPLACEMENT_CHAR;
319 nOut++;
320 state = 0;
321 }
322 } else if (state == UTF8_REJECT) {
323 *p++ = REPLACEMENT_CHAR; // invalid byte for this position
324 nOut++;
325 }
326 utf8++;
327 nIn++;
328 }
329 // replace any trailing bytes too short for a valid UTF-8 with a replacement char
330 if (state != UTF8_ACCEPT && state != UTF8_REJECT && nOut < maxUtf16 - 1) {
331 *p++ = REPLACEMENT_CHAR;
332 nOut++;
333 }
334 if (nOut > maxUtf16 - 1) {
335 nOut = maxUtf16 - 1;
336 }
337 utf16[nOut] = 0;
338 return nOut;
339}
340
341// Allocate utf16 string and convert utf8 into it.
342uint16_t *utf8ToUtf16(const char *utf8, int *len)
343{
344 if (isUtf8WithBom(str: utf8)) {
345 utf8 += 3;
346 }
347 int n = utf8CountUtf16CodeUnits(utf8);
348 if (len) {
349 *len = n;
350 }
351 uint16_t *utf16 = (uint16_t *)gmallocn(count: n + 1, size: sizeof(uint16_t));
352 utf8ToUtf16(utf8, utf16, maxUtf16: n + 1, INT_MAX);
353 return utf16;
354}
355
356std::string utf8ToUtf16WithBom(const std::string &utf8)
357{
358 if (utf8.empty()) {
359 return {};
360 }
361 int tmp_length; // Number of UTF-16 symbols.
362 char *tmp_str = (char *)utf8ToUtf16(utf8: utf8.c_str(), len: &tmp_length);
363#ifndef WORDS_BIGENDIAN
364 for (int i = 0; i < tmp_length; i++) {
365 std::swap(a&: tmp_str[i * 2], b&: tmp_str[i * 2 + 1]);
366 }
367#endif
368
369 std::string result(unicodeByteOrderMark);
370 result.append(s: tmp_str, n: tmp_length * 2);
371 gfree(p: tmp_str);
372 return result;
373}
374
// Decoder state meaning "a complete code point has just been decoded";
// also the state a decoder must start in.
static const uint32_t UTF16_ACCEPT = 0;
// Decoder state meaning "this code unit is not allowed at this position".
static const uint32_t UTF16_REJECT = static_cast<uint32_t>(-1);

// One step of the UTF-16 decoder.
//   state     - decoder state; initialise to 0.  While a high surrogate is
//               waiting for its partner the state holds that surrogate.
//   codePoint - receives the decoded code point when UTF16_ACCEPT is
//               returned; left untouched otherwise.
//   codeUnit  - next UTF-16 code unit.
// Returns UTF16_ACCEPT when a valid code point has been found,
// UTF16_REJECT when the code unit is invalid for this state, or some other
// value if another code unit needs to be read.  The returned value is
// always stored in *state as well, because callers test the state rather
// than the return value; after UTF16_REJECT the caller must reset the
// state to 0 before decoding further units.
inline uint32_t decodeUtf16(uint32_t *state, uint32_t *codePoint, uint16_t codeUnit)
{
    const bool isHighSurrogate = codeUnit >= 0xd800 && codeUnit < 0xdc00;
    const bool isLowSurrogate = codeUnit >= 0xdc00 && codeUnit < 0xe000;

    if (*state == 0) {
        if (isHighSurrogate) {
            /* first half of a surrogate pair: remember it */
            *state = codeUnit;
        } else if (isLowSurrogate) {
            /* low surrogate without a preceding high surrogate */
            *state = UTF16_REJECT;
        } else {
            *codePoint = codeUnit;
            *state = UTF16_ACCEPT;
        }
    } else if (isLowSurrogate) {
        /* second half of the pair: combine with the stored high surrogate */
        *codePoint = (((*state & 0x3ff) << 10) | (codeUnit & 0x3ff)) + 0x10000;
        *state = UTF16_ACCEPT;
    } else {
        /* the pending high surrogate was not followed by a low surrogate */
        *state = UTF16_REJECT;
    }
    return *state;
}
405
406// Count number of UTF-8 bytes required to convert a UTF-16 string to
407// UTF-8 (excluding terminating NULL).
408int utf16CountUtf8Bytes(const uint16_t *utf16)
409{
410 uint32_t codepoint = 0;
411 uint32_t state = 0;
412 int count = 0;
413
414 while (*utf16) {
415 decodeUtf16(state: &state, codePoint: &codepoint, codeUnit: *utf16);
416 if (state == UTF16_ACCEPT) {
417 if (codepoint < 0x80) {
418 count++;
419 } else if (codepoint < 0x800) {
420 count += 2;
421 } else if (codepoint < 0x10000) {
422 count += 3;
423 } else if (codepoint <= UCS4_MAX) {
424 count += 4;
425 } else {
426 count += 3; // replace with REPLACEMENT_CHAR
427 }
428 } else if (state == UTF16_REJECT) {
429 count += 3; // replace with REPLACEMENT_CHAR
430 state = 0;
431 }
432 utf16++;
433 }
434 if (state != UTF8_ACCEPT && state != UTF8_REJECT) {
435 count++; // replace with REPLACEMENT_CHAR
436 }
437
438 return count;
439}
440
441// Convert UTF-16 to UTF-8
442// utf16- UTF-16 string to convert. If not null terminated, set maxUtf16 to num
443// code units to convert
444// utf8 - output buffer to write UTF-8 to. Output will always be null terminated.
445// maxUtf8 - maximum size of output buffer including space for null.
446// maxUtf16 - maximum number of UTF-16 code units to convert. Conversion stops when
447// either this count is reached or a null is encountered.
448// Returns number of UTF-8 bytes written (excluding NULL).
449int utf16ToUtf8(const uint16_t *utf16, char *utf8, int maxUtf8, int maxUtf16)
450{
451 uint32_t codepoint = 0;
452 uint32_t state = 0;
453 int nIn = 0;
454 int nOut = 0;
455 char *p = utf8;
456 while (*utf16 && nIn < maxUtf16 && nOut < maxUtf8 - 1) {
457 decodeUtf16(state: &state, codePoint: &codepoint, codeUnit: *utf16);
458 if (state == UTF16_ACCEPT || state == UTF16_REJECT) {
459 if (state == UTF16_REJECT || codepoint > UCS4_MAX) {
460 codepoint = REPLACEMENT_CHAR;
461 state = 0;
462 }
463
464 int bufSize = maxUtf8 - nOut;
465 int count = mapUTF8(u: codepoint, buf: p, bufSize);
466 p += count;
467 nOut += count;
468 }
469 utf16++;
470 nIn++;
471 }
472 // replace any trailing bytes too short for a valid UTF-8 with a replacement char
473 if (state != UTF16_ACCEPT && state != UTF16_REJECT && nOut < maxUtf8 - 1) {
474 int bufSize = maxUtf8 - nOut;
475 int count = mapUTF8(u: REPLACEMENT_CHAR, buf: p, bufSize);
476 p += count;
477 nOut += count;
478 nOut++;
479 }
480 if (nOut > maxUtf8 - 1) {
481 nOut = maxUtf8 - 1;
482 }
483 utf8[nOut] = 0;
484 return nOut;
485}
486
487// Allocate utf8 string and convert utf16 into it.
488char *utf16ToUtf8(const uint16_t *utf16, int *len)
489{
490 int n = utf16CountUtf8Bytes(utf16);
491 if (len) {
492 *len = n;
493 }
494 char *utf8 = (char *)gmalloc(size: n + 1);
495 utf16ToUtf8(utf16, utf8);
496 return utf8;
497}
498
499void unicodeToAscii7(const Unicode *in, int len, Unicode **ucs4_out, int *out_len, const int *in_idx, int **indices)
500{
501 const UnicodeMap *uMap = globalParams->getUnicodeMap(encodingName: "ASCII7");
502 int *idx = nullptr;
503
504 if (!len) {
505 *ucs4_out = nullptr;
506 *out_len = 0;
507 return;
508 }
509
510 if (indices) {
511 if (!in_idx) {
512 indices = nullptr;
513 } else {
514 idx = (int *)gmallocn(count: len * 8 + 1, size: sizeof(int));
515 }
516 }
517
518 std::string str;
519
520 char buf[8]; // 8 is enough for mapping an unicode char to a string
521 int i, n, k;
522
523 for (i = k = 0; i < len; ++i) {
524 n = uMap->mapUnicode(u: in[i], buf, bufSize: sizeof(buf));
525 if (!n) {
526 // the Unicode char could not be converted to ascii7 counterpart
527 // so just fill with a non-printable ascii char
528 buf[0] = 31;
529 n = 1;
530 }
531 str.append(s: buf, n: n);
532 if (indices) {
533 for (; n > 0; n--) {
534 idx[k++] = in_idx[i];
535 }
536 }
537 }
538
539 std::vector<Unicode> ucs4 = TextStringToUCS4(textStr: str);
540 *out_len = ucs4.size();
541 *ucs4_out = (Unicode *)gmallocn(count: ucs4.size(), size: sizeof(Unicode));
542 memcpy(dest: *ucs4_out, src: ucs4.data(), n: ucs4.size() * sizeof(Unicode));
543
544 if (indices) {
545 idx[k] = in_idx[len];
546 *indices = idx;
547 }
548}
549
550// Convert a PDF Text String to UTF-8
551// textStr - PDF text string
552// returns UTF-8 string.
553std::string TextStringToUtf8(const std::string &textStr)
554{
555 int i, len;
556 const char *s;
557 char *utf8;
558
559 len = textStr.size();
560 s = textStr.c_str();
561 if (hasUnicodeByteOrderMark(s: textStr)) {
562 uint16_t *utf16;
563 len = len / 2 - 1;
564 utf16 = new uint16_t[len + 1];
565 for (i = 0; i < len; i++) {
566 utf16[i] = (s[2 + i * 2] & 0xff) << 8 | (s[3 + i * 2] & 0xff);
567 }
568 utf16[i] = 0;
569 utf8 = utf16ToUtf8(utf16);
570 delete[] utf16;
571 } else {
572 utf8 = (char *)gmalloc(size: len + 1);
573 for (i = 0; i < len; i++) {
574 utf8[i] = pdfDocEncoding[s[i] & 0xff];
575 }
576 utf8[i] = 0;
577 }
578 std::string utf8_string(utf8);
579 gfree(p: utf8);
580 return utf8_string;
581}
582

source code of poppler/poppler/UTF.cc