1 | //======================================================================== |
2 | // |
3 | // CharCodeToUnicode.cc |
4 | // |
5 | // Copyright 2001-2003 Glyph & Cog, LLC |
6 | // |
7 | //======================================================================== |
8 | |
9 | //======================================================================== |
10 | // |
11 | // Modified under the Poppler project - http://poppler.freedesktop.org |
12 | // |
13 | // All changes made under the Poppler project to this file are licensed |
14 | // under GPL version 2 or later |
15 | // |
16 | // Copyright (C) 2006, 2008-2010, 2012, 2018-2022, 2024 Albert Astals Cid <aacid@kde.org> |
17 | // Copyright (C) 2007 Julien Rebetez <julienr@svn.gnome.org> |
18 | // Copyright (C) 2007 Koji Otani <sho@bbr.jp> |
19 | // Copyright (C) 2008 Michael Vrable <mvrable@cs.ucsd.edu> |
20 | // Copyright (C) 2008 Vasile Gaburici <gaburici@cs.umd.edu> |
21 | // Copyright (C) 2010 William Bader <williambader@hotmail.com> |
22 | // Copyright (C) 2010 Jakub Wilk <jwilk@jwilk.net> |
23 | // Copyright (C) 2012 Thomas Freitag <Thomas.Freitag@alfa.de> |
24 | // Copyright (C) 2012, 2017 Adrian Johnson <ajohnson@redneon.com> |
25 | // Copyright (C) 2014 Jiri Slaby <jirislaby@gmail.com> |
26 | // Copyright (C) 2015 Marek Kasik <mkasik@redhat.com> |
27 | // Copyright (C) 2017 Jean Ghali <jghali@libertysurf.fr> |
28 | // Copyright (C) 2018 Klarälvdalens Datakonsult AB, a KDAB Group company, <info@kdab.com>. Work sponsored by the LiMux project of the city of Munich |
29 | // Copyright (C) 2018 Adam Reichold <adam.reichold@t-online.de> |
30 | // Copyright (C) 2019 <corentinf@free.fr> |
31 | // Copyright (C) 2024 g10 Code GmbH, Author: Sune Stolborg Vuorela <sune@vuorela.dk> |
32 | // |
33 | // To see a description of the changes please see the Changelog file that |
34 | // came with your tarball or type make ChangeLog if you are building from git |
35 | // |
36 | //======================================================================== |
37 | |
38 | #include <config.h> |
39 | |
40 | #include <cstdio> |
41 | #include <cstring> |
42 | #include <functional> |
43 | #include "goo/glibc.h" |
44 | #include "goo/gmem.h" |
45 | #include "goo/gfile.h" |
46 | #include "goo/GooLikely.h" |
47 | #include "goo/GooString.h" |
48 | #include "Error.h" |
49 | #include "GlobalParams.h" |
50 | #include "PSTokenizer.h" |
51 | #include "CharCodeToUnicode.h" |
52 | #include "UTF.h" |
53 | |
54 | //------------------------------------------------------------------------ |
55 | |
56 | //------------------------------------------------------------------------ |
57 | |
// Character source for PSTokenizer that reads from an in-memory,
// NUL-terminated string.  <data> points to the caller's cursor (a
// pointer to the next unread byte).  Returns that byte as a value in
// 0..255 and advances the cursor; at the terminating NUL it returns EOF
// and leaves the cursor where it is.
static int getCharFromString(void *data)
{
    unsigned char **cursor = static_cast<unsigned char **>(data);
    const unsigned char current = **cursor;

    if (current == 0) {
        // end of string: do not step past the terminator
        return EOF;
    }
    ++*cursor;
    return current;
}
72 | |
// Character source for PSTokenizer that reads from a stdio stream.
// <data> is the FILE * to read from; the result is whatever fgetc()
// returns (the next byte, or EOF).
static int getCharFromFile(void *data)
{
    FILE *const stream = static_cast<FILE *>(data);
    return fgetc(stream);
}
77 | |
78 | //------------------------------------------------------------------------ |
79 | |
// Value of each byte when read as a hexadecimal digit ('0'-'9',
// 'A'-'F', 'a'-'f'), or -1 if the byte is not a hex digit.
static const int hexCharVals[256] = {
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 0x
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 1x
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 2x
    0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  -1, -1, -1, -1, -1, -1, // 3x
    -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 4x
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 5x
    -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 6x
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 7x
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 8x
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 9x
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // Ax
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // Bx
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // Cx
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // Dx
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // Ex
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // Fx
};

// Parse a <len>-byte hex string <s> into *<val>.  Returns false on
// error (a byte that is not a hex digit), in which case *<val> is set
// to 0.  A <len> of zero (or less) yields 0 and succeeds.
//
// The value is accumulated as an unsigned int so that eight-digit
// inputs with the top bit set (e.g. "ffffffff") do not overflow a
// signed integer; digits beyond the eighth simply shift the oldest
// ones out.
static bool parseHex(const char *s, int len, unsigned int *val)
{
    unsigned int v = 0;

    for (int i = 0; i < len; ++i) {
        // mask to 0..255 so a negative (signed) char cannot index
        // outside the table
        const int x = hexCharVals[s[i] & 0xff];
        if (x < 0) {
            *val = 0;
            return false;
        }
        v = (v << 4) + static_cast<unsigned int>(x);
    }
    *val = v;
    return true;
}
116 | |
117 | //------------------------------------------------------------------------ |
118 | |
119 | CharCodeToUnicode *CharCodeToUnicode::makeIdentityMapping() |
120 | { |
121 | CharCodeToUnicode *ctu = new CharCodeToUnicode(); |
122 | ctu->isIdentity = true; |
123 | ctu->map.resize(new_size: 1, x: 0); |
124 | return ctu; |
125 | } |
126 | CharCodeToUnicode *CharCodeToUnicode::parseCIDToUnicode(const char *fileName, const GooString *collection) |
127 | { |
128 | FILE *f; |
129 | CharCode size; |
130 | char buf[64]; |
131 | Unicode u; |
132 | |
133 | if (!(f = openFile(path: fileName, mode: "r" ))) { |
134 | error(category: errIO, pos: -1, msg: "Couldn't open cidToUnicode file '{0:s}'" , fileName); |
135 | return nullptr; |
136 | } |
137 | |
138 | size = 32768; |
139 | std::vector<Unicode> mapA; |
140 | mapA.resize(new_size: size, x: 0); |
141 | CharCode mapLenA = 0; |
142 | |
143 | while (getLine(buf, size: sizeof(buf), f)) { |
144 | if (mapLenA == size) { |
145 | size *= 2; |
146 | mapA.resize(new_size: size); |
147 | } |
148 | if (sscanf(s: buf, format: "%x" , &u) == 1) { |
149 | mapA[mapLenA] = u; |
150 | } else { |
151 | error(category: errSyntaxWarning, pos: -1, msg: "Bad line ({0:d}) in cidToUnicode file '{1:s}'" , (int)(mapLenA + 1), fileName); |
152 | mapA[mapLenA] = 0; |
153 | } |
154 | ++mapLenA; |
155 | } |
156 | fclose(stream: f); |
157 | mapA.resize(new_size: mapLenA); |
158 | |
159 | return new CharCodeToUnicode(collection->toStr(), std::move(mapA), {}); |
160 | } |
161 | |
162 | CharCodeToUnicode *CharCodeToUnicode::make8BitToUnicode(Unicode *toUnicode) |
163 | { |
164 | std::vector<Unicode> data(toUnicode, toUnicode + 256); |
165 | return new CharCodeToUnicode({}, std::move(data), {}); |
166 | } |
167 | |
168 | CharCodeToUnicode *CharCodeToUnicode::parseCMap(const GooString *buf, int nBits) |
169 | { |
170 | CharCodeToUnicode *ctu; |
171 | |
172 | ctu = new CharCodeToUnicode(std::optional<std::string>()); |
173 | const char *p = buf->c_str(); |
174 | if (!ctu->parseCMap1(getCharFunc: &getCharFromString, data: &p, nBits)) { |
175 | delete ctu; |
176 | return nullptr; |
177 | } |
178 | return ctu; |
179 | } |
180 | |
181 | CharCodeToUnicode *CharCodeToUnicode::parseCMapFromFile(const GooString *fileName, int nBits) |
182 | { |
183 | CharCodeToUnicode *ctu; |
184 | FILE *f; |
185 | |
186 | ctu = new CharCodeToUnicode(std::optional<std::string>()); |
187 | if ((f = globalParams->findToUnicodeFile(name: fileName))) { |
188 | if (!ctu->parseCMap1(getCharFunc: &getCharFromFile, data: f, nBits)) { |
189 | delete ctu; |
190 | fclose(stream: f); |
191 | return nullptr; |
192 | } |
193 | } else { |
194 | error(category: errSyntaxError, pos: -1, msg: "Couldn't find ToUnicode CMap file for '{0:t}'" , fileName); |
195 | } |
196 | return ctu; |
197 | } |
198 | |
199 | void CharCodeToUnicode::mergeCMap(const GooString *buf, int nBits) |
200 | { |
201 | const char *p = buf->c_str(); |
202 | parseCMap1(getCharFunc: &getCharFromString, data: &p, nBits); |
203 | } |
204 | |
205 | bool CharCodeToUnicode::parseCMap1(int (*getCharFunc)(void *), void *data, int nBits) |
206 | { |
207 | PSTokenizer *pst; |
208 | char tok1[256], tok2[256], tok3[256]; |
209 | int n1, n2, n3; |
210 | CharCode i; |
211 | CharCode maxCode, code1, code2; |
212 | GooString *name; |
213 | FILE *f; |
214 | |
215 | bool ok = false; |
216 | maxCode = (nBits == 8) ? 0xff : (nBits == 16) ? 0xffff : 0xffffffff; |
217 | pst = new PSTokenizer(getCharFunc, data); |
218 | pst->getToken(buf: tok1, size: sizeof(tok1), length: &n1); |
219 | while (pst->getToken(buf: tok2, size: sizeof(tok2), length: &n2)) { |
220 | if (!strcmp(s1: tok2, s2: "usecmap" )) { |
221 | if (tok1[0] == '/') { |
222 | name = new GooString(tok1 + 1); |
223 | if ((f = globalParams->findToUnicodeFile(name))) { |
224 | if (parseCMap1(getCharFunc: &getCharFromFile, data: f, nBits)) { |
225 | ok = true; |
226 | } |
227 | fclose(stream: f); |
228 | } else { |
229 | error(category: errSyntaxError, pos: -1, msg: "Couldn't find ToUnicode CMap file for '{0:t}'" , name); |
230 | } |
231 | delete name; |
232 | } |
233 | pst->getToken(buf: tok1, size: sizeof(tok1), length: &n1); |
234 | } else if (!strcmp(s1: tok2, s2: "beginbfchar" )) { |
235 | while (pst->getToken(buf: tok1, size: sizeof(tok1), length: &n1)) { |
236 | if (!strcmp(s1: tok1, s2: "endbfchar" )) { |
237 | break; |
238 | } |
239 | if (!pst->getToken(buf: tok2, size: sizeof(tok2), length: &n2) || !strcmp(s1: tok2, s2: "endbfchar" )) { |
240 | error(category: errSyntaxWarning, pos: -1, msg: "Illegal entry in bfchar block in ToUnicode CMap" ); |
241 | break; |
242 | } |
243 | if (!(tok1[0] == '<' && tok1[n1 - 1] == '>' && tok2[0] == '<' && tok2[n2 - 1] == '>')) { |
244 | error(category: errSyntaxWarning, pos: -1, msg: "Illegal entry in bfchar block in ToUnicode CMap" ); |
245 | continue; |
246 | } |
247 | tok1[n1 - 1] = tok2[n2 - 1] = '\0'; |
248 | if (!parseHex(s: tok1 + 1, len: n1 - 2, val: &code1)) { |
249 | error(category: errSyntaxWarning, pos: -1, msg: "Illegal entry in bfchar block in ToUnicode CMap" ); |
250 | continue; |
251 | } |
252 | if (code1 > maxCode) { |
253 | error(category: errSyntaxWarning, pos: -1, msg: "Invalid entry in bfchar block in ToUnicode CMap" ); |
254 | } |
255 | addMapping(code: code1, uStr: tok2 + 1, n: n2 - 2, offset: 0); |
256 | ok = true; |
257 | } |
258 | pst->getToken(buf: tok1, size: sizeof(tok1), length: &n1); |
259 | } else if (!strcmp(s1: tok2, s2: "beginbfrange" )) { |
260 | while (pst->getToken(buf: tok1, size: sizeof(tok1), length: &n1)) { |
261 | if (!strcmp(s1: tok1, s2: "endbfrange" )) { |
262 | break; |
263 | } |
264 | if (!pst->getToken(buf: tok2, size: sizeof(tok2), length: &n2) || !strcmp(s1: tok2, s2: "endbfrange" ) || !pst->getToken(buf: tok3, size: sizeof(tok3), length: &n3) || !strcmp(s1: tok3, s2: "endbfrange" )) { |
265 | error(category: errSyntaxWarning, pos: -1, msg: "Illegal entry in bfrange block in ToUnicode CMap" ); |
266 | break; |
267 | } |
268 | if (!(tok1[0] == '<' && tok1[n1 - 1] == '>' && tok2[0] == '<' && tok2[n2 - 1] == '>')) { |
269 | error(category: errSyntaxWarning, pos: -1, msg: "Illegal entry in bfrange block in ToUnicode CMap" ); |
270 | continue; |
271 | } |
272 | tok1[n1 - 1] = tok2[n2 - 1] = '\0'; |
273 | if (!parseHex(s: tok1 + 1, len: n1 - 2, val: &code1) || !parseHex(s: tok2 + 1, len: n2 - 2, val: &code2)) { |
274 | error(category: errSyntaxWarning, pos: -1, msg: "Illegal entry in bfrange block in ToUnicode CMap" ); |
275 | continue; |
276 | } |
277 | if (code1 > maxCode || code2 > maxCode) { |
278 | error(category: errSyntaxWarning, pos: -1, msg: "Invalid entry in bfrange block in ToUnicode CMap" ); |
279 | if (code1 > maxCode) { |
280 | code1 = maxCode; |
281 | } |
282 | if (code2 > maxCode) { |
283 | code2 = maxCode; |
284 | } |
285 | } |
286 | if (!strcmp(s1: tok3, s2: "[" )) { |
287 | i = 0; |
288 | while (pst->getToken(buf: tok1, size: sizeof(tok1), length: &n1) && code1 + i <= code2) { |
289 | if (!strcmp(s1: tok1, s2: "]" )) { |
290 | break; |
291 | } |
292 | if (tok1[0] == '<' && tok1[n1 - 1] == '>') { |
293 | tok1[n1 - 1] = '\0'; |
294 | addMapping(code: code1 + i, uStr: tok1 + 1, n: n1 - 2, offset: 0); |
295 | ok = true; |
296 | } else { |
297 | error(category: errSyntaxWarning, pos: -1, msg: "Illegal entry in bfrange block in ToUnicode CMap" ); |
298 | } |
299 | ++i; |
300 | } |
301 | } else if (tok3[0] == '<' && tok3[n3 - 1] == '>') { |
302 | tok3[n3 - 1] = '\0'; |
303 | for (i = 0; code1 <= code2; ++code1, ++i) { |
304 | addMapping(code: code1, uStr: tok3 + 1, n: n3 - 2, offset: i); |
305 | ok = true; |
306 | } |
307 | |
308 | } else { |
309 | error(category: errSyntaxWarning, pos: -1, msg: "Illegal entry in bfrange block in ToUnicode CMap" ); |
310 | } |
311 | } |
312 | pst->getToken(buf: tok1, size: sizeof(tok1), length: &n1); |
313 | } else if (!strcmp(s1: tok2, s2: "begincidchar" )) { |
314 | // the begincidchar operator is not allowed in ToUnicode CMaps, |
315 | // but some buggy PDF generators incorrectly use |
316 | // code-to-CID-type CMaps here |
317 | error(category: errSyntaxWarning, pos: -1, msg: "Invalid 'begincidchar' operator in ToUnicode CMap" ); |
318 | while (pst->getToken(buf: tok1, size: sizeof(tok1), length: &n1)) { |
319 | if (!strcmp(s1: tok1, s2: "endcidchar" )) { |
320 | break; |
321 | } |
322 | if (!pst->getToken(buf: tok2, size: sizeof(tok2), length: &n2) || !strcmp(s1: tok2, s2: "endcidchar" )) { |
323 | error(category: errSyntaxWarning, pos: -1, msg: "Illegal entry in cidchar block in ToUnicode CMap" ); |
324 | break; |
325 | } |
326 | if (!(tok1[0] == '<' && tok1[n1 - 1] == '>')) { |
327 | error(category: errSyntaxWarning, pos: -1, msg: "Illegal entry in cidchar block in ToUnicode CMap" ); |
328 | continue; |
329 | } |
330 | tok1[n1 - 1] = '\0'; |
331 | if (!parseHex(s: tok1 + 1, len: n1 - 2, val: &code1)) { |
332 | error(category: errSyntaxWarning, pos: -1, msg: "Illegal entry in cidchar block in ToUnicode CMap" ); |
333 | continue; |
334 | } |
335 | if (code1 > maxCode) { |
336 | error(category: errSyntaxWarning, pos: -1, msg: "Invalid entry in cidchar block in ToUnicode CMap" ); |
337 | } |
338 | addMappingInt(code: code1, u: atoi(nptr: tok2)); |
339 | ok = true; |
340 | } |
341 | pst->getToken(buf: tok1, size: sizeof(tok1), length: &n1); |
342 | } else if (!strcmp(s1: tok2, s2: "begincidrange" )) { |
343 | // the begincidrange operator is not allowed in ToUnicode CMaps, |
344 | // but some buggy PDF generators incorrectly use |
345 | // code-to-CID-type CMaps here |
346 | error(category: errSyntaxWarning, pos: -1, msg: "Invalid 'begincidrange' operator in ToUnicode CMap" ); |
347 | while (pst->getToken(buf: tok1, size: sizeof(tok1), length: &n1)) { |
348 | if (!strcmp(s1: tok1, s2: "endcidrange" )) { |
349 | break; |
350 | } |
351 | if (!pst->getToken(buf: tok2, size: sizeof(tok2), length: &n2) || !strcmp(s1: tok2, s2: "endcidrange" ) || !pst->getToken(buf: tok3, size: sizeof(tok3), length: &n3) || !strcmp(s1: tok3, s2: "endcidrange" )) { |
352 | error(category: errSyntaxWarning, pos: -1, msg: "Illegal entry in cidrange block in ToUnicode CMap" ); |
353 | break; |
354 | } |
355 | if (!(tok1[0] == '<' && tok1[n1 - 1] == '>' && tok2[0] == '<' && tok2[n2 - 1] == '>')) { |
356 | error(category: errSyntaxWarning, pos: -1, msg: "Illegal entry in cidrange block in ToUnicode CMap" ); |
357 | continue; |
358 | } |
359 | tok1[n1 - 1] = tok2[n2 - 1] = '\0'; |
360 | if (!parseHex(s: tok1 + 1, len: n1 - 2, val: &code1) || !parseHex(s: tok2 + 1, len: n2 - 2, val: &code2)) { |
361 | error(category: errSyntaxWarning, pos: -1, msg: "Illegal entry in cidrange block in ToUnicode CMap" ); |
362 | continue; |
363 | } |
364 | if (code1 > maxCode || code2 > maxCode) { |
365 | error(category: errSyntaxWarning, pos: -1, msg: "Invalid entry in cidrange block in ToUnicode CMap" ); |
366 | if (code2 > maxCode) { |
367 | code2 = maxCode; |
368 | } |
369 | } |
370 | for (i = atoi(nptr: tok3); code1 <= code2; ++code1, ++i) { |
371 | addMappingInt(code: code1, u: i); |
372 | ok = true; |
373 | } |
374 | } |
375 | pst->getToken(buf: tok1, size: sizeof(tok1), length: &n1); |
376 | } else { |
377 | strcpy(dest: tok1, src: tok2); |
378 | } |
379 | } |
380 | delete pst; |
381 | return ok; |
382 | } |
383 | |
384 | void CharCodeToUnicode::addMapping(CharCode code, char *uStr, int n, int offset) |
385 | { |
386 | Unicode u; |
387 | int j; |
388 | |
389 | if (code > 0xffffff) { |
390 | // This is an arbitrary limit to avoid integer overflow issues. |
391 | // (I've seen CMaps with mappings for <ffffffff>.) |
392 | return; |
393 | } |
394 | if (code >= map.size()) { |
395 | size_t oldLen = map.size(); |
396 | auto newLen = oldLen ? 2 * oldLen : 256; |
397 | if (code >= newLen) { |
398 | newLen = (code + 256) & ~255; |
399 | } |
400 | if (unlikely(code >= newLen)) { |
401 | error(category: errSyntaxWarning, pos: -1, msg: "Illegal code value in CharCodeToUnicode::addMapping" ); |
402 | return; |
403 | } else { |
404 | map.resize(new_size: newLen, x: 0); |
405 | } |
406 | } |
407 | if (n <= 4) { |
408 | if (!parseHex(s: uStr, len: n, val: &u)) { |
409 | error(category: errSyntaxWarning, pos: -1, msg: "Illegal entry in ToUnicode CMap" ); |
410 | return; |
411 | } |
412 | map[code] = u + offset; |
413 | if (!UnicodeIsValid(ucs4: map[code])) { |
414 | map[code] = 0xfffd; |
415 | } |
416 | } else { |
417 | map[code] = 0; |
418 | int utf16Len = n / 4; |
419 | std::vector<Unicode> utf16(utf16Len); |
420 | utf16.resize(new_size: utf16Len); |
421 | for (j = 0; j < utf16Len; ++j) { |
422 | if (!parseHex(s: uStr + j * 4, len: 4, val: &utf16[j])) { |
423 | error(category: errSyntaxWarning, pos: -1, msg: "Illegal entry in ToUnicode CMap" ); |
424 | return; |
425 | } |
426 | } |
427 | utf16[utf16Len - 1] += offset; |
428 | sMap.push_back(x: { .c: code, .u: UTF16toUCS4(utf16: utf16.data(), utf16Len: utf16.size()) }); |
429 | } |
430 | } |
431 | |
432 | void CharCodeToUnicode::addMappingInt(CharCode code, Unicode u) |
433 | { |
434 | if (code > 0xffffff) { |
435 | // This is an arbitrary limit to avoid integer overflow issues. |
436 | // (I've seen CMaps with mappings for <ffffffff>.) |
437 | return; |
438 | } |
439 | if (code >= map.size()) { |
440 | size_t oldLen = map.size(); |
441 | size_t newLen = oldLen ? 2 * oldLen : 256; |
442 | if (code >= newLen) { |
443 | newLen = (code + 256) & ~255; |
444 | } |
445 | map.resize(new_size: newLen, x: 0); |
446 | } |
447 | map[code] = u; |
448 | } |
449 | |
450 | CharCodeToUnicode::CharCodeToUnicode() |
451 | { |
452 | refCnt = 1; |
453 | isIdentity = false; |
454 | } |
455 | |
456 | CharCodeToUnicode::CharCodeToUnicode(const std::optional<std::string> &tagA) : tag(tagA) |
457 | { |
458 | map.resize(new_size: 256, x: 0); |
459 | refCnt = 1; |
460 | isIdentity = false; |
461 | } |
462 | CharCodeToUnicode::CharCodeToUnicode(const std::optional<std::string> &tagA, std::vector<Unicode> &&mapA, std::vector<CharCodeToUnicodeString> &&sMapA) : tag(tagA) |
463 | { |
464 | map = std::move(mapA); |
465 | sMap = std::move(sMapA); |
466 | refCnt = 1; |
467 | isIdentity = false; |
468 | } |
469 | |
470 | void CharCodeToUnicode::incRefCnt() |
471 | { |
472 | ++refCnt; |
473 | } |
474 | |
475 | void CharCodeToUnicode::decRefCnt() |
476 | { |
477 | if (--refCnt == 0) { |
478 | delete this; |
479 | } |
480 | } |
481 | |
482 | bool CharCodeToUnicode::match(const GooString *tagA) |
483 | { |
484 | return tag && tag == tagA->toStr(); |
485 | } |
486 | |
487 | void CharCodeToUnicode::setMapping(CharCode c, Unicode *u, int len) |
488 | { |
489 | size_t i; |
490 | int j; |
491 | |
492 | if (map.empty() || isIdentity) { |
493 | return; |
494 | } |
495 | if (len == 1) { |
496 | map[c] = u[0]; |
497 | } else { |
498 | std::optional<std::reference_wrapper<CharCodeToUnicodeString>> element; |
499 | for (i = 0; i < sMap.size(); ++i) { |
500 | if (sMap[i].c == c) { |
501 | sMap[i].u.clear(); |
502 | element = std::ref(t&: sMap[i]); |
503 | break; |
504 | } |
505 | } |
506 | if (!element) { |
507 | sMap.emplace_back(); |
508 | element = std::ref(t&: sMap.back()); |
509 | } |
510 | map[c] = 0; |
511 | element->get().c = c; |
512 | element->get().u.reserve(n: len); |
513 | for (j = 0; j < len; ++j) { |
514 | if (UnicodeIsValid(ucs4: u[j])) { |
515 | element->get().u.push_back(x: u[j]); |
516 | } else { |
517 | element->get().u.push_back(x: 0xfffd); |
518 | } |
519 | } |
520 | } |
521 | } |
522 | |
523 | int CharCodeToUnicode::mapToUnicode(CharCode c, Unicode const **u) const |
524 | { |
525 | if (isIdentity) { |
526 | auto that = const_cast<CharCodeToUnicode *>(this); |
527 | that->map[0] = (Unicode)c; |
528 | *u = map.data(); |
529 | return 1; |
530 | } |
531 | if (c >= map.size()) { |
532 | return 0; |
533 | } |
534 | if (map[c]) { |
535 | *u = &map[c]; |
536 | return 1; |
537 | } |
538 | for (auto i = sMap.size(); i > 0; --i) { // in reverse so CMap takes precedence |
539 | if (sMap[i - 1].c == c) { |
540 | *u = sMap[i - 1].u.data(); |
541 | return sMap[i - 1].u.size(); |
542 | } |
543 | } |
544 | return 0; |
545 | } |
546 | |
547 | int CharCodeToUnicode::mapToCharCode(const Unicode *u, CharCode *c, int usize) const |
548 | { |
549 | // look for charcode in map |
550 | if (usize == 1 || (usize > 1 && !(*u & ~0xff))) { |
551 | if (isIdentity) { |
552 | *c = (CharCode)*u; |
553 | return 1; |
554 | } |
555 | for (CharCode i = 0; i < map.size(); i++) { |
556 | if (map[i] == *u) { |
557 | *c = i; |
558 | return 1; |
559 | } |
560 | } |
561 | *c = 'x'; |
562 | } else { |
563 | size_t j; |
564 | // for each entry in the sMap |
565 | for (const auto &element : sMap) { |
566 | // if the entry's unicode length isn't the same are usize, the strings |
567 | // are obviously different |
568 | if (element.u.size() != size_t(usize)) { |
569 | continue; |
570 | } |
571 | // compare the string char by char |
572 | for (j = 0; j < element.u.size(); j++) { |
573 | if (element.u[j] != u[j]) { |
574 | break; |
575 | } |
576 | } |
577 | |
578 | // we have the same strings |
579 | if (j == element.u.size()) { |
580 | *c = element.c; |
581 | return 1; |
582 | } |
583 | } |
584 | } |
585 | return 0; |
586 | } |
587 | |
588 | //------------------------------------------------------------------------ |
589 | |
590 | CharCodeToUnicodeCache::CharCodeToUnicodeCache(int sizeA) |
591 | { |
592 | int i; |
593 | |
594 | size = sizeA; |
595 | cache = (CharCodeToUnicode **)gmallocn(count: size, size: sizeof(CharCodeToUnicode *)); |
596 | for (i = 0; i < size; ++i) { |
597 | cache[i] = nullptr; |
598 | } |
599 | } |
600 | |
601 | CharCodeToUnicodeCache::~CharCodeToUnicodeCache() |
602 | { |
603 | int i; |
604 | |
605 | for (i = 0; i < size; ++i) { |
606 | if (cache[i]) { |
607 | cache[i]->decRefCnt(); |
608 | } |
609 | } |
610 | gfree(p: cache); |
611 | } |
612 | |
613 | CharCodeToUnicode *CharCodeToUnicodeCache::getCharCodeToUnicode(const GooString *tag) |
614 | { |
615 | CharCodeToUnicode *ctu; |
616 | int i, j; |
617 | |
618 | if (cache[0] && cache[0]->match(tagA: tag)) { |
619 | cache[0]->incRefCnt(); |
620 | return cache[0]; |
621 | } |
622 | for (i = 1; i < size; ++i) { |
623 | if (cache[i] && cache[i]->match(tagA: tag)) { |
624 | ctu = cache[i]; |
625 | for (j = i; j >= 1; --j) { |
626 | cache[j] = cache[j - 1]; |
627 | } |
628 | cache[0] = ctu; |
629 | ctu->incRefCnt(); |
630 | return ctu; |
631 | } |
632 | } |
633 | return nullptr; |
634 | } |
635 | |
636 | void CharCodeToUnicodeCache::add(CharCodeToUnicode *ctu) |
637 | { |
638 | int i; |
639 | |
640 | if (cache[size - 1]) { |
641 | cache[size - 1]->decRefCnt(); |
642 | } |
643 | for (i = size - 1; i >= 1; --i) { |
644 | cache[i] = cache[i - 1]; |
645 | } |
646 | cache[0] = ctu; |
647 | ctu->incRefCnt(); |
648 | } |
649 | |