CharCodeToUnicode.cc source code [poppler/poppler/CharCodeToUnicode.cc]

1	//========================================================================
2	//
3	// CharCodeToUnicode.cc
4	//
5	// Copyright 2001-2003 Glyph & Cog, LLC
6	//
7	//========================================================================
8
9	//========================================================================
10	//
11	// Modified under the Poppler project - http://poppler.freedesktop.org
12	//
13	// All changes made under the Poppler project to this file are licensed
14	// under GPL version 2 or later
15	//
16	// Copyright (C) 2006, 2008-2010, 2012, 2018-2022, 2024 Albert Astals Cid <aacid@kde.org>
17	// Copyright (C) 2007 Julien Rebetez <julienr@svn.gnome.org>
18	// Copyright (C) 2007 Koji Otani <sho@bbr.jp>
19	// Copyright (C) 2008 Michael Vrable <mvrable@cs.ucsd.edu>
20	// Copyright (C) 2008 Vasile Gaburici <gaburici@cs.umd.edu>
21	// Copyright (C) 2010 William Bader <williambader@hotmail.com>
22	// Copyright (C) 2010 Jakub Wilk <jwilk@jwilk.net>
23	// Copyright (C) 2012 Thomas Freitag <Thomas.Freitag@alfa.de>
24	// Copyright (C) 2012, 2017 Adrian Johnson <ajohnson@redneon.com>
25	// Copyright (C) 2014 Jiri Slaby <jirislaby@gmail.com>
26	// Copyright (C) 2015 Marek Kasik <mkasik@redhat.com>
27	// Copyright (C) 2017 Jean Ghali <jghali@libertysurf.fr>
28	// Copyright (C) 2018 Klarälvdalens Datakonsult AB, a KDAB Group company, <info@kdab.com>. Work sponsored by the LiMux project of the city of Munich
29	// Copyright (C) 2018 Adam Reichold <adam.reichold@t-online.de>
30	// Copyright (C) 2019 <corentinf@free.fr>
31	// Copyright (C) 2024 g10 Code GmbH, Author: Sune Stolborg Vuorela <sune@vuorela.dk>
32	//
33	// To see a description of the changes please see the Changelog file that
34	// came with your tarball or type make ChangeLog if you are building from git
35	//
36	//========================================================================
37
38	#include <config.h>
39
40	#include <cstdio>
41	#include <cstring>
42	#include <functional>
43	#include "goo/glibc.h"
44	#include "goo/gmem.h"
45	#include "goo/gfile.h"
46	#include "goo/GooLikely.h"
47	#include "goo/GooString.h"
48	#include "Error.h"
49	#include "GlobalParams.h"
50	#include "PSTokenizer.h"
51	#include "CharCodeToUnicode.h"
52	#include "UTF.h"
53
54	//------------------------------------------------------------------------
55
56	//------------------------------------------------------------------------
57
// Character-source callback that reads from an in-memory, NUL-terminated
// string.  <data> is the address of a cursor pointer (a pointer to the
// next unread byte).  Each call returns the byte under the cursor as an
// unsigned value and advances the cursor by one.  Once the terminating
// NUL is reached the cursor is left in place and EOF is returned on this
// and every following call.
static int getCharFromString(void *data)
{
    unsigned char **cursor = static_cast<unsigned char **>(data);
    unsigned char *p = *cursor;
    int c;

    if (*p) {
        c = *p++;
        // write the advanced position back so the next call continues
        // where this one stopped
        *cursor = p;
    } else {
        c = EOF;
    }
    return c;
}
72
// Character-source callback that reads from a stdio stream.  <data> is
// the FILE * to read from; returns the next byte, or EOF as reported by
// fgetc().
static int getCharFromFile(void *data)
{
    return fgetc(static_cast<FILE *>(data));
}
77
78	//------------------------------------------------------------------------
79
// Lookup table indexed by byte value: the numeric value (0..15) of the
// byte interpreted as a hexadecimal digit ('0'-'9', 'A'-'F', 'a'-'f'),
// or -1 if the byte is not a hex digit.
static const int hexCharVals[256] = {
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 0x
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 1x
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 2x
    0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  -1, -1, -1, -1, -1, -1, // 3x
    -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 4x
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 5x
    -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 6x
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 7x
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 8x
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 9x
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // Ax
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // Bx
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // Cx
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // Dx
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // Ex
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // Fx
};
98
99	// Parse a <len>-byte hex string <s> into <val>. Returns false on*
100	// error.
101	static bool parseHex(const char s, int* len, unsigned int *val)
102	{
103	int i, x, v = `0`;
104
105	for (i = `0`; i < len; ++i) {
106	x = hexCharVals[s[i] & `0xff`];
107	if (x < `0`) {
108	*val = `0`;
109	return false;
110	}
111	v = (v << `4`) + x;
112	}
113	*val = v;
114	return true;
115	}
116
117	//------------------------------------------------------------------------
118
119	CharCodeToUnicode *CharCodeToUnicode::makeIdentityMapping()
120	{
121	CharCodeToUnicode ctu = new* CharCodeToUnicode ();
122	ctu->isIdentity = true;
123	ctu->map.resize(new_size: `1`, x: `0`);
124	return ctu;
125	}
126	CharCodeToUnicode CharCodeToUnicode::parseCIDToUnicode(const* char fileName, const* GooString *collection)
127	{
128	FILE *f;
129	CharCode size;
130	char buf[`64`];
131	Unicode u;
132
133	if (!(f = openFile(path: fileName, mode: "r"))) {
134	error(category: errIO, pos: -`1`, msg: "Couldn't open cidToUnicode file '{0:s}'", fileName);
135	return nullptr;
136	}
137
138	size = `32768`;
139	std::vector<Unicode> mapA;
140	mapA.resize(new_size: size, x: `0`);
141	CharCode mapLenA = `0`;
142
143	while (getLine(buf, size: sizeof(buf), f)) {
144	if (mapLenA == size) {
145	size *= `2`;
146	mapA.resize(new_size: size);
147	}
148	if (sscanf(s: buf, format: "%x", &u) == `1`) {
149	mapA [mapLenA] = u;
150	} else {
151	error(category: errSyntaxWarning, pos: -`1`, msg: "Bad line ({0:d}) in cidToUnicode file '{1:s}'", (int)(mapLenA + `1`), fileName);
152	mapA [mapLenA] = `0`;
153	}
154	++mapLenA;
155	}
156	fclose(stream: f);
157	mapA.resize(new_size: mapLenA);
158
159	return new CharCodeToUnicode (collection->toStr(), std::move(mapA), {});
160	}
161
162	CharCodeToUnicode CharCodeToUnicode::make8BitToUnicode(Unicode toUnicode)
163	{
164	std::vector<Unicode> data(toUnicode, toUnicode + `256`);
165	return new CharCodeToUnicode ({}, std::move(data), {});
166	}
167
168	CharCodeToUnicode CharCodeToUnicode::parseCMap(const* GooString buf, int* nBits)
169	{
170	CharCodeToUnicode *ctu;
171
172	ctu = new CharCodeToUnicode (std::optional<std::string>());
173	const char *p = buf->c_str();
174	if (!ctu->parseCMap1(getCharFunc: &getCharFromString, data: &p, nBits)) {
175	delete ctu;
176	return nullptr;
177	}
178	return ctu;
179	}
180
181	CharCodeToUnicode CharCodeToUnicode::parseCMapFromFile(const* GooString fileName, int* nBits)
182	{
183	CharCodeToUnicode *ctu;
184	FILE *f;
185
186	ctu = new CharCodeToUnicode (std::optional<std::string>());
187	if ((f = globalParams ->findToUnicodeFile(name: fileName))) {
188	if (!ctu->parseCMap1(getCharFunc: &getCharFromFile, data: f, nBits)) {
189	delete ctu;
190	fclose(stream: f);
191	return nullptr;
192	}
193	} else {
194	error(category: errSyntaxError, pos: -`1`, msg: "Couldn't find ToUnicode CMap file for '{0:t}'", fileName);
195	}
196	return ctu;
197	}
198
199	void CharCodeToUnicode::mergeCMap(const GooString buf, int* nBits)
200	{
201	const char *p = buf->c_str();
202	parseCMap1(getCharFunc: &getCharFromString, data: &p, nBits);
203	}
204
205	bool CharCodeToUnicode::parseCMap1(int (getCharFunc)(void* ), void* data, int* nBits)
206	{
207	PSTokenizer *pst;
208	char tok1[`256`], tok2[`256`], tok3[`256`];
209	int n1, n2, n3;
210	CharCode i;
211	CharCode maxCode, code1, code2;
212	GooString *name;
213	FILE *f;
214
215	bool ok = false;
216	maxCode = (nBits == `8`) ? `0xff` : (nBits == `16`) ? `0xffff` : `0xffffffff`;
217	pst = new PSTokenizer (getCharFunc, data);
218	pst->getToken(buf: tok1, size: sizeof(tok1), length: &n1);
219	while (pst->getToken(buf: tok2, size: sizeof(tok2), length: &n2)) {
220	if (!strcmp(s1: tok2, s2: "usecmap")) {
221	if (tok1[`0`] == `'/'`) {
222	name = new GooString (tok1 + `1`);
223	if ((f = globalParams ->findToUnicodeFile(name))) {
224	if (parseCMap1(getCharFunc: &getCharFromFile, data: f, nBits)) {
225	ok = true;
226	}
227	fclose(stream: f);
228	} else {
229	error(category: errSyntaxError, pos: -`1`, msg: "Couldn't find ToUnicode CMap file for '{0:t}'", name);
230	}
231	delete name;
232	}
233	pst->getToken(buf: tok1, size: sizeof(tok1), length: &n1);
234	} else if (!strcmp(s1: tok2, s2: "beginbfchar")) {
235	while (pst->getToken(buf: tok1, size: sizeof(tok1), length: &n1)) {
236	if (!strcmp(s1: tok1, s2: "endbfchar")) {
237	break;
238	}
239	if (!pst->getToken(buf: tok2, size: sizeof(tok2), length: &n2) \|\| !strcmp(s1: tok2, s2: "endbfchar")) {
240	error(category: errSyntaxWarning, pos: -`1`, msg: "Illegal entry in bfchar block in ToUnicode CMap");
241	break;
242	}
243	if (!(tok1[`0`] == `'<'` && tok1[n1 - `1`] == `'>'` && tok2[`0`] == `'<'` && tok2[n2 - `1`] == `'>'`)) {
244	error(category: errSyntaxWarning, pos: -`1`, msg: "Illegal entry in bfchar block in ToUnicode CMap");
245	continue;
246	}
247	tok1[n1 - `1`] = tok2[n2 - `1`] = `'\0'`;
248	if (!parseHex(s: tok1 + `1`, len: n1 - `2`, val: &code1)) {
249	error(category: errSyntaxWarning, pos: -`1`, msg: "Illegal entry in bfchar block in ToUnicode CMap");
250	continue;
251	}
252	if (code1 > maxCode) {
253	error(category: errSyntaxWarning, pos: -`1`, msg: "Invalid entry in bfchar block in ToUnicode CMap");
254	}
255	addMapping(code: code1, uStr: tok2 + `1`, n: n2 - `2`, offset: `0`);
256	ok = true;
257	}
258	pst->getToken(buf: tok1, size: sizeof(tok1), length: &n1);
259	} else if (!strcmp(s1: tok2, s2: "beginbfrange")) {
260	while (pst->getToken(buf: tok1, size: sizeof(tok1), length: &n1)) {
261	if (!strcmp(s1: tok1, s2: "endbfrange")) {
262	break;
263	}
264	if (!pst->getToken(buf: tok2, size: sizeof(tok2), length: &n2) \|\| !strcmp(s1: tok2, s2: "endbfrange") \|\| !pst->getToken(buf: tok3, size: sizeof(tok3), length: &n3) \|\| !strcmp(s1: tok3, s2: "endbfrange")) {
265	error(category: errSyntaxWarning, pos: -`1`, msg: "Illegal entry in bfrange block in ToUnicode CMap");
266	break;
267	}
268	if (!(tok1[`0`] == `'<'` && tok1[n1 - `1`] == `'>'` && tok2[`0`] == `'<'` && tok2[n2 - `1`] == `'>'`)) {
269	error(category: errSyntaxWarning, pos: -`1`, msg: "Illegal entry in bfrange block in ToUnicode CMap");
270	continue;
271	}
272	tok1[n1 - `1`] = tok2[n2 - `1`] = `'\0'`;
273	if (!parseHex(s: tok1 + `1`, len: n1 - `2`, val: &code1) \|\| !parseHex(s: tok2 + `1`, len: n2 - `2`, val: &code2)) {
274	error(category: errSyntaxWarning, pos: -`1`, msg: "Illegal entry in bfrange block in ToUnicode CMap");
275	continue;
276	}
277	if (code1 > maxCode \|\| code2 > maxCode) {
278	error(category: errSyntaxWarning, pos: -`1`, msg: "Invalid entry in bfrange block in ToUnicode CMap");
279	if (code1 > maxCode) {
280	code1 = maxCode;
281	}
282	if (code2 > maxCode) {
283	code2 = maxCode;
284	}
285	}
286	if (!strcmp(s1: tok3, s2: "[")) {
287	i = `0`;
288	while (pst->getToken(buf: tok1, size: sizeof(tok1), length: &n1) && code1 + i <= code2) {
289	if (!strcmp(s1: tok1, s2: "]")) {
290	break;
291	}
292	if (tok1[`0`] == `'<'` && tok1[n1 - `1`] == `'>'`) {
293	tok1[n1 - `1`] = `'\0'`;
294	addMapping(code: code1 + i, uStr: tok1 + `1`, n: n1 - `2`, offset: `0`);
295	ok = true;
296	} else {
297	error(category: errSyntaxWarning, pos: -`1`, msg: "Illegal entry in bfrange block in ToUnicode CMap");
298	}
299	++i;
300	}
301	} else if (tok3[`0`] == `'<'` && tok3[n3 - `1`] == `'>'`) {
302	tok3[n3 - `1`] = `'\0'`;
303	for (i = `0`; code1 <= code2; ++code1, ++i) {
304	addMapping(code: code1, uStr: tok3 + `1`, n: n3 - `2`, offset: i);
305	ok = true;
306	}
307
308	} else {
309	error(category: errSyntaxWarning, pos: -`1`, msg: "Illegal entry in bfrange block in ToUnicode CMap");
310	}
311	}
312	pst->getToken(buf: tok1, size: sizeof(tok1), length: &n1);
313	} else if (!strcmp(s1: tok2, s2: "begincidchar")) {
314	// the begincidchar operator is not allowed in ToUnicode CMaps,
315	// but some buggy PDF generators incorrectly use
316	// code-to-CID-type CMaps here
317	error(category: errSyntaxWarning, pos: -`1`, msg: "Invalid 'begincidchar' operator in ToUnicode CMap");
318	while (pst->getToken(buf: tok1, size: sizeof(tok1), length: &n1)) {
319	if (!strcmp(s1: tok1, s2: "endcidchar")) {
320	break;
321	}
322	if (!pst->getToken(buf: tok2, size: sizeof(tok2), length: &n2) \|\| !strcmp(s1: tok2, s2: "endcidchar")) {
323	error(category: errSyntaxWarning, pos: -`1`, msg: "Illegal entry in cidchar block in ToUnicode CMap");
324	break;
325	}
326	if (!(tok1[`0`] == `'<'` && tok1[n1 - `1`] == `'>'`)) {
327	error(category: errSyntaxWarning, pos: -`1`, msg: "Illegal entry in cidchar block in ToUnicode CMap");
328	continue;
329	}
330	tok1[n1 - `1`] = `'\0'`;
331	if (!parseHex(s: tok1 + `1`, len: n1 - `2`, val: &code1)) {
332	error(category: errSyntaxWarning, pos: -`1`, msg: "Illegal entry in cidchar block in ToUnicode CMap");
333	continue;
334	}
335	if (code1 > maxCode) {
336	error(category: errSyntaxWarning, pos: -`1`, msg: "Invalid entry in cidchar block in ToUnicode CMap");
337	}
338	addMappingInt(code: code1, u: atoi(nptr: tok2));
339	ok = true;
340	}
341	pst->getToken(buf: tok1, size: sizeof(tok1), length: &n1);
342	} else if (!strcmp(s1: tok2, s2: "begincidrange")) {
343	// the begincidrange operator is not allowed in ToUnicode CMaps,
344	// but some buggy PDF generators incorrectly use
345	// code-to-CID-type CMaps here
346	error(category: errSyntaxWarning, pos: -`1`, msg: "Invalid 'begincidrange' operator in ToUnicode CMap");
347	while (pst->getToken(buf: tok1, size: sizeof(tok1), length: &n1)) {
348	if (!strcmp(s1: tok1, s2: "endcidrange")) {
349	break;
350	}
351	if (!pst->getToken(buf: tok2, size: sizeof(tok2), length: &n2) \|\| !strcmp(s1: tok2, s2: "endcidrange") \|\| !pst->getToken(buf: tok3, size: sizeof(tok3), length: &n3) \|\| !strcmp(s1: tok3, s2: "endcidrange")) {
352	error(category: errSyntaxWarning, pos: -`1`, msg: "Illegal entry in cidrange block in ToUnicode CMap");
353	break;
354	}
355	if (!(tok1[`0`] == `'<'` && tok1[n1 - `1`] == `'>'` && tok2[`0`] == `'<'` && tok2[n2 - `1`] == `'>'`)) {
356	error(category: errSyntaxWarning, pos: -`1`, msg: "Illegal entry in cidrange block in ToUnicode CMap");
357	continue;
358	}
359	tok1[n1 - `1`] = tok2[n2 - `1`] = `'\0'`;
360	if (!parseHex(s: tok1 + `1`, len: n1 - `2`, val: &code1) \|\| !parseHex(s: tok2 + `1`, len: n2 - `2`, val: &code2)) {
361	error(category: errSyntaxWarning, pos: -`1`, msg: "Illegal entry in cidrange block in ToUnicode CMap");
362	continue;
363	}
364	if (code1 > maxCode \|\| code2 > maxCode) {
365	error(category: errSyntaxWarning, pos: -`1`, msg: "Invalid entry in cidrange block in ToUnicode CMap");
366	if (code2 > maxCode) {
367	code2 = maxCode;
368	}
369	}
370	for (i = atoi(nptr: tok3); code1 <= code2; ++code1, ++i) {
371	addMappingInt(code: code1, u: i);
372	ok = true;
373	}
374	}
375	pst->getToken(buf: tok1, size: sizeof(tok1), length: &n1);
376	} else {
377	strcpy(dest: tok1, src: tok2);
378	}
379	}
380	delete pst;
381	return ok;
382	}
383
384	void CharCodeToUnicode::addMapping(CharCode code, char uStr, int* n, int offset)
385	{
386	Unicode u;
387	int j;
388
389	if (code > `0xffffff`) {
390	// This is an arbitrary limit to avoid integer overflow issues.
391	// (I've seen CMaps with mappings for <ffffffff>.)
392	return;
393	}
394	if (code >= map.size()) {
395	size_t oldLen = map.size();
396	auto newLen = oldLen ? `2` * oldLen : `256`;
397	if (code >= newLen) {
398	newLen = (code + `256`) & ~`255`;
399	}
400	if (unlikely(code >= newLen)) {
401	error(category: errSyntaxWarning, pos: -`1`, msg: "Illegal code value in CharCodeToUnicode::addMapping");
402	return;
403	} else {
404	map.resize(new_size: newLen, x: `0`);
405	}
406	}
407	if (n <= `4`) {
408	if (!parseHex(s: uStr, len: n, val: &u)) {
409	error(category: errSyntaxWarning, pos: -`1`, msg: "Illegal entry in ToUnicode CMap");
410	return;
411	}
412	map [code] = u + offset;
413	if (!UnicodeIsValid(ucs4: map [code])) {
414	map [code] = `0xfffd`;
415	}
416	} else {
417	map [code] = `0`;
418	int utf16Len = n / `4`;
419	std::vector<Unicode> utf16(utf16Len);
420	utf16.resize(new_size: utf16Len);
421	for (j = `0`; j < utf16Len; ++j) {
422	if (!parseHex(s: uStr + j * `4`, len: `4`, val: &utf16 [j])) {
423	error(category: errSyntaxWarning, pos: -`1`, msg: "Illegal entry in ToUnicode CMap");
424	return;
425	}
426	}
427	utf16 [utf16Len - `1`] += offset;
428	sMap.push_back(x: { .c: code, .u: UTF16toUCS4(utf16: utf16.data(), utf16Len: utf16.size()) });
429	}
430	}
431
432	void CharCodeToUnicode::addMappingInt(CharCode code, Unicode u)
433	{
434	if (code > `0xffffff`) {
435	// This is an arbitrary limit to avoid integer overflow issues.
436	// (I've seen CMaps with mappings for <ffffffff>.)
437	return;
438	}
439	if (code >= map.size()) {
440	size_t oldLen = map.size();
441	size_t newLen = oldLen ? `2` * oldLen : `256`;
442	if (code >= newLen) {
443	newLen = (code + `256`) & ~`255`;
444	}
445	map.resize(new_size: newLen, x: `0`);
446	}
447	map [code] = u;
448	}
449
450	CharCodeToUnicode::CharCodeToUnicode()
451	{
452	refCnt = `1`;
453	isIdentity = false;
454	}
455
456	CharCodeToUnicode::CharCodeToUnicode(const std::optional<std::string> &tagA) : tag (tagA)
457	{
458	map.resize(new_size: `256`, x: `0`);
459	refCnt = `1`;
460	isIdentity = false;
461	}
462	CharCodeToUnicode::CharCodeToUnicode(const std::optional<std::string> &tagA, std::vector<Unicode> &&mapA, std::vector<CharCodeToUnicodeString> &&sMapA) : tag (tagA)
463	{
464	map = std::move(mapA);
465	sMap = std::move(sMapA);
466	refCnt = `1`;
467	isIdentity = false;
468	}
469
470	void CharCodeToUnicode::incRefCnt()
471	{
472	++refCnt;
473	}
474
475	void CharCodeToUnicode::decRefCnt()
476	{
477	if (--refCnt == `0`) {
478	delete this;
479	}
480	}
481
482	bool CharCodeToUnicode::match(const GooString *tagA)
483	{
484	return tag && tag == tagA->toStr();
485	}
486
487	void CharCodeToUnicode::setMapping(CharCode c, Unicode u, int* len)
488	{
489	size_t i;
490	int j;
491
492	if (map.empty() \|\| isIdentity) {
493	return;
494	}
495	if (len == `1`) {
496	map [c] = u[`0`];
497	} else {
498	std::optional<std::reference_wrapper<CharCodeToUnicodeString>> element;
499	for (i = `0`; i < sMap.size(); ++i) {
500	if (sMap [i].c == c) {
501	sMap [i].u.clear();
502	element = std::ref(t&: sMap [i]);
503	break;
504	}
505	}
506	if (!element) {
507	sMap.emplace_back();
508	element = std::ref(t&: sMap.back());
509	}
510	map [c] = `0`;
511	element ->get().c = c;
512	element ->get().u.reserve(n: len);
513	for (j = `0`; j < len; ++j) {
514	if (UnicodeIsValid(ucs4: u[j])) {
515	element ->get().u.push_back(x: u[j]);
516	} else {
517	element ->get().u.push_back(x: `0xfffd`);
518	}
519	}
520	}
521	}
522
523	int CharCodeToUnicode::mapToUnicode(CharCode c, Unicode const *u) const*
524	{
525	if (isIdentity) {
526	auto that = const_cast<CharCodeToUnicode >(this*);
527	that->map [`0`] = (Unicode)c;
528	*u = map.data();
529	return `1`;
530	}
531	if (c >= map.size()) {
532	return `0`;
533	}
534	if (map [c]) {
535	*u = &map [c];
536	return `1`;
537	}
538	for (auto i = sMap.size(); i > `0`; --i) { // in reverse so CMap takes precedence
539	if (sMap [i - `1`].c == c) {
540	*u = sMap [i - `1`].u.data();
541	return sMap [i - `1`].u.size();
542	}
543	}
544	return `0`;
545	}
546
547	int CharCodeToUnicode::mapToCharCode(const Unicode u, CharCode c, int usize) const
548	{
549	// look for charcode in map
550	if (usize == `1` \|\| (usize > `1` && !(*u & ~`0xff`))) {
551	if (isIdentity) {
552	c = (CharCode)u;
553	return `1`;
554	}
555	for (CharCode i = `0`; i < map.size(); i++) {
556	if (map [i] == *u) {
557	*c = i;
558	return `1`;
559	}
560	}
561	*c = `'x'`;
562	} else {
563	size_t j;
564	// for each entry in the sMap
565	for (const auto &element : sMap) {
566	// if the entry's unicode length isn't the same are usize, the strings
567	// are obviously different
568	if (element.u.size() != size_t(usize)) {
569	continue;
570	}
571	// compare the string char by char
572	for (j = `0`; j < element.u.size(); j++) {
573	if (element.u [j] != u[j]) {
574	break;
575	}
576	}
577
578	// we have the same strings
579	if (j == element.u.size()) {
580	*c = element.c;
581	return `1`;
582	}
583	}
584	}
585	return `0`;
586	}
587
588	//------------------------------------------------------------------------
589
590	CharCodeToUnicodeCache::CharCodeToUnicodeCache(int sizeA)
591	{
592	int i;
593
594	size = sizeA;
595	cache = (CharCodeToUnicode )gmallocn(count: size, size: sizeof*(CharCodeToUnicode ));
596	for (i = `0`; i < size; ++i) {
597	cache[i] = nullptr;
598	}
599	}
600
601	CharCodeToUnicodeCache::~CharCodeToUnicodeCache()
602	{
603	int i;
604
605	for (i = `0`; i < size; ++i) {
606	if (cache[i]) {
607	cache[i]->decRefCnt();
608	}
609	}
610	gfree(p: cache);
611	}
612
613	CharCodeToUnicode CharCodeToUnicodeCache::getCharCodeToUnicode(const* GooString *tag)
614	{
615	CharCodeToUnicode *ctu;
616	int i, j;
617
618	if (cache[`0`] && cache[`0`]->match(tagA: tag)) {
619	cache[`0`]->incRefCnt();
620	return cache[`0`];
621	}
622	for (i = `1`; i < size; ++i) {
623	if (cache[i] && cache[i]->match(tagA: tag)) {
624	ctu = cache[i];
625	for (j = i; j >= `1`; --j) {
626	cache[j] = cache[j - `1`];
627	}
628	cache[`0`] = ctu;
629	ctu->incRefCnt();
630	return ctu;
631	}
632	}
633	return nullptr;
634	}
635
636	void CharCodeToUnicodeCache::add(CharCodeToUnicode *ctu)
637	{
638	int i;
639
640	if (cache[size - `1`]) {
641	cache[size - `1`]->decRefCnt();
642	}
643	for (i = size - `1`; i >= `1`; --i) {
644	cache[i] = cache[i - `1`];
645	}
646	cache[`0`] = ctu;
647	ctu->incRefCnt();
648	}
649

source code of poppler/poppler/CharCodeToUnicode.cc