1 | //======================================================================== |
2 | // |
3 | // UnicodeMap.cc |
4 | // |
5 | // Copyright 2001-2003 Glyph & Cog, LLC |
6 | // |
7 | //======================================================================== |
8 | |
9 | //======================================================================== |
10 | // |
11 | // Modified under the Poppler project - http://poppler.freedesktop.org |
12 | // |
13 | // All changes made under the Poppler project to this file are licensed |
14 | // under GPL version 2 or later |
15 | // |
16 | // Copyright (C) 2010 Jakub Wilk <jwilk@jwilk.net> |
17 | // Copyright (C) 2017-2020, 2022 Albert Astals Cid <aacid@kde.org> |
18 | // Copyright (C) 2017 Adrian Johnson <ajohnson@redneon.com> |
19 | // Copyright (C) 2017 Jean Ghali <jghali@libertysurf.fr> |
20 | // Copyright (C) 2018 Adam Reichold <adam.reichold@t-online.de> |
21 | // Copyright (C) 2019 Oliver Sander <oliver.sander@tu-dresden.de> |
22 | // Copyright (C) 2019 Volker Krause <vkrause@kde.org> |
23 | // |
24 | // To see a description of the changes please see the Changelog file that |
25 | // came with your tarball or type make ChangeLog if you are building from git |
26 | // |
27 | //======================================================================== |
28 | |
29 | #include <config.h> |
30 | |
31 | #include <cstdio> |
32 | #include <cstring> |
33 | #include "goo/glibc.h" |
34 | #include "goo/gmem.h" |
35 | #include "goo/gfile.h" |
36 | #include "goo/GooString.h" |
37 | #include "Error.h" |
38 | #include "GlobalParams.h" |
39 | #include "UnicodeMap.h" |
40 | |
41 | //------------------------------------------------------------------------ |
42 | |
43 | #define maxExtCode 16 |
44 | |
45 | struct UnicodeMapExt |
46 | { |
47 | Unicode u; // Unicode char |
48 | char code[maxExtCode]; |
49 | unsigned int nBytes; |
50 | }; |
51 | |
52 | //------------------------------------------------------------------------ |
53 | |
54 | std::unique_ptr<UnicodeMap> UnicodeMap::parse(const std::string &encodingNameA) |
55 | { |
56 | FILE *f; |
57 | UnicodeMapRange *range; |
58 | UnicodeMapExt *eMap; |
59 | int size, eMapsSize; |
60 | char buf[256]; |
61 | int line, nBytes, i; |
62 | char *tok1, *tok2, *tok3; |
63 | char *tokptr; |
64 | |
65 | if (!(f = globalParams->getUnicodeMapFile(encodingName: encodingNameA))) { |
66 | error(category: errSyntaxError, pos: -1, msg: "Couldn't find unicodeMap file for the '{0:s}' encoding" , encodingNameA.c_str()); |
67 | return {}; |
68 | } |
69 | |
70 | auto map = std::unique_ptr<UnicodeMap>(new UnicodeMap(encodingNameA)); |
71 | |
72 | size = 8; |
73 | UnicodeMapRange *customRanges = (UnicodeMapRange *)gmallocn(count: size, size: sizeof(UnicodeMapRange)); |
74 | eMapsSize = 0; |
75 | |
76 | line = 1; |
77 | while (getLine(buf, size: sizeof(buf), f)) { |
78 | if ((tok1 = strtok_r(s: buf, delim: " \t\r\n" , save_ptr: &tokptr)) && (tok2 = strtok_r(s: nullptr, delim: " \t\r\n" , save_ptr: &tokptr))) { |
79 | if (!(tok3 = strtok_r(s: nullptr, delim: " \t\r\n" , save_ptr: &tokptr))) { |
80 | tok3 = tok2; |
81 | tok2 = tok1; |
82 | } |
83 | nBytes = strlen(s: tok3) / 2; |
84 | if (nBytes <= 4) { |
85 | if (map->len == size) { |
86 | size *= 2; |
87 | customRanges = (UnicodeMapRange *)greallocn(p: customRanges, count: size, size: sizeof(UnicodeMapRange)); |
88 | } |
89 | range = &customRanges[map->len]; |
90 | sscanf(s: tok1, format: "%x" , &range->start); |
91 | sscanf(s: tok2, format: "%x" , &range->end); |
92 | sscanf(s: tok3, format: "%x" , &range->code); |
93 | range->nBytes = nBytes; |
94 | ++map->len; |
95 | } else if (tok2 == tok1) { |
96 | if (map->eMapsLen == eMapsSize) { |
97 | eMapsSize += 16; |
98 | map->eMaps = (UnicodeMapExt *)greallocn(p: map->eMaps, count: eMapsSize, size: sizeof(UnicodeMapExt)); |
99 | } |
100 | eMap = &map->eMaps[map->eMapsLen]; |
101 | sscanf(s: tok1, format: "%x" , &eMap->u); |
102 | for (i = 0; i < nBytes; ++i) { |
103 | unsigned int x; |
104 | sscanf(s: tok3 + i * 2, format: "%2x" , &x); |
105 | eMap->code[i] = (char)x; |
106 | } |
107 | eMap->nBytes = nBytes; |
108 | ++map->eMapsLen; |
109 | } else { |
110 | error(category: errSyntaxError, pos: -1, msg: "Bad line ({0:d}) in unicodeMap file for the '{1:s}' encoding" , line, encodingNameA.c_str()); |
111 | } |
112 | } else { |
113 | error(category: errSyntaxError, pos: -1, msg: "Bad line ({0:d}) in unicodeMap file for the '{1:s}' encoding" , line, encodingNameA.c_str()); |
114 | } |
115 | ++line; |
116 | } |
117 | |
118 | fclose(stream: f); |
119 | |
120 | map->ranges = customRanges; |
121 | return map; |
122 | } |
123 | |
124 | UnicodeMap::UnicodeMap(const std::string &encodingNameA) |
125 | { |
126 | encodingName = encodingNameA; |
127 | unicodeOut = false; |
128 | kind = unicodeMapUser; |
129 | ranges = nullptr; |
130 | len = 0; |
131 | eMaps = nullptr; |
132 | eMapsLen = 0; |
133 | } |
134 | |
135 | UnicodeMap::UnicodeMap(const char *encodingNameA, bool unicodeOutA, const UnicodeMapRange *rangesA, int lenA) |
136 | { |
137 | encodingName = encodingNameA; |
138 | unicodeOut = unicodeOutA; |
139 | kind = unicodeMapResident; |
140 | ranges = rangesA; |
141 | len = lenA; |
142 | eMaps = nullptr; |
143 | eMapsLen = 0; |
144 | } |
145 | |
146 | UnicodeMap::UnicodeMap(const char *encodingNameA, bool unicodeOutA, UnicodeMapFunc funcA) |
147 | { |
148 | encodingName = encodingNameA; |
149 | unicodeOut = unicodeOutA; |
150 | kind = unicodeMapFunc; |
151 | func = funcA; |
152 | eMaps = nullptr; |
153 | eMapsLen = 0; |
154 | } |
155 | |
156 | UnicodeMap::~UnicodeMap() |
157 | { |
158 | if (kind == unicodeMapUser && ranges) { |
159 | gfree(p: const_cast<UnicodeMapRange *>(ranges)); |
160 | } |
161 | if (eMaps) { |
162 | gfree(p: eMaps); |
163 | } |
164 | } |
165 | |
166 | UnicodeMap::UnicodeMap(UnicodeMap &&other) noexcept : encodingName { std::move(other.encodingName) }, kind { other.kind }, unicodeOut { other.unicodeOut }, len { other.len }, eMaps { other.eMaps }, eMapsLen { other.eMapsLen } |
167 | { |
168 | switch (kind) { |
169 | case unicodeMapUser: |
170 | case unicodeMapResident: |
171 | ranges = other.ranges; |
172 | other.ranges = nullptr; |
173 | break; |
174 | case unicodeMapFunc: |
175 | func = other.func; |
176 | break; |
177 | } |
178 | other.eMaps = nullptr; |
179 | } |
180 | |
181 | UnicodeMap &UnicodeMap::operator=(UnicodeMap &&other) noexcept |
182 | { |
183 | if (this != &other) { |
184 | swap(other); |
185 | } |
186 | return *this; |
187 | } |
188 | |
189 | void UnicodeMap::swap(UnicodeMap &other) noexcept |
190 | { |
191 | using std::swap; |
192 | swap(lhs&: encodingName, rhs&: other.encodingName); |
193 | swap(a&: unicodeOut, b&: other.unicodeOut); |
194 | switch (kind) { |
195 | case unicodeMapUser: |
196 | case unicodeMapResident: |
197 | switch (other.kind) { |
198 | case unicodeMapUser: |
199 | case unicodeMapResident: |
200 | swap(a&: ranges, b&: other.ranges); |
201 | break; |
202 | case unicodeMapFunc: { |
203 | const auto tmp = ranges; |
204 | func = other.func; |
205 | other.ranges = tmp; |
206 | break; |
207 | } |
208 | } |
209 | break; |
210 | case unicodeMapFunc: |
211 | switch (other.kind) { |
212 | case unicodeMapUser: |
213 | case unicodeMapResident: { |
214 | const auto tmp = func; |
215 | ranges = other.ranges; |
216 | other.func = tmp; |
217 | break; |
218 | } |
219 | case unicodeMapFunc: |
220 | swap(a&: func, b&: other.func); |
221 | break; |
222 | } |
223 | break; |
224 | } |
225 | swap(a&: kind, b&: other.kind); |
226 | swap(a&: len, b&: other.len); |
227 | swap(a&: eMaps, b&: other.eMaps); |
228 | swap(a&: eMapsLen, b&: other.eMapsLen); |
229 | } |
230 | |
231 | bool UnicodeMap::match(const std::string &encodingNameA) const |
232 | { |
233 | return encodingName == encodingNameA; |
234 | } |
235 | |
236 | int UnicodeMap::mapUnicode(Unicode u, char *buf, int bufSize) const |
237 | { |
238 | int a, b, m, n, i, j; |
239 | unsigned int code; |
240 | |
241 | if (kind == unicodeMapFunc) { |
242 | return (*func)(u, buf, bufSize); |
243 | } |
244 | |
245 | a = 0; |
246 | b = len; |
247 | if (u >= ranges[a].start) { |
248 | // invariant: ranges[a].start <= u < ranges[b].start |
249 | while (b - a > 1) { |
250 | m = (a + b) / 2; |
251 | if (u >= ranges[m].start) { |
252 | a = m; |
253 | } else if (u < ranges[m].start) { |
254 | b = m; |
255 | } |
256 | } |
257 | if (u <= ranges[a].end) { |
258 | n = ranges[a].nBytes; |
259 | if (n > bufSize) { |
260 | return 0; |
261 | } |
262 | code = ranges[a].code + (u - ranges[a].start); |
263 | for (i = n - 1; i >= 0; --i) { |
264 | buf[i] = (char)(code & 0xff); |
265 | code >>= 8; |
266 | } |
267 | return n; |
268 | } |
269 | } |
270 | |
271 | for (i = 0; i < eMapsLen; ++i) { |
272 | if (eMaps[i].u == u) { |
273 | n = eMaps[i].nBytes; |
274 | for (j = 0; j < n; ++j) { |
275 | buf[j] = eMaps[i].code[j]; |
276 | } |
277 | return n; |
278 | } |
279 | } |
280 | |
281 | return 0; |
282 | } |
283 | |
284 | //------------------------------------------------------------------------ |
285 | |
286 | UnicodeMapCache::UnicodeMapCache() { } |
287 | |
288 | const UnicodeMap *UnicodeMapCache::getUnicodeMap(const std::string &encodingName) |
289 | { |
290 | for (const std::unique_ptr<UnicodeMap> &map : cache) { |
291 | if (map->match(encodingNameA: encodingName)) { |
292 | return map.get(); |
293 | } |
294 | } |
295 | std::unique_ptr<UnicodeMap> map = UnicodeMap::parse(encodingNameA: encodingName); |
296 | if (map) { |
297 | UnicodeMap *m = map.get(); |
298 | cache.emplace_back(args: std::move(map)); |
299 | return m; |
300 | } |
301 | return nullptr; |
302 | } |
303 | |