//========================================================================
//
// Parser.cc
//
// Copyright 1996-2003 Glyph & Cog, LLC
//
//========================================================================

//========================================================================
//
// Modified under the Poppler project - http://poppler.freedesktop.org
//
// All changes made under the Poppler project to this file are licensed
// under GPL version 2 or later
//
// Copyright (C) 2006, 2009, 2010, 2013, 2014, 2017-2020 Albert Astals Cid <aacid@kde.org>
// Copyright (C) 2006 Krzysztof Kowalczyk <kkowalczyk@gmail.com>
// Copyright (C) 2009 Ilya Gorenbein <igorenbein@finjan.com>
// Copyright (C) 2012 Hib Eris <hib@hiberis.nl>
// Copyright (C) 2013 Adrian Johnson <ajohnson@redneon.com>
// Copyright (C) 2013 Thomas Freitag <Thomas.Freitag@alfa.de>
// Copyright (C) 2018 Klarälvdalens Datakonsult AB, a KDAB Group company, <info@kdab.com>. Work sponsored by the LiMux project of the city of Munich
// Copyright (C) 2018, 2019 Adam Reichold <adam.reichold@t-online.de>
// Copyright (C) 2018 Marek Kasik <mkasik@redhat.com>
//
// To see a description of the changes please see the Changelog file that
// came with your tarball or type make ChangeLog if you are building from git
//
//========================================================================

#include <config.h>

#include <cstddef>
#include "Object.h"
#include "Array.h"
#include "Dict.h"
#include "Decrypt.h"
#include "Parser.h"
#include "XRef.h"
#include "Error.h"

// Max number of nested objects. This is used to catch infinite loops in the
// object structure, and also to bound technically valid files with so many
// nested arrays that they would otherwise make us consume all of the stack.
#define recursionLimit 500

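// The parser keeps a two-token lookahead window: buf1 holds the current token
// and buf2 the next one; shift() advances the window. inlineImg tracks whether
// the lexer is currently positioned at inline image data (see shift()).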
Parser::Parser(XRef *xrefA, Stream *streamA, bool allowStreamsA) : lexer { xrefA, streamA }
{
    allowStreams = allowStreamsA;
    buf1 = lexer.getObj();
    buf2 = lexer.getObj();
    inlineImg = 0;
}

Parser::Parser(XRef *xrefA, Object *objectA, bool allowStreamsA) : lexer { xrefA, objectA }
{
    allowStreams = allowStreamsA;
    buf1 = lexer.getObj();
    buf2 = lexer.getObj();
    inlineImg = 0;
}

Parser::~Parser() = default;

Object Parser::getObj(int recursion)
{
    return getObj(false, nullptr, cryptRC4, 0, 0, 0, recursion);
}

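// Decrypt the bytes of a string object with the document's file key by running
// them through a DecryptStream over an in-memory copy of the string.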
static std::unique_ptr<GooString> decryptedString(const GooString *s, const unsigned char *fileKey, CryptAlgorithm encAlgorithm, int keyLength, int objNum, int objGen)
{
    DecryptStream decrypt(new MemStream(s->c_str(), 0, s->getLength(), Object(objNull)), fileKey, encAlgorithm, keyLength, { objNum, objGen });
    decrypt.reset();
    std::unique_ptr<GooString> res = std::make_unique<GooString>();
    int c;
    while ((c = decrypt.getChar()) != EOF) {
        res->append((char)c);
    }
    return res;
}

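// Parse the next object from the input. If simpleOnly is set, '[' and '<<' are
// not treated as the start of a container object. If fileKey is non-null,
// strings and stream data are decrypted with the given algorithm, key length
// and object number/generation (see the Sig/Contents handling below for the
// one exception). recursion bounds the nesting depth; strict turns recoverable
// syntax errors into hard failures (objError).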
Object Parser::getObj(bool simpleOnly, const unsigned char *fileKey, CryptAlgorithm encAlgorithm, int keyLength, int objNum, int objGen, int recursion, bool strict, bool decryptString)
{
    Object obj;
    Stream *str;

    // refill buffer after inline image data
    if (inlineImg == 2) {
        buf1 = lexer.getObj();
        buf2 = lexer.getObj();
        inlineImg = 0;
    }

    if (unlikely(recursion >= recursionLimit)) {
        return Object(objError);
    }

    // array
    if (!simpleOnly && buf1.isCmd("[")) {
        shift();
        obj = Object(new Array(lexer.getXRef()));
        while (!buf1.isCmd("]") && !buf1.isEOF() && recursion + 1 < recursionLimit) {
            Object obj2 = getObj(false, fileKey, encAlgorithm, keyLength, objNum, objGen, recursion + 1);
            obj.arrayAdd(std::move(obj2));
        }
        if (recursion + 1 >= recursionLimit && strict) {
            goto err;
        }
        if (buf1.isEOF()) {
            error(errSyntaxError, getPos(), "End of file inside array");
            if (strict) {
                goto err;
            }
        }
        shift();

        // dictionary or stream
    } else if (!simpleOnly && buf1.isCmd("<<")) {
        shift(objNum);
        obj = Object(new Dict(lexer.getXRef()));
        bool hasContentsEntry = false;
        while (!buf1.isCmd(">>") && !buf1.isEOF()) {
            if (!buf1.isName()) {
                error(errSyntaxError, getPos(), "Dictionary key must be a name object");
                if (strict) {
                    goto err;
                }
                shift();
            } else {
                // buf1 will go away in shift(), so keep the key
                const auto key = std::move(buf1);
                shift();
                if (buf1.isEOF() || buf1.isError()) {
                    if (strict && buf1.isError()) {
                        goto err;
                    }
                    break;
                }
                // Don't decrypt strings that are the value of a "Contents" key here; they are decrypted,
                // if needed, a few lines below. The "Contents" field of a Sig dictionary is not encrypted,
                // but we can't know the dictionary's type at this point, so we skip decrypting every
                // "Contents" value and decrypt it later if the dictionary turns out not to be a Sig dictionary.
                const bool isContents = !hasContentsEntry && key.isName("Contents");
                hasContentsEntry = hasContentsEntry || isContents;
                Object obj2 = getObj(false, fileKey, encAlgorithm, keyLength, objNum, objGen, recursion + 1, /*strict*/ false, /*decryptString*/ !isContents);
                if (unlikely(obj2.isError() && recursion + 1 >= recursionLimit)) {
                    break;
                }
                obj.dictAdd(key.getName(), std::move(obj2));
            }
        }
        if (buf1.isEOF()) {
            error(errSyntaxError, getPos(), "End of file inside dictionary");
            if (strict) {
                goto err;
            }
        }
        if (fileKey && hasContentsEntry) {
            Dict *dict = obj.getDict();
            const bool isSigDict = dict->is("Sig");
            if (!isSigDict) {
                const Object &contentsObj = dict->lookupNF("Contents");
                if (contentsObj.isString()) {
                    std::unique_ptr<GooString> s = decryptedString(contentsObj.getString(), fileKey, encAlgorithm, keyLength, objNum, objGen);
                    dict->set("Contents", Object(s.release()));
                }
            }
        }
        // stream objects are not allowed inside content streams or
        // object streams
        if (buf2.isCmd("stream")) {
            if (allowStreams && (str = makeStream(std::move(obj), fileKey, encAlgorithm, keyLength, objNum, objGen, recursion + 1, strict))) {
                return Object(str);
            } else {
                return Object(objError);
            }
        } else {
            shift();
        }

        // indirect reference or integer
    } else if (buf1.isInt()) {
        const int num = buf1.getInt();
        shift();
        if (buf1.isInt() && buf2.isCmd("R")) {
            const int gen = buf1.getInt();
            shift();
            shift();

            if (unlikely(num <= 0 || gen < 0)) {
                return Object();
            }

            Ref r;
            r.num = num;
            r.gen = gen;
            return Object(r);
        } else {
            return Object(num);
        }

        // string
    } else if (decryptString && buf1.isString() && fileKey) {
        std::unique_ptr<GooString> s2 = decryptedString(buf1.getString(), fileKey, encAlgorithm, keyLength, objNum, objGen);
        obj = Object(s2.release());
        shift();

        // simple object
    } else {
        // avoid re-allocating memory for complex objects like strings by
        // moving <buf1> into <obj>; shift() will then refill <buf1> from the lexer
        obj = std::move(buf1);
        shift();
    }

    return obj;

err:
    return Object(objError);
}

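// Turn an already-parsed stream dictionary into a Stream object: look up
// /Length, skip over the stream data, check for the 'endstream' keyword,
// build a sub-stream over the data (wrapped in a DecryptStream when a file
// key is given) and attach any filters declared in the dictionary.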
Stream *Parser::makeStream(Object &&dict, const unsigned char *fileKey, CryptAlgorithm encAlgorithm, int keyLength, int objNum, int objGen, int recursion, bool strict)
{
    BaseStream *baseStr;
    Stream *str;
    Goffset length;
    Goffset pos, endPos;

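    // Mark the xref entry for this object as being parsed, so that a recursive
    // attempt to parse the same object (e.g. through an indirect /Length value
    // that points back at this object) is detected and aborted.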
    if (XRef *xref = lexer.getXRef()) {
        XRefEntry *entry = xref->getEntry(objNum, false);
        if (entry) {
            if (!entry->getFlag(XRefEntry::Parsing) || (objNum == 0 && objGen == 0)) {
                entry->setFlag(XRefEntry::Parsing, true);
            } else {
                error(errSyntaxError, getPos(), "Object '{0:d} {1:d} obj' is already being parsed", objNum, objGen);
                return nullptr;
            }
        }
    }

    // get stream start position
    lexer.skipToNextLine();
    if (!(str = lexer.getStream())) {
        return nullptr;
    }
    pos = str->getPos();

    // get length
    Object obj = dict.dictLookup("Length", recursion);
    if (obj.isInt()) {
        length = obj.getInt();
    } else if (obj.isInt64()) {
        length = obj.getInt64();
    } else {
        error(errSyntaxError, getPos(), "Bad 'Length' attribute in stream");
        if (strict) {
            return nullptr;
        }
        length = 0;
    }

    // check for length in damaged file
    if (lexer.hasXRef() && lexer.getXRef()->getStreamEnd(pos, &endPos)) {
        length = endPos - pos;
    }

    // in badly damaged PDF files, we can run off the end of the input
    // stream immediately after the "stream" token
    if (!lexer.getStream()) {
        return nullptr;
    }
    baseStr = lexer.getStream()->getBaseStream();

    // skip over stream data
    if (Lexer::LOOK_VALUE_NOT_CACHED != lexer.lookCharLastValueCached) {
        // take into account the fact that we've cached one value
        pos = pos - 1;
        lexer.lookCharLastValueCached = Lexer::LOOK_VALUE_NOT_CACHED;
    }
    if (unlikely(length < 0)) {
        return nullptr;
    }
    if (unlikely(pos > LLONG_MAX - length)) {
        return nullptr;
    }
    lexer.setPos(pos + length);

    // refill token buffers and check for 'endstream'
    shift(); // kill '>>'
    shift("endstream", objNum); // kill 'stream'
    if (buf1.isCmd("endstream")) {
        shift();
    } else {
        error(errSyntaxError, getPos(), "Missing 'endstream' or incorrect stream length");
        if (strict) {
            return nullptr;
        }
        if (lexer.hasXRef() && lexer.getStream()) {
            // shift until we find the proper endstream or we change to another object or reach eof
            length = lexer.getPos() - pos;
            if (buf1.isCmd("endstream")) {
                dict.dictSet("Length", Object(length));
            }
        } else {
        } else {
            // When building the xref we can't use it, so fall back to this
            // kludge for broken PDF files: just add 5k to the length and
            // hope it's enough.
            if (length < LLONG_MAX - pos - 5000) {
                length += 5000;
            }
        }
    }

    // make base stream
    str = baseStr->makeSubStream(pos, true, length, std::move(dict));

    // handle decryption
    if (fileKey) {
        str = new DecryptStream(str, fileKey, encAlgorithm, keyLength, { objNum, objGen });
    }

    // get filters
    str = str->addFilters(str->getDict(), recursion);

    if (XRef *xref = lexer.getXRef()) {
        // Don't try to reuse the entry from the block at the start
        // of the function, xref can change in the middle because of
        // reconstruction
        XRefEntry *entry = xref->getEntry(objNum, false);
        if (entry) {
            entry->setFlag(XRefEntry::Parsing, false);
        }
    }

    return str;
}

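// Advance the lookahead window: buf2 moves into buf1 and a new token is read
// into buf2. The inlineImg counter steps 0 -> 1 -> 2 around an 'ID' operator
// so that raw inline image data is never tokenized into the buffers.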
void Parser::shift(int objNum)
{
    if (inlineImg > 0) {
        if (inlineImg < 2) {
            ++inlineImg;
        } else {
            // in a damaged content stream, if 'ID' shows up in the middle
            // of a dictionary, we need to reset
            inlineImg = 0;
        }
    } else if (buf2.isCmd("ID")) {
        lexer.skipChar(); // skip char after 'ID' command
        inlineImg = 1;
    }
    buf1 = std::move(buf2);
    if (inlineImg > 0) { // don't buffer inline image data
        buf2.setToNull();
    } else {
        buf2 = lexer.getObj(objNum);
    }
}

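// Variant of shift() used when a specific command (e.g. 'endstream') is
// expected next: if the token that just moved into buf1 is not that command,
// the command name is passed down to the lexer so it can look for it
// (see the 'endstream' recovery in makeStream()).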
void Parser::shift(const char *cmdA, int objNum)
{
    if (inlineImg > 0) {
        if (inlineImg < 2) {
            ++inlineImg;
        } else {
            // in a damaged content stream, if 'ID' shows up in the middle
            // of a dictionary, we need to reset
            inlineImg = 0;
        }
    } else if (buf2.isCmd("ID")) {
        lexer.skipChar(); // skip char after 'ID' command
        inlineImg = 1;
    }
    buf1 = std::move(buf2);
    if (inlineImg > 0) {
        buf2.setToNull();
    } else if (buf1.isCmd(cmdA)) {
        buf2 = lexer.getObj(objNum);
    } else {
        buf2 = lexer.getObj(cmdA, objNum);
    }
}