//========================================================================
//
// Parser.cc
//
// Copyright 1996-2003 Glyph & Cog, LLC
//
//========================================================================

//========================================================================
//
// Modified under the Poppler project - http://poppler.freedesktop.org
//
// All changes made under the Poppler project to this file are licensed
// under GPL version 2 or later
//
// Copyright (C) 2006, 2009, 2010, 2013, 2014, 2017-2020 Albert Astals Cid <aacid@kde.org>
// Copyright (C) 2006 Krzysztof Kowalczyk <kkowalczyk@gmail.com>
// Copyright (C) 2009 Ilya Gorenbein <igorenbein@finjan.com>
// Copyright (C) 2012 Hib Eris <hib@hiberis.nl>
// Copyright (C) 2013 Adrian Johnson <ajohnson@redneon.com>
// Copyright (C) 2013 Thomas Freitag <Thomas.Freitag@alfa.de>
// Copyright (C) 2018 Klarälvdalens Datakonsult AB, a KDAB Group company, <info@kdab.com>. Work sponsored by the LiMux project of the city of Munich
// Copyright (C) 2018, 2019 Adam Reichold <adam.reichold@t-online.de>
// Copyright (C) 2018 Marek Kasik <mkasik@redhat.com>
//
// To see a description of the changes please see the Changelog file that
// came with your tarball or type make ChangeLog if you are building from git
//
//========================================================================

#include <config.h>

#include <cstddef>
#include "Object.h"
#include "Array.h"
#include "Dict.h"
#include "Decrypt.h"
#include "Parser.h"
#include "XRef.h"
#include "Error.h"

// Max number of nested objects. This is used to catch infinite loops in the
// object structure, and also to guard against technically valid files with so
// many nested arrays that parsing them would exhaust the stack.
#define recursionLimit 500

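// The parser keeps a two-token lookahead: buf1 holds the current token and
// buf2 the next one. inlineImg tracks inline image data in content streams:
// it is set to 1 when an 'ID' operator reaches buf2, becomes 2 once that token
// has been shifted out (the buffers are then left empty so the raw image data
// is not tokenized), and the next call to getObj() refills both buffers and
// resets the flag.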
Parser::Parser(XRef *xrefA, Stream *streamA, bool allowStreamsA) : lexer { xrefA, streamA }
{
    allowStreams = allowStreamsA;
    buf1 = lexer.getObj();
    buf2 = lexer.getObj();
    inlineImg = 0;
}

Parser::Parser(XRef *xrefA, Object *objectA, bool allowStreamsA) : lexer { xrefA, objectA }
{
    allowStreams = allowStreamsA;
    buf1 = lexer.getObj();
    buf2 = lexer.getObj();
    inlineImg = 0;
}

Parser::~Parser() = default;

Object Parser::getObj(int recursion)
{
    return getObj(/*simpleOnly=*/false, /*fileKey=*/nullptr, cryptRC4, /*keyLength=*/0, /*objNum=*/0, /*objGen=*/0, recursion);
}

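// Decrypt the string <s> by feeding it through a DecryptStream backed by an
// in-memory copy of its bytes, and return the decrypted data as a new GooString.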
static std::unique_ptr<GooString> decryptedString(const GooString *s, const unsigned char *fileKey, CryptAlgorithm encAlgorithm, int keyLength, int objNum, int objGen)
{
    DecryptStream decrypt(new MemStream(s->c_str(), 0, s->getLength(), Object(objNull)), fileKey, encAlgorithm, keyLength, { objNum, objGen });
    decrypt.reset();
    std::unique_ptr<GooString> res = std::make_unique<GooString>();
    int c;
    while ((c = decrypt.getChar()) != EOF) {
        res->append((char)c);
    }
    return res;
}

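// Parse the next object. Depending on the current token this is an array, a
// dictionary (possibly followed by a stream), an indirect reference, an
// integer, a string (decrypted here when a file key is given), or another
// simple object. simpleOnly suppresses array and dictionary parsing, and
// recursion is checked against recursionLimit to bound nesting.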
Object Parser::getObj(bool simpleOnly, const unsigned char *fileKey, CryptAlgorithm encAlgorithm, int keyLength, int objNum, int objGen, int recursion, bool strict, bool decryptString)
{
    Object obj;
    Stream *str;

    // refill buffer after inline image data
    if (inlineImg == 2) {
        buf1 = lexer.getObj();
        buf2 = lexer.getObj();
        inlineImg = 0;
    }

    if (unlikely(recursion >= recursionLimit)) {
        return Object(objError);
    }

    // array
    if (!simpleOnly && buf1.isCmd("[")) {
        shift();
        obj = Object(new Array(lexer.getXRef()));
        while (!buf1.isCmd("]") && !buf1.isEOF() && recursion + 1 < recursionLimit) {
            Object obj2 = getObj(/*simpleOnly=*/false, fileKey, encAlgorithm, keyLength, objNum, objGen, recursion + 1);
            obj.arrayAdd(std::move(obj2));
        }
        if (recursion + 1 >= recursionLimit && strict) {
            goto err;
        }
        if (buf1.isEOF()) {
            error(errSyntaxError, getPos(), "End of file inside array");
            if (strict) {
                goto err;
            }
        }
        shift();

        // dictionary or stream
    } else if (!simpleOnly && buf1.isCmd("<<")) {
        shift(objNum);
        obj = Object(new Dict(lexer.getXRef()));
        bool hasContentsEntry = false;
        while (!buf1.isCmd(">>") && !buf1.isEOF()) {
            if (!buf1.isName()) {
                error(errSyntaxError, getPos(), "Dictionary key must be a name object");
                if (strict) {
                    goto err;
                }
                shift();
            } else {
                // buf1 will go away in shift(), so keep the key
                const auto key = std::move(buf1);
                shift();
                if (buf1.isEOF() || buf1.isError()) {
                    if (strict && buf1.isError()) {
                        goto err;
                    }
                    break;
                }
                // Don't decrypt strings that are the value of a "Contents" key here; they are decrypted
                // a few lines below if needed. The "Contents" field of Sig dictionaries is not encrypted,
                // but at this point we can't know the dictionary type yet, so no "Contents" value is
                // decrypted now, and once the whole dictionary has been read we decrypt it if it turns
                // out not to be a Sig dictionary.
                const bool isContents = !hasContentsEntry && key.isName("Contents");
                hasContentsEntry = hasContentsEntry || isContents;
                Object obj2 = getObj(/*simpleOnly=*/false, fileKey, encAlgorithm, keyLength, objNum, objGen, recursion + 1, /*strict*/ false, /*decryptString*/ !isContents);
                if (unlikely(obj2.isError() && recursion + 1 >= recursionLimit)) {
                    break;
                }
                obj.dictAdd(key.getName(), std::move(obj2));
            }
        }
        if (buf1.isEOF()) {
            error(errSyntaxError, getPos(), "End of file inside dictionary");
            if (strict) {
                goto err;
            }
        }
        if (fileKey && hasContentsEntry) {
            Dict *dict = obj.getDict();
            const bool isSigDict = dict->is("Sig");
            if (!isSigDict) {
                const Object &contentsObj = dict->lookupNF("Contents");
                if (contentsObj.isString()) {
                    std::unique_ptr<GooString> s = decryptedString(contentsObj.getString(), fileKey, encAlgorithm, keyLength, objNum, objGen);
                    dict->set("Contents", Object(s.release()));
                }
            }
        }
        // stream objects are not allowed inside content streams or
        // object streams
        if (buf2.isCmd("stream")) {
            if (allowStreams && (str = makeStream(std::move(obj), fileKey, encAlgorithm, keyLength, objNum, objGen, recursion + 1, strict))) {
                return Object(str);
            } else {
                return Object(objError);
            }
        } else {
            shift();
        }

        // indirect reference or integer
    } else if (buf1.isInt()) {
        const int num = buf1.getInt();
        shift();
        if (buf1.isInt() && buf2.isCmd("R")) {
            const int gen = buf1.getInt();
            shift();
            shift();

            if (unlikely(num <= 0 || gen < 0)) {
                return Object();
            }

            Ref r;
            r.num = num;
            r.gen = gen;
            return Object(r);
        } else {
            return Object(num);
        }

        // string
    } else if (decryptString && buf1.isString() && fileKey) {
        std::unique_ptr<GooString> s2 = decryptedString(buf1.getString(), fileKey, encAlgorithm, keyLength, objNum, objGen);
        obj = Object(s2.release());
        shift();

        // simple object
    } else {
        // move <buf1> into <obj> to avoid re-allocating memory for complex
        // objects like strings; shift() will refill <buf1> afterwards
        obj = std::move(buf1);
        shift();
    }

    return obj;

err:
    return Object(objError);
}

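// Build a Stream for a stream whose dictionary has just been parsed: determine
// the data length (from /Length or, for damaged files, from the stream end
// recorded in the xref), skip over the data, verify the 'endstream' keyword,
// create a substream over that range, and apply decryption and the filters
// named in the dictionary. The xref entry's Parsing flag is set while working
// so that objects which end up parsing themselves again are detected.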
Stream *Parser::makeStream(Object &&dict, const unsigned char *fileKey, CryptAlgorithm encAlgorithm, int keyLength, int objNum, int objGen, int recursion, bool strict)
{
    BaseStream *baseStr;
    Stream *str;
    Goffset length;
    Goffset pos, endPos;

    if (XRef *xref = lexer.getXRef()) {
        XRefEntry *entry = xref->getEntry(objNum, /*complainIfMissing=*/false);
        if (entry) {
            if (!entry->getFlag(XRefEntry::Parsing) || (objNum == 0 && objGen == 0)) {
                entry->setFlag(XRefEntry::Parsing, true);
            } else {
                error(errSyntaxError, getPos(), "Object '{0:d} {1:d} obj' is already being parsed", objNum, objGen);
                return nullptr;
            }
        }
    }

    // get stream start position
    lexer.skipToNextLine();
    if (!(str = lexer.getStream())) {
        return nullptr;
    }
    pos = str->getPos();

    // get length
    Object obj = dict.dictLookup("Length", recursion);
    if (obj.isInt()) {
        length = obj.getInt();
    } else if (obj.isInt64()) {
        length = obj.getInt64();
    } else {
        error(errSyntaxError, getPos(), "Bad 'Length' attribute in stream");
        if (strict) {
            return nullptr;
        }
        length = 0;
    }

    // check for length in damaged file
    if (lexer.hasXRef() && lexer.getXRef()->getStreamEnd(pos, &endPos)) {
        length = endPos - pos;
    }

    // in badly damaged PDF files, we can run off the end of the input
    // stream immediately after the "stream" token
    if (!lexer.getStream()) {
        return nullptr;
    }
    baseStr = lexer.getStream()->getBaseStream();

    // skip over stream data
    if (Lexer::LOOK_VALUE_NOT_CACHED != lexer.lookCharLastValueCached) {
        // take into account the fact that we've cached one value
        pos = pos - 1;
        lexer.lookCharLastValueCached = Lexer::LOOK_VALUE_NOT_CACHED;
    }
    if (unlikely(length < 0)) {
        return nullptr;
    }
    if (unlikely(pos > LLONG_MAX - length)) {
        return nullptr;
    }
    lexer.setPos(pos + length);

    // refill token buffers and check for 'endstream'
    shift(); // kill '>>'
    shift("endstream", objNum); // kill 'stream'
    if (buf1.isCmd("endstream")) {
        shift();
    } else {
        error(errSyntaxError, getPos(), "Missing 'endstream' or incorrect stream length");
        if (strict) {
            return nullptr;
        }
        if (lexer.hasXRef() && lexer.getStream()) {
            // the shift("endstream", objNum) above scanned forward until it found the proper
            // 'endstream', switched to another object, or reached EOF; recompute the length accordingly
            length = lexer.getPos() - pos;
            if (buf1.isCmd("endstream")) {
                dict.dictSet("Length", Object(length));
            }
        } else {
            // when building the xref we can't use it, so resort to this
            // kludge for broken PDF files: just add 5k to the length and
            // hope it's enough
            if (length < LLONG_MAX - pos - 5000) {
                length += 5000;
            }
        }
    }

    // make base stream
    str = baseStr->makeSubStream(pos, /*limited=*/true, length, std::move(dict));

    // handle decryption
    if (fileKey) {
        str = new DecryptStream(str, fileKey, encAlgorithm, keyLength, { objNum, objGen });
    }

    // get filters
    str = str->addFilters(str->getDict(), recursion);

    if (XRef *xref = lexer.getXRef()) {
        // Don't try to reuse the entry from the block at the start of the
        // function; the xref can change in the middle because of reconstruction
        XRefEntry *entry = xref->getEntry(objNum, /*complainIfMissing=*/false);
        if (entry) {
            entry->setFlag(XRefEntry::Parsing, false);
        }
    }

    return str;
}

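// Advance the lookahead window: buf2 moves into buf1 and a new token is read
// into buf2, except while inline image data follows ('ID' was seen), in which
// case buf2 is left null so the raw image data is not tokenized.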
void Parser::shift(int objNum)
{
    if (inlineImg > 0) {
        if (inlineImg < 2) {
            ++inlineImg;
        } else {
            // in a damaged content stream, if 'ID' shows up in the middle
            // of a dictionary, we need to reset
            inlineImg = 0;
        }
    } else if (buf2.isCmd("ID")) {
        lexer.skipChar(); // skip char after 'ID' command
        inlineImg = 1;
    }
    buf1 = std::move(buf2);
    if (inlineImg > 0) { // don't buffer inline image data
        buf2.setToNull();
    } else {
        buf2 = lexer.getObj(objNum);
    }
}

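// Same as shift(int), but passes cmdA on to the lexer when buf1 is not already
// that command (used above to help locate 'endstream' when the recorded stream
// length is wrong).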
void Parser::shift(const char *cmdA, int objNum)
{
    if (inlineImg > 0) {
        if (inlineImg < 2) {
            ++inlineImg;
        } else {
            // in a damaged content stream, if 'ID' shows up in the middle
            // of a dictionary, we need to reset
            inlineImg = 0;
        }
    } else if (buf2.isCmd("ID")) {
        lexer.skipChar(); // skip char after 'ID' command
        inlineImg = 1;
    }
    buf1 = std::move(buf2);
    if (inlineImg > 0) {
        buf2.setToNull();
    } else if (buf1.isCmd(cmdA)) {
        buf2 = lexer.getObj(objNum);
    } else {
        buf2 = lexer.getObj(cmdA, objNum);
    }
}
