//========================================================================
//
// pdfunite.cc
//
// This file is licensed under the GPLv2 or later
//
// Copyright (C) 2011-2015, 2017 Thomas Freitag <Thomas.Freitag@alfa.de>
// Copyright (C) 2012 Arseny Solokha <asolokha@gmx.com>
// Copyright (C) 2012 Fabio D'Urso <fabiodurso@hotmail.it>
// Copyright (C) 2012, 2014, 2017-2019, 2021, 2022 Albert Astals Cid <aacid@kde.org>
// Copyright (C) 2013 Adrian Johnson <ajohnson@redneon.com>
// Copyright (C) 2013 Hib Eris <hib@hiberis.nl>
// Copyright (C) 2015 Arthur Stavisky <vovodroid@gmail.com>
// Copyright (C) 2018 Klarälvdalens Datakonsult AB, a KDAB Group company, <info@kdab.com>. Work sponsored by the LiMux project of the city of Munich
// Copyright (C) 2018 Adam Reichold <adam.reichold@t-online.de>
// Copyright (C) 2019 Marek Kasik <mkasik@redhat.com>
// Copyright (C) 2019, 2023 Oliver Sander <oliver.sander@tu-dresden.de>
// Copyright (C) 2022 crt <chluo@cse.cuhk.edu.hk>
//
//========================================================================
21
#include <PDFDoc.h>
#include <GlobalParams.h>
#include "parseargs.h"
#include "config.h"
#include <poppler-config.h>
#include <cstdio>
#include <cstring>
#include <memory>
#include <vector>
28
29static bool printVersion = false;
30static bool printHelp = false;
31
32static const ArgDesc argDesc[] = { { .arg: "-v", .kind: argFlag, .val: &printVersion, .size: 0, .usage: "print copyright and version info" }, { .arg: "-h", .kind: argFlag, .val: &printHelp, .size: 0, .usage: "print usage information" }, { .arg: "-help", .kind: argFlag, .val: &printHelp, .size: 0, .usage: "print usage information" },
33 { .arg: "--help", .kind: argFlag, .val: &printHelp, .size: 0, .usage: "print usage information" }, { .arg: "-?", .kind: argFlag, .val: &printHelp, .size: 0, .usage: "print usage information" }, {} };
34
35static void doMergeNameTree(PDFDoc *doc, XRef *srcXRef, XRef *countRef, int oldRefNum, int newRefNum, Dict *srcNameTree, Dict *mergeNameTree, int numOffset)
36{
37 Object mergeNameArray = mergeNameTree->lookup(key: "Names");
38 Object srcNameArray = srcNameTree->lookup(key: "Names");
39 if (mergeNameArray.isArray() && srcNameArray.isArray()) {
40 Array *newNameArray = new Array(srcXRef);
41 int j = 0;
42 for (int i = 0; i < srcNameArray.arrayGetLength() - 1; i += 2) {
43 const Object &key = srcNameArray.arrayGetNF(i);
44 const Object &value = srcNameArray.arrayGetNF(i: i + 1);
45 if (key.isString() && value.isRef()) {
46 while (j < mergeNameArray.arrayGetLength() - 1) {
47 const Object &mkey = mergeNameArray.arrayGetNF(i: j);
48 const Object &mvalue = mergeNameArray.arrayGetNF(i: j + 1);
49 if (mkey.isString() && mvalue.isRef()) {
50 if (mkey.getString()->cmp(str: key.getString()) < 0) {
51 newNameArray->add(elem: Object(new GooString(mkey.getString()->c_str())));
52 newNameArray->add(elem: Object(Ref { .num: mvalue.getRef().num + numOffset, .gen: mvalue.getRef().gen }));
53 j += 2;
54 } else if (mkey.getString()->cmp(str: key.getString()) == 0) {
55 j += 2;
56 } else {
57 break;
58 }
59 } else {
60 j += 2;
61 }
62 }
63 newNameArray->add(elem: Object(new GooString(key.getString()->c_str())));
64 newNameArray->add(elem: Object(value.getRef()));
65 }
66 }
67 while (j < mergeNameArray.arrayGetLength() - 1) {
68 const Object &mkey = mergeNameArray.arrayGetNF(i: j);
69 const Object &mvalue = mergeNameArray.arrayGetNF(i: j + 1);
70 if (mkey.isString() && mvalue.isRef()) {
71 newNameArray->add(elem: Object(new GooString(mkey.getString()->c_str())));
72 newNameArray->add(elem: Object(Ref { .num: mvalue.getRef().num + numOffset, .gen: mvalue.getRef().gen }));
73 }
74 j += 2;
75 }
76 srcNameTree->set(key: "Names", val: Object(newNameArray));
77 doc->markPageObjects(pageDict: mergeNameTree, xRef: srcXRef, countRef, numOffset, oldRefNum, newRefNum);
78 } else if (srcNameArray.isNull() && mergeNameArray.isArray()) {
79 Array *newNameArray = new Array(srcXRef);
80 for (int i = 0; i < mergeNameArray.arrayGetLength() - 1; i += 2) {
81 const Object &key = mergeNameArray.arrayGetNF(i);
82 const Object &value = mergeNameArray.arrayGetNF(i: i + 1);
83 if (key.isString() && value.isRef()) {
84 newNameArray->add(elem: Object(new GooString(key.getString()->c_str())));
85 newNameArray->add(elem: Object(Ref { .num: value.getRef().num + numOffset, .gen: value.getRef().gen }));
86 }
87 }
88 srcNameTree->add(key: "Names", val: Object(newNameArray));
89 doc->markPageObjects(pageDict: mergeNameTree, xRef: srcXRef, countRef, numOffset, oldRefNum, newRefNum);
90 }
91}
92
93static void doMergeNameDict(PDFDoc *doc, XRef *srcXRef, XRef *countRef, int oldRefNum, int newRefNum, Dict *srcNameDict, Dict *mergeNameDict, int numOffset)
94{
95 for (int i = 0; i < mergeNameDict->getLength(); i++) {
96 const char *key = mergeNameDict->getKey(i);
97 Object mergeNameTree = mergeNameDict->lookup(key);
98 Object srcNameTree = srcNameDict->lookup(key);
99 if (srcNameTree.isDict() && mergeNameTree.isDict()) {
100 doMergeNameTree(doc, srcXRef, countRef, oldRefNum, newRefNum, srcNameTree: srcNameTree.getDict(), mergeNameTree: mergeNameTree.getDict(), numOffset);
101 } else if (srcNameTree.isNull() && mergeNameTree.isDict()) {
102 Object newNameTree(new Dict(srcXRef));
103 doMergeNameTree(doc, srcXRef, countRef, oldRefNum, newRefNum, srcNameTree: newNameTree.getDict(), mergeNameTree: mergeNameTree.getDict(), numOffset);
104 srcNameDict->add(key, val: std::move(newNameTree));
105 }
106 }
107}
108
109static bool doMergeFormDict(Dict *srcFormDict, Dict *mergeFormDict, int numOffset)
110{
111 Object srcFields = srcFormDict->lookup(key: "Fields");
112 Object mergeFields = mergeFormDict->lookup(key: "Fields");
113 if (srcFields.isArray() && mergeFields.isArray()) {
114 for (int i = 0; i < mergeFields.arrayGetLength(); i++) {
115 const Object &value = mergeFields.arrayGetNF(i);
116 if (!value.isRef()) {
117 error(category: errSyntaxError, pos: -1, msg: "Fields object is not a Ref.");
118 return false;
119 }
120 srcFields.arrayAdd(elem: Object(Ref { .num: value.getRef().num + numOffset, .gen: value.getRef().gen }));
121 }
122 }
123 return true;
124}
125
126///////////////////////////////////////////////////////////////////////////
127int main(int argc, char *argv[])
128///////////////////////////////////////////////////////////////////////////
129// Merge PDF files given by arguments 1 to argc-2 and write the result
130// to the file specified by argument argc-1.
131///////////////////////////////////////////////////////////////////////////
132{
133 int objectsCount = 0;
134 unsigned int numOffset = 0;
135 std::vector<Object> pages;
136 std::vector<unsigned int> offsets;
137 XRef *yRef, *countRef;
138 FILE *f;
139 OutStream *outStr;
140 int i;
141 int j, rootNum;
142 std::vector<std::unique_ptr<PDFDoc>> docs;
143 int majorVersion = 0;
144 int minorVersion = 0;
145 char *fileName = argv[argc - 1];
146
147 const bool ok = parseArgs(args: argDesc, argc: &argc, argv);
148 if (!ok || argc < 3 || printVersion || printHelp) {
149 fprintf(stderr, format: "pdfunite version %s\n", PACKAGE_VERSION);
150 fprintf(stderr, format: "%s\n", popplerCopyright);
151 fprintf(stderr, format: "%s\n", xpdfCopyright);
152 if (!printVersion) {
153 printUsage(program: "pdfunite", otherArgs: "<PDF-sourcefile-1>..<PDF-sourcefile-n> <PDF-destfile>", args: argDesc);
154 }
155 if (printVersion || printHelp) {
156 return 0;
157 }
158 return 99;
159 }
160 globalParams = std::make_unique<GlobalParams>();
161
162 for (i = 1; i < argc - 1; i++) {
163 std::unique_ptr<PDFDoc> doc = std::make_unique<PDFDoc>(args: std::make_unique<GooString>(args&: argv[i]));
164 if (doc->isOk() && !doc->isEncrypted() && doc->getXRef()->getCatalog().isDict()) {
165 if (doc->getPDFMajorVersion() > majorVersion) {
166 majorVersion = doc->getPDFMajorVersion();
167 minorVersion = doc->getPDFMinorVersion();
168 } else if (doc->getPDFMajorVersion() == majorVersion) {
169 if (doc->getPDFMinorVersion() > minorVersion) {
170 minorVersion = doc->getPDFMinorVersion();
171 }
172 }
173 docs.push_back(x: std::move(doc));
174 } else if (doc->isOk()) {
175 if (doc->isEncrypted()) {
176 error(category: errUnimplemented, pos: -1, msg: "Could not merge encrypted files ('{0:s}')", argv[i]);
177 return -1;
178 } else if (!doc->getXRef()->getCatalog().isDict()) {
179 error(category: errSyntaxError, pos: -1, msg: "XRef's Catalog is not a dictionary ('{0:s}')", argv[i]);
180 return -1;
181 }
182 } else {
183 error(category: errSyntaxError, pos: -1, msg: "Could not merge damaged documents ('{0:s}')", argv[i]);
184 return -1;
185 }
186 }
187
188 if (!(f = fopen(filename: fileName, modes: "wb"))) {
189 error(category: errIO, pos: -1, msg: "Could not open file '{0:s}'", fileName);
190 return -1;
191 }
192 outStr = new FileOutStream(f, 0);
193
194 yRef = new XRef();
195 countRef = new XRef();
196 yRef->add(num: 0, gen: 65535, offs: 0, used: false);
197 PDFDoc::writeHeader(outStr, major: majorVersion, minor: minorVersion);
198
199 // handle OutputIntents, AcroForm, OCProperties & Names
200 Object intents;
201 Object names;
202 Object afObj;
203 Object ocObj;
204 if (docs.size() >= 1) {
205 Object catObj = docs[0]->getXRef()->getCatalog();
206 if (!catObj.isDict()) {
207 fclose(stream: f);
208 delete yRef;
209 delete countRef;
210 delete outStr;
211 error(category: errSyntaxError, pos: -1, msg: "XRef's Catalog is not a dictionary.");
212 return -1;
213 }
214 Dict *catDict = catObj.getDict();
215 intents = catDict->lookup(key: "OutputIntents");
216 afObj = catDict->lookupNF(key: "AcroForm").copy();
217 Ref *refPage = docs[0]->getCatalog()->getPageRef(i: 1);
218 if (!afObj.isNull() && refPage) {
219 docs[0]->markAcroForm(afObj: &afObj, xRef: yRef, countRef, numOffset: 0, oldRefNum: refPage->num, newRefNum: refPage->num);
220 }
221 ocObj = catDict->lookupNF(key: "OCProperties").copy();
222 if (!ocObj.isNull() && ocObj.isDict() && refPage) {
223 docs[0]->markPageObjects(pageDict: ocObj.getDict(), xRef: yRef, countRef, numOffset: 0, oldRefNum: refPage->num, newRefNum: refPage->num);
224 }
225 names = catDict->lookup(key: "Names");
226 if (!names.isNull() && names.isDict() && refPage) {
227 docs[0]->markPageObjects(pageDict: names.getDict(), xRef: yRef, countRef, numOffset: 0, oldRefNum: refPage->num, newRefNum: refPage->num);
228 }
229 if (intents.isArray() && intents.arrayGetLength() > 0) {
230 for (i = 1; i < (int)docs.size(); i++) {
231 Object pagecatObj = docs[i]->getXRef()->getCatalog();
232 Dict *pagecatDict = pagecatObj.getDict();
233 Object pageintents = pagecatDict->lookup(key: "OutputIntents");
234 if (pageintents.isArray() && pageintents.arrayGetLength() > 0) {
235 for (j = intents.arrayGetLength() - 1; j >= 0; j--) {
236 Object intent = intents.arrayGet(i: j, recursion: 0);
237 if (intent.isDict()) {
238 Object idf = intent.dictLookup(key: "OutputConditionIdentifier");
239 if (idf.isString()) {
240 const GooString *gidf = idf.getString();
241 bool removeIntent = true;
242 for (int k = 0; k < pageintents.arrayGetLength(); k++) {
243 Object pgintent = pageintents.arrayGet(i: k, recursion: 0);
244 if (pgintent.isDict()) {
245 Object pgidf = pgintent.dictLookup(key: "OutputConditionIdentifier");
246 if (pgidf.isString()) {
247 const GooString *gpgidf = pgidf.getString();
248 if (gpgidf->cmp(str: gidf) == 0) {
249 removeIntent = false;
250 break;
251 }
252 }
253 }
254 }
255 if (removeIntent) {
256 intents.arrayRemove(i: j);
257 error(category: errSyntaxWarning, pos: -1, msg: "Output intent {0:s} missing in pdf {1:s}, removed", gidf->c_str(), docs[i]->getFileName()->c_str());
258 }
259 } else {
260 intents.arrayRemove(i: j);
261 error(category: errSyntaxWarning, pos: -1, msg: "Invalid output intent dict, missing required OutputConditionIdentifier");
262 }
263 } else {
264 intents.arrayRemove(i: j);
265 }
266 }
267 } else {
268 error(category: errSyntaxWarning, pos: -1, msg: "Output intents differs, remove them all");
269 break;
270 }
271 }
272 }
273 if (intents.isArray() && intents.arrayGetLength() > 0) {
274 for (j = intents.arrayGetLength() - 1; j >= 0; j--) {
275 Object intent = intents.arrayGet(i: j, recursion: 0);
276 if (intent.isDict()) {
277 docs[0]->markPageObjects(pageDict: intent.getDict(), xRef: yRef, countRef, numOffset, oldRefNum: 0, newRefNum: 0);
278 } else {
279 intents.arrayRemove(i: j);
280 }
281 }
282 }
283 }
284
285 for (i = 0; i < (int)docs.size(); i++) {
286 for (j = 1; j <= docs[i]->getNumPages(); j++) {
287 if (!docs[i]->getCatalog()->getPage(i: j)) {
288 continue;
289 }
290
291 const PDFRectangle *cropBox = nullptr;
292 if (docs[i]->getCatalog()->getPage(i: j)->isCropped()) {
293 cropBox = docs[i]->getCatalog()->getPage(i: j)->getCropBox();
294 }
295 if (!docs[i]->replacePageDict(pageNo: j, rotate: docs[i]->getCatalog()->getPage(i: j)->getRotate(), mediaBox: docs[i]->getCatalog()->getPage(i: j)->getMediaBox(), cropBox)) {
296 fclose(stream: f);
297 delete yRef;
298 delete countRef;
299 delete outStr;
300 error(category: errSyntaxError, pos: -1, msg: "PDFDoc::replacePageDict failed.");
301 return -1;
302 }
303 Ref *refPage = docs[i]->getCatalog()->getPageRef(i: j);
304 Object page = docs[i]->getXRef()->fetch(ref: *refPage);
305 Dict *pageDict = page.getDict();
306 Object *resDict = docs[i]->getCatalog()->getPage(i: j)->getResourceDictObject();
307 if (resDict->isDict()) {
308 pageDict->set(key: "Resources", val: resDict->copy());
309 }
310 pages.push_back(x: std::move(page));
311 offsets.push_back(x: numOffset);
312 docs[i]->markPageObjects(pageDict, xRef: yRef, countRef, numOffset, oldRefNum: refPage->num, newRefNum: refPage->num);
313 Object annotsObj = pageDict->lookupNF(key: "Annots").copy();
314 if (!annotsObj.isNull()) {
315 docs[i]->markAnnotations(annots: &annotsObj, xRef: yRef, countRef, numOffset, oldPageNum: refPage->num, newPageNum: refPage->num);
316 }
317 }
318 Object pageCatObj = docs[i]->getXRef()->getCatalog();
319 if (!pageCatObj.isDict()) {
320 fclose(stream: f);
321 delete yRef;
322 delete countRef;
323 delete outStr;
324 error(category: errSyntaxError, pos: -1, msg: "XRef's Catalog is not a dictionary.");
325 return -1;
326 }
327 Dict *pageCatDict = pageCatObj.getDict();
328 Object pageNames = pageCatDict->lookup(key: "Names");
329 if (!pageNames.isNull() && pageNames.isDict()) {
330 if (!names.isDict()) {
331 names = Object(new Dict(yRef));
332 }
333 doMergeNameDict(doc: docs[i].get(), srcXRef: yRef, countRef, oldRefNum: 0, newRefNum: 0, srcNameDict: names.getDict(), mergeNameDict: pageNames.getDict(), numOffset);
334 }
335 Object pageForm = pageCatDict->lookup(key: "AcroForm");
336 if (i > 0 && !pageForm.isNull() && pageForm.isDict()) {
337 if (afObj.isNull()) {
338 afObj = pageCatDict->lookupNF(key: "AcroForm").copy();
339 } else if (afObj.isDict()) {
340 if (!doMergeFormDict(srcFormDict: afObj.getDict(), mergeFormDict: pageForm.getDict(), numOffset)) {
341 fclose(stream: f);
342 delete yRef;
343 delete countRef;
344 delete outStr;
345 return -1;
346 }
347 }
348 }
349 objectsCount += docs[i]->writePageObjects(outStr, xRef: yRef, numOffset, combine: true);
350 numOffset = yRef->getNumObjects() + 1;
351 }
352
353 rootNum = yRef->getNumObjects() + 1;
354 yRef->add(num: rootNum, gen: 0, offs: outStr->getPos(), used: true);
355 outStr->printf(format: "%d 0 obj\n", rootNum);
356 outStr->printf(format: "<< /Type /Catalog /Pages %d 0 R", rootNum + 1);
357 // insert OutputIntents
358 if (intents.isArray() && intents.arrayGetLength() > 0) {
359 outStr->printf(format: " /OutputIntents [");
360 for (j = 0; j < intents.arrayGetLength(); j++) {
361 Object intent = intents.arrayGet(i: j, recursion: 0);
362 if (intent.isDict()) {
363 PDFDoc::writeObject(obj: &intent, outStr, xref: yRef, numOffset: 0, fileKey: nullptr, encAlgorithm: cryptRC4, keyLength: 0, objNum: 0, objGen: 0);
364 }
365 }
366 outStr->printf(format: "]");
367 }
368 // insert AcroForm
369 if (!afObj.isNull()) {
370 outStr->printf(format: " /AcroForm ");
371 PDFDoc::writeObject(obj: &afObj, outStr, xref: yRef, numOffset: 0, fileKey: nullptr, encAlgorithm: cryptRC4, keyLength: 0, objNum: 0, objGen: 0);
372 }
373 // insert OCProperties
374 if (!ocObj.isNull() && ocObj.isDict()) {
375 outStr->printf(format: " /OCProperties ");
376 PDFDoc::writeObject(obj: &ocObj, outStr, xref: yRef, numOffset: 0, fileKey: nullptr, encAlgorithm: cryptRC4, keyLength: 0, objNum: 0, objGen: 0);
377 }
378 // insert Names
379 if (!names.isNull() && names.isDict()) {
380 outStr->printf(format: " /Names ");
381 PDFDoc::writeObject(obj: &names, outStr, xref: yRef, numOffset: 0, fileKey: nullptr, encAlgorithm: cryptRC4, keyLength: 0, objNum: 0, objGen: 0);
382 }
383 outStr->printf(format: ">>\nendobj\n");
384 objectsCount++;
385
386 yRef->add(num: rootNum + 1, gen: 0, offs: outStr->getPos(), used: true);
387 outStr->printf(format: "%d 0 obj\n", rootNum + 1);
388 outStr->printf(format: "<< /Type /Pages /Kids [");
389 for (j = 0; j < (int)pages.size(); j++) {
390 outStr->printf(format: " %d 0 R", rootNum + j + 2);
391 }
392 outStr->printf(format: " ] /Count %zd >>\nendobj\n", pages.size());
393 objectsCount++;
394
395 for (i = 0; i < (int)pages.size(); i++) {
396 yRef->add(num: rootNum + i + 2, gen: 0, offs: outStr->getPos(), used: true);
397 outStr->printf(format: "%d 0 obj\n", rootNum + i + 2);
398 outStr->printf(format: "<< ");
399 Dict *pageDict = pages[i].getDict();
400 for (j = 0; j < pageDict->getLength(); j++) {
401 if (j > 0) {
402 outStr->printf(format: " ");
403 }
404 const char *key = pageDict->getKey(i: j);
405 Object value = pageDict->getValNF(i: j).copy();
406 if (strcmp(s1: key, s2: "Parent") == 0) {
407 outStr->printf(format: "/Parent %d 0 R", rootNum + 1);
408 } else {
409 outStr->printf(format: "/%s ", key);
410 PDFDoc::writeObject(obj: &value, outStr, xref: yRef, numOffset: offsets[i], fileKey: nullptr, encAlgorithm: cryptRC4, keyLength: 0, objNum: 0, objGen: 0);
411 }
412 }
413 outStr->printf(format: " >>\nendobj\n");
414 objectsCount++;
415 }
416 Goffset uxrefOffset = outStr->getPos();
417 Ref ref;
418 ref.num = rootNum;
419 ref.gen = 0;
420 Object trailerDict = PDFDoc::createTrailerDict(uxrefSize: objectsCount, incrUpdate: false, startxRef: 0, root: &ref, xRef: yRef, fileName, fileSize: outStr->getPos());
421 PDFDoc::writeXRefTableTrailer(trailerDict: std::move(trailerDict), uxref: yRef, writeAllEntries: true, // write all entries according to ISO 32000-1, 7.5.4 Cross-Reference Table: "For a file that has never been incrementally updated, the cross-reference section shall
422 // contain only one subsection, whose object numbering begins at 0."
423 uxrefOffset, outStr, xRef: yRef);
424
425 outStr->close();
426 delete outStr;
427 fclose(stream: f);
428 delete yRef;
429 delete countRef;
430 return 0;
431}
432

// source code of poppler/utils/pdfunite.cc